1 /*
2 * Copyright (c) 2012 Tim Ruehsen
3 * Copyright (c) 2015-2021 Free Software Foundation, Inc.
4 *
5 * This file is part of libwget.
6 *
7 * Libwget is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU Lesser General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * Libwget is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public License
18 * along with libwget. If not, see <https://www.gnu.org/licenses/>.
19 *
20 *
21 * URI/IRI routines
22 * about encoding see http://nikitathespider.com/articles/EncodingDivination.html
23 * about GET encoding see https://stackoverflow.com/questions/1549213/whats-the-correct-encoding-of-http-get-request-strings
24 * RFC 3986: URI generic syntax
25 *
26 *
27 * Changelog
28 * 25.04.2012 Tim Ruehsen created
29 *
30 */
31
32 #include <config.h>
33
34 #include <string.h>
35 #include <errno.h>
36 #include "c-ctype.h"
37
38 #include <wget.h>
39 #include "private.h"
40
41 /**
42 * \file
43 * \brief Functions to work with URIs and IRIs
44 * \defgroup libwget-iri URIs/IRIs
45 *
46 * @{
47 *
48 * URI/IRI parsing and manipulation functions.
49 *
50 * IRIs are processed according to [RFC 3987](https://datatracker.ietf.org/doc/rfc3987/).
51 * Functions that escape certain characters (such as wget_iri_escape()) work according to
52 * [RFC 3986](https://datatracker.ietf.org/doc/rfc3986/).
53 *
54 * The \ref wget_iri_st "wget_iri" structure represents an IRI. You generate one from a string with wget_iri_parse() or
55 * wget_iri_parse_base(). You can use wget_iri_clone() to generate another identical \ref wget_iri_st "wget_iri".
56 *
 * You can access each of the fields of a \ref wget_iri_st "wget_iri" (such as `path`) independently. For convenience,
 * you can also use the getters here, which escape each of those parts (e.g. wget_iri_get_escaped_host(),
 * wget_iri_get_escaped_resource(), etc.).
60 *
61 * URIs/IRIs are all internally treated in UTF-8. The parsing functions that generate a \ref wget_iri_st "wget_iri" structure
62 * (wget_iri_parse() and wget_iri_parse_base()) thus convert the input string to UTF-8 before anything else.
63 * These functions take an `encoding` parameter that tells which is the original encoding of that string.
64 *
65 * Conversely, the getters (for example, wget_iri_get_path()) can convert the output string from UTF-8
66 * to an encoding of choice. The desired encoding is also specified in the `encoding` parameter.
67 *
68 * The `encoding` parameter, in all functions that accept it, is a string with the name of a character set
69 * supported by GNU libiconv. You can find such a list elsewhere, but popular examples are "utf-8", "utf-16" or "iso-8859-1".
70 */
71
// Name of the default page and its length in bytes (without the trailing NUL).
// The length is kept in a separate, non-const variable; both values have to stay in sync.
static const char *default_page = "index.html";
static size_t default_page_length = 10;
76
77 static struct iri_scheme {
78 uint16_t port;
79 const char name[6];
80 } schemes[] = {
81 [WGET_IRI_SCHEME_HTTP] = { 80, "http" },
82 [WGET_IRI_SCHEME_HTTPS] = { 443, "https" },
83 };
84
85 /**
86 * \param[in] scheme Scheme to get name for
87 * \return Name of \p scheme (e.g. "http" or "https") or NULL is not supported
88 *
89 * Maps \p scheme to it's string representation.
90 */
wget_iri_scheme_get_name(wget_iri_scheme scheme)91 const char *wget_iri_scheme_get_name(wget_iri_scheme scheme)
92 {
93 if ((unsigned) scheme < countof(schemes))
94 return schemes[scheme].name;
95
96 return NULL;
97 }
98
99 /**
100 * \param[in] iri An IRI
101 * \return 1 if the scheme is supported, 0 if not
102 *
103 * Tells whether the IRI's scheme is supported or not.
104 */
wget_iri_supported(const wget_iri * iri)105 bool wget_iri_supported(const wget_iri *iri)
106 {
107 return (unsigned) iri->scheme < countof(schemes);
108 }
109
110
/* \cond _hide_internal_symbols */
// Character class flags stored in iri_ctype[], each followed by its lookup macro.
// The lookup macros evaluate to non-zero if the flag is set for character c.
#define IRI_CTYPE_GENDELIM (1<<0)
#define iri_isgendelim(c) (iri_ctype[(unsigned char)(c)] & IRI_CTYPE_GENDELIM)

#define IRI_CTYPE_SUBDELIM (1<<1)
#define iri_issubdelim(c) (iri_ctype[(unsigned char)(c)] & IRI_CTYPE_SUBDELIM)

#define IRI_CTYPE_UNRESERVED (1<<2)
#define iri_isunreserved(c) (iri_ctype[(unsigned char)(c)] & IRI_CTYPE_UNRESERVED)

// Characters accepted in a scheme name: alphanumerics, '+', '-' and '.'.
// The argument is parenthesized so that compound expressions expand correctly.
// It is still evaluated more than once: do not pass expressions with side effects.
#define iri_isscheme(c) (c_isalnum(c) || (c) == '+' || (c) == '-' || (c) == '.')
/* \endcond */
123
124 static const unsigned char
125 iri_ctype[256] = {
126 [':'] = IRI_CTYPE_GENDELIM,
127 ['/'] = IRI_CTYPE_GENDELIM,
128 ['?'] = IRI_CTYPE_GENDELIM,
129 ['#'] = IRI_CTYPE_GENDELIM,
130 ['['] = IRI_CTYPE_GENDELIM,
131 [']'] = IRI_CTYPE_GENDELIM,
132 ['@'] = IRI_CTYPE_GENDELIM,
133
134 ['!'] = IRI_CTYPE_SUBDELIM,
135 ['$'] = IRI_CTYPE_SUBDELIM,
136 ['&'] = IRI_CTYPE_SUBDELIM,
137 ['\''] = IRI_CTYPE_SUBDELIM,
138 ['('] = IRI_CTYPE_SUBDELIM,
139 [')'] = IRI_CTYPE_SUBDELIM,
140 ['*'] = IRI_CTYPE_SUBDELIM,
141 ['+'] = IRI_CTYPE_SUBDELIM,
142 [','] = IRI_CTYPE_SUBDELIM,
143 [';'] = IRI_CTYPE_SUBDELIM,
144 ['='] = IRI_CTYPE_SUBDELIM,
145
146 ['0'] = IRI_CTYPE_UNRESERVED,
147 ['1'] = IRI_CTYPE_UNRESERVED,
148 ['2'] = IRI_CTYPE_UNRESERVED,
149 ['3'] = IRI_CTYPE_UNRESERVED,
150 ['4'] = IRI_CTYPE_UNRESERVED,
151 ['5'] = IRI_CTYPE_UNRESERVED,
152 ['6'] = IRI_CTYPE_UNRESERVED,
153 ['7'] = IRI_CTYPE_UNRESERVED,
154 ['8'] = IRI_CTYPE_UNRESERVED,
155 ['9'] = IRI_CTYPE_UNRESERVED,
156 ['a'] = IRI_CTYPE_UNRESERVED,
157 ['b'] = IRI_CTYPE_UNRESERVED,
158 ['c'] = IRI_CTYPE_UNRESERVED,
159 ['d'] = IRI_CTYPE_UNRESERVED,
160 ['e'] = IRI_CTYPE_UNRESERVED,
161 ['f'] = IRI_CTYPE_UNRESERVED,
162 ['g'] = IRI_CTYPE_UNRESERVED,
163 ['h'] = IRI_CTYPE_UNRESERVED,
164 ['i'] = IRI_CTYPE_UNRESERVED,
165 ['j'] = IRI_CTYPE_UNRESERVED,
166 ['k'] = IRI_CTYPE_UNRESERVED,
167 ['l'] = IRI_CTYPE_UNRESERVED,
168 ['m'] = IRI_CTYPE_UNRESERVED,
169 ['n'] = IRI_CTYPE_UNRESERVED,
170 ['o'] = IRI_CTYPE_UNRESERVED,
171 ['p'] = IRI_CTYPE_UNRESERVED,
172 ['q'] = IRI_CTYPE_UNRESERVED,
173 ['r'] = IRI_CTYPE_UNRESERVED,
174 ['s'] = IRI_CTYPE_UNRESERVED,
175 ['t'] = IRI_CTYPE_UNRESERVED,
176 ['u'] = IRI_CTYPE_UNRESERVED,
177 ['v'] = IRI_CTYPE_UNRESERVED,
178 ['w'] = IRI_CTYPE_UNRESERVED,
179 ['x'] = IRI_CTYPE_UNRESERVED,
180 ['y'] = IRI_CTYPE_UNRESERVED,
181 ['z'] = IRI_CTYPE_UNRESERVED,
182 ['A'] = IRI_CTYPE_UNRESERVED,
183 ['B'] = IRI_CTYPE_UNRESERVED,
184 ['C'] = IRI_CTYPE_UNRESERVED,
185 ['D'] = IRI_CTYPE_UNRESERVED,
186 ['E'] = IRI_CTYPE_UNRESERVED,
187 ['F'] = IRI_CTYPE_UNRESERVED,
188 ['G'] = IRI_CTYPE_UNRESERVED,
189 ['H'] = IRI_CTYPE_UNRESERVED,
190 ['I'] = IRI_CTYPE_UNRESERVED,
191 ['J'] = IRI_CTYPE_UNRESERVED,
192 ['K'] = IRI_CTYPE_UNRESERVED,
193 ['L'] = IRI_CTYPE_UNRESERVED,
194 ['M'] = IRI_CTYPE_UNRESERVED,
195 ['N'] = IRI_CTYPE_UNRESERVED,
196 ['O'] = IRI_CTYPE_UNRESERVED,
197 ['P'] = IRI_CTYPE_UNRESERVED,
198 ['Q'] = IRI_CTYPE_UNRESERVED,
199 ['R'] = IRI_CTYPE_UNRESERVED,
200 ['S'] = IRI_CTYPE_UNRESERVED,
201 ['T'] = IRI_CTYPE_UNRESERVED,
202 ['U'] = IRI_CTYPE_UNRESERVED,
203 ['V'] = IRI_CTYPE_UNRESERVED,
204 ['W'] = IRI_CTYPE_UNRESERVED,
205 ['X'] = IRI_CTYPE_UNRESERVED,
206 ['Y'] = IRI_CTYPE_UNRESERVED,
207 ['Z'] = IRI_CTYPE_UNRESERVED,
208 ['-'] = IRI_CTYPE_UNRESERVED,
209 ['.'] = IRI_CTYPE_UNRESERVED,
210 ['_'] = IRI_CTYPE_UNRESERVED,
211 ['~'] = IRI_CTYPE_UNRESERVED
212 };
213
214 /**
215 * \param[in] c A character
216 * \return 1 if \p c is a generic delimiter, 0 if not
217 *
218 * Tests whether \p c is a generic delimiter (gen-delim),
219 * according to [RFC 3986, sect. 2.2](https://tools.ietf.org/html/rfc3986#section-2.2).
220 */
wget_iri_isgendelim(char c)221 bool wget_iri_isgendelim(char c)
222 {
223 // return strchr(":/?#[]@",c)!=NULL;
224 return iri_isgendelim(c);
225 }
226
227 /**
228 * \param[in] c A character
229 * \return 1 if \p c is a subcomponent delimiter, 0 if not
230 *
231 * Tests whether \p c is a subcomponent delimiter (sub-delim)
232 * according to [RFC 3986, sect. 2.2](https://tools.ietf.org/html/rfc3986#section-2.2).
233 */
wget_iri_issubdelim(char c)234 bool wget_iri_issubdelim(char c)
235 {
236 // return strchr("!$&\'()*+,;=",c)!=NULL;
237 return iri_issubdelim(c);
238 }
239
/**
 * \param[in] c A character
 * \return true if \p c is a reserved character, false if not
 *
 * Tests whether \p c is a reserved character.
 *
 * According to [RFC 3986, sect. 2.2](https://tools.ietf.org/html/rfc3986#section-2.2),
 * the set of reserved characters is formed
 * by the generic delimiters (gen-delims, wget_iri_isgendelim()) and the
 * subcomponent delimiters (sub-delims, wget_iri_issubdelim()).
 *
 * This function is thus equivalent to:
 *
 *     return wget_iri_isgendelim(c) || wget_iri_issubdelim(c);
 *
 */
bool wget_iri_isreserved(char c)
{
	if (wget_iri_isgendelim(c))
		return true;

	return wget_iri_issubdelim(c);
}
260
261 /**
262 * \param[in] c A character
263 * \return 1 if \p c is an unreserved character, 0 if not
264 *
265 * Tests whether \p c is an unreserved character.
266 */
wget_iri_isunreserved(char c)267 bool wget_iri_isunreserved(char c)
268 {
269 return iri_isunreserved(c);
270 }
271
unhex(unsigned char c)272 static unsigned char WGET_GCC_CONST unhex(unsigned char c)
273 {
274 return c <= '9' ? c - '0' : (c <= 'F' ? c - 'A' + 10 : c - 'a' + 10);
275 }
276
iri_unescape_inline(char * src,int ctype)277 static char *iri_unescape_inline(char *src, int ctype)
278 {
279 char *ret = NULL;
280 unsigned char *s = (unsigned char *)src; // just a helper to avoid casting a lot
281 unsigned char *d = s;
282
283 while (*s) {
284 if (*s == '%') {
285 if (c_isxdigit(s[1]) && c_isxdigit(s[2])) {
286 unsigned char c = (unsigned char) (unhex(s[1]) << 4) | unhex(s[2]);
287 if (!ctype || (!(iri_ctype[(unsigned char)(c)] & ctype) && c != '%')) {
288 *d++ = c;
289 s += 3;
290 ret = src;
291 continue;
292 }
293 }
294 } else if (*s == '&') {
295 // entities are case sensitive (RFC1866, 3.2.3)
296 if (!strncmp((char *) s + 1, "amp;", 4)) {
297 *d++ = '&';
298 s += 5;
299 ret = src;
300 continue;
301 } else if (!strncmp((char *) s + 1, "gt;", 3)) {
302 *d++ = '>';
303 s += 4;
304 ret = src;
305 continue;
306 } else if (!strncmp((char *) s + 1, "lt;", 3)) {
307 *d++ = '<';
308 s += 4;
309 ret = src;
310 continue;
311 } else if (!strncmp((char *) s + 1, "quot;", 5)) {
312 *d++ = '\"';
313 s += 6;
314 ret = src;
315 continue;
316 } else if (!strncmp((char *) s + 1, "apos;", 5)) {
317 *d++ = '\'';
318 s += 6;
319 ret = src;
320 continue;
321 }
322 } else if (*s == '#') {
323 uint32_t value = 0;
324
325 if (s[1] == 'x') {
326 unsigned char *p = s + 2;
327 while (c_isxdigit(*p)) {
328 value = (value << 4) | unhex(*p);
329 p++;
330 }
331 if (*p == ';') {
332 if (value > 0 && value < 128) {
333 *d++ = (unsigned char) value;
334 s = p + 1;
335 continue;
336 }
337 // else: we have to convert the unicode value to whatever encoding the URL is in (likely UTF-8)
338 // this cannot be done inline since the URL's length may increase
339 }
340 } else {
341 unsigned char *p = s + 1;
342 while (c_isdigit(*p) && value <= 0x10FFFF) { // max. Unicode value
343 value = value * 10 + (*p - '0');
344 p++;
345 }
346 if (*p == ';') {
347 if (value > 0 && value < 128) {
348 *d++ = (unsigned char) value;
349 s = p + 1;
350 continue;
351 }
352 // else: we have to convert the unicode value to whatever encoding the URL is in (likely UTF-8)
353 // this cannot be done inline since the URL's length may increase
354 }
355 }
356 } else if (*s == '\r' || *s == '\n') {
357 // Ignore / remove CR and LF from URLs. See https://gitlab.com/gnuwget/wget2/-/issues/522
358 s++;
359 continue;
360 }
361
362 *d++ = *s++;
363 }
364 *d = 0;
365
366 return ret;
367 }
368
/**
 * \param[in] src A string
 * \return \p src if at least one percent-encoded character or named entity has been decoded, NULL otherwise
 *
 * Unescape a string. All the percent-encoded characters (`%XX`) are converted
 * back to their original form. The named entities `&amp;`, `&gt;`, `&lt;`, `&quot;` and `&apos;`
 * are converted as well.
 *
 * **The transformation is done inline**, so `src` will be modified after this function returns.
 */
char *wget_iri_unescape_inline(char *src)
{
	// 0: no character class is excluded from unescaping
	char *unescaped = iri_unescape_inline(src, 0);

	return unescaped;
}
383
384 /**
385 * \param[in] src A string
386 * \return A pointer to \p src, after the transformation is done
387 *
388 * Unescape a string except escaped generic delimiters (and escaped '%'.
389 * The percent-encoded characters (`%XX`) are converted back to their original form.
390 *
391 * This variant of unescaping is helpful before an URL is being parsed, so that
392 * the parser recognizes e.g. 'http%3A//' as relative URL (path) and not as a scheme.
393 *
394 * **The transformation is done inline**, so `src` will be modified after this function returns.
395 * If no characters were unescaped, the string is left untouched.
396 */
wget_iri_unescape_url_inline(char * src)397 char *wget_iri_unescape_url_inline(char *src)
398 {
399 return iri_unescape_inline(src, IRI_CTYPE_GENDELIM);
400 }
401
402 /**
403 * \param[in] iri An IRI
404 *
405 * Free the heap-allocated content of the provided IRI, but leave the rest
406 * of the fields.
407 *
408 * This function frees the following fields of \ref wget_iri_st "wget_iri":
409 *
410 * - `host`
411 * - `path`
412 * - `query`
413 * - `fragment`
414 * - `connection_part`
415 */
wget_iri_free_content(wget_iri * iri)416 void wget_iri_free_content(wget_iri *iri)
417 {
418 if (iri) {
419 if (iri->uri_allocated)
420 xfree(iri->uri);
421 if (iri->host_allocated)
422 xfree(iri->host);
423 if (iri->path_allocated)
424 xfree(iri->path);
425 if (iri->query_allocated)
426 xfree(iri->query);
427 if (iri->fragment_allocated)
428 xfree(iri->fragment);
429 xfree(iri->connection_part);
430 }
431 }
432
433 /**
434 * \param[in] iri A pointer to a pointer to an IRI (a \ref wget_iri_st "wget_iri")
435 *
436 * Destroy a \ref wget_iri_st "wget_iri" structure.
437 *
438 * The provided pointer is set to NULL.
439 */
wget_iri_free(wget_iri ** iri)440 void wget_iri_free(wget_iri **iri)
441 {
442 if (iri && *iri) {
443 wget_iri_free_content(*iri);
444 xfree(*iri);
445 }
446 }
447
448 // URIs are assumed to be unescaped at this point
449
450 /**
451 * \param[in] url A URL/IRI
452 * \param[in] encoding Original encoding of \p url
453 * \return A libwget IRI (`wget_iri`)
454 *
455 * The host, path, query and fragment parts will be converted to UTF-8 from
456 * the encoding given in the parameter \p encoding. GNU libiconv is used
457 * to perform the conversion, so this value should be the name of a valid character set
458 * supported by that library, such as "utf-8" or "iso-8859-1".
459 */
wget_iri_parse(const char * url,const char * encoding)460 wget_iri *wget_iri_parse(const char *url, const char *encoding)
461 {
462 wget_iri *iri;
463 char *p, *s, *authority, c;
464 size_t slen, extra;
465 int have_scheme;
466
467 if (!url)
468 return NULL;
469
470 /*
471 URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
472 hier-part = "//" authority path-abempty / path-absolute / path-rootless / path-empty
473 scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
474 */
475 while (c_isspace(*url)) url++;
476 if (!*url) return NULL;
477 /*
478 // first unescape, than convert to UTF-8
479 if (strchr(url, '%')) {
480 char *unesc_url = wget_strdup(url);
481
482 wget_percent_unescape(unesc_url);
483
484 if (wget_str_needs_encoding(unesc_url)) {
485 if ((url = wget_str_to_utf8(unesc_url, encoding)))
486 xfree(unesc_url);
487 else
488 url = unesc_url; // on error, use what we have
489 } else
490 url = unesc_url;
491
492 url_allocated = 1;
493 } else {
494 url_allocated = 0;
495
496 if (wget_str_needs_encoding(url)) {
497 if ((s = wget_str_to_utf8(url, encoding))) {
498 url = s;
499 url_allocated = 1;
500 }
501 }
502 }
503 */
504
505 if (c_isalpha(*url)) {
506 const char *x;
507 have_scheme = 1;
508
509 for (x = url; *x && iri_isscheme(*x); x++)
510 ;
511
512 if (*x != ':' || c_isdigit(x[1]))
513 have_scheme = 0; // not a scheme
514 } else
515 have_scheme = 0;
516
517 // just use one block of memory for all parsed URI parts
518 slen = strlen(url);
519 extra = have_scheme ? 0 : sizeof("http://") - 1; // extra space for http://
520
521 iri = wget_malloc(sizeof(wget_iri) + (slen + extra + 1) * 2);
522 if (!iri)
523 return NULL;
524
525 memset(iri, 0, sizeof(wget_iri));
526
527 if (have_scheme) {
528 iri->msize = slen + 1;
529 iri->uri = memcpy(((char *)iri) + sizeof(wget_iri), url, iri->msize);
530 p = s = memcpy((char *)iri->uri + iri->msize, url, iri->msize);
531 s = strchr(s, ':'); // we know there is a :
532 *s++ = 0;
533
534 // p points to scheme
535 wget_iri_unescape_inline(p); // percent unescape
536 wget_strtolower(p); // convert to lowercase
537
538 bool found = false; // assume the scheme is unsupported
539
540 // find the scheme in our static list of supported schemes
541 // for later comparisons we compare pointers (avoiding strcasecmp())
542 for (unsigned it = 0; it < countof(schemes); it++) {
543 if (!strcmp(schemes[it].name, p)) {
544 iri->scheme = it;
545 iri->port = schemes[it].port;
546 found = true;
547 break;
548 }
549 }
550
551 if (!found) {
552 debug_printf("Unsupported scheme in '%s'\n", url);
553 wget_iri_free(&iri);
554 return NULL;
555 }
556 } else {
557 // add http:// scheme to url
558 iri->uri = memcpy(((char *)iri) + sizeof(wget_iri), "http://", extra);
559 memcpy(((char *)iri) + sizeof(wget_iri) + extra, url, slen + 1);
560 iri->msize = slen + 1 + extra;
561 s = memcpy((char *)iri->uri + iri->msize, "http://", extra);
562 memcpy((char *)iri->uri + iri->msize + extra, url, slen + 1);
563 s[extra - 3] = 0;
564 s += extra;
565
566 iri->scheme = WGET_IRI_SCHEME_HTTP;
567 iri->port = schemes[WGET_IRI_SCHEME_HTTP].port;
568 }
569
570 // if (url_allocated)
571 // xfree(url);
572
573 // this is true for http, https, ftp, file (accept any number of /, like most browsers)
574 while (*s == '/')
575 s++;
576
577 // authority
578 authority = s;
579 while (*s && *s != '/' && *s != '?' && *s != '#')
580 s++;
581 c = *s;
582 if (c) *s++ = 0;
583 wget_iri_unescape_inline(authority);
584
585 // left over: [path][?query][#fragment]
586 if (c == '/') {
587 iri->path = s;
588 while (*s && *s != '?' && *s != '#')
589 s++;
590 c = *s;
591 if (c) *s++ = 0;
592 wget_iri_unescape_inline((char *)iri->path);
593 }
594
595 if (c == '?') {
596 iri->query = s;
597 while (*s && *s != '#') {
598 if (*s == '+')
599 *s = ' ';
600 s++;
601 }
602 c = *s;
603 if (c) *s++ = 0;
604 /* do not unescape query else we get ambiguity for chars like &, =, +, ... */
605 }
606
607 if (c == '#') {
608 iri->fragment = s;
609 s += strlen(s);
610 wget_iri_unescape_inline((char *)iri->fragment);
611 }
612
613 if (*s) {
614 debug_printf("unparsed rest '%s'\n", s);
615 }
616
617 if (*authority) {
618 s = authority;
619 p = strchr(authority, '@');
620 if (p) {
621 iri->userinfo = s;
622 *p = 0;
623 if ((s = strchr(s, ':'))) {
624 *s = 0;
625 iri->password = s + 1;
626 }
627 s = p + 1;
628 }
629 if (*s == '[') {
630 p = strrchr(s, ']');
631 if (p) {
632 iri->host = s + 1;
633 *p = 0;
634 s = p + 1;
635 } else {
636 // something is broken
637 iri->host = s + 1;
638 s += strlen(s);
639 }
640 } else {
641 iri->host = s;
642 while (*s && *s != ':')
643 s++;
644 }
645 if (*s == ':') {
646 if (c_isdigit(s[1])) {
647 int port = atoi(s + 1);
648 if (port > 0 && port < 65536) {
649 iri->port = (uint16_t) port;
650 iri->port_given = true;
651 }
652 }
653 }
654 *s = 0;
655 }
656
657 // now unescape all components (not interested in display, userinfo, password right now)
658
659 if (iri->host) {
660 wget_strtolower((char *)iri->host);
661 if (wget_str_needs_encoding(iri->host)) {
662 if ((s = wget_str_to_utf8(iri->host, encoding))) {
663 iri->host = s;
664 iri->host_allocated = true;
665 }
666 }
667 if ((p = (char *)wget_str_to_ascii(iri->host)) != iri->host) {
668 if (iri->host_allocated)
669 xfree(iri->host);
670 iri->host = p;
671 iri->host_allocated = true;
672 }
673
674 // Finally, if the host is a literal IPv4 or IPv6 address, mark it as so
675 if (wget_ip_is_family(iri->host, WGET_NET_FAMILY_IPV4) || wget_ip_is_family(iri->host, WGET_NET_FAMILY_IPV6))
676 iri->is_ip_address = true;
677 }
678
679 if (!iri->host) {
680 error_printf(_("Missing host/domain in URI '%s'\n"), iri->uri);
681 wget_iri_free(&iri);
682 return NULL;
683 }
684
685 if (iri->path && wget_str_needs_encoding(iri->path)) {
686 if ((s = wget_str_to_utf8(iri->path, encoding))) {
687 iri->path = s;
688 iri->path_allocated = true;
689 }
690 }
691
692 if (iri->query && wget_str_needs_encoding(iri->query)) {
693 if ((s = wget_str_to_utf8(iri->query, encoding))) {
694 iri->query = s;
695 iri->query_allocated = true;
696 }
697 }
698
699 if (iri->fragment && wget_str_needs_encoding(iri->fragment)) {
700 if ((s = wget_str_to_utf8(iri->fragment, encoding))) {
701 iri->fragment = s;
702 iri->fragment_allocated = true;
703 }
704 }
705
706 /*
707 debug_printf("scheme=%s\n",iri->scheme);
708 debug_printf("host=%s\n",iri->host);
709 debug_printf("path=%s\n",iri->path);
710 debug_printf("query=%s\n",iri->query);
711 debug_printf("fragment=%s\n",iri->fragment);
712 */
713
714 return iri;
715 }
716
717 /**
718 * \param[in] iri An IRI
719 * \return A new IRI, with the exact same contents as the provided one.
720 *
721 * Clone the provided IRI.
722 */
wget_iri_clone(const wget_iri * iri)723 wget_iri *wget_iri_clone(const wget_iri *iri)
724 {
725 if (!iri || !iri->uri)
726 return NULL;
727
728 size_t slen = strlen(iri->uri);
729 wget_iri *clone = wget_malloc(sizeof(wget_iri) + (slen + 1) + iri->msize);
730
731 if (!clone)
732 return NULL;
733
734 memcpy(clone, iri, sizeof(wget_iri));
735 clone->uri = memcpy(((char *)clone) + sizeof(wget_iri), iri->uri, slen + 1);
736 memcpy((char *)clone->uri + slen + 1, (char *)iri->uri + slen + 1, iri->msize);
737 clone->uri_allocated = 0;
738
739 clone->connection_part = wget_strdup(iri->connection_part);
740
741 // adjust pointers
742 if (iri->host_allocated)
743 clone->host = wget_strdup(iri->host);
744 else
745 clone->host = iri->host ? (char *)clone + (size_t) (iri->host - (const char *)iri) : NULL;
746
747 clone->display = iri->display ? (char *)clone + (size_t) (iri->display - (const char *)iri): NULL;
748 // not adjust scheme, it is a pointer to a static string
749 clone->userinfo = iri->userinfo ? (char *)clone + (size_t) (iri->userinfo - (const char *)iri): NULL;
750 clone->password = iri->password ? (char *)clone + (size_t) (iri->password - (const char *)iri): NULL;
751
752 if (iri->path_allocated)
753 clone->path = wget_strdup(iri->path);
754 else
755 clone->path = iri->path ? (char *)clone + (size_t) (iri->path - (const char *)iri): NULL;
756
757 if (iri->query_allocated)
758 clone->query = wget_strdup(iri->query);
759 else
760 clone->query = iri->query ? (char *)clone + (size_t) (iri->query - (const char *)iri): NULL;
761
762 if (iri->fragment_allocated)
763 clone->fragment = wget_strdup(iri->fragment);
764 else
765 clone->fragment = iri->fragment ? (char *)clone + (size_t) (iri->fragment - (const char *)iri): NULL;
766
767 return clone;
768 }
769
770 /**
771 * \param[in] iri An IRI
772 * \param[in] buf A buffer, where the resulting string will be put
773 * \return The contents of the buffer \p buf
774 *
775 * Append the connection part of the IRI \p iri to \p buf.
776 *
777 * The connection part is formed by the scheme, the hostname, and optionally the port. For example:
778 *
779 * https://localhost:8080
780 * https://www.example.com
781 *
782 * It may be of the form `https://example.com:8080` if the port was provided when creating the IRI
783 * or of the form `https://example.com` otherwise.
784 */
wget_iri_get_connection_part(const wget_iri * iri,wget_buffer * buf)785 const char *wget_iri_get_connection_part(const wget_iri *iri, wget_buffer *buf)
786 {
787 if (iri) {
788 if (iri->port_given) {
789 wget_buffer_printf_append(buf, "%s://%s:%hu", schemes[iri->scheme].name, iri->host, iri->port);
790 } else {
791 wget_buffer_printf_append(buf, "%s://%s", schemes[iri->scheme].name, iri->host);
792 }
793 }
794
795 return buf->data;
796 }
797
798 // normalize /../ and remove /./
799
normalize_path(char * path)800 static size_t WGET_GCC_NONNULL_ALL normalize_path(char *path)
801 {
802 char *p1 = path, *p2 = path;
803
804 debug_printf("path %s ->\n", path);
805
806 // skip ./ and ../ at the beginning of the path
807 for (;;) {
808 if (*p2 == '/')
809 p2++;
810 else if (*p2 == '.') {
811 if (p2[1] == '/')
812 p2 += 2;
813 else if (p2[1] == '.') {
814 if (p2[2] == '/')
815 p2 += 3;
816 else if (!p2[2])
817 p2 += 2;
818 else
819 break;
820 }
821 else if (!p2[1])
822 p2++;
823 else
824 break;
825 } else
826 break;
827 }
828
829 // normalize path but stop at query or fragment
830 while (*p2 && *p2 != '?' && *p2 != '#') {
831 if (*p2 == '/') {
832 if (p2[1] == '.') {
833 if (!strncmp(p2, "/../", 4)) {
834 // go one level up
835 p2 += 3;
836 while (p1 > path && *--p1 != '/');
837 } else if (!strcmp(p2, "/..")) {
838 p2 += 3;
839 while (p1 > path && *--p1 != '/');
840 if (p1 > path) *p1++='/';
841 } else if (!strncmp(p2, "/./", 3)) {
842 p2 += 2;
843 } else if (!strcmp(p2, "/.")) {
844 p2 += 2;
845 if (p1 > path) *p1++='/';
846 } else
847 *p1++ = *p2++;
848 } else if (p1 == path)
849 p2++; // avoid leading slash
850 else if (p2[1] == '/')
851 p2++; // double slash to single slash
852 else
853 *p1++ = *p2++;
854 } else
855 *p1++ = *p2++;
856 }
857
858 if (p1 != p2) {
859 while (*p2)
860 *p1++ = *p2++;
861
862 *p1 = 0;
863 } else {
864 p1 += strlen(p1);
865 }
866
867 debug_printf(" %s\n", path);
868
869 return p1 - path;
870 }
871
872 // create an absolute URI from a base + relative URI
873
874 //char *iri_relative_to_absolute(IRI *iri, const char *tag, const char *val, size_t len, char *dst, size_t dst_size)
875 /**
876 * \param[in] base A base IRI
877 * \param[in] val A path, or another URI
878 * \param[in] len Length of the string \p val or -1
879 * \param[in] buf Destination buffer, where the result will be copied.
880 * \return A new URI (string) which is based on the base IRI \p base provided, or NULL in case of error.
881 *
882 * Calculates a new URI which is based on the provided IRI \p base.
883 *
884 * Taking the IRI \p base as a starting point, a new URI is created with the path \p val, which may be
885 * a relative or absolute path, or even a whole URI. The result is returned as a string, and if the buffer
886 * \p buf is provided, it is also placed there.
887 *
888 * If \p val is an absolute path (it begins with a `/`), it is normalized first. Then the provided IRI's
889 * path is replaced by that new path. If it's a relative path, the file name of the \p base IRI's path
890 * is replaced by that path. Finally, if \p val begins with a scheme (such as `https://`) that string is returned
891 * untouched, and placed in the buffer if provided.
892 *
893 * If \p base is NULL, then \p val must itself be an absolute URI. Likewise, if \p buf is NULL,
894 * then \p val must also be an absolute URI.
895 *
896 * if \p len is `-1`, the length of \p val will be the result from `strlen(val)`.
897 */
wget_iri_relative_to_abs(const wget_iri * base,const char * val,size_t len,wget_buffer * buf)898 const char *wget_iri_relative_to_abs(const wget_iri *base, const char *val, size_t len, wget_buffer *buf)
899 {
900 debug_printf("*url = %.*s\n", (int)len, val);
901
902 if (len == (size_t) -1)
903 len = strlen(val);
904
905 if (*val == '/') {
906 if (base) {
907 char path[len + 1];
908
909 // strlcpy or snprintf are ineffective here since they do strlen(val), which might be large
910 wget_strscpy(path, val, len + 1);
911
912 if (len >= 2 && val[1] == '/') {
913 char *p;
914
915 // absolute URI without scheme: //authority/path...
916 if ((p = strchr(path + 2, '/')))
917 normalize_path(p + 1);
918
919 wget_buffer_strcpy(buf, schemes[base->scheme].name);
920 wget_buffer_strcat(buf, ":");
921 wget_buffer_strcat(buf, path);
922 debug_printf("*1 %s\n", buf->data);
923 } else {
924 // absolute path
925 normalize_path(path);
926
927 wget_buffer_reset(buf);
928 wget_iri_get_connection_part(base, buf);
929 wget_buffer_strcat(buf, "/");
930 wget_buffer_strcat(buf, path);
931 debug_printf("*2 %s\n", buf->data);
932 }
933 } else {
934 return NULL;
935 }
936 } else {
937 // see if URI begins with a scheme:
938 if (memchr(val, ':', len)) {
939 // absolute URI
940 if (buf) {
941 wget_buffer_memcpy(buf, val, len);
942 debug_printf("*3 %s\n", buf->data);
943 } else {
944 debug_printf("*3 %s\n", val);
945 return val;
946 }
947 } else if (base) {
948 // relative path
949 const char *lastsep = base->path ? strrchr(base->path, '/') : NULL;
950 wget_buffer_reset(buf);
951 wget_iri_get_connection_part(base, buf);
952 wget_buffer_strcat(buf, "/");
953
954 size_t tmp_len = buf->length;
955
956 if (lastsep)
957 wget_buffer_memcat(buf, base->path, lastsep - base->path + 1);
958
959 if (len)
960 wget_buffer_memcat(buf, val, len);
961
962 buf->length = normalize_path(buf->data + tmp_len) + tmp_len;
963
964 debug_printf("*4 %s %zu\n", buf->data, buf->length);
965 } else if (val[len] == 0) {
966 return val;
967 } else {
968 return NULL;
969 }
970 }
971
972 return likely(buf) ? buf->data : NULL;
973 }
974
975 /**
976 * \param[in] base The base IRI
977 * \param[in] url A relative/absolute path (or a URI) to be appended to \p base
978 * \param[in] encoding The encoding of \p url (e.g. "utf-8" or "iso-8859-1")
979 * \return A new IRI
980 *
981 * Generate a new IRI by using the provided IRI \p base as a base and the path \p url.
982 *
983 * This is equivalent to:
984 *
985 * wget_iri *iri = wget_iri_parse(wget_iri_relative_to_abs(base, url, strlen(url), NULL), encoding);
986 * return iri;
987 *
988 * As such, \p url can be a relative or absolute path, or another URI.
989 *
990 * If \p base is NULL, then the parameter \p url must itself be an absolute URI.
991 */
wget_iri_parse_base(const wget_iri * base,const char * url,const char * encoding)992 wget_iri *wget_iri_parse_base(const wget_iri *base, const char *url, const char *encoding)
993 {
994 wget_iri *iri;
995
996 if (base) {
997 wget_buffer buf;
998 char sbuf[256];
999
1000 wget_buffer_init(&buf, sbuf, sizeof(sbuf));
1001 iri = wget_iri_parse(wget_iri_relative_to_abs(base, url, (size_t) -1, &buf), encoding);
1002 wget_buffer_deinit(&buf);
1003 } else {
1004 // no base: just check URL for being an absolute URI
1005 iri = wget_iri_parse(wget_iri_relative_to_abs(NULL, url, (size_t) -1, NULL), encoding);
1006 }
1007
1008 return iri;
1009 }
1010
1011 // RFC conform comparison as described in https://tools.ietf.org/html/rfc2616#section-3.2.3
1012 /**
1013 * \param[in] iri1 An IRI
1014 * \param[in] iri2 Another IRI
1015 * \return 0 if both IRIs are equal according to RFC 2616 or a non-zero value otherwise
1016 *
1017 * Compare two IRIs.
1018 *
1019 * Comparison is performed according to [RFC 2616, sect. 3.2.3](https://tools.ietf.org/html/rfc2616#section-3.2.3).
1020 *
1021 * This function uses wget_strcasecmp() to compare the various parts of the IRIs so a non-zero negative return value
1022 * indicates that \p iri1 is less than \p iri2, whereas a positive value indicates \p iri1 is greater than \p iri2.
1023 */
wget_iri_compare(wget_iri * iri1,wget_iri * iri2)1024 int wget_iri_compare(wget_iri *iri1, wget_iri *iri2)
1025 {
1026 int n;
1027
1028 if (!iri1) {
1029 if (!iri2)
1030 return 0;
1031 else
1032 return -1;
1033 } else if (!iri2)
1034 return 1;
1035
1036 // info_printf("iri %p %p %s:%s %s:%s\n",iri1,iri2,iri1->scheme,iri1->port,iri2->scheme,iri2->port);
1037
1038 /*
1039 if (!iri1->path) {
1040 // if (iri2->path && strcmp(iri2->path, "/"))
1041 if (iri2->path)
1042 return -1;
1043 }
1044 else if (!iri2->path) {
1045 // if (iri1->path && strcmp(iri1->path, "/"))
1046 if (iri1->path)
1047 return 1;
1048 }
1049 */
1050 if ((n = wget_strcasecmp(iri1->path, iri2->path)))
1051 return n;
1052
1053 if ((n = wget_strcasecmp(iri1->query, iri2->query)))
1054 return n;
1055
1056 if (iri1->scheme != iri2->scheme)
1057 return iri1->scheme < iri2->scheme ? -1 : 1;
1058
1059 if ((n = iri1->port - iri2->port))
1060 return n;
1061
1062 // host is already lowercase, no need to call strcasecmp()
1063 if ((n = strcmp(iri1->host, iri2->host)))
1064 return n;
1065
1066 // if ((n = wget_strcasecmp(iri1->fragment, iri2->fragment)))
1067 // return n;
1068
1069 return 0;
1070 }
1071
1072 /**
1073 * \param[in] src A string, whose reserved characters are to be percent-encoded
1074 * \param[in] buf A buffer where the result will be copied.
1075 * \return The contents of the buffer \p buf after \p src has been encoded.
1076 *
1077 * Escapes (using percent-encoding) all the reserved characters in the string \p src.
1078 *
1079 * If \p src is NULL, the contents of the buffer \p buf are returned. \p buf cannot be NULL.
1080 */
wget_iri_escape(const char * src,wget_buffer * buf)1081 const char *wget_iri_escape(const char *src, wget_buffer *buf)
1082 {
1083 const char *begin;
1084
1085 if (!src)
1086 return buf->data;
1087
1088 for (begin = src; *src; src++) {
1089 if (!iri_isunreserved(*src)) {
1090 if (begin != src)
1091 wget_buffer_memcat(buf, begin, src - begin);
1092 begin = src + 1;
1093 wget_buffer_printf_append(buf, "%%%02X", (unsigned char)*src);
1094 }
1095 }
1096
1097 if (begin != src)
1098 wget_buffer_memcat(buf, begin, src - begin);
1099
1100 return buf->data;
1101 }
1102
1103 /**
1104 * \param[in] src A string, whose reserved characters are to be percent-encoded
1105 * \param[in] buf A buffer where the result will be copied.
1106 * \return The contents of the buffer \p buf after \p src has been encoded
1107 * as described in https://datatracker.ietf.org/doc/html/rfc7230#section-5.3.1.
1108 *
1109 * Escapes the path part of the URI suitable for GET/POST requests (origin-form).
1110 * origin-form = absolute-path [ "?" query ]
1111 * path-absolute = "/" [ segment-nz *( "/" segment ) ]
1112 * segment-nz = 1*pchar
1113 * segment = *pchar
1114 * pchar = unreserved / pct-encoded / sub-delims / ":" / "@"
1115 */
wget_iri_escape_path(const char * src,wget_buffer * buf)1116 const char *wget_iri_escape_path(const char *src, wget_buffer *buf)
1117 {
1118 const char *begin;
1119
1120 for (begin = src; *src; src++) {
1121 if (!(iri_isunreserved(*src) || iri_issubdelim(*src) || *src == '/' || *src == ':' || *src == '@')) {
1122 if (begin != src)
1123 wget_buffer_memcat(buf, begin, src - begin);
1124 begin = src + 1;
1125 wget_buffer_printf_append(buf, "%%%02X", (unsigned char)*src);
1126 }
1127 }
1128
1129 if (begin != src)
1130 wget_buffer_memcat(buf, begin, src - begin);
1131
1132 return buf->data;
1133 }
1134
1135 /**
1136 * \param[in] src A string, whose reserved characters are to be percent-encoded
1137 * \param[in] buf A buffer where the result will be copied.
1138 * \return The contents of the buffer \p buf after \p src has been encoded.
1139 *
1140 * Escapes (using percent-encoding) all the reserved characters in the string \p src
1141 * (just like wget_iri_escape()), but **excluding the equal sign `=` and the ampersand `&`**.
1142 * This function is thus ideally suited for query parts of URIs.
1143 */
wget_iri_escape_query(const char * src,wget_buffer * buf)1144 const char *wget_iri_escape_query(const char *src, wget_buffer *buf)
1145 {
1146 const char *begin;
1147
1148 for (begin = src; *src; src++) {
1149 if (!iri_isunreserved(*src) && *src != '=' && *src != '&') {
1150 if (begin != src)
1151 wget_buffer_memcat(buf, begin, src - begin);
1152 begin = src + 1;
1153 if (*src == ' ')
1154 wget_buffer_memcat(buf, "+", 1);
1155 else
1156 wget_buffer_printf_append(buf, "%%%02X", (unsigned char)*src);
1157 }
1158 }
1159
1160 if (begin != src)
1161 wget_buffer_memcat(buf, begin, src - begin);
1162
1163 return buf->data;
1164 }
1165
1166 /**
1167 * \param[in] iri An IRI
1168 * \param[in] buf A buffer, where the resulting string will be put
1169 * \return The contents of the buffer \p buf
1170 *
1171 * Return the host part of the provided IRI. It is placed in the buffer \p buf
1172 * and also returned as a `const char *`.
1173 *
1174 * The host is escaped using wget_iri_escape().
1175 */
wget_iri_get_escaped_host(const wget_iri * iri,wget_buffer * buf)1176 const char *wget_iri_get_escaped_host(const wget_iri *iri, wget_buffer *buf)
1177 {
1178 return wget_iri_escape(iri->host, buf);
1179 }
1180
1181 /**
1182 * \param[in] iri An IRI
1183 * \param[in] buf A buffer, where the resulting string will be put
1184 * \return The contents of the buffer \p buf
1185 *
1186 * Return the resource string, suitable for use in HTTP requests.
1187 * Details:
1188 * https://datatracker.ietf.org/doc/html/rfc7230#section-3.1.1
1189 * https://datatracker.ietf.org/doc/html/rfc7230#section-2.7
1190 * https://datatracker.ietf.org/doc/html/rfc3986#section-3.3
1191 *
1192 * The resource string is comprised of the path, plus the query part, if present. Example:
1193 *
1194 * /foo/bar/?param_1=one¶m_2=two
1195 *
1196 * Both the path and the query are escaped using wget_iri_escape_path() and
1197 * wget_iri_escape_query(), respectively.
1198 *
1199 * The resulting string is placed in the buffer \p buf and also returned as a `const char *`.
1200 */
wget_iri_get_escaped_resource(const wget_iri * iri,wget_buffer * buf)1201 const char *wget_iri_get_escaped_resource(const wget_iri *iri, wget_buffer *buf)
1202 {
1203 if (iri->path)
1204 wget_iri_escape_path(iri->path, buf);
1205
1206 // Do not actually escape the query field. This part of the URL *MAY*
1207 // contain reserved characters which should be passed on as-is and without
1208 // escaping them. This is according to the rules laid out in RFC 2616 and
1209 // RFC 7230. But we have to replace spaces in any case.
1210 if (iri->query) {
1211 wget_buffer_memcat(buf, "?", 1);
1212 for (const char *p = iri->query; *p; p++)
1213 if (*p == ' ')
1214 wget_buffer_memcat(buf, "%20", 3);
1215 else
1216 wget_buffer_memcat(buf, p, 1);
1217 }
1218
1219 return buf->data;
1220 }
1221
1222 /**
1223 * \param[in] iri An IRI
1224 * \param[in] buf A buffer, where the resulting string will be put
1225 * \param[in] encoding Character set the string should be converted to
1226 * \return The contents of the buffer \p buf
1227 *
1228 * Get the path part of the provided IRI.
1229 *
1230 * The path is appended to \p buf. If \p buf is non-empty and does not end with
1231 * a path separator (`/`), then one is added before the path is appended to \p
1232 * buf.
1233 *
1234 * If \p encoding is provided, this function will try to convert the path (which is originally
1235 * in UTF-8) to that encoding.
1236 */
1237
wget_iri_get_path(const wget_iri * iri,wget_buffer * buf,const char * encoding)1238 char *wget_iri_get_path(const wget_iri *iri, wget_buffer *buf, const char *encoding)
1239 {
1240 if (buf->length != 0 && buf->data[buf->length - 1] != '/')
1241 wget_buffer_memcat(buf, "/", 1);
1242
1243 if (iri->path) {
1244 if (wget_strcasecmp_ascii(encoding, "utf-8")) {
1245 char *fname;
1246
1247 if ((fname = wget_utf8_to_str(iri->path, encoding))) {
1248 wget_buffer_strcat(buf, fname);
1249 xfree(fname);
1250 } else {
1251 // conversion failed, keep original string
1252 wget_buffer_strcat(buf, iri->path);
1253 }
1254 } else {
1255 wget_buffer_strcat(buf, iri->path);
1256 }
1257 }
1258
1259 if ((buf->length == 0 || buf->data[buf->length - 1] == '/') && default_page)
1260 wget_buffer_memcat(buf, default_page, default_page_length);
1261
1262 return buf->data;
1263 }
1264
1265 /**
1266 * \param[in] iri An IRI
1267 * \param[in] buf A buffer, where the resulting string will be put
1268 * \param[in] encoding Character set the string should be converted to
1269 * \return The contents of the buffer \p buf
1270 *
1271 * Take the query part, and escape the path separators (`/`), so that it can be used as part
1272 * of a filename.
1273 *
1274 * The resulting string will be placed in the buffer \p buf and also returned as a `const char *`.
1275 * If the provided IRI has no query part, then the original contents of \p buf are returned and \p buf
1276 * is kept untouched.
1277 *
1278 * If \p encoding is provided, this function will try to convert the query (which is originally
1279 * in UTF-8) to that encoding.
1280 */
wget_iri_get_query_as_filename(const wget_iri * iri,wget_buffer * buf,const char * encoding)1281 char *wget_iri_get_query_as_filename(const wget_iri *iri, wget_buffer *buf, const char *encoding)
1282 {
1283 if (iri->query) {
1284 const char *query;
1285 int allocated = 0;
1286
1287 wget_buffer_memcat(buf, "?", 1);
1288
1289 if (wget_strcasecmp_ascii(encoding, "utf-8")) {
1290 if ((query = wget_utf8_to_str(iri->query, encoding)))
1291 allocated = 1;
1292 else
1293 query = iri->query;
1294 } else {
1295 query = iri->query;
1296 }
1297
1298 int slashes = 0;
1299 const char *src = query;
1300
1301 // count slashes in query string
1302 while ((src = strchr(src, '/'))) {
1303 slashes++;
1304 src++;
1305 }
1306
1307 if (slashes) {
1308 // escape slashes to use query as part of a filename
1309 const char *begin;
1310
1311 for (src = begin = query; *src; src++) {
1312 if (*src == '/') {
1313 if (begin != src)
1314 wget_buffer_memcat(buf, begin, src - begin);
1315 begin = src + 1;
1316 wget_buffer_memcat(buf, "%2F", 3);
1317 }
1318 }
1319
1320 if (begin != src)
1321 wget_buffer_memcat(buf, begin, src - begin);
1322 } else {
1323 wget_buffer_strcat(buf, query);
1324 }
1325
1326 if (allocated)
1327 xfree(query);
1328 }
1329
1330 return buf->data;
1331 }
1332
1333 /**
1334 * \param[in] iri An IRI
1335 * \param[in] buf A buffer, where the resulting string will be put
1336 * \param[in] encoding Character set the string should be converted to
1337 * \return The contents of the buffer \p buf
1338 *
1339 * Get the filename of the path of the provided IRI.
1340 *
1341 * This is similar to wget_iri_get_path(), but instead of returning the whole path
1342 * it only returns the substring after the last occurrence of `/`. In other words, the
1343 * filename of the path.
1344 *
1345 * This is also known as the "basename" in the UNIX world, and the output of this function
1346 * would be equivalent to the output of the `basename(1)` tool.
1347 *
1348 * The path is copied into \p buf if it's empty. If the buffer \p buf is not empty,
1349 * it is appended to it after a path separator (`/`).
1350 *
1351 * If \p encoding is provided, this function will try to convert the path (which is originally
1352 * in UTF-8) to that encoding.
1353 */
wget_iri_get_basename(const wget_iri * iri,wget_buffer * buf,const char * encoding,int flags)1354 char *wget_iri_get_basename(const wget_iri *iri, wget_buffer *buf, const char *encoding, int flags)
1355 {
1356 if (iri->path) {
1357 char *fname;
1358
1359 if (wget_strcasecmp_ascii(encoding, "utf-8")) {
1360 char *p;
1361
1362 if ((p = strrchr(iri->path, '/'))) {
1363 if (!(fname = wget_utf8_to_str(p + 1, encoding)))
1364 wget_buffer_strcat(buf, p + 1); // conversion failed, keep original string
1365 } else {
1366 if (!(fname = wget_utf8_to_str(iri->path, encoding)))
1367 wget_buffer_strcat(buf, iri->path); // conversion failed, keep original string
1368 }
1369
1370 if (fname) {
1371 // conversion succeeded
1372 wget_buffer_strcat(buf, fname);
1373 xfree(fname);
1374 }
1375 } else {
1376 if ((fname = strrchr(iri->path, '/')))
1377 wget_buffer_strcat(buf, fname + 1);
1378 else
1379 wget_buffer_strcat(buf, iri->path);
1380 }
1381 }
1382
1383 if ((buf->length == 0 || buf->data[buf->length - 1] == '/') && default_page)
1384 wget_buffer_memcat(buf, default_page, default_page_length);
1385
1386 if (flags & WGET_IRI_WITH_QUERY)
1387 return wget_iri_get_query_as_filename(iri, buf, encoding);
1388
1389 return buf->data;
1390 }
1391
1392 // escaping: see https://tools.ietf.org/html/rfc2396#2 following (especially 2.4.2)
1393 /*const char *iri_escape(const char *uri)
1394 {
1395 int esc = 0;
1396 const char *p;
1397
1398 for (p = uri; *p; p++) {
1399 if (*p == '%') {
1400 if ((isxdigit(p[1]) && isxdigit(p[2])) || p[1] == '%')
1401 return uri; // assume that URI is already escaped
1402 esc++;
1403 } else if ()
1404 }
1405 }
1406 */
1407
wget_iri_set_defaultpage(const char * page)1408 void wget_iri_set_defaultpage(const char *page)
1409 {
1410 default_page = page;
1411 default_page_length = default_page ? strlen(default_page) : 0;
1412 }
1413
1414 /**
1415 * \param scheme The scheme for the new default port
1416 * \param port The new default port value for the given scheme
1417 * \return 0: success -1: Unknown scheme
1418 *
1419 * Set the default \p port for the given \p scheme.
1420 */
wget_iri_set_defaultport(wget_iri_scheme scheme,uint16_t port)1421 int wget_iri_set_defaultport(wget_iri_scheme scheme, uint16_t port)
1422 {
1423 if ((unsigned) scheme < countof(schemes)) {
1424 schemes[scheme].port = port;
1425 return 0;
1426 }
1427
1428 return -1;
1429 }
1430
1431 /**
1432 * \param[in] iri An IRI
1433 * \param[in] scheme A scheme, such as `http` or `https`.
1434 * \return The original scheme of IRI (ie. before the replacement)
1435 *
1436 * Set the scheme of the provided IRI. The IRI's original scheme
1437 * is replaced by the new one.
1438 *
1439 * If the IRI was using a default port (such as 80 for HTTP or 443 for HTTPS)
1440 * that port is modified as well to match the default port of the new scheme.
1441 * Otherwise the port is left untouched.
1442 */
wget_iri_set_scheme(wget_iri * iri,wget_iri_scheme scheme)1443 wget_iri_scheme wget_iri_set_scheme(wget_iri *iri, wget_iri_scheme scheme)
1444 {
1445 wget_iri_scheme old_scheme = iri->scheme;
1446
1447 if ((unsigned) scheme < countof(schemes) && iri->scheme != scheme) {
1448 iri->scheme = scheme;
1449
1450 // If the IRI is using the default port, also change it
1451 if (iri->port == schemes[old_scheme].port)
1452 iri->port = schemes[scheme].port;
1453
1454 size_t old_scheme_len = strlen(schemes[old_scheme].name);
1455
1456 if (strncmp(iri->uri, schemes[old_scheme].name, old_scheme_len) == 0 && iri->uri[old_scheme_len] == ':') {
1457 char *new_uri = wget_aprintf("%s%s", schemes[iri->scheme].name, iri->uri + old_scheme_len);
1458 if (iri->uri_allocated)
1459 xfree(iri->uri);
1460 iri->uri = new_uri;
1461 iri->uri_allocated = true;
1462 }
1463 }
1464
1465 return old_scheme;
1466 }
1467
1468 /** @} */
1469