1 /* URL handling.
2 Copyright (C) 1996-2011, 2015, 2018-2021 Free Software Foundation,
3 Inc.
4
5 This file is part of GNU Wget.
6
7 GNU Wget is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3 of the License, or (at
10 your option) any later version.
11
12 GNU Wget is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with Wget. If not, see <http://www.gnu.org/licenses/>.
19
20 Additional permission under GNU GPL version 3 section 7
21
22 If you modify this program, or any covered work, by linking or
23 combining it with the OpenSSL project's OpenSSL library (or a
24 modified version of that library), containing parts covered by the
25 terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
26 grants you additional permission to convey the resulting work.
27 Corresponding Source for a non-source form of such a combination
28 shall include the source code for the parts of OpenSSL used as well
29 as that of the covered work. */
30
31 #include "wget.h"
32
33 #include <stdio.h>
34 #include <stdlib.h>
35 #include <string.h>
36 #include <unistd.h>
37 #include <errno.h>
38 #include <assert.h>
39
40 #include "utils.h"
41 #include "url.h"
42 #include "host.h" /* for is_valid_ipv6_address */
43 #include "c-strcase.h"
44
45 #ifdef HAVE_ICONV
46 # include <iconv.h>
47 #endif
48 #include <langinfo.h>
49
50 #ifdef __VMS
51 #include "vms.h"
52 #endif /* def __VMS */
53
54 #ifdef TESTING
55 #include "../tests/unit-tests.h"
56 #endif
57
/* Bit flags stored in the `flags' member of struct scheme_data.  They
   may be OR-ed together.  */
enum {
  scm_disabled = 1,             /* for https when OpenSSL fails to init;
                                   set by scheme_disable, and makes
                                   url_scheme report SCHEME_INVALID. */
  scm_has_params = 2,           /* whether scheme has ;params */
  scm_has_query = 4,            /* whether scheme has ?query */
  scm_has_fragment = 8          /* whether scheme has #fragment */
};
64
/* Static description of one URL scheme; see supported_schemes for the
   instances.  */
struct scheme_data
{
  /* Short name of the scheme, such as "http" or "ftp". */
  const char *name;
  /* Leading string that identifies the scheme, such as "https://".
     url_scheme matches it case-insensitively against the start of a
     URL.  */
  const char *leading_string;
  /* Default port of the scheme when none is specified. */
  int default_port;
  /* Various flags: a combination of the scm_* bits. */
  int flags;
};
76
/* Supported schemes.

   url_scheme returns the index of the matching entry cast to enum
   url_scheme, and the scheme_* accessors index this array with an
   enum url_scheme value, so the entries are expected to appear in the
   same order as the enumerators (declared outside this file --
   confirm when adding a scheme).  The array is not const because
   scheme_disable modifies the flags at run time.  */
static struct scheme_data supported_schemes[] =
{
  { "http",     "http://",  DEFAULT_HTTP_PORT,  scm_has_query|scm_has_fragment },
#ifdef HAVE_SSL
  { "https",    "https://", DEFAULT_HTTPS_PORT, scm_has_query|scm_has_fragment },
#endif
  { "ftp",      "ftp://",   DEFAULT_FTP_PORT,   scm_has_params|scm_has_fragment },
#ifdef HAVE_SSL
  /*
   * Explicit FTPS uses the same port as FTP.
   * Implicit FTPS has its own port (990), but it is disabled by default.
   */
  { "ftps",     "ftps://",  DEFAULT_FTP_PORT,   scm_has_params|scm_has_fragment },
#endif

  /* SCHEME_INVALID.  The NULL leading_string also terminates the
     search loop in url_scheme.  */
  { NULL,       NULL,       -1,                 0 }
};
96
97 /* Forward declarations: */
98
99 static bool path_simplify (enum url_scheme, char *);
100
101 /* Support for escaping and unescaping of URL strings. */
102
103 /* Table of "reserved" and "unsafe" characters. Those terms are
104 rfc1738-speak, as such largely obsoleted by rfc2396 and later
105 specs, but the general idea remains.
106
107 A reserved character is the one that you can't decode without
108 changing the meaning of the URL. For example, you can't decode
109 "/foo/%2f/bar" into "/foo///bar" because the number and contents of
110 path components is different. Non-reserved characters can be
111 changed, so "/foo/%78/bar" is safe to change to "/foo/x/bar". The
112 unsafe characters are loosely based on rfc1738, plus "$" and ",",
113 as recommended by rfc2396, and minus "~", which is very frequently
114 used (and sometimes unrecognized as %7E by broken servers).
115
116 An unsafe character is the one that should be encoded when URLs are
117 placed in foreign environments. E.g. space and newline are unsafe
118 in HTTP contexts because HTTP uses them as separator and line
119 terminator, so they must be encoded to %20 and %0A respectively.
120 "*" is unsafe in shell context, etc.
121
122 We determine whether a character is unsafe through static table
123 lookup. This code assumes ASCII character set and 8-bit chars. */
124
/* Classification bits used as the entries of urlchr_table and as the
   MASK argument of urlchr_test.  */
enum {
  /* rfc1738 reserved chars + "$" and ",".  */
  urlchr_reserved = 1,

  /* rfc1738 unsafe chars, plus non-printables.  */
  urlchr_unsafe   = 2
};
132
/* Nonzero if character C has any of the bits in MASK set in
   urlchr_table.  C is converted to unsigned char first so that it can
   be used as an index regardless of the signedness of plain char.  */
#define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))
/* Nonzero if C is marked reserved in urlchr_table.  */
#define URL_RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
/* Nonzero if C is marked unsafe in urlchr_table.  */
#define URL_UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)
136
/* Shorthands for the table.  They are only meant for the initializer
   below and are #undef'ed right after it.  */
#define R  urlchr_reserved
#define U  urlchr_unsafe
#define RU R|U

/* Classification of every possible unsigned char value.  The first
   128 entries follow the ASCII order shown in the row comments; the
   upper 128 values are all marked unsafe.  */
static const unsigned char urlchr_table[256] =
{
  U,  U,  U,  U,   U,  U,  U,  U,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
  U,  U,  U,  U,   U,  U,  U,  U,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
  U,  U,  U,  U,   U,  U,  U,  U,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
  U,  U,  U,  U,   U,  U,  U,  U,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
  U,  0,  U, RU,   R,  U,  R,  0,   /* SP  !   "   #    $   %   &   '   */
  0,  0,  0,  R,   R,  0,  0,  R,   /* (   )   *   +    ,   -   .   /   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
  0,  0, RU,  R,   U,  R,  U,  R,   /* 8   9   :   ;    <   =   >   ?   */
 RU,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
  0,  0,  0, RU,   U, RU,  U,  0,   /* X   Y   Z   [    \   ]   ^   _   */
  U,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
  0,  0,  0,  U,   U,  U,  0,  U,   /* x   y   z   {    |   }   ~   DEL */

  /* 0x80 - 0xBF */
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,

  /* 0xC0 - 0xFF */
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
};
#undef R
#undef U
#undef RU
174
/* Core of the url_unescape* functions.  Decode, in place, every "%HH"
   sequence in S whose decoded value is neither NUL nor flagged with
   any bit of MASK in urlchr_table.  Everything else, including a '%'
   that is not followed by two hex digits, is kept literally.  */
static void
url_unescape_1 (char *s, unsigned char mask)
{
  unsigned char *src = (unsigned char *) s;     /* read position */
  unsigned char *dst = (unsigned char *) s;     /* write position */

  while (*src)
    {
      unsigned char out = *src;
      int consumed = 1;

      if (*src == '%' && src[1] && src[2]
          && c_isxdigit (src[1]) && c_isxdigit (src[2]))
        {
          unsigned char decoded = X2DIGITS_TO_NUM (src[1], src[2]);

          /* %00 stays encoded because a NUL would effectively
             truncate the C string; characters selected by MASK stay
             encoded by request of the caller.  */
          if (decoded != '\0' && !urlchr_test (decoded, mask))
            {
              out = decoded;
              consumed = 3;
            }
        }

      *dst++ = out;
      src += consumed;
    }
  *dst = '\0';
}
207
/* URL-unescape the string S in place.

   Every "%HH" sequence is replaced by the character whose code is
   given by the hexadecimal digits HH.  A '%' that is not followed by
   two hexadecimal digits is left as it is, and so is "%00".

   S itself is modified; callers that still need the original must
   copy it before calling this function.  */
void
url_unescape (char *s)
{
  /* An empty mask means no character class is exempt from decoding.  */
  const unsigned char keep_encoded = 0;

  url_unescape_1 (s, keep_encoded);
}
221
222 /* URL-unescape the string S.
223
224 This functions behaves identically as url_unescape(), but does not
225 convert characters from "reserved". In other words, it only converts
226 "unsafe" characters. */
227 void
url_unescape_except_reserved(char * s)228 url_unescape_except_reserved (char *s)
229 {
230 url_unescape_1 (s, urlchr_reserved);
231 }
232
/* The core of url_escape_* functions.  Replaces every character of S
   that matches MASK in urlchr_table with its "%XX" form.

   When nothing in S matches MASK, the result depends on
   ALLOW_PASSTHROUGH: if true, S itself is returned (cast to non-const);
   if false, a fresh copy of S is returned.  When at least one
   character matches, the result is always freshly allocated.  */

static char *
url_escape_1 (const char *s, unsigned char mask, bool allow_passthrough)
{
  const char *src;
  char *result, *dst;
  int escaped_len;
  int extra = 0;

  /* Each escaped character grows by two bytes (the hex digits).  */
  for (src = s; *src; src++)
    if (urlchr_test (*src, mask))
      extra += 2;

  if (extra == 0)
    {
      if (allow_passthrough)
        return (char *) s;
      return xstrdup (s);
    }

  /* SRC now points at the terminating NUL, so SRC - S is the length.  */
  escaped_len = (src - s) + extra;
  result = xmalloc (escaped_len + 1);

  dst = result;
  for (src = s; *src; src++)
    {
      unsigned char c = *src;

      if (!urlchr_test (c, mask))
        {
          *dst++ = *src;
          continue;
        }

      /* Quote the character that matches the test mask. */
      *dst++ = '%';
      *dst++ = XNUM_TO_DIGIT (c >> 4);
      *dst++ = XNUM_TO_DIGIT (c & 0xf);
    }
  assert (dst - result == escaped_len);
  *dst = '\0';

  return result;
}
278
279 /* URL-escape the unsafe characters (see urlchr_table) in a given
280 string, returning a freshly allocated string. */
281
282 char *
url_escape(const char * s)283 url_escape (const char *s)
284 {
285 return url_escape_1 (s, urlchr_unsafe, false);
286 }
287
288 /* URL-escape the unsafe and reserved characters (see urlchr_table) in
289 a given string, returning a freshly allocated string. */
290
291 char *
url_escape_unsafe_and_reserved(const char * s)292 url_escape_unsafe_and_reserved (const char *s)
293 {
294 return url_escape_1 (s, urlchr_unsafe|urlchr_reserved, false);
295 }
296
297 /* URL-escape the unsafe characters (see urlchr_table) in a given
298 string. If no characters are unsafe, S is returned. */
299
300 static char *
url_escape_allow_passthrough(const char * s)301 url_escape_allow_passthrough (const char *s)
302 {
303 return url_escape_1 (s, urlchr_unsafe, true);
304 }
305
/* Tell whether the character at position P has to be encoded as %XX.
   A pointer is taken rather than a single char because the decision
   for '%' depends on the two characters that follow it.

   Returns true when *P is a '%' that does not start a valid %HH
   sequence, or when *P is unsafe but not reserved; false otherwise.  */

static inline bool
char_needs_escaping (const char *p)
{
  if (*p != '%')
    return URL_UNSAFE_CHAR (*p) && !URL_RESERVED_CHAR (*p);

  /* A '%' is fine only as the start of a well-formed %HH escape;
     a garbled %.. sequence gets its `%' encoded.  The && stops at a
     terminating NUL, so P[2] is never read past the end.  */
  return !(c_isxdigit (p[1]) && c_isxdigit (p[2]));
}
328
329 /* Translate a %-escaped (but possibly non-conformant) input string S
330 into a %-escaped (and conformant) output string. If no characters
331 are encoded or decoded, return the same string S; otherwise, return
332 a freshly allocated string with the new contents.
333
334 After a URL has been run through this function, the protocols that
335 use `%' as the quote character can use the resulting string as-is,
336 while those that don't can use url_unescape to get to the intended
337 data. This function is stable: once the input is transformed,
338 further transformations of the result yield the same output.
339
340 Let's discuss why this function is needed.
341
342 Imagine Wget is asked to retrieve `http://abc.xyz/abc def'. Since
343 a raw space character would mess up the HTTP request, it needs to
344 be quoted, like this:
345
346 GET /abc%20def HTTP/1.0
347
348 It would appear that the unsafe chars need to be quoted, for
349 example with url_escape. But what if we're requested to download
350 `abc%20def'? url_escape transforms "%" to "%25", which would leave
351 us with `abc%2520def'. This is incorrect -- since %-escapes are
352 part of URL syntax, "%20" is the correct way to denote a literal
353 space on the Wget command line. This leads to the conclusion that
354 in that case Wget should not call url_escape, but leave the `%20'
355 as is. This is clearly contradictory, but it only gets worse.
356
357 What if the requested URI is `abc%20 def'? If we call url_escape,
358 we end up with `/abc%2520%20def', which is almost certainly not
359 intended. If we don't call url_escape, we are left with the
360 embedded space and cannot complete the request. What the user
361 meant was for Wget to request `/abc%20%20def', and this is where
362 reencode_escapes kicks in.
363
364 Wget used to solve this by first decoding %-quotes, and then
365 encoding all the "unsafe" characters found in the resulting string.
366 This was wrong because it didn't preserve certain URL special
367 (reserved) characters. For instance, URI containing "a%2B+b" (0x2b
368 == '+') would get translated to "a%2B%2Bb" or "a++b" depending on
369 whether we considered `+' reserved (it is). One of these results
370 is inevitable because by the second step we would lose information
371 on whether the `+' was originally encoded or not. Both results
372 were wrong because in CGI parameters + means space, while %2B means
373 literal plus. reencode_escapes correctly translates the above to
374 "a%2B+b", i.e. returns the original string.
375
376 This function uses a modified version of the algorithm originally
377 proposed by Anon Sricharoenchai:
378
379 * Encode all "unsafe" characters, except those that are also
380 "reserved", to %XX. See urlchr_table for which characters are
381 unsafe and reserved.
382
383 * Encode the "%" characters not followed by two hex digits to
384 "%25".
385
386 * Pass through all other characters and %XX escapes as-is. (Up to
387 Wget 1.10 this decoded %XX escapes corresponding to "safe"
388 characters, but that was obtrusive and broke some servers.)
389
390 Anon's test case:
391
392 "http://abc.xyz/%20%3F%%36%31%25aa% a?a=%61+a%2Ba&b=b%26c%3Dc"
393 ->
394 "http://abc.xyz/%20%3F%25%36%31%25aa%25%20a?a=%61+a%2Ba&b=b%26c%3Dc"
395
396 Simpler test cases:
397
398 "foo bar" -> "foo%20bar"
399 "foo%20bar" -> "foo%20bar"
400 "foo %20bar" -> "foo%20%20bar"
401 "foo%%20bar" -> "foo%25%20bar" (0x25 == '%')
402 "foo%25%20bar" -> "foo%25%20bar"
403 "foo%2%20bar" -> "foo%252%20bar"
404 "foo+bar" -> "foo+bar" (plus is reserved!)
405 "foo%2b+bar" -> "foo%2b+bar" */
406
static char *
reencode_escapes (const char *s)
{
  const char *src;
  char *result, *dst;
  int srclen, dstlen;
  int to_encode = 0;

  /* Pass 1: count the characters that char_needs_escaping selects.  */
  for (src = s; *src; src++)
    if (char_needs_escaping (src))
      to_encode++;

  /* Nothing to do: hand back the caller's own string.  Callers tell
     the two cases apart by comparing the result with S.  */
  if (to_encode == 0)
    return (char *) s;

  /* SRC stopped at the terminating NUL.  Every encoded character
     becomes three bytes, i.e. two more than before.  */
  srclen = src - s;
  dstlen = srclen + 2 * to_encode;
  result = xmalloc (dstlen + 1);

  /* Pass 2: copy, expanding the selected characters to %XX.  */
  dst = result;
  for (src = s; *src; src++)
    {
      if (char_needs_escaping (src))
        {
          unsigned char c = *src;
          *dst++ = '%';
          *dst++ = XNUM_TO_DIGIT (c >> 4);
          *dst++ = XNUM_TO_DIGIT (c & 0xf);
        }
      else
        *dst++ = *src;
    }

  *dst = '\0';
  assert (dst - result == dstlen);
  return result;
}
451
452 /* Returns the scheme type if the scheme is supported, or
453 SCHEME_INVALID if not. */
454
455 enum url_scheme
url_scheme(const char * url)456 url_scheme (const char *url)
457 {
458 int i;
459
460 for (i = 0; supported_schemes[i].leading_string; i++)
461 if (0 == c_strncasecmp (url, supported_schemes[i].leading_string,
462 strlen (supported_schemes[i].leading_string)))
463 {
464 if (!(supported_schemes[i].flags & scm_disabled))
465 return (enum url_scheme) i;
466 else
467 return SCHEME_INVALID;
468 }
469
470 return SCHEME_INVALID;
471 }
472
#define SCHEME_CHAR(ch) (c_isalnum (ch) || (ch) == '-' || (ch) == '+')

/* Return true if URL begins with something that looks like a scheme,
   false otherwise.  As currently implemented, that means URL begins
   with [-+a-zA-Z0-9]+: .  The scheme need not be a supported one.  */

bool
url_has_scheme (const char *url)
{
  const char *p = url;

  /* At least one scheme char is required up front.  */
  if (*p == '\0' || !SCHEME_CHAR (*p))
    return false;

  /* Skip the remaining scheme chars, if any.  */
  for (++p; *p != '\0' && SCHEME_CHAR (*p); ++p)
    continue;

  /* The run of scheme chars must end in ':'.  */
  return *p == ':';
}
494
495 bool
url_valid_scheme(const char * url)496 url_valid_scheme (const char *url)
497 {
498 enum url_scheme scheme = url_scheme (url);
499 return scheme != SCHEME_INVALID;
500 }
501
502 int
scheme_default_port(enum url_scheme scheme)503 scheme_default_port (enum url_scheme scheme)
504 {
505 return supported_schemes[scheme].default_port;
506 }
507
508 void
scheme_disable(enum url_scheme scheme)509 scheme_disable (enum url_scheme scheme)
510 {
511 supported_schemes[scheme].flags |= scm_disabled;
512 }
513
514 const char *
scheme_leading_string(enum url_scheme scheme)515 scheme_leading_string (enum url_scheme scheme)
516 {
517 return supported_schemes[scheme].leading_string;
518 }
519
/* Skip the username and password, if present in the URL.  The
   function should *not* be called with the complete URL, but with the
   portion after the scheme.

   The userinfo part is recognized according to rfc2396:

     server    = [ [ userinfo "@" ] hostport ]
     userinfo  = *( unreserved | escaped |
                    ";" | ":" | "&" | "=" | "+" | "$" | "," )
     unreserved = alphanum | mark
     mark      = "-" | "_" | "." | "!" | "~" | "*" | "'" | "(" | ")"

   In particular ';' may appear inside the userinfo, so it must not be
   treated as a terminator: doing so would make "user;x@host" look
   like it has no credentials and would let the text in front of the
   '@' be taken for the host name.

   Return a pointer just past the '@' that ends the userinfo.  If no
   username and password are found, return URL.  */

static const char *
url_skip_credentials (const char *url)
{
  /* Characters that may appear literally in userinfo.  */
  static const char userinfo_chars[] =
    "abcdefghijklmnopqrstuvwxyz"
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "0123456789"
    "-_.!~*'()"                 /* mark */
    ";:&=+$,";                  /* userinfo extras */
  static const char hex_digits[] = "0123456789abcdefABCDEF";
  const char *p = url;

  for (;;)
    {
      /* Skip a run of literal userinfo characters.  */
      p += strspn (p, userinfo_chars);

      /* The only other thing allowed is a %HH escape.  strspn stops
         at the terminating NUL, so this never reads past the end.  */
      if (*p == '%' && strspn (p + 1, hex_digits) >= 2)
        p += 3;
      else
        break;
    }

  /* Credentials are present only if the userinfo run ends in '@'.
     Anything else ('/', '?', '#', end of string, or a character that
     is not valid in userinfo) means there are none.  */
  return *p == '@' ? p + 1 : url;
}
536
/* Parse credentials contained in [BEG, END).  The region is expected
   to have come from a URL; the extracted parts are URL-unescaped.

   On success, store a freshly allocated user name in *USER and a
   freshly allocated password (or NULL when the region has no ':') in
   *PASSWD, and return true.  Return false, without touching *USER or
   *PASSWD, when the user name would be empty.  */

static bool
parse_credentials (const char *beg, const char *end, char **user, char **passwd)
{
  const char *user_end = end;
  char *sep;

  if (beg == end)
    return false;               /* empty user name */

  sep = memchr (beg, ':', end - beg);
  if (sep == beg)
    return false;               /* again empty user name */

  if (sep == NULL)
    *passwd = NULL;
  else
    {
      /* Everything after the first ':' is the password.  */
      user_end = sep;
      *passwd = strdupdelim (sep + 1, end);
      url_unescape (*passwd);
    }

  *user = strdupdelim (beg, user_end);
  url_unescape (*user);
  return true;
}
568
569 /* Used by main.c: detect URLs written using the "shorthand" URL forms
570 originally popularized by Netscape and NcFTP. HTTP shorthands look
571 like this:
572
573 www.foo.com[:port]/dir/file -> http://www.foo.com[:port]/dir/file
574 www.foo.com[:port] -> http://www.foo.com[:port]
575
576 FTP shorthands look like this:
577
578 foo.bar.com:dir/file -> ftp://foo.bar.com/dir/file
579 foo.bar.com:/absdir/file -> ftp://foo.bar.com//absdir/file
580
581 If the URL needs not or cannot be rewritten, return NULL. */
582
583 char *
rewrite_shorthand_url(const char * url)584 rewrite_shorthand_url (const char *url)
585 {
586 const char *p;
587 char *ret;
588
589 if (url_scheme (url) != SCHEME_INVALID)
590 return NULL;
591
592 /* Look for a ':' or '/'. The former signifies NcFTP syntax, the
593 latter Netscape. */
594 p = strpbrk (url, ":/");
595 if (p == url)
596 return NULL;
597
598 /* If we're looking at "://", it means the URL uses a scheme we
599 don't support, which may include "https" when compiled without
600 SSL support. Don't bogusly rewrite such URLs. */
601 if (p && p[0] == ':' && p[1] == '/' && p[2] == '/')
602 return NULL;
603
604 if (p && *p == ':')
605 {
606 /* Colon indicates ftp, as in foo.bar.com:path. Check for
607 special case of http port number ("localhost:10000"). */
608 int digits = strspn (p + 1, "0123456789");
609 if (digits && (p[1 + digits] == '/' || p[1 + digits] == '\0'))
610 goto http;
611
612 /* Turn "foo.bar.com:path" to "ftp://foo.bar.com/path". */
613 if ((ret = aprintf ("ftp://%s", url)) != NULL)
614 ret[6 + (p - url)] = '/';
615 }
616 else
617 {
618 http:
619 /* Just prepend "http://" to URL. */
620 ret = aprintf ("http://%s", url);
621 }
622 return ret;
623 }
624
625 static void split_path (const char *, char **, char **);
626
/* Return a pointer to the first character of S that also occurs in
   ACCEPT, like strpbrk does, except that when there is no such
   character the result points to the terminating zero of S
   (end-of-string aka "eos") rather than being NULL.  */

static inline char *
strpbrk_or_eos (const char *s, const char *accept)
{
  /* strcspn measures the prefix that is free of ACCEPT characters,
     which ends either at the first match or at the terminator.  */
  return (char *) s + strcspn (s, accept);
}
639
/* Convert STR to lowercase in place, using the c_isupper/c_tolower
   helpers.  Return true if at least one character was actually
   changed, false if STR was left as it was.  */

static bool
lowercase_str (char *str)
{
  bool changed = false;
  char *p;

  for (p = str; *p != '\0'; p++)
    {
      if (!c_isupper (*p))
        continue;
      *p = c_tolower (*p);
      changed = true;
    }

  return changed;
}
655
656 static const char *
init_seps(enum url_scheme scheme)657 init_seps (enum url_scheme scheme)
658 {
659 static char seps[8] = ":/";
660 char *p = seps + 2;
661 int flags = supported_schemes[scheme].flags;
662
663 if (flags & scm_has_params)
664 *p++ = ';';
665 if (flags & scm_has_query)
666 *p++ = '?';
667 if (flags & scm_has_fragment)
668 *p++ = '#';
669 *p = '\0';
670 return seps;
671 }
672
/* Messages for the error codes reported by url_parse.  Each PE_*
   macro is defined right above the message it selects, and its value
   is the index of that message in the array.  url_error translates
   the strings with _() when it builds the final message.  */
static const char *parse_errors[] = {
#define PE_NO_ERROR                     0
  N_("No error"),
#define PE_UNSUPPORTED_SCHEME           1
  N_("Unsupported scheme %s"), /* support for format token only here */
#define PE_MISSING_SCHEME               2
  N_("Scheme missing"),
#define PE_INVALID_HOST_NAME            3
  N_("Invalid host name"),
#define PE_BAD_PORT_NUMBER              4
  N_("Bad port number"),
#define PE_INVALID_USER_NAME            5
  N_("Invalid user name"),
#define PE_UNTERMINATED_IPV6_ADDRESS    6
  N_("Unterminated IPv6 numeric address"),
#define PE_IPV6_NOT_SUPPORTED           7
  N_("IPv6 addresses not supported"),
#define PE_INVALID_IPV6_ADDRESS         8
  N_("Invalid IPv6 numeric address")
};
693
/* Parse a URL.

   URL is the string to parse; it is not modified.

   ERROR, if not NULL, receives one of the PE_* codes when parsing
   fails.  It is not written on success.

   IRI may be NULL.  If it is not and IRI->utf8_encode is set, the URL
   (or IRI->orig_url, if set) is first converted with remote_to_utf8;
   IRI->utf8_encode is updated with the outcome and, on success,
   IRI->orig_url is replaced by a copy of URL.

   PERCENT_ENCODE requests that URL be passed through reencode_escapes
   before parsing.  It is ignored when the UTF-8 conversion above
   succeeded, because the converted string has already been
   reencoded.

   Return a new struct url if successful, NULL on error.  In case of
   error, and if ERROR is not NULL, also set *ERROR to the appropriate
   error code.  */
struct url *
url_parse (const char *url, int *error, struct iri *iri, bool percent_encode)
{
  struct url *u;
  const char *p;
  bool path_modified, host_modified;

  enum url_scheme scheme;
  const char *seps;

  /* Each X_b/X_e pair delimits the part X inside url_encoded as the
     half-open range [X_b, X_e).  */
  const char *uname_b,     *uname_e;
  const char *host_b,      *host_e;
  const char *path_b,      *path_e;
  const char *params_b,    *params_e;
  const char *query_b,     *query_e;
  const char *fragment_b,  *fragment_e;

  int port;
  char *user = NULL, *passwd = NULL;

  /* The string that is actually parsed.  It is either URL itself or a
     freshly allocated, reencoded version that this function owns.  */
  const char *url_encoded = NULL;

  int error_code;

  scheme = url_scheme (url);
  if (scheme == SCHEME_INVALID)
    {
      if (url_has_scheme (url))
        error_code = PE_UNSUPPORTED_SCHEME;
      else
        error_code = PE_MISSING_SCHEME;
      goto error;
    }

  url_encoded = url;

  if (iri && iri->utf8_encode)
    {
      char *new_url = NULL;

      iri->utf8_encode = remote_to_utf8 (iri, iri->orig_url ? iri->orig_url : url, &new_url);
      if (!iri->utf8_encode)
        new_url = NULL;
      else
        {
          xfree (iri->orig_url);
          iri->orig_url = xstrdup (url);
          url_encoded = reencode_escapes (new_url);
          /* reencode_escapes returns its argument when nothing had to
             be changed; only free NEW_URL if a new string was made.
             Either way url_encoded is now owned by this function.  */
          if (url_encoded != new_url)
            xfree (new_url);
          percent_encode = false;
        }
    }

  if (percent_encode)
    url_encoded = reencode_escapes (url);

  p = url_encoded;
  p += strlen (supported_schemes[scheme].leading_string);
  uname_b = p;
  p = url_skip_credentials (p);
  uname_e = p;

  /* scheme://user:pass@host[:port]... */
  /*                    ^              */

  /* We attempt to break down the URL into the components path,
     params, query, and fragment.  They are ordered like this:

       scheme://host[:port][/path][;params][?query][#fragment]  */

  path_b     = path_e     = NULL;
  params_b   = params_e   = NULL;
  query_b    = query_e    = NULL;
  fragment_b = fragment_e = NULL;

  /* Initialize separators for optional parts of URL, depending on the
     scheme.  For example, FTP has params, and HTTP and HTTPS have
     query string and fragment. */
  seps = init_seps (scheme);

  host_b = p;

  if (*p == '[')
    {
      /* Handle IPv6 address inside square brackets.  Ideally we'd
         just look for the terminating ']', but rfc2732 mandates
         rejecting invalid IPv6 addresses.  */

      /* The address begins after '['. */
      host_b = p + 1;
      host_e = strchr (host_b, ']');

      if (!host_e)
        {
          error_code = PE_UNTERMINATED_IPV6_ADDRESS;
          goto error;
        }

#ifdef ENABLE_IPV6
      /* Check if the IPv6 address is valid. */
      if (!is_valid_ipv6_address(host_b, host_e))
        {
          error_code = PE_INVALID_IPV6_ADDRESS;
          goto error;
        }

      /* Continue parsing after the closing ']'. */
      p = host_e + 1;
#else
      error_code = PE_IPV6_NOT_SUPPORTED;
      goto error;
#endif

      /* The closing bracket must be followed by a separator or by the
         null char.  (strchr also matches the terminating null of
         SEPS, which is what lets the end of the string through.)  */
      /* http://[::1]... */
      /*             ^   */
      if (!strchr (seps, *p))
        {
          /* Trailing garbage after []-delimited IPv6 address. */
          error_code = PE_INVALID_HOST_NAME;
          goto error;
        }
    }
  else
    {
      p = strpbrk_or_eos (p, seps);
      host_e = p;
    }
  /* SEPS is consumed from the left as parsing proceeds: dropping the
     leading character means later searches no longer stop at it.  */
  ++seps;                       /* advance to '/' */

  if (host_b == host_e)
    {
      error_code = PE_INVALID_HOST_NAME;
      goto error;
    }

  port = scheme_default_port (scheme);
  if (*p == ':')
    {
      const char *port_b, *port_e, *pp;

      /* scheme://host:port/tralala */
      /*              ^             */
      ++p;
      port_b = p;
      p = strpbrk_or_eos (p, seps);
      port_e = p;

      /* Allow empty port, as per rfc2396. */
      if (port_b != port_e)
        for (port = 0, pp = port_b; pp < port_e; pp++)
          {
            if (!c_isdigit (*pp))
              {
                /* http://host:12randomgarbage/blah */
                /*               ^                  */
                error_code = PE_BAD_PORT_NUMBER;
                goto error;
              }
            port = 10 * port + (*pp - '0');
            /* Check for too large port numbers here, before we have
               a chance to overflow on bogus port values.  */
            if (port > 0xffff)
              {
                error_code = PE_BAD_PORT_NUMBER;
                goto error;
              }
          }
    }
  /* Advance to the first separator *after* '/' (either ';' or '?',
     depending on the scheme).  */
  ++seps;

  /* Get the optional parts of URL, each part being delimited by
     current location and the position of the next separator.  The
     macro also advances SEPS past the separator it just handled, so
     it must be invoked in the order in which init_seps laid out the
     separators for this scheme.  */
#define GET_URL_PART(sepchar, var) do {                         \
  if (*p == sepchar)                                            \
    var##_b = ++p, var##_e = p = strpbrk_or_eos (p, seps);      \
  ++seps;                                                       \
} while (0)

  GET_URL_PART ('/', path);
  if (supported_schemes[scheme].flags & scm_has_params)
    GET_URL_PART (';', params);
  if (supported_schemes[scheme].flags & scm_has_query)
    GET_URL_PART ('?', query);
  if (supported_schemes[scheme].flags & scm_has_fragment)
    GET_URL_PART ('#', fragment);

#undef GET_URL_PART
  /* Every part has been consumed, so P must be at the end.  */
  assert (*p == 0);

  if (uname_b != uname_e)
    {
      /* http://user:pass@host */
      /*        ^         ^    */
      /*     uname_b   uname_e */
      /* uname_e points just past the '@', hence the - 1.  */
      if (!parse_credentials (uname_b, uname_e - 1, &user, &passwd))
        {
          error_code = PE_INVALID_USER_NAME;
          goto error;
        }
    }

  u = xnew0 (struct url);
  u->scheme = scheme;
  u->host   = strdupdelim (host_b, host_e);
  u->port   = port;
  /* U takes ownership of the strings made by parse_credentials.  */
  u->user   = user;
  u->passwd = passwd;

  /* NOTE(review): path_b and path_e are both still NULL when the URL
     has no '/' after the host; strdupdelim is presumably expected to
     produce an empty string in that case -- confirm.  */
  u->path = strdupdelim (path_b, path_e);
  path_modified = path_simplify (scheme, u->path);
  split_path (u->path, &u->dir, &u->file);

  host_modified = lowercase_str (u->host);

  /* Decode %HH sequences in host name.  This is important not so much
     to support %HH sequences in host names (which other browsers
     don't), but to support binary characters (which will have been
     converted to %HH by reencode_escapes).  */
  if (strchr (u->host, '%'))
    {
      url_unescape (u->host);
      host_modified = true;

      /* check for invalid control characters in host name */
      for (p = u->host; *p; p++)
        {
          if (c_iscntrl(*p))
            {
              /* Releasing U here is relied upon to dispose of what has
                 been stored in it so far; the error path below only
                 frees url_encoded.  */
              url_free(u);
              error_code = PE_INVALID_HOST_NAME;
              goto error;
            }
        }

      /* Apply IDNA regardless of iri->utf8_encode status */
      if (opt.enable_iri && iri)
        {
          char *new = idn_encode (iri, u->host);
          if (new)
            {
              xfree (u->host);
              u->host = new;
              host_modified = true;
            }
        }
    }

  if (params_b)
    u->params = strdupdelim (params_b, params_e);
  if (query_b)
    u->query = strdupdelim (query_b, query_e);
  if (fragment_b)
    u->fragment = strdupdelim (fragment_b, fragment_e);

  if (opt.enable_iri || path_modified || u->fragment || host_modified || path_b == path_e)
    {
      /* If we suspect that a transformation has rendered what
         url_string might return different from URL_ENCODED, rebuild
         u->url using url_string.  */
      u->url = url_string (u, URL_AUTH_SHOW);

      /* url_encoded is no longer needed; free it if we allocated it.  */
      if (url_encoded != url)
        xfree (url_encoded);
    }
  else
    {
      /* Nothing changed: keep the parsed string as u->url.  A string
         allocated here is handed over to U as is; the caller's URL
         has to be copied.  */
      if (url_encoded == url)
        u->url = xstrdup (url);
      else
        u->url = (char *) url_encoded;
    }

  return u;

 error:
  /* Cleanup in case of error: */
  if (url_encoded && url_encoded != url)
    xfree (url_encoded);

  /* Transmit the error code to the caller, if the caller wants to
     know.  */
  if (error)
    *error = error_code;
  return NULL;
}
989
990 /* Return the error message string from ERROR_CODE, which should have
991 been retrieved from url_parse. The error message is translated. */
992
993 char *
url_error(const char * url,int error_code)994 url_error (const char *url, int error_code)
995 {
996 assert (error_code >= 0 && ((size_t) error_code) < countof (parse_errors));
997
998 if (error_code == PE_UNSUPPORTED_SCHEME)
999 {
1000 char *error, *p;
1001 char *scheme = xstrdup (url);
1002 assert (url_has_scheme (url));
1003
1004 if ((p = strchr (scheme, ':')))
1005 *p = '\0';
1006 if (!c_strcasecmp (scheme, "https"))
1007 error = aprintf (_("HTTPS support not compiled in"));
1008 else
1009 error = aprintf (_(parse_errors[error_code]), quote (scheme));
1010 xfree (scheme);
1011
1012 return error;
1013 }
1014 else
1015 return xstrdup (_(parse_errors[error_code]));
1016 }
1017
1018 /* Split PATH into DIR and FILE. PATH comes from the URL and is
1019 expected to be URL-escaped.
1020
1021 The path is split into directory (the part up to the last slash)
1022 and file (the part after the last slash), which are subsequently
1023 unescaped. Examples:
1024
1025 PATH DIR FILE
1026 "foo/bar/baz" "foo/bar" "baz"
1027 "foo/bar/" "foo/bar" ""
1028 "foo" "" "foo"
1029 "foo/bar/baz%2fqux" "foo/bar" "baz/qux" (!)
1030
1031 DIR and FILE are freshly allocated. */
1032
static void
split_path (const char *path, char **dir, char **file)
{
  const char *slash = strrchr (path, '/');

  if (slash)
    {
      /* DIR is everything before the last slash, FILE everything
         after it; the slash itself belongs to neither.  */
      *dir = strdupdelim (path, slash);
      *file = xstrdup (slash + 1);
    }
  else
    {
      /* No slash at all: the whole path is the file name.  */
      *dir = xstrdup ("");
      *file = xstrdup (path);
    }

  /* Unescaping happens after the split, so an encoded slash (%2f)
     never acts as a separator.  */
  url_unescape (*dir);
  url_unescape (*file);
}
1050
1051 /* Note: URL's "full path" is the path with the query string and
1052 params appended. The "fragment" (#foo) is intentionally ignored,
1053 but that might be changed. For example, if the original URL was
1054 "http://host:port/foo/bar/baz;bullshit?querystring#uselessfragment",
1055 the full path will be "/foo/bar/baz;bullshit?querystring". */
1056
1057 /* Return the length of the full path, without the terminating
1058 zero. */
1059
1060 static int
full_path_length(const struct url * url)1061 full_path_length (const struct url *url)
1062 {
1063 int len = 0;
1064
1065 #define FROB(el) if (url->el) len += 1 + strlen (url->el)
1066
1067 FROB (path);
1068 FROB (params);
1069 FROB (query);
1070
1071 #undef FROB
1072
1073 return len;
1074 }
1075
1076 /* Write out the full path. */
1077
1078 static void
full_path_write(const struct url * url,char * where)1079 full_path_write (const struct url *url, char *where)
1080 {
1081 #define FROB(el, chr) do { \
1082 char *f_el = url->el; \
1083 if (f_el) { \
1084 int l = strlen (f_el); \
1085 *where++ = chr; \
1086 memcpy (where, f_el, l); \
1087 where += l; \
1088 } \
1089 } while (0)
1090
1091 FROB (path, '/');
1092 FROB (params, ';');
1093 FROB (query, '?');
1094
1095 #undef FROB
1096 }
1097
/* Public entry point for obtaining the "full path" of URL as a
   freshly allocated, zero-terminated string.  For instance, with
   u->path "foo/bar" and u->query "param=value" the result is
   "/foo/bar?param=value".  */

char *
url_full_path (const struct url *url)
{
  const int n = full_path_length (url);
  char *result = xmalloc (n + 1);

  /* full_path_write does not terminate the string itself.  */
  full_path_write (url, result);
  result[n] = '\0';

  return result;
}
1113
/* Undo the %XX escaping of the single character CHR inside the
   otherwise escaped string STR, leaving all other escape sequences
   alone.  Used to selectively unescape certain characters, such as
   "/" and ":".

   STR is modified in place (it can only shrink) and nothing is
   returned.  An escape is recognized only when its two digits are
   exactly the ones XNUM_TO_DIGIT yields for CHR.  */

static void
unescape_single_char (char *str, char chr)
{
  /* Go through unsigned char so that a CHR with the high bit set
     cannot turn into a negative nibble where plain char is signed.  */
  const unsigned char code = (unsigned char) chr;
  const char c1 = XNUM_TO_DIGIT (code >> 4);
  const char c2 = XNUM_TO_DIGIT (code & 0xf);
  char *h = str;                /* hare */
  char *t = str;                /* tortoise */

  for (; *h; h++, t++)
    {
      /* The && chain stops at the first mismatch, so h[1] and h[2]
         are only read while the preceding characters matched
         (assuming the digits themselves are never NUL).  */
      if (h[0] == '%' && h[1] == c1 && h[2] == c2)
        {
          *t = chr;
          h += 2;               /* the loop step skips the third char */
        }
      else
        *t = *h;
    }
  *t = '\0';
}
1137
1138 /* Escape unsafe and reserved characters, except for the slash
1139 characters. */
1140
1141 static char *
url_escape_dir(const char * dir)1142 url_escape_dir (const char *dir)
1143 {
1144 char *newdir = url_escape_1 (dir, urlchr_unsafe | urlchr_reserved, 1);
1145 if (newdir == dir)
1146 return (char *)dir;
1147
1148 unescape_single_char (newdir, '/');
1149 return newdir;
1150 }
1151
1152 /* Sync u->path and u->url with u->dir and u->file. Called after
1153 u->file or u->dir have been changed, typically by the FTP code. */
1154
1155 static void
sync_path(struct url * u)1156 sync_path (struct url *u)
1157 {
1158 char *newpath, *efile, *edir;
1159
1160 xfree (u->path);
1161
1162 /* u->dir and u->file are not escaped. URL-escape them before
1163 reassembling them into u->path. That way, if they contain
1164 separators like '?' or even if u->file contains slashes, the
1165 path will be correctly assembled. (u->file can contain slashes
1166 if the URL specifies it with %2f, or if an FTP server returns
1167 it.) */
1168 edir = url_escape_dir (u->dir);
1169 efile = url_escape_1 (u->file, urlchr_unsafe | urlchr_reserved, 1);
1170
1171 if (!*edir)
1172 newpath = xstrdup (efile);
1173 else
1174 {
1175 int dirlen = strlen (edir);
1176 int filelen = strlen (efile);
1177
1178 /* Copy "DIR/FILE" to newpath. */
1179 char *p = newpath = xmalloc (dirlen + 1 + filelen + 1);
1180 memcpy (p, edir, dirlen);
1181 p += dirlen;
1182 *p++ = '/';
1183 memcpy (p, efile, filelen);
1184 p += filelen;
1185 *p = '\0';
1186 }
1187
1188 u->path = newpath;
1189
1190 if (edir != u->dir)
1191 xfree (edir);
1192 if (efile != u->file)
1193 xfree (efile);
1194
1195 /* Regenerate u->url as well. */
1196 xfree (u->url);
1197 u->url = url_string (u, URL_AUTH_SHOW);
1198 }
1199
1200 /* Mutators. Code in ftp.c insists on changing u->dir and u->file.
1201 This way we can sync u->path and u->url when they get changed. */
1202
1203 void
url_set_dir(struct url * url,const char * newdir)1204 url_set_dir (struct url *url, const char *newdir)
1205 {
1206 xfree (url->dir);
1207 url->dir = xstrdup (newdir);
1208 sync_path (url);
1209 }
1210
1211 void
url_set_file(struct url * url,const char * newfile)1212 url_set_file (struct url *url, const char *newfile)
1213 {
1214 xfree (url->file);
1215 url->file = xstrdup (newfile);
1216 sync_path (url);
1217 }
1218
1219 void
url_free(struct url * url)1220 url_free (struct url *url)
1221 {
1222 if (url)
1223 {
1224 xfree (url->host);
1225
1226 xfree (url->path);
1227 xfree (url->url);
1228
1229 xfree (url->params);
1230 xfree (url->query);
1231 xfree (url->fragment);
1232 xfree (url->user);
1233 xfree (url->passwd);
1234
1235 xfree (url->dir);
1236 xfree (url->file);
1237
1238 xfree (url);
1239 }
1240 }
1241
1242 /* Create all the necessary directories for PATH (a file). Calls
1243 make_directory internally. */
1244 int
mkalldirs(const char * path)1245 mkalldirs (const char *path)
1246 {
1247 const char *p;
1248 char *t;
1249 struct stat st;
1250 int res;
1251
1252 p = strrchr(path, '/');
1253 p = p == NULL ? path : p;
1254
1255 /* Don't create if it's just a file. */
1256 if ((p == path) && (*p != '/'))
1257 return 0;
1258 t = strdupdelim (path, p);
1259
1260 /* Check whether the directory exists. */
1261 if ((stat (t, &st) == 0))
1262 {
1263 if (S_ISDIR (st.st_mode))
1264 {
1265 xfree (t);
1266 return 0;
1267 }
1268 else
1269 {
1270 /* If the dir exists as a file name, remove it first. This
1271 is *only* for Wget to work with buggy old CERN http
1272 servers. Here is the scenario: When Wget tries to
1273 retrieve a directory without a slash, e.g.
1274 http://foo/bar (bar being a directory), CERN server will
1275 not redirect it too http://foo/bar/ -- it will generate a
1276 directory listing containing links to bar/file1,
1277 bar/file2, etc. Wget will lose because it saves this
1278 HTML listing to a file `bar', so it cannot create the
1279 directory. To work around this, if the file of the same
1280 name exists, we just remove it and create the directory
1281 anyway. */
1282 DEBUGP (("Removing %s because of directory danger!\n", t));
1283 if (unlink (t))
1284 logprintf (LOG_NOTQUIET, "Failed to unlink %s (%d): %s\n",
1285 t, errno, strerror(errno));
1286 }
1287 }
1288 res = make_directory (t);
1289 if (res != 0)
1290 logprintf (LOG_NOTQUIET, "%s: %s\n", t, strerror (errno));
1291 xfree (t);
1292 return res;
1293 }
1294
1295 /* Functions for constructing the file name out of URL components. */
1296
/* A growable string structure, used by url_file_name and friends.
   This should perhaps be moved to utils.c.

   The idea is to have a convenient and efficient way to construct a
   string by having various functions append data to it.  Instead of
   passing the obligatory BASEVAR, SIZEVAR and TAILPOS to all the
   functions in question, we pass the pointer to this struct.

   Functions that write to the members in this struct must make sure
   that base remains null terminated by calling append_null().
 */

struct growable {
  char *base;
  int size;                     /* memory allocated */
  int tail;                     /* string length */
};

/* Ensure that the string can accept APPEND_SIZE more characters past
   the current TAIL position.  If necessary, this will grow the string
   and update its allocated size.  If the string is already large
   enough to take TAIL+APPEND_SIZE characters, this does nothing.

   Both arguments are parenthesized in the expansion, so expressions
   such as "cond ? a : b" may be passed; G is evaluated once.  */
#define GROW(g, append_size) do {                                       \
  struct growable *G_ = (g);                                            \
  DO_REALLOC (G_->base, G_->size, G_->tail + (append_size), char);      \
} while (0)

/* Return the tail position of the string.  R is evaluated twice.  */
#define TAIL(r) ((r)->base + (r)->tail)

/* Move the tail position by APPEND_COUNT characters.  */
#define TAIL_INCR(r, append_count) ((r)->tail += (append_count))
1329
1330
1331 /* Append NULL to DEST. */
1332 static void
append_null(struct growable * dest)1333 append_null (struct growable *dest)
1334 {
1335 GROW (dest, 1);
1336 *TAIL (dest) = 0;
1337 }
1338
1339 /* Append CH to DEST. */
1340 static void
append_char(char ch,struct growable * dest)1341 append_char (char ch, struct growable *dest)
1342 {
1343 if (ch)
1344 {
1345 GROW (dest, 1);
1346 *TAIL (dest) = ch;
1347 TAIL_INCR (dest, 1);
1348 }
1349
1350 append_null (dest);
1351 }
1352
1353 /* Append the string STR to DEST. */
1354 static void
append_string(const char * str,struct growable * dest)1355 append_string (const char *str, struct growable *dest)
1356 {
1357 int l = strlen (str);
1358
1359 if (l)
1360 {
1361 GROW (dest, l);
1362 memcpy (TAIL (dest), str, l);
1363 TAIL_INCR (dest, l);
1364 }
1365
1366 append_null (dest);
1367 }
1368
1369
/* Bit flags stored in filechr_table, one entry per byte value.  A
   set bit means the byte cannot be used as-is in a file name under
   the corresponding restriction.  */
enum {
  filechr_not_unix    = 1,      /* unusable on Unix, / and \0 */
  filechr_not_vms     = 2,      /* unusable on VMS (ODS5), 0x00-0x1F * ? */
  filechr_not_windows = 4,      /* unusable on Windows, one of \|/<>?:*" */
  filechr_control     = 8       /* a control character, e.g. 0-31 */
};

/* Non-zero if the character C must be quoted under MASK (an OR of
   the filechr_* flags), or if C is non-ASCII while
   opt.restrict_files_nonascii is set.  C is evaluated twice, so it
   must not have side effects.  */
#define FILE_CHAR_TEST(c, mask) \
    ((opt.restrict_files_nonascii && !c_isascii ((unsigned char)(c))) || \
    (filechr_table[(unsigned char)(c)] & (mask)))

/* Shorthands for the table: */
#define U filechr_not_unix
#define V filechr_not_vms
#define W filechr_not_windows
#define C filechr_control

#define UVWC U|V|W|C
#define UW U|W
#define VC V|C
#define VW V|W

/* Table of characters unsafe under various conditions (see above).

   Arguably we could also claim `%' to be unsafe, since we use it as
   the escape character.  If we ever want to be able to reliably
   translate file name back to URL, this would become crucial.
   Right now, it's better to be minimal in escaping.

   The table is indexed by the byte value (0-255); entries for 128
   and above carry no flags.  */

static const unsigned char filechr_table[256] =
{
UVWC, VC,  VC,  VC,   VC,  VC,  VC,  VC,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
  VC,  VC,  VC,  VC,   VC,  VC,  VC,  VC,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
  VC,  VC,  VC,  VC,   VC,  VC,  VC,  VC,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
  VC,  VC,  VC,  VC,   VC,  VC,  VC,  VC,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
   0,  0,  W,  0,   0,  0,  0,  0,   /* SP  !   "   #    $   %   &   '   */
   0,  0, VW,  0,   0,  0,  0, UW,   /* (   )   *   +    ,   -   .   /   */
   0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
   0,  0,  W,  0,   W,  0,  W, VW,   /* 8   9   :   ;    <   =   >   ?   */
   0,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
   0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
   0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
   0,  0,  0,  0,   W,  0,  0,  0,   /* X   Y   Z   [    \   ]   ^   _   */
   0,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
   0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
   0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
   0,  0,  0,  0,   W,  0,  0,  C,   /* x   y   z   {    |   }   ~   DEL */

  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, /* 128-143 */
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, /* 144-159 */
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,

  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
};
/* The shorthands are only needed to write the table above.  */
#undef U
#undef V
#undef W
#undef C
#undef UW
#undef UVWC
#undef VC
#undef VW
1436
/* FN_PORT_SEP separates host and port in file names when the port
   is not the scheme's default one.  It is normally ':', as in
   "www.xemacs.org:4001/index.html".  With the Windows file name
   restrictions in effect it becomes '+', because Windows can't
   handle ':' in file names.  */
#define FN_PORT_SEP (opt.restrict_files_os == restrict_windows ? '+' : ':')

/* FN_QUERY_SEP separates the file name from the URL query.  It is
   normally '?'; with the VMS or Windows file name restrictions in
   effect it becomes '@', because neither can handle '?' in a file
   name.  FN_QUERY_SEP_STR is the same separator as a string.  */
#define FN_QUERY_SEP                                            \
  (((opt.restrict_files_os == restrict_vms)                     \
    || (opt.restrict_files_os == restrict_windows)) ? '@' : '?')
#define FN_QUERY_SEP_STR                                        \
  (((opt.restrict_files_os == restrict_vms)                     \
    || (opt.restrict_files_os == restrict_windows)) ? "@" : "?")
1452
1453 /* Quote path element, characters in [b, e), as file name, and append
1454 the quoted string to DEST. Each character is quoted as per
1455 file_unsafe_char and the corresponding table.
1456
1457 If ESCAPED is true, the path element is considered to be
1458 URL-escaped and will be unescaped prior to inspection. */
1459
1460 static void
append_uri_pathel(const char * b,const char * e,bool escaped,struct growable * dest)1461 append_uri_pathel (const char *b, const char *e, bool escaped,
1462 struct growable *dest)
1463 {
1464 const char *p;
1465 char buf[1024];
1466 char *unescaped = NULL;
1467 int quoted, outlen;
1468 int mask;
1469 int max_length;
1470
1471 if (!dest)
1472 return;
1473
1474 if (opt.restrict_files_os == restrict_unix)
1475 mask = filechr_not_unix;
1476 else if (opt.restrict_files_os == restrict_vms)
1477 mask = filechr_not_vms;
1478 else
1479 mask = filechr_not_windows;
1480
1481 if (opt.restrict_files_ctrl)
1482 mask |= filechr_control;
1483
1484 /* Copy [b, e) to PATHEL and URL-unescape it. */
1485 if (escaped)
1486 {
1487 size_t len = e - b;
1488 if (len < sizeof (buf))
1489 unescaped = buf;
1490 else
1491 unescaped = xmalloc(len + 1);
1492
1493 memcpy(unescaped, b, len);
1494 unescaped[len] = 0;
1495
1496 url_unescape (unescaped);
1497 b = unescaped;
1498 e = unescaped + strlen (unescaped);
1499 }
1500
1501 /* Defang ".." when found as component of path. Remember that path
1502 comes from the URL and might contain malicious input. */
1503 if (e - b == 2 && b[0] == '.' && b[1] == '.')
1504 {
1505 b = "%2E%2E";
1506 e = b + 6;
1507 }
1508
1509 /* Walk the PATHEL string and check how many characters we'll need
1510 to quote. */
1511 quoted = 0;
1512 for (p = b; p < e; p++)
1513 if (FILE_CHAR_TEST (*p, mask))
1514 ++quoted;
1515
1516 /* Calculate the length of the output string. e-b is the input
1517 string length. Each quoted char introduces two additional
1518 characters in the string, hence 2*quoted. */
1519 outlen = (e - b) + (2 * quoted);
1520 # ifdef WINDOWS
1521 max_length = MAX_PATH;
1522 # else
1523 max_length = get_max_length(dest->base, dest->tail, _PC_NAME_MAX);
1524 # endif
1525 max_length -= CHOMP_BUFFER;
1526 if (max_length > 0 && outlen > max_length)
1527 {
1528 logprintf (LOG_NOTQUIET, "The destination name is too long (%d), reducing to %d\n", outlen, max_length);
1529
1530 outlen = max_length;
1531 }
1532 GROW (dest, outlen);
1533
1534 // This should not happen, but it's impossible to argue with static analysis that it can't happen
1535 // (in theory it can). So give static analyzers a hint.
1536 if (!dest->base)
1537 return;
1538
1539 if (!quoted)
1540 {
1541 /* If there's nothing to quote, we can simply append the string
1542 without processing it again. */
1543 memcpy (TAIL (dest), b, outlen);
1544 }
1545 else
1546 {
1547 char *q = TAIL (dest);
1548 int i;
1549
1550 for (i = 0, p = b; p < e; p++)
1551 {
1552 if (!FILE_CHAR_TEST (*p, mask))
1553 {
1554 if (i == outlen)
1555 break;
1556 *q++ = *p;
1557 i++;
1558 }
1559 else if (i + 3 > outlen)
1560 break;
1561 else
1562 {
1563 unsigned char ch = *p;
1564 *q++ = '%';
1565 *q++ = XNUM_TO_DIGIT (ch >> 4);
1566 *q++ = XNUM_TO_DIGIT (ch & 0xf);
1567 i += 3;
1568 }
1569 }
1570 assert (q - TAIL (dest) <= outlen);
1571 }
1572
1573 /* Perform inline case transformation if required. */
1574 if (opt.restrict_files_case == restrict_lowercase
1575 || opt.restrict_files_case == restrict_uppercase)
1576 {
1577 char *q;
1578 for (q = TAIL (dest); q < TAIL (dest) + outlen; ++q)
1579 {
1580 if (opt.restrict_files_case == restrict_lowercase)
1581 *q = c_tolower (*q);
1582 else
1583 *q = c_toupper (*q);
1584 }
1585 }
1586
1587 TAIL_INCR (dest, outlen);
1588 append_null (dest);
1589
1590 if (unescaped && unescaped != buf)
1591 free (unescaped);
1592 }
1593
#ifdef HAVE_ICONV
/* Convert the file name FNAME from the remote encoding
   (opt.encoding_remote, "UTF-8" when unset) to the local encoding
   (opt.locale, nl_langinfo (CODESET) when unset).

   On success FNAME is freed and a freshly allocated, converted string
   is returned.  On any failure (unsupported conversion, invalid or
   unconvertible input, unexpected iconv error) FNAME itself is
   returned unchanged.  Either way the caller ends up owning exactly
   one string: the return value.  */
static char *
convert_fname (char *fname)
{
  char *converted_fname;
  const char *from_encoding = opt.encoding_remote;
  const char *to_encoding = opt.locale;
  iconv_t cd;
  size_t len, inlen, outlen;
  char *s;
  const char *orig_fname;

  /* Defaults for remote and local encodings.  */
  if (!from_encoding)
    from_encoding = "UTF-8";
  if (!to_encoding)
    to_encoding = nl_langinfo (CODESET);

  cd = iconv_open (to_encoding, from_encoding);
  if (cd == (iconv_t) (-1))
    {
      logprintf (LOG_VERBOSE, _ ("Conversion from %s to %s isn't supported\n"),
                 quote_n (0, from_encoding), quote_n (1, to_encoding));
      return fname;
    }

  /* LEN is the capacity of the output buffer, not counting the extra
     byte reserved for the terminator.  S is the write position and
     OUTLEN the space left, so S + OUTLEN == CONVERTED_FNAME + LEN
     holds throughout.  */
  orig_fname = fname;
  inlen = strlen (fname);
  len = outlen = inlen * 2;
  converted_fname = s = xmalloc (outlen + 1);

  for (;;)
    {
      errno = 0;
      if (iconv (cd, (ICONV_CONST char **) &fname, &inlen, &s, &outlen) == 0
          && iconv (cd, NULL, NULL, &s, &outlen) == 0)
        {
          /* S points just past the last byte written, and the
             reserved extra byte guarantees room for the terminator.  */
          *s = '\0';
          iconv_close (cd);
          DEBUGP (("Converted file name '%s' (%s) -> '%s' (%s)\n",
                   orig_fname, from_encoding, converted_fname, to_encoding));
          xfree (orig_fname);
          return converted_fname;
        }

      /* Incomplete or invalid multibyte sequence */
      if (errno == EINVAL || errno == EILSEQ || errno == 0)
        {
          if (errno)
            logprintf (LOG_VERBOSE,
                       _ ("Incomplete or invalid multibyte sequence encountered\n"));
          else
            logprintf (LOG_VERBOSE,
                       _ ("Unconvertable multibyte sequence encountered\n"));
          xfree (converted_fname);
          converted_fname = (char *) orig_fname;
          break;
        }
      else if (errno == E2BIG) /* Output buffer full */
        {
          /* Keep what has been converted so far and continue right
             behind it.  OUTLEN must describe only the space that
             really follows S.  Doubling (plus a constant) makes the
             free space strictly larger on every retry, so the loop
             cannot stall even when INLEN is tiny or zero.  */
          size_t used = s - converted_fname;

          len = 2 * len + 16;
          converted_fname = xrealloc (converted_fname, len + 1);
          s = converted_fname + used;
          outlen = len - used;
        }
      else /* Weird, we got an unspecified error */
        {
          logprintf (LOG_VERBOSE, _ ("Unhandled errno %d\n"), errno);
          xfree (converted_fname);
          converted_fname = (char *) orig_fname;
          break;
        }
    }
  DEBUGP (("Failed to convert file name '%s' (%s) -> '?' (%s)\n",
           orig_fname, from_encoding, to_encoding));

  iconv_close (cd);

  return converted_fname;
}
#else
/* Without iconv no conversion is possible; FNAME is returned as is.  */
static char *
convert_fname (char *fname)
{
  return fname;
}
#endif
1682
1683 /* Append to DEST the directory structure that corresponds the
1684 directory part of URL's path. For example, if the URL is
1685 http://server/dir1/dir2/file, this appends "/dir1/dir2".
1686
1687 Each path element ("dir1" and "dir2" in the above example) is
1688 examined, url-unescaped, and re-escaped as file name element.
1689
1690 Additionally, it cuts as many directories from the path as
1691 specified by opt.cut_dirs. For example, if opt.cut_dirs is 1, it
1692 will produce "bar" for the above example. For 2 or more, it will
1693 produce "".
1694
1695 Each component of the path is quoted for use as file name. */
1696
1697 static void
append_dir_structure(const struct url * u,struct growable * dest)1698 append_dir_structure (const struct url *u, struct growable *dest)
1699 {
1700 char *pathel, *next;
1701 int cut = opt.cut_dirs;
1702
1703 /* Go through the path components, de-URL-quote them, and quote them
1704 (if necessary) as file names. */
1705
1706 pathel = u->path;
1707 for (; (next = strchr (pathel, '/')) != NULL; pathel = next + 1)
1708 {
1709 if (cut-- > 0)
1710 continue;
1711 if (pathel == next)
1712 /* Ignore empty pathels. */
1713 continue;
1714
1715 if (dest->tail)
1716 append_char ('/', dest);
1717
1718 append_uri_pathel (pathel, next, true, dest);
1719 }
1720 }
1721
/* Return a unique file name that matches the given URL as well as
   possible.  Does not create directories on the file system.

   The name is assembled as
     [opt.dir_prefix "/"] [scheme "/"] [host[SEP port] "/"] [dirs "/"] file
   where the scheme, host and directory parts only appear when
   opt.dirstruct is set (and opt.protocol_directories resp.
   opt.add_hostdir for the first two).

   If REPLACED_FILENAME is non-NULL it is used as the file part
   instead of U's file name and query.  Otherwise the file part is
   u->file (or the default index file name when u->file is empty),
   followed by the query separator and u->query if there is a query.

   The returned string is heap-allocated; presumably the caller is
   expected to free it.  */

char *
url_file_name (const struct url *u, char *replaced_filename)
{
  struct growable fnres;        /* stands for "file name result" */
  struct growable temp_fnres;   /* URL-derived part, before conversion */

  const char *u_file;
  char *fname, *unique, *fname_len_check;
  const char *index_filename = "index.html"; /* The default index file is index.html */

  fnres.base = NULL;
  fnres.size = 0;
  fnres.tail = 0;

  temp_fnres.base = NULL;
  temp_fnres.size = 0;
  temp_fnres.tail = 0;

  /* If an alternative index file was defined, change index_filename */
  if (opt.default_page)
    index_filename = opt.default_page;


  /* Start with the directory prefix, if specified. */
  if (opt.dir_prefix)
    append_string (opt.dir_prefix, &fnres);

  /* If "dirstruct" is turned on (typically the case with -r), add
     the host and port (unless those have been turned off) and
     directory structure.  */
  /* All safe remote chars are unescaped and stored in temp_fnres,
     then converted to local and appended to fnres.
     Internationalized URL/IDN will produce punycode to lookup IP from DNS:
     https://en.wikipedia.org/wiki/URL
     https://en.wikipedia.org/wiki/Internationalized_domain_name
     Non-ASCII code chars in the path:
     https://en.wikipedia.org/wiki/List_of_Unicode_characters
     https://en.wikipedia.org/wiki/List_of_writing_systems */
  if (opt.dirstruct)
    {
      if (opt.protocol_directories)
        {
          if (temp_fnres.tail)
            append_char ('/', &temp_fnres);
          append_string (supported_schemes[u->scheme].name, &temp_fnres);
        }
      if (opt.add_hostdir)
        {
          if (temp_fnres.tail)
            append_char ('/', &temp_fnres);
          if (0 != strcmp (u->host, ".."))
            append_string (u->host, &temp_fnres);
          else
            /* Host name can come from the network; malicious DNS may
               allow ".." to be resolved, causing us to write to
               "../<file>".  Defang such host names. */
            append_string ("%2E%2E", &temp_fnres);
          /* Only a non-default port becomes part of the name.  */
          if (u->port != scheme_default_port (u->scheme))
            {
              char portstr[24];
              number_to_string (portstr, u->port);
              append_char (FN_PORT_SEP, &temp_fnres);
              append_string (portstr, &temp_fnres);
            }
        }

      append_dir_structure (u, &temp_fnres);
    }

  if (!replaced_filename)
    {
      /* Create the filename. */
      u_file = *u->file ? u->file : index_filename;

      /* Append "?query" to the file name, even if empty,
       * and create fname_len_check. */
      if (u->query)
        fname_len_check = concat_strings (u_file, FN_QUERY_SEP_STR, u->query, NULL);
      else
        fname_len_check = strdupdelim (u_file, u_file + strlen (u_file));
    }
  else
    {
      /* The caller-supplied name replaces both file and query.  */
      u_file = replaced_filename;
      fname_len_check = strdupdelim (u_file, u_file + strlen (u_file));
    }

  if (temp_fnres.tail)
    append_char ('/', &temp_fnres);

  /* The whole file part goes in as one path element, so it is
     quoted -- and, if need be, shortened -- as a unit.  */
  append_uri_pathel (fname_len_check,
    fname_len_check + strlen (fname_len_check), true, &temp_fnres);

  /* Zero-terminate the temporary file name. */
  append_char ('\0', &temp_fnres);

  /* convert all remote chars before length check and appending to local path */
  /* convert_fname takes over temp_fnres.base: it returns either that
     very buffer or a converted copy after freeing the buffer.  The
     growable is therefore reset and refilled from the result, and
     FNAME is released right after.  */
  fname = convert_fname (temp_fnres.base);
  temp_fnres.base = NULL;
  temp_fnres.size = 0;
  temp_fnres.tail = 0;
  append_string (fname, &temp_fnres);

  xfree (fname);
  xfree (fname_len_check);

  /* The filename has already been 'cleaned' by append_uri_pathel() above.  So,
   * just append it. */
  if (fnres.tail)
    append_char ('/', &fnres);
  append_string (temp_fnres.base, &fnres);

  fname = fnres.base;

  /* Make a final check that the path length is acceptable? */
  /* TODO: check fnres.base for path length problem */

  xfree (temp_fnres.base);

  /* Check the cases in which the unique extensions are not used:
     1) Clobbering is turned off (-nc).
     2) Retrieval with regetting.
     3) Timestamping is used.
     4) Hierarchy is built.
     5) Backups are specified.

     The exception is the case when file does exist and is a
     directory (see `mkalldirs' for explanation).  */

  if (ALLOW_CLOBBER
      && !(file_exists_p (fname, NULL) && !file_non_directory_p (fname)))
    {
      unique = fname;
    }
  else
    {
      /* unique_name_passthrough may hand FNAME back unchanged; free
         FNAME only when a different string was returned.  */
      unique = unique_name_passthrough (fname);
      if (unique != fname)
        xfree (fname);
    }

/* On VMS, alter the name as required. */
#ifdef __VMS
  {
    char *unique2;

    unique2 = ods_conform( unique);
    if (unique2 != unique)
      {
        xfree (unique);
        unique = unique2;
      }
  }
#endif /* def __VMS */

  return unique;
}
1882
/* Resolve "." and ".." elements of PATH by destructively modifying
   PATH and return true if PATH has been modified, false otherwise.

   The algorithm is in spirit similar to the one described in rfc1808,
   although implemented differently, in one pass.  To recap, path
   elements containing only "." are removed, and ".." is taken to mean
   "back up one element".  Single leading and trailing slashes are
   preserved.

   For the FTP scheme (and FTPS when built with SSL) a ".." that
   would back up past the beginning of PATH is kept literally; for
   other schemes it is dropped.

   For example, "a/b/c/./../d/.." will yield "a/b/".  More exhaustive
   test examples are provided below.  If you change anything in this
   function, run test_path_simplify to make sure you haven't broken a
   test case.  */

static bool
path_simplify (enum url_scheme scheme, char *path)
{
  char *h = path;               /* hare: reads the original text */
  char *t = path;               /* tortoise: writes the simplified text */
  char *beg = path;             /* T may never retreat before this */
  char *end = strchr (path, '\0');

  while (h < end)
    {
      /* Hare should be at the beginning of a path element. */

      if (h[0] == '.' && (h[1] == '/' || h[1] == '\0'))
        {
          /* Ignore "./".  For a trailing "." this leaves H one past
             END, which ends the loop.  */
          h += 2;
        }
      else if (h[0] == '.' && h[1] == '.' && (h[2] == '/' || h[2] == '\0'))
        {
          /* Handle "../" by retreating the tortoise by one path
             element -- but not past beginning.  */
          if (t > beg)
            {
              /* Move backwards until T hits the beginning of the
                 previous path element or the beginning of path. */
              for (--t; t > beg && t[-1] != '/'; t--)
                ;
            }
          else if (scheme == SCHEME_FTP
#ifdef HAVE_SSL
              || scheme == SCHEME_FTPS
#endif
              )
            {
              /* If we're at the beginning, copy the "../" literally
                 and move the beginning so a later ".." doesn't remove
                 it.  This violates RFC 3986; but we do it for FTP
                 anyway because there is otherwise no way to get at a
                 parent directory, when the FTP server drops us in a
                 non-root directory (which is not uncommon). */
              beg = t + 3;
              goto regular;
            }
          /* Skip the "../" in the input (or a trailing "..", which
             leaves H one past END).  */
          h += 3;
        }
      else
        {
        regular:
          /* A regular path element.  If H hasn't advanced past T,
             simply skip to the next path element.  Otherwise, copy
             the path element until the next slash.  */
          if (t == h)
            {
              /* Skip the path element, including the slash.  */
              while (h < end && *h != '/')
                t++, h++;
              if (h < end)
                t++, h++;
            }
          else
            {
              /* Copy the path element, including the final slash.  */
              while (h < end && *h != '/')
                *t++ = *h++;
              if (h < end)
                *t++ = *h++;
            }
        }
    }

  /* T and H differ exactly when something was dropped along the way;
     only then does the string need a new terminator.  */
  if (t != h)
    *t = '\0';

  return t != h;
}
1972
1973 /* Return the length of URL's path. Path is considered to be
1974 terminated by one or more of the ?query or ;params or #fragment,
1975 depending on the scheme. */
1976
1977 static const char *
path_end(const char * url)1978 path_end (const char *url)
1979 {
1980 enum url_scheme scheme = url_scheme (url);
1981 const char *seps;
1982 if (scheme == SCHEME_INVALID)
1983 scheme = SCHEME_HTTP; /* use http semantics for rel links */
1984 /* +2 to ignore the first two separators ':' and '/' */
1985 seps = init_seps (scheme) + 2;
1986 return strpbrk_or_eos (url, seps);
1987 }
1988
/* Find the last occurrence of character C in the range [b, e), or
   NULL, if none are present.  Expands to a memrchr call over the
   (e) - (b) bytes starting at B; B is evaluated twice.  */
#define find_last_char(b, e, c) memrchr ((b), (c), (e) - (b))
1992
1993 /* Merge BASE with LINK and return the resulting URI.
1994
1995 Either of the URIs may be absolute or relative, complete with the
1996 host name, or path only. This tries to reasonably handle all
1997 foreseeable cases. It only employs minimal URL parsing, without
1998 knowledge of the specifics of schemes.
1999
2000 I briefly considered making this function call path_simplify after
2001 the merging process, as rfc1738 seems to suggest. This is a bad
2002 idea for several reasons: 1) it complexifies the code, and 2)
2003 url_parse has to simplify path anyway, so it's wasteful to boot. */
2004
2005 char *
uri_merge(const char * base,const char * link)2006 uri_merge (const char *base, const char *link)
2007 {
2008 int linklength;
2009 const char *end;
2010 char *merge;
2011
2012 if (url_has_scheme (link))
2013 return xstrdup (link);
2014
2015 /* We may not examine BASE past END. */
2016 end = path_end (base);
2017 linklength = strlen (link);
2018
2019 if (!*link)
2020 {
2021 /* Empty LINK points back to BASE, query string and all. */
2022 return xstrdup (base);
2023 }
2024 else if (*link == '?')
2025 {
2026 /* LINK points to the same location, but changes the query
2027 string. Examples: */
2028 /* uri_merge("path", "?new") -> "path?new" */
2029 /* uri_merge("path?foo", "?new") -> "path?new" */
2030 /* uri_merge("path?foo#bar", "?new") -> "path?new" */
2031 /* uri_merge("path#foo", "?new") -> "path?new" */
2032 int baselength = end - base;
2033 merge = xmalloc (baselength + linklength + 1);
2034 memcpy (merge, base, baselength);
2035 memcpy (merge + baselength, link, linklength);
2036 merge[baselength + linklength] = '\0';
2037 }
2038 else if (*link == '#')
2039 {
2040 /* uri_merge("path", "#new") -> "path#new" */
2041 /* uri_merge("path#foo", "#new") -> "path#new" */
2042 /* uri_merge("path?foo", "#new") -> "path?foo#new" */
2043 /* uri_merge("path?foo#bar", "#new") -> "path?foo#new" */
2044 int baselength;
2045 const char *end1 = strchr (base, '#');
2046 if (!end1)
2047 end1 = base + strlen (base);
2048 baselength = end1 - base;
2049 merge = xmalloc (baselength + linklength + 1);
2050 memcpy (merge, base, baselength);
2051 memcpy (merge + baselength, link, linklength);
2052 merge[baselength + linklength] = '\0';
2053 }
2054 else if (*link == '/' && *(link + 1) == '/')
2055 {
2056 /* LINK begins with "//" and so is a net path: we need to
2057 replace everything after (and including) the double slash
2058 with LINK. */
2059
2060 /* uri_merge("foo", "//new/bar") -> "//new/bar" */
2061 /* uri_merge("//old/foo", "//new/bar") -> "//new/bar" */
2062 /* uri_merge("http://old/foo", "//new/bar") -> "http://new/bar" */
2063
2064 int span;
2065 const char *slash;
2066 const char *start_insert;
2067
2068 /* Look for first slash. */
2069 slash = memchr (base, '/', end - base);
2070 /* If found slash and it is a double slash, then replace
2071 from this point, else default to replacing from the
2072 beginning. */
2073 if (slash && *(slash + 1) == '/')
2074 start_insert = slash;
2075 else
2076 start_insert = base;
2077
2078 span = start_insert - base;
2079 merge = xmalloc (span + linklength + 1);
2080 if (span)
2081 memcpy (merge, base, span);
2082 memcpy (merge + span, link, linklength);
2083 merge[span + linklength] = '\0';
2084 }
2085 else if (*link == '/')
2086 {
2087 /* LINK is an absolute path: we need to replace everything
2088 after (and including) the FIRST slash with LINK.
2089
2090 So, if BASE is "http://host/whatever/foo/bar", and LINK is
2091 "/qux/xyzzy", our result should be
2092 "http://host/qux/xyzzy". */
2093 int span;
2094 const char *slash;
2095 const char *start_insert = NULL; /* for gcc to shut up. */
2096 const char *pos = base;
2097 bool seen_slash_slash = false;
2098 /* We're looking for the first slash, but want to ignore
2099 double slash. */
2100 again:
2101 slash = memchr (pos, '/', end - pos);
2102 if (slash && !seen_slash_slash)
2103 if (*(slash + 1) == '/')
2104 {
2105 pos = slash + 2;
2106 seen_slash_slash = true;
2107 goto again;
2108 }
2109
2110 /* At this point, SLASH is the location of the first / after
2111 "//", or the first slash altogether. START_INSERT is the
2112 pointer to the location where LINK will be inserted. When
2113 examining the last two examples, keep in mind that LINK
2114 begins with '/'. */
2115
2116 if (!slash && !seen_slash_slash)
2117 /* example: "foo" */
2118 /* ^ */
2119 start_insert = base;
2120 else if (!slash && seen_slash_slash)
2121 /* example: "http://foo" */
2122 /* ^ */
2123 start_insert = end;
2124 else if (slash && !seen_slash_slash)
2125 /* example: "foo/bar" */
2126 /* ^ */
2127 start_insert = base;
2128 else if (slash && seen_slash_slash)
2129 /* example: "http://something/" */
2130 /* ^ */
2131 start_insert = slash;
2132
2133 span = start_insert - base;
2134 merge = xmalloc (span + linklength + 1);
2135 if (span)
2136 memcpy (merge, base, span);
2137 memcpy (merge + span, link, linklength);
2138 merge[span + linklength] = '\0';
2139 }
2140 else
2141 {
2142 /* LINK is a relative URL: we need to replace everything
2143 after last slash (possibly empty) with LINK.
2144
2145 So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy",
2146 our result should be "whatever/foo/qux/xyzzy". */
2147 bool need_explicit_slash = false;
2148 int span;
2149 const char *start_insert;
2150 const char *last_slash = find_last_char (base, end, '/');
2151 if (!last_slash)
2152 {
2153 /* No slash found at all. Replace what we have with LINK. */
2154 start_insert = base;
2155 }
2156 else if (last_slash && last_slash >= base + 2
2157 && last_slash[-2] == ':' && last_slash[-1] == '/')
2158 {
2159 /* example: http://host" */
2160 /* ^ */
2161 start_insert = end + 1;
2162 need_explicit_slash = true;
2163 }
2164 else
2165 {
2166 /* example: "whatever/foo/bar" */
2167 /* ^ */
2168 start_insert = last_slash + 1;
2169 }
2170
2171 span = start_insert - base;
2172 merge = xmalloc (span + linklength + 1);
2173 if (span)
2174 memcpy (merge, base, span);
2175 if (need_explicit_slash)
2176 merge[span - 1] = '/';
2177 memcpy (merge + span, link, linklength);
2178 merge[span + linklength] = '\0';
2179 }
2180
2181 return merge;
2182 }
2183
/* Copy the NUL-terminated string S to the location P points at and
   advance P past the copied bytes.  No terminating NUL is written, so
   consecutive APPENDs concatenate; the caller must have reserved
   enough room and must terminate the result itself.

   P must be a modifiable char pointer lvalue; it is evaluated more
   than once.  S is evaluated exactly once, so an argument with side
   effects (e.g. `APPEND (p, *list++)') behaves as written.  */
#define APPEND(p, s) do {                               \
    const char *append_src_ = (s);                      \
    size_t append_len_ = strlen (append_src_);          \
    memcpy ((p), append_src_, append_len_);             \
    (p) += append_len_;                                 \
  } while (0)
2189
/* Use this instead of the password when the actual password is
   supposed to be hidden.  We intentionally use a generic string that
   does not give away the number of characters in the password,
   unlike previous versions, which did.  */
2194 #define HIDDEN_PASSWORD "*password*"
2195
2196 /* Recreate the URL string from the data in URL.
2197
2198 If HIDE is true (as it is when we're calling this on a URL we plan
2199 to print, but not when calling it to canonicalize a URL for use
2200 within the program), password will be hidden. Unsafe characters in
2201 the URL will be quoted. */
2202
2203 char *
url_string(const struct url * url,enum url_auth_mode auth_mode)2204 url_string (const struct url *url, enum url_auth_mode auth_mode)
2205 {
2206 int size;
2207 char *result, *p;
2208 char *quoted_host, *quoted_user = NULL, *quoted_passwd = NULL;
2209
2210 int scheme_port = supported_schemes[url->scheme].default_port;
2211 const char *scheme_str = supported_schemes[url->scheme].leading_string;
2212 int fplen = full_path_length (url);
2213
2214 bool brackets_around_host;
2215
2216 assert (scheme_str != NULL);
2217
2218 /* Make sure the user name and password are quoted. */
2219 if (url->user)
2220 {
2221 if (auth_mode != URL_AUTH_HIDE)
2222 {
2223 quoted_user = url_escape_allow_passthrough (url->user);
2224 if (url->passwd)
2225 {
2226 if (auth_mode == URL_AUTH_HIDE_PASSWD)
2227 quoted_passwd = (char *) HIDDEN_PASSWORD;
2228 else
2229 quoted_passwd = url_escape_allow_passthrough (url->passwd);
2230 }
2231 }
2232 }
2233
2234 /* In the unlikely event that the host name contains non-printable
2235 characters, quote it for displaying to the user. */
2236 quoted_host = url_escape_allow_passthrough (url->host);
2237
2238 /* Undo the quoting of colons that URL escaping performs. IPv6
2239 addresses may legally contain colons, and in that case must be
2240 placed in square brackets. */
2241 if (quoted_host != url->host)
2242 unescape_single_char (quoted_host, ':');
2243 brackets_around_host = strchr (quoted_host, ':') != NULL;
2244
2245 size = (strlen (scheme_str)
2246 + strlen (quoted_host)
2247 + (brackets_around_host ? 2 : 0)
2248 + fplen
2249 + 1);
2250 if (url->port != scheme_port)
2251 size += 1 + numdigit (url->port);
2252 if (quoted_user)
2253 {
2254 size += 1 + strlen (quoted_user);
2255 if (quoted_passwd)
2256 size += 1 + strlen (quoted_passwd);
2257 }
2258
2259 p = result = xmalloc (size);
2260
2261 APPEND (p, scheme_str);
2262 if (quoted_user)
2263 {
2264 APPEND (p, quoted_user);
2265 if (quoted_passwd)
2266 {
2267 *p++ = ':';
2268 APPEND (p, quoted_passwd);
2269 }
2270 *p++ = '@';
2271 }
2272
2273 if (brackets_around_host)
2274 *p++ = '[';
2275 APPEND (p, quoted_host);
2276 if (brackets_around_host)
2277 *p++ = ']';
2278 if (url->port != scheme_port)
2279 {
2280 *p++ = ':';
2281 p = number_to_string (p, url->port);
2282 }
2283
2284 full_path_write (url, p);
2285 p += fplen;
2286 *p++ = '\0';
2287
2288 assert (p - result == size);
2289
2290 if (quoted_user && quoted_user != url->user)
2291 xfree (quoted_user);
2292 if (quoted_passwd && auth_mode == URL_AUTH_SHOW
2293 && quoted_passwd != url->passwd)
2294 xfree (quoted_passwd);
2295 if (quoted_host != url->host)
2296 xfree (quoted_host);
2297
2298 return result;
2299 }
2300
2301 /* Return true if scheme a is similar to scheme b.
2302
2303 Schemes are similar if they are equal. If SSL is supported, schemes
2304 are also similar if one is http (SCHEME_HTTP) and the other is https
2305 (SCHEME_HTTPS). */
2306 bool
schemes_are_similar_p(enum url_scheme a,enum url_scheme b)2307 schemes_are_similar_p (enum url_scheme a, enum url_scheme b)
2308 {
2309 if (a == b)
2310 return true;
2311 #ifdef HAVE_SSL
2312 if ((a == SCHEME_HTTP && b == SCHEME_HTTPS)
2313 || (a == SCHEME_HTTPS && b == SCHEME_HTTP))
2314 return true;
2315 #endif
2316 return false;
2317 }
2318
/* Decode the next logical character of the non-empty string STR into
   *C and return the number of bytes of STR it occupies.

   A "%XX" sequence with two hex digits that decodes to a character
   which is not URL_RESERVED_CHAR stores the decoded character and
   returns 3.  Every other case consumes a single byte and returns 1:
   a plain character is stored as is, while a '%' that is not followed
   by two hex digits, or that encodes a reserved character, is stored
   as a literal '%'.  The return value is therefore always 1 or 3.  */
static int
getchar_from_escaped_string (const char *str, char *c)
{
  char decoded;

  assert (str && *str);
  assert (c);

  if (str[0] != '%')
    {
      *c = str[0];
      return 1;
    }

  /* NUL is not a hex digit, so this test (with its short-circuit)
     also covers strings ending in "%" or "%X" without reading past
     the terminator; no separate end-of-string check is needed.  */
  if (!c_isxdigit (str[1]) || !c_isxdigit (str[2]))
    {
      *c = '%';
      return 1;
    }

  decoded = X2DIGITS_TO_NUM (str[1], str[2]);
  if (URL_RESERVED_CHAR (decoded))
    {
      /* Keep escapes of reserved characters in their escaped form:
         report the '%' itself and let the caller see the two hex
         digits as ordinary characters.  */
      *c = '%';
      return 1;
    }

  *c = decoded;
  return 3;
}
2356
/* Return true if the URL strings U1 and U2 are equal.

   Both strings are walked one logical character at a time using
   getchar_from_escaped_string, so a "%XX" escape on one side can match
   the character it decodes to on the other side.  Characters are
   compared after c_tolower, i.e. ignoring ASCII case.  The strings are
   equal only if both are exhausted at the same time.  */
bool
are_urls_equal (const char *u1, const char *u2)
{
  const char *left = u1;
  const char *right = u2;

  assert(u1 && u2);

  while (*left && *right)
    {
      char lc, rc;
      int lstep, rstep;

      lstep = getchar_from_escaped_string (left, &lc);
      if (!lstep)
        break;

      rstep = getchar_from_escaped_string (right, &rc);
      if (!rstep)
        break;

      if (c_tolower (lc) != c_tolower (rc))
        break;

      left += lstep;
      right += rstep;
    }

  return *left == 0 && *right == 0;
}
2379
2380 #ifdef TESTING
2381 /* Debugging and testing support for path_simplify. */
2382
2383 #if 0
2384 /* Debug: run path_simplify on PATH and return the result in a new
2385 string. Useful for calling from the debugger. */
2386 static char *
2387 ps (char *path)
2388 {
2389 char *copy = xstrdup (path);
2390 path_simplify (copy);
2391 return copy;
2392 }
2393 #endif
2394
2395 static const char *
run_test(const char * test,const char * expected_result,enum url_scheme scheme,bool expected_change)2396 run_test (const char *test, const char *expected_result, enum url_scheme scheme,
2397 bool expected_change)
2398 {
2399 char *test_copy = xstrdup (test);
2400 bool modified = path_simplify (scheme, test_copy);
2401
2402 if (0 != strcmp (test_copy, expected_result))
2403 {
2404 printf ("Failed path_simplify(\"%s\"): expected \"%s\", got \"%s\".\n",
2405 test, expected_result, test_copy);
2406 mu_assert ("", 0);
2407 }
2408 if (modified != expected_change)
2409 {
2410 if (expected_change)
2411 printf ("Expected modification with path_simplify(\"%s\").\n",
2412 test);
2413 else
2414 printf ("Expected no modification with path_simplify(\"%s\").\n",
2415 test);
2416 }
2417 xfree (test_copy);
2418 mu_assert ("", modified == expected_change);
2419 return NULL;
2420 }
2421
2422 const char *
test_path_simplify(void)2423 test_path_simplify (void)
2424 {
2425 static const struct {
2426 const char *test, *result;
2427 enum url_scheme scheme;
2428 bool should_modify;
2429 } tests[] = {
2430 { "", "", SCHEME_HTTP, false },
2431 { ".", "", SCHEME_HTTP, true },
2432 { "./", "", SCHEME_HTTP, true },
2433 { "..", "", SCHEME_HTTP, true },
2434 { "../", "", SCHEME_HTTP, true },
2435 { "..", "..", SCHEME_FTP, false },
2436 { "../", "../", SCHEME_FTP, false },
2437 { "foo", "foo", SCHEME_HTTP, false },
2438 { "foo/bar", "foo/bar", SCHEME_HTTP, false },
2439 { "foo///bar", "foo///bar", SCHEME_HTTP, false },
2440 { "foo/.", "foo/", SCHEME_HTTP, true },
2441 { "foo/./", "foo/", SCHEME_HTTP, true },
2442 { "foo./", "foo./", SCHEME_HTTP, false },
2443 { "foo/../bar", "bar", SCHEME_HTTP, true },
2444 { "foo/../bar/", "bar/", SCHEME_HTTP, true },
2445 { "foo/bar/..", "foo/", SCHEME_HTTP, true },
2446 { "foo/bar/../x", "foo/x", SCHEME_HTTP, true },
2447 { "foo/bar/../x/", "foo/x/", SCHEME_HTTP, true },
2448 { "foo/..", "", SCHEME_HTTP, true },
2449 { "foo/../..", "", SCHEME_HTTP, true },
2450 { "foo/../../..", "", SCHEME_HTTP, true },
2451 { "foo/../../bar/../../baz", "baz", SCHEME_HTTP, true },
2452 { "foo/../..", "..", SCHEME_FTP, true },
2453 { "foo/../../..", "../..", SCHEME_FTP, true },
2454 { "foo/../../bar/../../baz", "../../baz", SCHEME_FTP, true },
2455 { "a/b/../../c", "c", SCHEME_HTTP, true },
2456 { "./a/../b", "b", SCHEME_HTTP, true }
2457 };
2458 unsigned i;
2459
2460 for (i = 0; i < countof (tests); i++)
2461 {
2462 const char *message;
2463 const char *test = tests[i].test;
2464 const char *expected_result = tests[i].result;
2465 enum url_scheme scheme = tests[i].scheme;
2466 bool expected_change = tests[i].should_modify;
2467
2468 message = run_test (test, expected_result, scheme, expected_change);
2469 if (message) return message;
2470 }
2471 return NULL;
2472 }
2473
2474 const char *
test_append_uri_pathel(void)2475 test_append_uri_pathel(void)
2476 {
2477 unsigned i;
2478 static const struct {
2479 const char *original_url;
2480 const char *input;
2481 bool escaped;
2482 const char *expected_result;
2483 } test_array[] = {
2484 { "http://www.yoyodyne.com/path/", "somepage.html", false, "http://www.yoyodyne.com/path/somepage.html" },
2485 };
2486
2487 for (i = 0; i < countof(test_array); ++i)
2488 {
2489 struct growable dest;
2490 const char *p = test_array[i].input;
2491
2492 memset (&dest, 0, sizeof (dest));
2493
2494 append_string (test_array[i].original_url, &dest);
2495 append_uri_pathel (p, p + strlen(p), test_array[i].escaped, &dest);
2496
2497 mu_assert ("test_append_uri_pathel: wrong result",
2498 strcmp (dest.base, test_array[i].expected_result) == 0);
2499 xfree (dest.base);
2500 }
2501
2502 return NULL;
2503 }
2504
/* Table-driven unit test for are_urls_equal.  Each row holds two URL
   strings and the verdict are_urls_equal is expected to give for
   them.  Returns NULL when all rows agree; otherwise mu_assert is
   expected to return the failure message.  */
const char *
test_are_urls_equal(void)
{
  static const struct url_pair {
    const char *url1;
    const char *url2;
    bool expected_result;
  } test_array[] = {
    { "http://www.adomain.com/apath/", "http://www.adomain.com/apath/", true },
    { "http://www.adomain.com/apath/", "http://www.adomain.com/anotherpath/", false },
    { "http://www.adomain.com/apath/", "http://www.anotherdomain.com/path/", false },
    { "http://www.adomain.com/~path/", "http://www.adomain.com/%7epath/", true },
    { "http://www.adomain.com/longer-path/", "http://www.adomain.com/path/", false },
    { "http://www.adomain.com/path%2f", "http://www.adomain.com/path/", false },
  };
  const struct url_pair *tc;

  for (tc = test_array; tc < test_array + countof (test_array); tc++)
    {
      bool equal = are_urls_equal (tc->url1, tc->url2);

      mu_assert ("test_are_urls_equal: wrong result",
                 equal == tc->expected_result);
    }

  return NULL;
}
2530
2531 #endif /* TESTING */
2532
2533 /*
2534 * vim: et ts=2 sw=2
2535 */
2536