1 /* URL handling.
2    Copyright (C) 1996-2011, 2015, 2018-2021 Free Software Foundation,
3    Inc.
5 This file is part of GNU Wget.
7 GNU Wget is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3 of the License, or (at
10 your option) any later version.
12 GNU Wget is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with Wget.  If not, see <http://www.gnu.org/licenses/>.
20 Additional permission under GNU GPL version 3 section 7
22 If you modify this program, or any covered work, by linking or
23 combining it with the OpenSSL project's OpenSSL library (or a
24 modified version of that library), containing parts covered by the
25 terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
26 grants you additional permission to convey the resulting work.
27 Corresponding Source for a non-source form of such a combination
28 shall include the source code for the parts of OpenSSL used as well
29 as that of the covered work.  */
31 #include "wget.h"
33 #include <stdio.h>
34 #include <stdlib.h>
35 #include <string.h>
36 #include <unistd.h>
37 #include <errno.h>
38 #include <assert.h>
40 #include "utils.h"
41 #include "url.h"
42 #include "host.h"  /* for is_valid_ipv6_address */
43 #include "c-strcase.h"
45 #ifdef HAVE_ICONV
46 # include <iconv.h>
47 #endif
48 #include <langinfo.h>
50 #ifdef __VMS
51 #include "vms.h"
52 #endif /* def __VMS */
54 #ifdef TESTING
55 #include "../tests/unit-tests.h"
56 #endif
/* Per-scheme property bits; they are OR-ed together in the `flags'
   member of struct scheme_data.  */
enum {
  scm_disabled     = 1 << 0,    /* scheme is turned off, e.g. https
                                   when OpenSSL fails to init */
  scm_has_params   = 1 << 1,    /* scheme URLs may carry ;params */
  scm_has_query    = 1 << 2,    /* scheme URLs may carry ?query */
  scm_has_fragment = 1 << 3     /* scheme URLs may carry #fragment */
};
/* Static description of one URL scheme.  The entries of the
   supported_schemes table are of this type and are written as
   positional initializers, so the member order matters.  */
struct scheme_data
{
  /* Short name of the scheme, such as "http" or "ftp". */
  const char *name;
  /* Leading string that identifies the scheme, such as "https://".
     url_scheme compares it case-insensitively with the start of a
     URL.  */
  const char *leading_string;
  /* Default port of the scheme when none is specified. */
  int default_port;
  /* Bitwise OR of the scm_* flags. */
  int flags;
};
/* Supported schemes.

   url_scheme returns the index of the matching entry cast to enum
   url_scheme, so the order of the entries has to agree with the order
   of that enum's constants (the enum is declared outside this file --
   presumably in url.h; confirm before reordering).  The table is not
   const because scheme_disable sets scm_disabled in `flags'.  */
static struct scheme_data supported_schemes[] =
{
  { "http",     "http://",  DEFAULT_HTTP_PORT,  scm_has_query|scm_has_fragment },
#ifdef HAVE_SSL
  { "https",    "https://", DEFAULT_HTTPS_PORT, scm_has_query|scm_has_fragment },
#endif
  { "ftp",      "ftp://",   DEFAULT_FTP_PORT,   scm_has_params|scm_has_fragment },
#ifdef HAVE_SSL
  /*
   * Explicit FTPS uses the same port as FTP.
   * Implicit FTPS has its own port (990), but it is disabled by default.
   */
  { "ftps",     "ftps://",  DEFAULT_FTP_PORT,  scm_has_params|scm_has_fragment },
#endif

  /* Terminating entry: url_scheme stops scanning at the first NULL
     leading_string.  */
  { NULL,       NULL,       -1,                 0 }
};
/* Forward declarations: */

static bool path_simplify (enum url_scheme scheme, char *path);
101 /* Support for escaping and unescaping of URL strings.  */
103 /* Table of "reserved" and "unsafe" characters.  Those terms are
104    rfc1738-speak, as such largely obsoleted by rfc2396 and later
105    specs, but the general idea remains.
107    A reserved character is the one that you can't decode without
108    changing the meaning of the URL.  For example, you can't decode
109    "/foo/%2f/bar" into "/foo///bar" because the number and contents of
110    path components is different.  Non-reserved characters can be
111    changed, so "/foo/%78/bar" is safe to change to "/foo/x/bar".  The
112    unsafe characters are loosely based on rfc1738, plus "$" and ",",
113    as recommended by rfc2396, and minus "~", which is very frequently
114    used (and sometimes unrecognized as %7E by broken servers).
116    An unsafe character is the one that should be encoded when URLs are
117    placed in foreign environments.  E.g. space and newline are unsafe
118    in HTTP contexts because HTTP uses them as separator and line
119    terminator, so they must be encoded to %20 and %0A respectively.
120    "*" is unsafe in shell context, etc.
122    We determine whether a character is unsafe through static table
123    lookup.  This code assumes ASCII character set and 8-bit chars.  */
/* Classification bits kept for every byte value in urlchr_table.  */
enum {
  /* The rfc1738 reserved set, extended with "$" and ",".  */
  urlchr_reserved = 1 << 0,

  /* The rfc1738 unsafe set, extended with the non-printables.  */
  urlchr_unsafe   = 1 << 1
};

/* Nonzero if the table entry of CH has any of the bits in BITS set.
   CH is converted to unsigned char before indexing, so a plain
   (possibly signed) char may be passed.  */
#define urlchr_test(ch, bits) (urlchr_table[(unsigned char)(ch)] & (bits))
#define URL_RESERVED_CHAR(ch) urlchr_test(ch, urlchr_reserved)
#define URL_UNSAFE_CHAR(ch) urlchr_test(ch, urlchr_unsafe)

/* One-letter abbreviations, valid only while the table is defined: */
#define R  urlchr_reserved
#define U  urlchr_unsafe
#define RU (R|U)

/* Indexed by byte value.  The first sixteen rows cover ASCII, eight
   characters per row; the remaining rows cover 0x80..0xFF, all of
   which are unsafe.  */
static const unsigned char urlchr_table[256] =
{
  /* 0x00 - 0x1F: control characters */
  U,  U,  U,  U,   U,  U,  U,  U,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
  U,  U,  U,  U,   U,  U,  U,  U,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
  U,  U,  U,  U,   U,  U,  U,  U,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
  U,  U,  U,  U,   U,  U,  U,  U,   /* CAN EM  SUB ESC  FS  GS  RS  US  */

  /* 0x20 - 0x3F: punctuation and digits */
  U,  0,  U, RU,   R,  U,  R,  0,   /* SP  !   "   #    $   %   &   '   */
  0,  0,  0,  R,   R,  0,  0,  R,   /* (   )   *   +    ,   -   .   /   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
  0,  0, RU,  R,   U,  R,  U,  R,   /* 8   9   :   ;    <   =   >   ?   */

  /* 0x40 - 0x5F: upper-case letters */
 RU,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
  0,  0,  0, RU,   U, RU,  U,  0,   /* X   Y   Z   [    \   ]   ^   _   */

  /* 0x60 - 0x7F: lower-case letters */
  U,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
  0,  0,  0,  U,   U,  U,  0,  U,   /* x   y   z   {    |   }   ~   DEL */

  /* 0x80 - 0xBF */
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,

  /* 0xC0 - 0xFF */
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
};
#undef R
#undef U
#undef RU
/* Worker behind the url_unescape* functions: decode the %HH escapes
   of S in place.

   An escape is left untouched when '%' is not followed by two hex
   digits, when the decoded character has one of the MASK bits set in
   urlchr_table, or when it decodes to NUL.  The output never grows,
   so the write position can trail the read position inside the same
   buffer.  */
static void
url_unescape_1 (char *s, unsigned char mask)
{
  const unsigned char *src = (const unsigned char *) s;   /* read position */
  unsigned char *dst = (unsigned char *) s;               /* write position */

  while (*src)
    {
      unsigned char out = *src;

      /* c_isxdigit rejects NUL, and && short-circuits, so this never
         reads past the terminator.  */
      if (out == '%' && c_isxdigit (src[1]) && c_isxdigit (src[2]))
        {
          unsigned char decoded = X2DIGITS_TO_NUM (src[1], src[2]);

          /* Keep %00 escaped: a raw NUL would effectively truncate
             the C string.  Characters selected by MASK stay escaped
             as well.  */
          if (decoded != '\0' && !urlchr_test (decoded, mask))
            {
              out = decoded;
              src += 2;         /* swallow the two hex digits */
            }
        }

      *dst++ = out;
      src++;
    }
  *dst = '\0';
}
/* Decode the %-escapes of the string S in place.

   Every "%HH" sequence, HH being two hexadecimal digits, is replaced
   by the character with that value.  A '%' that is not followed by
   two hex digits is kept literally, and so is "%00".

   S itself is overwritten; copy it first if the escaped form is
   still needed.  */
void
url_unescape (char *s)
{
  /* An empty mask shields no character class from decoding.  */
  const unsigned char shield_none = 0;

  url_unescape_1 (s, shield_none);
}
222 /* URL-unescape the string S.
224    This functions behaves identically as url_unescape(), but does not
225    convert characters from "reserved". In other words, it only converts
226    "unsafe" characters.  */
227 void
url_unescape_except_reserved(char * s)228 url_unescape_except_reserved (char *s)
229 {
230   url_unescape_1 (s, urlchr_reserved);
231 }
/* The core of url_escape_* functions.  Escapes the characters that
   match the provided mask in urlchr_table, replacing each of them
   with a three-character %XX sequence.

   If ALLOW_PASSTHROUGH is true, a string with no characters matching
   MASK is returned unchanged (the very same pointer S, with const
   cast away).  If ALLOW_PASSTHROUGH is false, a freshly allocated
   string is returned in all cases.

   Lengths are kept in size_t: with `int' the running count could
   overflow for very long inputs and lead to an undersized buffer.  */

static char *
url_escape_1 (const char *s, unsigned char mask, bool allow_passthrough)
{
  const char *p1;
  char *p2, *newstr;
  size_t newlen;
  size_t addition = 0;

  /* First pass: count how much longer the result will be.  */
  for (p1 = s; *p1; p1++)
    if (urlchr_test (*p1, mask))
      addition += 2;            /* Two more characters (hex digits) */

  if (!addition)
    return allow_passthrough ? (char *)s : xstrdup (s);

  /* p1 now points at the terminator, so p1 - s is the input length.  */
  newlen = (size_t) (p1 - s) + addition;
  newstr = xmalloc (newlen + 1);

  /* Second pass: copy, expanding the matching characters.  */
  p1 = s;
  p2 = newstr;
  while (*p1)
    {
      /* Quote the characters that match the test mask. */
      if (urlchr_test (*p1, mask))
        {
          unsigned char c = *p1++;
          *p2++ = '%';
          *p2++ = XNUM_TO_DIGIT (c >> 4);
          *p2++ = XNUM_TO_DIGIT (c & 0xf);
        }
      else
        *p2++ = *p1++;
    }
  assert ((size_t) (p2 - newstr) == newlen);
  *p2 = '\0';

  return newstr;
}
279 /* URL-escape the unsafe characters (see urlchr_table) in a given
280    string, returning a freshly allocated string.  */
282 char *
url_escape(const char * s)283 url_escape (const char *s)
284 {
285   return url_escape_1 (s, urlchr_unsafe, false);
286 }
288 /* URL-escape the unsafe and reserved characters (see urlchr_table) in
289    a given string, returning a freshly allocated string.  */
291 char *
url_escape_unsafe_and_reserved(const char * s)292 url_escape_unsafe_and_reserved (const char *s)
293 {
294   return url_escape_1 (s, urlchr_unsafe|urlchr_reserved, false);
295 }
297 /* URL-escape the unsafe characters (see urlchr_table) in a given
298    string.  If no characters are unsafe, S is returned.  */
300 static char *
url_escape_allow_passthrough(const char * s)301 url_escape_allow_passthrough (const char *s)
302 {
303   return url_escape_1 (s, urlchr_unsafe, true);
304 }
/* Tell whether the character P points at has to be written as %XX.
   A pointer is taken instead of the character itself because the
   answer for '%' depends on the two characters that follow it.

   Return true if the char should be escaped, false otherwise.  */

static inline bool
char_needs_escaping (const char *p)
{
  if (*p != '%')
    /* Ordinary character: escape it when it is unsafe, unless it is
       reserved at the same time.  */
    return URL_UNSAFE_CHAR (*p) && !URL_RESERVED_CHAR (*p);

  /* A '%' that opens a well-formed %XX sequence is kept as it is; a
     garbled one must itself be encoded.  c_isxdigit is false for
     NUL, so the test stops at the end of the string.  */
  return !(c_isxdigit (p[1]) && c_isxdigit (p[2]));
}
329 /* Translate a %-escaped (but possibly non-conformant) input string S
330    into a %-escaped (and conformant) output string.  If no characters
331    are encoded or decoded, return the same string S; otherwise, return
332    a freshly allocated string with the new contents.
334    After a URL has been run through this function, the protocols that
335    use `%' as the quote character can use the resulting string as-is,
336    while those that don't can use url_unescape to get to the intended
337    data.  This function is stable: once the input is transformed,
338    further transformations of the result yield the same output.
340    Let's discuss why this function is needed.
342    Imagine Wget is asked to retrieve `http://abc.xyz/abc def'.  Since
343    a raw space character would mess up the HTTP request, it needs to
344    be quoted, like this:
346        GET /abc%20def HTTP/1.0
348    It would appear that the unsafe chars need to be quoted, for
349    example with url_escape.  But what if we're requested to download
350    `abc%20def'?  url_escape transforms "%" to "%25", which would leave
351    us with `abc%2520def'.  This is incorrect -- since %-escapes are
352    part of URL syntax, "%20" is the correct way to denote a literal
353    space on the Wget command line.  This leads to the conclusion that
354    in that case Wget should not call url_escape, but leave the `%20'
355    as is.  This is clearly contradictory, but it only gets worse.
357    What if the requested URI is `abc%20 def'?  If we call url_escape,
358    we end up with `/abc%2520%20def', which is almost certainly not
359    intended.  If we don't call url_escape, we are left with the
360    embedded space and cannot complete the request.  What the user
361    meant was for Wget to request `/abc%20%20def', and this is where
362    reencode_escapes kicks in.
364    Wget used to solve this by first decoding %-quotes, and then
365    encoding all the "unsafe" characters found in the resulting string.
366    This was wrong because it didn't preserve certain URL special
367    (reserved) characters.  For instance, URI containing "a%2B+b" (0x2b
368    == '+') would get translated to "a%2B%2Bb" or "a++b" depending on
369    whether we considered `+' reserved (it is).  One of these results
370    is inevitable because by the second step we would lose information
371    on whether the `+' was originally encoded or not.  Both results
372    were wrong because in CGI parameters + means space, while %2B means
373    literal plus.  reencode_escapes correctly translates the above to
374    "a%2B+b", i.e. returns the original string.
376    This function uses a modified version of the algorithm originally
377    proposed by Anon Sricharoenchai:
379    * Encode all "unsafe" characters, except those that are also
380      "reserved", to %XX.  See urlchr_table for which characters are
381      unsafe and reserved.
383    * Encode the "%" characters not followed by two hex digits to
384      "%25".
386    * Pass through all other characters and %XX escapes as-is.  (Up to
387      Wget 1.10 this decoded %XX escapes corresponding to "safe"
388      characters, but that was obtrusive and broke some servers.)
390    Anon's test case:
392    "http://abc.xyz/%20%3F%%36%31%25aa% a?a=%61+a%2Ba&b=b%26c%3Dc"
393    ->
394    "http://abc.xyz/%20%3F%25%36%31%25aa%25%20a?a=%61+a%2Ba&b=b%26c%3Dc"
396    Simpler test cases:
398    "foo bar"         -> "foo%20bar"
399    "foo%20bar"       -> "foo%20bar"
400    "foo %20bar"      -> "foo%20%20bar"
401    "foo%%20bar"      -> "foo%25%20bar"       (0x25 == '%')
402    "foo%25%20bar"    -> "foo%25%20bar"
403    "foo%2%20bar"     -> "foo%252%20bar"
404    "foo+bar"         -> "foo+bar"            (plus is reserved!)
405    "foo%2b+bar"      -> "foo%2b+bar"  */
static char *
reencode_escapes (const char *s)
{
  const char *p1;
  char *newstr, *p2;
  /* Lengths and counts are size_t: an `int' count could overflow for
     very long inputs and lead to an undersized buffer.  */
  size_t oldlen, newlen;

  size_t encode_count = 0;

  /* First pass: inspect the string to see if there's anything to do,
     and to calculate the new length.  */
  for (p1 = s; *p1; p1++)
    if (char_needs_escaping (p1))
      ++encode_count;

  if (!encode_count)
    /* The string is good as it is.  The caller gets S itself back and
       must compare pointers before freeing the result.  */
    return (char *) s;          /* C const model sucks. */

  /* p1 stopped at the terminator, so p1 - s is the input length.  */
  oldlen = (size_t) (p1 - s);
  /* Each encoding adds two characters (hex digits).  */
  newlen = oldlen + 2 * encode_count;
  newstr = xmalloc (newlen + 1);

  /* Second pass: copy the string to the destination address, encoding
     chars when needed.  */
  p1 = s;
  p2 = newstr;

  while (*p1)
    if (char_needs_escaping (p1))
      {
        unsigned char c = *p1++;
        *p2++ = '%';
        *p2++ = XNUM_TO_DIGIT (c >> 4);
        *p2++ = XNUM_TO_DIGIT (c & 0xf);
      }
    else
      *p2++ = *p1++;

  *p2 = '\0';
  assert ((size_t) (p2 - newstr) == newlen);
  return newstr;
}
452 /* Returns the scheme type if the scheme is supported, or
453    SCHEME_INVALID if not.  */
455 enum url_scheme
url_scheme(const char * url)456 url_scheme (const char *url)
457 {
458   int i;
460   for (i = 0; supported_schemes[i].leading_string; i++)
461     if (0 == c_strncasecmp (url, supported_schemes[i].leading_string,
462                           strlen (supported_schemes[i].leading_string)))
463       {
464         if (!(supported_schemes[i].flags & scm_disabled))
465           return (enum url_scheme) i;
466         else
467           return SCHEME_INVALID;
468       }
470   return SCHEME_INVALID;
471 }
#define SCHEME_CHAR(ch) (c_isalnum (ch) || (ch) == '-' || (ch) == '+')

/* Return true if URL begins with something shaped like a scheme,
   false otherwise.  The test is purely syntactic: URL has to start
   with [-+a-zA-Z0-9]+: ; whether the scheme is one Wget supports is
   not checked here.  */

bool
url_has_scheme (const char *url)
{
  const char *p = url;

  /* At least one scheme char is required.  SCHEME_CHAR is false for
     NUL, which also takes care of the empty string.  */
  if (!SCHEME_CHAR (*p))
    return false;

  /* Skip that char and any further scheme chars. */
  do
    ++p;
  while (SCHEME_CHAR (*p));

  /* Whatever stopped the scan has to be the ':' terminator. */
  return *p == ':';
}
495 bool
url_valid_scheme(const char * url)496 url_valid_scheme (const char *url)
497 {
498   enum url_scheme scheme = url_scheme (url);
499   return scheme != SCHEME_INVALID;
500 }
502 int
scheme_default_port(enum url_scheme scheme)503 scheme_default_port (enum url_scheme scheme)
504 {
505   return supported_schemes[scheme].default_port;
506 }
508 void
scheme_disable(enum url_scheme scheme)509 scheme_disable (enum url_scheme scheme)
510 {
511   supported_schemes[scheme].flags |= scm_disabled;
512 }
514 const char *
scheme_leading_string(enum url_scheme scheme)515 scheme_leading_string (enum url_scheme scheme)
516 {
517   return supported_schemes[scheme].leading_string;
518 }
/* Step over the "user[:password]@" part of a URL, if there is one.
   URL must be the portion that follows the scheme, *not* the
   complete URL.

   Return a pointer just past the '@', or URL itself when no
   credentials are found.  */

static const char *
url_skip_credentials (const char *url)
{
  /* An '@' only introduces credentials if it shows up before any of
     the terminators '/', '?', '#' and ';'.  */
  const char *stop = strpbrk (url, "@/?#;");

  return (stop && *stop == '@') ? stop + 1 : url;
}
/* Split the credentials found in [BEG, END) into user name and
   password.  The region comes from a URL; the extracted parts are
   URL-unescaped before they are stored.

   On success *USER receives a newly allocated user name, *PASSWD a
   newly allocated password or NULL when the region has no ':', and
   true is returned.  An empty user name makes the function return
   false without storing anything.  */

static bool
parse_credentials (const char *beg, const char *end, char **user, char **passwd)
{
  const char *sep;
  const char *user_end;

  /* Nothing at all, or a region that starts with ':', means that the
     user name is empty.  */
  if (beg == end)
    return false;
  sep = memchr (beg, ':', end - beg);
  if (sep == beg)
    return false;

  if (sep == NULL)
    {
      /* No password given: the whole region is the user name. */
      *passwd = NULL;
      user_end = end;
    }
  else
    {
      *passwd = strdupdelim (sep + 1, end);
      url_unescape (*passwd);
      user_end = sep;
    }

  *user = strdupdelim (beg, user_end);
  url_unescape (*user);
  return true;
}
569 /* Used by main.c: detect URLs written using the "shorthand" URL forms
570    originally popularized by Netscape and NcFTP.  HTTP shorthands look
571    like this:
573    www.foo.com[:port]/dir/file   -> http://www.foo.com[:port]/dir/file
574    www.foo.com[:port]            -> http://www.foo.com[:port]
576    FTP shorthands look like this:
578    foo.bar.com:dir/file          -> ftp://foo.bar.com/dir/file
579    foo.bar.com:/absdir/file      -> ftp://foo.bar.com//absdir/file
581    If the URL needs not or cannot be rewritten, return NULL.  */
583 char *
rewrite_shorthand_url(const char * url)584 rewrite_shorthand_url (const char *url)
585 {
586   const char *p;
587   char *ret;
589   if (url_scheme (url) != SCHEME_INVALID)
590     return NULL;
592   /* Look for a ':' or '/'.  The former signifies NcFTP syntax, the
593      latter Netscape.  */
594   p = strpbrk (url, ":/");
595   if (p == url)
596     return NULL;
598   /* If we're looking at "://", it means the URL uses a scheme we
599      don't support, which may include "https" when compiled without
600      SSL support.  Don't bogusly rewrite such URLs.  */
601   if (p && p[0] == ':' && p[1] == '/' && p[2] == '/')
602     return NULL;
604   if (p && *p == ':')
605     {
606       /* Colon indicates ftp, as in foo.bar.com:path.  Check for
607          special case of http port number ("localhost:10000").  */
608       int digits = strspn (p + 1, "0123456789");
609       if (digits && (p[1 + digits] == '/' || p[1 + digits] == '\0'))
610         goto http;
612       /* Turn "foo.bar.com:path" to "ftp://foo.bar.com/path". */
613       if ((ret = aprintf ("ftp://%s", url)) != NULL)
614         ret[6 + (p - url)] = '/';
615     }
616   else
617     {
618     http:
619       /* Just prepend "http://" to URL. */
620       ret = aprintf ("http://%s", url);
621     }
622   return ret;
623 }
static void split_path (const char *path, char **dir, char **file);
/* A strpbrk that never returns NULL: when S contains none of the
   characters in ACCEPT, the result points at the terminating zero of
   S (end-of-string aka "eos") instead.  */

static inline char *
strpbrk_or_eos (const char *s, const char *accept)
{
  /* strcspn measures the leading run of S that is free of ACCEPT
     characters; that run ends at the first match or, failing that,
     at the terminator.  */
  return (char *) s + strcspn (s, accept);
}
/* Convert the upper-case characters of STR, as recognized by
   c_isupper, to lower case in place.  Return true if at least one
   character was changed, false if STR was left as it was.  */

static bool
lowercase_str (char *str)
{
  bool any_changed = false;
  char *p;

  for (p = str; *p != '\0'; p++)
    {
      if (!c_isupper (*p))
        continue;
      *p = c_tolower (*p);
      any_changed = true;
    }

  return any_changed;
}
656 static const char *
init_seps(enum url_scheme scheme)657 init_seps (enum url_scheme scheme)
658 {
659   static char seps[8] = ":/";
660   char *p = seps + 2;
661   int flags = supported_schemes[scheme].flags;
663   if (flags & scm_has_params)
664     *p++ = ';';
665   if (flags & scm_has_query)
666     *p++ = '?';
667   if (flags & scm_has_fragment)
668     *p++ = '#';
669   *p = '\0';
670   return seps;
671 }
673 static const char *parse_errors[] = {
674 #define PE_NO_ERROR                     0
675   N_("No error"),
676 #define PE_UNSUPPORTED_SCHEME           1
677   N_("Unsupported scheme %s"), /* support for format token only here */
678 #define PE_MISSING_SCHEME               2
679   N_("Scheme missing"),
680 #define PE_INVALID_HOST_NAME            3
681   N_("Invalid host name"),
682 #define PE_BAD_PORT_NUMBER              4
683   N_("Bad port number"),
684 #define PE_INVALID_USER_NAME            5
685   N_("Invalid user name"),
687   N_("Unterminated IPv6 numeric address"),
688 #define PE_IPV6_NOT_SUPPORTED           7
689   N_("IPv6 addresses not supported"),
690 #define PE_INVALID_IPV6_ADDRESS         8
691   N_("Invalid IPv6 numeric address")
692 };
/* Parse a URL.

   URL is the string to parse; it is not modified.  ERROR, if not
   NULL, receives one of the PE_* codes when parsing fails (turn it
   into a message with url_error).  IRI may be NULL; when it is given
   and its utf8_encode flag is set, the URL is first converted with
   remote_to_utf8.  PERCENT_ENCODE requests that the URL be run
   through reencode_escapes before it is split up.

   Return a new struct url if successful, NULL on error.  In case of
   error, and if ERROR is not NULL, also set *ERROR to the appropriate
   error code. */
struct url *
url_parse (const char *url, int *error, struct iri *iri, bool percent_encode)
{
  struct url *u;
  const char *p;
  bool path_modified, host_modified;

  enum url_scheme scheme;
  const char *seps;

  /* Begin/end pointers of the URL components.  They point into
     URL_ENCODED, not into copies.  */
  const char *uname_b,     *uname_e;
  const char *host_b,      *host_e;
  const char *path_b,      *path_e;
  const char *params_b,    *params_e;
  const char *query_b,     *query_e;
  const char *fragment_b,  *fragment_e;

  int port;
  char *user = NULL, *passwd = NULL;

  /* The string that actually gets parsed: either URL itself or a
     re-encoded copy, which this function then owns.  It stays NULL
     until the scheme has been validated so that the error path knows
     there is nothing to free.  */
  const char *url_encoded = NULL;

  int error_code;

  scheme = url_scheme (url);
  if (scheme == SCHEME_INVALID)
    {
      /* Tell "scheme we don't handle" apart from "no scheme at all". */
      if (url_has_scheme (url))
        error_code = PE_UNSUPPORTED_SCHEME;
      else
        error_code = PE_MISSING_SCHEME;
      goto error;
    }

  url_encoded = url;

  if (iri && iri->utf8_encode)
    {
      char *new_url = NULL;

      /* The outcome of the conversion is stored back into the flag. */
      iri->utf8_encode = remote_to_utf8 (iri, iri->orig_url ? iri->orig_url : url, &new_url);
      if (!iri->utf8_encode)
        new_url = NULL;
      else
        {
          xfree (iri->orig_url);
          iri->orig_url = xstrdup (url);
          url_encoded = reencode_escapes (new_url);
          /* reencode_escapes returns its argument when nothing had to
             be changed; only a distinct result makes NEW_URL
             redundant.  */
          if (url_encoded != new_url)
            xfree (new_url);
          /* The escapes have just been re-encoded; don't do it again
             below.  */
          percent_encode = false;
        }
    }

  if (percent_encode)
    url_encoded = reencode_escapes (url);

  /* Step over the scheme prefix, then over the optional credentials. */
  p = url_encoded;
  p += strlen (supported_schemes[scheme].leading_string);
  uname_b = p;
  p = url_skip_credentials (p);
  uname_e = p;

  /* scheme://user:pass@host[:port]... */
  /*                    ^              */

  /* We attempt to break down the URL into the components path,
     params, query, and fragment.  They are ordered like this:

       scheme://host[:port][/path][;params][?query][#fragment]  */

  path_b     = path_e     = NULL;
  params_b   = params_e   = NULL;
  query_b    = query_e    = NULL;
  fragment_b = fragment_e = NULL;

  /* Initialize separators for optional parts of URL, depending on the
     scheme.  For example, FTP has params, and HTTP and HTTPS have
     query string and fragment. */
  seps = init_seps (scheme);

  host_b = p;

  if (*p == '[')
    {
      /* Handle IPv6 address inside square brackets.  Ideally we'd
         just look for the terminating ']', but rfc2732 mandates
         rejecting invalid IPv6 addresses.  */

      /* The address begins after '['. */
      host_b = p + 1;
      host_e = strchr (host_b, ']');

      if (!host_e)
        {
          error_code = PE_UNTERMINATED_IPV6_ADDRESS;
          goto error;
        }

#ifdef ENABLE_IPV6
      /* Check if the IPv6 address is valid. */
      if (!is_valid_ipv6_address(host_b, host_e))
        {
          error_code = PE_INVALID_IPV6_ADDRESS;
          goto error;
        }

      /* Continue parsing after the closing ']'. */
      p = host_e + 1;
#else
      error_code = PE_IPV6_NOT_SUPPORTED;
      goto error;
#endif

      /* The closing bracket must be followed by a separator or by the
         null char.  (strchr also matches the terminator of SEPS, so
         a null *P passes the test.)  */
      /* http://[::1]... */
      /*             ^   */
      if (!strchr (seps, *p))
        {
          /* Trailing garbage after []-delimited IPv6 address. */
          error_code = PE_INVALID_HOST_NAME;
          goto error;
        }
    }
  else
    {
      /* Ordinary host name: it extends to the first separator. */
      p = strpbrk_or_eos (p, seps);
      host_e = p;
    }
  /* From here on SEPS is narrowed step by step: each component may
     only be ended by the separators that can still follow it.  */
  ++seps;                       /* advance to '/' */

  if (host_b == host_e)
    {
      error_code = PE_INVALID_HOST_NAME;
      goto error;
    }

  port = scheme_default_port (scheme);
  if (*p == ':')
    {
      const char *port_b, *port_e, *pp;

      /* scheme://host:port/tralala */
      /*              ^             */
      ++p;
      port_b = p;
      p = strpbrk_or_eos (p, seps);
      port_e = p;

      /* Allow empty port, as per rfc2396.  In that case PORT keeps
         the scheme default.  */
      if (port_b != port_e)
        for (port = 0, pp = port_b; pp < port_e; pp++)
          {
            if (!c_isdigit (*pp))
              {
                /* http://host:12randomgarbage/blah */
                /*               ^                  */
                error_code = PE_BAD_PORT_NUMBER;
                goto error;
              }
            port = 10 * port + (*pp - '0');
            /* Check for too large port numbers here, before we have
               a chance to overflow on bogus port values.  */
            if (port > 0xffff)
              {
                error_code = PE_BAD_PORT_NUMBER;
                goto error;
              }
          }
    }
  /* Advance to the first separator *after* '/' (either ';' or '?',
     depending on the scheme).  */
  ++seps;

  /* Get the optional parts of URL, each part being delimited by
     current location and the position of the next separator.  SEPS
     is advanced on every use, whether or not the part was present,
     which is why the uses below are guarded by the same scheme flags
     that init_seps consulted.  */
#define GET_URL_PART(sepchar, var) do {                         \
  if (*p == sepchar)                                            \
    var##_b = ++p, var##_e = p = strpbrk_or_eos (p, seps);      \
  ++seps;                                                       \
} while (0)

  GET_URL_PART ('/', path);
  if (supported_schemes[scheme].flags & scm_has_params)
    GET_URL_PART (';', params);
  if (supported_schemes[scheme].flags & scm_has_query)
    GET_URL_PART ('?', query);
  if (supported_schemes[scheme].flags & scm_has_fragment)
    GET_URL_PART ('#', fragment);

#undef GET_URL_PART
  /* Every character of the URL must have been consumed by now. */
  assert (*p == 0);

  if (uname_b != uname_e)
    {
      /* http://user:pass@host */
      /*        ^         ^    */
      /*     uname_b   uname_e */
      /* UNAME_E points past the '@', hence the - 1. */
      if (!parse_credentials (uname_b, uname_e - 1, &user, &passwd))
        {
          error_code = PE_INVALID_USER_NAME;
          goto error;
        }
    }

  /* The syntax checks are done; build the result.  USER and PASSWD
     are handed over to U.  */
  u = xnew0 (struct url);
  u->scheme = scheme;
  u->host   = strdupdelim (host_b, host_e);
  u->port   = port;
  u->user   = user;
  u->passwd = passwd;

  u->path = strdupdelim (path_b, path_e);
  path_modified = path_simplify (scheme, u->path);
  split_path (u->path, &u->dir, &u->file);

  host_modified = lowercase_str (u->host);

  /* Decode %HH sequences in host name.  This is important not so much
     to support %HH sequences in host names (which other browsers
     don't), but to support binary characters (which will have been
     converted to %HH by reencode_escapes).  */
  if (strchr (u->host, '%'))
    {
      url_unescape (u->host);
      host_modified = true;

      /* check for invalid control characters in host name */
      for (p = u->host; *p; p++)
        {
          if (c_iscntrl(*p))
            {
              /* U already holds USER and PASSWD, so url_free is left
                 to dispose of them along with the rest.  */
              url_free(u);
              error_code = PE_INVALID_HOST_NAME;
              goto error;
            }
        }

      /* Apply IDNA regardless of iri->utf8_encode status */
      if (opt.enable_iri && iri)
        {
          char *new = idn_encode (iri, u->host);
          if (new)
            {
              xfree (u->host);
              u->host = new;
              host_modified = true;
            }
        }
    }

  if (params_b)
    u->params = strdupdelim (params_b, params_e);
  if (query_b)
    u->query = strdupdelim (query_b, query_e);
  if (fragment_b)
    u->fragment = strdupdelim (fragment_b, fragment_e);

  if (opt.enable_iri || path_modified || u->fragment || host_modified || path_b == path_e)
    {
      /* If we suspect that a transformation has rendered what
         url_string might return different from URL_ENCODED, rebuild
         u->url using url_string.  */
      u->url = url_string (u, URL_AUTH_SHOW);

      /* URL_ENCODED is no longer needed; free it if it is our own
         copy rather than the caller's string.  */
      if (url_encoded != url)
        xfree (url_encoded);
    }
  else
    {
      /* Nothing changed, so the parsed string can serve as u->url:
         duplicate the caller's string, or hand over our own copy.  */
      if (url_encoded == url)
        u->url = xstrdup (url);
      else
        u->url = (char *) url_encoded;
    }

  return u;

 error:
  /* Cleanup in case of error: */
  if (url_encoded && url_encoded != url)
    xfree (url_encoded);

  /* Transmit the error code to the caller, if the caller wants to
     know.  */
  if (error)
    *error = error_code;
  return NULL;
}
990 /* Return the error message string from ERROR_CODE, which should have
991    been retrieved from url_parse.  The error message is translated.  */
993 char *
url_error(const char * url,int error_code)994 url_error (const char *url, int error_code)
995 {
996   assert (error_code >= 0 && ((size_t) error_code) < countof (parse_errors));
998   if (error_code == PE_UNSUPPORTED_SCHEME)
999     {
1000       char *error, *p;
1001       char *scheme = xstrdup (url);
1002       assert (url_has_scheme (url));
1004       if ((p = strchr (scheme, ':')))
1005         *p = '\0';
1006       if (!c_strcasecmp (scheme, "https"))
1007         error = aprintf (_("HTTPS support not compiled in"));
1008       else
1009         error = aprintf (_(parse_errors[error_code]), quote (scheme));
1010       xfree (scheme);
1012       return error;
1013     }
1014   else
1015     return xstrdup (_(parse_errors[error_code]));
1016 }
/* Break the URL-escaped PATH at its last slash.  Everything before
   that slash is stored in *DIR, everything after it in *FILE; if
   PATH has no slash, *DIR becomes the empty string and *FILE the
   whole of PATH.  Both pieces are then URL-unescaped in place.

   PATH                 DIR           FILE
   "foo/bar/baz"        "foo/bar"     "baz"
   "foo/bar/"           "foo/bar"     ""
   "foo"                ""            "foo"
   "foo/bar/baz%2fqux"  "foo/bar"     "baz/qux" (!)

   *DIR and *FILE are freshly allocated.  */

static void
split_path (const char *path, char **dir, char **file)
{
  const char *sep = strrchr (path, '/');

  if (sep)
    {
      *dir = strdupdelim (path, sep);
      *file = xstrdup (sep + 1);
    }
  else
    {
      *dir = xstrdup ("");
      *file = xstrdup (path);
    }

  /* Unescaping happens only after the split, so an escaped slash
     (%2f) never acts as a separator.  */
  url_unescape (*dir);
  url_unescape (*file);
}
1051 /* Note: URL's "full path" is the path with the query string and
1052    params appended.  The "fragment" (#foo) is intentionally ignored,
1053    but that might be changed.  For example, if the original URL was
1054    "http://host:port/foo/bar/baz;bullshit?querystring#uselessfragment",
1055    the full path will be "/foo/bar/baz;bullshit?querystring".  */
1057 /* Return the length of the full path, without the terminating
1058    zero.  */
1060 static int
full_path_length(const struct url * url)1061 full_path_length (const struct url *url)
1062 {
1063   int len = 0;
1065 #define FROB(el) if (url->el) len += 1 + strlen (url->el)
1067   FROB (path);
1068   FROB (params);
1069   FROB (query);
1071 #undef FROB
1073   return len;
1074 }
1076 /* Write out the full path. */
1078 static void
full_path_write(const struct url * url,char * where)1079 full_path_write (const struct url *url, char *where)
1080 {
1081 #define FROB(el, chr) do {                      \
1082   char *f_el = url->el;                         \
1083   if (f_el) {                                   \
1084     int l = strlen (f_el);                      \
1085     *where++ = chr;                             \
1086     memcpy (where, f_el, l);                    \
1087     where += l;                                 \
1088   }                                             \
1089 } while (0)
1091   FROB (path, '/');
1092   FROB (params, ';');
1093   FROB (query, '?');
1095 #undef FROB
1096 }
/* Return the "full path" of URL as a freshly allocated,
   zero-terminated string.  For instance, with u->path "foo/bar" and
   u->query "param=value" the result is "/foo/bar?param=value".  */

char *
url_full_path (const struct url *url)
{
  const int n = full_path_length (url);
  char *result = xmalloc (n + 1);

  /* full_path_write does not terminate the string; do it here.  */
  full_path_write (url, result);
  result[n] = '\0';

  return result;
}
/* Unescape CHR in an otherwise escaped STR, in place.  Used to
   selectively undo the escaping of certain characters, such as "/"
   and ":".  Every "%XY" in STR whose hex digits XY (upper-case, as
   produced by XNUM_TO_DIGIT) encode CHR is replaced by CHR itself;
   all other bytes are left untouched.  Nothing is returned.  */

static void
unescape_single_char (char *str, char chr)
{
  /* Work on the unsigned value: with a signed char type, `chr >> 4'
     would be negative for bytes >= 0x80 and index outside the digit
     table.  */
  const unsigned char uc = (unsigned char) chr;
  const char c1 = XNUM_TO_DIGIT (uc >> 4);
  const char c2 = XNUM_TO_DIGIT (uc & 0xf);
  char *h = str;                /* hare */
  char *t = str;                /* tortoise */
  for (; *h; h++, t++)
    {
      /* The && chain stops at the terminating zero, so h[1] and h[2]
         are never read past the end of STR.  */
      if (h[0] == '%' && h[1] == c1 && h[2] == c2)
        {
          *t = chr;
          h += 2;
        }
      else
        *t = *h;
    }
  *t = '\0';
}
1138 /* Escape unsafe and reserved characters, except for the slash
1139    characters.  */
1141 static char *
url_escape_dir(const char * dir)1142 url_escape_dir (const char *dir)
1143 {
1144   char *newdir = url_escape_1 (dir, urlchr_unsafe | urlchr_reserved, 1);
1145   if (newdir == dir)
1146     return (char *)dir;
1148   unescape_single_char (newdir, '/');
1149   return newdir;
1150 }
1152 /* Sync u->path and u->url with u->dir and u->file.  Called after
1153    u->file or u->dir have been changed, typically by the FTP code.  */
1155 static void
sync_path(struct url * u)1156 sync_path (struct url *u)
1157 {
1158   char *newpath, *efile, *edir;
1160   xfree (u->path);
1162   /* u->dir and u->file are not escaped.  URL-escape them before
1163      reassembling them into u->path.  That way, if they contain
1164      separators like '?' or even if u->file contains slashes, the
1165      path will be correctly assembled.  (u->file can contain slashes
1166      if the URL specifies it with %2f, or if an FTP server returns
1167      it.)  */
1168   edir = url_escape_dir (u->dir);
1169   efile = url_escape_1 (u->file, urlchr_unsafe | urlchr_reserved, 1);
1171   if (!*edir)
1172     newpath = xstrdup (efile);
1173   else
1174     {
1175       int dirlen = strlen (edir);
1176       int filelen = strlen (efile);
1178       /* Copy "DIR/FILE" to newpath. */
1179       char *p = newpath = xmalloc (dirlen + 1 + filelen + 1);
1180       memcpy (p, edir, dirlen);
1181       p += dirlen;
1182       *p++ = '/';
1183       memcpy (p, efile, filelen);
1184       p += filelen;
1185       *p = '\0';
1186     }
1188   u->path = newpath;
1190   if (edir != u->dir)
1191     xfree (edir);
1192   if (efile != u->file)
1193     xfree (efile);
1195   /* Regenerate u->url as well.  */
1196   xfree (u->url);
1197   u->url = url_string (u, URL_AUTH_SHOW);
1198 }
1200 /* Mutators.  Code in ftp.c insists on changing u->dir and u->file.
1201    This way we can sync u->path and u->url when they get changed.  */
1203 void
url_set_dir(struct url * url,const char * newdir)1204 url_set_dir (struct url *url, const char *newdir)
1205 {
1206   xfree (url->dir);
1207   url->dir = xstrdup (newdir);
1208   sync_path (url);
1209 }
1211 void
url_set_file(struct url * url,const char * newfile)1212 url_set_file (struct url *url, const char *newfile)
1213 {
1214   xfree (url->file);
1215   url->file = xstrdup (newfile);
1216   sync_path (url);
1217 }
1219 void
url_free(struct url * url)1220 url_free (struct url *url)
1221 {
1222   if (url)
1223     {
1224       xfree (url->host);
1226       xfree (url->path);
1227       xfree (url->url);
1229       xfree (url->params);
1230       xfree (url->query);
1231       xfree (url->fragment);
1232       xfree (url->user);
1233       xfree (url->passwd);
1235       xfree (url->dir);
1236       xfree (url->file);
1238       xfree (url);
1239     }
1240 }
1242 /* Create all the necessary directories for PATH (a file).  Calls
1243    make_directory internally.  */
1244 int
mkalldirs(const char * path)1245 mkalldirs (const char *path)
1246 {
1247   const char *p;
1248   char *t;
1249   struct stat st;
1250   int res;
1252   p = strrchr(path, '/');
1253   p = p == NULL ? path : p;
1255   /* Don't create if it's just a file.  */
1256   if ((p == path) && (*p != '/'))
1257     return 0;
1258   t = strdupdelim (path, p);
1260   /* Check whether the directory exists.  */
1261   if ((stat (t, &st) == 0))
1262     {
1263       if (S_ISDIR (st.st_mode))
1264         {
1265           xfree (t);
1266           return 0;
1267         }
1268       else
1269         {
1270           /* If the dir exists as a file name, remove it first.  This
1271              is *only* for Wget to work with buggy old CERN http
1272              servers.  Here is the scenario: When Wget tries to
1273              retrieve a directory without a slash, e.g.
1274              http://foo/bar (bar being a directory), CERN server will
1275              not redirect it too http://foo/bar/ -- it will generate a
1276              directory listing containing links to bar/file1,
1277              bar/file2, etc.  Wget will lose because it saves this
1278              HTML listing to a file `bar', so it cannot create the
1279              directory.  To work around this, if the file of the same
1280              name exists, we just remove it and create the directory
1281              anyway.  */
1282           DEBUGP (("Removing %s because of directory danger!\n", t));
1283           if (unlink (t))
1284             logprintf (LOG_NOTQUIET, "Failed to unlink %s (%d): %s\n",
1285                        t, errno, strerror(errno));
1286         }
1287     }
1288   res = make_directory (t);
1289   if (res != 0)
1290     logprintf (LOG_NOTQUIET, "%s: %s\n", t, strerror (errno));
1291   xfree (t);
1292   return res;
1293 }
1295 /* Functions for constructing the file name out of URL components.  */
/* A growable string structure, used by url_file_name and friends.
   This should perhaps be moved to utils.c.

   The idea is to have a convenient and efficient way to construct a
   string by having various functions append data to it.  Instead of
   passing the obligatory BASEVAR, SIZEVAR and TAILPOS to all the
   functions in question, we pass the pointer to this struct.

   Functions that write to the members in this struct must make sure
   that base remains null terminated by calling append_null().
   */

struct growable {
  char *base;
  int size;   /* memory allocated */
  int tail;   /* string length */
};

/* Ensure that the string can accept APPEND_SIZE more characters past
   the current TAIL position.  The request is handed to DO_REALLOC
   with a needed size of TAIL+APPEND_SIZE, which updates BASE and
   SIZE as required.  G is evaluated exactly once; APPEND_SIZE is
   parenthesized so that any expression may be passed.  */
#define GROW(g, append_size) do {                                       \
  struct growable *G_ = (g);                                            \
  DO_REALLOC (G_->base, G_->size, G_->tail + (append_size), char);      \
} while (0)

/* Return the tail position of the string. */
#define TAIL(r) ((r)->base + (r)->tail)

/* Move the tail position by APPEND_COUNT characters. */
#define TAIL_INCR(r, append_count) ((r)->tail += (append_count))
1331 /* Append NULL to DEST. */
1332 static void
append_null(struct growable * dest)1333 append_null (struct growable *dest)
1334 {
1335   GROW (dest, 1);
1336   *TAIL (dest) = 0;
1337 }
1339 /* Append CH to DEST. */
1340 static void
append_char(char ch,struct growable * dest)1341 append_char (char ch, struct growable *dest)
1342 {
1343   if (ch)
1344     {
1345       GROW (dest, 1);
1346       *TAIL (dest) = ch;
1347       TAIL_INCR (dest, 1);
1348     }
1350   append_null (dest);
1351 }
1353 /* Append the string STR to DEST. */
1354 static void
append_string(const char * str,struct growable * dest)1355 append_string (const char *str, struct growable *dest)
1356 {
1357   int l = strlen (str);
1359   if (l)
1360     {
1361       GROW (dest, l);
1362       memcpy (TAIL (dest), str, l);
1363       TAIL_INCR (dest, l);
1364     }
1366   append_null (dest);
1367 }
enum {
  filechr_not_unix    = 1,      /* unusable on Unix, / and \0 */
  filechr_not_vms     = 2,      /* unusable on VMS (ODS5), 0x00-0x1F * ? */
  filechr_not_windows = 4,      /* unusable on Windows, one of \|/<>?:*" */
  filechr_control     = 8       /* a control character, e.g. 0-31 */
};

/* True if character C must be quoted in a file name: either
   non-ASCII characters are restricted and C is not ASCII, or
   filechr_table flags C with one of the bits in MASK.  */
#define FILE_CHAR_TEST(c, mask) \
    ((opt.restrict_files_nonascii && !c_isascii ((unsigned char)(c))) || \
    (filechr_table[(unsigned char)(c)] & (mask)))

/* Shorthands for the table: */
#define U filechr_not_unix
#define V filechr_not_vms
#define W filechr_not_windows
#define C filechr_control

#define UVWC U|V|W|C
#define UW U|W
#define VC V|C
#define VW V|W

/* Table of characters unsafe under various conditions (see above).
   The entry at index N describes the character with code N, so the
   table must list all 256 codes in order, starting with NUL.

   Arguably we could also claim `%' to be unsafe, since we use it as
   the escape character.  If we ever want to be able to reliably
   translate file name back to URL, this would become crucial.
   Right now, it's better to be minimal in escaping.  */

static const unsigned char filechr_table[256] =
{
UVWC, VC, VC, VC,  VC, VC, VC, VC,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
  VC, VC, VC, VC,  VC, VC, VC, VC,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
  VC, VC, VC, VC,  VC, VC, VC, VC,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
  VC, VC, VC, VC,  VC, VC, VC, VC,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
   0,  0,  W,  0,   0,  0,  0,  0,   /* SP  !   "   #    $   %   &   '   */
   0,  0, VW,  0,   0,  0,  0, UW,   /* (   )   *   +    ,   -   .   /   */
   0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
   0,  0,  W,  0,   W,  0,  W, VW,   /* 8   9   :   ;    <   =   >   ?   */
   0,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
   0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
   0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
   0,  0,  0,  0,   W,  0,  0,  0,   /* X   Y   Z   [    \   ]   ^   _   */
   0,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
   0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
   0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
   0,  0,  0,  0,   W,  0,  0,  C,   /* x   y   z   {    |   }   ~   DEL */

  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, /* 128-143 */
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, /* 144-159 */
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, /* 160-175 */
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, /* 176-191 */

  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, /* 192-207 */
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, /* 208-223 */
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, /* 224-239 */
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, /* 240-255 */
};
#undef U
#undef V
#undef W
#undef C
#undef UW
#undef UVWC
#undef VC
#undef VW
/* FN_PORT_SEP separates host and port in file names built for
   non-standard port numbers, as in "www.xemacs.org:4001/index.html".
   It is ':' except when Windows file name restrictions are in
   effect, where '+' is used because Windows can't handle ':' in
   file names.  */
#define FN_PORT_SEP  (opt.restrict_files_os == restrict_windows ? '+' : ':')

/* FN_QUERY_SEP separates the file name from the URL query.  It is
   '?' except under VMS or Windows file name restrictions, where '@'
   is used because those systems cannot handle '?' in a file name.
   FN_QUERY_SEP_STR is the same separator as a string.  */
#define FN_QUERY_SEP \
 (((opt.restrict_files_os == restrict_vms) || \
   (opt.restrict_files_os == restrict_windows)) ? '@' : '?')
#define FN_QUERY_SEP_STR \
 (((opt.restrict_files_os == restrict_vms) || \
   (opt.restrict_files_os == restrict_windows)) ? "@" : "?")
1453 /* Quote path element, characters in [b, e), as file name, and append
1454    the quoted string to DEST.  Each character is quoted as per
1455    file_unsafe_char and the corresponding table.
1457    If ESCAPED is true, the path element is considered to be
1458    URL-escaped and will be unescaped prior to inspection.  */
1460 static void
append_uri_pathel(const char * b,const char * e,bool escaped,struct growable * dest)1461 append_uri_pathel (const char *b, const char *e, bool escaped,
1462                    struct growable *dest)
1463 {
1464   const char *p;
1465   char buf[1024];
1466   char *unescaped = NULL;
1467   int quoted, outlen;
1468   int mask;
1469   int max_length;
1471   if (!dest)
1472     return;
1474   if (opt.restrict_files_os == restrict_unix)
1475     mask = filechr_not_unix;
1476   else if (opt.restrict_files_os == restrict_vms)
1477     mask = filechr_not_vms;
1478   else
1479     mask = filechr_not_windows;
1481   if (opt.restrict_files_ctrl)
1482     mask |= filechr_control;
1484   /* Copy [b, e) to PATHEL and URL-unescape it. */
1485   if (escaped)
1486     {
1487       size_t len = e - b;
1488 		if (len < sizeof (buf))
1489         unescaped = buf;
1490       else
1491         unescaped = xmalloc(len + 1);
1493 		memcpy(unescaped, b, len);
1494 		unescaped[len] = 0;
1496       url_unescape (unescaped);
1497       b = unescaped;
1498       e = unescaped + strlen (unescaped);
1499     }
1501   /* Defang ".." when found as component of path.  Remember that path
1502      comes from the URL and might contain malicious input.  */
1503   if (e - b == 2 && b[0] == '.' && b[1] == '.')
1504     {
1505       b = "%2E%2E";
1506       e = b + 6;
1507     }
1509   /* Walk the PATHEL string and check how many characters we'll need
1510      to quote.  */
1511   quoted = 0;
1512   for (p = b; p < e; p++)
1513     if (FILE_CHAR_TEST (*p, mask))
1514       ++quoted;
1516   /* Calculate the length of the output string.  e-b is the input
1517      string length.  Each quoted char introduces two additional
1518      characters in the string, hence 2*quoted.  */
1519   outlen = (e - b) + (2 * quoted);
1520 # ifdef WINDOWS
1521   max_length = MAX_PATH;
1522 # else
1523   max_length = get_max_length(dest->base, dest->tail, _PC_NAME_MAX);
1524 # endif
1525   max_length -= CHOMP_BUFFER;
1526   if (max_length > 0 && outlen > max_length)
1527     {
1528       logprintf (LOG_NOTQUIET, "The destination name is too long (%d), reducing to %d\n", outlen, max_length);
1530       outlen = max_length;
1531     }
1532   GROW (dest, outlen);
1534   // This should not happen, but it's impossible to argue with static analysis that it can't happen
1535   // (in theory it can). So give static analyzers a hint.
1536   if (!dest->base)
1537     return;
1539   if (!quoted)
1540     {
1541       /* If there's nothing to quote, we can simply append the string
1542          without processing it again.  */
1543       memcpy (TAIL (dest), b, outlen);
1544     }
1545   else
1546     {
1547       char *q = TAIL (dest);
1548       int i;
1550       for (i = 0, p = b; p < e; p++)
1551         {
1552           if (!FILE_CHAR_TEST (*p, mask))
1553 	    {
1554 	      if (i == outlen)
1555 	        break;
1556 	      *q++ = *p;
1557 	      i++;
1558 	    }
1559           else if (i + 3 > outlen)
1560 	    break;
1561 	  else
1562             {
1563               unsigned char ch = *p;
1564               *q++ = '%';
1565               *q++ = XNUM_TO_DIGIT (ch >> 4);
1566               *q++ = XNUM_TO_DIGIT (ch & 0xf);
1567 	      i += 3;
1568             }
1569         }
1570       assert (q - TAIL (dest) <= outlen);
1571     }
1573   /* Perform inline case transformation if required.  */
1574   if (opt.restrict_files_case == restrict_lowercase
1575       || opt.restrict_files_case == restrict_uppercase)
1576     {
1577       char *q;
1578       for (q = TAIL (dest); q < TAIL (dest) + outlen; ++q)
1579         {
1580           if (opt.restrict_files_case == restrict_lowercase)
1581             *q = c_tolower (*q);
1582           else
1583             *q = c_toupper (*q);
1584         }
1585     }
1587   TAIL_INCR (dest, outlen);
1588   append_null (dest);
1590   if (unescaped && unescaped != buf)
1591 	  free (unescaped);
1592 }
#ifdef HAVE_ICONV
/* Convert the file name FNAME from the remote encoding
   (opt.encoding_remote, "UTF-8" if unset) to the local one
   (opt.locale, nl_langinfo (CODESET) if unset) using iconv.

   On success FNAME is freed and a newly allocated, zero-terminated
   converted string is returned.  If the conversion is unsupported or
   fails, FNAME itself is returned unchanged.  Either way the caller
   ends up owning exactly one string: the returned one.  */
static char *
convert_fname (char *fname)
{
  char *converted_fname;
  const char *from_encoding = opt.encoding_remote;
  const char *to_encoding = opt.locale;
  iconv_t cd;
  size_t len, inlen, outlen;
  char *s;
  const char *orig_fname;

  /* Defaults for remote and local encodings.  */
  if (!from_encoding)
    from_encoding = "UTF-8";
  if (!to_encoding)
    to_encoding = nl_langinfo (CODESET);

  cd = iconv_open (to_encoding, from_encoding);
  if (cd == (iconv_t) (-1))
    {
      logprintf (LOG_VERBOSE, _ ("Conversion from %s to %s isn't supported\n"),
                 quote_n (0, from_encoding), quote_n (1, to_encoding));
      return fname;
    }

  /* Invariant kept below: CONVERTED_FNAME holds LEN + 1 bytes, S
     points just past the output written so far, and OUTLEN is the
     number of bytes left between S and CONVERTED_FNAME + LEN.  */
  orig_fname = fname;
  inlen = strlen (fname);
  len = outlen = inlen * 2;
  converted_fname = s = xmalloc (outlen + 1);

  for (;;)
    {
      errno = 0;
      if (iconv (cd, (ICONV_CONST char **) &fname, &inlen, &s, &outlen) == 0
          && iconv (cd, NULL, NULL, &s, &outlen) == 0)
        {
          /* iconv has advanced S past the last byte it wrote; the
             extra byte of the allocation takes the terminator.  */
          *s = '\0';
          iconv_close (cd);
          DEBUGP (("Converted file name '%s' (%s) -> '%s' (%s)\n",
                   orig_fname, from_encoding, converted_fname, to_encoding));
          xfree (orig_fname);
          return converted_fname;
        }

      /* Incomplete or invalid multibyte sequence */
      if (errno == EINVAL || errno == EILSEQ || errno == 0)
        {
          if (errno)
            logprintf (LOG_VERBOSE,
                       _ ("Incomplete or invalid multibyte sequence encountered\n"));
          else
            logprintf (LOG_VERBOSE,
                       _ ("Unconvertable multibyte sequence encountered\n"));
          xfree (converted_fname);
          converted_fname = (char *) orig_fname;
          break;
        }
      else if (errno == E2BIG) /* Output buffer full */
        {
          /* Grow the buffer while preserving what has been written.
             iconv may report E2BIG with some room left over, so the
             amount used must be taken from S, not from the old
             buffer size.  A minimum increment guarantees progress
             even when no input remains (INLEN == 0) and only the
             final flush needs space.  */
          size_t used = s - converted_fname;
          size_t extra = inlen * 2;

          if (extra < 16)
            extra = 16;

          len += extra;
          converted_fname = xrealloc (converted_fname, len + 1);
          s = converted_fname + used;
          outlen = len - used;
        }
      else /* Weird, we got an unspecified error */
        {
          logprintf (LOG_VERBOSE, _ ("Unhandled errno %d\n"), errno);
          xfree (converted_fname);
          converted_fname = (char *) orig_fname;
          break;
        }
    }
  DEBUGP (("Failed to convert file name '%s' (%s) -> '?' (%s)\n",
           orig_fname, from_encoding, to_encoding));

  iconv_close (cd);

  return converted_fname;
}
#else
/* Without iconv no conversion is possible; FNAME is returned as
   is.  */
static char *
convert_fname (char *fname)
{
  return fname;
}
#endif
1683 /* Append to DEST the directory structure that corresponds the
1684    directory part of URL's path.  For example, if the URL is
1685    http://server/dir1/dir2/file, this appends "/dir1/dir2".
1687    Each path element ("dir1" and "dir2" in the above example) is
1688    examined, url-unescaped, and re-escaped as file name element.
1690    Additionally, it cuts as many directories from the path as
1691    specified by opt.cut_dirs.  For example, if opt.cut_dirs is 1, it
1692    will produce "bar" for the above example.  For 2 or more, it will
1693    produce "".
1695    Each component of the path is quoted for use as file name.  */
1697 static void
append_dir_structure(const struct url * u,struct growable * dest)1698 append_dir_structure (const struct url *u, struct growable *dest)
1699 {
1700   char *pathel, *next;
1701   int cut = opt.cut_dirs;
1703   /* Go through the path components, de-URL-quote them, and quote them
1704      (if necessary) as file names.  */
1706   pathel = u->path;
1707   for (; (next = strchr (pathel, '/')) != NULL; pathel = next + 1)
1708     {
1709       if (cut-- > 0)
1710         continue;
1711       if (pathel == next)
1712         /* Ignore empty pathels.  */
1713         continue;
1715       if (dest->tail)
1716         append_char ('/', dest);
1718       append_uri_pathel (pathel, next, true, dest);
1719     }
1720 }
1722 /* Return a unique file name that matches the given URL as well as
1723    possible.  Does not create directories on the file system.  */
1725 char *
url_file_name(const struct url * u,char * replaced_filename)1726 url_file_name (const struct url *u, char *replaced_filename)
1727 {
1728   struct growable fnres;        /* stands for "file name result" */
1729   struct growable temp_fnres;
1731   const char *u_file;
1732   char *fname, *unique, *fname_len_check;
1733   const char *index_filename = "index.html"; /* The default index file is index.html */
1735   fnres.base = NULL;
1736   fnres.size = 0;
1737   fnres.tail = 0;
1739   temp_fnres.base = NULL;
1740   temp_fnres.size = 0;
1741   temp_fnres.tail = 0;
1743   /* If an alternative index file was defined, change index_filename */
1744   if (opt.default_page)
1745     index_filename = opt.default_page;
1748   /* Start with the directory prefix, if specified. */
1749   if (opt.dir_prefix)
1750     append_string (opt.dir_prefix, &fnres);
1752   /* If "dirstruct" is turned on (typically the case with -r), add
1753      the host and port (unless those have been turned off) and
1754      directory structure.  */
1755   /* All safe remote chars are unescaped and stored in temp_fnres,
1756      then converted to local and appended to fnres.
1757      Internationalized URL/IDN will produce punycode to lookup IP from DNS:
1758      https://en.wikipedia.org/wiki/URL
1759      https://en.wikipedia.org/wiki/Internationalized_domain_name
1760      Non-ASCII code chars in the path:
1761      https://en.wikipedia.org/wiki/List_of_Unicode_characters
1762      https://en.wikipedia.org/wiki/List_of_writing_systems */
1763   if (opt.dirstruct)
1764     {
1765       if (opt.protocol_directories)
1766         {
1767           if (temp_fnres.tail)
1768             append_char ('/', &temp_fnres);
1769           append_string (supported_schemes[u->scheme].name, &temp_fnres);
1770         }
1771       if (opt.add_hostdir)
1772         {
1773           if (temp_fnres.tail)
1774             append_char ('/', &temp_fnres);
1775           if (0 != strcmp (u->host, ".."))
1776             append_string (u->host, &temp_fnres);
1777           else
1778             /* Host name can come from the network; malicious DNS may
1779                allow ".." to be resolved, causing us to write to
1780                "../<file>".  Defang such host names.  */
1781             append_string ("%2E%2E", &temp_fnres);
1782           if (u->port != scheme_default_port (u->scheme))
1783             {
1784               char portstr[24];
1785               number_to_string (portstr, u->port);
1786               append_char (FN_PORT_SEP, &temp_fnres);
1787               append_string (portstr, &temp_fnres);
1788             }
1789         }
1791       append_dir_structure (u, &temp_fnres);
1792     }
1794   if (!replaced_filename)
1795     {
1796       /* Create the filename. */
1797       u_file = *u->file ? u->file : index_filename;
1799       /* Append "?query" to the file name, even if empty,
1800        * and create fname_len_check. */
1801       if (u->query)
1802         fname_len_check = concat_strings (u_file, FN_QUERY_SEP_STR, u->query, NULL);
1803       else
1804         fname_len_check = strdupdelim (u_file, u_file + strlen (u_file));
1805     }
1806   else
1807     {
1808       u_file = replaced_filename;
1809       fname_len_check = strdupdelim (u_file, u_file + strlen (u_file));
1810     }
1812   if (temp_fnres.tail)
1813     append_char ('/', &temp_fnres);
1815   append_uri_pathel (fname_len_check,
1816     fname_len_check + strlen (fname_len_check), true, &temp_fnres);
1818   /* Zero-terminate the temporary file name. */
1819   append_char ('\0', &temp_fnres);
1821   /* convert all remote chars before length check and appending to local path */
1822   fname = convert_fname (temp_fnres.base);
1823   temp_fnres.base = NULL;
1824   temp_fnres.size = 0;
1825   temp_fnres.tail = 0;
1826   append_string (fname, &temp_fnres);
1828   xfree (fname);
1829   xfree (fname_len_check);
1831   /* The filename has already been 'cleaned' by append_uri_pathel() above.  So,
1832    * just append it. */
1833   if (fnres.tail)
1834     append_char ('/', &fnres);
1835   append_string (temp_fnres.base, &fnres);
1837   fname = fnres.base;
1839   /* Make a final check that the path length is acceptable? */
1840   /* TODO: check fnres.base for path length problem */
1842   xfree (temp_fnres.base);
1844   /* Check the cases in which the unique extensions are not used:
1845      1) Clobbering is turned off (-nc).
1846      2) Retrieval with regetting.
1847      3) Timestamping is used.
1848      4) Hierarchy is built.
1849      5) Backups are specified.
1851      The exception is the case when file does exist and is a
1852      directory (see `mkalldirs' for explanation).  */
1854   if (ALLOW_CLOBBER
1855       && !(file_exists_p (fname, NULL) && !file_non_directory_p (fname)))
1856     {
1857       unique = fname;
1858     }
1859   else
1860     {
1861       unique = unique_name_passthrough (fname);
1862       if (unique != fname)
1863         xfree (fname);
1864     }
1866 /* On VMS, alter the name as required. */
1867 #ifdef __VMS
1868   {
1869     char *unique2;
1871     unique2 = ods_conform( unique);
1872     if (unique2 != unique)
1873       {
1874         xfree (unique);
1875         unique = unique2;
1876       }
1877   }
1878 #endif /* def __VMS */
1880   return unique;
1881 }
1883 /* Resolve "." and ".." elements of PATH by destructively modifying
1884    PATH and return true if PATH has been modified, false otherwise.
1886    The algorithm is in spirit similar to the one described in rfc1808,
1887    although implemented differently, in one pass.  To recap, path
1888    elements containing only "." are removed, and ".." is taken to mean
1889    "back up one element".  Single leading and trailing slashes are
1890    preserved.
1892    For example, "a/b/c/./../d/.." will yield "a/b/".  More exhaustive
1893    test examples are provided below.  If you change anything in this
1894    function, run test_path_simplify to make sure you haven't broken a
1895    test case.  */
1897 static bool
path_simplify(enum url_scheme scheme,char * path)1898 path_simplify (enum url_scheme scheme, char *path)
1899 {
1900   char *h = path;               /* hare */
1901   char *t = path;               /* tortoise */
1902   char *beg = path;
1903   char *end = strchr (path, '\0');
1905   while (h < end)
1906     {
1907       /* Hare should be at the beginning of a path element. */
1909       if (h[0] == '.' && (h[1] == '/' || h[1] == '\0'))
1910         {
1911           /* Ignore "./". */
1912           h += 2;
1913         }
1914       else if (h[0] == '.' && h[1] == '.' && (h[2] == '/' || h[2] == '\0'))
1915         {
1916           /* Handle "../" by retreating the tortoise by one path
1917              element -- but not past beginning.  */
1918           if (t > beg)
1919             {
1920               /* Move backwards until T hits the beginning of the
1921                  previous path element or the beginning of path. */
1922               for (--t; t > beg && t[-1] != '/'; t--)
1923                 ;
1924             }
1925           else if (scheme == SCHEME_FTP
1926 #ifdef HAVE_SSL
1927               || scheme == SCHEME_FTPS
1928 #endif
1929               )
1930             {
1931               /* If we're at the beginning, copy the "../" literally
1932                  and move the beginning so a later ".." doesn't remove
1933                  it.  This violates RFC 3986; but we do it for FTP
1934                  anyway because there is otherwise no way to get at a
1935                  parent directory, when the FTP server drops us in a
1936                  non-root directory (which is not uncommon). */
1937               beg = t + 3;
1938               goto regular;
1939             }
1940           h += 3;
1941         }
1942       else
1943         {
1944         regular:
1945           /* A regular path element.  If H hasn't advanced past T,
1946              simply skip to the next path element.  Otherwise, copy
1947              the path element until the next slash.  */
1948           if (t == h)
1949             {
1950               /* Skip the path element, including the slash.  */
1951               while (h < end && *h != '/')
1952                 t++, h++;
1953               if (h < end)
1954                 t++, h++;
1955             }
1956           else
1957             {
1958               /* Copy the path element, including the final slash.  */
1959               while (h < end && *h != '/')
1960                 *t++ = *h++;
1961               if (h < end)
1962                 *t++ = *h++;
1963             }
1964         }
1965     }
1967   if (t != h)
1968     *t = '\0';
1970   return t != h;
1971 }
1973 /* Return the length of URL's path.  Path is considered to be
1974    terminated by one or more of the ?query or ;params or #fragment,
1975    depending on the scheme.  */
1977 static const char *
path_end(const char * url)1978 path_end (const char *url)
1979 {
1980   enum url_scheme scheme = url_scheme (url);
1981   const char *seps;
1982   if (scheme == SCHEME_INVALID)
1983     scheme = SCHEME_HTTP;       /* use http semantics for rel links */
1984   /* +2 to ignore the first two separators ':' and '/' */
1985   seps = init_seps (scheme) + 2;
1986   return strpbrk_or_eos (url, seps);
1987 }
/* Find the last occurrence of character C in the range [b, e), or
   NULL, if none are present.  */
#define find_last_char(b, e, c) memrchr ((b), (c), (e) - (b))

/* Return a newly allocated, null-terminated string consisting of the
   first PREFIX_LEN bytes of BASE, then a '/' if ADD_SLASH is true,
   then the LINK_LEN bytes of LINK.  */

static char *
uri_merge_concat (const char *base, size_t prefix_len, bool add_slash,
                  const char *link, size_t link_len)
{
  char *merge = xmalloc (prefix_len + (add_slash ? 1 : 0) + link_len + 1);
  char *p = merge;

  if (prefix_len)
    {
      memcpy (p, base, prefix_len);
      p += prefix_len;
    }
  if (add_slash)
    *p++ = '/';
  memcpy (p, link, link_len);
  p[link_len] = '\0';

  return merge;
}

/* Merge BASE with LINK and return the resulting URI as a newly
   allocated string.

   Either of the URIs may be absolute or relative, complete with the
   host name, or path only.  This tries to reasonably handle all
   foreseeable cases.  It only employs minimal URL parsing, without
   knowledge of the specifics of schemes.

   Every case below reduces to the same operation: keep BASE up to
   some insertion point and append LINK there.  Only the choice of
   insertion point differs.

   path_simplify is deliberately not called on the result: it would
   complicate the code, and url_parse has to simplify the path
   anyway.  */

char *
uri_merge (const char *base, const char *link)
{
  size_t linklength;
  const char *end;
  const char *start_insert;
  bool need_explicit_slash = false;

  if (url_has_scheme (link))
    return xstrdup (link);

  /* Empty LINK points back to BASE, query string and all. */
  if (!*link)
    return xstrdup (base);

  /* We may not examine BASE past END. */
  end = path_end (base);
  linklength = strlen (link);

  if (*link == '?')
    {
      /* LINK points to the same location, but changes the query
         string.  Examples: */
      /* uri_merge("path",         "?new") -> "path?new"     */
      /* uri_merge("path?foo",     "?new") -> "path?new"     */
      /* uri_merge("path?foo#bar", "?new") -> "path?new"     */
      /* uri_merge("path#foo",     "?new") -> "path?new"     */
      start_insert = end;
    }
  else if (*link == '#')
    {
      /* LINK replaces the fragment only, so everything before the
         first '#' of BASE (query included) is kept.  */
      /* uri_merge("path",         "#new") -> "path#new"     */
      /* uri_merge("path#foo",     "#new") -> "path#new"     */
      /* uri_merge("path?foo",     "#new") -> "path?foo#new" */
      /* uri_merge("path?foo#bar", "#new") -> "path?foo#new" */
      start_insert = strchr (base, '#');
      if (!start_insert)
        start_insert = base + strlen (base);
    }
  else if (link[0] == '/' && link[1] == '/')
    {
      /* LINK begins with "//" and so is a net path: we need to
         replace everything after (and including) the double slash
         with LINK. */

      /* uri_merge("foo", "//new/bar")            -> "//new/bar"      */
      /* uri_merge("//old/foo", "//new/bar")      -> "//new/bar"      */
      /* uri_merge("http://old/foo", "//new/bar") -> "http://new/bar" */

      /* If the first slash of BASE starts a double slash, replace
         from there; otherwise replace from the beginning.  */
      const char *slash = memchr (base, '/', end - base);

      if (slash && slash[1] == '/')
        start_insert = slash;
      else
        start_insert = base;
    }
  else if (*link == '/')
    {
      /* LINK is an absolute path: we need to replace everything
         after (and including) the FIRST slash with LINK.

         So, if BASE is "http://host/whatever/foo/bar", and LINK is
         "/qux/xyzzy", our result should be
         "http://host/qux/xyzzy".  */

      /* We're looking for the first slash, but want to ignore a
         leading double slash: when the first slash found starts "//",
         search once more from just past it.  */
      const char *slash = memchr (base, '/', end - base);
      bool seen_slash_slash = false;

      if (slash && slash[1] == '/')
        {
          const char *after = slash + 2;

          seen_slash_slash = true;
          slash = memchr (after, '/', end - after);
        }

      /* Keep in mind that LINK itself begins with '/'.  */
      if (!seen_slash_slash)
        /* examples: "foo", "foo/bar" -- replace from the start */
        start_insert = base;
      else if (slash)
        /* example: "http://something/" -- replace from that slash */
        start_insert = slash;
      else
        /* example: "http://foo" -- append after the host */
        start_insert = end;
    }
  else
    {
      /* LINK is a relative URL: we need to replace everything
         after last slash (possibly empty) with LINK.

         So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy",
         our result should be "whatever/foo/qux/xyzzy".  */
      const char *last_slash = find_last_char (base, end, '/');

      if (!last_slash)
        {
          /* No slash found at all.  Replace what we have with LINK. */
          start_insert = base;
        }
      else if (last_slash >= base + 2
               && last_slash[-2] == ':' && last_slash[-1] == '/')
        {
          /* example: "http://host" -- the only slashes are those of
             "://", so LINK goes after the host with a '/' inserted in
             between.  */
          start_insert = end;
          need_explicit_slash = true;
        }
      else
        {
          /* example: "whatever/foo/bar" -- LINK replaces "bar" */
          start_insert = last_slash + 1;
        }
    }

  return uri_merge_concat (base, start_insert - base, need_explicit_slash,
                           link, linklength);
}
2184 #define APPEND(p, s) do {                       \
2185   int len = strlen (s);                         \
2186   memcpy (p, s, len);                           \
2187   p += len;                                     \
2188 } while (0)
2190 /* Use this instead of password when the actual password is supposed
2191    to be hidden.  We intentionally use a generic string without giving
2192    away the number of characters in the password, like previous
2193    versions did.  */
2194 #define HIDDEN_PASSWORD "*password*"
2196 /* Recreate the URL string from the data in URL.
2198    If HIDE is true (as it is when we're calling this on a URL we plan
2199    to print, but not when calling it to canonicalize a URL for use
2200    within the program), password will be hidden.  Unsafe characters in
2201    the URL will be quoted.  */
2203 char *
url_string(const struct url * url,enum url_auth_mode auth_mode)2204 url_string (const struct url *url, enum url_auth_mode auth_mode)
2205 {
2206   int size;
2207   char *result, *p;
2208   char *quoted_host, *quoted_user = NULL, *quoted_passwd = NULL;
2210   int scheme_port = supported_schemes[url->scheme].default_port;
2211   const char *scheme_str = supported_schemes[url->scheme].leading_string;
2212   int fplen = full_path_length (url);
2214   bool brackets_around_host;
2216   assert (scheme_str != NULL);
2218   /* Make sure the user name and password are quoted. */
2219   if (url->user)
2220     {
2221       if (auth_mode != URL_AUTH_HIDE)
2222         {
2223           quoted_user = url_escape_allow_passthrough (url->user);
2224           if (url->passwd)
2225             {
2226               if (auth_mode == URL_AUTH_HIDE_PASSWD)
2227                 quoted_passwd = (char *) HIDDEN_PASSWORD;
2228               else
2229                 quoted_passwd = url_escape_allow_passthrough (url->passwd);
2230             }
2231         }
2232     }
2234   /* In the unlikely event that the host name contains non-printable
2235      characters, quote it for displaying to the user.  */
2236   quoted_host = url_escape_allow_passthrough (url->host);
2238   /* Undo the quoting of colons that URL escaping performs.  IPv6
2239      addresses may legally contain colons, and in that case must be
2240      placed in square brackets.  */
2241   if (quoted_host != url->host)
2242     unescape_single_char (quoted_host, ':');
2243   brackets_around_host = strchr (quoted_host, ':') != NULL;
2245   size = (strlen (scheme_str)
2246           + strlen (quoted_host)
2247           + (brackets_around_host ? 2 : 0)
2248           + fplen
2249           + 1);
2250   if (url->port != scheme_port)
2251     size += 1 + numdigit (url->port);
2252   if (quoted_user)
2253     {
2254       size += 1 + strlen (quoted_user);
2255       if (quoted_passwd)
2256         size += 1 + strlen (quoted_passwd);
2257     }
2259   p = result = xmalloc (size);
2261   APPEND (p, scheme_str);
2262   if (quoted_user)
2263     {
2264       APPEND (p, quoted_user);
2265       if (quoted_passwd)
2266         {
2267           *p++ = ':';
2268           APPEND (p, quoted_passwd);
2269         }
2270       *p++ = '@';
2271     }
2273   if (brackets_around_host)
2274     *p++ = '[';
2275   APPEND (p, quoted_host);
2276   if (brackets_around_host)
2277     *p++ = ']';
2278   if (url->port != scheme_port)
2279     {
2280       *p++ = ':';
2281       p = number_to_string (p, url->port);
2282     }
2284   full_path_write (url, p);
2285   p += fplen;
2286   *p++ = '\0';
2288   assert (p - result == size);
2290   if (quoted_user && quoted_user != url->user)
2291     xfree (quoted_user);
2292   if (quoted_passwd && auth_mode == URL_AUTH_SHOW
2293       && quoted_passwd != url->passwd)
2294     xfree (quoted_passwd);
2295   if (quoted_host != url->host)
2296     xfree (quoted_host);
2298   return result;
2299 }
2301 /* Return true if scheme a is similar to scheme b.
2303    Schemes are similar if they are equal.  If SSL is supported, schemes
2304    are also similar if one is http (SCHEME_HTTP) and the other is https
2305    (SCHEME_HTTPS).  */
2306 bool
schemes_are_similar_p(enum url_scheme a,enum url_scheme b)2307 schemes_are_similar_p (enum url_scheme a, enum url_scheme b)
2308 {
2309   if (a == b)
2310     return true;
2311 #ifdef HAVE_SSL
2312   if ((a == SCHEME_HTTP && b == SCHEME_HTTPS)
2313       || (a == SCHEME_HTTPS && b == SCHEME_HTTP))
2314     return true;
2315 #endif
2316   return false;
2317 }
/* Read one character from the possibly %-escaped string STR and store
   it in *C.  STR must be non-NULL and non-empty.

   Return the number of bytes of STR the character occupies:
     3  when STR begins with '%' followed by two hex digits that
        decode to a character for which URL_RESERVED_CHAR is false;
        *C is the decoded character;
     1  in every other case.  *C is then STR[0] itself, which is a
        literal '%' when the escape is malformed or decodes to a
        reserved character, so the digits that follow are read as
        ordinary characters by later calls.

   An earlier version had a "return 0" error path for a null byte in
   the second digit position.  It could never be taken, because that
   position had already been checked to hold a hex digit.  */

static int
getchar_from_escaped_string (const char *str, char *c)
{
  assert (str && *str);
  assert (c);

  if (str[0] != '%')
    {
      *c = str[0];
      return 1;
    }

  /* The && stops at the first non-digit, so a string that ends right
     after '%' or after one digit is never read past its terminating
     null.  */
  if (c_isxdigit (str[1]) && c_isxdigit (str[2]))
    {
      char decoded = X2DIGITS_TO_NUM (str[1], str[2]);

      if (!URL_RESERVED_CHAR (decoded))
        {
          *c = decoded;
          return 3;
        }
    }

  /* Malformed escape, or an escaped reserved character: leave it
     undecoded and hand back the '%' alone.  */
  *c = '%';
  return 1;
}
/* Return true if the URL strings U1 and U2 are equal.  The strings
   are walked in parallel, one character at a time as delivered by
   getchar_from_escaped_string, and the characters are compared after
   c_tolower.  Both strings must run out at the same time.  */
bool
are_urls_equal (const char *u1, const char *u2)
{
  const char *p = u1;
  const char *q = u2;

  assert (u1 && u2);

  while (*p && *q)
    {
      char ch1, ch2;
      int pp, qq;

      pp = getchar_from_escaped_string (p, &ch1);
      if (!pp)
        break;

      qq = getchar_from_escaped_string (q, &ch2);
      if (!qq)
        break;

      if (c_tolower (ch1) != c_tolower (ch2))
        break;

      p += pp;
      q += qq;
    }

  /* Equal only if neither string has anything left.  */
  return *p == 0 && *q == 0;
}
2380 #ifdef TESTING
2381 /* Debugging and testing support for path_simplify. */
2383 #if 0
2384 /* Debug: run path_simplify on PATH and return the result in a new
2385    string.  Useful for calling from the debugger.  */
2386 static char *
2387 ps (char *path)
2388 {
2389   char *copy = xstrdup (path);
2390   path_simplify (copy);
2391   return copy;
2392 }
2393 #endif
2395 static const char *
run_test(const char * test,const char * expected_result,enum url_scheme scheme,bool expected_change)2396 run_test (const char *test, const char *expected_result, enum url_scheme scheme,
2397           bool expected_change)
2398 {
2399   char *test_copy = xstrdup (test);
2400   bool modified = path_simplify (scheme, test_copy);
2402   if (0 != strcmp (test_copy, expected_result))
2403     {
2404       printf ("Failed path_simplify(\"%s\"): expected \"%s\", got \"%s\".\n",
2405               test, expected_result, test_copy);
2406       mu_assert ("", 0);
2407     }
2408   if (modified != expected_change)
2409     {
2410       if (expected_change)
2411         printf ("Expected modification with path_simplify(\"%s\").\n",
2412                 test);
2413       else
2414         printf ("Expected no modification with path_simplify(\"%s\").\n",
2415                 test);
2416     }
2417   xfree (test_copy);
2418   mu_assert ("", modified == expected_change);
2419   return NULL;
2420 }
2422 const char *
test_path_simplify(void)2423 test_path_simplify (void)
2424 {
2425   static const struct {
2426     const char *test, *result;
2427     enum url_scheme scheme;
2428     bool should_modify;
2429   } tests[] = {
2430     { "",                       "",             SCHEME_HTTP, false },
2431     { ".",                      "",             SCHEME_HTTP, true },
2432     { "./",                     "",             SCHEME_HTTP, true },
2433     { "..",                     "",             SCHEME_HTTP, true },
2434     { "../",                    "",             SCHEME_HTTP, true },
2435     { "..",                     "..",           SCHEME_FTP,  false },
2436     { "../",                    "../",          SCHEME_FTP,  false },
2437     { "foo",                    "foo",          SCHEME_HTTP, false },
2438     { "foo/bar",                "foo/bar",      SCHEME_HTTP, false },
2439     { "foo///bar",              "foo///bar",    SCHEME_HTTP, false },
2440     { "foo/.",                  "foo/",         SCHEME_HTTP, true },
2441     { "foo/./",                 "foo/",         SCHEME_HTTP, true },
2442     { "foo./",                  "foo./",        SCHEME_HTTP, false },
2443     { "foo/../bar",             "bar",          SCHEME_HTTP, true },
2444     { "foo/../bar/",            "bar/",         SCHEME_HTTP, true },
2445     { "foo/bar/..",             "foo/",         SCHEME_HTTP, true },
2446     { "foo/bar/../x",           "foo/x",        SCHEME_HTTP, true },
2447     { "foo/bar/../x/",          "foo/x/",       SCHEME_HTTP, true },
2448     { "foo/..",                 "",             SCHEME_HTTP, true },
2449     { "foo/../..",              "",             SCHEME_HTTP, true },
2450     { "foo/../../..",           "",             SCHEME_HTTP, true },
2451     { "foo/../../bar/../../baz", "baz",         SCHEME_HTTP, true },
2452     { "foo/../..",              "..",           SCHEME_FTP,  true },
2453     { "foo/../../..",           "../..",        SCHEME_FTP,  true },
2454     { "foo/../../bar/../../baz", "../../baz",   SCHEME_FTP,  true },
2455     { "a/b/../../c",            "c",            SCHEME_HTTP, true },
2456     { "./a/../b",               "b",            SCHEME_HTTP, true }
2457   };
2458   unsigned i;
2460   for (i = 0; i < countof (tests); i++)
2461     {
2462       const char *message;
2463       const char *test = tests[i].test;
2464       const char *expected_result = tests[i].result;
2465       enum url_scheme scheme = tests[i].scheme;
2466       bool  expected_change = tests[i].should_modify;
2468       message = run_test (test, expected_result, scheme, expected_change);
2469       if (message) return message;
2470     }
2471   return NULL;
2472 }
2474 const char *
test_append_uri_pathel(void)2475 test_append_uri_pathel(void)
2476 {
2477   unsigned i;
2478   static const struct {
2479     const char *original_url;
2480     const char *input;
2481     bool escaped;
2482     const char *expected_result;
2483   } test_array[] = {
2484     { "http://www.yoyodyne.com/path/", "somepage.html", false, "http://www.yoyodyne.com/path/somepage.html" },
2485   };
2487   for (i = 0; i < countof(test_array); ++i)
2488     {
2489       struct growable dest;
2490       const char *p = test_array[i].input;
2492       memset (&dest, 0, sizeof (dest));
2494       append_string (test_array[i].original_url, &dest);
2495       append_uri_pathel (p, p + strlen(p), test_array[i].escaped, &dest);
2497       mu_assert ("test_append_uri_pathel: wrong result",
2498                  strcmp (dest.base, test_array[i].expected_result) == 0);
2499       xfree (dest.base);
2500     }
2502   return NULL;
2503 }
/* Table-driven test of are_urls_equal.  Each entry names two URL
   strings and the result are_urls_equal is expected to give for
   them.  Return NULL if every entry passes.  */
const char *
test_are_urls_equal(void)
{
  static const struct {
    const char *url1;
    const char *url2;
    bool expected_result;
  } test_array[] = {
    { "http://www.adomain.com/apath/", "http://www.adomain.com/apath/",       true },
    { "http://www.adomain.com/apath/", "http://www.adomain.com/anotherpath/", false },
    { "http://www.adomain.com/apath/", "http://www.anotherdomain.com/path/",  false },
    { "http://www.adomain.com/~path/", "http://www.adomain.com/%7epath/",     true },
    { "http://www.adomain.com/longer-path/", "http://www.adomain.com/path/",  false },
    { "http://www.adomain.com/path%2f", "http://www.adomain.com/path/",       false },
  };
  unsigned i;

  for (i = 0; i < countof(test_array); ++i)
    {
      bool actual = are_urls_equal (test_array[i].url1, test_array[i].url2);

      mu_assert ("test_are_urls_equal: wrong result",
                 actual == test_array[i].expected_result);
    }

  return NULL;
}
2531 #endif /* TESTING */
2533 /*
2534  * vim: et ts=2 sw=2
2535  */