1 /* URL handling.
2    Copyright (C) 1996-2011, 2015, 2018-2021 Free Software Foundation,
3    Inc.
4 
5 This file is part of GNU Wget.
6 
7 GNU Wget is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3 of the License, or (at
10 your option) any later version.
11 
12 GNU Wget is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 GNU General Public License for more details.
16 
17 You should have received a copy of the GNU General Public License
18 along with Wget.  If not, see <http://www.gnu.org/licenses/>.
19 
20 Additional permission under GNU GPL version 3 section 7
21 
22 If you modify this program, or any covered work, by linking or
23 combining it with the OpenSSL project's OpenSSL library (or a
24 modified version of that library), containing parts covered by the
25 terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
26 grants you additional permission to convey the resulting work.
27 Corresponding Source for a non-source form of such a combination
28 shall include the source code for the parts of OpenSSL used as well
29 as that of the covered work.  */
30 
31 #include "wget.h"
32 
33 #include <stdio.h>
34 #include <stdlib.h>
35 #include <string.h>
36 #include <unistd.h>
37 #include <errno.h>
38 #include <assert.h>
39 
40 #include "utils.h"
41 #include "url.h"
42 #include "host.h"  /* for is_valid_ipv6_address */
43 #include "c-strcase.h"
44 
45 #ifdef HAVE_ICONV
46 # include <iconv.h>
47 #endif
48 #include <langinfo.h>
49 
50 #ifdef __VMS
51 #include "vms.h"
52 #endif /* def __VMS */
53 
54 #ifdef TESTING
55 #include "../tests/unit-tests.h"
56 #endif
57 
/* Per-scheme flag bits stored in `struct scheme_data.flags'.  Every
   value is a distinct power of two, so the flags are combined with
   `|' in supported_schemes and tested individually with `&'.  */
enum {
  scm_disabled = 1,             /* for https when OpenSSL fails to init. */
  scm_has_params = 2,           /* whether scheme has ;params */
  scm_has_query = 4,            /* whether scheme has ?query */
  scm_has_fragment = 8          /* whether scheme has #fragment */
};
64 
/* Description of one URL scheme: its name, the prefix by which it is
   recognized at the start of a URL, its default port, and a set of
   scm_* flag bits.  */
struct scheme_data
{
  /* Short name of the scheme, such as "http" or "ftp". */
  const char *name;
  /* Leading string that identifies the scheme, such as "https://". */
  const char *leading_string;
  /* Default port of the scheme when none is specified. */
  int default_port;
  /* Various flags (bitwise OR of the scm_* values). */
  int flags;
};
76 
/* Supported schemes.

   url_scheme() returns an index into this table cast to `enum
   url_scheme', and the scheme_* accessors index the table with such a
   value.  The entry order is therefore presumably required to match
   the enumerator order declared in url.h -- verify before reordering.

   The table is not const because scheme_disable() sets scm_disabled
   in an entry's flags at run time.  */
static struct scheme_data supported_schemes[] =
{
  { "http",     "http://",  DEFAULT_HTTP_PORT,  scm_has_query|scm_has_fragment },
#ifdef HAVE_SSL
  { "https",    "https://", DEFAULT_HTTPS_PORT, scm_has_query|scm_has_fragment },
#endif
  { "ftp",      "ftp://",   DEFAULT_FTP_PORT,   scm_has_params|scm_has_fragment },
#ifdef HAVE_SSL
  /*
   * Explicit FTPS uses the same port as FTP.
   * Implicit FTPS has its own port (990), but it is disabled by default.
   */
  { "ftps",     "ftps://",  DEFAULT_FTP_PORT,  scm_has_params|scm_has_fragment },
#endif

  /* SCHEME_INVALID: the NULL leading_string of this sentinel entry is
     what terminates the scan in url_scheme().  */
  { NULL,       NULL,       -1,                 0 }
};
96 
/* Forward declarations: */

/* Called by url_parse on the freshly copied path; its boolean result
   is stored there as `path_modified'.  The definition is not visible
   in this part of the file.  */
static bool path_simplify (enum url_scheme, char *);
100 
101 /* Support for escaping and unescaping of URL strings.  */
102 
103 /* Table of "reserved" and "unsafe" characters.  Those terms are
104    rfc1738-speak, as such largely obsoleted by rfc2396 and later
105    specs, but the general idea remains.
106 
107    A reserved character is the one that you can't decode without
108    changing the meaning of the URL.  For example, you can't decode
109    "/foo/%2f/bar" into "/foo///bar" because the number and contents of
110    path components is different.  Non-reserved characters can be
111    changed, so "/foo/%78/bar" is safe to change to "/foo/x/bar".  The
112    unsafe characters are loosely based on rfc1738, plus "$" and ",",
113    as recommended by rfc2396, and minus "~", which is very frequently
114    used (and sometimes unrecognized as %7E by broken servers).
115 
116    An unsafe character is the one that should be encoded when URLs are
117    placed in foreign environments.  E.g. space and newline are unsafe
118    in HTTP contexts because HTTP uses them as separator and line
119    terminator, so they must be encoded to %20 and %0A respectively.
120    "*" is unsafe in shell context, etc.
121 
122    We determine whether a character is unsafe through static table
123    lookup.  This code assumes ASCII character set and 8-bit chars.  */
124 
/* Classification bits stored for each byte value in urlchr_table and
   queried through urlchr_test().  A table entry may carry neither,
   either, or both bits.  */
enum {
  /* rfc1738 reserved chars + "$" and ",".  */
  urlchr_reserved = 1,

  /* rfc1738 unsafe chars, plus non-printables.  */
  urlchr_unsafe   = 2
};
132 
/* Look up character C in urlchr_table and yield the bits selected by
   MASK (nonzero if any of them is set).  The cast to unsigned char
   keeps the index inside the 256-entry table even when C is a
   negative plain char.  */
#define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))
/* Nonzero if C carries the urlchr_reserved bit.  */
#define URL_RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
/* Nonzero if C carries the urlchr_unsafe bit.  */
#define URL_UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)
136 
/* Shorthands for the table.  They exist only to keep the initializer
   below readable and are #undef'ed right after it.  RU is not
   parenthesized; that is harmless here because it is only ever used
   as a complete initializer element.  */
#define R  urlchr_reserved
#define U  urlchr_unsafe
#define RU R|U

/* Classification of every possible byte value, indexed by the byte
   itself (see urlchr_test).  The first sixteen rows cover the ASCII
   range 0x00-0x7F, eight characters per row, with the characters
   named in the trailing comment.  The last eight rows cover
   0x80-0xFF, all of which are marked unsafe.  */
static const unsigned char urlchr_table[256] =
{
  U,  U,  U,  U,   U,  U,  U,  U,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
  U,  U,  U,  U,   U,  U,  U,  U,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
  U,  U,  U,  U,   U,  U,  U,  U,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
  U,  U,  U,  U,   U,  U,  U,  U,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
  U,  0,  U, RU,   R,  U,  R,  0,   /* SP  !   "   #    $   %   &   '   */
  0,  0,  0,  R,   R,  0,  0,  R,   /* (   )   *   +    ,   -   .   /   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
  0,  0, RU,  R,   U,  R,  U,  R,   /* 8   9   :   ;    <   =   >   ?   */
 RU,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
  0,  0,  0, RU,   U, RU,  U,  0,   /* X   Y   Z   [    \   ]   ^   _   */
  U,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
  0,  0,  0,  U,   U,  U,  0,  U,   /* x   y   z   {    |   }   ~   DEL */

  /* 0x80 - 0xBF */
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,

  /* 0xC0 - 0xFF */
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
};
#undef R
#undef U
#undef RU
174 
/* Shared worker for the url_unescape* functions.

   Rewrites S in place, replacing each "%HH" sequence by the byte it
   encodes.  A sequence is left untouched (the '%' is copied as an
   ordinary character) when:

     - '%' is not followed by two hexadecimal digits,
     - the decoded byte has any of the MASK bits set in urlchr_table,
     - the decoded byte is NUL, which would truncate the C string.

   The result is never longer than the input, so a read cursor and a
   write cursor over the same buffer are sufficient.  */
static void
url_unescape_1 (char *s, unsigned char mask)
{
  unsigned char *src = (unsigned char *) s;     /* read cursor  */
  unsigned char *dst = (unsigned char *) s;     /* write cursor */

  while (*src)
    {
      /* By default the current byte is copied through unchanged.  */
      unsigned char out = *src;

      if (out == '%' && src[1] && src[2]
          && c_isxdigit (src[1]) && c_isxdigit (src[2]))
        {
          unsigned char decoded = X2DIGITS_TO_NUM (src[1], src[2]);

          if (!urlchr_test (decoded, mask) && decoded != '\0')
            {
              /* Accept the escape: emit the decoded byte and skip the
                 two hex digits in addition to the '%' itself.  */
              out = decoded;
              src += 2;
            }
        }

      *dst++ = out;
      src++;
    }
  *dst = '\0';
}
207 
/* Decode the %-escapes of S in place.

   Every "%HH" sequence is replaced by the character whose code is the
   hexadecimal number HH.  A '%' that is not followed by two
   hexadecimal digits is kept literally.

   S itself is overwritten; callers that still need the escaped form
   must copy the string before calling.  */
void
url_unescape (char *s)
{
  /* An empty mask protects no character class from being decoded.  */
  const unsigned char protect_none = 0;

  url_unescape_1 (s, protect_none);
}
221 
222 /* URL-unescape the string S.
223 
224    This functions behaves identically as url_unescape(), but does not
225    convert characters from "reserved". In other words, it only converts
226    "unsafe" characters.  */
227 void
url_unescape_except_reserved(char * s)228 url_unescape_except_reserved (char *s)
229 {
230   url_unescape_1 (s, urlchr_reserved);
231 }
232 
/* The core of url_escape_* functions.  Escapes, as %XX, every
   character of S whose urlchr_table entry has any of the MASK bits
   set.

   If ALLOW_PASSTHROUGH is true and S contains no character to escape,
   S itself is returned (cast to non-const); the caller must then not
   free the result independently of S.  If ALLOW_PASSTHROUGH is false,
   a freshly allocated string is returned in all cases.

   All length arithmetic is done in size_t, and the total is checked
   against the size_t range before allocating, so an extremely long
   input can neither wrap the computed length nor cause an undersized
   buffer.  */

static char *
url_escape_1 (const char *s, unsigned char mask, bool allow_passthrough)
{
  const char *p1;
  char *p2, *newstr;
  size_t oldlen, newlen;
  size_t escapes = 0;           /* number of characters to escape */

  for (p1 = s; *p1; p1++)
    if (urlchr_test (*p1, mask))
      escapes++;

  if (!escapes)
    return allow_passthrough ? (char *)s : xstrdup (s);

  oldlen = (size_t) (p1 - s);

  /* Each escape adds two characters (hex digits).  Refuse to continue
     if OLDLEN + 2 * ESCAPES + 1 would not fit in size_t.  */
  if (escapes > ((size_t) -1 - oldlen - 1) / 2)
    abort ();

  newlen = oldlen + 2 * escapes;
  newstr = xmalloc (newlen + 1);

  p1 = s;
  p2 = newstr;
  while (*p1)
    {
      /* Quote the characters that match the test mask. */
      if (urlchr_test (*p1, mask))
        {
          unsigned char c = *p1++;
          *p2++ = '%';
          *p2++ = XNUM_TO_DIGIT (c >> 4);
          *p2++ = XNUM_TO_DIGIT (c & 0xf);
        }
      else
        *p2++ = *p1++;
    }
  assert ((size_t) (p2 - newstr) == newlen);
  *p2 = '\0';

  return newstr;
}
278 
279 /* URL-escape the unsafe characters (see urlchr_table) in a given
280    string, returning a freshly allocated string.  */
281 
282 char *
url_escape(const char * s)283 url_escape (const char *s)
284 {
285   return url_escape_1 (s, urlchr_unsafe, false);
286 }
287 
288 /* URL-escape the unsafe and reserved characters (see urlchr_table) in
289    a given string, returning a freshly allocated string.  */
290 
291 char *
url_escape_unsafe_and_reserved(const char * s)292 url_escape_unsafe_and_reserved (const char *s)
293 {
294   return url_escape_1 (s, urlchr_unsafe|urlchr_reserved, false);
295 }
296 
297 /* URL-escape the unsafe characters (see urlchr_table) in a given
298    string.  If no characters are unsafe, S is returned.  */
299 
300 static char *
url_escape_allow_passthrough(const char * s)301 url_escape_allow_passthrough (const char *s)
302 {
303   return url_escape_1 (s, urlchr_unsafe, true);
304 }
305 
/* Tell whether the character P points at must be written as %XX.

   A pointer is taken rather than a char because the answer for '%'
   depends on the two characters that follow it: a '%' that starts a
   well-formed %HH escape is kept, any other '%' is garbled and must
   itself be encoded.  Every other character needs escaping exactly
   when it is unsafe without also being reserved.

   Return true if the char should be escaped, false otherwise.  */

static inline bool
char_needs_escaping (const char *p)
{
  const char ch = *p;

  if (ch != '%')
    return URL_UNSAFE_CHAR (ch) && !URL_RESERVED_CHAR (ch);

  /* The second test is only evaluated when p[1] is a hex digit, hence
     never past the terminating NUL.  */
  return !(c_isxdigit (p[1]) && c_isxdigit (p[2]));
}
328 
329 /* Translate a %-escaped (but possibly non-conformant) input string S
330    into a %-escaped (and conformant) output string.  If no characters
331    are encoded or decoded, return the same string S; otherwise, return
332    a freshly allocated string with the new contents.
333 
334    After a URL has been run through this function, the protocols that
335    use `%' as the quote character can use the resulting string as-is,
336    while those that don't can use url_unescape to get to the intended
337    data.  This function is stable: once the input is transformed,
338    further transformations of the result yield the same output.
339 
340    Let's discuss why this function is needed.
341 
342    Imagine Wget is asked to retrieve `http://abc.xyz/abc def'.  Since
343    a raw space character would mess up the HTTP request, it needs to
344    be quoted, like this:
345 
346        GET /abc%20def HTTP/1.0
347 
348    It would appear that the unsafe chars need to be quoted, for
349    example with url_escape.  But what if we're requested to download
350    `abc%20def'?  url_escape transforms "%" to "%25", which would leave
351    us with `abc%2520def'.  This is incorrect -- since %-escapes are
352    part of URL syntax, "%20" is the correct way to denote a literal
353    space on the Wget command line.  This leads to the conclusion that
354    in that case Wget should not call url_escape, but leave the `%20'
355    as is.  This is clearly contradictory, but it only gets worse.
356 
357    What if the requested URI is `abc%20 def'?  If we call url_escape,
358    we end up with `/abc%2520%20def', which is almost certainly not
359    intended.  If we don't call url_escape, we are left with the
360    embedded space and cannot complete the request.  What the user
361    meant was for Wget to request `/abc%20%20def', and this is where
362    reencode_escapes kicks in.
363 
364    Wget used to solve this by first decoding %-quotes, and then
365    encoding all the "unsafe" characters found in the resulting string.
366    This was wrong because it didn't preserve certain URL special
367    (reserved) characters.  For instance, URI containing "a%2B+b" (0x2b
368    == '+') would get translated to "a%2B%2Bb" or "a++b" depending on
369    whether we considered `+' reserved (it is).  One of these results
370    is inevitable because by the second step we would lose information
371    on whether the `+' was originally encoded or not.  Both results
372    were wrong because in CGI parameters + means space, while %2B means
373    literal plus.  reencode_escapes correctly translates the above to
374    "a%2B+b", i.e. returns the original string.
375 
376    This function uses a modified version of the algorithm originally
377    proposed by Anon Sricharoenchai:
378 
379    * Encode all "unsafe" characters, except those that are also
380      "reserved", to %XX.  See urlchr_table for which characters are
381      unsafe and reserved.
382 
383    * Encode the "%" characters not followed by two hex digits to
384      "%25".
385 
386    * Pass through all other characters and %XX escapes as-is.  (Up to
387      Wget 1.10 this decoded %XX escapes corresponding to "safe"
388      characters, but that was obtrusive and broke some servers.)
389 
390    Anon's test case:
391 
392    "http://abc.xyz/%20%3F%%36%31%25aa% a?a=%61+a%2Ba&b=b%26c%3Dc"
393    ->
394    "http://abc.xyz/%20%3F%25%36%31%25aa%25%20a?a=%61+a%2Ba&b=b%26c%3Dc"
395 
396    Simpler test cases:
397 
398    "foo bar"         -> "foo%20bar"
399    "foo%20bar"       -> "foo%20bar"
400    "foo %20bar"      -> "foo%20%20bar"
401    "foo%%20bar"      -> "foo%25%20bar"       (0x25 == '%')
402    "foo%25%20bar"    -> "foo%25%20bar"
403    "foo%2%20bar"     -> "foo%252%20bar"
404    "foo+bar"         -> "foo+bar"            (plus is reserved!)
405    "foo%2b+bar"      -> "foo%2b+bar"  */
406 
/* Contract in brief: S is a NUL-terminated string.  If no character
   of S needs escaping according to char_needs_escaping, S itself is
   returned (cast to non-const) and nothing is allocated; callers tell
   the two cases apart by comparing the result with S.  Otherwise the
   result is a freshly xmalloc'ed string that the caller owns.

   Lengths are kept in size_t and checked for wrap-around before the
   allocation, so an extremely long input cannot yield an undersized
   buffer.  */
static char *
reencode_escapes (const char *s)
{
  const char *p1;
  char *newstr, *p2;
  size_t oldlen, newlen;

  size_t encode_count = 0;

  /* First pass: inspect the string to see if there's anything to do,
     and to calculate the new length.  */
  for (p1 = s; *p1; p1++)
    if (char_needs_escaping (p1))
      ++encode_count;

  if (!encode_count)
    /* The string is good as it is. */
    return (char *) s;          /* C const model sucks. */

  oldlen = (size_t) (p1 - s);

  /* Each encoding adds two characters (hex digits).  Refuse to
     continue if OLDLEN + 2 * ENCODE_COUNT + 1 would not fit in
     size_t.  */
  if (encode_count > ((size_t) -1 - oldlen - 1) / 2)
    abort ();

  newlen = oldlen + 2 * encode_count;
  newstr = xmalloc (newlen + 1);

  /* Second pass: copy the string to the destination address, encoding
     chars when needed.  */
  p1 = s;
  p2 = newstr;

  while (*p1)
    if (char_needs_escaping (p1))
      {
        unsigned char c = *p1++;
        *p2++ = '%';
        *p2++ = XNUM_TO_DIGIT (c >> 4);
        *p2++ = XNUM_TO_DIGIT (c & 0xf);
      }
    else
      *p2++ = *p1++;

  *p2 = '\0';
  assert ((size_t) (p2 - newstr) == newlen);
  return newstr;
}
451 
452 /* Returns the scheme type if the scheme is supported, or
453    SCHEME_INVALID if not.  */
454 
455 enum url_scheme
url_scheme(const char * url)456 url_scheme (const char *url)
457 {
458   int i;
459 
460   for (i = 0; supported_schemes[i].leading_string; i++)
461     if (0 == c_strncasecmp (url, supported_schemes[i].leading_string,
462                           strlen (supported_schemes[i].leading_string)))
463       {
464         if (!(supported_schemes[i].flags & scm_disabled))
465           return (enum url_scheme) i;
466         else
467           return SCHEME_INVALID;
468       }
469 
470   return SCHEME_INVALID;
471 }
472 
/* Nonzero if CH may appear in a scheme name as recognized by
   url_has_scheme: a character accepted by c_isalnum, '-' or '+'.  CH
   is evaluated more than once, so pass an expression without side
   effects.  */
#define SCHEME_CHAR(ch) (c_isalnum (ch) || (ch) == '-' || (ch) == '+')
474 
/* Return true if URL begins with something shaped like a scheme,
   false otherwise.  The scheme need not be one Wget supports: the
   test is purely syntactic and succeeds when URL starts with
   [-+a-zA-Z0-9]+: .  */

bool
url_has_scheme (const char *url)
{
  const char *cursor = url;

  /* Skip the longest run of scheme characters at the start.  */
  while (*cursor && SCHEME_CHAR (*cursor))
    cursor++;

  /* The run must be non-empty and be terminated by ':'.  */
  return cursor != url && *cursor == ':';
}
494 
495 bool
url_valid_scheme(const char * url)496 url_valid_scheme (const char *url)
497 {
498   enum url_scheme scheme = url_scheme (url);
499   return scheme != SCHEME_INVALID;
500 }
501 
502 int
scheme_default_port(enum url_scheme scheme)503 scheme_default_port (enum url_scheme scheme)
504 {
505   return supported_schemes[scheme].default_port;
506 }
507 
508 void
scheme_disable(enum url_scheme scheme)509 scheme_disable (enum url_scheme scheme)
510 {
511   supported_schemes[scheme].flags |= scm_disabled;
512 }
513 
514 const char *
scheme_leading_string(enum url_scheme scheme)515 scheme_leading_string (enum url_scheme scheme)
516 {
517   return supported_schemes[scheme].leading_string;
518 }
519 
/* Step over a leading "user[:password]@" part, if there is one.

   URL must be the portion of a URL that follows the scheme, not the
   complete URL.  Credentials are considered present only when an '@'
   occurs before any of the terminators '/', '?', '#' and ';'.

   Return a pointer to the character after that '@', or URL itself
   when no credentials are found.  */

static const char *
url_skip_credentials (const char *url)
{
  /* Length of the prefix that contains neither '@' nor a terminator;
     url[prefix_len] is then the first such character, or NUL.  */
  size_t prefix_len = strcspn (url, "@/?#;");

  return url[prefix_len] == '@' ? url + prefix_len + 1 : url;
}
536 
/* Split the credentials found in [BEG, END) into user name and
   password.

   The region is taken verbatim from a URL.  The first ':' in it, if
   any, separates the user name from the password.  Both parts are
   copied with strdupdelim and then %-unescaped with url_unescape.

   On success, return true with *USER set to the copied user name and
   *PASSWD set to the copied password, or to NULL when the region has
   no ':'.  Return false, leaving *USER and *PASSWD untouched, when
   the user name is empty.  */

static bool
parse_credentials (const char *beg, const char *end, char **user, char **passwd)
{
  const char *separator;
  const char *user_end = end;

  /* An empty region means an empty user name.  */
  if (beg == end)
    return false;

  /* So does a region that starts with the separator.  */
  separator = memchr (beg, ':', end - beg);
  if (separator == beg)
    return false;

  *passwd = NULL;
  if (separator != NULL)
    {
      *passwd = strdupdelim (separator + 1, end);
      url_unescape (*passwd);
      user_end = separator;
    }

  *user = strdupdelim (beg, user_end);
  url_unescape (*user);
  return true;
}
568 
569 /* Used by main.c: detect URLs written using the "shorthand" URL forms
570    originally popularized by Netscape and NcFTP.  HTTP shorthands look
571    like this:
572 
573    www.foo.com[:port]/dir/file   -> http://www.foo.com[:port]/dir/file
574    www.foo.com[:port]            -> http://www.foo.com[:port]
575 
576    FTP shorthands look like this:
577 
578    foo.bar.com:dir/file          -> ftp://foo.bar.com/dir/file
579    foo.bar.com:/absdir/file      -> ftp://foo.bar.com//absdir/file
580 
581    If the URL needs not or cannot be rewritten, return NULL.  */
582 
583 char *
rewrite_shorthand_url(const char * url)584 rewrite_shorthand_url (const char *url)
585 {
586   const char *p;
587   char *ret;
588 
589   if (url_scheme (url) != SCHEME_INVALID)
590     return NULL;
591 
592   /* Look for a ':' or '/'.  The former signifies NcFTP syntax, the
593      latter Netscape.  */
594   p = strpbrk (url, ":/");
595   if (p == url)
596     return NULL;
597 
598   /* If we're looking at "://", it means the URL uses a scheme we
599      don't support, which may include "https" when compiled without
600      SSL support.  Don't bogusly rewrite such URLs.  */
601   if (p && p[0] == ':' && p[1] == '/' && p[2] == '/')
602     return NULL;
603 
604   if (p && *p == ':')
605     {
606       /* Colon indicates ftp, as in foo.bar.com:path.  Check for
607          special case of http port number ("localhost:10000").  */
608       int digits = strspn (p + 1, "0123456789");
609       if (digits && (p[1 + digits] == '/' || p[1 + digits] == '\0'))
610         goto http;
611 
612       /* Turn "foo.bar.com:path" to "ftp://foo.bar.com/path". */
613       if ((ret = aprintf ("ftp://%s", url)) != NULL)
614         ret[6 + (p - url)] = '/';
615     }
616   else
617     {
618     http:
619       /* Just prepend "http://" to URL. */
620       ret = aprintf ("http://%s", url);
621     }
622   return ret;
623 }
624 
/* Forward declaration; url_parse calls this to fill in u->dir and
   u->file from u->path.  The definition is not visible in this part
   of the file.  */
static void split_path (const char *, char **, char **);
626 
/* Variant of strpbrk that never returns NULL: when S contains none of
   the characters in ACCEPT, the result points at the terminating zero
   of S (end-of-string, "eos") instead.  */

static inline char *
strpbrk_or_eos (const char *s, const char *accept)
{
  /* strcspn yields the offset of the first character that is in
     ACCEPT, or the length of S if there is none.  */
  return (char *) s + strcspn (s, accept);
}
639 
/* Convert STR to lowercase in place, using c_isupper and c_tolower.
   Return true if at least one character was changed, false if STR
   was left as it was.  */

static bool
lowercase_str (char *str)
{
  bool any_changed = false;
  char *cp = str;

  while (*cp != '\0')
    {
      if (c_isupper (*cp))
        {
          *cp = c_tolower (*cp);
          any_changed = true;
        }
      cp++;
    }

  return any_changed;
}
655 
656 static const char *
init_seps(enum url_scheme scheme)657 init_seps (enum url_scheme scheme)
658 {
659   static char seps[8] = ":/";
660   char *p = seps + 2;
661   int flags = supported_schemes[scheme].flags;
662 
663   if (flags & scm_has_params)
664     *p++ = ';';
665   if (flags & scm_has_query)
666     *p++ = '?';
667   if (flags & scm_has_fragment)
668     *p++ = '#';
669   *p = '\0';
670   return seps;
671 }
672 
/* Messages for the PE_* error codes that url_parse stores in *ERROR.
   Each PE_* macro is defined immediately before the message it
   indexes, so a code and its table position stay side by side; when
   adding an entry keep both in step.

   The strings are wrapped in N_(), presumably the gettext no-op
   marker so that they can be translated when displayed -- confirm
   against the project headers.  */
static const char *parse_errors[] = {
#define PE_NO_ERROR                     0
  N_("No error"),
#define PE_UNSUPPORTED_SCHEME           1
  N_("Unsupported scheme %s"), /* support for format token only here */
#define PE_MISSING_SCHEME               2
  N_("Scheme missing"),
#define PE_INVALID_HOST_NAME            3
  N_("Invalid host name"),
#define PE_BAD_PORT_NUMBER              4
  N_("Bad port number"),
#define PE_INVALID_USER_NAME            5
  N_("Invalid user name"),
#define PE_UNTERMINATED_IPV6_ADDRESS    6
  N_("Unterminated IPv6 numeric address"),
#define PE_IPV6_NOT_SUPPORTED           7
  N_("IPv6 addresses not supported"),
#define PE_INVALID_IPV6_ADDRESS         8
  N_("Invalid IPv6 numeric address")
};
693 
/* Parse a URL.

   Return a new struct url if successful, NULL on error.  In case of
   error, and if ERROR is not NULL, also set *ERROR to the appropriate
   error code.

   URL is the string to parse; it is never modified or freed here.

   ERROR, if not NULL, receives one of the PE_* codes on failure.  It
   is not written on success.

   IRI may be NULL.  If it is not NULL and IRI->utf8_encode is set,
   URL (or IRI->orig_url, when that is set) is first converted with
   remote_to_utf8.  IRI->utf8_encode is updated with the outcome, and
   on success IRI->orig_url is replaced by a copy of URL.

   PERCENT_ENCODE requests that URL be passed through reencode_escapes
   before parsing.  It is ignored when the IRI conversion above
   succeeded, because the converted string has then already been
   re-encoded.  */
struct url *
url_parse (const char *url, int *error, struct iri *iri, bool percent_encode)
{
  struct url *u;
  const char *p;
  bool path_modified, host_modified;

  enum url_scheme scheme;
  const char *seps;

  /* Begin/end pointers of each URL component inside URL_ENCODED.  */
  const char *uname_b,     *uname_e;
  const char *host_b,      *host_e;
  const char *path_b,      *path_e;
  const char *params_b,    *params_e;
  const char *query_b,     *query_e;
  const char *fragment_b,  *fragment_e;

  int port;
  char *user = NULL, *passwd = NULL;

  /* The string that is actually parsed.  It is either URL itself or a
     separately allocated re-encoded version of it; the two cases are
     told apart everywhere below by comparing it with URL.  */
  const char *url_encoded = NULL;

  int error_code;

  scheme = url_scheme (url);
  if (scheme == SCHEME_INVALID)
    {
      /* Distinguish "some scheme we don't handle" from "no scheme at
         all".  URL_ENCODED is still NULL here.  */
      if (url_has_scheme (url))
        error_code = PE_UNSUPPORTED_SCHEME;
      else
        error_code = PE_MISSING_SCHEME;
      goto error;
    }

  url_encoded = url;

  if (iri && iri->utf8_encode)
    {
      char *new_url = NULL;

      iri->utf8_encode = remote_to_utf8 (iri, iri->orig_url ? iri->orig_url : url, &new_url);
      if (!iri->utf8_encode)
        new_url = NULL;
      else
        {
          xfree (iri->orig_url);
          iri->orig_url = xstrdup (url);
          /* reencode_escapes returns its argument when nothing had to
             be changed; only free NEW_URL if a different string came
             back.  */
          url_encoded = reencode_escapes (new_url);
          if (url_encoded != new_url)
            xfree (new_url);
          percent_encode = false;
        }
    }

  /* May return URL itself, in which case URL_ENCODED == URL.  */
  if (percent_encode)
    url_encoded = reencode_escapes (url);

  p = url_encoded;
  p += strlen (supported_schemes[scheme].leading_string);
  /* [uname_b, uname_e) covers "user:pass@" including the '@'; it is
     empty when the URL carries no credentials.  */
  uname_b = p;
  p = url_skip_credentials (p);
  uname_e = p;

  /* scheme://user:pass@host[:port]... */
  /*                    ^              */

  /* We attempt to break down the URL into the components path,
     params, query, and fragment.  They are ordered like this:

       scheme://host[:port][/path][;params][?query][#fragment]  */

  path_b     = path_e     = NULL;
  params_b   = params_e   = NULL;
  query_b    = query_e    = NULL;
  fragment_b = fragment_e = NULL;

  /* Initialize separators for optional parts of URL, depending on the
     scheme.  For example, FTP has params, and HTTP and HTTPS have
     query string and fragment. */
  seps = init_seps (scheme);

  host_b = p;

  if (*p == '[')
    {
      /* Handle IPv6 address inside square brackets.  Ideally we'd
         just look for the terminating ']', but rfc2732 mandates
         rejecting invalid IPv6 addresses.  */

      /* The address begins after '['. */
      host_b = p + 1;
      host_e = strchr (host_b, ']');

      if (!host_e)
        {
          error_code = PE_UNTERMINATED_IPV6_ADDRESS;
          goto error;
        }

#ifdef ENABLE_IPV6
      /* Check if the IPv6 address is valid. */
      if (!is_valid_ipv6_address(host_b, host_e))
        {
          error_code = PE_INVALID_IPV6_ADDRESS;
          goto error;
        }

      /* Continue parsing after the closing ']'. */
      p = host_e + 1;
#else
      error_code = PE_IPV6_NOT_SUPPORTED;
      goto error;
#endif

      /* The closing bracket must be followed by a separator or by the
         null char.  */
      /* http://[::1]... */
      /*             ^   */
      /* strchr also matches the terminating NUL of SEPS, which is how
         end-of-string is accepted here.  */
      if (!strchr (seps, *p))
        {
          /* Trailing garbage after []-delimited IPv6 address. */
          error_code = PE_INVALID_HOST_NAME;
          goto error;
        }
    }
  else
    {
      p = strpbrk_or_eos (p, seps);
      host_e = p;
    }
  /* From here on SEPS is consumed one character at a time: each
     increment drops the separator of the part just handled, so the
     next scan stops only at separators of parts still to come.  */
  ++seps;                       /* advance to '/' */

  if (host_b == host_e)
    {
      error_code = PE_INVALID_HOST_NAME;
      goto error;
    }

  port = scheme_default_port (scheme);
  if (*p == ':')
    {
      const char *port_b, *port_e, *pp;

      /* scheme://host:port/tralala */
      /*              ^             */
      ++p;
      port_b = p;
      p = strpbrk_or_eos (p, seps);
      port_e = p;

      /* Allow empty port, as per rfc2396. */
      if (port_b != port_e)
        for (port = 0, pp = port_b; pp < port_e; pp++)
          {
            if (!c_isdigit (*pp))
              {
                /* http://host:12randomgarbage/blah */
                /*               ^                  */
                error_code = PE_BAD_PORT_NUMBER;
                goto error;
              }
            port = 10 * port + (*pp - '0');
            /* Check for too large port numbers here, before we have
               a chance to overflow on bogus port values.  */
            if (port > 0xffff)
              {
                error_code = PE_BAD_PORT_NUMBER;
                goto error;
              }
          }
    }
  /* Advance to the first separator *after* '/' (either ';' or '?',
     depending on the scheme).  */
  ++seps;

  /* Get the optional parts of URL, each part being delimited by
     current location and the position of the next separator.  */
  /* Note that SEPS is advanced whether or not the part is present.
     That is why the invocations below are guarded by the scheme
     flags: SEPS only contains the separators the scheme supports.  */
#define GET_URL_PART(sepchar, var) do {                         \
  if (*p == sepchar)                                            \
    var##_b = ++p, var##_e = p = strpbrk_or_eos (p, seps);      \
  ++seps;                                                       \
} while (0)

  GET_URL_PART ('/', path);
  if (supported_schemes[scheme].flags & scm_has_params)
    GET_URL_PART (';', params);
  if (supported_schemes[scheme].flags & scm_has_query)
    GET_URL_PART ('?', query);
  if (supported_schemes[scheme].flags & scm_has_fragment)
    GET_URL_PART ('#', fragment);

#undef GET_URL_PART
  /* Every character of the URL must have been assigned to a part.  */
  assert (*p == 0);

  if (uname_b != uname_e)
    {
      /* http://user:pass@host */
      /*        ^         ^    */
      /*     uname_b   uname_e */
      /* uname_e - 1 excludes the '@' from the credentials region.  */
      if (!parse_credentials (uname_b, uname_e - 1, &user, &passwd))
        {
          error_code = PE_INVALID_USER_NAME;
          goto error;
        }
    }

  /* All syntax checks that can fail without an allocated struct url
     are done; build the result.  USER and PASSWD are handed over to
     U here.  */
  u = xnew0 (struct url);
  u->scheme = scheme;
  u->host   = strdupdelim (host_b, host_e);
  u->port   = port;
  u->user   = user;
  u->passwd = passwd;

  /* PATH_B and PATH_E are both NULL when the URL has no path part.  */
  u->path = strdupdelim (path_b, path_e);
  path_modified = path_simplify (scheme, u->path);
  split_path (u->path, &u->dir, &u->file);

  host_modified = lowercase_str (u->host);

  /* Decode %HH sequences in host name.  This is important not so much
     to support %HH sequences in host names (which other browser
     don't), but to support binary characters (which will have been
     converted to %HH by reencode_escapes).  */
  if (strchr (u->host, '%'))
    {
      url_unescape (u->host);
      host_modified = true;

      /* check for invalid control characters in host name */
      for (p = u->host; *p; p++)
        {
          if (c_iscntrl(*p))
            {
              /* url_free is defined elsewhere; presumably it releases
                 U together with the strings stored in it, including
                 USER and PASSWD -- confirm.  The error path below
                 frees neither.  */
              url_free(u);
              error_code = PE_INVALID_HOST_NAME;
              goto error;
            }
        }

      /* Apply IDNA regardless of iri->utf8_encode status */
      if (opt.enable_iri && iri)
        {
          char *new = idn_encode (iri, u->host);
          if (new)
            {
              xfree (u->host);
              u->host = new;
              host_modified = true;
            }
        }
    }

  if (params_b)
    u->params = strdupdelim (params_b, params_e);
  if (query_b)
    u->query = strdupdelim (query_b, query_e);
  if (fragment_b)
    u->fragment = strdupdelim (fragment_b, fragment_e);

  if (opt.enable_iri || path_modified || u->fragment || host_modified || path_b == path_e)
    {
      /* If we suspect that a transformation has rendered what
         url_string might return different from URL_ENCODED, rebuild
         u->url using url_string.  */
      u->url = url_string (u, URL_AUTH_SHOW);

      /* URL_ENCODED is no longer needed; free it only if it is our
         own allocation rather than the caller's URL.  */
      if (url_encoded != url)
        xfree (url_encoded);
    }
  else
    {
      /* URL_ENCODED is usable as-is: copy it if it is the caller's
         string, otherwise hand our allocation over to U.  */
      if (url_encoded == url)
        u->url = xstrdup (url);
      else
        u->url = (char *) url_encoded;
    }

  return u;

 error:
  /* Cleanup in case of error: */
  if (url_encoded && url_encoded != url)
    xfree (url_encoded);

  /* Transmit the error code to the caller, if the caller wants to
     know.  */
  if (error)
    *error = error_code;
  return NULL;
}
989 
990 /* Return the error message string from ERROR_CODE, which should have
991    been retrieved from url_parse.  The error message is translated.  */
992 
993 char *
url_error(const char * url,int error_code)994 url_error (const char *url, int error_code)
995 {
996   assert (error_code >= 0 && ((size_t) error_code) < countof (parse_errors));
997 
998   if (error_code == PE_UNSUPPORTED_SCHEME)
999     {
1000       char *error, *p;
1001       char *scheme = xstrdup (url);
1002       assert (url_has_scheme (url));
1003 
1004       if ((p = strchr (scheme, ':')))
1005         *p = '\0';
1006       if (!c_strcasecmp (scheme, "https"))
1007         error = aprintf (_("HTTPS support not compiled in"));
1008       else
1009         error = aprintf (_(parse_errors[error_code]), quote (scheme));
1010       xfree (scheme);
1011 
1012       return error;
1013     }
1014   else
1015     return xstrdup (_(parse_errors[error_code]));
1016 }
1017 
/* Break PATH, which is taken from the URL and is expected to be
   URL-escaped, into its directory and file parts.

   Everything before the last slash becomes DIR, everything after it
   becomes FILE; a PATH without any slash has an empty DIR.  Both
   pieces are URL-unescaped after the split, so an escaped slash can
   end up inside FILE:

   PATH                 DIR           FILE
   "foo/bar/baz"        "foo/bar"     "baz"
   "foo/bar/"           "foo/bar"     ""
   "foo"                ""            "foo"
   "foo/bar/baz%2fqux"  "foo/bar"     "baz/qux" (!)

   *DIR and *FILE receive freshly allocated strings.  */

static void
split_path (const char *path, char **dir, char **file)
{
  const char *sep = strrchr (path, '/');

  if (sep != NULL)
    {
      *dir = strdupdelim (path, sep);
      *file = xstrdup (sep + 1);
    }
  else
    {
      *dir = xstrdup ("");
      *file = xstrdup (path);
    }

  /* Unescape only after splitting, so "%2f" never acts as a separator.  */
  url_unescape (*dir);
  url_unescape (*file);
}
1050 
1051 /* Note: URL's "full path" is the path with the query string and
1052    params appended.  The "fragment" (#foo) is intentionally ignored,
1053    but that might be changed.  For example, if the original URL was
1054    "http://host:port/foo/bar/baz;bullshit?querystring#uselessfragment",
1055    the full path will be "/foo/bar/baz;bullshit?querystring".  */
1056 
1057 /* Return the length of the full path, without the terminating
1058    zero.  */
1059 
1060 static int
full_path_length(const struct url * url)1061 full_path_length (const struct url *url)
1062 {
1063   int len = 0;
1064 
1065 #define FROB(el) if (url->el) len += 1 + strlen (url->el)
1066 
1067   FROB (path);
1068   FROB (params);
1069   FROB (query);
1070 
1071 #undef FROB
1072 
1073   return len;
1074 }
1075 
1076 /* Write out the full path. */
1077 
1078 static void
full_path_write(const struct url * url,char * where)1079 full_path_write (const struct url *url, char *where)
1080 {
1081 #define FROB(el, chr) do {                      \
1082   char *f_el = url->el;                         \
1083   if (f_el) {                                   \
1084     int l = strlen (f_el);                      \
1085     *where++ = chr;                             \
1086     memcpy (where, f_el, l);                    \
1087     where += l;                                 \
1088   }                                             \
1089 } while (0)
1090 
1091   FROB (path, '/');
1092   FROB (params, ';');
1093   FROB (query, '?');
1094 
1095 #undef FROB
1096 }
1097 
/* Public entry point for obtaining the "full path" of URL as a newly
   allocated, zero-terminated string.  For instance, with u->path
   "foo/bar" and u->query "param=value" the result is
   "/foo/bar?param=value".  */

char *
url_full_path (const struct url *url)
{
  const int len = full_path_length (url);
  char *result = xmalloc (len + 1);

  full_path_write (url, result);
  result[len] = '\0';           /* full_path_write does not terminate */

  return result;
}
1113 
/* Undo the %XX escaping of the single character CHR inside the
   otherwise escaped string STR, rewriting STR in place.  Useful for
   selectively keeping characters such as "/" and ":" unescaped.
   Only the hex digits produced by XNUM_TO_DIGIT are recognized.  */

static void
unescape_single_char (char *str, char chr)
{
  const char hi = XNUM_TO_DIGIT (chr >> 4);
  const char lo = XNUM_TO_DIGIT (chr & 0xf);
  const char *src = str;        /* read position */
  char *dst = str;              /* write position, never ahead of SRC */

  while (*src)
    {
      if (src[0] == '%' && src[1] == hi && src[2] == lo)
        {
          /* Collapse the three-character escape into CHR.  */
          *dst++ = chr;
          src += 3;
        }
      else
        *dst++ = *src++;
    }
  *dst = '\0';
}
1137 
1138 /* Escape unsafe and reserved characters, except for the slash
1139    characters.  */
1140 
1141 static char *
url_escape_dir(const char * dir)1142 url_escape_dir (const char *dir)
1143 {
1144   char *newdir = url_escape_1 (dir, urlchr_unsafe | urlchr_reserved, 1);
1145   if (newdir == dir)
1146     return (char *)dir;
1147 
1148   unescape_single_char (newdir, '/');
1149   return newdir;
1150 }
1151 
1152 /* Sync u->path and u->url with u->dir and u->file.  Called after
1153    u->file or u->dir have been changed, typically by the FTP code.  */
1154 
1155 static void
sync_path(struct url * u)1156 sync_path (struct url *u)
1157 {
1158   char *newpath, *efile, *edir;
1159 
1160   xfree (u->path);
1161 
1162   /* u->dir and u->file are not escaped.  URL-escape them before
1163      reassembling them into u->path.  That way, if they contain
1164      separators like '?' or even if u->file contains slashes, the
1165      path will be correctly assembled.  (u->file can contain slashes
1166      if the URL specifies it with %2f, or if an FTP server returns
1167      it.)  */
1168   edir = url_escape_dir (u->dir);
1169   efile = url_escape_1 (u->file, urlchr_unsafe | urlchr_reserved, 1);
1170 
1171   if (!*edir)
1172     newpath = xstrdup (efile);
1173   else
1174     {
1175       int dirlen = strlen (edir);
1176       int filelen = strlen (efile);
1177 
1178       /* Copy "DIR/FILE" to newpath. */
1179       char *p = newpath = xmalloc (dirlen + 1 + filelen + 1);
1180       memcpy (p, edir, dirlen);
1181       p += dirlen;
1182       *p++ = '/';
1183       memcpy (p, efile, filelen);
1184       p += filelen;
1185       *p = '\0';
1186     }
1187 
1188   u->path = newpath;
1189 
1190   if (edir != u->dir)
1191     xfree (edir);
1192   if (efile != u->file)
1193     xfree (efile);
1194 
1195   /* Regenerate u->url as well.  */
1196   xfree (u->url);
1197   u->url = url_string (u, URL_AUTH_SHOW);
1198 }
1199 
1200 /* Mutators.  Code in ftp.c insists on changing u->dir and u->file.
1201    This way we can sync u->path and u->url when they get changed.  */
1202 
1203 void
url_set_dir(struct url * url,const char * newdir)1204 url_set_dir (struct url *url, const char *newdir)
1205 {
1206   xfree (url->dir);
1207   url->dir = xstrdup (newdir);
1208   sync_path (url);
1209 }
1210 
1211 void
url_set_file(struct url * url,const char * newfile)1212 url_set_file (struct url *url, const char *newfile)
1213 {
1214   xfree (url->file);
1215   url->file = xstrdup (newfile);
1216   sync_path (url);
1217 }
1218 
1219 void
url_free(struct url * url)1220 url_free (struct url *url)
1221 {
1222   if (url)
1223     {
1224       xfree (url->host);
1225 
1226       xfree (url->path);
1227       xfree (url->url);
1228 
1229       xfree (url->params);
1230       xfree (url->query);
1231       xfree (url->fragment);
1232       xfree (url->user);
1233       xfree (url->passwd);
1234 
1235       xfree (url->dir);
1236       xfree (url->file);
1237 
1238       xfree (url);
1239     }
1240 }
1241 
1242 /* Create all the necessary directories for PATH (a file).  Calls
1243    make_directory internally.  */
1244 int
mkalldirs(const char * path)1245 mkalldirs (const char *path)
1246 {
1247   const char *p;
1248   char *t;
1249   struct stat st;
1250   int res;
1251 
1252   p = strrchr(path, '/');
1253   p = p == NULL ? path : p;
1254 
1255   /* Don't create if it's just a file.  */
1256   if ((p == path) && (*p != '/'))
1257     return 0;
1258   t = strdupdelim (path, p);
1259 
1260   /* Check whether the directory exists.  */
1261   if ((stat (t, &st) == 0))
1262     {
1263       if (S_ISDIR (st.st_mode))
1264         {
1265           xfree (t);
1266           return 0;
1267         }
1268       else
1269         {
1270           /* If the dir exists as a file name, remove it first.  This
1271              is *only* for Wget to work with buggy old CERN http
1272              servers.  Here is the scenario: When Wget tries to
1273              retrieve a directory without a slash, e.g.
1274              http://foo/bar (bar being a directory), CERN server will
1275              not redirect it too http://foo/bar/ -- it will generate a
1276              directory listing containing links to bar/file1,
1277              bar/file2, etc.  Wget will lose because it saves this
1278              HTML listing to a file `bar', so it cannot create the
1279              directory.  To work around this, if the file of the same
1280              name exists, we just remove it and create the directory
1281              anyway.  */
1282           DEBUGP (("Removing %s because of directory danger!\n", t));
1283           if (unlink (t))
1284             logprintf (LOG_NOTQUIET, "Failed to unlink %s (%d): %s\n",
1285                        t, errno, strerror(errno));
1286         }
1287     }
1288   res = make_directory (t);
1289   if (res != 0)
1290     logprintf (LOG_NOTQUIET, "%s: %s\n", t, strerror (errno));
1291   xfree (t);
1292   return res;
1293 }
1294 
1295 /* Functions for constructing the file name out of URL components.  */
1296 
1297 /* A growable string structure, used by url_file_name and friends.
1298    This should perhaps be moved to utils.c.
1299 
1300    The idea is to have a convenient and efficient way to construct a
1301    string by having various functions append data to it.  Instead of
   passing the obligatory BASEVAR, SIZEVAR and TAILPOS to all the
   functions in question, we pass a pointer to this struct.
1304 
1305    Functions that write to the members in this struct must make sure
1306    that base remains null terminated by calling append_null().
1307    */
1308 
struct growable {
  char *base; /* start of the buffer; NULL until first grown */
  int size;   /* memory allocated */
  int tail;   /* string length */
};

/* Ensure that the string can accept APPEND_SIZE more characters past
   the current TAIL position.  If necessary, this will grow the string
   and update its allocated size.  If the string is already large
   enough to take TAIL+APPEND_SIZE characters, this does nothing.

   Both arguments are parenthesized in the expansion, so APPEND_SIZE
   may be any expression (e.g. a conditional) without the surrounding
   `+' changing its meaning.  */
#define GROW(g, append_size) do {                                       \
  struct growable *G_ = (g);                                            \
  DO_REALLOC (G_->base, G_->size, G_->tail + (append_size), char);      \
} while (0)

/* Return the tail position of the string. */
#define TAIL(r) ((r)->base + (r)->tail)

/* Move the tail position by APPEND_COUNT characters. */
#define TAIL_INCR(r, append_count) ((r)->tail += (append_count))
1329 
1330 
1331 /* Append NULL to DEST. */
1332 static void
append_null(struct growable * dest)1333 append_null (struct growable *dest)
1334 {
1335   GROW (dest, 1);
1336   *TAIL (dest) = 0;
1337 }
1338 
1339 /* Append CH to DEST. */
1340 static void
append_char(char ch,struct growable * dest)1341 append_char (char ch, struct growable *dest)
1342 {
1343   if (ch)
1344     {
1345       GROW (dest, 1);
1346       *TAIL (dest) = ch;
1347       TAIL_INCR (dest, 1);
1348     }
1349 
1350   append_null (dest);
1351 }
1352 
1353 /* Append the string STR to DEST. */
1354 static void
append_string(const char * str,struct growable * dest)1355 append_string (const char *str, struct growable *dest)
1356 {
1357   int l = strlen (str);
1358 
1359   if (l)
1360     {
1361       GROW (dest, l);
1362       memcpy (TAIL (dest), str, l);
1363       TAIL_INCR (dest, l);
1364     }
1365 
1366   append_null (dest);
1367 }
1368 
1369 
/* Bit flags stored in filechr_table; each says under which
   restriction a character must be quoted in a file name.  */
enum {
  filechr_not_unix    = 1,      /* unusable on Unix, / and \0 */
  filechr_not_vms     = 2,      /* unusable on VMS (ODS5), 0x00-0x1F * ? */
  filechr_not_windows = 4,      /* unusable on Windows, one of \|/<>?:*" */
  filechr_control     = 8       /* a control character, 0-31 and DEL */
};

/* True if character C has to be quoted under MASK (an OR of the
   filechr_* flags above).  When opt.restrict_files_nonascii is set,
   every non-ASCII character is reported as well, regardless of
   MASK.  */
#define FILE_CHAR_TEST(c, mask) \
    ((opt.restrict_files_nonascii && !c_isascii ((unsigned char)(c))) || \
    (filechr_table[(unsigned char)(c)] & (mask)))

/* Shorthands for the table: */
#define U filechr_not_unix
#define V filechr_not_vms
#define W filechr_not_windows
#define C filechr_control

#define UVWC U|V|W|C
#define UW U|W
#define VC V|C
#define VW V|W

/* Table of characters unsafe under various conditions (see above),
   indexed by the character's value as an unsigned char.

   Arguably we could also claim `%' to be unsafe, since we use it as
   the escape character.  If we ever want to be able to reliably
   translate file name back to URL, this would become crucial.  Right
   now, it's better to be minimal in escaping.  */

static const unsigned char filechr_table[256] =
{
UVWC, VC, VC, VC,  VC, VC, VC, VC,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
  VC, VC, VC, VC,  VC, VC, VC, VC,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
  VC, VC, VC, VC,  VC, VC, VC, VC,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
  VC, VC, VC, VC,  VC, VC, VC, VC,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
   0,  0,  W,  0,   0,  0,  0,  0,   /* SP  !   "   #    $   %   &   '   */
   0,  0, VW,  0,   0,  0,  0, UW,   /* (   )   *   +    ,   -   .   /   */
   0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
   0,  0,  W,  0,   W,  0,  W, VW,   /* 8   9   :   ;    <   =   >   ?   */
   0,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
   0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
   0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
   0,  0,  0,  0,   W,  0,  0,  0,   /* X   Y   Z   [    \   ]   ^   _   */
   0,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
   0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
   0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
   0,  0,  0,  0,   W,  0,  0,  C,   /* x   y   z   {    |   }   ~   DEL */

  /* No flags are set for any character in the upper half.  */
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, /* 128-143 */
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, /* 144-159 */
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, /* 160-175 */
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, /* 176-191 */

  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, /* 192-207 */
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, /* 208-223 */
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, /* 224-239 */
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, /* 240-255 */
};
/* The one-letter shorthands are only meant for the table above.  */
#undef U
#undef V
#undef W
#undef C
#undef UW
#undef UVWC
#undef VC
#undef VW
1436 
/* FN_PORT_SEP separates host and port in file names when the port is
   not the scheme's default one.  It is ':' in general, giving names
   like "www.xemacs.org:4001/index.html"; with the Windows file name
   restrictions it is '+', because Windows can't handle ':' in file
   names.  */
#define FN_PORT_SEP  (opt.restrict_files_os == restrict_windows ? '+' : ':')

/* FN_QUERY_SEP separates the file name from the URL query.  It is
   '?' in general; with the VMS or Windows file name restrictions it
   is '@', because neither can handle '?' in a file name.
   FN_QUERY_SEP_STR is the same separator as a string.  */
#define FN_QUERY_SEP \
 (((opt.restrict_files_os == restrict_vms) || \
   (opt.restrict_files_os == restrict_windows)) ? '@' : '?')
#define FN_QUERY_SEP_STR \
 (((opt.restrict_files_os == restrict_vms) || \
   (opt.restrict_files_os == restrict_windows)) ? "@" : "?")
1452 
1453 /* Quote path element, characters in [b, e), as file name, and append
1454    the quoted string to DEST.  Each character is quoted as per
1455    file_unsafe_char and the corresponding table.
1456 
1457    If ESCAPED is true, the path element is considered to be
1458    URL-escaped and will be unescaped prior to inspection.  */
1459 
1460 static void
append_uri_pathel(const char * b,const char * e,bool escaped,struct growable * dest)1461 append_uri_pathel (const char *b, const char *e, bool escaped,
1462                    struct growable *dest)
1463 {
1464   const char *p;
1465   char buf[1024];
1466   char *unescaped = NULL;
1467   int quoted, outlen;
1468   int mask;
1469   int max_length;
1470 
1471   if (!dest)
1472     return;
1473 
1474   if (opt.restrict_files_os == restrict_unix)
1475     mask = filechr_not_unix;
1476   else if (opt.restrict_files_os == restrict_vms)
1477     mask = filechr_not_vms;
1478   else
1479     mask = filechr_not_windows;
1480 
1481   if (opt.restrict_files_ctrl)
1482     mask |= filechr_control;
1483 
1484   /* Copy [b, e) to PATHEL and URL-unescape it. */
1485   if (escaped)
1486     {
1487       size_t len = e - b;
1488 		if (len < sizeof (buf))
1489         unescaped = buf;
1490       else
1491         unescaped = xmalloc(len + 1);
1492 
1493 		memcpy(unescaped, b, len);
1494 		unescaped[len] = 0;
1495 
1496       url_unescape (unescaped);
1497       b = unescaped;
1498       e = unescaped + strlen (unescaped);
1499     }
1500 
1501   /* Defang ".." when found as component of path.  Remember that path
1502      comes from the URL and might contain malicious input.  */
1503   if (e - b == 2 && b[0] == '.' && b[1] == '.')
1504     {
1505       b = "%2E%2E";
1506       e = b + 6;
1507     }
1508 
1509   /* Walk the PATHEL string and check how many characters we'll need
1510      to quote.  */
1511   quoted = 0;
1512   for (p = b; p < e; p++)
1513     if (FILE_CHAR_TEST (*p, mask))
1514       ++quoted;
1515 
1516   /* Calculate the length of the output string.  e-b is the input
1517      string length.  Each quoted char introduces two additional
1518      characters in the string, hence 2*quoted.  */
1519   outlen = (e - b) + (2 * quoted);
1520 # ifdef WINDOWS
1521   max_length = MAX_PATH;
1522 # else
1523   max_length = get_max_length(dest->base, dest->tail, _PC_NAME_MAX);
1524 # endif
1525   max_length -= CHOMP_BUFFER;
1526   if (max_length > 0 && outlen > max_length)
1527     {
1528       logprintf (LOG_NOTQUIET, "The destination name is too long (%d), reducing to %d\n", outlen, max_length);
1529 
1530       outlen = max_length;
1531     }
1532   GROW (dest, outlen);
1533 
1534   // This should not happen, but it's impossible to argue with static analysis that it can't happen
1535   // (in theory it can). So give static analyzers a hint.
1536   if (!dest->base)
1537     return;
1538 
1539   if (!quoted)
1540     {
1541       /* If there's nothing to quote, we can simply append the string
1542          without processing it again.  */
1543       memcpy (TAIL (dest), b, outlen);
1544     }
1545   else
1546     {
1547       char *q = TAIL (dest);
1548       int i;
1549 
1550       for (i = 0, p = b; p < e; p++)
1551         {
1552           if (!FILE_CHAR_TEST (*p, mask))
1553 	    {
1554 	      if (i == outlen)
1555 	        break;
1556 	      *q++ = *p;
1557 	      i++;
1558 	    }
1559           else if (i + 3 > outlen)
1560 	    break;
1561 	  else
1562             {
1563               unsigned char ch = *p;
1564               *q++ = '%';
1565               *q++ = XNUM_TO_DIGIT (ch >> 4);
1566               *q++ = XNUM_TO_DIGIT (ch & 0xf);
1567 	      i += 3;
1568             }
1569         }
1570       assert (q - TAIL (dest) <= outlen);
1571     }
1572 
1573   /* Perform inline case transformation if required.  */
1574   if (opt.restrict_files_case == restrict_lowercase
1575       || opt.restrict_files_case == restrict_uppercase)
1576     {
1577       char *q;
1578       for (q = TAIL (dest); q < TAIL (dest) + outlen; ++q)
1579         {
1580           if (opt.restrict_files_case == restrict_lowercase)
1581             *q = c_tolower (*q);
1582           else
1583             *q = c_toupper (*q);
1584         }
1585     }
1586 
1587   TAIL_INCR (dest, outlen);
1588   append_null (dest);
1589 
1590   if (unescaped && unescaped != buf)
1591 	  free (unescaped);
1592 }
1593 
#ifdef HAVE_ICONV
/* Convert the file name FNAME from the remote encoding
   (opt.encoding_remote, "UTF-8" when unset) to the local one
   (opt.locale, nl_langinfo (CODESET) when unset).

   On success FNAME is freed and a newly allocated, zero-terminated
   string is returned.  When the conversion is unsupported or fails,
   FNAME itself is returned unchanged.  */
static char *
convert_fname (char *fname)
{
  char *converted_fname;
  const char *from_encoding = opt.encoding_remote;
  const char *to_encoding = opt.locale;
  iconv_t cd;
  size_t len, inlen, outlen;
  char *s;
  const char *orig_fname;

  /* Defaults for remote and local encodings.  */
  if (!from_encoding)
    from_encoding = "UTF-8";
  if (!to_encoding)
    to_encoding = nl_langinfo (CODESET);

  cd = iconv_open (to_encoding, from_encoding);
  if (cd == (iconv_t) (-1))
    {
      logprintf (LOG_VERBOSE, _ ("Conversion from %s to %s isn't supported\n"),
                 quote_n (0, from_encoding), quote_n (1, to_encoding));
      return fname;
    }

  /* The output buffer always holds LEN + 1 bytes.  S is the next
     write position and OUTLEN the room left after it, so that
     (s - converted_fname) + outlen == len at all times.  */
  orig_fname = fname;
  inlen = strlen (fname);
  len = outlen = inlen * 2;
  converted_fname = s = xmalloc (outlen + 1);

  for (;;)
    {
      errno = 0;
      if (iconv (cd, (ICONV_CONST char **) &fname, &inlen, &s, &outlen) == 0
          && iconv (cd, NULL, NULL, &s, &outlen) == 0)
        {
          /* S points right behind the converted text; the extra byte
             of the buffer guarantees room for the terminator.  */
          *s = '\0';
          iconv_close (cd);
          DEBUGP (("Converted file name '%s' (%s) -> '%s' (%s)\n",
                   orig_fname, from_encoding, converted_fname, to_encoding));
          xfree (orig_fname);
          return converted_fname;
        }

      /* Incomplete or invalid multibyte sequence */
      if (errno == EINVAL || errno == EILSEQ || errno == 0)
        {
          if (errno)
            logprintf (LOG_VERBOSE,
                       _ ("Incomplete or invalid multibyte sequence encountered\n"));
          else
            logprintf (LOG_VERBOSE,
                       _ ("Unconvertable multibyte sequence encountered\n"));
          xfree (converted_fname);
          converted_fname = (char *) orig_fname;
          break;
        }
      else if (errno == E2BIG) /* Output buffer full */
        {
          /* Enlarge the buffer while keeping what has been written.
             iconv has already advanced S and reduced OUTLEN, so the
             write offset must be taken from S, and OUTLEN may only
             grow by the amount that was added.  Always add something,
             even when no input is left (E2BIG from the final flush),
             or the loop would never make progress.  */
          size_t used = s - converted_fname;
          size_t extra = inlen > 0 ? inlen * 2 : 16;

          len += extra;
          converted_fname = xrealloc (converted_fname, len + 1);
          s = converted_fname + used;
          outlen += extra;
        }
      else /* Weird, we got an unspecified error */
        {
          logprintf (LOG_VERBOSE, _ ("Unhandled errno %d\n"), errno);
          xfree (converted_fname);
          converted_fname = (char *) orig_fname;
          break;
        }
    }
  DEBUGP (("Failed to convert file name '%s' (%s) -> '?' (%s)\n",
           orig_fname, from_encoding, to_encoding));

  iconv_close (cd);

  return converted_fname;
}
#else
/* Without iconv no conversion is possible; FNAME is returned as is.  */
static char *
convert_fname (char *fname)
{
  return fname;
}
#endif
1682 
1683 /* Append to DEST the directory structure that corresponds the
1684    directory part of URL's path.  For example, if the URL is
1685    http://server/dir1/dir2/file, this appends "/dir1/dir2".
1686 
1687    Each path element ("dir1" and "dir2" in the above example) is
1688    examined, url-unescaped, and re-escaped as file name element.
1689 
1690    Additionally, it cuts as many directories from the path as
1691    specified by opt.cut_dirs.  For example, if opt.cut_dirs is 1, it
1692    will produce "bar" for the above example.  For 2 or more, it will
1693    produce "".
1694 
1695    Each component of the path is quoted for use as file name.  */
1696 
1697 static void
append_dir_structure(const struct url * u,struct growable * dest)1698 append_dir_structure (const struct url *u, struct growable *dest)
1699 {
1700   char *pathel, *next;
1701   int cut = opt.cut_dirs;
1702 
1703   /* Go through the path components, de-URL-quote them, and quote them
1704      (if necessary) as file names.  */
1705 
1706   pathel = u->path;
1707   for (; (next = strchr (pathel, '/')) != NULL; pathel = next + 1)
1708     {
1709       if (cut-- > 0)
1710         continue;
1711       if (pathel == next)
1712         /* Ignore empty pathels.  */
1713         continue;
1714 
1715       if (dest->tail)
1716         append_char ('/', dest);
1717 
1718       append_uri_pathel (pathel, next, true, dest);
1719     }
1720 }
1721 
1722 /* Return a unique file name that matches the given URL as well as
1723    possible.  Does not create directories on the file system.  */
1724 
1725 char *
url_file_name(const struct url * u,char * replaced_filename)1726 url_file_name (const struct url *u, char *replaced_filename)
1727 {
1728   struct growable fnres;        /* stands for "file name result" */
1729   struct growable temp_fnres;
1730 
1731   const char *u_file;
1732   char *fname, *unique, *fname_len_check;
1733   const char *index_filename = "index.html"; /* The default index file is index.html */
1734 
1735   fnres.base = NULL;
1736   fnres.size = 0;
1737   fnres.tail = 0;
1738 
1739   temp_fnres.base = NULL;
1740   temp_fnres.size = 0;
1741   temp_fnres.tail = 0;
1742 
1743   /* If an alternative index file was defined, change index_filename */
1744   if (opt.default_page)
1745     index_filename = opt.default_page;
1746 
1747 
1748   /* Start with the directory prefix, if specified. */
1749   if (opt.dir_prefix)
1750     append_string (opt.dir_prefix, &fnres);
1751 
1752   /* If "dirstruct" is turned on (typically the case with -r), add
1753      the host and port (unless those have been turned off) and
1754      directory structure.  */
1755   /* All safe remote chars are unescaped and stored in temp_fnres,
1756      then converted to local and appended to fnres.
1757      Internationalized URL/IDN will produce punycode to lookup IP from DNS:
1758      https://en.wikipedia.org/wiki/URL
1759      https://en.wikipedia.org/wiki/Internationalized_domain_name
1760      Non-ASCII code chars in the path:
1761      https://en.wikipedia.org/wiki/List_of_Unicode_characters
1762      https://en.wikipedia.org/wiki/List_of_writing_systems */
1763   if (opt.dirstruct)
1764     {
1765       if (opt.protocol_directories)
1766         {
1767           if (temp_fnres.tail)
1768             append_char ('/', &temp_fnres);
1769           append_string (supported_schemes[u->scheme].name, &temp_fnres);
1770         }
1771       if (opt.add_hostdir)
1772         {
1773           if (temp_fnres.tail)
1774             append_char ('/', &temp_fnres);
1775           if (0 != strcmp (u->host, ".."))
1776             append_string (u->host, &temp_fnres);
1777           else
1778             /* Host name can come from the network; malicious DNS may
1779                allow ".." to be resolved, causing us to write to
1780                "../<file>".  Defang such host names.  */
1781             append_string ("%2E%2E", &temp_fnres);
1782           if (u->port != scheme_default_port (u->scheme))
1783             {
1784               char portstr[24];
1785               number_to_string (portstr, u->port);
1786               append_char (FN_PORT_SEP, &temp_fnres);
1787               append_string (portstr, &temp_fnres);
1788             }
1789         }
1790 
1791       append_dir_structure (u, &temp_fnres);
1792     }
1793 
1794   if (!replaced_filename)
1795     {
1796       /* Create the filename. */
1797       u_file = *u->file ? u->file : index_filename;
1798 
1799       /* Append "?query" to the file name, even if empty,
1800        * and create fname_len_check. */
1801       if (u->query)
1802         fname_len_check = concat_strings (u_file, FN_QUERY_SEP_STR, u->query, NULL);
1803       else
1804         fname_len_check = strdupdelim (u_file, u_file + strlen (u_file));
1805     }
1806   else
1807     {
1808       u_file = replaced_filename;
1809       fname_len_check = strdupdelim (u_file, u_file + strlen (u_file));
1810     }
1811 
1812   if (temp_fnres.tail)
1813     append_char ('/', &temp_fnres);
1814 
1815   append_uri_pathel (fname_len_check,
1816     fname_len_check + strlen (fname_len_check), true, &temp_fnres);
1817 
1818   /* Zero-terminate the temporary file name. */
1819   append_char ('\0', &temp_fnres);
1820 
1821   /* convert all remote chars before length check and appending to local path */
1822   fname = convert_fname (temp_fnres.base);
1823   temp_fnres.base = NULL;
1824   temp_fnres.size = 0;
1825   temp_fnres.tail = 0;
1826   append_string (fname, &temp_fnres);
1827 
1828   xfree (fname);
1829   xfree (fname_len_check);
1830 
1831   /* The filename has already been 'cleaned' by append_uri_pathel() above.  So,
1832    * just append it. */
1833   if (fnres.tail)
1834     append_char ('/', &fnres);
1835   append_string (temp_fnres.base, &fnres);
1836 
1837   fname = fnres.base;
1838 
1839   /* Make a final check that the path length is acceptable? */
1840   /* TODO: check fnres.base for path length problem */
1841 
1842   xfree (temp_fnres.base);
1843 
1844   /* Check the cases in which the unique extensions are not used:
1845      1) Clobbering is turned off (-nc).
1846      2) Retrieval with regetting.
1847      3) Timestamping is used.
1848      4) Hierarchy is built.
1849      5) Backups are specified.
1850 
1851      The exception is the case when file does exist and is a
1852      directory (see `mkalldirs' for explanation).  */
1853 
1854   if (ALLOW_CLOBBER
1855       && !(file_exists_p (fname, NULL) && !file_non_directory_p (fname)))
1856     {
1857       unique = fname;
1858     }
1859   else
1860     {
1861       unique = unique_name_passthrough (fname);
1862       if (unique != fname)
1863         xfree (fname);
1864     }
1865 
1866 /* On VMS, alter the name as required. */
1867 #ifdef __VMS
1868   {
1869     char *unique2;
1870 
1871     unique2 = ods_conform( unique);
1872     if (unique2 != unique)
1873       {
1874         xfree (unique);
1875         unique = unique2;
1876       }
1877   }
1878 #endif /* def __VMS */
1879 
1880   return unique;
1881 }
1882 
1883 /* Resolve "." and ".." elements of PATH by destructively modifying
1884    PATH and return true if PATH has been modified, false otherwise.
1885 
1886    The algorithm is in spirit similar to the one described in rfc1808,
1887    although implemented differently, in one pass.  To recap, path
1888    elements containing only "." are removed, and ".." is taken to mean
1889    "back up one element".  Single leading and trailing slashes are
1890    preserved.
1891 
1892    For example, "a/b/c/./../d/.." will yield "a/b/".  More exhaustive
1893    test examples are provided below.  If you change anything in this
1894    function, run test_path_simplify to make sure you haven't broken a
1895    test case.  */
1896 
1897 static bool
path_simplify(enum url_scheme scheme,char * path)1898 path_simplify (enum url_scheme scheme, char *path)
1899 {
1900   char *h = path;               /* hare */
1901   char *t = path;               /* tortoise */
1902   char *beg = path;
1903   char *end = strchr (path, '\0');
1904 
1905   while (h < end)
1906     {
1907       /* Hare should be at the beginning of a path element. */
1908 
1909       if (h[0] == '.' && (h[1] == '/' || h[1] == '\0'))
1910         {
1911           /* Ignore "./". */
1912           h += 2;
1913         }
1914       else if (h[0] == '.' && h[1] == '.' && (h[2] == '/' || h[2] == '\0'))
1915         {
1916           /* Handle "../" by retreating the tortoise by one path
1917              element -- but not past beginning.  */
1918           if (t > beg)
1919             {
1920               /* Move backwards until T hits the beginning of the
1921                  previous path element or the beginning of path. */
1922               for (--t; t > beg && t[-1] != '/'; t--)
1923                 ;
1924             }
1925           else if (scheme == SCHEME_FTP
1926 #ifdef HAVE_SSL
1927               || scheme == SCHEME_FTPS
1928 #endif
1929               )
1930             {
1931               /* If we're at the beginning, copy the "../" literally
1932                  and move the beginning so a later ".." doesn't remove
1933                  it.  This violates RFC 3986; but we do it for FTP
1934                  anyway because there is otherwise no way to get at a
1935                  parent directory, when the FTP server drops us in a
1936                  non-root directory (which is not uncommon). */
1937               beg = t + 3;
1938               goto regular;
1939             }
1940           h += 3;
1941         }
1942       else
1943         {
1944         regular:
1945           /* A regular path element.  If H hasn't advanced past T,
1946              simply skip to the next path element.  Otherwise, copy
1947              the path element until the next slash.  */
1948           if (t == h)
1949             {
1950               /* Skip the path element, including the slash.  */
1951               while (h < end && *h != '/')
1952                 t++, h++;
1953               if (h < end)
1954                 t++, h++;
1955             }
1956           else
1957             {
1958               /* Copy the path element, including the final slash.  */
1959               while (h < end && *h != '/')
1960                 *t++ = *h++;
1961               if (h < end)
1962                 *t++ = *h++;
1963             }
1964         }
1965     }
1966 
1967   if (t != h)
1968     *t = '\0';
1969 
1970   return t != h;
1971 }
1972 
1973 /* Return the length of URL's path.  Path is considered to be
1974    terminated by one or more of the ?query or ;params or #fragment,
1975    depending on the scheme.  */
1976 
1977 static const char *
path_end(const char * url)1978 path_end (const char *url)
1979 {
1980   enum url_scheme scheme = url_scheme (url);
1981   const char *seps;
1982   if (scheme == SCHEME_INVALID)
1983     scheme = SCHEME_HTTP;       /* use http semantics for rel links */
1984   /* +2 to ignore the first two separators ':' and '/' */
1985   seps = init_seps (scheme) + 2;
1986   return strpbrk_or_eos (url, seps);
1987 }
1988 
/* Find the last occurrence of character C in the range [b, e), or
   NULL, if none are present.  This is a thin wrapper around memrchr
   over the (e) - (b) bytes starting at B.  Note that B appears twice
   in the expansion, so it must not have side effects.  */
#define find_last_char(b, e, c) memrchr ((b), (c), (e) - (b))
1992 
/* Merge BASE with LINK and return the resulting URI in newly
   allocated memory.

   Either of the URIs may be absolute or relative, complete with the
   host name, or path only.  Only minimal URL parsing is employed,
   without knowledge of the specifics of schemes.  In every case the
   result is a leading part of BASE followed by all of LINK; what
   differs between the cases is how much of BASE is kept:

     LINK has a scheme   -> LINK alone
     LINK is empty       -> BASE alone
     "?query"            -> BASE up to the end of its path
     "#fragment"         -> BASE up to its own '#', if any
     "//net/path"        -> BASE up to its "//", if any
     "/abs/path"         -> BASE up to the first slash after "//"
     anything else       -> BASE up to and including its last slash

   path_simplify is deliberately not called on the result, although
   rfc1738 seems to suggest it: that would complicate the code, and
   url_parse has to simplify the path anyway.  */

char *
uri_merge (const char *base, const char *link)
{
  const char *end;              /* we may not examine BASE past END */
  const char *cut = NULL;       /* BASE is kept up to, not including, CUT */
  bool need_explicit_slash = false;
  int linklength, keep;
  char *merge;

  if (url_has_scheme (link))
    return xstrdup (link);

  end = path_end (base);
  linklength = strlen (link);

  /* Empty LINK points back to BASE, query string and all. */
  if (!*link)
    return xstrdup (base);

  switch (*link)
    {
    case '?':
      /* LINK points to the same location, but changes the query
         string.  Examples: */
      /* uri_merge("path",         "?new") -> "path?new"     */
      /* uri_merge("path?foo",     "?new") -> "path?new"     */
      /* uri_merge("path?foo#bar", "?new") -> "path?new"     */
      /* uri_merge("path#foo",     "?new") -> "path?new"     */
      cut = end;
      break;

    case '#':
      /* uri_merge("path",         "#new") -> "path#new"     */
      /* uri_merge("path#foo",     "#new") -> "path#new"     */
      /* uri_merge("path?foo",     "#new") -> "path?foo#new" */
      /* uri_merge("path?foo#bar", "#new") -> "path?foo#new" */
      cut = strchr (base, '#');
      if (!cut)
        cut = base + strlen (base);
      break;

    case '/':
      {
        /* Both kinds of slash-led LINK start by locating the first
           slash in BASE's path and checking whether it opens a
           double slash.  */
        const char *slash = memchr (base, '/', end - base);
        const bool base_has_slash_slash = slash && slash[1] == '/';

        if (link[1] == '/')
          {
            /* LINK begins with "//" and so is a net path: replace
               everything from BASE's double slash on, or all of BASE
               when it has none.  */
            /* uri_merge("foo", "//new/bar")            -> "//new/bar"      */
            /* uri_merge("//old/foo", "//new/bar")      -> "//new/bar"      */
            /* uri_merge("http://old/foo", "//new/bar") -> "http://new/bar" */
            cut = base_has_slash_slash ? slash : base;
          }
        else if (!base_has_slash_slash)
          {
            /* LINK is an absolute path and BASE has no "//":
               examples "foo" and "foo/bar".  Everything in BASE is
               replaced.  */
            cut = base;
          }
        else
          {
            /* LINK is an absolute path: replace everything after
               (and including) the first slash that follows "//".
               So, if BASE is "http://host/whatever/foo/bar", and
               LINK is "/qux/xyzzy", the result is
               "http://host/qux/xyzzy".  With no such slash, as in
               "http://foo", LINK is appended at END.  */
            slash = memchr (slash + 2, '/', end - (slash + 2));
            cut = slash ? slash : end;
          }
      }
      break;

    default:
      {
        /* LINK is a relative URL: replace everything after the last
           slash (possibly empty) with LINK.  So, if BASE is
           "whatever/foo/bar", and LINK is "qux/xyzzy", the result is
           "whatever/foo/qux/xyzzy".  */
        const char *last_slash = find_last_char (base, end, '/');

        if (!last_slash)
          {
            /* No slash found at all.  Replace what we have with LINK. */
            cut = base;
          }
        else if (last_slash >= base + 2
                 && last_slash[-2] == ':' && last_slash[-1] == '/')
          {
            /* example: "http://host" -- the last slash belongs to
               "://".  Keep everything up to END plus one extra byte,
               which is overwritten with '/' below.  */
            cut = end + 1;
            need_explicit_slash = true;
          }
        else
          {
            /* example: "whatever/foo/bar" -- keep "whatever/foo/". */
            cut = last_slash + 1;
          }
      }
      break;
    }

  /* Glue the kept part of BASE and LINK together. */
  keep = cut - base;
  merge = xmalloc (keep + linklength + 1);
  if (keep)
    memcpy (merge, base, keep);
  if (need_explicit_slash)
    merge[keep - 1] = '/';
  memcpy (merge + keep, link, linklength);
  merge[keep + linklength] = '\0';

  return merge;
}
2183 
/* Copy the characters of the string S (without its terminating null)
   to the location P points at, and advance P past them.  P must be a
   modifiable pointer variable; S is evaluated twice.  The caller is
   responsible for there being enough room at P.  */
#define APPEND(p, s) do {                       \
  const int append_len_ = strlen (s);           \
  memcpy (p, s, append_len_);                   \
  p += append_len_;                             \
} while (0)
2189 
/* Use this instead of password when the actual password is supposed
   to be hidden.  We intentionally use a fixed, generic string rather
   than giving away the number of characters in the password, as
   previous versions did.  url_string substitutes it for the password
   when called with URL_AUTH_HIDE_PASSWD.  */
#define HIDDEN_PASSWORD "*password*"
2195 
2196 /* Recreate the URL string from the data in URL.
2197 
2198    If HIDE is true (as it is when we're calling this on a URL we plan
2199    to print, but not when calling it to canonicalize a URL for use
2200    within the program), password will be hidden.  Unsafe characters in
2201    the URL will be quoted.  */
2202 
2203 char *
url_string(const struct url * url,enum url_auth_mode auth_mode)2204 url_string (const struct url *url, enum url_auth_mode auth_mode)
2205 {
2206   int size;
2207   char *result, *p;
2208   char *quoted_host, *quoted_user = NULL, *quoted_passwd = NULL;
2209 
2210   int scheme_port = supported_schemes[url->scheme].default_port;
2211   const char *scheme_str = supported_schemes[url->scheme].leading_string;
2212   int fplen = full_path_length (url);
2213 
2214   bool brackets_around_host;
2215 
2216   assert (scheme_str != NULL);
2217 
2218   /* Make sure the user name and password are quoted. */
2219   if (url->user)
2220     {
2221       if (auth_mode != URL_AUTH_HIDE)
2222         {
2223           quoted_user = url_escape_allow_passthrough (url->user);
2224           if (url->passwd)
2225             {
2226               if (auth_mode == URL_AUTH_HIDE_PASSWD)
2227                 quoted_passwd = (char *) HIDDEN_PASSWORD;
2228               else
2229                 quoted_passwd = url_escape_allow_passthrough (url->passwd);
2230             }
2231         }
2232     }
2233 
2234   /* In the unlikely event that the host name contains non-printable
2235      characters, quote it for displaying to the user.  */
2236   quoted_host = url_escape_allow_passthrough (url->host);
2237 
2238   /* Undo the quoting of colons that URL escaping performs.  IPv6
2239      addresses may legally contain colons, and in that case must be
2240      placed in square brackets.  */
2241   if (quoted_host != url->host)
2242     unescape_single_char (quoted_host, ':');
2243   brackets_around_host = strchr (quoted_host, ':') != NULL;
2244 
2245   size = (strlen (scheme_str)
2246           + strlen (quoted_host)
2247           + (brackets_around_host ? 2 : 0)
2248           + fplen
2249           + 1);
2250   if (url->port != scheme_port)
2251     size += 1 + numdigit (url->port);
2252   if (quoted_user)
2253     {
2254       size += 1 + strlen (quoted_user);
2255       if (quoted_passwd)
2256         size += 1 + strlen (quoted_passwd);
2257     }
2258 
2259   p = result = xmalloc (size);
2260 
2261   APPEND (p, scheme_str);
2262   if (quoted_user)
2263     {
2264       APPEND (p, quoted_user);
2265       if (quoted_passwd)
2266         {
2267           *p++ = ':';
2268           APPEND (p, quoted_passwd);
2269         }
2270       *p++ = '@';
2271     }
2272 
2273   if (brackets_around_host)
2274     *p++ = '[';
2275   APPEND (p, quoted_host);
2276   if (brackets_around_host)
2277     *p++ = ']';
2278   if (url->port != scheme_port)
2279     {
2280       *p++ = ':';
2281       p = number_to_string (p, url->port);
2282     }
2283 
2284   full_path_write (url, p);
2285   p += fplen;
2286   *p++ = '\0';
2287 
2288   assert (p - result == size);
2289 
2290   if (quoted_user && quoted_user != url->user)
2291     xfree (quoted_user);
2292   if (quoted_passwd && auth_mode == URL_AUTH_SHOW
2293       && quoted_passwd != url->passwd)
2294     xfree (quoted_passwd);
2295   if (quoted_host != url->host)
2296     xfree (quoted_host);
2297 
2298   return result;
2299 }
2300 
2301 /* Return true if scheme a is similar to scheme b.
2302 
2303    Schemes are similar if they are equal.  If SSL is supported, schemes
2304    are also similar if one is http (SCHEME_HTTP) and the other is https
2305    (SCHEME_HTTPS).  */
2306 bool
schemes_are_similar_p(enum url_scheme a,enum url_scheme b)2307 schemes_are_similar_p (enum url_scheme a, enum url_scheme b)
2308 {
2309   if (a == b)
2310     return true;
2311 #ifdef HAVE_SSL
2312   if ((a == SCHEME_HTTP && b == SCHEME_HTTPS)
2313       || (a == SCHEME_HTTPS && b == SCHEME_HTTP))
2314     return true;
2315 #endif
2316   return false;
2317 }
2318 
/* Read one logical character from the start of the URL-escaped string
   STR, store it in *C, and return the number of bytes of STR it
   occupied:

     - an ordinary character is stored as is, and 1 is returned;
     - "%XX" (two hex digits) that decodes to a character for which
       URL_RESERVED_CHAR is false is stored decoded, and 3 is
       returned;
     - a '%' that is not followed by two hex digits, or a "%XX" that
       decodes to a reserved character, is left undecoded: '%' is
       stored and 1 is returned, so the bytes after it are read
       literally by subsequent calls.

   STR must be a non-empty string and C must not be NULL.  The return
   value is never 0.  (An earlier version tested for a null byte in
   the second hex digit position and returned 0, but that test came
   after the byte had been verified to be a hex digit and could never
   succeed.)  */
static int
getchar_from_escaped_string (const char *str, char *c)
{
  assert (str && *str);
  assert (c);

  /* Unless a decodable escape is found below, the first byte stands
     for itself -- including a '%' that is not decoded.  */
  *c = str[0];

  /* The checks short-circuit, so str[2] is only read when str[1] is
     a hex digit, i.e. not the terminating null.  */
  if (str[0] == '%' && c_isxdigit (str[1]) && c_isxdigit (str[2]))
    {
      char decoded = X2DIGITS_TO_NUM (str[1], str[2]);

      /* Reserved characters keep their escaped form, so that e.g.
         "%2f" is not treated as a literal '/'.  */
      if (!URL_RESERVED_CHAR (decoded))
        {
          *c = decoded;
          return 3;
        }
    }

  return 1;
}
2356 
/* Return true if the URL strings U1 and U2 are equal, false
   otherwise.  The strings are walked in parallel, one logical
   character at a time as delivered by getchar_from_escaped_string
   (so an escape that function decodes compares equal to the plain
   character), and the characters are compared without regard to
   case via c_tolower.  Both strings must be exhausted at the same
   time for the URLs to be equal.  Neither argument may be NULL.  */
bool
are_urls_equal (const char *u1, const char *u2)
{
  assert(u1 && u2);

  while (*u1 != '\0' && *u2 != '\0')
    {
      char c1, c2;
      int used1, used2;

      /* A zero length from the reader ends the comparison.  */
      used1 = getchar_from_escaped_string (u1, &c1);
      if (used1 == 0)
        break;

      used2 = getchar_from_escaped_string (u2, &c2);
      if (used2 == 0)
        break;

      if (c_tolower (c1) != c_tolower (c2))
        break;

      u1 += used1;
      u2 += used2;
    }

  /* Equal only if the walk consumed both strings completely.  */
  return *u1 == '\0' && *u2 == '\0';
}
2379 
2380 #ifdef TESTING
2381 /* Debugging and testing support for path_simplify. */
2382 
#if 0
/* Debug: run path_simplify on PATH and return the result in a new
   string.  Useful for calling from the debugger.

   NOTE(review): this helper is compiled out, and the call below is
   stale: it passes only the path, while path_simplify takes the
   scheme as its first argument.  A scheme must be supplied before
   this can be re-enabled.  */
static char *
ps (char *path)
{
  char *copy = xstrdup (path);
  path_simplify (copy);
  return copy;
}
#endif
2394 
2395 static const char *
run_test(const char * test,const char * expected_result,enum url_scheme scheme,bool expected_change)2396 run_test (const char *test, const char *expected_result, enum url_scheme scheme,
2397           bool expected_change)
2398 {
2399   char *test_copy = xstrdup (test);
2400   bool modified = path_simplify (scheme, test_copy);
2401 
2402   if (0 != strcmp (test_copy, expected_result))
2403     {
2404       printf ("Failed path_simplify(\"%s\"): expected \"%s\", got \"%s\".\n",
2405               test, expected_result, test_copy);
2406       mu_assert ("", 0);
2407     }
2408   if (modified != expected_change)
2409     {
2410       if (expected_change)
2411         printf ("Expected modification with path_simplify(\"%s\").\n",
2412                 test);
2413       else
2414         printf ("Expected no modification with path_simplify(\"%s\").\n",
2415                 test);
2416     }
2417   xfree (test_copy);
2418   mu_assert ("", modified == expected_change);
2419   return NULL;
2420 }
2421 
2422 const char *
test_path_simplify(void)2423 test_path_simplify (void)
2424 {
2425   static const struct {
2426     const char *test, *result;
2427     enum url_scheme scheme;
2428     bool should_modify;
2429   } tests[] = {
2430     { "",                       "",             SCHEME_HTTP, false },
2431     { ".",                      "",             SCHEME_HTTP, true },
2432     { "./",                     "",             SCHEME_HTTP, true },
2433     { "..",                     "",             SCHEME_HTTP, true },
2434     { "../",                    "",             SCHEME_HTTP, true },
2435     { "..",                     "..",           SCHEME_FTP,  false },
2436     { "../",                    "../",          SCHEME_FTP,  false },
2437     { "foo",                    "foo",          SCHEME_HTTP, false },
2438     { "foo/bar",                "foo/bar",      SCHEME_HTTP, false },
2439     { "foo///bar",              "foo///bar",    SCHEME_HTTP, false },
2440     { "foo/.",                  "foo/",         SCHEME_HTTP, true },
2441     { "foo/./",                 "foo/",         SCHEME_HTTP, true },
2442     { "foo./",                  "foo./",        SCHEME_HTTP, false },
2443     { "foo/../bar",             "bar",          SCHEME_HTTP, true },
2444     { "foo/../bar/",            "bar/",         SCHEME_HTTP, true },
2445     { "foo/bar/..",             "foo/",         SCHEME_HTTP, true },
2446     { "foo/bar/../x",           "foo/x",        SCHEME_HTTP, true },
2447     { "foo/bar/../x/",          "foo/x/",       SCHEME_HTTP, true },
2448     { "foo/..",                 "",             SCHEME_HTTP, true },
2449     { "foo/../..",              "",             SCHEME_HTTP, true },
2450     { "foo/../../..",           "",             SCHEME_HTTP, true },
2451     { "foo/../../bar/../../baz", "baz",         SCHEME_HTTP, true },
2452     { "foo/../..",              "..",           SCHEME_FTP,  true },
2453     { "foo/../../..",           "../..",        SCHEME_FTP,  true },
2454     { "foo/../../bar/../../baz", "../../baz",   SCHEME_FTP,  true },
2455     { "a/b/../../c",            "c",            SCHEME_HTTP, true },
2456     { "./a/../b",               "b",            SCHEME_HTTP, true }
2457   };
2458   unsigned i;
2459 
2460   for (i = 0; i < countof (tests); i++)
2461     {
2462       const char *message;
2463       const char *test = tests[i].test;
2464       const char *expected_result = tests[i].result;
2465       enum url_scheme scheme = tests[i].scheme;
2466       bool  expected_change = tests[i].should_modify;
2467 
2468       message = run_test (test, expected_result, scheme, expected_change);
2469       if (message) return message;
2470     }
2471   return NULL;
2472 }
2473 
2474 const char *
test_append_uri_pathel(void)2475 test_append_uri_pathel(void)
2476 {
2477   unsigned i;
2478   static const struct {
2479     const char *original_url;
2480     const char *input;
2481     bool escaped;
2482     const char *expected_result;
2483   } test_array[] = {
2484     { "http://www.yoyodyne.com/path/", "somepage.html", false, "http://www.yoyodyne.com/path/somepage.html" },
2485   };
2486 
2487   for (i = 0; i < countof(test_array); ++i)
2488     {
2489       struct growable dest;
2490       const char *p = test_array[i].input;
2491 
2492       memset (&dest, 0, sizeof (dest));
2493 
2494       append_string (test_array[i].original_url, &dest);
2495       append_uri_pathel (p, p + strlen(p), test_array[i].escaped, &dest);
2496 
2497       mu_assert ("test_append_uri_pathel: wrong result",
2498                  strcmp (dest.base, test_array[i].expected_result) == 0);
2499       xfree (dest.base);
2500     }
2501 
2502   return NULL;
2503 }
2504 
2505 const char *
test_are_urls_equal(void)2506 test_are_urls_equal(void)
2507 {
2508   unsigned i;
2509   static const struct {
2510     const char *url1;
2511     const char *url2;
2512     bool expected_result;
2513   } test_array[] = {
2514     { "http://www.adomain.com/apath/", "http://www.adomain.com/apath/",       true },
2515     { "http://www.adomain.com/apath/", "http://www.adomain.com/anotherpath/", false },
2516     { "http://www.adomain.com/apath/", "http://www.anotherdomain.com/path/",  false },
2517     { "http://www.adomain.com/~path/", "http://www.adomain.com/%7epath/",     true },
2518     { "http://www.adomain.com/longer-path/", "http://www.adomain.com/path/",  false },
2519     { "http://www.adomain.com/path%2f", "http://www.adomain.com/path/",       false },
2520   };
2521 
2522   for (i = 0; i < countof(test_array); ++i)
2523     {
2524       mu_assert ("test_are_urls_equal: wrong result",
2525                  are_urls_equal (test_array[i].url1, test_array[i].url2) == test_array[i].expected_result);
2526     }
2527 
2528   return NULL;
2529 }
2530 
2531 #endif /* TESTING */
2532 
2533 /*
2534  * vim: et ts=2 sw=2
2535  */
2536