1 /***************************************************************************
2  *                                  _   _ ____  _
3  *  Project                     ___| | | |  _ \| |
4  *                             / __| | | | |_) | |
5  *                            | (__| |_| |  _ <| |___
6  *                             \___|\___/|_| \_\_____|
7  *
8  * Copyright (C) 1998 - 2019, Daniel Stenberg, <daniel@haxx.se>, et al.
9  *
10  * This software is licensed as described in the file COPYING, which
11  * you should have received as part of this distribution. The terms
12  * are also available at https://curl.haxx.se/docs/copyright.html.
13  *
14  * You may opt to use, copy, modify, merge, publish, distribute and/or sell
15  * copies of the Software, and permit persons to whom the Software is
16  * furnished to do so, under the terms of the COPYING file.
17  *
18  * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
19  * KIND, either express or implied.
20  *
21  ***************************************************************************/
22 
23 #include "curl_setup.h"
24 
25 #include "urldata.h"
26 #include "urlapi-int.h"
27 #include "strcase.h"
28 #include "dotdot.h"
29 #include "url.h"
30 #include "escape.h"
31 #include "curl_ctype.h"
32 #include "inet_pton.h"
33 
34 /* The last 3 #include files should be in this order */
35 #include "curl_printf.h"
36 #include "curl_memory.h"
37 #include "memdebug.h"
38 
/* MSDOS/Windows style drive prefix, eg c: in c:foo.
 *
 * Evaluates to non-zero when 'str' starts with an ASCII letter immediately
 * followed by a colon. The argument is parenthesized everywhere so that any
 * pointer expression (for example 'p + 1') can be passed safely. Note that
 * 'str' is evaluated more than once, so it must not have side effects. */
#define STARTS_WITH_DRIVE_PREFIX(str) \
  ((('a' <= (str)[0] && (str)[0] <= 'z') || \
    ('A' <= (str)[0] && (str)[0] <= 'Z')) && \
   ((str)[1] == ':'))
44 
/* Drive prefix the way it may show up inside a URL: one ASCII letter, then
 * either ':' or '|', and after that a slash, a backslash or the end of the
 * string. 'str' is evaluated several times. */
#define STARTS_WITH_URL_DRIVE_PREFIX(str) \
  (((((str)[0] >= 'a') && ((str)[0] <= 'z')) || \
    (((str)[0] >= 'A') && ((str)[0] <= 'Z'))) && \
   ((':' == (str)[1]) || ('|' == (str)[1])) && \
   (('/' == (str)[2]) || ('\\' == (str)[2]) || (0 == (str)[2])))
52 
/* Internal representation of CURLU. Point to URL-encoded strings.
 *
 * Every char pointer member is either NULL or a heap allocated string owned
 * by the handle; free_urlhandle() releases all of them. */
struct Curl_URL {
  char *scheme;
  char *user;
  char *password;
  char *options; /* login options; only parsed out of the URL when the
                    scheme handler has PROTOPT_URLOPTIONS set (IMAP only?) */
  char *host;
  char *zoneid; /* for numerical IPv6 addresses */
  char *port; /* decimal port number as a string */
  char *path;
  char *query;
  char *fragment;

  char *scratch; /* temporary scratch area, used while parsing a URL */
  long portnum; /* the numerical version */
};

/* scheme used when a URL has none and CURLU_DEFAULT_SCHEME is set */
#define DEFAULT_SCHEME "https"
71 
free_urlhandle(struct Curl_URL * u)72 static void free_urlhandle(struct Curl_URL *u)
73 {
74   free(u->scheme);
75   free(u->user);
76   free(u->password);
77   free(u->options);
78   free(u->host);
79   free(u->zoneid);
80   free(u->port);
81   free(u->path);
82   free(u->query);
83   free(u->fragment);
84   free(u->scratch);
85 }
86 
87 /* move the full contents of one handle onto another and
88    free the original */
mv_urlhandle(struct Curl_URL * from,struct Curl_URL * to)89 static void mv_urlhandle(struct Curl_URL *from,
90                          struct Curl_URL *to)
91 {
92   free_urlhandle(to);
93   *to = *from;
94   free(from);
95 }
96 
/*
 * find_host_sep() returns a pointer to where the host name part of 'url'
 * ends: the first '/' or '?' (whichever comes first) found after the leading
 * "//", as in http://www.url.com?id=2380. Without a "//" the search starts
 * at the beginning of the string. If neither character is present, the
 * terminating zero byte is returned.
 */
static const char *find_host_sep(const char *url)
{
  const char *end = url + strlen(url);
  const char *dslash = strstr(url, "//");
  const char *host = dslash ? dslash + 2 : url; /* start of the host name */
  const char *slash = strchr(host, '/');
  const char *qmark = strchr(host, '?');

  /* a missing separator counts as being at the end of the string */
  if(!slash)
    slash = end;
  if(!qmark)
    qmark = end;

  return (qmark <= slash) ? qmark : slash;
}
124 
/*
 * urlchar_needs_escaping() tells, without depending on the character
 * encoding, if a byte in a URL has to be percent-encoded. A byte needs
 * escaping when it is neither a control character, nor white space, nor a
 * graphical character. strlen_url() and strcpy_url() must both rely on this
 * one criterion to stay in sync.
 */
static bool urlchar_needs_escaping(int c)
{
  return !ISCNTRL(c) && !ISSPACE(c) && !ISGRAPH(c);
}
134 
135 /*
136  * strlen_url() returns the length of the given URL if the spaces within the
137  * URL were properly URL encoded.
138  * URL encoding should be skipped for host names, otherwise IDN resolution
139  * will fail.
140  */
strlen_url(const char * url,bool relative)141 static size_t strlen_url(const char *url, bool relative)
142 {
143   const unsigned char *ptr;
144   size_t newlen = 0;
145   bool left = TRUE; /* left side of the ? */
146   const unsigned char *host_sep = (const unsigned char *) url;
147 
148   if(!relative)
149     host_sep = (const unsigned char *) find_host_sep(url);
150 
151   for(ptr = (unsigned char *)url; *ptr; ptr++) {
152 
153     if(ptr < host_sep) {
154       ++newlen;
155       continue;
156     }
157 
158     switch(*ptr) {
159     case '?':
160       left = FALSE;
161       /* FALLTHROUGH */
162     default:
163       if(urlchar_needs_escaping(*ptr))
164         newlen += 2;
165       newlen++;
166       break;
167     case ' ':
168       if(left)
169         newlen += 3;
170       else
171         newlen++;
172       break;
173     }
174   }
175   return newlen;
176 }
177 
178 /* strcpy_url() copies a url to a output buffer and URL-encodes the spaces in
179  * the source URL accordingly.
180  * URL encoding should be skipped for host names, otherwise IDN resolution
181  * will fail.
182  */
strcpy_url(char * output,const char * url,bool relative)183 static void strcpy_url(char *output, const char *url, bool relative)
184 {
185   /* we must add this with whitespace-replacing */
186   bool left = TRUE;
187   const unsigned char *iptr;
188   char *optr = output;
189   const unsigned char *host_sep = (const unsigned char *) url;
190 
191   if(!relative)
192     host_sep = (const unsigned char *) find_host_sep(url);
193 
194   for(iptr = (unsigned char *)url;    /* read from here */
195       *iptr;         /* until zero byte */
196       iptr++) {
197 
198     if(iptr < host_sep) {
199       *optr++ = *iptr;
200       continue;
201     }
202 
203     switch(*iptr) {
204     case '?':
205       left = FALSE;
206       /* FALLTHROUGH */
207     default:
208       if(urlchar_needs_escaping(*iptr)) {
209         msnprintf(optr, 4, "%%%02x", *iptr);
210         optr += 3;
211       }
212       else
213         *optr++=*iptr;
214       break;
215     case ' ':
216       if(left) {
217         *optr++='%'; /* add a '%' */
218         *optr++='2'; /* add a '2' */
219         *optr++='0'; /* add a '0' */
220       }
221       else
222         *optr++='+'; /* add a '+' here */
223       break;
224     }
225   }
226   *optr = 0; /* zero terminate output buffer */
227 
228 }
229 
230 /*
231  * Returns true if the given URL is absolute (as opposed to relative) within
232  * the buffer size. Returns the scheme in the buffer if TRUE and 'buf' is
233  * non-NULL.
234  */
Curl_is_absolute_url(const char * url,char * buf,size_t buflen)235 bool Curl_is_absolute_url(const char *url, char *buf, size_t buflen)
236 {
237   size_t i;
238 #ifdef WIN32
239   if(STARTS_WITH_DRIVE_PREFIX(url))
240     return FALSE;
241 #endif
242   for(i = 0; i < buflen && url[i]; ++i) {
243     char s = url[i];
244     if((s == ':') && (url[i + 1] == '/')) {
245       if(buf)
246         buf[i] = 0;
247       return TRUE;
248     }
249     /* RFC 3986 3.1 explains:
250       scheme      = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
251     */
252     else if(ISALNUM(s) || (s == '+') || (s == '-') || (s == '.') ) {
253       if(buf)
254         buf[i] = (char)TOLOWER(s);
255     }
256     else
257       break;
258   }
259   return FALSE;
260 }
261 
/*
 * concat_url()
 *
 * Concatenate a relative URL to a base URL making it absolute.
 * URL-encodes any spaces.
 *
 * 'base' is the URL the relative one is resolved against; it is copied and
 * never modified. 'relurl' is the relative reference, which may be:
 *
 *  - a path relative to the last slash of the base ("foo", "./foo",
 *    "../foo"),
 *  - a query only ("?foo"), which is appended to the base with its own query
 *    part removed,
 *  - an absolute path ("/foo"), which replaces everything from the first
 *    slash (or '?') after the host name of the base, or
 *  - a reference starting with "//", which replaces everything after the
 *    "//" of the base, including the host name.
 *
 * The returned pointer must be freed by the caller unless NULL
 * (returns NULL on out of memory).
 */
static char *concat_url(const char *base, const char *relurl)
{
  /***
   TRY to append this new path to the old URL
   to the right of the host part. This is a best-effort approach working on
   the strings only, not a full URL resolver.
  */
  char *newest;
  char *protsep;
  char *pathsep;
  size_t newlen;
  bool host_changed = FALSE;

  const char *useurl = relurl;
  size_t urllen;

  /* we must make our own copy of the URL to play with, as it may
     point to read-only data */
  char *url_clone = strdup(base);

  if(!url_clone)
    return NULL; /* skip out of this NOW */

  /* protsep points to the start of the host name */
  protsep = strstr(url_clone, "//");
  if(!protsep)
    protsep = url_clone;
  else
    protsep += 2; /* pass the slashes */

  if('/' != relurl[0]) {
    int level = 0; /* number of "../" prefixes found in the new URL */

    /* First we need to find out if there's a ?-letter in the URL,
       and cut it and the right-side of that off */
    pathsep = strchr(protsep, '?');
    if(pathsep)
      *pathsep = 0;

    /* we have a relative path to append to the last slash if there's one
       available, or if the new URL is just a query string (starts with a
       '?')  we append the new one at the end of the entire currently worked
       out URL */
    if(useurl[0] != '?') {
      pathsep = strrchr(protsep, '/');
      if(pathsep)
        *pathsep = 0;
    }

    /* Check if there's any slash after the host name, and if so, remember
       that position instead. From here on, protsep is either the start of
       the path (past the slash) or NULL if the clone has no path left. */
    pathsep = strchr(protsep, '/');
    if(pathsep)
      protsep = pathsep + 1;
    else
      protsep = NULL;

    /* now deal with one "./" or any amount of "../" in the newurl
       and act accordingly */

    if((useurl[0] == '.') && (useurl[1] == '/'))
      useurl += 2; /* just skip the "./" */

    while((useurl[0] == '.') &&
          (useurl[1] == '.') &&
          (useurl[2] == '/')) {
      level++;
      useurl += 3; /* pass the "../" */
    }

    if(protsep) {
      while(level--) {
        /* cut off one more level from the right of the original URL */
        pathsep = strrchr(protsep, '/');
        if(pathsep)
          *pathsep = 0;
        else {
          /* no more levels to remove, the path is now empty */
          *protsep = 0;
          break;
        }
      }
    }
  }
  else {
    /* We got a new absolute path for this server */

    if((relurl[0] == '/') && (relurl[1] == '/')) {
      /* the new URL starts with //, just keep the protocol part from the
         original one */
      *protsep = 0;
      useurl = &relurl[2]; /* we keep the slashes from the original, so we
                              skip the new ones */
      host_changed = TRUE;
    }
    else {
      /* cut off the original URL from the first slash, or deal with URLs
         without slash */
      pathsep = strchr(protsep, '/');
      if(pathsep) {
        /* When people use badly formatted URLs, such as
           "http://www.url.com?dir=/home/daniel" we must not use the first
           slash, if there's a ?-letter before it! */
        char *sep = strchr(protsep, '?');
        if(sep && (sep < pathsep))
          pathsep = sep;
        *pathsep = 0;
      }
      else {
        /* There was no slash. Now, since we might be operating on a badly
           formatted URL, such as "http://www.url.com?id=2380" which doesn't
           use a slash separator as it is supposed to, we need to check for a
           ?-letter as well! */
        pathsep = strchr(protsep, '?');
        if(pathsep)
          *pathsep = 0;
      }
    }
  }

  /* If the new part contains a space, this is a poorly formed redirect
     but we still make an effort to do "right". To the left of a '?'
     letter we replace each space with %20 while it is replaced with '+'
     on the right side of the '?' letter.

     The new part is only treated as containing a host name (which is then
     left unencoded) when it replaced the host of the base URL.
  */
  newlen = strlen_url(useurl, !host_changed);

  urllen = strlen(url_clone);

  newest = malloc(urllen + 1 + /* possible slash */
                  newlen + 1 /* zero byte */);

  if(!newest) {
    free(url_clone); /* don't leak this */
    return NULL;
  }

  /* copy over the root url part */
  memcpy(newest, url_clone, urllen);

  /* check if we need to append a slash: not when the new part brings its
     own, not when the remaining base path is empty and not when only a
     query is appended */
  if(('/' == useurl[0]) || (protsep && !*protsep) || ('?' == useurl[0]))
    ;
  else
    newest[urllen++]='/';

  /* then append the new piece on the right side */
  strcpy_url(&newest[urllen], useurl, !host_changed);

  free(url_clone);

  return newest;
}
420 
421 /*
422  * parse_hostname_login()
423  *
424  * Parse the login details (user name, password and options) from the URL and
425  * strip them out of the host name
426  *
427  */
parse_hostname_login(struct Curl_URL * u,const struct Curl_handler * h,char ** hostname,unsigned int flags)428 static CURLUcode parse_hostname_login(struct Curl_URL *u,
429                                       const struct Curl_handler *h,
430                                       char **hostname,
431                                       unsigned int flags)
432 {
433   CURLUcode result = CURLUE_OK;
434   CURLcode ccode;
435   char *userp = NULL;
436   char *passwdp = NULL;
437   char *optionsp = NULL;
438 
439   /* At this point, we're hoping all the other special cases have
440    * been taken care of, so conn->host.name is at most
441    *    [user[:password][;options]]@]hostname
442    *
443    * We need somewhere to put the embedded details, so do that first.
444    */
445 
446   char *ptr = strchr(*hostname, '@');
447   char *login = *hostname;
448 
449   if(!ptr)
450     goto out;
451 
452   /* We will now try to extract the
453    * possible login information in a string like:
454    * ftp://user:password@ftp.my.site:8021/README */
455   *hostname = ++ptr;
456 
457   /* We could use the login information in the URL so extract it. Only parse
458      options if the handler says we should. Note that 'h' might be NULL! */
459   ccode = Curl_parse_login_details(login, ptr - login - 1,
460                                    &userp, &passwdp,
461                                    (h && (h->flags & PROTOPT_URLOPTIONS)) ?
462                                    &optionsp:NULL);
463   if(ccode) {
464     result = CURLUE_MALFORMED_INPUT;
465     goto out;
466   }
467 
468   if(userp) {
469     if(flags & CURLU_DISALLOW_USER) {
470       /* Option DISALLOW_USER is set and url contains username. */
471       result = CURLUE_USER_NOT_ALLOWED;
472       goto out;
473     }
474 
475     u->user = userp;
476   }
477 
478   if(passwdp)
479     u->password = passwdp;
480 
481   if(optionsp)
482     u->options = optionsp;
483 
484   return CURLUE_OK;
485   out:
486 
487   free(userp);
488   free(passwdp);
489   free(optionsp);
490 
491   return result;
492 }
493 
Curl_parse_port(struct Curl_URL * u,char * hostname)494 UNITTEST CURLUcode Curl_parse_port(struct Curl_URL *u, char *hostname)
495 {
496   char *portptr = NULL;
497   char endbracket;
498   int len;
499 
500   /*
501    * Find the end of an IPv6 address, either on the ']' ending bracket or
502    * a percent-encoded zone index.
503    */
504   if(1 == sscanf(hostname, "[%*45[0123456789abcdefABCDEF:.]%c%n",
505                  &endbracket, &len)) {
506     if(']' == endbracket)
507       portptr = &hostname[len];
508     else if('%' == endbracket) {
509       int zonelen = len;
510       if(1 == sscanf(hostname + zonelen, "%*[^]]%c%n", &endbracket, &len)) {
511         if(']' != endbracket)
512           return CURLUE_MALFORMED_INPUT;
513         portptr = &hostname[--zonelen + len + 1];
514       }
515       else
516         return CURLUE_MALFORMED_INPUT;
517     }
518     else
519       return CURLUE_MALFORMED_INPUT;
520 
521     /* this is a RFC2732-style specified IP-address */
522     if(portptr && *portptr) {
523       if(*portptr != ':')
524         return CURLUE_MALFORMED_INPUT;
525     }
526     else
527       portptr = NULL;
528   }
529   else
530     portptr = strchr(hostname, ':');
531 
532   if(portptr) {
533     char *rest;
534     long port;
535     char portbuf[7];
536 
537     /* Browser behavior adaptation. If there's a colon with no digits after,
538        just cut off the name there which makes us ignore the colon and just
539        use the default port. Firefox, Chrome and Safari all do that. */
540     if(!portptr[1]) {
541       *portptr = '\0';
542       return CURLUE_OK;
543     }
544 
545     if(!ISDIGIT(portptr[1]))
546       return CURLUE_BAD_PORT_NUMBER;
547 
548     port = strtol(portptr + 1, &rest, 10);  /* Port number must be decimal */
549 
550     if((port <= 0) || (port > 0xffff))
551       /* Single unix standard says port numbers are 16 bits long, but we don't
552          treat port zero as OK. */
553       return CURLUE_BAD_PORT_NUMBER;
554 
555     if(rest[0])
556       return CURLUE_BAD_PORT_NUMBER;
557 
558     *portptr++ = '\0'; /* cut off the name there */
559     *rest = 0;
560     /* generate a new port number string to get rid of leading zeroes etc */
561     msnprintf(portbuf, sizeof(portbuf), "%ld", port);
562     u->portnum = port;
563     u->port = strdup(portbuf);
564     if(!u->port)
565       return CURLUE_OUT_OF_MEMORY;
566   }
567 
568   return CURLUE_OK;
569 }
570 
571 /* scan for byte values < 31 or 127 */
junkscan(char * part)572 static CURLUcode junkscan(char *part)
573 {
574   if(part) {
575     static const char badbytes[]={
576       /* */ 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
577       0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
578       0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
579       0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
580       0x7f,
581       0x00 /* zero terminate */
582     };
583     size_t n = strlen(part);
584     size_t nfine = strcspn(part, badbytes);
585     if(nfine != n)
586       /* since we don't know which part is scanned, return a generic error
587          code */
588       return CURLUE_MALFORMED_INPUT;
589   }
590   return CURLUE_OK;
591 }
592 
hostname_check(struct Curl_URL * u,char * hostname)593 static CURLUcode hostname_check(struct Curl_URL *u, char *hostname)
594 {
595   size_t len;
596   size_t hlen = strlen(hostname);
597 
598   if(hostname[0] == '[') {
599     char dest[16]; /* fits a binary IPv6 address */
600     const char *l = "0123456789abcdefABCDEF:.";
601     hostname++;
602     hlen -= 2;
603 
604     if(hostname[hlen] != ']')
605       return CURLUE_MALFORMED_INPUT;
606 
607     /* only valid letters are ok */
608     len = strspn(hostname, l);
609     if(hlen != len) {
610       hlen = len;
611       if(hostname[len] == '%') {
612         /* this could now be '%[zone id]' */
613         char zoneid[16];
614         int i = 0;
615         char *h = &hostname[len + 1];
616         /* pass '25' if present and is a url encoded percent sign */
617         if(!strncmp(h, "25", 2) && h[2] && (h[2] != ']'))
618           h += 2;
619         while(*h && (*h != ']') && (i < 15))
620           zoneid[i++] = *h++;
621         if(!i || (']' != *h))
622           return CURLUE_MALFORMED_INPUT;
623         zoneid[i] = 0;
624         u->zoneid = strdup(zoneid);
625         if(!u->zoneid)
626           return CURLUE_OUT_OF_MEMORY;
627         hostname[len] = ']'; /* insert end bracket */
628         hostname[len + 1] = 0; /* terminate the hostname */
629       }
630       else
631         return CURLUE_MALFORMED_INPUT;
632       /* hostname is fine */
633     }
634 #ifdef ENABLE_IPV6
635     hostname[hlen] = 0; /* end the address there */
636     if(1 != Curl_inet_pton(AF_INET6, hostname, dest))
637       return CURLUE_MALFORMED_INPUT;
638     hostname[hlen] = ']'; /* restore ending bracket */
639 #endif
640   }
641   else {
642     /* letters from the second string is not ok */
643     len = strcspn(hostname, " ");
644     if(hlen != len)
645       /* hostname with bad content */
646       return CURLUE_MALFORMED_INPUT;
647   }
648   if(!hostname[0])
649     return CURLUE_NO_HOST;
650   return CURLUE_OK;
651 }
652 
/* non-zero if 'x' is a character that ends the host name part of a URL */
#define HOSTNAME_END(x) (('/' == (x)) || ('?' == (x)) || ('#' == (x)))
654 
/*
 * seturl()
 *
 * Splits 'url' into its components and stores allocated copies of them in
 * the handle 'u'.
 *
 * 'flags' is a bitmask of CURLU_* bits. The ones checked here are
 * CURLU_NON_SUPPORT_SCHEME (accept schemes that are not built-in),
 * CURLU_DEFAULT_SCHEME and CURLU_GUESS_SCHEME (accept URLs without scheme)
 * and CURLU_PATH_AS_IS (do not remove dot segments from the path). The flags
 * are also passed on to parse_hostname_login().
 *
 * Returns CURLUE_OK on success. On failure, this function returns early and
 * leaves whatever it has stored so far, including the scratch area, in 'u';
 * parseurl() frees it all in that case.
 */
static CURLUcode seturl(const char *url, CURLU *u, unsigned int flags)
{
  char *path;
  bool path_alloced = FALSE; /* TRUE when 'path' is an allocation of its own
                                and not a pointer into the scratch area */
  char *hostname;
  char *query = NULL;
  char *fragment = NULL;
  CURLUcode result;
  bool url_has_scheme = FALSE;
  char schemebuf[MAX_SCHEME_LEN + 1];
  char *schemep = NULL;
  size_t schemelen = 0;
  size_t urllen;
  const struct Curl_handler *h = NULL;

  if(!url)
    return CURLUE_MALFORMED_INPUT;

  /*************************************************************
   * Parse the URL.
   ************************************************************/
  /* allocate scratch area */
  urllen = strlen(url);
  if(urllen > CURL_MAX_INPUT_LENGTH)
    /* excessive input length */
    return CURLUE_MALFORMED_INPUT;

  /* The scratch area holds two strings of up to 'urllen' bytes each plus
     their zero terminators: first the path, then the host name */
  path = u->scratch = malloc(urllen * 2 + 2);
  if(!path)
    return CURLUE_OUT_OF_MEMORY;

  hostname = &path[urllen + 1];
  hostname[0] = 0;

  if(Curl_is_absolute_url(url, schemebuf, sizeof(schemebuf))) {
    url_has_scheme = TRUE;
    schemelen = strlen(schemebuf);
  }

  /* handle the file: scheme */
  if(url_has_scheme && strcasecompare(schemebuf, "file")) {
    /* path has been allocated large enough to hold this. The 5 skips the
       leading "file:" */
    strcpy(path, &url[5]);

    hostname = NULL; /* no host for file: URLs */
    u->scheme = strdup("file");
    if(!u->scheme)
      return CURLUE_OUT_OF_MEMORY;

    /* Extra handling URLs with an authority component (i.e. that start with
     * "file://")
     *
     * We allow omitted hostname (e.g. file:/<path>) -- valid according to
     * RFC 8089, but not the (current) WHAT-WG URL spec.
     */
    if(path[0] == '/' && path[1] == '/') {
      /* swallow the two slashes */
      char *ptr = &path[2];

      /*
       * According to RFC 8089, a file: URL can be reliably dereferenced if:
       *
       *  o it has no/blank hostname, or
       *
       *  o the hostname matches "localhost" (case-insensitively), or
       *
       *  o the hostname is a FQDN that resolves to this machine.
       *
       * For brevity, we only consider URLs with empty, "localhost", or
       * "127.0.0.1" hostnames as local.
       *
       * Additionally, there is an exception for URLs with a Windows drive
       * letter in the authority (which was accidentally omitted from RFC 8089
       * Appendix E, but believe me, it was meant to be there. --MK)
       */
      if(ptr[0] != '/' && !STARTS_WITH_URL_DRIVE_PREFIX(ptr)) {
        /* the URL includes a host name, it must match "localhost" or
           "127.0.0.1" to be valid */
        if(!checkprefix("localhost/", ptr) &&
           !checkprefix("127.0.0.1/", ptr)) {
          /* Invalid file://hostname/, expected localhost or 127.0.0.1 or
             none */
          return CURLUE_MALFORMED_INPUT;
        }
        /* both accepted host names are nine characters long */
        ptr += 9; /* now points to the slash after the host */
      }

      path = ptr;
    }

#if !defined(MSDOS) && !defined(WIN32) && !defined(__CYGWIN__)
    /* Don't allow Windows drive letters when not in Windows.
     * This catches both "file:/c:" and "file:c:" */
    if(('/' == path[0] && STARTS_WITH_URL_DRIVE_PREFIX(&path[1])) ||
       STARTS_WITH_URL_DRIVE_PREFIX(path)) {
      /* File drive letters are only accepted in MSDOS/Windows */
      return CURLUE_MALFORMED_INPUT;
    }
#else
    /* If the path starts with a slash and a drive letter, ditch the slash */
    if('/' == path[0] && STARTS_WITH_URL_DRIVE_PREFIX(&path[1])) {
      /* This cannot be done with strcpy, as the memory chunks overlap! */
      memmove(path, &path[1], strlen(&path[1]) + 1);
    }
#endif

  }
  else {
    /* clear path */
    const char *p;
    const char *hostp;
    size_t len;
    path[0] = 0;

    if(url_has_scheme) {
      int i = 0; /* counts the slashes following the "scheme:" */
      p = &url[schemelen + 1];
      while(p && (*p == '/') && (i < 4)) {
        p++;
        i++;
      }
      if((i < 1) || (i>3))
        /* less than one or more than three slashes */
        return CURLUE_MALFORMED_INPUT;

      schemep = schemebuf;
      if(!Curl_builtin_scheme(schemep) &&
         !(flags & CURLU_NON_SUPPORT_SCHEME))
        return CURLUE_UNSUPPORTED_SCHEME;

      if(junkscan(schemep))
        return CURLUE_MALFORMED_INPUT;
    }
    else {
      /* no scheme! */

      if(!(flags & (CURLU_DEFAULT_SCHEME|CURLU_GUESS_SCHEME)))
        return CURLUE_MALFORMED_INPUT;
      if(flags & CURLU_DEFAULT_SCHEME)
        schemep = (char *) DEFAULT_SCHEME;

      /*
       * The URL was badly formatted, let's try without scheme specified.
       */
      p = url;
    }
    hostp = p; /* host name starts here */

    while(*p && !HOSTNAME_END(*p)) /* find end of host name */
      p++;

    len = p - hostp;
    if(!len)
      return CURLUE_MALFORMED_INPUT;

    /* store the host name, still including any login details and port, in
       its part of the scratch area */
    memcpy(hostname, hostp, len);
    hostname[len] = 0;

    if((flags & CURLU_GUESS_SCHEME) && !schemep) {
      /* legacy curl-style guess based on host name */
      if(checkprefix("ftp.", hostname))
        schemep = (char *)"ftp";
      else if(checkprefix("dict.", hostname))
        schemep = (char *)"dict";
      else if(checkprefix("ldap.", hostname))
        schemep = (char *)"ldap";
      else if(checkprefix("imap.", hostname))
        schemep = (char *)"imap";
      else if(checkprefix("smtp.", hostname))
        schemep = (char *)"smtp";
      else if(checkprefix("pop3.", hostname))
        schemep = (char *)"pop3";
      else
        schemep = (char *)"http";
    }

    /* everything after the host name goes into the path part of the scratch
       area; query and fragment are split off from it further down */
    len = strlen(p);
    memcpy(path, p, len);
    path[len] = 0;

    u->scheme = strdup(schemep);
    if(!u->scheme)
      return CURLUE_OUT_OF_MEMORY;
  }

  /* if this is a known scheme, get some details */
  h = Curl_builtin_scheme(u->scheme);

  if(junkscan(path))
    return CURLUE_MALFORMED_INPUT;

  /* cut off the query from the path at the first '?' */
  query = strchr(path, '?');
  if(query)
    *query++ = 0;

  /* the fragment starts at the first '#' after that */
  fragment = strchr(query?query:path, '#');
  if(fragment)
    *fragment++ = 0;

  if(!path[0])
    /* if there's no path set, unset */
    path = NULL;
  else if(!(flags & CURLU_PATH_AS_IS)) {
    /* sanitise paths and remove ../ and ./ sequences according to RFC3986 */
    char *newp = Curl_dedotdotify(path);
    if(!newp)
      return CURLUE_OUT_OF_MEMORY;

    if(strcmp(newp, path)) {
      /* if we got a new version */
      path = newp;
      path_alloced = TRUE;
    }
    else
      free(newp);
  }
  if(path) {
    /* an already allocated path is handed over as-is, otherwise the handle
       gets its own copy of the scratch area contents */
    u->path = path_alloced?path:strdup(path);
    if(!u->path)
      return CURLUE_OUT_OF_MEMORY;
  }

  if(hostname) {
    /*
     * Parse the login details and strip them out of the host name.
     */
    if(junkscan(hostname))
      return CURLUE_MALFORMED_INPUT;

    result = parse_hostname_login(u, h, &hostname, flags);
    if(result)
      return result;

    result = Curl_parse_port(u, hostname);
    if(result)
      return result;

    result = hostname_check(u, hostname);
    if(result)
      return result;

    u->host = strdup(hostname);
    if(!u->host)
      return CURLUE_OUT_OF_MEMORY;
  }

  if(query) {
    u->query = strdup(query);
    if(!u->query)
      return CURLUE_OUT_OF_MEMORY;
  }
  /* an empty fragment is not stored */
  if(fragment && fragment[0]) {
    u->fragment = strdup(fragment);
    if(!u->fragment)
      return CURLUE_OUT_OF_MEMORY;
  }

  /* all parts are now separate allocations, the scratch area is done */
  free(u->scratch);
  u->scratch = NULL;

  return CURLUE_OK;
}
917 
918 /*
919  * Parse the URL and set the relevant members of the Curl_URL struct.
920  */
parseurl(const char * url,CURLU * u,unsigned int flags)921 static CURLUcode parseurl(const char *url, CURLU *u, unsigned int flags)
922 {
923   CURLUcode result = seturl(url, u, flags);
924   if(result) {
925     free_urlhandle(u);
926     memset(u, 0, sizeof(struct Curl_URL));
927   }
928   return result;
929 }
930 
931 /*
932  */
curl_url(void)933 CURLU *curl_url(void)
934 {
935   return calloc(sizeof(struct Curl_URL), 1);
936 }
937 
/*
 * curl_url_cleanup() frees the handle and all strings it holds. Passing in
 * NULL is allowed and does nothing.
 */
void curl_url_cleanup(CURLU *u)
{
  if(!u)
    return;

  free_urlhandle(u); /* the contents first */
  free(u);           /* then the handle itself */
}
945 
946 #define DUP(dest, src, name)         \
947   if(src->name) {                    \
948     dest->name = strdup(src->name);  \
949     if(!dest->name)                  \
950       goto fail;                     \
951   }
952 
curl_url_dup(CURLU * in)953 CURLU *curl_url_dup(CURLU *in)
954 {
955   struct Curl_URL *u = calloc(sizeof(struct Curl_URL), 1);
956   if(u) {
957     DUP(u, in, scheme);
958     DUP(u, in, user);
959     DUP(u, in, password);
960     DUP(u, in, options);
961     DUP(u, in, host);
962     DUP(u, in, port);
963     DUP(u, in, path);
964     DUP(u, in, query);
965     DUP(u, in, fragment);
966     u->portnum = in->portnum;
967   }
968   return u;
969   fail:
970   curl_url_cleanup(u);
971   return NULL;
972 }
973 
curl_url_get(CURLU * u,CURLUPart what,char ** part,unsigned int flags)974 CURLUcode curl_url_get(CURLU *u, CURLUPart what,
975                        char **part, unsigned int flags)
976 {
977   char *ptr;
978   CURLUcode ifmissing = CURLUE_UNKNOWN_PART;
979   char portbuf[7];
980   bool urldecode = (flags & CURLU_URLDECODE)?1:0;
981   bool plusdecode = FALSE;
982   (void)flags;
983   if(!u)
984     return CURLUE_BAD_HANDLE;
985   if(!part)
986     return CURLUE_BAD_PARTPOINTER;
987   *part = NULL;
988 
989   switch(what) {
990   case CURLUPART_SCHEME:
991     ptr = u->scheme;
992     ifmissing = CURLUE_NO_SCHEME;
993     urldecode = FALSE; /* never for schemes */
994     break;
995   case CURLUPART_USER:
996     ptr = u->user;
997     ifmissing = CURLUE_NO_USER;
998     break;
999   case CURLUPART_PASSWORD:
1000     ptr = u->password;
1001     ifmissing = CURLUE_NO_PASSWORD;
1002     break;
1003   case CURLUPART_OPTIONS:
1004     ptr = u->options;
1005     ifmissing = CURLUE_NO_OPTIONS;
1006     break;
1007   case CURLUPART_HOST:
1008     ptr = u->host;
1009     ifmissing = CURLUE_NO_HOST;
1010     break;
1011   case CURLUPART_ZONEID:
1012     ptr = u->zoneid;
1013     break;
1014   case CURLUPART_PORT:
1015     ptr = u->port;
1016     ifmissing = CURLUE_NO_PORT;
1017     urldecode = FALSE; /* never for port */
1018     if(!ptr && (flags & CURLU_DEFAULT_PORT) && u->scheme) {
1019       /* there's no stored port number, but asked to deliver
1020          a default one for the scheme */
1021       const struct Curl_handler *h =
1022         Curl_builtin_scheme(u->scheme);
1023       if(h) {
1024         msnprintf(portbuf, sizeof(portbuf), "%ld", h->defport);
1025         ptr = portbuf;
1026       }
1027     }
1028     else if(ptr && u->scheme) {
1029       /* there is a stored port number, but ask to inhibit if
1030          it matches the default one for the scheme */
1031       const struct Curl_handler *h =
1032         Curl_builtin_scheme(u->scheme);
1033       if(h && (h->defport == u->portnum) &&
1034          (flags & CURLU_NO_DEFAULT_PORT))
1035         ptr = NULL;
1036     }
1037     break;
1038   case CURLUPART_PATH:
1039     ptr = u->path;
1040     if(!ptr) {
1041       ptr = u->path = strdup("/");
1042       if(!u->path)
1043         return CURLUE_OUT_OF_MEMORY;
1044     }
1045     break;
1046   case CURLUPART_QUERY:
1047     ptr = u->query;
1048     ifmissing = CURLUE_NO_QUERY;
1049     plusdecode = urldecode;
1050     break;
1051   case CURLUPART_FRAGMENT:
1052     ptr = u->fragment;
1053     ifmissing = CURLUE_NO_FRAGMENT;
1054     break;
1055   case CURLUPART_URL: {
1056     char *url;
1057     char *scheme;
1058     char *options = u->options;
1059     char *port = u->port;
1060     char *allochost = NULL;
1061     if(u->scheme && strcasecompare("file", u->scheme)) {
1062       url = aprintf("file://%s%s%s",
1063                     u->path,
1064                     u->fragment? "#": "",
1065                     u->fragment? u->fragment : "");
1066     }
1067     else if(!u->host)
1068       return CURLUE_NO_HOST;
1069     else {
1070       const struct Curl_handler *h = NULL;
1071       if(u->scheme)
1072         scheme = u->scheme;
1073       else if(flags & CURLU_DEFAULT_SCHEME)
1074         scheme = (char *) DEFAULT_SCHEME;
1075       else
1076         return CURLUE_NO_SCHEME;
1077 
1078       if(scheme) {
1079         h = Curl_builtin_scheme(scheme);
1080         if(!port && (flags & CURLU_DEFAULT_PORT)) {
1081           /* there's no stored port number, but asked to deliver
1082              a default one for the scheme */
1083           if(h) {
1084             msnprintf(portbuf, sizeof(portbuf), "%ld", h->defport);
1085             port = portbuf;
1086           }
1087         }
1088         else if(port) {
1089           /* there is a stored port number, but asked to inhibit if it matches
1090              the default one for the scheme */
1091           if(h && (h->defport == u->portnum) &&
1092              (flags & CURLU_NO_DEFAULT_PORT))
1093             port = NULL;
1094         }
1095       }
1096       if(h && !(h->flags & PROTOPT_URLOPTIONS))
1097         options = NULL;
1098 
1099       if((u->host[0] == '[') && u->zoneid) {
1100         /* make it '[ host %25 zoneid ]' */
1101         size_t hostlen = strlen(u->host);
1102         size_t alen = hostlen + 3 + strlen(u->zoneid) + 1;
1103         allochost = malloc(alen);
1104         if(!allochost)
1105           return CURLUE_OUT_OF_MEMORY;
1106         memcpy(allochost, u->host, hostlen - 1);
1107         msnprintf(&allochost[hostlen - 1], alen - hostlen + 1,
1108                   "%%25%s]", u->zoneid);
1109       }
1110 
1111       url = aprintf("%s://%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
1112                     scheme,
1113                     u->user ? u->user : "",
1114                     u->password ? ":": "",
1115                     u->password ? u->password : "",
1116                     options ? ";" : "",
1117                     options ? options : "",
1118                     (u->user || u->password || options) ? "@": "",
1119                     allochost ? allochost : u->host,
1120                     port ? ":": "",
1121                     port ? port : "",
1122                     (u->path && (u->path[0] != '/')) ? "/": "",
1123                     u->path ? u->path : "/",
1124                     (u->query && u->query[0]) ? "?": "",
1125                     (u->query && u->query[0]) ? u->query : "",
1126                     u->fragment? "#": "",
1127                     u->fragment? u->fragment : "");
1128       free(allochost);
1129     }
1130     if(!url)
1131       return CURLUE_OUT_OF_MEMORY;
1132     *part = url;
1133     return CURLUE_OK;
1134   }
1135   default:
1136     ptr = NULL;
1137     break;
1138   }
1139   if(ptr) {
1140     *part = strdup(ptr);
1141     if(!*part)
1142       return CURLUE_OUT_OF_MEMORY;
1143     if(plusdecode) {
1144       /* convert + to space */
1145       char *plus;
1146       for(plus = *part; *plus; ++plus) {
1147         if(*plus == '+')
1148           *plus = ' ';
1149       }
1150     }
1151     if(urldecode) {
1152       char *decoded;
1153       size_t dlen;
1154       CURLcode res = Curl_urldecode(NULL, *part, 0, &decoded, &dlen, TRUE);
1155       free(*part);
1156       if(res) {
1157         *part = NULL;
1158         return CURLUE_URLDECODE;
1159       }
1160       *part = decoded;
1161     }
1162     return CURLUE_OK;
1163   }
1164   else
1165     return ifmissing;
1166 }
1167 
curl_url_set(CURLU * u,CURLUPart what,const char * part,unsigned int flags)1168 CURLUcode curl_url_set(CURLU *u, CURLUPart what,
1169                        const char *part, unsigned int flags)
1170 {
1171   char **storep = NULL;
1172   long port = 0;
1173   bool urlencode = (flags & CURLU_URLENCODE)? 1 : 0;
1174   bool plusencode = FALSE;
1175   bool urlskipslash = FALSE;
1176   bool appendquery = FALSE;
1177   bool equalsencode = FALSE;
1178 
1179   if(!u)
1180     return CURLUE_BAD_HANDLE;
1181   if(!part) {
1182     /* setting a part to NULL clears it */
1183     switch(what) {
1184     case CURLUPART_URL:
1185       break;
1186     case CURLUPART_SCHEME:
1187       storep = &u->scheme;
1188       break;
1189     case CURLUPART_USER:
1190       storep = &u->user;
1191       break;
1192     case CURLUPART_PASSWORD:
1193       storep = &u->password;
1194       break;
1195     case CURLUPART_OPTIONS:
1196       storep = &u->options;
1197       break;
1198     case CURLUPART_HOST:
1199       storep = &u->host;
1200       break;
1201     case CURLUPART_ZONEID:
1202       storep = &u->zoneid;
1203       break;
1204     case CURLUPART_PORT:
1205       u->portnum = 0;
1206       storep = &u->port;
1207       break;
1208     case CURLUPART_PATH:
1209       storep = &u->path;
1210       break;
1211     case CURLUPART_QUERY:
1212       storep = &u->query;
1213       break;
1214     case CURLUPART_FRAGMENT:
1215       storep = &u->fragment;
1216       break;
1217     default:
1218       return CURLUE_UNKNOWN_PART;
1219     }
1220     if(storep && *storep) {
1221       free(*storep);
1222       *storep = NULL;
1223     }
1224     return CURLUE_OK;
1225   }
1226 
1227   switch(what) {
1228   case CURLUPART_SCHEME:
1229     if(strlen(part) > MAX_SCHEME_LEN)
1230       /* too long */
1231       return CURLUE_MALFORMED_INPUT;
1232     if(!(flags & CURLU_NON_SUPPORT_SCHEME) &&
1233        /* verify that it is a fine scheme */
1234        !Curl_builtin_scheme(part))
1235       return CURLUE_UNSUPPORTED_SCHEME;
1236     storep = &u->scheme;
1237     urlencode = FALSE; /* never */
1238     break;
1239   case CURLUPART_USER:
1240     storep = &u->user;
1241     break;
1242   case CURLUPART_PASSWORD:
1243     storep = &u->password;
1244     break;
1245   case CURLUPART_OPTIONS:
1246     storep = &u->options;
1247     break;
1248   case CURLUPART_HOST:
1249     storep = &u->host;
1250     free(u->zoneid);
1251     u->zoneid = NULL;
1252     break;
1253   case CURLUPART_ZONEID:
1254     storep = &u->zoneid;
1255     break;
1256   case CURLUPART_PORT:
1257   {
1258     char *endp;
1259     urlencode = FALSE; /* never */
1260     port = strtol(part, &endp, 10);  /* Port number must be decimal */
1261     if((port <= 0) || (port > 0xffff))
1262       return CURLUE_BAD_PORT_NUMBER;
1263     if(*endp)
1264       /* weirdly provided number, not good! */
1265       return CURLUE_MALFORMED_INPUT;
1266     storep = &u->port;
1267   }
1268   break;
1269   case CURLUPART_PATH:
1270     urlskipslash = TRUE;
1271     storep = &u->path;
1272     break;
1273   case CURLUPART_QUERY:
1274     plusencode = urlencode;
1275     appendquery = (flags & CURLU_APPENDQUERY)?1:0;
1276     equalsencode = appendquery;
1277     storep = &u->query;
1278     break;
1279   case CURLUPART_FRAGMENT:
1280     storep = &u->fragment;
1281     break;
1282   case CURLUPART_URL: {
1283     /*
1284      * Allow a new URL to replace the existing (if any) contents.
1285      *
1286      * If the existing contents is enough for a URL, allow a relative URL to
1287      * replace it.
1288      */
1289     CURLUcode result;
1290     char *oldurl;
1291     char *redired_url;
1292     CURLU *handle2;
1293 
1294     if(Curl_is_absolute_url(part, NULL, MAX_SCHEME_LEN + 1)) {
1295       handle2 = curl_url();
1296       if(!handle2)
1297         return CURLUE_OUT_OF_MEMORY;
1298       result = parseurl(part, handle2, flags);
1299       if(!result)
1300         mv_urlhandle(handle2, u);
1301       else
1302         curl_url_cleanup(handle2);
1303       return result;
1304     }
1305     /* extract the full "old" URL to do the redirect on */
1306     result = curl_url_get(u, CURLUPART_URL, &oldurl, flags);
1307     if(result) {
1308       /* couldn't get the old URL, just use the new! */
1309       handle2 = curl_url();
1310       if(!handle2)
1311         return CURLUE_OUT_OF_MEMORY;
1312       result = parseurl(part, handle2, flags);
1313       if(!result)
1314         mv_urlhandle(handle2, u);
1315       else
1316         curl_url_cleanup(handle2);
1317       return result;
1318     }
1319 
1320     /* apply the relative part to create a new URL */
1321     redired_url = concat_url(oldurl, part);
1322     free(oldurl);
1323     if(!redired_url)
1324       return CURLUE_OUT_OF_MEMORY;
1325 
1326     /* now parse the new URL */
1327     handle2 = curl_url();
1328     if(!handle2) {
1329       free(redired_url);
1330       return CURLUE_OUT_OF_MEMORY;
1331     }
1332     result = parseurl(redired_url, handle2, flags);
1333     free(redired_url);
1334     if(!result)
1335       mv_urlhandle(handle2, u);
1336     else
1337       curl_url_cleanup(handle2);
1338     return result;
1339   }
1340   default:
1341     return CURLUE_UNKNOWN_PART;
1342   }
1343   if(storep) {
1344     const char *newp = part;
1345     size_t nalloc = strlen(part);
1346 
1347     if(nalloc > CURL_MAX_INPUT_LENGTH)
1348       /* excessive input length */
1349       return CURLUE_MALFORMED_INPUT;
1350 
1351     if(urlencode) {
1352       const unsigned char *i;
1353       char *o;
1354       bool free_part = FALSE;
1355       char *enc = malloc(nalloc * 3 + 1); /* for worst case! */
1356       if(!enc)
1357         return CURLUE_OUT_OF_MEMORY;
1358       if(plusencode) {
1359         /* space to plus */
1360         i = (const unsigned char *)part;
1361         for(o = enc; *i; ++o, ++i)
1362           *o = (*i == ' ') ? '+' : *i;
1363         *o = 0; /* zero terminate */
1364         part = strdup(enc);
1365         if(!part) {
1366           free(enc);
1367           return CURLUE_OUT_OF_MEMORY;
1368         }
1369         free_part = TRUE;
1370       }
1371       for(i = (const unsigned char *)part, o = enc; *i; i++) {
1372         if(Curl_isunreserved(*i) ||
1373            ((*i == '/') && urlskipslash) ||
1374            ((*i == '=') && equalsencode) ||
1375            ((*i == '+') && plusencode)) {
1376           if((*i == '=') && equalsencode)
1377             /* only skip the first equals sign */
1378             equalsencode = FALSE;
1379           *o = *i;
1380           o++;
1381         }
1382         else {
1383           msnprintf(o, 4, "%%%02x", *i);
1384           o += 3;
1385         }
1386       }
1387       *o = 0; /* zero terminate */
1388       newp = enc;
1389       if(free_part)
1390         free((char *)part);
1391     }
1392     else {
1393       char *p;
1394       newp = strdup(part);
1395       if(!newp)
1396         return CURLUE_OUT_OF_MEMORY;
1397       p = (char *)newp;
1398       while(*p) {
1399         /* make sure percent encoded are lower case */
1400         if((*p == '%') && ISXDIGIT(p[1]) && ISXDIGIT(p[2]) &&
1401            (ISUPPER(p[1]) || ISUPPER(p[2]))) {
1402           p[1] = (char)TOLOWER(p[1]);
1403           p[2] = (char)TOLOWER(p[2]);
1404           p += 3;
1405         }
1406         else
1407           p++;
1408       }
1409     }
1410 
1411     if(appendquery) {
1412       /* Append the string onto the old query. Add a '&' separator if none is
1413          present at the end of the exsting query already */
1414       size_t querylen = u->query ? strlen(u->query) : 0;
1415       bool addamperand = querylen && (u->query[querylen -1] != '&');
1416       if(querylen) {
1417         size_t newplen = strlen(newp);
1418         char *p = malloc(querylen + addamperand + newplen + 1);
1419         if(!p) {
1420           free((char *)newp);
1421           return CURLUE_OUT_OF_MEMORY;
1422         }
1423         strcpy(p, u->query); /* original query */
1424         if(addamperand)
1425           p[querylen] = '&'; /* ampersand */
1426         strcpy(&p[querylen + addamperand], newp); /* new suffix */
1427         free((char *)newp);
1428         free(*storep);
1429         *storep = p;
1430         return CURLUE_OK;
1431       }
1432     }
1433 
1434     if(what == CURLUPART_HOST) {
1435       if(hostname_check(u, (char *)newp)) {
1436         free((char *)newp);
1437         return CURLUE_MALFORMED_INPUT;
1438       }
1439     }
1440 
1441     free(*storep);
1442     *storep = (char *)newp;
1443   }
1444   /* set after the string, to make it not assigned if the allocation above
1445      fails */
1446   if(port)
1447     u->portnum = port;
1448   return CURLUE_OK;
1449 }
1450