1 /***************************************************************************
2 * _ _ ____ _
3 * Project ___| | | | _ \| |
4 * / __| | | | |_) | |
5 * | (__| |_| | _ <| |___
6 * \___|\___/|_| \_\_____|
7 *
8 * Copyright (C) 1998 - 2019, Daniel Stenberg, <daniel@haxx.se>, et al.
9 *
10 * This software is licensed as described in the file COPYING, which
11 * you should have received as part of this distribution. The terms
12 * are also available at https://curl.haxx.se/docs/copyright.html.
13 *
14 * You may opt to use, copy, modify, merge, publish, distribute and/or sell
15 * copies of the Software, and permit persons to whom the Software is
16 * furnished to do so, under the terms of the COPYING file.
17 *
18 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
19 * KIND, either express or implied.
20 *
21 ***************************************************************************/
22
23 #include "curl_setup.h"
24
25 #include "urldata.h"
26 #include "urlapi-int.h"
27 #include "strcase.h"
28 #include "dotdot.h"
29 #include "url.h"
30 #include "escape.h"
31 #include "curl_ctype.h"
32 #include "inet_pton.h"
33
34 /* The last 3 #include files should be in this order */
35 #include "curl_printf.h"
36 #include "curl_memory.h"
37 #include "memdebug.h"
38
/* MSDOS/Windows style drive prefix, eg c: in c:foo
 *
 * Evaluates to non-zero when 'str' starts with an ASCII letter directly
 * followed by a colon. The argument is expanded several times, so it must
 * not have side effects. Every use is parenthesized so that pointer
 * expressions such as 'ptr + 1' or '&buf[1]' expand as intended. */
#define STARTS_WITH_DRIVE_PREFIX(str) \
  ((('a' <= (str)[0] && (str)[0] <= 'z') || \
    ('A' <= (str)[0] && (str)[0] <= 'Z')) && \
   ((str)[1] == ':'))
44
/* Drive prefix as it may show up in a URL: a single ASCII letter, then
 * either ':' or '|', then a slash, a backslash or the end of the string.
 * The argument is expanded several times, so it must not have side
 * effects. */
#define STARTS_WITH_URL_DRIVE_PREFIX(str) \
  (((((str)[0] >= 'a') && ((str)[0] <= 'z')) || \
    (((str)[0] >= 'A') && ((str)[0] <= 'Z'))) && \
   (((str)[1] == ':') || ((str)[1] == '|')) && \
   (((str)[2] == '/') || ((str)[2] == '\\') || ((str)[2] == 0)))
52
/* Internal representation of CURLU. Point to URL-encoded strings.
 *
 * Each string member is a separate heap allocation that free_urlhandle()
 * releases. A NULL member means that part is not set. */
struct Curl_URL {
  char *scheme;
  char *user;
  char *password;
  char *options; /* IMAP only? */
  char *host;
  char *zoneid; /* for numerical IPv6 addresses */
  char *port; /* the port number as a decimal string */
  char *path;
  char *query;
  char *fragment;

  char *scratch; /* temporary scratch area */
  long portnum; /* the numerical version */
};
69
/* scheme used when none is present and CURLU_DEFAULT_SCHEME is set */
#define DEFAULT_SCHEME "https"
71
free_urlhandle(struct Curl_URL * u)72 static void free_urlhandle(struct Curl_URL *u)
73 {
74 free(u->scheme);
75 free(u->user);
76 free(u->password);
77 free(u->options);
78 free(u->host);
79 free(u->zoneid);
80 free(u->port);
81 free(u->path);
82 free(u->query);
83 free(u->fragment);
84 free(u->scratch);
85 }
86
87 /* move the full contents of one handle onto another and
88 free the original */
mv_urlhandle(struct Curl_URL * from,struct Curl_URL * to)89 static void mv_urlhandle(struct Curl_URL *from,
90 struct Curl_URL *to)
91 {
92 free_urlhandle(to);
93 *to = *from;
94 free(from);
95 }
96
/*
 * Return a pointer to the first '/' or '?' that follows the host name,
 * which handles cases like http://www.url.com?id=2380 too. The host name is
 * taken to start right after the first "//", or at the beginning of the
 * string when there is no "//". When neither character is found, the
 * returned pointer is the terminating zero of 'url'.
 */
static const char *find_host_sep(const char *url)
{
  const char *end = url + strlen(url);
  const char *authority = strstr(url, "//");
  const char *slash;
  const char *qmark;

  /* step over the two slashes if they were there */
  authority = authority ? authority + 2 : url;

  slash = strchr(authority, '/');
  if(!slash)
    slash = end;

  qmark = strchr(authority, '?');
  if(!qmark)
    qmark = end;

  /* whichever comes first ends the host name */
  return (slash < qmark) ? slash : qmark;
}
124
125 /*
126 * Decide in an encoding-independent manner whether a character in an
127 * URL must be escaped. The same criterion must be used in strlen_url()
128 * and strcpy_url().
129 */
urlchar_needs_escaping(int c)130 static bool urlchar_needs_escaping(int c)
131 {
132 return !(ISCNTRL(c) || ISSPACE(c) || ISGRAPH(c));
133 }
134
135 /*
136 * strlen_url() returns the length of the given URL if the spaces within the
137 * URL were properly URL encoded.
138 * URL encoding should be skipped for host names, otherwise IDN resolution
139 * will fail.
140 */
strlen_url(const char * url,bool relative)141 static size_t strlen_url(const char *url, bool relative)
142 {
143 const unsigned char *ptr;
144 size_t newlen = 0;
145 bool left = TRUE; /* left side of the ? */
146 const unsigned char *host_sep = (const unsigned char *) url;
147
148 if(!relative)
149 host_sep = (const unsigned char *) find_host_sep(url);
150
151 for(ptr = (unsigned char *)url; *ptr; ptr++) {
152
153 if(ptr < host_sep) {
154 ++newlen;
155 continue;
156 }
157
158 switch(*ptr) {
159 case '?':
160 left = FALSE;
161 /* FALLTHROUGH */
162 default:
163 if(urlchar_needs_escaping(*ptr))
164 newlen += 2;
165 newlen++;
166 break;
167 case ' ':
168 if(left)
169 newlen += 3;
170 else
171 newlen++;
172 break;
173 }
174 }
175 return newlen;
176 }
177
178 /* strcpy_url() copies a url to a output buffer and URL-encodes the spaces in
179 * the source URL accordingly.
180 * URL encoding should be skipped for host names, otherwise IDN resolution
181 * will fail.
182 */
strcpy_url(char * output,const char * url,bool relative)183 static void strcpy_url(char *output, const char *url, bool relative)
184 {
185 /* we must add this with whitespace-replacing */
186 bool left = TRUE;
187 const unsigned char *iptr;
188 char *optr = output;
189 const unsigned char *host_sep = (const unsigned char *) url;
190
191 if(!relative)
192 host_sep = (const unsigned char *) find_host_sep(url);
193
194 for(iptr = (unsigned char *)url; /* read from here */
195 *iptr; /* until zero byte */
196 iptr++) {
197
198 if(iptr < host_sep) {
199 *optr++ = *iptr;
200 continue;
201 }
202
203 switch(*iptr) {
204 case '?':
205 left = FALSE;
206 /* FALLTHROUGH */
207 default:
208 if(urlchar_needs_escaping(*iptr)) {
209 msnprintf(optr, 4, "%%%02x", *iptr);
210 optr += 3;
211 }
212 else
213 *optr++=*iptr;
214 break;
215 case ' ':
216 if(left) {
217 *optr++='%'; /* add a '%' */
218 *optr++='2'; /* add a '2' */
219 *optr++='0'; /* add a '0' */
220 }
221 else
222 *optr++='+'; /* add a '+' here */
223 break;
224 }
225 }
226 *optr = 0; /* zero terminate output buffer */
227
228 }
229
230 /*
231 * Returns true if the given URL is absolute (as opposed to relative) within
232 * the buffer size. Returns the scheme in the buffer if TRUE and 'buf' is
233 * non-NULL.
234 */
Curl_is_absolute_url(const char * url,char * buf,size_t buflen)235 bool Curl_is_absolute_url(const char *url, char *buf, size_t buflen)
236 {
237 size_t i;
238 #ifdef WIN32
239 if(STARTS_WITH_DRIVE_PREFIX(url))
240 return FALSE;
241 #endif
242 for(i = 0; i < buflen && url[i]; ++i) {
243 char s = url[i];
244 if((s == ':') && (url[i + 1] == '/')) {
245 if(buf)
246 buf[i] = 0;
247 return TRUE;
248 }
249 /* RFC 3986 3.1 explains:
250 scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
251 */
252 else if(ISALNUM(s) || (s == '+') || (s == '-') || (s == '.') ) {
253 if(buf)
254 buf[i] = (char)TOLOWER(s);
255 }
256 else
257 break;
258 }
259 return FALSE;
260 }
261
/*
 * Concatenate a relative URL to a base URL making it absolute.
 * URL-encodes any spaces.
 * The returned pointer must be freed by the caller unless NULL
 * (returns NULL on out of memory).
 *
 * base:   the URL that 'relurl' is relative to. It is not modified, the
 *         function works on a private copy.
 * relurl: the relative part. Three forms are told apart: one that starts
 *         with "//" (replaces everything after the scheme), one that starts
 *         with a single '/' (replaces the path) and anything else (resolved
 *         against the directory of the base, or appended as a query when it
 *         starts with '?').
 */
static char *concat_url(const char *base, const char *relurl)
{
  /***
   TRY to append this new path to the old URL
   to the right of the host part. This is string surgery rather than real
   URL parsing, so it is likely to cause problems in the future...
  */
  char *newest;
  char *protsep;
  char *pathsep;
  size_t newlen;
  bool host_changed = FALSE; /* TRUE when relurl brings its own host */

  const char *useurl = relurl;
  size_t urllen;

  /* we must make our own copy of the URL to play with, as it may
     point to read-only data */
  char *url_clone = strdup(base);

  if(!url_clone)
    return NULL; /* skip out of this NOW */

  /* protsep points to the start of the host name */
  protsep = strstr(url_clone, "//");
  if(!protsep)
    protsep = url_clone;
  else
    protsep += 2; /* pass the slashes */

  if('/' != relurl[0]) {
    int level = 0; /* number of leading "../" in the relative URL */

    /* First we need to find out if there's a ?-letter in the URL,
       and cut it and the right-side of that off */
    pathsep = strchr(protsep, '?');
    if(pathsep)
      *pathsep = 0;

    /* we have a relative path to append to the last slash if there's one
       available, or if the new URL is just a query string (starts with a
       '?')  we append the new one at the end of the entire currently worked
       out URL */
    if(useurl[0] != '?') {
      pathsep = strrchr(protsep, '/');
      if(pathsep)
        *pathsep = 0;
    }

    /* Check if there's any slash after the host name, and if so, remember
       that position instead. protsep becomes NULL when the remaining base
       has no path at all. */
    pathsep = strchr(protsep, '/');
    if(pathsep)
      protsep = pathsep + 1;
    else
      protsep = NULL;

    /* now deal with one "./" or any amount of "../" in the newurl
       and act accordingly */

    if((useurl[0] == '.') && (useurl[1] == '/'))
      useurl += 2; /* just skip the "./" */

    while((useurl[0] == '.') &&
          (useurl[1] == '.') &&
          (useurl[2] == '/')) {
      level++;
      useurl += 3; /* pass the "../" */
    }

    if(protsep) {
      while(level--) {
        /* cut off one more level from the right of the original URL */
        pathsep = strrchr(protsep, '/');
        if(pathsep)
          *pathsep = 0;
        else {
          /* no more directories to remove, the path is now empty */
          *protsep = 0;
          break;
        }
      }
    }
  }
  else {
    /* We got a new absolute path for this server */

    if((relurl[0] == '/') && (relurl[1] == '/')) {
      /* the new URL starts with //, just keep the protocol part from the
         original one */
      *protsep = 0;
      useurl = &relurl[2]; /* we keep the slashes from the original, so we
                              skip the new ones */
      host_changed = TRUE;
    }
    else {
      /* cut off the original URL from the first slash, or deal with URLs
         without slash */
      pathsep = strchr(protsep, '/');
      if(pathsep) {
        /* When people use badly formatted URLs, such as
           "http://www.url.com?dir=/home/daniel" we must not use the first
           slash, if there's a ?-letter before it! */
        char *sep = strchr(protsep, '?');
        if(sep && (sep < pathsep))
          pathsep = sep;
        *pathsep = 0;
      }
      else {
        /* There was no slash. Now, since we might be operating on a badly
           formatted URL, such as "http://www.url.com?id=2380" which doesn't
           use a slash separator as it is supposed to, we need to check for a
           ?-letter as well! */
        pathsep = strchr(protsep, '?');
        if(pathsep)
          *pathsep = 0;
      }
    }
  }

  /* If the new part contains a space, this is a mighty stupid redirect
     but we still make an effort to do "right". To the left of a '?'
     letter we replace each space with %20 while it is replaced with '+'
     on the right side of the '?' letter.

     When the host was replaced, the new part starts with a host name and
     is therefore handled as not relative by the encoder.
  */
  newlen = strlen_url(useurl, !host_changed);

  urllen = strlen(url_clone);

  newest = malloc(urllen + 1 + /* possible slash */
                  newlen + 1 /* zero byte */);

  if(!newest) {
    free(url_clone); /* don't leak this */
    return NULL;
  }

  /* copy over the root url part */
  memcpy(newest, url_clone, urllen);

  /* check if we need to append a slash: not when the new part brings its
     own, not when the kept base was cut down to nothing at protsep, and not
     when the new part is a query */
  if(('/' == useurl[0]) || (protsep && !*protsep) || ('?' == useurl[0]))
    ;
  else
    newest[urllen++]='/';

  /* then append the new piece on the right side */
  strcpy_url(&newest[urllen], useurl, !host_changed);

  free(url_clone);

  return newest;
}
420
421 /*
422 * parse_hostname_login()
423 *
424 * Parse the login details (user name, password and options) from the URL and
425 * strip them out of the host name
426 *
427 */
parse_hostname_login(struct Curl_URL * u,const struct Curl_handler * h,char ** hostname,unsigned int flags)428 static CURLUcode parse_hostname_login(struct Curl_URL *u,
429 const struct Curl_handler *h,
430 char **hostname,
431 unsigned int flags)
432 {
433 CURLUcode result = CURLUE_OK;
434 CURLcode ccode;
435 char *userp = NULL;
436 char *passwdp = NULL;
437 char *optionsp = NULL;
438
439 /* At this point, we're hoping all the other special cases have
440 * been taken care of, so conn->host.name is at most
441 * [user[:password][;options]]@]hostname
442 *
443 * We need somewhere to put the embedded details, so do that first.
444 */
445
446 char *ptr = strchr(*hostname, '@');
447 char *login = *hostname;
448
449 if(!ptr)
450 goto out;
451
452 /* We will now try to extract the
453 * possible login information in a string like:
454 * ftp://user:password@ftp.my.site:8021/README */
455 *hostname = ++ptr;
456
457 /* We could use the login information in the URL so extract it. Only parse
458 options if the handler says we should. Note that 'h' might be NULL! */
459 ccode = Curl_parse_login_details(login, ptr - login - 1,
460 &userp, &passwdp,
461 (h && (h->flags & PROTOPT_URLOPTIONS)) ?
462 &optionsp:NULL);
463 if(ccode) {
464 result = CURLUE_MALFORMED_INPUT;
465 goto out;
466 }
467
468 if(userp) {
469 if(flags & CURLU_DISALLOW_USER) {
470 /* Option DISALLOW_USER is set and url contains username. */
471 result = CURLUE_USER_NOT_ALLOWED;
472 goto out;
473 }
474
475 u->user = userp;
476 }
477
478 if(passwdp)
479 u->password = passwdp;
480
481 if(optionsp)
482 u->options = optionsp;
483
484 return CURLUE_OK;
485 out:
486
487 free(userp);
488 free(passwdp);
489 free(optionsp);
490
491 return result;
492 }
493
Curl_parse_port(struct Curl_URL * u,char * hostname)494 UNITTEST CURLUcode Curl_parse_port(struct Curl_URL *u, char *hostname)
495 {
496 char *portptr = NULL;
497 char endbracket;
498 int len;
499
500 /*
501 * Find the end of an IPv6 address, either on the ']' ending bracket or
502 * a percent-encoded zone index.
503 */
504 if(1 == sscanf(hostname, "[%*45[0123456789abcdefABCDEF:.]%c%n",
505 &endbracket, &len)) {
506 if(']' == endbracket)
507 portptr = &hostname[len];
508 else if('%' == endbracket) {
509 int zonelen = len;
510 if(1 == sscanf(hostname + zonelen, "%*[^]]%c%n", &endbracket, &len)) {
511 if(']' != endbracket)
512 return CURLUE_MALFORMED_INPUT;
513 portptr = &hostname[--zonelen + len + 1];
514 }
515 else
516 return CURLUE_MALFORMED_INPUT;
517 }
518 else
519 return CURLUE_MALFORMED_INPUT;
520
521 /* this is a RFC2732-style specified IP-address */
522 if(portptr && *portptr) {
523 if(*portptr != ':')
524 return CURLUE_MALFORMED_INPUT;
525 }
526 else
527 portptr = NULL;
528 }
529 else
530 portptr = strchr(hostname, ':');
531
532 if(portptr) {
533 char *rest;
534 long port;
535 char portbuf[7];
536
537 /* Browser behavior adaptation. If there's a colon with no digits after,
538 just cut off the name there which makes us ignore the colon and just
539 use the default port. Firefox, Chrome and Safari all do that. */
540 if(!portptr[1]) {
541 *portptr = '\0';
542 return CURLUE_OK;
543 }
544
545 if(!ISDIGIT(portptr[1]))
546 return CURLUE_BAD_PORT_NUMBER;
547
548 port = strtol(portptr + 1, &rest, 10); /* Port number must be decimal */
549
550 if((port <= 0) || (port > 0xffff))
551 /* Single unix standard says port numbers are 16 bits long, but we don't
552 treat port zero as OK. */
553 return CURLUE_BAD_PORT_NUMBER;
554
555 if(rest[0])
556 return CURLUE_BAD_PORT_NUMBER;
557
558 *portptr++ = '\0'; /* cut off the name there */
559 *rest = 0;
560 /* generate a new port number string to get rid of leading zeroes etc */
561 msnprintf(portbuf, sizeof(portbuf), "%ld", port);
562 u->portnum = port;
563 u->port = strdup(portbuf);
564 if(!u->port)
565 return CURLUE_OUT_OF_MEMORY;
566 }
567
568 return CURLUE_OK;
569 }
570
571 /* scan for byte values < 31 or 127 */
junkscan(char * part)572 static CURLUcode junkscan(char *part)
573 {
574 if(part) {
575 static const char badbytes[]={
576 /* */ 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
577 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
578 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
579 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
580 0x7f,
581 0x00 /* zero terminate */
582 };
583 size_t n = strlen(part);
584 size_t nfine = strcspn(part, badbytes);
585 if(nfine != n)
586 /* since we don't know which part is scanned, return a generic error
587 code */
588 return CURLUE_MALFORMED_INPUT;
589 }
590 return CURLUE_OK;
591 }
592
hostname_check(struct Curl_URL * u,char * hostname)593 static CURLUcode hostname_check(struct Curl_URL *u, char *hostname)
594 {
595 size_t len;
596 size_t hlen = strlen(hostname);
597
598 if(hostname[0] == '[') {
599 char dest[16]; /* fits a binary IPv6 address */
600 const char *l = "0123456789abcdefABCDEF:.";
601 hostname++;
602 hlen -= 2;
603
604 if(hostname[hlen] != ']')
605 return CURLUE_MALFORMED_INPUT;
606
607 /* only valid letters are ok */
608 len = strspn(hostname, l);
609 if(hlen != len) {
610 hlen = len;
611 if(hostname[len] == '%') {
612 /* this could now be '%[zone id]' */
613 char zoneid[16];
614 int i = 0;
615 char *h = &hostname[len + 1];
616 /* pass '25' if present and is a url encoded percent sign */
617 if(!strncmp(h, "25", 2) && h[2] && (h[2] != ']'))
618 h += 2;
619 while(*h && (*h != ']') && (i < 15))
620 zoneid[i++] = *h++;
621 if(!i || (']' != *h))
622 return CURLUE_MALFORMED_INPUT;
623 zoneid[i] = 0;
624 u->zoneid = strdup(zoneid);
625 if(!u->zoneid)
626 return CURLUE_OUT_OF_MEMORY;
627 hostname[len] = ']'; /* insert end bracket */
628 hostname[len + 1] = 0; /* terminate the hostname */
629 }
630 else
631 return CURLUE_MALFORMED_INPUT;
632 /* hostname is fine */
633 }
634 #ifdef ENABLE_IPV6
635 hostname[hlen] = 0; /* end the address there */
636 if(1 != Curl_inet_pton(AF_INET6, hostname, dest))
637 return CURLUE_MALFORMED_INPUT;
638 hostname[hlen] = ']'; /* restore ending bracket */
639 #endif
640 }
641 else {
642 /* letters from the second string is not ok */
643 len = strcspn(hostname, " ");
644 if(hlen != len)
645 /* hostname with bad content */
646 return CURLUE_MALFORMED_INPUT;
647 }
648 if(!hostname[0])
649 return CURLUE_NO_HOST;
650 return CURLUE_OK;
651 }
652
/* non-zero for the characters that end the host name when parsing a URL */
#define HOSTNAME_END(x) (((x) == '/') || ((x) == '?') || ((x) == '#'))
654
seturl(const char * url,CURLU * u,unsigned int flags)655 static CURLUcode seturl(const char *url, CURLU *u, unsigned int flags)
656 {
657 char *path;
658 bool path_alloced = FALSE;
659 char *hostname;
660 char *query = NULL;
661 char *fragment = NULL;
662 CURLUcode result;
663 bool url_has_scheme = FALSE;
664 char schemebuf[MAX_SCHEME_LEN + 1];
665 char *schemep = NULL;
666 size_t schemelen = 0;
667 size_t urllen;
668 const struct Curl_handler *h = NULL;
669
670 if(!url)
671 return CURLUE_MALFORMED_INPUT;
672
673 /*************************************************************
674 * Parse the URL.
675 ************************************************************/
676 /* allocate scratch area */
677 urllen = strlen(url);
678 if(urllen > CURL_MAX_INPUT_LENGTH)
679 /* excessive input length */
680 return CURLUE_MALFORMED_INPUT;
681
682 path = u->scratch = malloc(urllen * 2 + 2);
683 if(!path)
684 return CURLUE_OUT_OF_MEMORY;
685
686 hostname = &path[urllen + 1];
687 hostname[0] = 0;
688
689 if(Curl_is_absolute_url(url, schemebuf, sizeof(schemebuf))) {
690 url_has_scheme = TRUE;
691 schemelen = strlen(schemebuf);
692 }
693
694 /* handle the file: scheme */
695 if(url_has_scheme && strcasecompare(schemebuf, "file")) {
696 /* path has been allocated large enough to hold this */
697 strcpy(path, &url[5]);
698
699 hostname = NULL; /* no host for file: URLs */
700 u->scheme = strdup("file");
701 if(!u->scheme)
702 return CURLUE_OUT_OF_MEMORY;
703
704 /* Extra handling URLs with an authority component (i.e. that start with
705 * "file://")
706 *
707 * We allow omitted hostname (e.g. file:/<path>) -- valid according to
708 * RFC 8089, but not the (current) WHAT-WG URL spec.
709 */
710 if(path[0] == '/' && path[1] == '/') {
711 /* swallow the two slashes */
712 char *ptr = &path[2];
713
714 /*
715 * According to RFC 8089, a file: URL can be reliably dereferenced if:
716 *
717 * o it has no/blank hostname, or
718 *
719 * o the hostname matches "localhost" (case-insensitively), or
720 *
721 * o the hostname is a FQDN that resolves to this machine.
722 *
723 * For brevity, we only consider URLs with empty, "localhost", or
724 * "127.0.0.1" hostnames as local.
725 *
726 * Additionally, there is an exception for URLs with a Windows drive
727 * letter in the authority (which was accidentally omitted from RFC 8089
728 * Appendix E, but believe me, it was meant to be there. --MK)
729 */
730 if(ptr[0] != '/' && !STARTS_WITH_URL_DRIVE_PREFIX(ptr)) {
731 /* the URL includes a host name, it must match "localhost" or
732 "127.0.0.1" to be valid */
733 if(!checkprefix("localhost/", ptr) &&
734 !checkprefix("127.0.0.1/", ptr)) {
735 /* Invalid file://hostname/, expected localhost or 127.0.0.1 or
736 none */
737 return CURLUE_MALFORMED_INPUT;
738 }
739 ptr += 9; /* now points to the slash after the host */
740 }
741
742 path = ptr;
743 }
744
745 #if !defined(MSDOS) && !defined(WIN32) && !defined(__CYGWIN__)
746 /* Don't allow Windows drive letters when not in Windows.
747 * This catches both "file:/c:" and "file:c:" */
748 if(('/' == path[0] && STARTS_WITH_URL_DRIVE_PREFIX(&path[1])) ||
749 STARTS_WITH_URL_DRIVE_PREFIX(path)) {
750 /* File drive letters are only accepted in MSDOS/Windows */
751 return CURLUE_MALFORMED_INPUT;
752 }
753 #else
754 /* If the path starts with a slash and a drive letter, ditch the slash */
755 if('/' == path[0] && STARTS_WITH_URL_DRIVE_PREFIX(&path[1])) {
756 /* This cannot be done with strcpy, as the memory chunks overlap! */
757 memmove(path, &path[1], strlen(&path[1]) + 1);
758 }
759 #endif
760
761 }
762 else {
763 /* clear path */
764 const char *p;
765 const char *hostp;
766 size_t len;
767 path[0] = 0;
768
769 if(url_has_scheme) {
770 int i = 0;
771 p = &url[schemelen + 1];
772 while(p && (*p == '/') && (i < 4)) {
773 p++;
774 i++;
775 }
776 if((i < 1) || (i>3))
777 /* less than one or more than three slashes */
778 return CURLUE_MALFORMED_INPUT;
779
780 schemep = schemebuf;
781 if(!Curl_builtin_scheme(schemep) &&
782 !(flags & CURLU_NON_SUPPORT_SCHEME))
783 return CURLUE_UNSUPPORTED_SCHEME;
784
785 if(junkscan(schemep))
786 return CURLUE_MALFORMED_INPUT;
787 }
788 else {
789 /* no scheme! */
790
791 if(!(flags & (CURLU_DEFAULT_SCHEME|CURLU_GUESS_SCHEME)))
792 return CURLUE_MALFORMED_INPUT;
793 if(flags & CURLU_DEFAULT_SCHEME)
794 schemep = (char *) DEFAULT_SCHEME;
795
796 /*
797 * The URL was badly formatted, let's try without scheme specified.
798 */
799 p = url;
800 }
801 hostp = p; /* host name starts here */
802
803 while(*p && !HOSTNAME_END(*p)) /* find end of host name */
804 p++;
805
806 len = p - hostp;
807 if(!len)
808 return CURLUE_MALFORMED_INPUT;
809
810 memcpy(hostname, hostp, len);
811 hostname[len] = 0;
812
813 if((flags & CURLU_GUESS_SCHEME) && !schemep) {
814 /* legacy curl-style guess based on host name */
815 if(checkprefix("ftp.", hostname))
816 schemep = (char *)"ftp";
817 else if(checkprefix("dict.", hostname))
818 schemep = (char *)"dict";
819 else if(checkprefix("ldap.", hostname))
820 schemep = (char *)"ldap";
821 else if(checkprefix("imap.", hostname))
822 schemep = (char *)"imap";
823 else if(checkprefix("smtp.", hostname))
824 schemep = (char *)"smtp";
825 else if(checkprefix("pop3.", hostname))
826 schemep = (char *)"pop3";
827 else
828 schemep = (char *)"http";
829 }
830
831 len = strlen(p);
832 memcpy(path, p, len);
833 path[len] = 0;
834
835 u->scheme = strdup(schemep);
836 if(!u->scheme)
837 return CURLUE_OUT_OF_MEMORY;
838 }
839
840 /* if this is a known scheme, get some details */
841 h = Curl_builtin_scheme(u->scheme);
842
843 if(junkscan(path))
844 return CURLUE_MALFORMED_INPUT;
845
846 query = strchr(path, '?');
847 if(query)
848 *query++ = 0;
849
850 fragment = strchr(query?query:path, '#');
851 if(fragment)
852 *fragment++ = 0;
853
854 if(!path[0])
855 /* if there's no path set, unset */
856 path = NULL;
857 else if(!(flags & CURLU_PATH_AS_IS)) {
858 /* sanitise paths and remove ../ and ./ sequences according to RFC3986 */
859 char *newp = Curl_dedotdotify(path);
860 if(!newp)
861 return CURLUE_OUT_OF_MEMORY;
862
863 if(strcmp(newp, path)) {
864 /* if we got a new version */
865 path = newp;
866 path_alloced = TRUE;
867 }
868 else
869 free(newp);
870 }
871 if(path) {
872 u->path = path_alloced?path:strdup(path);
873 if(!u->path)
874 return CURLUE_OUT_OF_MEMORY;
875 }
876
877 if(hostname) {
878 /*
879 * Parse the login details and strip them out of the host name.
880 */
881 if(junkscan(hostname))
882 return CURLUE_MALFORMED_INPUT;
883
884 result = parse_hostname_login(u, h, &hostname, flags);
885 if(result)
886 return result;
887
888 result = Curl_parse_port(u, hostname);
889 if(result)
890 return result;
891
892 result = hostname_check(u, hostname);
893 if(result)
894 return result;
895
896 u->host = strdup(hostname);
897 if(!u->host)
898 return CURLUE_OUT_OF_MEMORY;
899 }
900
901 if(query) {
902 u->query = strdup(query);
903 if(!u->query)
904 return CURLUE_OUT_OF_MEMORY;
905 }
906 if(fragment && fragment[0]) {
907 u->fragment = strdup(fragment);
908 if(!u->fragment)
909 return CURLUE_OUT_OF_MEMORY;
910 }
911
912 free(u->scratch);
913 u->scratch = NULL;
914
915 return CURLUE_OK;
916 }
917
918 /*
919 * Parse the URL and set the relevant members of the Curl_URL struct.
920 */
parseurl(const char * url,CURLU * u,unsigned int flags)921 static CURLUcode parseurl(const char *url, CURLU *u, unsigned int flags)
922 {
923 CURLUcode result = seturl(url, u, flags);
924 if(result) {
925 free_urlhandle(u);
926 memset(u, 0, sizeof(struct Curl_URL));
927 }
928 return result;
929 }
930
931 /*
932 */
curl_url(void)933 CURLU *curl_url(void)
934 {
935 return calloc(sizeof(struct Curl_URL), 1);
936 }
937
/* Free a URL handle together with all the strings it owns. Passing NULL is
   fine and does nothing. */
void curl_url_cleanup(CURLU *u)
{
  if(!u)
    return;

  free_urlhandle(u);
  free(u);
}
945
946 #define DUP(dest, src, name) \
947 if(src->name) { \
948 dest->name = strdup(src->name); \
949 if(!dest->name) \
950 goto fail; \
951 }
952
curl_url_dup(CURLU * in)953 CURLU *curl_url_dup(CURLU *in)
954 {
955 struct Curl_URL *u = calloc(sizeof(struct Curl_URL), 1);
956 if(u) {
957 DUP(u, in, scheme);
958 DUP(u, in, user);
959 DUP(u, in, password);
960 DUP(u, in, options);
961 DUP(u, in, host);
962 DUP(u, in, port);
963 DUP(u, in, path);
964 DUP(u, in, query);
965 DUP(u, in, fragment);
966 u->portnum = in->portnum;
967 }
968 return u;
969 fail:
970 curl_url_cleanup(u);
971 return NULL;
972 }
973
/*
 * curl_url_get()
 *
 * Extract one part, or the full URL, from the handle.
 *
 * u:     the handle to read from
 * what:  which part to get
 * part:  receives an allocated copy of the part that the caller then owns.
 *        It is set to NULL when an error is returned.
 * flags: CURLU_URLDECODE asks for the part to be URL decoded (never applied
 *        to scheme, port or the full URL). CURLU_DEFAULT_PORT delivers the
 *        scheme's default port when none is stored and
 *        CURLU_NO_DEFAULT_PORT hides a stored port that equals the scheme's
 *        default. CURLU_DEFAULT_SCHEME makes CURLUPART_URL use
 *        DEFAULT_SCHEME when no scheme is stored.
 *
 * Returns CURLUE_OK, CURLUE_BAD_HANDLE or CURLUE_BAD_PARTPOINTER for NULL
 * arguments, CURLUE_OUT_OF_MEMORY, CURLUE_URLDECODE if decoding fails, or a
 * code that tells which part is missing.
 */
CURLUcode curl_url_get(CURLU *u, CURLUPart what,
                       char **part, unsigned int flags)
{
  char *ptr;
  /* returned when the asked for part is not set; also what a missing zone
     id or an unknown 'what' ends up returning */
  CURLUcode ifmissing = CURLUE_UNKNOWN_PART;
  char portbuf[7];
  bool urldecode = (flags & CURLU_URLDECODE)?1:0;
  bool plusdecode = FALSE; /* turn '+' into space before decoding */
  (void)flags;
  if(!u)
    return CURLUE_BAD_HANDLE;
  if(!part)
    return CURLUE_BAD_PARTPOINTER;
  *part = NULL;

  switch(what) {
  case CURLUPART_SCHEME:
    ptr = u->scheme;
    ifmissing = CURLUE_NO_SCHEME;
    urldecode = FALSE; /* never for schemes */
    break;
  case CURLUPART_USER:
    ptr = u->user;
    ifmissing = CURLUE_NO_USER;
    break;
  case CURLUPART_PASSWORD:
    ptr = u->password;
    ifmissing = CURLUE_NO_PASSWORD;
    break;
  case CURLUPART_OPTIONS:
    ptr = u->options;
    ifmissing = CURLUE_NO_OPTIONS;
    break;
  case CURLUPART_HOST:
    ptr = u->host;
    ifmissing = CURLUE_NO_HOST;
    break;
  case CURLUPART_ZONEID:
    ptr = u->zoneid;
    break;
  case CURLUPART_PORT:
    ptr = u->port;
    ifmissing = CURLUE_NO_PORT;
    urldecode = FALSE; /* never for port */
    if(!ptr && (flags & CURLU_DEFAULT_PORT) && u->scheme) {
      /* there's no stored port number, but asked to deliver
         a default one for the scheme */
      const struct Curl_handler *h =
        Curl_builtin_scheme(u->scheme);
      if(h) {
        msnprintf(portbuf, sizeof(portbuf), "%ld", h->defport);
        ptr = portbuf;
      }
    }
    else if(ptr && u->scheme) {
      /* there is a stored port number, but ask to inhibit if
         it matches the default one for the scheme */
      const struct Curl_handler *h =
        Curl_builtin_scheme(u->scheme);
      if(h && (h->defport == u->portnum) &&
         (flags & CURLU_NO_DEFAULT_PORT))
        ptr = NULL;
    }
    break;
  case CURLUPART_PATH:
    ptr = u->path;
    if(!ptr) {
      /* no path set: store "/" in the handle and return that */
      ptr = u->path = strdup("/");
      if(!u->path)
        return CURLUE_OUT_OF_MEMORY;
    }
    break;
  case CURLUPART_QUERY:
    ptr = u->query;
    ifmissing = CURLUE_NO_QUERY;
    plusdecode = urldecode;
    break;
  case CURLUPART_FRAGMENT:
    ptr = u->fragment;
    ifmissing = CURLUE_NO_FRAGMENT;
    break;
  case CURLUPART_URL: {
    /* build the full URL from the parts; this case returns on its own and
       never reaches the decoding below */
    char *url;
    char *scheme;
    char *options = u->options;
    char *port = u->port;
    char *allochost = NULL;
    if(u->scheme && strcasecompare("file", u->scheme)) {
      /* file: URLs are made from the path and the fragment only */
      url = aprintf("file://%s%s%s",
                    u->path,
                    u->fragment? "#": "",
                    u->fragment? u->fragment : "");
    }
    else if(!u->host)
      return CURLUE_NO_HOST;
    else {
      const struct Curl_handler *h = NULL;
      if(u->scheme)
        scheme = u->scheme;
      else if(flags & CURLU_DEFAULT_SCHEME)
        scheme = (char *) DEFAULT_SCHEME;
      else
        return CURLUE_NO_SCHEME;

      if(scheme) {
        h = Curl_builtin_scheme(scheme);
        if(!port && (flags & CURLU_DEFAULT_PORT)) {
          /* there's no stored port number, but asked to deliver
             a default one for the scheme */
          if(h) {
            msnprintf(portbuf, sizeof(portbuf), "%ld", h->defport);
            port = portbuf;
          }
        }
        else if(port) {
          /* there is a stored port number, but asked to inhibit if it matches
             the default one for the scheme */
          if(h && (h->defport == u->portnum) &&
             (flags & CURLU_NO_DEFAULT_PORT))
            port = NULL;
        }
      }
      /* leave out the options for a known scheme that does not use them */
      if(h && !(h->flags & PROTOPT_URLOPTIONS))
        options = NULL;

      if((u->host[0] == '[') && u->zoneid) {
        /* make it '[ host %25 zoneid ]' */
        size_t hostlen = strlen(u->host);
        /* the host plus "%25" plus the zone id plus the zero byte */
        size_t alen = hostlen + 3 + strlen(u->zoneid) + 1;
        allochost = malloc(alen);
        if(!allochost)
          return CURLUE_OUT_OF_MEMORY;
        /* copy the host without its ending bracket, then append the zone id
           and a new ending bracket */
        memcpy(allochost, u->host, hostlen - 1);
        msnprintf(&allochost[hostlen - 1], alen - hostlen + 1,
                  "%%25%s]", u->zoneid);
      }

      url = aprintf("%s://%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
                    scheme,
                    u->user ? u->user : "",
                    u->password ? ":": "",
                    u->password ? u->password : "",
                    options ? ";" : "",
                    options ? options : "",
                    (u->user || u->password || options) ? "@": "",
                    allochost ? allochost : u->host,
                    port ? ":": "",
                    port ? port : "",
                    (u->path && (u->path[0] != '/')) ? "/": "",
                    u->path ? u->path : "/",
                    (u->query && u->query[0]) ? "?": "",
                    (u->query && u->query[0]) ? u->query : "",
                    u->fragment? "#": "",
                    u->fragment? u->fragment : "");
      free(allochost);
    }
    if(!url)
      return CURLUE_OUT_OF_MEMORY;
    *part = url;
    return CURLUE_OK;
  }
  default:
    ptr = NULL;
    break;
  }
  if(ptr) {
    /* hand out a copy, never the handle's own string */
    *part = strdup(ptr);
    if(!*part)
      return CURLUE_OUT_OF_MEMORY;
    if(plusdecode) {
      /* convert + to space */
      char *plus;
      for(plus = *part; *plus; ++plus) {
        if(*plus == '+')
          *plus = ' ';
      }
    }
    if(urldecode) {
      char *decoded;
      size_t dlen;
      CURLcode res = Curl_urldecode(NULL, *part, 0, &decoded, &dlen, TRUE);
      /* the encoded copy is no longer needed, whatever the outcome */
      free(*part);
      if(res) {
        *part = NULL;
        return CURLUE_URLDECODE;
      }
      *part = decoded;
    }
    return CURLUE_OK;
  }
  else
    return ifmissing;
}
1167
curl_url_set(CURLU * u,CURLUPart what,const char * part,unsigned int flags)1168 CURLUcode curl_url_set(CURLU *u, CURLUPart what,
1169 const char *part, unsigned int flags)
1170 {
1171 char **storep = NULL;
1172 long port = 0;
1173 bool urlencode = (flags & CURLU_URLENCODE)? 1 : 0;
1174 bool plusencode = FALSE;
1175 bool urlskipslash = FALSE;
1176 bool appendquery = FALSE;
1177 bool equalsencode = FALSE;
1178
1179 if(!u)
1180 return CURLUE_BAD_HANDLE;
1181 if(!part) {
1182 /* setting a part to NULL clears it */
1183 switch(what) {
1184 case CURLUPART_URL:
1185 break;
1186 case CURLUPART_SCHEME:
1187 storep = &u->scheme;
1188 break;
1189 case CURLUPART_USER:
1190 storep = &u->user;
1191 break;
1192 case CURLUPART_PASSWORD:
1193 storep = &u->password;
1194 break;
1195 case CURLUPART_OPTIONS:
1196 storep = &u->options;
1197 break;
1198 case CURLUPART_HOST:
1199 storep = &u->host;
1200 break;
1201 case CURLUPART_ZONEID:
1202 storep = &u->zoneid;
1203 break;
1204 case CURLUPART_PORT:
1205 u->portnum = 0;
1206 storep = &u->port;
1207 break;
1208 case CURLUPART_PATH:
1209 storep = &u->path;
1210 break;
1211 case CURLUPART_QUERY:
1212 storep = &u->query;
1213 break;
1214 case CURLUPART_FRAGMENT:
1215 storep = &u->fragment;
1216 break;
1217 default:
1218 return CURLUE_UNKNOWN_PART;
1219 }
1220 if(storep && *storep) {
1221 free(*storep);
1222 *storep = NULL;
1223 }
1224 return CURLUE_OK;
1225 }
1226
1227 switch(what) {
1228 case CURLUPART_SCHEME:
1229 if(strlen(part) > MAX_SCHEME_LEN)
1230 /* too long */
1231 return CURLUE_MALFORMED_INPUT;
1232 if(!(flags & CURLU_NON_SUPPORT_SCHEME) &&
1233 /* verify that it is a fine scheme */
1234 !Curl_builtin_scheme(part))
1235 return CURLUE_UNSUPPORTED_SCHEME;
1236 storep = &u->scheme;
1237 urlencode = FALSE; /* never */
1238 break;
1239 case CURLUPART_USER:
1240 storep = &u->user;
1241 break;
1242 case CURLUPART_PASSWORD:
1243 storep = &u->password;
1244 break;
1245 case CURLUPART_OPTIONS:
1246 storep = &u->options;
1247 break;
1248 case CURLUPART_HOST:
1249 storep = &u->host;
1250 free(u->zoneid);
1251 u->zoneid = NULL;
1252 break;
1253 case CURLUPART_ZONEID:
1254 storep = &u->zoneid;
1255 break;
1256 case CURLUPART_PORT:
1257 {
1258 char *endp;
1259 urlencode = FALSE; /* never */
1260 port = strtol(part, &endp, 10); /* Port number must be decimal */
1261 if((port <= 0) || (port > 0xffff))
1262 return CURLUE_BAD_PORT_NUMBER;
1263 if(*endp)
1264 /* weirdly provided number, not good! */
1265 return CURLUE_MALFORMED_INPUT;
1266 storep = &u->port;
1267 }
1268 break;
1269 case CURLUPART_PATH:
1270 urlskipslash = TRUE;
1271 storep = &u->path;
1272 break;
1273 case CURLUPART_QUERY:
1274 plusencode = urlencode;
1275 appendquery = (flags & CURLU_APPENDQUERY)?1:0;
1276 equalsencode = appendquery;
1277 storep = &u->query;
1278 break;
1279 case CURLUPART_FRAGMENT:
1280 storep = &u->fragment;
1281 break;
1282 case CURLUPART_URL: {
1283 /*
1284 * Allow a new URL to replace the existing (if any) contents.
1285 *
1286 * If the existing contents is enough for a URL, allow a relative URL to
1287 * replace it.
1288 */
1289 CURLUcode result;
1290 char *oldurl;
1291 char *redired_url;
1292 CURLU *handle2;
1293
1294 if(Curl_is_absolute_url(part, NULL, MAX_SCHEME_LEN + 1)) {
1295 handle2 = curl_url();
1296 if(!handle2)
1297 return CURLUE_OUT_OF_MEMORY;
1298 result = parseurl(part, handle2, flags);
1299 if(!result)
1300 mv_urlhandle(handle2, u);
1301 else
1302 curl_url_cleanup(handle2);
1303 return result;
1304 }
1305 /* extract the full "old" URL to do the redirect on */
1306 result = curl_url_get(u, CURLUPART_URL, &oldurl, flags);
1307 if(result) {
1308 /* couldn't get the old URL, just use the new! */
1309 handle2 = curl_url();
1310 if(!handle2)
1311 return CURLUE_OUT_OF_MEMORY;
1312 result = parseurl(part, handle2, flags);
1313 if(!result)
1314 mv_urlhandle(handle2, u);
1315 else
1316 curl_url_cleanup(handle2);
1317 return result;
1318 }
1319
1320 /* apply the relative part to create a new URL */
1321 redired_url = concat_url(oldurl, part);
1322 free(oldurl);
1323 if(!redired_url)
1324 return CURLUE_OUT_OF_MEMORY;
1325
1326 /* now parse the new URL */
1327 handle2 = curl_url();
1328 if(!handle2) {
1329 free(redired_url);
1330 return CURLUE_OUT_OF_MEMORY;
1331 }
1332 result = parseurl(redired_url, handle2, flags);
1333 free(redired_url);
1334 if(!result)
1335 mv_urlhandle(handle2, u);
1336 else
1337 curl_url_cleanup(handle2);
1338 return result;
1339 }
1340 default:
1341 return CURLUE_UNKNOWN_PART;
1342 }
1343 if(storep) {
1344 const char *newp = part;
1345 size_t nalloc = strlen(part);
1346
1347 if(nalloc > CURL_MAX_INPUT_LENGTH)
1348 /* excessive input length */
1349 return CURLUE_MALFORMED_INPUT;
1350
1351 if(urlencode) {
1352 const unsigned char *i;
1353 char *o;
1354 bool free_part = FALSE;
1355 char *enc = malloc(nalloc * 3 + 1); /* for worst case! */
1356 if(!enc)
1357 return CURLUE_OUT_OF_MEMORY;
1358 if(plusencode) {
1359 /* space to plus */
1360 i = (const unsigned char *)part;
1361 for(o = enc; *i; ++o, ++i)
1362 *o = (*i == ' ') ? '+' : *i;
1363 *o = 0; /* zero terminate */
1364 part = strdup(enc);
1365 if(!part) {
1366 free(enc);
1367 return CURLUE_OUT_OF_MEMORY;
1368 }
1369 free_part = TRUE;
1370 }
1371 for(i = (const unsigned char *)part, o = enc; *i; i++) {
1372 if(Curl_isunreserved(*i) ||
1373 ((*i == '/') && urlskipslash) ||
1374 ((*i == '=') && equalsencode) ||
1375 ((*i == '+') && plusencode)) {
1376 if((*i == '=') && equalsencode)
1377 /* only skip the first equals sign */
1378 equalsencode = FALSE;
1379 *o = *i;
1380 o++;
1381 }
1382 else {
1383 msnprintf(o, 4, "%%%02x", *i);
1384 o += 3;
1385 }
1386 }
1387 *o = 0; /* zero terminate */
1388 newp = enc;
1389 if(free_part)
1390 free((char *)part);
1391 }
1392 else {
1393 char *p;
1394 newp = strdup(part);
1395 if(!newp)
1396 return CURLUE_OUT_OF_MEMORY;
1397 p = (char *)newp;
1398 while(*p) {
1399 /* make sure percent encoded are lower case */
1400 if((*p == '%') && ISXDIGIT(p[1]) && ISXDIGIT(p[2]) &&
1401 (ISUPPER(p[1]) || ISUPPER(p[2]))) {
1402 p[1] = (char)TOLOWER(p[1]);
1403 p[2] = (char)TOLOWER(p[2]);
1404 p += 3;
1405 }
1406 else
1407 p++;
1408 }
1409 }
1410
1411 if(appendquery) {
1412 /* Append the string onto the old query. Add a '&' separator if none is
1413 present at the end of the exsting query already */
1414 size_t querylen = u->query ? strlen(u->query) : 0;
1415 bool addamperand = querylen && (u->query[querylen -1] != '&');
1416 if(querylen) {
1417 size_t newplen = strlen(newp);
1418 char *p = malloc(querylen + addamperand + newplen + 1);
1419 if(!p) {
1420 free((char *)newp);
1421 return CURLUE_OUT_OF_MEMORY;
1422 }
1423 strcpy(p, u->query); /* original query */
1424 if(addamperand)
1425 p[querylen] = '&'; /* ampersand */
1426 strcpy(&p[querylen + addamperand], newp); /* new suffix */
1427 free((char *)newp);
1428 free(*storep);
1429 *storep = p;
1430 return CURLUE_OK;
1431 }
1432 }
1433
1434 if(what == CURLUPART_HOST) {
1435 if(hostname_check(u, (char *)newp)) {
1436 free((char *)newp);
1437 return CURLUE_MALFORMED_INPUT;
1438 }
1439 }
1440
1441 free(*storep);
1442 *storep = (char *)newp;
1443 }
1444 /* set after the string, to make it not assigned if the allocation above
1445 fails */
1446 if(port)
1447 u->portnum = port;
1448 return CURLUE_OK;
1449 }
1450