1 /* HTTP support.
2 Copyright (C) 1996-2012, 2014-2015, 2018-2021 Free Software
3 Foundation, Inc.
4
5 This file is part of GNU Wget.
6
7 GNU Wget is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3 of the License, or
10 (at your option) any later version.
11
12 GNU Wget is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with Wget. If not, see <http://www.gnu.org/licenses/>.
19
20 Additional permission under GNU GPL version 3 section 7
21
22 If you modify this program, or any covered work, by linking or
23 combining it with the OpenSSL project's OpenSSL library (or a
24 modified version of that library), containing parts covered by the
25 terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
26 grants you additional permission to convey the resulting work.
27 Corresponding Source for a non-source form of such a combination
28 shall include the source code for the parts of OpenSSL used as well
29 as that of the covered work. */
30
31 #include "wget.h"
32
33 #include <stdio.h>
34 #include <stdlib.h>
35 #include <string.h>
36 #include <unistd.h>
37 #include <assert.h>
38 #include <errno.h>
39 #include <time.h>
40 #include <locale.h>
41 #include <fcntl.h>
42
43 #include "hash.h"
44 #include "http.h"
45 #include "hsts.h"
46 #include "utils.h"
47 #include "url.h"
48 #include "host.h"
49 #include "retr.h"
50 #include "connect.h"
51 #include "netrc.h"
52 #ifdef HAVE_SSL
53 # include "ssl.h"
54 #endif
55 #ifdef ENABLE_NTLM
56 # include "http-ntlm.h"
57 #endif
58 #include "cookies.h"
59 #include "md5.h"
60 #include "convert.h"
61 #include "spider.h"
62 #include "warc.h"
63 #include "c-strcase.h"
64 #include "version.h"
65 #include "xstrndup.h"
66 #ifdef HAVE_METALINK
67 # include "metalink.h"
68 #endif
69 #ifdef ENABLE_XATTR
70 #include "xattr.h"
71 #endif
72
73 #ifdef TESTING
74 #include "../tests/unit-tests.h"
75 #endif
76
77 #ifdef __VMS
78 # include "vms.h"
79 #endif /* def __VMS */
80
81
82 /* Forward decls. */
83 struct http_stat;
84 static char *create_authorization_line (const char *, const char *,
85 const char *, const char *,
86 const char *, bool *, uerr_t *);
87 static char *basic_authentication_encode (const char *, const char *);
88 static bool known_authentication_scheme_p (const char *, const char *);
89 static void ensure_extension (struct http_stat *, const char *, int *);
90 static void load_cookies (void);
91
92 static bool cookies_loaded_p;
93 static struct cookie_jar *wget_cookie_jar;
94
/* Media type strings.  */
#define TEXTHTML_S "text/html"
#define TEXTXHTML_S "application/xhtml+xml"
#define TEXTCSS_S "text/css"

/* HTTP status codes.  Most come from HTTP/1.0 (RFC 1945); the ones
   introduced later are marked.  */

/* Successful 2xx.  */
#define HTTP_STATUS_OK 200
#define HTTP_STATUS_CREATED 201
#define HTTP_STATUS_ACCEPTED 202
#define HTTP_STATUS_NO_CONTENT 204
#define HTTP_STATUS_PARTIAL_CONTENTS 206

/* Redirection 3xx.  */
#define HTTP_STATUS_MULTIPLE_CHOICES 300
#define HTTP_STATUS_MOVED_PERMANENTLY 301
#define HTTP_STATUS_MOVED_TEMPORARILY 302
#define HTTP_STATUS_SEE_OTHER 303          /* from HTTP/1.1 */
#define HTTP_STATUS_NOT_MODIFIED 304
#define HTTP_STATUS_TEMPORARY_REDIRECT 307 /* from HTTP/1.1 */
#define HTTP_STATUS_PERMANENT_REDIRECT 308 /* from RFC 7538 */

/* Client error 4xx.  */
#define HTTP_STATUS_BAD_REQUEST 400
#define HTTP_STATUS_UNAUTHORIZED 401
#define HTTP_STATUS_FORBIDDEN 403
#define HTTP_STATUS_NOT_FOUND 404
#define HTTP_STATUS_RANGE_NOT_SATISFIABLE 416

/* Server errors 5xx.  */
#define HTTP_STATUS_INTERNAL 500
#define HTTP_STATUS_NOT_IMPLEMENTED 501
#define HTTP_STATUS_BAD_GATEWAY 502
#define HTTP_STATUS_UNAVAILABLE 503
#define HTTP_STATUS_GATEWAY_TIMEOUT 504

/* Status code classification.  Each macro evaluates X more than
   once, so do not pass an expression with side effects.  */
#define H_10X(x) ((x) >= 100 && (x) < 200)
#define H_20X(x) ((x) >= 200 && (x) < 300)
#define H_PARTIAL(x) ((x) == HTTP_STATUS_PARTIAL_CONTENTS)
#define H_REDIRECTED(x) ((x) == HTTP_STATUS_MOVED_PERMANENTLY     \
                         || (x) == HTTP_STATUS_MOVED_TEMPORARILY  \
                         || (x) == HTTP_STATUS_SEE_OTHER          \
                         || (x) == HTTP_STATUS_TEMPORARY_REDIRECT \
                         || (x) == HTTP_STATUS_PERMANENT_REDIRECT)
139
/* Release policy of a request header: which of its two strings the
   request owns and must free.  */
enum rp {
  rel_none,                     /* free neither name nor value */
  rel_name,                     /* free the name only */
  rel_value,                    /* free the value only */
  rel_both                      /* free both name and value */
};

/* One "NAME: VALUE" header line of a request.  */
struct request_header {
  char *name, *value;
  enum rp release_policy;
};

/* An HTTP request under construction.  */
struct request {
  const char *method;           /* request method; never freed */
  char *arg;                    /* request argument; freed by request_free */

  struct request_header *headers; /* array of HCAPACITY slots */
  int hcount, hcapacity;        /* slots in use / slots allocated */
};
154
155
156 /* Create a new, empty request. Set the request's method and its
157 arguments. METHOD should be a literal string (or it should outlive
158 the request) because it will not be freed. ARG will be freed by
159 request_free. */
160
161 static struct request *
request_new(const char * method,char * arg)162 request_new (const char *method, char *arg)
163 {
164 struct request *req = xnew0 (struct request);
165 req->hcapacity = 8;
166 req->headers = xnew_array (struct request_header, req->hcapacity);
167 req->method = method;
168 req->arg = arg;
169 return req;
170 }
171
172 /* Return the method string passed with the last call to
173 request_set_method. */
174
175 static const char *
request_method(const struct request * req)176 request_method (const struct request *req)
177 {
178 return req->method;
179 }
180
181 /* Free one header according to the release policy specified with
182 request_set_header. */
183
184 static void
release_header(struct request_header * hdr)185 release_header (struct request_header *hdr)
186 {
187 switch (hdr->release_policy)
188 {
189 case rel_none:
190 break;
191 case rel_name:
192 xfree (hdr->name);
193 break;
194 case rel_value:
195 xfree (hdr->value);
196 break;
197 case rel_both:
198 xfree (hdr->name);
199 xfree (hdr->value);
200 break;
201 }
202 }
203
204 /* Set the request named NAME to VALUE. Specifically, this means that
205 a "NAME: VALUE\r\n" header line will be used in the request. If a
206 header with the same name previously existed in the request, its
207 value will be replaced by this one. A NULL value means do nothing.
208
209 RELEASE_POLICY determines whether NAME and VALUE should be released
210 (freed) with request_free. Allowed values are:
211
212 - rel_none - don't free NAME or VALUE
213 - rel_name - free NAME when done
214 - rel_value - free VALUE when done
215 - rel_both - free both NAME and VALUE when done
216
217 Setting release policy is useful when arguments come from different
218 sources. For example:
219
220 // Don't free literal strings!
221 request_set_header (req, "Pragma", "no-cache", rel_none);
222
223 // Don't free a global variable, we'll need it later.
224 request_set_header (req, "Referer", opt.referer, rel_none);
225
226 // Value freshly allocated, free it when done.
227 request_set_header (req, "Range",
228 aprintf ("bytes=%s-", number_to_static_string (hs->restval)),
229 rel_value);
230 */
231
232 static void
request_set_header(struct request * req,const char * name,const char * value,enum rp release_policy)233 request_set_header (struct request *req, const char *name, const char *value,
234 enum rp release_policy)
235 {
236 struct request_header *hdr;
237 int i;
238
239 if (!value)
240 {
241 /* A NULL value is a no-op; if freeing the name is requested,
242 free it now to avoid leaks. */
243 if (release_policy == rel_name || release_policy == rel_both)
244 xfree (name);
245 return;
246 }
247
248 for (i = 0; i < req->hcount; i++)
249 {
250 hdr = &req->headers[i];
251 if (0 == c_strcasecmp (name, hdr->name))
252 {
253 /* Replace existing header. */
254 release_header (hdr);
255 hdr->name = (void *)name;
256 hdr->value = (void *)value;
257 hdr->release_policy = release_policy;
258 return;
259 }
260 }
261
262 /* Install new header. */
263
264 if (req->hcount >= req->hcapacity)
265 {
266 req->hcapacity <<= 1;
267 req->headers = xrealloc (req->headers, req->hcapacity * sizeof (*hdr));
268 }
269 hdr = &req->headers[req->hcount++];
270 hdr->name = (void *)name;
271 hdr->value = (void *)value;
272 hdr->release_policy = release_policy;
273 }
274
275 /* Like request_set_header, but sets the whole header line, as
276 provided by the user using the `--header' option. For example,
277 request_set_user_header (req, "Foo: bar") works just like
278 request_set_header (req, "Foo", "bar"). */
279
280 static void
request_set_user_header(struct request * req,const char * header)281 request_set_user_header (struct request *req, const char *header)
282 {
283 const char *name, *p;
284
285 if (!(p = strchr (header, ':')))
286 return;
287
288 name = xstrndup(header, p - header);
289
290 ++p;
291 while (c_isspace (*p))
292 ++p;
293
294 request_set_header (req, name, p, rel_name);
295 }
296
297 /* Remove the header with specified name from REQ. Returns true if
298 the header was actually removed, false otherwise. */
299
300 static bool
request_remove_header(struct request * req,const char * name)301 request_remove_header (struct request *req, const char *name)
302 {
303 int i;
304 for (i = 0; i < req->hcount; i++)
305 {
306 struct request_header *hdr = &req->headers[i];
307 if (0 == c_strcasecmp (name, hdr->name))
308 {
309 release_header (hdr);
310 /* Move the remaining headers by one. */
311 if (i < req->hcount - 1)
312 memmove (hdr, hdr + 1, (req->hcount - i - 1) * sizeof (*hdr));
313 --req->hcount;
314 return true;
315 }
316 }
317 return false;
318 }
319
320 #define APPEND(p, str) do { \
321 int A_len = strlen (str); \
322 memcpy (p, str, A_len); \
323 p += A_len; \
324 } while (0)
325
326 /* Construct the request and write it to FD using fd_write.
327 If warc_tmp is set to a file pointer, the request string will
328 also be written to that file. */
329
330 static int
request_send(const struct request * req,int fd,FILE * warc_tmp)331 request_send (const struct request *req, int fd, FILE *warc_tmp)
332 {
333 char *request_string, *p;
334 int i, size, write_error;
335
336 /* Count the request size. */
337 size = 0;
338
339 /* METHOD " " ARG " " "HTTP/1.0" "\r\n" */
340 size += strlen (req->method) + 1 + strlen (req->arg) + 1 + 8 + 2;
341
342 for (i = 0; i < req->hcount; i++)
343 {
344 struct request_header *hdr = &req->headers[i];
345 /* NAME ": " VALUE "\r\n" */
346 size += strlen (hdr->name) + 2 + strlen (hdr->value) + 2;
347 }
348
349 /* "\r\n\0" */
350 size += 3;
351
352 p = request_string = xmalloc (size);
353
354 /* Generate the request. */
355
356 APPEND (p, req->method); *p++ = ' ';
357 APPEND (p, req->arg); *p++ = ' ';
358 memcpy (p, "HTTP/1.1\r\n", 10); p += 10;
359
360 for (i = 0; i < req->hcount; i++)
361 {
362 struct request_header *hdr = &req->headers[i];
363 APPEND (p, hdr->name);
364 *p++ = ':', *p++ = ' ';
365 APPEND (p, hdr->value);
366 *p++ = '\r', *p++ = '\n';
367 }
368
369 *p++ = '\r', *p++ = '\n', *p++ = '\0';
370 assert (p - request_string == size);
371
372 #undef APPEND
373
374 DEBUGP (("\n---request begin---\n%s---request end---\n", request_string));
375
376 /* Send the request to the server. */
377
378 write_error = fd_write (fd, request_string, size - 1, -1);
379 if (write_error < 0)
380 logprintf (LOG_VERBOSE, _("Failed writing HTTP request: %s.\n"),
381 fd_errstr (fd));
382 else if (warc_tmp != NULL)
383 {
384 /* Write a copy of the data to the WARC record. */
385 int warc_tmp_written = fwrite (request_string, 1, size - 1, warc_tmp);
386 if (warc_tmp_written != size - 1)
387 write_error = -2;
388 }
389 xfree (request_string);
390 return write_error;
391 }
392
393 /* Release the resources used by REQ.
394 It is safe to call it with a valid pointer to a NULL pointer.
395 It is not safe to call it with an invalid or NULL pointer. */
396
397 static void
request_free(struct request ** req_ref)398 request_free (struct request **req_ref)
399 {
400 int i;
401 struct request *req = *req_ref;
402
403 if (!req)
404 return;
405
406 xfree (req->arg);
407 for (i = 0; i < req->hcount; i++)
408 release_header (&req->headers[i]);
409 xfree (req->headers);
410 xfree (req);
411 *req_ref = NULL;
412 }
413
414 static struct hash_table *basic_authed_hosts;
415
416 /* Find out if this host has issued a Basic challenge yet; if so, give
417 * it the username, password. A temporary measure until we can get
418 * proper authentication in place. */
419
420 static bool
maybe_send_basic_creds(const char * hostname,const char * user,const char * passwd,struct request * req)421 maybe_send_basic_creds (const char *hostname, const char *user,
422 const char *passwd, struct request *req)
423 {
424 bool do_challenge = false;
425
426 if (opt.auth_without_challenge)
427 {
428 DEBUGP (("Auth-without-challenge set, sending Basic credentials.\n"));
429 do_challenge = true;
430 }
431 else if (basic_authed_hosts
432 && hash_table_contains (basic_authed_hosts, hostname))
433 {
434 DEBUGP (("Found %s in basic_authed_hosts.\n", quote (hostname)));
435 do_challenge = true;
436 }
437 else
438 {
439 DEBUGP (("Host %s has not issued a general basic challenge.\n",
440 quote (hostname)));
441 }
442 if (do_challenge)
443 {
444 request_set_header (req, "Authorization",
445 basic_authentication_encode (user, passwd),
446 rel_value);
447 }
448 return do_challenge;
449 }
450
451 static void
register_basic_auth_host(const char * hostname)452 register_basic_auth_host (const char *hostname)
453 {
454 if (!basic_authed_hosts)
455 {
456 basic_authed_hosts = make_nocase_string_hash_table (1);
457 }
458 if (!hash_table_contains (basic_authed_hosts, hostname))
459 {
460 hash_table_put (basic_authed_hosts, xstrdup (hostname), NULL);
461 DEBUGP (("Inserted %s into basic_authed_hosts\n", quote (hostname)));
462 }
463 }
464
465 /* Send the contents of FILE_NAME to SOCK. Make sure that exactly
466 PROMISED_SIZE bytes are sent over the wire -- if the file is
467 longer, read only that much; if the file is shorter, report an error.
468 If warc_tmp is set to a file pointer, the post data will
469 also be written to that file. */
470
471 static int
body_file_send(int sock,const char * file_name,wgint promised_size,FILE * warc_tmp)472 body_file_send (int sock, const char *file_name, wgint promised_size, FILE *warc_tmp)
473 {
474 static char chunk[8192];
475 wgint written = 0;
476 int write_error;
477 FILE *fp;
478
479 DEBUGP (("[writing BODY file %s ... ", file_name));
480
481 fp = fopen (file_name, "rb");
482 if (!fp)
483 return -1;
484 while (!feof (fp) && written < promised_size)
485 {
486 int towrite;
487 int length = fread (chunk, 1, sizeof (chunk), fp);
488 if (length == 0)
489 break;
490 towrite = MIN (promised_size - written, length);
491 write_error = fd_write (sock, chunk, towrite, -1);
492 if (write_error < 0)
493 {
494 fclose (fp);
495 return -1;
496 }
497 if (warc_tmp != NULL)
498 {
499 /* Write a copy of the data to the WARC record. */
500 int warc_tmp_written = fwrite (chunk, 1, towrite, warc_tmp);
501 if (warc_tmp_written != towrite)
502 {
503 fclose (fp);
504 return -2;
505 }
506 }
507 written += towrite;
508 }
509 fclose (fp);
510
511 /* If we've written less than was promised, report a (probably
512 nonsensical) error rather than break the promise. */
513 if (written < promised_size)
514 {
515 errno = EINVAL;
516 return -1;
517 }
518
519 assert (written == promised_size);
520 DEBUGP (("done]\n"));
521 return 0;
522 }
523
/* Determine whether [START, PEEKED + PEEKLEN) contains an empty
   line, i.e. "\n\r\n" or "\n\n".  If so, return the position just
   after it; otherwise return NULL.  This is used as the terminator
   callback for fd_read_hunk.  The data between START and PEEKED has
   been read and cannot be "unread"; the PEEKLEN bytes from PEEKED on
   have only been peeked.

   On the very first peek (START == PEEKED) the data is also checked
   against "HTTP".  If it does not start that way, this is a HTTP/0.9
   response and START is returned, so that nothing is consumed as
   head.

   All bounds are checked by comparing distances rather than by
   computing END - 2, which would point before the buffer whenever
   fewer than two bytes are available.  */

static const char *
response_head_terminator (const char *start, const char *peeked, int peeklen)
{
  const char *p, *end;

  if (start == peeked)
    {
      /* Compare only as many bytes as have actually been peeked.  */
      size_t cmplen = peeklen < 4 ? (size_t) peeklen : 4;
      if (0 != memcmp (start, "HTTP", cmplen))
        return start;
    }

  /* Start two chars before the peeked data to cover the possibility
     that part of the terminator (e.g. "\n\r") arrived in the
     previous batch.  */
  p = peeked - start < 2 ? start : peeked - 2;
  end = peeked + peeklen;

  /* Check for \n\r\n or \n\n wherever at least three bytes remain.  */
  for (; end - p >= 3; p++)
    if (*p == '\n')
      {
        if (p[1] == '\r' && p[2] == '\n')
          return p + 3;
        else if (p[1] == '\n')
          return p + 2;
      }

  /* With PEEKLEN >= 2, P now points at the last two bytes: check for
     \n\n directly preceding END.  */
  if (peeklen >= 2 && p[0] == '\n' && p[1] == '\n')
    return p + 2;

  return NULL;
}
563
564 /* The maximum size of a single HTTP response we care to read. Rather
565 than being a limit of the reader implementation, this limit
566 prevents Wget from slurping all available memory upon encountering
567 malicious or buggy server output, thus protecting the user. Define
568 it to 0 to remove the limit. */
569
570 #define HTTP_RESPONSE_MAX_SIZE 65536
571
572 /* Read the HTTP request head from FD and return it. The error
573 conditions are the same as with fd_read_hunk.
574
575 To support HTTP/0.9 responses, this function tries to make sure
576 that the data begins with "HTTP". If this is not the case, no data
577 is read and an empty request is returned, so that the remaining
578 data can be treated as body. */
579
580 static char *
read_http_response_head(int fd)581 read_http_response_head (int fd)
582 {
583 return fd_read_hunk (fd, response_head_terminator, 512,
584 HTTP_RESPONSE_MAX_SIZE);
585 }
586
/* A parsed HTTP response head, as built by resp_new.  */
struct response {
  /* The text of the response head.  HEADERS points into it.  */
  const char *data;

  /* Where each line of the head starts.  Left NULL for a headerless
     (HTTP/0.9) response.  Otherwise, for a head such as

       "HTTP/1.0 200 Ok\r\nDescription: some\r\n text\r\nEtag: x\r\n\r\n"

     the entries are:

       headers[0] -> "HTTP/1.0 200 Ok"        (the status line)
       headers[1] -> "Description: some ..."  (continuation included)
       headers[2] -> "Etag: x"
       headers[3] -> the empty line that ends the head
       headers[4] == NULL

     I.e. headers[N] is where line N begins and headers[N + 1] is
     where it ends, which is also where the next line begins.  */
  const char **headers;
};
611
612 /* Create a new response object from the text of the HTTP response,
613 available in HEAD. That text is automatically split into
614 constituent header lines for fast retrieval using
615 resp_header_*. */
616
617 static struct response *
resp_new(char * head)618 resp_new (char *head)
619 {
620 char *hdr;
621 int count, size;
622
623 struct response *resp = xnew0 (struct response);
624 resp->data = head;
625
626 if (*head == '\0')
627 {
628 /* Empty head means that we're dealing with a headerless
629 (HTTP/0.9) response. In that case, don't set HEADERS at
630 all. */
631 return resp;
632 }
633
634 /* Split HEAD into header lines, so that resp_header_* functions
635 don't need to do this over and over again. */
636
637 size = count = 0;
638 hdr = head;
639 while (1)
640 {
641 DO_REALLOC (resp->headers, size, count + 1, const char *);
642 resp->headers[count++] = hdr;
643
644 /* Break upon encountering an empty line. */
645 if (!hdr[0] || (hdr[0] == '\r' && hdr[1] == '\n') || hdr[0] == '\n')
646 break;
647
648 /* Find the end of HDR, including continuations. */
649 for (;;)
650 {
651 char *end = strchr (hdr, '\n');
652
653 if (!end)
654 {
655 hdr += strlen (hdr);
656 break;
657 }
658
659 hdr = end + 1;
660
661 if (*hdr != ' ' && *hdr != '\t')
662 break;
663
664 // continuation, transform \r and \n into spaces
665 *end = ' ';
666 if (end > head && end[-1] == '\r')
667 end[-1] = ' ';
668 }
669 }
670 DO_REALLOC (resp->headers, size, count + 1, const char *);
671 resp->headers[count] = NULL;
672
673 return resp;
674 }
675
676 /* Locate the header named NAME in the request data, starting with
677 position START. This allows the code to loop through the request
678 data, filtering for all requests of a given name. Returns the
679 found position, or -1 for failure. The code that uses this
680 function typically looks like this:
681
682 for (pos = 0; (pos = resp_header_locate (...)) != -1; pos++)
683 ... do something with header ...
684
685 If you only care about one header, use resp_header_get instead of
686 this function. */
687
688 static int
resp_header_locate(const struct response * resp,const char * name,int start,const char ** begptr,const char ** endptr)689 resp_header_locate (const struct response *resp, const char *name, int start,
690 const char **begptr, const char **endptr)
691 {
692 int i;
693 const char **headers = resp->headers;
694 int name_len;
695
696 if (!headers || !headers[1])
697 return -1;
698
699 name_len = strlen (name);
700 if (start > 0)
701 i = start;
702 else
703 i = 1;
704
705 for (; headers[i + 1]; i++)
706 {
707 const char *b = headers[i];
708 const char *e = headers[i + 1];
709 if (e - b > name_len
710 && b[name_len] == ':'
711 && 0 == c_strncasecmp (b, name, name_len))
712 {
713 b += name_len + 1;
714 while (b < e && c_isspace (*b))
715 ++b;
716 while (b < e && c_isspace (e[-1]))
717 --e;
718 *begptr = b;
719 *endptr = e;
720 return i;
721 }
722 }
723 return -1;
724 }
725
/* Look up the first header named NAME in RESP.  If there is one, set
   *BEGPTR and *ENDPTR to the start and end of its value and return
   true; otherwise return false.

   resp_header_copy and resp_header_strdup are built on top of this
   function.  */

static bool
resp_header_get (const struct response *resp, const char *name,
                 const char **begptr, const char **endptr)
{
  return resp_header_locate (resp, name, 0, begptr, endptr) != -1;
}
740
/* Copy the value of the header named NAME in RESP to BUF, which has
   room for BUFSIZE bytes including the terminating NUL.  A value
   that does not fit is truncated.  Returns true if the header
   exists, false otherwise.  Use resp_header_strdup when the size of
   the value should not be limited.

   With a BUFSIZE of 0 nothing is copied, but the return value still
   tells whether the header is present.  */

static bool
resp_header_copy (const struct response *resp, const char *name,
                  char *buf, int bufsize)
{
  const char *beg, *end;
  int ncopy;

  if (!resp_header_get (resp, name, &beg, &end))
    return false;

  if (bufsize == 0)
    return true;

  ncopy = MIN (end - beg, bufsize - 1);
  memcpy (buf, beg, ncopy);
  buf[ncopy] = '\0';
  return true;
}
764
/* Return a freshly allocated copy of the value of the header named
   NAME in RESP, or NULL if RESP has no such header.  */

static char *
resp_header_strdup (const struct response *resp, const char *name)
{
  const char *beg, *end;

  if (resp_header_get (resp, name, &beg, &end))
    return strdupdelim (beg, end);
  return NULL;
}
776
777 /* Parse the HTTP status line, which is of format:
778
779 HTTP-Version SP Status-Code SP Reason-Phrase
780
781 The function returns the status-code, or -1 if the status line
782 appears malformed. The pointer to "reason-phrase" message is
783 returned in *MESSAGE. */
784
785 static int
resp_status(const struct response * resp,char ** message)786 resp_status (const struct response *resp, char **message)
787 {
788 int status;
789 const char *p, *end;
790
791 if (!resp->headers)
792 {
793 /* For a HTTP/0.9 response, assume status 200. */
794 if (message)
795 *message = xstrdup (_("No headers, assuming HTTP/0.9"));
796 return 200;
797 }
798
799 p = resp->headers[0];
800 end = resp->headers[1];
801
802 if (!end)
803 return -1;
804
805 /* "HTTP" */
806 if (end - p < 4 || 0 != strncmp (p, "HTTP", 4))
807 return -1;
808 p += 4;
809
810 /* Match the HTTP version. This is optional because Gnutella
811 servers have been reported to not specify HTTP version. */
812 if (p < end && *p == '/')
813 {
814 ++p;
815 while (p < end && c_isdigit (*p))
816 ++p;
817 if (p < end && *p == '.')
818 ++p;
819 while (p < end && c_isdigit (*p))
820 ++p;
821 }
822
823 while (p < end && c_isspace (*p))
824 ++p;
825 if (end - p < 3 || !c_isdigit (p[0]) || !c_isdigit (p[1]) || !c_isdigit (p[2]))
826 return -1;
827
828 status = 100 * (p[0] - '0') + 10 * (p[1] - '0') + (p[2] - '0');
829 p += 3;
830
831 if (message)
832 {
833 while (p < end && c_isspace (*p))
834 ++p;
835 while (p < end && c_isspace (end[-1]))
836 --end;
837 *message = strdupdelim (p, end);
838 }
839
840 return status;
841 }
842
843 /* Release the resources used by RESP.
844 It is safe to call it with a valid pointer to a NULL pointer.
845 It is not safe to call it with a invalid or NULL pointer. */
846
847 static void
resp_free(struct response ** resp_ref)848 resp_free (struct response **resp_ref)
849 {
850 struct response *resp = *resp_ref;
851
852 if (!resp)
853 return;
854
855 xfree (resp->headers);
856 xfree (resp);
857
858 *resp_ref = NULL;
859 }
860
861 /* Print a single line of response, the characters [b, e). We tried
862 getting away with
863 logprintf (LOG_VERBOSE, "%s%.*s\n", prefix, (int) (e - b), b);
864 but that failed to escape the non-printable characters and, in fact,
865 caused crashes in UTF-8 locales. */
866
867 static void
print_response_line(const char * prefix,const char * b,const char * e)868 print_response_line (const char *prefix, const char *b, const char *e)
869 {
870 char buf[1024], *copy;
871 size_t len = e - b;
872
873 if (len < sizeof (buf))
874 copy = buf;
875 else
876 copy = xmalloc(len + 1);
877
878 memcpy(copy, b, len);
879 copy[len] = 0;
880
881 logprintf (LOG_ALWAYS, "%s%s\n", prefix,
882 quotearg_style (escape_quoting_style, copy));
883
884 if (copy != buf)
885 xfree (copy);
886 }
887
888 /* Print the server response, line by line, omitting the trailing CRLF
889 from individual header lines, and prefixed with PREFIX. */
890
891 static void
print_server_response(const struct response * resp,const char * prefix)892 print_server_response (const struct response *resp, const char *prefix)
893 {
894 int i;
895 if (!resp->headers)
896 return;
897 for (i = 0; resp->headers[i + 1]; i++)
898 {
899 const char *b = resp->headers[i];
900 const char *e = resp->headers[i + 1];
901 /* Skip CRLF */
902 if (b < e && e[-1] == '\n')
903 --e;
904 if (b < e && e[-1] == '\r')
905 --e;
906 print_response_line (prefix, b, e);
907 }
908 }
909
910 /* Parse the `Content-Range' header and extract the information it
911 contains. Returns true if successful, false otherwise. */
912 static bool
parse_content_range(const char * hdr,wgint * first_byte_ptr,wgint * last_byte_ptr,wgint * entity_length_ptr)913 parse_content_range (const char *hdr, wgint *first_byte_ptr,
914 wgint *last_byte_ptr, wgint *entity_length_ptr)
915 {
916 wgint num;
917
918 /* Ancient versions of Netscape proxy server, presumably predating
919 rfc2068, sent out `Content-Range' without the "bytes"
920 specifier. */
921 if (0 == strncasecmp (hdr, "bytes", 5))
922 {
923 hdr += 5;
924 /* "JavaWebServer/1.1.1" sends "bytes: x-y/z", contrary to the
925 HTTP spec. */
926 if (*hdr == ':')
927 ++hdr;
928 while (c_isspace (*hdr))
929 ++hdr;
930 if (!*hdr)
931 return false;
932 }
933 if (!c_isdigit (*hdr))
934 return false;
935 for (num = 0; c_isdigit (*hdr); hdr++)
936 num = 10 * num + (*hdr - '0');
937 if (*hdr != '-' || !c_isdigit (*(hdr + 1)))
938 return false;
939 *first_byte_ptr = num;
940 ++hdr;
941 for (num = 0; c_isdigit (*hdr); hdr++)
942 num = 10 * num + (*hdr - '0');
943 if (*hdr != '/')
944 return false;
945 *last_byte_ptr = num;
946 if (!(c_isdigit (*(hdr + 1)) || *(hdr + 1) == '*'))
947 return false;
948 if (*last_byte_ptr < *first_byte_ptr)
949 return false;
950 ++hdr;
951 if (*hdr == '*')
952 num = -1;
953 else
954 for (num = 0; c_isdigit (*hdr); hdr++)
955 num = 10 * num + (*hdr - '0');
956 *entity_length_ptr = num;
957 if ((*entity_length_ptr <= *last_byte_ptr) && *entity_length_ptr != -1)
958 return false;
959 return true;
960 }
961
962 /* Read the body of the request, but don't store it anywhere and don't
963 display a progress gauge. This is useful for reading the bodies of
964 administrative responses to which we will soon issue another
965 request. The response is not useful to the user, but reading it
966 allows us to continue using the same connection to the server.
967
968 If reading fails, false is returned, true otherwise. In debug
969 mode, the body is displayed for debugging purposes. */
970
971 static bool
skip_short_body(int fd,wgint contlen,bool chunked)972 skip_short_body (int fd, wgint contlen, bool chunked)
973 {
974 enum {
975 SKIP_SIZE = 512, /* size of the download buffer */
976 SKIP_THRESHOLD = 4096 /* the largest size we read */
977 };
978 wgint remaining_chunk_size = 0;
979 char dlbuf[SKIP_SIZE + 1];
980 dlbuf[SKIP_SIZE] = '\0'; /* so DEBUGP can safely print it */
981
982 /* If the body is too large, it makes more sense to simply close the
983 connection than to try to read the body. */
984 if (contlen > SKIP_THRESHOLD)
985 return false;
986
987 while (contlen > 0 || chunked)
988 {
989 int ret;
990 if (chunked)
991 {
992 if (remaining_chunk_size == 0)
993 {
994 char *line = fd_read_line (fd);
995 char *endl;
996 if (line == NULL)
997 break;
998
999 remaining_chunk_size = strtol (line, &endl, 16);
1000 xfree (line);
1001
1002 if (remaining_chunk_size < 0)
1003 return false;
1004
1005 if (remaining_chunk_size == 0)
1006 {
1007 line = fd_read_line (fd);
1008 xfree (line);
1009 break;
1010 }
1011 }
1012
1013 contlen = MIN (remaining_chunk_size, SKIP_SIZE);
1014 }
1015
1016 DEBUGP (("Skipping %s bytes of body: [", number_to_static_string (contlen)));
1017
1018 ret = fd_read (fd, dlbuf, MIN (contlen, SKIP_SIZE), -1);
1019 if (ret <= 0)
1020 {
1021 /* Don't normally report the error since this is an
1022 optimization that should be invisible to the user. */
1023 DEBUGP (("] aborting (%s).\n",
1024 ret < 0 ? fd_errstr (fd) : "EOF received"));
1025 return false;
1026 }
1027 contlen -= ret;
1028
1029 if (chunked)
1030 {
1031 remaining_chunk_size -= ret;
1032 if (remaining_chunk_size == 0)
1033 {
1034 char *line = fd_read_line (fd);
1035 if (line == NULL)
1036 return false;
1037 else
1038 xfree (line);
1039 }
1040 }
1041
1042 /* Safe even if %.*s bogusly expects terminating \0 because
1043 we've zero-terminated dlbuf above. */
1044 DEBUGP (("%.*s", ret, dlbuf));
1045 }
1046
1047 DEBUGP (("] done.\n"));
1048 return true;
1049 }
1050
1051 #define NOT_RFC2231 0
1052 #define RFC2231_NOENCODING 1
1053 #define RFC2231_ENCODING 2
1054
1055 /* extract_param extracts the parameter name into NAME.
1056 However, if the parameter name is in RFC2231 format then
1057 this function adjusts NAME by stripping of the trailing
1058 characters that are not part of the name but are present to
1059 indicate the presence of encoding information in the value
1060 or a fragment of a long parameter value
1061 */
1062 static int
modify_param_name(param_token * name)1063 modify_param_name (param_token *name)
1064 {
1065 const char *delim1 = memchr (name->b, '*', name->e - name->b);
1066 const char *delim2 = memrchr (name->b, '*', name->e - name->b);
1067
1068 int result;
1069
1070 if (delim1 == NULL)
1071 {
1072 result = NOT_RFC2231;
1073 }
1074 else if (delim1 == delim2)
1075 {
1076 if ((name->e - 1) == delim1)
1077 {
1078 result = RFC2231_ENCODING;
1079 }
1080 else
1081 {
1082 result = RFC2231_NOENCODING;
1083 }
1084 name->e = delim1;
1085 }
1086 else
1087 {
1088 name->e = delim1;
1089 result = RFC2231_ENCODING;
1090 }
1091 return result;
1092 }
1093
1094 /* extract_param extract the parameter value into VALUE.
1095 Like modify_param_name this function modifies VALUE by
1096 stripping off the encoding information from the actual value
1097 */
1098 static void
modify_param_value(param_token * value,int encoding_type)1099 modify_param_value (param_token *value, int encoding_type )
1100 {
1101 if (encoding_type == RFC2231_ENCODING)
1102 {
1103 const char *delim = memrchr (value->b, '\'', value->e - value->b);
1104 if (delim != NULL)
1105 {
1106 value->b = (delim+1);
1107 }
1108 }
1109 }
1110
1111 /* Extract a parameter from the string (typically an HTTP header) at
1112 **SOURCE and advance SOURCE to the next parameter. Return false
1113 when there are no more parameters to extract. The name of the
1114 parameter is returned in NAME, and the value in VALUE. If the
1115 parameter has no value, the token's value is zeroed out.
1116
1117 For example, if *SOURCE points to the string "attachment;
1118 filename=\"foo bar\"", the first call to this function will return
1119 the token named "attachment" and no value, and the second call will
1120 return the token named "filename" and value "foo bar". The third
1121 call will return false, indicating no more valid tokens.
1122
1123 is_url_encoded is an out parameter. If not NULL, a boolean value will be
1124 stored into it, letting the caller know whether or not the extracted value is
1125 URL-encoded. The caller can then decode it with url_unescape(), which however
1126 performs decoding in-place. URL-encoding is used by RFC 2231 to support
1127 non-US-ASCII characters in HTTP header values. */
1128
1129 bool
extract_param(const char ** source,param_token * name,param_token * value,char separator,bool * is_url_encoded)1130 extract_param (const char **source, param_token *name, param_token *value,
1131 char separator, bool *is_url_encoded)
1132 {
1133 const char *p = *source;
1134 int param_type;
1135 if (is_url_encoded)
1136 *is_url_encoded = false; /* initializing the out parameter */
1137
1138 while (c_isspace (*p)) ++p;
1139 if (!*p)
1140 {
1141 *source = p;
1142 return false; /* no error; nothing more to extract */
1143 }
1144
1145 /* Extract name. */
1146 name->b = p;
1147 while (*p && !c_isspace (*p) && *p != '=' && *p != separator) ++p;
1148 name->e = p;
1149 if (name->b == name->e)
1150 return false; /* empty name: error */
1151 while (c_isspace (*p)) ++p;
1152 if (*p == separator || !*p) /* no value */
1153 {
1154 xzero (*value);
1155 if (*p == separator) ++p;
1156 *source = p;
1157 return true;
1158 }
1159 if (*p != '=')
1160 return false; /* error */
1161
1162 /* *p is '=', extract value */
1163 ++p;
1164 while (c_isspace (*p)) ++p;
1165 if (*p == '"') /* quoted */
1166 {
1167 value->b = ++p;
1168 while (*p && *p != '"') ++p;
1169 if (!*p)
1170 return false;
1171 value->e = p++;
1172 /* Currently at closing quote; find the end of param. */
1173 while (c_isspace (*p)) ++p;
1174 while (*p && *p != separator) ++p;
1175 if (*p == separator)
1176 ++p;
1177 else if (*p)
1178 /* garbage after closed quote, e.g. foo="bar"baz */
1179 return false;
1180 }
1181 else /* unquoted */
1182 {
1183 value->b = p;
1184 while (*p && *p != separator) ++p;
1185 value->e = p;
1186 while (value->e != value->b && c_isspace (value->e[-1]))
1187 --value->e;
1188 if (*p == separator) ++p;
1189 }
1190 *source = p;
1191
1192 param_type = modify_param_name (name);
1193 if (param_type != NOT_RFC2231)
1194 {
1195 if (param_type == RFC2231_ENCODING && is_url_encoded)
1196 *is_url_encoded = true;
1197 modify_param_value (value, param_type);
1198 }
1199 return true;
1200 }
1201
1202 #undef NOT_RFC2231
1203 #undef RFC2231_NOENCODING
1204 #undef RFC2231_ENCODING
1205
1206 /* Appends the string represented by VALUE to FILENAME */
1207
1208 static void
append_value_to_filename(char ** filename,param_token const * const value,bool is_url_encoded)1209 append_value_to_filename (char **filename, param_token const * const value,
1210 bool is_url_encoded)
1211 {
1212 int original_length = strlen (*filename);
1213 int new_length = strlen (*filename) + (value->e - value->b);
1214 *filename = xrealloc (*filename, new_length+1);
1215 memcpy (*filename + original_length, value->b, (value->e - value->b));
1216 (*filename)[new_length] = '\0';
1217 if (is_url_encoded)
1218 url_unescape (*filename + original_length);
1219 }
1220
1221 /* Parse the contents of the `Content-Disposition' header, extracting
1222 the information useful to Wget. Content-Disposition is a header
1223 borrowed from MIME; when used in HTTP, it typically serves for
1224 specifying the desired file name of the resource. For example:
1225
1226 Content-Disposition: attachment; filename="flora.jpg"
1227
1228 Wget will skip the tokens it doesn't care about, such as
1229 "attachment" in the previous example; it will also skip other
1230 unrecognized params. If the header is syntactically correct and
1231 contains a file name, a copy of the file name is stored in
1232 *filename and true is returned. Otherwise, the function returns
1233 false.
1234
1235 The file name is stripped of directory components and must not be
1236 empty.
1237
1238 Historically, this function returned filename prefixed with opt.dir_prefix,
1239 now that logic is handled by the caller, new code should pay attention,
1240 changed by crq, Sep 2010.
1241
1242 */
1243 static bool
parse_content_disposition(const char * hdr,char ** filename)1244 parse_content_disposition (const char *hdr, char **filename)
1245 {
1246 param_token name, value;
1247 bool is_url_encoded = false;
1248
1249 char *encodedFilename = NULL;
1250 char *unencodedFilename = NULL;
1251 for ( ; extract_param (&hdr, &name, &value, ';', &is_url_encoded);
1252 is_url_encoded = false)
1253 {
1254 int isFilename = BOUNDED_EQUAL_NO_CASE (name.b, name.e, "filename");
1255 if ( isFilename && value.b != NULL)
1256 {
1257 /* Make the file name begin at the last slash or backslash. */
1258 bool isEncodedFilename;
1259 char **outFilename;
1260 const char *last_slash = memrchr (value.b, '/', value.e - value.b);
1261 const char *last_bs = memrchr (value.b, '\\', value.e - value.b);
1262 if (last_slash && last_bs)
1263 value.b = 1 + MAX (last_slash, last_bs);
1264 else if (last_slash || last_bs)
1265 value.b = 1 + (last_slash ? last_slash : last_bs);
1266 if (value.b == value.e)
1267 continue;
1268
1269 /* Check if the name is "filename*" as specified in RFC 6266.
1270 * Since "filename" could be broken up as "filename*N" (RFC 2231),
1271 * a check is needed to make sure this is not the case */
1272 isEncodedFilename = *name.e == '*' && !c_isdigit (*(name.e + 1));
1273 outFilename = isEncodedFilename ? &encodedFilename
1274 : &unencodedFilename;
1275 if (*outFilename)
1276 append_value_to_filename (outFilename, &value, is_url_encoded);
1277 else
1278 {
1279 *outFilename = strdupdelim (value.b, value.e);
1280 if (is_url_encoded)
1281 url_unescape (*outFilename);
1282 }
1283 }
1284 }
1285 if (encodedFilename)
1286 {
1287 xfree (unencodedFilename);
1288 *filename = encodedFilename;
1289 }
1290 else
1291 {
1292 xfree (encodedFilename);
1293 *filename = unencodedFilename;
1294 }
1295 if (*filename)
1296 return true;
1297 else
1298 return false;
1299 }
1300
#ifdef HAVE_HSTS
/* Parse the value of a Strict-Transport-Security header.

   HEADER is the header value, a ';'-separated list of directives; it may
   be NULL, in which case false is returned without logging anything.
   Directive names are matched case-insensitively.  If "max-age" occurs
   more than once, the last occurrence is the one used.

   On success true is returned, *MAX_AGE (if MAX_AGE is not NULL) receives
   the numeric max-age and *INCLUDE_SUBDOMAINS (if not NULL) tells whether
   the includeSubDomains directive was present.

   False is returned, and neither output is written, when there is no
   "max-age" directive, when it has no value, or when its value does not
   start with a number.  Such a header used to be accepted with a max-age
   of 0; it is now treated as unparsable instead.  */
static bool
parse_strict_transport_security (const char *header, time_t *max_age, bool *include_subdomains)
{
  param_token name, value;
  char *c_max_age = NULL;
  char *end_of_number = NULL;
  long parsed_max_age = 0;
  bool is = false; /* includeSubDomains */
  bool is_url_encoded = false;

  if (!header)
    return false;

  /* Process the STS header. Keys should be matched case-insensitively. */
  for (; extract_param (&header, &name, &value, ';', &is_url_encoded); is_url_encoded = false)
    {
      if (BOUNDED_EQUAL_NO_CASE (name.b, name.e, "max-age"))
        {
          xfree (c_max_age);
          /* extract_param zeroes VALUE for a directive without "=value";
             leave c_max_age NULL then so the header gets rejected.  */
          if (value.b != NULL)
            c_max_age = strdupdelim (value.b, value.e);
        }
      else if (BOUNDED_EQUAL_NO_CASE (name.b, name.e, "includeSubDomains"))
        is = true;
    }

  /* If the string value goes out of a long's bounds, strtol() will return
     LONG_MIN or LONG_MAX.  In theory, the HSTS engine should be able to
     handle it.  Also, time_t is normally defined as a long, so this should
     not break.  */
  if (c_max_age)
    parsed_max_age = strtol (c_max_age, &end_of_number, 10);

  /* end_of_number == c_max_age means strtol consumed no digits at all.  */
  if (!c_max_age || end_of_number == c_max_age)
    {
      /* something weird happened */
      logprintf (LOG_VERBOSE, "Could not parse Strict-Transport-Security header\n");
      xfree (c_max_age);
      return false;
    }

  /* pass the parsed values over */
  if (max_age)
    *max_age = (time_t) parsed_max_age;
  if (include_subdomains)
    *include_subdomains = is;

  DEBUGP (("Parsed Strict-Transport-Security max-age = %s, includeSubDomains = %s\n",
           c_max_age, (is ? "true" : "false")));

  xfree (c_max_age);
  return true;
}
#endif
1354
/* Persistent connections.  Currently, we cache the most recently used
   connection as persistent, provided that the HTTP server agrees to
   make it such.  The persistence data is stored in the variables
   below.  Ideally, it should be possible to cache an arbitrary fixed
   number of these connections.  */

/* Whether a persistent connection is active.  The contents of PCONN
   are only meaningful while this is true.  */
static bool pconn_active;

/* Description of the single cached persistent connection.  It is filled
   in by register_persistent and released and zeroed by
   invalidate_persistent.  */
static struct {
  /* The socket of the connection.  */
  int socket;

  /* Host and port of the currently active persistent connection.
     HOST is a private copy made with xstrdup.  */
  char *host;
  int port;

  /* Whether a ssl handshake has occurred on this connection.  */
  bool ssl;

  /* Whether the connection was authorized.  This is only done by
     NTLM, which authorizes *connections* rather than individual
     requests.  (That practice is peculiar for HTTP, but it is a
     useful optimization.)  */
  bool authorized;

#ifdef ENABLE_NTLM
  /* NTLM data of the current connection.  */
  struct ntlmdata ntlm;
#endif
} pconn;
1386
1387 /* Mark the persistent connection as invalid and free the resources it
1388 uses. This is used by the CLOSE_* macros after they forcefully
1389 close a registered persistent connection. */
1390
1391 static void
invalidate_persistent(void)1392 invalidate_persistent (void)
1393 {
1394 DEBUGP (("Disabling further reuse of socket %d.\n", pconn.socket));
1395 pconn_active = false;
1396 fd_close (pconn.socket);
1397 xfree (pconn.host);
1398 xzero (pconn);
1399 }
1400
1401 /* Register FD, which should be a TCP/IP connection to HOST:PORT, as
1402 persistent. This will enable someone to use the same connection
1403 later. In the context of HTTP, this must be called only AFTER the
1404 response has been received and the server has promised that the
1405 connection will remain alive.
1406
1407 If a previous connection was persistent, it is closed. */
1408
1409 static void
register_persistent(const char * host,int port,int fd,bool ssl)1410 register_persistent (const char *host, int port, int fd, bool ssl)
1411 {
1412 if (pconn_active)
1413 {
1414 if (pconn.socket == fd)
1415 {
1416 /* The connection FD is already registered. */
1417 return;
1418 }
1419 else
1420 {
1421 /* The old persistent connection is still active; close it
1422 first. This situation arises whenever a persistent
1423 connection exists, but we then connect to a different
1424 host, and try to register a persistent connection to that
1425 one. */
1426 invalidate_persistent ();
1427 }
1428 }
1429
1430 pconn_active = true;
1431 pconn.socket = fd;
1432 pconn.host = xstrdup (host);
1433 pconn.port = port;
1434 pconn.ssl = ssl;
1435 pconn.authorized = false;
1436
1437 DEBUGP (("Registered socket %d for persistent reuse.\n", fd));
1438 }
1439
1440 /* Return true if a persistent connection is available for connecting
1441 to HOST:PORT. */
1442
1443 static bool
persistent_available_p(const char * host,int port,bool ssl,bool * host_lookup_failed)1444 persistent_available_p (const char *host, int port, bool ssl,
1445 bool *host_lookup_failed)
1446 {
1447 /* First, check whether a persistent connection is active at all. */
1448 if (!pconn_active)
1449 return false;
1450
1451 /* If we want SSL and the last connection wasn't or vice versa,
1452 don't use it. Checking for host and port is not enough because
1453 HTTP and HTTPS can apparently coexist on the same port. */
1454 if (ssl != pconn.ssl)
1455 return false;
1456
1457 /* If we're not connecting to the same port, we're not interested. */
1458 if (port != pconn.port)
1459 return false;
1460
1461 /* If the host is the same, we're in business. If not, there is
1462 still hope -- read below. */
1463 if (0 != strcasecmp (host, pconn.host))
1464 {
1465 /* Check if pconn.socket is talking to HOST under another name.
1466 This happens often when both sites are virtual hosts
1467 distinguished only by name and served by the same network
1468 interface, and hence the same web server (possibly set up by
1469 the ISP and serving many different web sites). This
1470 admittedly unconventional optimization does not contradict
1471 HTTP and works well with popular server software. */
1472
1473 bool found;
1474 ip_address ip;
1475 struct address_list *al;
1476
1477 if (ssl)
1478 /* Don't try to talk to two different SSL sites over the same
1479 secure connection! (Besides, it's not clear that
1480 name-based virtual hosting is even possible with SSL.) */
1481 return false;
1482
1483 /* If pconn.socket's peer is one of the IP addresses HOST
1484 resolves to, pconn.socket is for all intents and purposes
1485 already talking to HOST. */
1486
1487 if (!socket_ip_address (pconn.socket, &ip, ENDPOINT_PEER))
1488 {
1489 /* Can't get the peer's address -- something must be very
1490 wrong with the connection. */
1491 invalidate_persistent ();
1492 return false;
1493 }
1494 al = lookup_host (host, 0);
1495 if (!al)
1496 {
1497 *host_lookup_failed = true;
1498 return false;
1499 }
1500
1501 found = address_list_contains (al, &ip);
1502 address_list_release (al);
1503
1504 if (!found)
1505 return false;
1506
1507 /* The persistent connection's peer address was found among the
1508 addresses HOST resolved to; therefore, pconn.sock is in fact
1509 already talking to HOST -- no need to reconnect. */
1510 }
1511
1512 /* Finally, check whether the connection is still open. This is
1513 important because most servers implement liberal (short) timeout
1514 on persistent connections. Wget can of course always reconnect
1515 if the connection doesn't work out, but it's nicer to know in
1516 advance. This test is a logical followup of the first test, but
1517 is "expensive" and therefore placed at the end of the list.
1518
1519 (Current implementation of test_socket_open has a nice side
1520 effect that it treats sockets with pending data as "closed".
1521 This is exactly what we want: if a broken server sends message
1522 body in response to HEAD, or if it sends more than conent-length
1523 data, we won't reuse the corrupted connection.) */
1524
1525 if (!test_socket_open (pconn.socket))
1526 {
1527 /* Oops, the socket is no longer open. Now that we know that,
1528 let's invalidate the persistent connection before returning
1529 0. */
1530 invalidate_persistent ();
1531 return false;
1532 }
1533
1534 return true;
1535 }
1536
/* The idea behind these two CLOSE macros is to distinguish between
   two cases: one when the job we've been doing is finished, and we
   want to close the connection and leave, and two when something is
   seriously wrong and we're closing the connection as part of
   cleanup.

   In case of keep_alive, CLOSE_FINISH should leave the connection
   open, while CLOSE_INVALIDATE should still close it.

   Note that the semantics of the flag `keep_alive' is "this
   connection *will* be reused (the server has promised not to close
   the connection once we're done)", while the semantics of
   `pconn_active && (fd) == pconn.socket' is "we're *now* using an
   active, registered connection".

   Both macros assign -1 to FD after closing it, so FD must be a
   modifiable variable.  CLOSE_FINISH additionally reads a variable
   named `keep_alive' from the scope in which it is expanded.  */

#define CLOSE_FINISH(fd) do {                   \
  if (!keep_alive)                              \
    {                                           \
      if (pconn_active && (fd) == pconn.socket) \
        invalidate_persistent ();               \
      else                                      \
        fd_close (fd);                          \
      fd = -1;                                  \
    }                                           \
} while (0)

#define CLOSE_INVALIDATE(fd) do {               \
  if (pconn_active && (fd) == pconn.socket)     \
    invalidate_persistent ();                   \
  else                                          \
    fd_close (fd);                              \
  fd = -1;                                      \
} while (0)
1570
/* Content encodings that can be recorded for a local or remote file
   (see the local_encoding and remote_encoding members of struct
   http_stat).  */
typedef enum
{
  ENC_INVALID = -1,             /* invalid encoding */
  ENC_NONE = 0,                 /* no special encoding */
  ENC_GZIP,                     /* gzip compression */
  ENC_DEFLATE,                  /* deflate compression */
  ENC_COMPRESS,                 /* compress compression */
  ENC_BROTLI                    /* brotli compression */
} encoding_t;
1580
/* State and results of one HTTP retrieval.  The `char *' members
   (everything except the const REFERER) are released by free_hstat.  */
struct http_stat
{
  wgint len;                    /* received length */
  wgint contlen;                /* expected length */
  wgint restval;                /* the restart value */
  int res;                      /* the result of last read */
  char *rderrmsg;               /* error message from read error */
  char *newloc;                 /* new location (redirection) */
  char *remote_time;            /* remote time-stamp string */
  char *error;                  /* textual HTTP error */
  int statcode;                 /* status code */
  char *message;                /* status message */
  wgint rd_size;                /* amount of data read from socket */
  double dltime;                /* time it took to download the data */
  const char *referer;          /* value of the referer header. */
  char *local_file;             /* local file name. */
  bool existence_checked;       /* true if we already checked for a file's
                                   existence after having begun to download
                                   (needed in gethttp for when connection is
                                   interrupted/restarted). */
  bool timestamp_checked;       /* true if pre-download time-stamping checks
                                 * have already been performed */
  char *orig_file_name;         /* name of file to compare for time-stamping
                                 * (might be != local_file if -K is set) */
  wgint orig_file_size;         /* size of file to compare for time-stamping */
  time_t orig_file_tstamp;      /* time-stamp of file to compare for
                                 * time-stamping */
#ifdef HAVE_METALINK
  metalink_t *metalink;         /* released with metalink_delete in
                                   free_hstat */
#endif

  encoding_t local_encoding;    /* the encoding of the local file */
  encoding_t remote_encoding;   /* the encoding of the remote file */

  bool temporary;               /* downloading a temporary file */
};
1617
1618 static void
free_hstat(struct http_stat * hs)1619 free_hstat (struct http_stat *hs)
1620 {
1621 xfree (hs->newloc);
1622 xfree (hs->remote_time);
1623 xfree (hs->error);
1624 xfree (hs->rderrmsg);
1625 xfree (hs->local_file);
1626 xfree (hs->orig_file_name);
1627 xfree (hs->message);
1628 #ifdef HAVE_METALINK
1629 metalink_delete (hs->metalink);
1630 hs->metalink = NULL;
1631 #endif
1632 }
1633
1634 static void
get_file_flags(const char * filename,int * dt)1635 get_file_flags (const char *filename, int *dt)
1636 {
1637 logprintf (LOG_VERBOSE, _("\
1638 File %s already there; not retrieving.\n\n"), quote (filename));
1639 /* If the file is there, we suppose it's retrieved OK. */
1640 *dt |= RETROKF;
1641
1642 /* #### Bogusness alert. */
1643 /* If its suffix is "html" or "htm" or similar, assume text/html. */
1644 if (has_html_suffix_p (filename))
1645 *dt |= TEXTHTML;
1646 }
1647
1648 /* Download the response body from the socket and writes it to
1649 an output file. The headers have already been read from the
1650 socket. If WARC is enabled, the response body will also be
1651 written to a WARC response record.
1652
1653 hs, contlen, contrange, chunked_transfer_encoding and url are
1654 parameters from the gethttp method. fp is a pointer to the
1655 output file.
1656
1657 url, warc_timestamp_str, warc_request_uuid, warc_ip, type
1658 and statcode will be saved in the headers of the WARC record.
1659 The head parameter contains the HTTP headers of the response.
1660
1661 If fp is NULL and WARC is enabled, the response body will be
1662 written only to the WARC file. If WARC is disabled and fp
1663 is a file pointer, the data will be written to the file.
1664 If fp is a file pointer and WARC is enabled, the body will
1665 be written to both destinations.
1666
1667 Returns the error code. */
1668 static int
read_response_body(struct http_stat * hs,int sock,FILE * fp,wgint contlen,wgint contrange,bool chunked_transfer_encoding,char * url,char * warc_timestamp_str,char * warc_request_uuid,ip_address * warc_ip,char * type,int statcode,char * head)1669 read_response_body (struct http_stat *hs, int sock, FILE *fp, wgint contlen,
1670 wgint contrange, bool chunked_transfer_encoding,
1671 char *url, char *warc_timestamp_str, char *warc_request_uuid,
1672 ip_address *warc_ip, char *type, int statcode, char *head)
1673 {
1674 int warc_payload_offset = 0;
1675 FILE *warc_tmp = NULL;
1676 int warcerr = 0;
1677 int flags = 0;
1678
1679 if (opt.warc_filename != NULL)
1680 {
1681 /* Open a temporary file where we can write the response before we
1682 add it to the WARC record. */
1683 warc_tmp = warc_tempfile ();
1684 if (warc_tmp == NULL)
1685 warcerr = WARC_TMP_FOPENERR;
1686
1687 if (warcerr == 0)
1688 {
1689 /* We should keep the response headers for the WARC record. */
1690 int head_len = strlen (head);
1691 int warc_tmp_written = fwrite (head, 1, head_len, warc_tmp);
1692 if (warc_tmp_written != head_len)
1693 warcerr = WARC_TMP_FWRITEERR;
1694 warc_payload_offset = head_len;
1695 }
1696
1697 if (warcerr != 0)
1698 {
1699 if (warc_tmp != NULL)
1700 fclose (warc_tmp);
1701 return warcerr;
1702 }
1703 }
1704
1705 if (fp != NULL)
1706 {
1707 /* This confuses the timestamping code that checks for file size.
1708 #### The timestamping code should be smarter about file size. */
1709 if (opt.save_headers && hs->restval == 0)
1710 fwrite (head, 1, strlen (head), fp);
1711 }
1712
1713 /* Read the response body. */
1714 if (contlen != -1)
1715 /* If content-length is present, read that much; otherwise, read
1716 until EOF. The HTTP spec doesn't require the server to
1717 actually close the connection when it's done sending data. */
1718 flags |= rb_read_exactly;
1719 if (fp != NULL && hs->restval > 0 && contrange == 0)
1720 /* If the server ignored our range request, instruct fd_read_body
1721 to skip the first RESTVAL bytes of body. */
1722 flags |= rb_skip_startpos;
1723 if (chunked_transfer_encoding)
1724 flags |= rb_chunked_transfer_encoding;
1725
1726 if (hs->remote_encoding == ENC_GZIP)
1727 flags |= rb_compressed_gzip;
1728
1729 hs->len = hs->restval;
1730 hs->rd_size = 0;
1731 /* Download the response body and write it to fp.
1732 If we are working on a WARC file, we simultaneously write the
1733 response body to warc_tmp. */
1734 hs->res = fd_read_body (hs->local_file, sock, fp, contlen != -1 ? contlen : 0,
1735 hs->restval, &hs->rd_size, &hs->len, &hs->dltime,
1736 flags, warc_tmp);
1737 if (hs->res >= 0)
1738 {
1739 if (warc_tmp != NULL)
1740 {
1741 /* Create a response record and write it to the WARC file.
1742 Note: per the WARC standard, the request and response should share
1743 the same date header. We re-use the timestamp of the request.
1744 The response record should also refer to the uuid of the request. */
1745 bool r = warc_write_response_record (url, warc_timestamp_str,
1746 warc_request_uuid, warc_ip,
1747 warc_tmp, warc_payload_offset,
1748 type, statcode, hs->newloc);
1749
1750 /* warc_write_response_record has closed warc_tmp. */
1751
1752 if (! r)
1753 return WARC_ERR;
1754 }
1755
1756 return RETRFINISHED;
1757 }
1758
1759 if (warc_tmp != NULL)
1760 fclose (warc_tmp);
1761
1762 if (hs->res == -2)
1763 {
1764 /* Error while writing to fd. */
1765 return FWRITEERR;
1766 }
1767 else if (hs->res == -3)
1768 {
1769 /* Error while writing to warc_tmp. */
1770 return WARC_TMP_FWRITEERR;
1771 }
1772 else
1773 {
1774 /* A read error! */
1775 xfree (hs->rderrmsg);
1776 hs->rderrmsg = xstrdup (fd_errstr (sock));
1777 return RETRFINISHED;
1778 }
1779 }
1780
/* Evaluate to nonzero if LINE starts with STRING_CONSTANT, compared
   with c_strncasecmp, and the character right after the match is
   whitespace or the terminating NUL.  The length of STRING_CONSTANT is
   taken with sizeof, so it is expected to be a string literal.  LINE is
   expanded more than once.  */
#define BEGINS_WITH(line, string_constant)                               \
  (!c_strncasecmp (line, string_constant, sizeof (string_constant) - 1)  \
   && (c_isspace (line[sizeof (string_constant) - 1])                    \
       || !line[sizeof (string_constant) - 1]))

/* Set the User-Agent header of REQ.  When opt.useragent is NULL the
   header is "Wget/" followed by version_string, in a string freshly
   built by aprintf and passed with rel_value.  When opt.useragent is a
   non-empty string it is used as is and passed with rel_none.  When
   opt.useragent is the empty string no User-Agent header is set.
   NOTE(review): rel_value presumably makes the request responsible for
   freeing the value -- confirm against request_set_header.  */
#define SET_USER_AGENT(req) do {                                         \
  if (!opt.useragent)                                                    \
    request_set_header (req, "User-Agent",                               \
                        aprintf ("Wget/%s",                              \
                                 version_string),                        \
                        rel_value);                                      \
  else if (*opt.useragent)                                               \
    request_set_header (req, "User-Agent", opt.useragent, rel_none);     \
} while (0)
1795
1796 /*
1797 Convert time_t to one of valid HTTP date formats
1798 ie. rfc1123-date.
1799
1800 HTTP-date = rfc1123-date | rfc850-date | asctime-date
1801 rfc1123-date = wkday "," SP date1 SP time SP "GMT"
1802 rfc850-date = weekday "," SP date2 SP time SP "GMT"
1803 asctime-date = wkday SP date3 SP time SP 4DIGIT
1804 date1 = 2DIGIT SP month SP 4DIGIT
1805 ; day month year (e.g., 02 Jun 1982)
1806 date2 = 2DIGIT "-" month "-" 2DIGIT
1807 ; day-month-year (e.g., 02-Jun-82)
1808 date3 = month SP ( 2DIGIT | ( SP 1DIGIT ))
1809 ; month day (e.g., Jun 2)
1810 time = 2DIGIT ":" 2DIGIT ":" 2DIGIT
1811 ; 00:00:00 - 23:59:59
1812 wkday = "Mon" | "Tue" | "Wed"
1813 | "Thu" | "Fri" | "Sat" | "Sun"
1814 weekday = "Monday" | "Tuesday" | "Wednesday"
1815 | "Thursday" | "Friday" | "Saturday" | "Sunday"
1816 month = "Jan" | "Feb" | "Mar" | "Apr"
1817 | "May" | "Jun" | "Jul" | "Aug"
1818 | "Sep" | "Oct" | "Nov" | "Dec"
1819
1820 source: RFC2616 */
1821 static uerr_t
time_to_rfc1123(time_t time,char * buf,size_t bufsize)1822 time_to_rfc1123 (time_t time, char *buf, size_t bufsize)
1823 {
1824 static const char *wkday[] = { "Sun", "Mon", "Tue", "Wed",
1825 "Thu", "Fri", "Sat" };
1826 static const char *month[] = { "Jan", "Feb", "Mar", "Apr",
1827 "May", "Jun", "Jul", "Aug",
1828 "Sep", "Oct", "Nov", "Dec" };
1829 /* rfc1123 example: Thu, 01 Jan 1998 22:12:57 GMT */
1830 static const char *time_format = "%s, %02d %s %04d %02d:%02d:%02d GMT";
1831
1832 struct tm *gtm = gmtime (&time);
1833 if (!gtm)
1834 {
1835 logprintf (LOG_NOTQUIET,
1836 _("gmtime failed. This is probably a bug.\n"));
1837 return TIMECONV_ERR;
1838 }
1839
1840 snprintf (buf, bufsize, time_format, wkday[gtm->tm_wday],
1841 gtm->tm_mday, month[gtm->tm_mon],
1842 gtm->tm_year + 1900, gtm->tm_hour,
1843 gtm->tm_min, gtm->tm_sec);
1844
1845 return RETROK;
1846 }
1847
1848 static struct request *
initialize_request(const struct url * u,struct http_stat * hs,int * dt,struct url * proxy,bool inhibit_keep_alive,bool * basic_auth_finished,wgint * body_data_size,char ** user,char ** passwd,uerr_t * ret)1849 initialize_request (const struct url *u, struct http_stat *hs, int *dt, struct url *proxy,
1850 bool inhibit_keep_alive, bool *basic_auth_finished,
1851 wgint *body_data_size, char **user, char **passwd, uerr_t *ret)
1852 {
1853 bool head_only = !!(*dt & HEAD_ONLY);
1854 struct request *req;
1855
1856 /* Prepare the request to send. */
1857 {
1858 char *meth_arg;
1859 const char *meth = "GET";
1860 if (head_only)
1861 meth = "HEAD";
1862 else if (opt.method)
1863 meth = opt.method;
1864 /* Use the full path, i.e. one that includes the leading slash and
1865 the query string. E.g. if u->path is "foo/bar" and u->query is
1866 "param=value", full_path will be "/foo/bar?param=value". */
1867 if (proxy
1868 #ifdef HAVE_SSL
1869 /* When using SSL over proxy, CONNECT establishes a direct
1870 connection to the HTTPS server. Therefore use the same
1871 argument as when talking to the server directly. */
1872 && u->scheme != SCHEME_HTTPS
1873 #endif
1874 )
1875 meth_arg = xstrdup (u->url);
1876 else
1877 meth_arg = url_full_path (u);
1878 req = request_new (meth, meth_arg);
1879 }
1880
1881 /* Generate the Host header, HOST:PORT. Take into account that:
1882
1883 - Broken server-side software often doesn't recognize the PORT
1884 argument, so we must generate "Host: www.server.com" instead of
1885 "Host: www.server.com:80" (and likewise for https port).
1886
1887 - IPv6 addresses contain ":", so "Host: 3ffe:8100:200:2::2:1234"
1888 becomes ambiguous and needs to be rewritten as "Host:
1889 [3ffe:8100:200:2::2]:1234". */
1890 {
1891 /* Formats arranged for hfmt[add_port][add_squares]. */
1892 static const char *hfmt[][2] = {
1893 { "%s", "[%s]" }, { "%s:%d", "[%s]:%d" }
1894 };
1895 int add_port = u->port != scheme_default_port (u->scheme);
1896 int add_squares = strchr (u->host, ':') != NULL;
1897 request_set_header (req, "Host",
1898 aprintf (hfmt[add_port][add_squares], u->host, u->port),
1899 rel_value);
1900 }
1901
1902 request_set_header (req, "Referer", hs->referer, rel_none);
1903 if (*dt & SEND_NOCACHE)
1904 {
1905 /* Cache-Control MUST be obeyed by all HTTP/1.1 caching mechanisms... */
1906 request_set_header (req, "Cache-Control", "no-cache", rel_none);
1907
1908 /* ... but some HTTP/1.0 caches doesn't implement Cache-Control. */
1909 request_set_header (req, "Pragma", "no-cache", rel_none);
1910 }
1911 if (*dt & IF_MODIFIED_SINCE)
1912 {
1913 char strtime[32];
1914 uerr_t err = time_to_rfc1123 (hs->orig_file_tstamp, strtime, countof (strtime));
1915
1916 if (err != RETROK)
1917 {
1918 logputs (LOG_VERBOSE, _("Cannot convert timestamp to http format. "
1919 "Falling back to time 0 as last modification "
1920 "time.\n"));
1921 strcpy (strtime, "Thu, 01 Jan 1970 00:00:00 GMT");
1922 }
1923 request_set_header (req, "If-Modified-Since", xstrdup (strtime), rel_value);
1924 }
1925 if (hs->restval)
1926 request_set_header (req, "Range",
1927 aprintf ("bytes=%s-",
1928 number_to_static_string (hs->restval)),
1929 rel_value);
1930 SET_USER_AGENT (req);
1931 request_set_header (req, "Accept", "*/*", rel_none);
1932 #ifdef HAVE_LIBZ
1933 if (opt.compression != compression_none)
1934 request_set_header (req, "Accept-Encoding", "gzip", rel_none);
1935 else
1936 #endif
1937 request_set_header (req, "Accept-Encoding", "identity", rel_none);
1938
1939 /* Find the username with priority */
1940 if (u->user)
1941 *user = u->user;
1942 else if (opt.user && (opt.use_askpass || opt.ask_passwd))
1943 *user = opt.user;
1944 else if (opt.http_user)
1945 *user = opt.http_user;
1946 else if (opt.user)
1947 *user = opt.user;
1948 else
1949 *user = NULL;
1950
1951 /* Find the password with priority */
1952 if (u->passwd)
1953 *passwd = u->passwd;
1954 else if (opt.passwd && (opt.use_askpass || opt.ask_passwd))
1955 *passwd = opt.passwd;
1956 else if (opt.http_passwd)
1957 *passwd = opt.http_passwd;
1958 else if (opt.passwd)
1959 *passwd = opt.passwd;
1960 else
1961 *passwd = NULL;
1962
1963 /* Check for ~/.netrc if none of the above match */
1964 if (opt.netrc && (!*user || !*passwd))
1965 search_netrc (u->host, (const char **) user, (const char **) passwd, 0, NULL);
1966
1967 /* We only do "site-wide" authentication with "global" user/password
1968 * values unless --auth-no-challenge has been requested; URL user/password
1969 * info overrides. */
1970 if (*user && *passwd && (!u->user || opt.auth_without_challenge))
1971 {
1972 /* If this is a host for which we've already received a Basic
1973 * challenge, we'll go ahead and send Basic authentication creds. */
1974 *basic_auth_finished = maybe_send_basic_creds (u->host, *user, *passwd, req);
1975 }
1976
1977 if (inhibit_keep_alive)
1978 request_set_header (req, "Connection", "Close", rel_none);
1979 else
1980 {
1981 request_set_header (req, "Connection", "Keep-Alive", rel_none);
1982 if (proxy)
1983 request_set_header (req, "Proxy-Connection", "Keep-Alive", rel_none);
1984 }
1985
1986 if (opt.method)
1987 {
1988
1989 if (opt.body_data || opt.body_file)
1990 {
1991 request_set_header (req, "Content-Type",
1992 "application/x-www-form-urlencoded", rel_none);
1993
1994 if (opt.body_data)
1995 *body_data_size = strlen (opt.body_data);
1996 else
1997 {
1998 *body_data_size = file_size (opt.body_file);
1999 if (*body_data_size == -1)
2000 {
2001 logprintf (LOG_NOTQUIET, _("BODY data file %s missing: %s\n"),
2002 quote (opt.body_file), strerror (errno));
2003 request_free (&req);
2004 *ret = FILEBADFILE;
2005 return NULL;
2006 }
2007 }
2008 request_set_header (req, "Content-Length",
2009 xstrdup (number_to_static_string (*body_data_size)),
2010 rel_value);
2011 }
2012 else if (c_strcasecmp (opt.method, "post") == 0
2013 || c_strcasecmp (opt.method, "put") == 0
2014 || c_strcasecmp (opt.method, "patch") == 0)
2015 request_set_header (req, "Content-Length", "0", rel_none);
2016 }
2017 return req;
2018 }
2019
2020 static void
initialize_proxy_configuration(const struct url * u,struct request * req,struct url * proxy,char ** proxyauth)2021 initialize_proxy_configuration (const struct url *u, struct request *req,
2022 struct url *proxy, char **proxyauth)
2023 {
2024 char *proxy_user, *proxy_passwd;
2025 /* For normal username and password, URL components override
2026 command-line/wgetrc parameters. With proxy
2027 authentication, it's the reverse, because proxy URLs are
2028 normally the "permanent" ones, so command-line args
2029 should take precedence. */
2030 if (opt.proxy_user && opt.proxy_passwd)
2031 {
2032 proxy_user = opt.proxy_user;
2033 proxy_passwd = opt.proxy_passwd;
2034 }
2035 else
2036 {
2037 proxy_user = proxy->user;
2038 proxy_passwd = proxy->passwd;
2039 }
2040 /* #### This does not appear right. Can't the proxy request,
2041 say, `Digest' authentication? */
2042 if (proxy_user && proxy_passwd)
2043 *proxyauth = basic_authentication_encode (proxy_user, proxy_passwd);
2044
2045 /* Proxy authorization over SSL is handled below. */
2046 #ifdef HAVE_SSL
2047 if (u->scheme != SCHEME_HTTPS)
2048 #endif
2049 request_set_header (req, "Proxy-Authorization", *proxyauth, rel_value);
2050 }
2051
2052 static uerr_t
establish_connection(const struct url * u,const struct url ** conn_ref,struct http_stat * hs,struct url * proxy,char ** proxyauth,struct request ** req_ref,bool * using_ssl,bool inhibit_keep_alive,int * sock_ref)2053 establish_connection (const struct url *u, const struct url **conn_ref,
2054 struct http_stat *hs, struct url *proxy,
2055 char **proxyauth,
2056 struct request **req_ref, bool *using_ssl,
2057 bool inhibit_keep_alive,
2058 int *sock_ref)
2059 {
2060 bool host_lookup_failed = false;
2061 int sock = *sock_ref;
2062 struct request *req = *req_ref;
2063 const struct url *conn = *conn_ref;
2064 struct response *resp;
2065 int write_error;
2066 int statcode;
2067
2068 if (! inhibit_keep_alive)
2069 {
2070 /* Look for a persistent connection to target host, unless a
2071 proxy is used. The exception is when SSL is in use, in which
2072 case the proxy is nothing but a passthrough to the target
2073 host, registered as a connection to the latter. */
2074 const struct url *relevant = conn;
2075 #ifdef HAVE_SSL
2076 if (u->scheme == SCHEME_HTTPS)
2077 relevant = u;
2078 #endif
2079
2080 if (persistent_available_p (relevant->host, relevant->port,
2081 #ifdef HAVE_SSL
2082 relevant->scheme == SCHEME_HTTPS,
2083 #else
2084 0,
2085 #endif
2086 &host_lookup_failed))
2087 {
2088 int family = socket_family (pconn.socket, ENDPOINT_PEER);
2089 sock = pconn.socket;
2090 *using_ssl = pconn.ssl;
2091 #if ENABLE_IPV6
2092 if (family == AF_INET6)
2093 logprintf (LOG_VERBOSE, _("Reusing existing connection to [%s]:%d.\n"),
2094 quotearg_style (escape_quoting_style, pconn.host),
2095 pconn.port);
2096 else
2097 #endif
2098 logprintf (LOG_VERBOSE, _("Reusing existing connection to %s:%d.\n"),
2099 quotearg_style (escape_quoting_style, pconn.host),
2100 pconn.port);
2101 DEBUGP (("Reusing fd %d.\n", sock));
2102 if (pconn.authorized)
2103 /* If the connection is already authorized, the "Basic"
2104 authorization added by code above is unnecessary and
2105 only hurts us. */
2106 request_remove_header (req, "Authorization");
2107 }
2108 else if (host_lookup_failed)
2109 {
2110 logprintf(LOG_NOTQUIET,
2111 _("%s: unable to resolve host address %s\n"),
2112 exec_name, quote (relevant->host));
2113 return HOSTERR;
2114 }
2115 else if (sock != -1)
2116 {
2117 sock = -1;
2118 }
2119 }
2120
2121 if (sock < 0)
2122 {
2123 sock = connect_to_host (conn->host, conn->port);
2124 if (sock == E_HOST)
2125 return HOSTERR;
2126 else if (sock < 0)
2127 return (retryable_socket_connect_error (errno)
2128 ? CONERROR : CONIMPOSSIBLE);
2129
2130 #ifdef HAVE_SSL
2131 if (proxy && u->scheme == SCHEME_HTTPS)
2132 {
2133 char *head;
2134 char *message;
2135 /* When requesting SSL URLs through proxies, use the
2136 CONNECT method to request passthrough. */
2137 struct request *connreq = request_new ("CONNECT",
2138 aprintf ("%s:%d", u->host, u->port));
2139 SET_USER_AGENT (connreq);
2140 if (proxyauth)
2141 {
2142 request_set_header (connreq, "Proxy-Authorization",
2143 *proxyauth, rel_value);
2144 /* Now that PROXYAUTH is part of the CONNECT request,
2145 zero it out so we don't send proxy authorization with
2146 the regular request below. */
2147 *proxyauth = NULL;
2148 }
2149 request_set_header (connreq, "Host",
2150 aprintf ("%s:%d", u->host, u->port),
2151 rel_value);
2152
2153 write_error = request_send (connreq, sock, 0);
2154 request_free (&connreq);
2155 if (write_error < 0)
2156 {
2157 CLOSE_INVALIDATE (sock);
2158 return WRITEFAILED;
2159 }
2160
2161 head = read_http_response_head (sock);
2162 if (!head)
2163 {
2164 logprintf (LOG_VERBOSE, _("Failed reading proxy response: %s\n"),
2165 fd_errstr (sock));
2166 CLOSE_INVALIDATE (sock);
2167 return HERR;
2168 }
2169 message = NULL;
2170 if (!*head)
2171 {
2172 xfree (head);
2173 goto failed_tunnel;
2174 }
2175 DEBUGP (("proxy responded with: [%s]\n", head));
2176
2177 resp = resp_new (head);
2178 statcode = resp_status (resp, &message);
2179 if (statcode < 0)
2180 {
2181 char *tms = datetime_str (time (NULL));
2182 logprintf (LOG_VERBOSE, "%d\n", statcode);
2183 logprintf (LOG_NOTQUIET, _("%s ERROR %d: %s.\n"), tms, statcode,
2184 quotearg_style (escape_quoting_style,
2185 _("Malformed status line")));
2186 xfree (head);
2187 return HERR;
2188 }
2189 xfree (hs->message);
2190 hs->message = xstrdup (message);
2191 resp_free (&resp);
2192 xfree (head);
2193 if (statcode != 200)
2194 {
2195 failed_tunnel:
2196 logprintf (LOG_NOTQUIET, _("Proxy tunneling failed: %s"),
2197 message ? quotearg_style (escape_quoting_style, message) : "?");
2198 xfree (message);
2199 return CONSSLERR;
2200 }
2201 xfree (message);
2202
2203 /* SOCK is now *really* connected to u->host, so update CONN
2204 to reflect this. That way register_persistent will
2205 register SOCK as being connected to u->host:u->port. */
2206 conn = u;
2207 }
2208
2209 if (conn->scheme == SCHEME_HTTPS)
2210 {
2211 if (!ssl_connect_wget (sock, u->host, NULL))
2212 {
2213 CLOSE_INVALIDATE (sock);
2214 return CONSSLERR;
2215 }
2216 else if (!ssl_check_certificate (sock, u->host))
2217 {
2218 CLOSE_INVALIDATE (sock);
2219 return VERIFCERTERR;
2220 }
2221 *using_ssl = true;
2222 }
2223 #endif /* HAVE_SSL */
2224 }
2225 *conn_ref = conn;
2226 *req_ref = req;
2227 *sock_ref = sock;
2228 return RETROK;
2229 }
2230
2231 static uerr_t
set_file_timestamp(struct http_stat * hs)2232 set_file_timestamp (struct http_stat *hs)
2233 {
2234 bool local_dot_orig_file_exists = false;
2235 char *local_filename = NULL;
2236 struct stat st;
2237 char buf[1024];
2238
2239 if (opt.backup_converted)
2240 /* If -K is specified, we'll act on the assumption that it was specified
2241 last time these files were downloaded as well, and instead of just
2242 comparing local file X against server file X, we'll compare local
2243 file X.orig (if extant, else X) against server file X. If -K
2244 _wasn't_ specified last time, or the server contains files called
2245 *.orig, -N will be back to not operating correctly with -k. */
2246 {
2247 size_t filename_len = strlen (hs->local_file);
2248 char *filename_plus_orig_suffix;
2249
2250 if (filename_len + sizeof (ORIG_SFX) > sizeof (buf))
2251 filename_plus_orig_suffix = xmalloc (filename_len + sizeof (ORIG_SFX));
2252 else
2253 filename_plus_orig_suffix = buf;
2254
2255 /* Would a single s[n]printf() call be faster? --dan
2256
2257 Definitely not. sprintf() is horribly slow. It's a
2258 different question whether the difference between the two
2259 affects a program. Usually I'd say "no", but at one
2260 point I profiled Wget, and found that a measurable and
2261 non-negligible amount of time was lost calling sprintf()
2262 in url.c. Replacing sprintf with inline calls to
2263 strcpy() and number_to_string() made a difference.
2264 --hniksic */
2265 memcpy (filename_plus_orig_suffix, hs->local_file, filename_len);
2266 memcpy (filename_plus_orig_suffix + filename_len,
2267 ORIG_SFX, sizeof (ORIG_SFX));
2268
2269 /* Try to stat() the .orig file. */
2270 if (stat (filename_plus_orig_suffix, &st) == 0)
2271 {
2272 local_dot_orig_file_exists = true;
2273 local_filename = filename_plus_orig_suffix;
2274 }
2275 }
2276
2277 if (!local_dot_orig_file_exists)
2278 /* Couldn't stat() <file>.orig, so try to stat() <file>. */
2279 if (stat (hs->local_file, &st) == 0)
2280 {
2281 if (local_filename != buf)
2282 xfree (local_filename);
2283 local_filename = hs->local_file;
2284 }
2285
2286 if (local_filename != NULL)
2287 /* There was a local file, so we'll check later to see if the version
2288 the server has is the same version we already have, allowing us to
2289 skip a download. */
2290 {
2291 if (local_filename == buf || local_filename == hs->local_file)
2292 hs->orig_file_name = xstrdup (local_filename); // on stack or a copy, make a heap copy
2293 else
2294 hs->orig_file_name = local_filename; // was previously malloc'ed
2295 hs->orig_file_size = st.st_size;
2296 hs->orig_file_tstamp = st.st_mtime;
2297 #ifdef WINDOWS
2298 /* Modification time granularity is 2 seconds for Windows, so
2299 increase local time by 1 second for later comparison. */
2300 ++hs->orig_file_tstamp;
2301 #endif
2302 hs->timestamp_checked = true;
2303 }
2304
2305 return RETROK;
2306 }
2307
2308 static uerr_t
check_file_output(const struct url * u,struct http_stat * hs,struct response * resp,char * hdrval,size_t hdrsize)2309 check_file_output (const struct url *u, struct http_stat *hs,
2310 struct response *resp, char *hdrval, size_t hdrsize)
2311 {
2312 /* Determine the local filename if needed. Notice that if -O is used
2313 * hstat.local_file is set by http_loop to the argument of -O. */
2314 if (!hs->local_file)
2315 {
2316 char *local_file = NULL;
2317
2318 /* Honor Content-Disposition whether possible. */
2319 if (!opt.content_disposition
2320 || !resp_header_copy (resp, "Content-Disposition",
2321 hdrval, hdrsize)
2322 || !parse_content_disposition (hdrval, &local_file))
2323 {
2324 /* The Content-Disposition header is missing or broken.
2325 * Choose unique file name according to given URL. */
2326 hs->local_file = url_file_name (u, NULL);
2327 }
2328 else
2329 {
2330 DEBUGP (("Parsed filename from Content-Disposition: %s\n",
2331 local_file));
2332 hs->local_file = url_file_name (u, local_file);
2333 }
2334
2335 xfree (local_file);
2336 }
2337
2338 hs->temporary = opt.delete_after || opt.spider || !acceptable (hs->local_file);
2339 if (hs->temporary)
2340 {
2341 char *tmp = aprintf ("%s.tmp", hs->local_file);
2342 xfree (hs->local_file);
2343 hs->local_file = tmp;
2344 }
2345
2346 /* TODO: perform this check only once. */
2347 if (!hs->existence_checked && file_exists_p (hs->local_file, NULL))
2348 {
2349 if (opt.noclobber && !opt.output_document)
2350 {
2351 /* If opt.noclobber is turned on and file already exists, do not
2352 retrieve the file. But if the output_document was given, then this
2353 test was already done and the file didn't exist. Hence the !opt.output_document */
2354 return RETRUNNEEDED;
2355 }
2356 else if (!ALLOW_CLOBBER)
2357 {
2358 char *unique = unique_name_passthrough (hs->local_file);
2359 if (unique != hs->local_file)
2360 xfree (hs->local_file);
2361 hs->local_file = unique;
2362 }
2363 }
2364 hs->existence_checked = true;
2365
2366 /* Support timestamping */
2367 if (opt.timestamping && !hs->timestamp_checked)
2368 {
2369 uerr_t timestamp_err = set_file_timestamp (hs);
2370 if (timestamp_err != RETROK)
2371 return timestamp_err;
2372 }
2373 return RETROK;
2374 }
2375
2376 static uerr_t
check_auth(const struct url * u,char * user,char * passwd,struct response * resp,struct request * req,bool * ntlm_seen_ref,bool * retry,bool * basic_auth_finished_ref,bool * auth_finished_ref)2377 check_auth (const struct url *u, char *user, char *passwd, struct response *resp,
2378 struct request *req, bool *ntlm_seen_ref, bool *retry,
2379 bool *basic_auth_finished_ref, bool *auth_finished_ref)
2380 {
2381 uerr_t auth_err = RETROK;
2382 bool basic_auth_finished = *basic_auth_finished_ref;
2383 bool auth_finished = *auth_finished_ref;
2384 bool ntlm_seen = *ntlm_seen_ref;
2385 char buf[256], *tmp = NULL;
2386
2387 *retry = false;
2388
2389 if (!auth_finished && (user && passwd))
2390 {
2391 /* IIS sends multiple copies of WWW-Authenticate, one with
2392 the value "negotiate", and other(s) with data. Loop over
2393 all the occurrences and pick the one we recognize. */
2394 int wapos;
2395 const char *www_authenticate = NULL;
2396 const char *wabeg, *waend;
2397 const char *digest = NULL, *basic = NULL, *ntlm = NULL;
2398
2399 for (wapos = 0; !ntlm
2400 && (wapos = resp_header_locate (resp, "WWW-Authenticate", wapos,
2401 &wabeg, &waend)) != -1;
2402 ++wapos)
2403 {
2404 param_token name, value;
2405 size_t len = waend - wabeg;
2406
2407 if (tmp != buf)
2408 xfree (tmp);
2409
2410 if (len < sizeof (buf))
2411 tmp = buf;
2412 else
2413 tmp = xmalloc (len + 1);
2414
2415 memcpy (tmp, wabeg, len);
2416 tmp[len] = 0;
2417
2418 www_authenticate = tmp;
2419
2420 for (;!ntlm;)
2421 {
2422 /* extract the auth-scheme */
2423 while (c_isspace (*www_authenticate)) www_authenticate++;
2424 name.e = name.b = www_authenticate;
2425 while (*name.e && !c_isspace (*name.e)) name.e++;
2426
2427 if (name.b == name.e)
2428 break;
2429
2430 DEBUGP (("Auth scheme found '%.*s'\n", (int) (name.e - name.b), name.b));
2431
2432 if (known_authentication_scheme_p (name.b, name.e))
2433 {
2434 if (BEGINS_WITH (name.b, "NTLM"))
2435 {
2436 ntlm = name.b;
2437 break; /* this is the most secure challenge, stop here */
2438 }
2439 else if (!digest && BEGINS_WITH (name.b, "Digest"))
2440 digest = name.b;
2441 else if (!basic && BEGINS_WITH (name.b, "Basic"))
2442 basic = name.b;
2443 }
2444
2445 /* now advance over the auth-params */
2446 www_authenticate = name.e;
2447 DEBUGP (("Auth param list '%s'\n", www_authenticate));
2448 while (extract_param (&www_authenticate, &name, &value, ',', NULL) && name.b && value.b)
2449 {
2450 DEBUGP (("Auth param %.*s=%.*s\n",
2451 (int) (name.e - name.b), name.b, (int) (value.e - value.b), value.b));
2452 }
2453 }
2454 }
2455
2456 if (!basic && !digest && !ntlm)
2457 {
2458 /* If the authentication header is missing or
2459 unrecognized, there's no sense in retrying. */
2460 logputs (LOG_NOTQUIET, _("Unknown authentication scheme.\n"));
2461 }
2462 else if (!basic_auth_finished
2463 || !basic)
2464 {
2465 char *pth = url_full_path (u);
2466 const char *value;
2467 uerr_t *auth_stat;
2468 auth_stat = xmalloc (sizeof (uerr_t));
2469 *auth_stat = RETROK;
2470
2471 if (ntlm)
2472 www_authenticate = ntlm;
2473 else if (digest)
2474 www_authenticate = digest;
2475 else
2476 www_authenticate = basic;
2477
2478 logprintf (LOG_NOTQUIET, _("Authentication selected: %s\n"), www_authenticate);
2479
2480 value = create_authorization_line (www_authenticate,
2481 user, passwd,
2482 request_method (req),
2483 pth,
2484 &auth_finished,
2485 auth_stat);
2486
2487 auth_err = *auth_stat;
2488 xfree (auth_stat);
2489 xfree (pth);
2490 if (auth_err == RETROK)
2491 {
2492 request_set_header (req, "Authorization", value, rel_value);
2493
2494 if (BEGINS_WITH (www_authenticate, "NTLM"))
2495 ntlm_seen = true;
2496 else if (!u->user && BEGINS_WITH (www_authenticate, "Basic"))
2497 {
2498 /* Need to register this host as using basic auth,
2499 * so we automatically send creds next time. */
2500 register_basic_auth_host (u->host);
2501 }
2502
2503 *retry = true;
2504 goto cleanup;
2505 }
2506 else
2507 {
2508 /* Creating the Authorization header went wrong */
2509 xfree (value);
2510 }
2511 }
2512 else
2513 {
2514 /* We already did Basic auth, and it failed. Gotta
2515 * give up. */
2516 }
2517 }
2518
2519 cleanup:
2520 if (tmp != buf)
2521 xfree (tmp);
2522 *ntlm_seen_ref = ntlm_seen;
2523 *basic_auth_finished_ref = basic_auth_finished;
2524 *auth_finished_ref = auth_finished;
2525 return auth_err;
2526 }
2527
2528 static uerr_t
open_output_stream(struct http_stat * hs,int count,FILE ** fp)2529 open_output_stream (struct http_stat *hs, int count, FILE **fp)
2530 {
2531 /* 2005-06-17 SMS.
2532 For VMS, define common fopen() optional arguments.
2533 */
2534 #ifdef __VMS
2535 # define FOPEN_OPT_ARGS "fop=sqo", "acc", acc_cb, &open_id
2536 # define FOPEN_BIN_FLAG 3
2537 #else /* def __VMS */
2538 # define FOPEN_BIN_FLAG true
2539 #endif /* def __VMS [else] */
2540
2541 /* Open the local file. */
2542 if (!output_stream)
2543 {
2544 mkalldirs (hs->local_file);
2545 if (opt.backups)
2546 rotate_backups (hs->local_file);
2547 if (hs->restval)
2548 {
2549 #ifdef __VMS
2550 int open_id;
2551
2552 open_id = 21;
2553 *fp = fopen (hs->local_file, "ab", FOPEN_OPT_ARGS);
2554 #else /* def __VMS */
2555 *fp = fopen (hs->local_file, "ab");
2556 #endif /* def __VMS [else] */
2557 }
2558 else if (ALLOW_CLOBBER || count > 0)
2559 {
2560 if (opt.unlink_requested && file_exists_p (hs->local_file, NULL))
2561 {
2562 if (unlink (hs->local_file) < 0)
2563 {
2564 logprintf (LOG_NOTQUIET, "%s: %s\n", hs->local_file,
2565 strerror (errno));
2566 return UNLINKERR;
2567 }
2568 }
2569
2570 #ifdef __VMS
2571 int open_id;
2572
2573 open_id = 22;
2574 *fp = fopen (hs->local_file, "wb", FOPEN_OPT_ARGS);
2575 #else /* def __VMS */
2576 if (hs->temporary)
2577 {
2578 *fp = fdopen (open (hs->local_file, O_BINARY | O_CREAT | O_TRUNC | O_WRONLY, S_IRUSR | S_IWUSR), "wb");
2579 }
2580 else
2581 {
2582 *fp = fopen (hs->local_file, "wb");
2583 }
2584
2585 #endif /* def __VMS [else] */
2586 }
2587 else
2588 {
2589 *fp = fopen_excl (hs->local_file, FOPEN_BIN_FLAG);
2590 if (!*fp && errno == EEXIST)
2591 {
2592 /* We cannot just invent a new name and use it (which is
2593 what functions like unique_create typically do)
2594 because we told the user we'd use this name.
2595 Instead, return and retry the download. */
2596 logprintf (LOG_NOTQUIET,
2597 _("%s has sprung into existence.\n"),
2598 hs->local_file);
2599 return FOPEN_EXCL_ERR;
2600 }
2601 }
2602 if (!*fp)
2603 {
2604 logprintf (LOG_NOTQUIET, "%s: %s\n", hs->local_file, strerror (errno));
2605 return FOPENERR;
2606 }
2607 }
2608 else
2609 *fp = output_stream;
2610
2611 /* Print fetch message, if opt.verbose. */
2612 logprintf (LOG_VERBOSE, _("Saving to: %s\n"),
2613 HYPHENP (hs->local_file) ? quote ("STDOUT") : quote (hs->local_file));
2614
2615 return RETROK;
2616 }
2617
2618 /* Set proper type flags based on type string. */
2619 static void
set_content_type(int * dt,const char * type)2620 set_content_type (int *dt, const char *type)
2621 {
2622 /* If content-type is not given, assume text/html. This is because
2623 of the multitude of broken CGI's that "forget" to generate the
2624 content-type. */
2625 if (!type ||
2626 0 == c_strcasecmp (type, TEXTHTML_S) ||
2627 0 == c_strcasecmp (type, TEXTXHTML_S))
2628 *dt |= TEXTHTML;
2629 else
2630 *dt &= ~TEXTHTML;
2631
2632 if (type &&
2633 0 == c_strcasecmp (type, TEXTCSS_S))
2634 *dt |= TEXTCSS;
2635 else
2636 *dt &= ~TEXTCSS;
2637 }
2638
2639 #ifdef HAVE_METALINK
2640 /* Will return proper metalink_t structure if enough data was found in
2641 http response resp. Otherwise returns NULL.
2642 Two exit points: one for success and one for failure. */
2643 static metalink_t *
metalink_from_http(const struct response * resp,const struct http_stat * hs,const struct url * u)2644 metalink_from_http (const struct response *resp, const struct http_stat *hs,
2645 const struct url *u)
2646 {
2647 metalink_t *metalink = NULL;
2648 metalink_file_t *mfile = xnew0 (metalink_file_t);
2649 const char *val_beg, *val_end;
2650 int res_count = 0, meta_count = 0, hash_count = 0, sig_count = 0, i;
2651
2652 DEBUGP (("Checking for Metalink in HTTP response\n"));
2653
2654 /* Initialize metalink file for our simple use case. */
2655 if (hs->local_file)
2656 mfile->name = xstrdup (hs->local_file);
2657 else
2658 mfile->name = url_file_name (u, NULL);
2659
2660 /* Begin with 1-element array (for 0-termination). */
2661 mfile->checksums = xnew0 (metalink_checksum_t *);
2662 mfile->resources = xnew0 (metalink_resource_t *);
2663 mfile->metaurls = xnew0 (metalink_metaurl_t *);
2664
2665 /* Process the Content-Type header. */
2666 if (resp_header_locate (resp, "Content-Type", 0, &val_beg, &val_end) != -1)
2667 {
2668 metalink_metaurl_t murl = {0};
2669
2670 const char *type_beg, *type_end;
2671 char *typestr = NULL;
2672 char *namestr = NULL;
2673 size_t type_len;
2674
2675 DEBUGP (("Processing Content-Type header...\n"));
2676
2677 /* Find beginning of type. */
2678 type_beg = val_beg;
2679 while (type_beg < val_end && c_isspace (*type_beg))
2680 type_beg++;
2681
2682 /* Find end of type. */
2683 type_end = type_beg + 1;
2684 while (type_end < val_end &&
2685 *type_end != ';' &&
2686 *type_end != ' ' &&
2687 *type_end != '\r' &&
2688 *type_end != '\n')
2689 type_end++;
2690
2691 if (type_beg >= val_end || type_end > val_end)
2692 {
2693 DEBUGP (("Invalid Content-Type header. Ignoring.\n"));
2694 goto skip_content_type;
2695 }
2696
2697 type_len = type_end - type_beg;
2698 typestr = xstrndup (type_beg, type_len);
2699
2700 DEBUGP (("Content-Type: %s\n", typestr));
2701
2702 if (strcmp (typestr, "application/metalink4+xml"))
2703 {
2704 xfree (typestr);
2705 goto skip_content_type;
2706 }
2707
2708 /*
2709 Valid ranges for the "pri" attribute are from
2710 1 to 999999. Mirror servers with a lower value of the "pri"
2711 attribute have a higher priority, while mirrors with an undefined
2712 "pri" attribute are considered to have a value of 999999, which is
2713 the lowest priority.
2714
2715 rfc6249 section 3.1
2716 */
2717 murl.priority = DEFAULT_PRI;
2718
2719 murl.mediatype = typestr;
2720 typestr = NULL;
2721
2722 if (opt.content_disposition
2723 && resp_header_locate (resp, "Content-Disposition", 0, &val_beg, &val_end) != -1)
2724 {
2725 find_key_value (val_beg, val_end, "filename", &namestr);
2726 murl.name = namestr;
2727 namestr = NULL;
2728 }
2729
2730 murl.url = xstrdup (u->url);
2731
2732 DEBUGP (("URL=%s\n", murl.url));
2733 DEBUGP (("MEDIATYPE=%s\n", murl.mediatype));
2734 DEBUGP (("NAME=%s\n", murl.name ? murl.name : ""));
2735 DEBUGP (("PRIORITY=%d\n", murl.priority));
2736
2737 /* 1 slot from new resource, 1 slot for null-termination. */
2738 mfile->metaurls = xrealloc (mfile->metaurls,
2739 sizeof (metalink_metaurl_t *) * (meta_count + 2));
2740 mfile->metaurls[meta_count] = xnew0 (metalink_metaurl_t);
2741 *mfile->metaurls[meta_count] = murl;
2742 meta_count++;
2743 }
2744 skip_content_type:
2745
2746 /* Find all Link headers. */
2747 for (i = 0;
2748 (i = resp_header_locate (resp, "Link", i, &val_beg, &val_end)) != -1;
2749 i++)
2750 {
2751 char *rel = NULL, *reltype = NULL;
2752 char *urlstr = NULL;
2753 const char *url_beg, *url_end, *attrs_beg;
2754 size_t url_len;
2755
2756 /* Sample Metalink Link headers:
2757
2758 Link: <http://www2.example.com/dir1/dir2/dir3/dir4/dir5/example.ext>;
2759 rel=duplicate; pri=1; pref; geo=gb; depth=4
2760
2761 Link: <http://example.com/example.ext.asc>; rel=describedby;
2762 type="application/pgp-signature"
2763 */
2764
2765 /* Find beginning of URL. */
2766 url_beg = val_beg;
2767 while (url_beg < val_end - 1 && c_isspace (*url_beg))
2768 url_beg++;
2769
2770 /* Find end of URL. */
2771 /* The convention here is that end ptr points to one element after
2772 end of string. In this case, it should be pointing to the '>', which
2773 is one element after end of actual URL. Therefore, it should never point
2774 to val_end, which is one element after entire header value string. */
2775 url_end = url_beg + 1;
2776 while (url_end < val_end - 1 && *url_end != '>')
2777 url_end++;
2778
2779 if (url_beg >= val_end || url_end >= val_end ||
2780 *url_beg != '<' || *url_end != '>')
2781 {
2782 DEBUGP (("This is not a valid Link header. Ignoring.\n"));
2783 continue;
2784 }
2785
2786 /* Skip <. */
2787 url_beg++;
2788 url_len = url_end - url_beg;
2789
2790 /* URL found. Now handle the attributes. */
2791 attrs_beg = url_end + 1;
2792
2793 /* First we need to find out what type of link it is. Currently, we
2794 support rel=duplicate and rel=describedby. */
2795 if (!find_key_value (attrs_beg, val_end, "rel", &rel))
2796 {
2797 DEBUGP (("No rel value in Link header, skipping.\n"));
2798 continue;
2799 }
2800
2801 urlstr = xstrndup (url_beg, url_len);
2802 DEBUGP (("URL=%s\n", urlstr));
2803 DEBUGP (("rel=%s\n", rel));
2804
2805 if (!strcmp (rel, "describedby"))
2806 find_key_value (attrs_beg, val_end, "type", &reltype);
2807
2808 /* Handle signatures.
2809 Libmetalink only supports one signature per file. Therefore we stop
2810 as soon as we successfully get first supported signature. */
2811 if (sig_count == 0 &&
2812 reltype && !strcmp (reltype, "application/pgp-signature"))
2813 {
2814 /* Download the signature to a temporary file. */
2815 FILE *_output_stream = output_stream;
2816 bool _output_stream_regular = output_stream_regular;
2817
2818 output_stream = tmpfile ();
2819 if (output_stream)
2820 {
2821 struct iri *iri = iri_new ();
2822 struct url *url;
2823 int url_err;
2824
2825 set_uri_encoding (iri, opt.locale, true);
2826 url = url_parse (urlstr, &url_err, iri, false);
2827
2828 if (!url)
2829 {
2830 char *error = url_error (urlstr, url_err);
2831 logprintf (LOG_NOTQUIET, _("When downloading signature:\n"
2832 "%s: %s.\n"), urlstr, error);
2833 xfree (error);
2834 iri_free (iri);
2835 }
2836 else
2837 {
2838 /* Avoid recursive Metalink from HTTP headers. */
2839 bool _metalink_http = opt.metalink_over_http;
2840 uerr_t retr_err;
2841
2842 opt.metalink_over_http = false;
2843 retr_err = retrieve_url (url, urlstr, NULL, NULL,
2844 NULL, NULL, false, iri, false);
2845 opt.metalink_over_http = _metalink_http;
2846
2847 url_free (url);
2848 iri_free (iri);
2849
2850 if (retr_err == RETROK)
2851 {
2852 /* Signature is in the temporary file. Read it into
2853 metalink resource structure. */
2854 metalink_signature_t msig;
2855 size_t siglen;
2856
2857 fseek (output_stream, 0, SEEK_END);
2858 siglen = ftell (output_stream);
2859 fseek (output_stream, 0, SEEK_SET);
2860
2861 DEBUGP (("siglen=%lu\n", siglen));
2862
2863 msig.signature = xmalloc (siglen + 1);
2864 if (fread (msig.signature, siglen, 1, output_stream) != 1)
2865 {
2866 logputs (LOG_NOTQUIET,
2867 _("Unable to read signature content from "
2868 "temporary file. Skipping.\n"));
2869 xfree (msig.signature);
2870 }
2871 else
2872 {
2873 msig.signature[siglen] = '\0'; /* Just in case. */
2874 msig.mediatype = xstrdup ("application/pgp-signature");
2875
2876 DEBUGP (("Signature (%s):\n%s\n",
2877 msig.mediatype, msig.signature));
2878
2879 mfile->signature = xnew (metalink_signature_t);
2880 *mfile->signature = msig;
2881
2882 sig_count++;
2883 }
2884 }
2885 }
2886 fclose (output_stream);
2887 }
2888 else
2889 {
2890 logputs (LOG_NOTQUIET, _("Could not create temporary file. "
2891 "Skipping signature download.\n"));
2892 }
2893 output_stream_regular = _output_stream_regular;
2894 output_stream = _output_stream;
2895 } /* Iterate over signatures. */
2896
2897 /* Handle Metalink resources. */
2898 else if (!strcmp (rel, "duplicate"))
2899 {
2900 metalink_resource_t mres = {0};
2901 char *pristr;
2902
2903 /*
2904 Valid ranges for the "pri" attribute are from
2905 1 to 999999. Mirror servers with a lower value of the "pri"
2906 attribute have a higher priority, while mirrors with an undefined
2907 "pri" attribute are considered to have a value of 999999, which is
2908 the lowest priority.
2909
2910 rfc6249 section 3.1
2911 */
2912 mres.priority = DEFAULT_PRI;
2913 if (find_key_value (url_end, val_end, "pri", &pristr))
2914 {
2915 long pri;
2916 char *end_pristr;
2917 /* Do not care for errno since 0 is error in this case. */
2918 pri = strtol (pristr, &end_pristr, 10);
2919 if (end_pristr != pristr + strlen (pristr) ||
2920 !VALID_PRI_RANGE (pri))
2921 {
2922 /* This is against the specification, so let's inform the user. */
2923 logprintf (LOG_NOTQUIET,
2924 _("Invalid pri value. Assuming %d.\n"),
2925 DEFAULT_PRI);
2926 }
2927 else
2928 mres.priority = pri;
2929 xfree (pristr);
2930 }
2931
2932 switch (url_scheme (urlstr))
2933 {
2934 case SCHEME_HTTP:
2935 mres.type = xstrdup ("http");
2936 break;
2937 #ifdef HAVE_SSL
2938 case SCHEME_HTTPS:
2939 mres.type = xstrdup ("https");
2940 break;
2941 case SCHEME_FTPS:
2942 mres.type = xstrdup ("ftps");
2943 break;
2944 #endif
2945 case SCHEME_FTP:
2946 mres.type = xstrdup ("ftp");
2947 break;
2948 default:
2949 DEBUGP (("Unsupported url scheme in %s. Skipping resource.\n", urlstr));
2950 }
2951
2952 if (mres.type)
2953 {
2954 DEBUGP (("TYPE=%s\n", mres.type));
2955
2956 /* At this point we have validated the new resource. */
2957
2958 find_key_value (url_end, val_end, "geo", &mres.location);
2959
2960 mres.url = urlstr;
2961 urlstr = NULL;
2962
2963 mres.preference = 0;
2964 if (has_key (url_end, val_end, "pref"))
2965 {
2966 DEBUGP (("This resource has preference\n"));
2967 mres.preference = 1;
2968 }
2969
2970 /* 1 slot from new resource, 1 slot for null-termination. */
2971 mfile->resources = xrealloc (mfile->resources,
2972 sizeof (metalink_resource_t *) * (res_count + 2));
2973 mfile->resources[res_count] = xnew0 (metalink_resource_t);
2974 *mfile->resources[res_count] = mres;
2975 res_count++;
2976 }
2977 } /* Handle resource link (rel=duplicate). */
2978
2979 /* Handle Metalink/XML resources. */
2980 else if (reltype && !strcmp (reltype, "application/metalink4+xml"))
2981 {
2982 metalink_metaurl_t murl = {0};
2983 char *pristr;
2984
2985 /*
2986 Valid ranges for the "pri" attribute are from
2987 1 to 999999. Mirror servers with a lower value of the "pri"
2988 attribute have a higher priority, while mirrors with an undefined
2989 "pri" attribute are considered to have a value of 999999, which is
2990 the lowest priority.
2991
2992 rfc6249 section 3.1
2993 */
2994 murl.priority = DEFAULT_PRI;
2995 if (find_key_value (url_end, val_end, "pri", &pristr))
2996 {
2997 long pri;
2998 char *end_pristr;
2999 /* Do not care for errno since 0 is error in this case. */
3000 pri = strtol (pristr, &end_pristr, 10);
3001 if (end_pristr != pristr + strlen (pristr) ||
3002 !VALID_PRI_RANGE (pri))
3003 {
3004 /* This is against the specification, so let's inform the user. */
3005 logprintf (LOG_NOTQUIET,
3006 _("Invalid pri value. Assuming %d.\n"),
3007 DEFAULT_PRI);
3008 }
3009 else
3010 murl.priority = pri;
3011 xfree (pristr);
3012 }
3013
3014 murl.mediatype = xstrdup (reltype);
3015
3016 DEBUGP (("MEDIATYPE=%s\n", murl.mediatype));
3017
3018 /* At this point we have validated the new resource. */
3019
3020 find_key_value (url_end, val_end, "name", &murl.name);
3021
3022 murl.url = urlstr;
3023 urlstr = NULL;
3024
3025 /* 1 slot from new resource, 1 slot for null-termination. */
3026 mfile->metaurls = xrealloc (mfile->metaurls,
3027 sizeof (metalink_metaurl_t *) * (meta_count + 2));
3028 mfile->metaurls[meta_count] = xnew0 (metalink_metaurl_t);
3029 *mfile->metaurls[meta_count] = murl;
3030 meta_count++;
3031 } /* Handle resource link (rel=describedby). */
3032 else
3033 DEBUGP (("This link header was not used for Metalink\n"));
3034
3035 xfree (urlstr);
3036 xfree (reltype);
3037 xfree (rel);
3038 } /* Iterate over link headers. */
3039
3040 /* Null-terminate resources array. */
3041 mfile->resources[res_count] = 0;
3042 mfile->metaurls[meta_count] = 0;
3043
3044 if (res_count == 0 && meta_count == 0)
3045 {
3046 DEBUGP (("No valid metalink references found.\n"));
3047 goto fail;
3048 }
3049
3050 /* Find all Digest headers. */
3051 for (i = 0;
3052 (i = resp_header_locate (resp, "Digest", i, &val_beg, &val_end)) != -1;
3053 i++)
3054 {
3055 const char *dig_pos;
3056 char *dig_type, *dig_hash;
3057
3058 /* Each Digest header can include multiple hashes. Example:
3059 Digest: SHA=thvDyvhfIqlvFe+A9MYgxAfm1q5=,unixsum=30637
3060 Digest: md5=HUXZLQLMuI/KZ5KDcJPcOA==
3061 */
3062 for (dig_pos = val_beg;
3063 (dig_pos = find_key_values (dig_pos, val_end, &dig_type, &dig_hash));
3064 dig_pos++)
3065 {
3066 /* The hash here is assumed to be base64. We need the hash in hex.
3067 Therefore we convert: base64 -> binary -> hex. */
3068 const size_t dig_hash_str_len = strlen (dig_hash);
3069 char bin_hash[256];
3070 ssize_t hash_bin_len;
3071
3072 // there is no hash with that size
3073 if (dig_hash_str_len >= sizeof (bin_hash))
3074 {
3075 DEBUGP (("Hash too long, ignored.\n"));
3076 xfree (dig_type);
3077 xfree (dig_hash);
3078 continue;
3079 }
3080
3081 hash_bin_len = wget_base64_decode (dig_hash, bin_hash, dig_hash_str_len * 3 / 4 + 1);
3082
3083 /* Detect malformed base64 input. */
3084 if (hash_bin_len < 0)
3085 {
3086 DEBUGP (("Malformed base64 input, ignored.\n"));
3087 xfree (dig_type);
3088 xfree (dig_hash);
3089 continue;
3090 }
3091
3092 /* One slot for me, one for zero-termination. */
3093 mfile->checksums =
3094 xrealloc (mfile->checksums,
3095 sizeof (metalink_checksum_t *) * (hash_count + 2));
3096 mfile->checksums[hash_count] = xnew (metalink_checksum_t);
3097 mfile->checksums[hash_count]->type = dig_type;
3098
3099 mfile->checksums[hash_count]->hash = xmalloc ((size_t)hash_bin_len * 2 + 1);
3100 wg_hex_to_string (mfile->checksums[hash_count]->hash, bin_hash, (size_t)hash_bin_len);
3101
3102 xfree (dig_hash);
3103
3104 hash_count++;
3105 }
3106 }
3107
3108 /* Zero-terminate checksums array. */
3109 mfile->checksums[hash_count] = 0;
3110
3111 /*
3112 If Instance Digests are not provided by the Metalink servers, the
3113 Link header fields pertaining to this specification MUST be ignored.
3114
3115 rfc6249 section 6
3116 */
3117 if (res_count && hash_count == 0)
3118 {
3119 logputs (LOG_VERBOSE,
3120 _("Could not find acceptable digest for Metalink resources.\n"
3121 "Ignoring them.\n"));
3122 goto fail;
3123 }
3124
3125 /* Metalink data is OK. Now we just need to sort the resources based
3126 on their priorities, preference, and perhaps location. */
3127 stable_sort (mfile->resources, res_count, sizeof (metalink_resource_t *), metalink_res_cmp);
3128 stable_sort (mfile->metaurls, meta_count, sizeof (metalink_metaurl_t *), metalink_meta_cmp);
3129
3130 /* Restore sensible preference values (in case someone cares to look). */
3131 for (i = 0; i < res_count; ++i)
3132 mfile->resources[i]->preference = 1000000 - mfile->resources[i]->priority;
3133
3134 metalink = xnew0 (metalink_t);
3135 metalink->files = xmalloc (sizeof (metalink_file_t *) * 2);
3136 metalink->files[0] = mfile;
3137 metalink->files[1] = 0;
3138 metalink->origin = xstrdup (u->url);
3139 metalink->version = METALINK_VERSION_4;
3140 /* Leave other fields set to 0. */
3141
3142 return metalink;
3143
3144 fail:
3145 /* Free all allocated memory. */
3146 if (metalink)
3147 metalink_delete (metalink);
3148 else
3149 metalink_file_delete (mfile);
3150 return NULL;
3151 }
3152 #endif /* HAVE_METALINK */
3153
3154 /* Retrieve a document through HTTP protocol. It recognizes status
3155 code, and correctly handles redirections. It closes the network
3156 socket. If it receives an error from the functions below it, it
3157 will print it if there is enough information to do so (almost
3158 always), returning the error to the caller (i.e. http_loop).
3159
3160 Various HTTP parameters are stored to hs.
3161
3162 If PROXY is non-NULL, the connection will be made to the proxy
3163 server, and u->url will be requested. */
3164 static uerr_t
gethttp(const struct url * u,struct url * original_url,struct http_stat * hs,int * dt,struct url * proxy,struct iri * iri,int count)3165 gethttp (const struct url *u, struct url *original_url, struct http_stat *hs,
3166 int *dt, struct url *proxy, struct iri *iri, int count)
3167 {
3168 struct request *req = NULL;
3169
3170 char *type = NULL;
3171 char *user, *passwd;
3172 char *proxyauth;
3173 int statcode;
3174 int write_error;
3175 wgint contlen, contrange;
3176 const struct url *conn;
3177 FILE *fp;
3178 int err;
3179 uerr_t retval;
3180 #ifdef HAVE_HSTS
3181 #ifdef TESTING
3182 /* we don't link against main.o when we're testing */
3183 hsts_store_t hsts_store = NULL;
3184 #else
3185 extern hsts_store_t hsts_store;
3186 #endif
3187 const char *hsts_params;
3188 time_t max_age;
3189 bool include_subdomains;
3190 #endif
3191
3192 int sock = -1;
3193
3194 /* Set to 1 when the authorization has already been sent and should
3195 not be tried again. */
3196 bool auth_finished = false;
3197
3198 /* Set to 1 when just globally-set Basic authorization has been sent;
3199 * should prevent further Basic negotiations, but not other
3200 * mechanisms. */
3201 bool basic_auth_finished = false;
3202
3203 /* Whether NTLM authentication is used for this request. */
3204 bool ntlm_seen = false;
3205
3206 /* Whether our connection to the remote host is through SSL. */
3207 bool using_ssl = false;
3208
3209 /* Whether a HEAD request will be issued (as opposed to GET or
3210 POST). */
3211 bool head_only = !!(*dt & HEAD_ONLY);
3212
3213 /* Whether conditional get request will be issued. */
3214 bool cond_get = !!(*dt & IF_MODIFIED_SINCE);
3215
3216 #ifdef HAVE_METALINK
3217 /* Are we looking for metalink info in HTTP headers? */
3218 bool metalink = !!(*dt & METALINK_METADATA);
3219 #endif
3220
3221 char *head = NULL;
3222 struct response *resp = NULL;
3223 char hdrval[512];
3224 char *message = NULL;
3225
3226 /* Declare WARC variables. */
3227 bool warc_enabled = (opt.warc_filename != NULL);
3228 FILE *warc_tmp = NULL;
3229 char warc_timestamp_str [21];
3230 char warc_request_uuid [48];
3231 ip_address warc_ip_buf, *warc_ip = NULL;
3232 off_t warc_payload_offset = -1;
3233
3234 /* Whether this connection will be kept alive after the HTTP request
3235 is done. */
3236 bool keep_alive;
3237
3238 /* Is the server using the chunked transfer encoding? */
3239 bool chunked_transfer_encoding = false;
3240
3241 /* Whether keep-alive should be inhibited. */
3242 bool inhibit_keep_alive =
3243 !opt.http_keep_alive || opt.ignore_length;
3244
3245 /* Headers sent when using POST. */
3246 wgint body_data_size = 0;
3247
3248 #ifdef HAVE_SSL
3249 if (u->scheme == SCHEME_HTTPS)
3250 {
3251 /* Initialize the SSL context. After this has once been done,
3252 it becomes a no-op. */
3253 if (!ssl_init ())
3254 {
3255 scheme_disable (SCHEME_HTTPS);
3256 logprintf (LOG_NOTQUIET,
3257 _("Disabling SSL due to encountered errors.\n"));
3258 retval = SSLINITFAILED;
3259 goto cleanup;
3260 }
3261 }
3262 #endif /* HAVE_SSL */
3263
3264 /* Initialize certain elements of struct http_stat.
3265 * Since this function is called in a loop, we have to xfree certain
3266 * members. */
3267 hs->len = 0;
3268 hs->contlen = -1;
3269 hs->res = -1;
3270 xfree (hs->rderrmsg);
3271 xfree (hs->newloc);
3272 xfree (hs->remote_time);
3273 xfree (hs->error);
3274 xfree (hs->message);
3275 hs->local_encoding = ENC_NONE;
3276 hs->remote_encoding = ENC_NONE;
3277
3278 conn = u;
3279
3280 {
3281 uerr_t ret;
3282 req = initialize_request (u, hs, dt, proxy, inhibit_keep_alive,
3283 &basic_auth_finished, &body_data_size,
3284 &user, &passwd, &ret);
3285 if (req == NULL)
3286 {
3287 retval = ret;
3288 goto cleanup;
3289 }
3290 }
3291 retry_with_auth:
3292 /* We need to come back here when the initial attempt to retrieve
3293 without authorization header fails. (Expected to happen at least
3294 for the Digest authorization scheme.) */
3295
3296 if (opt.cookies)
3297 request_set_header (req, "Cookie",
3298 cookie_header (wget_cookie_jar,
3299 u->host, u->port, u->path,
3300 #ifdef HAVE_SSL
3301 u->scheme == SCHEME_HTTPS
3302 #else
3303 0
3304 #endif
3305 ),
3306 rel_value);
3307
3308 /* Add the user headers. */
3309 if (opt.user_headers)
3310 {
3311 int i;
3312 for (i = 0; opt.user_headers[i]; i++)
3313 request_set_user_header (req, opt.user_headers[i]);
3314 }
3315
3316 proxyauth = NULL;
3317 if (proxy)
3318 {
3319 conn = proxy;
3320 initialize_proxy_configuration (u, req, proxy, &proxyauth);
3321 }
3322 keep_alive = true;
3323
3324 /* Establish the connection. */
3325 if (inhibit_keep_alive)
3326 keep_alive = false;
3327
3328 {
3329 uerr_t conn_err = establish_connection (u, &conn, hs, proxy, &proxyauth, &req,
3330 &using_ssl, inhibit_keep_alive, &sock);
3331 if (conn_err != RETROK)
3332 {
3333 retval = conn_err;
3334 goto cleanup;
3335 }
3336 }
3337
3338 /* Open the temporary file where we will write the request. */
3339 if (warc_enabled)
3340 {
3341 warc_tmp = warc_tempfile ();
3342 if (warc_tmp == NULL)
3343 {
3344 CLOSE_INVALIDATE (sock);
3345 retval = WARC_TMP_FOPENERR;
3346 goto cleanup;
3347 }
3348
3349 if (! proxy)
3350 {
3351 warc_ip = &warc_ip_buf;
3352 socket_ip_address (sock, warc_ip, ENDPOINT_PEER);
3353 }
3354 }
3355
3356 /* Send the request to server. */
3357 write_error = request_send (req, sock, warc_tmp);
3358
3359 if (write_error >= 0)
3360 {
3361 if (opt.body_data)
3362 {
3363 DEBUGP (("[BODY data: %s]\n", opt.body_data));
3364 write_error = fd_write (sock, opt.body_data, body_data_size, -1);
3365 if (write_error >= 0 && warc_tmp != NULL)
3366 {
3367 int warc_tmp_written;
3368
3369 /* Remember end of headers / start of payload. */
3370 warc_payload_offset = ftello (warc_tmp);
3371
3372 /* Write a copy of the data to the WARC record. */
3373 warc_tmp_written = fwrite (opt.body_data, 1, body_data_size, warc_tmp);
3374 if (warc_tmp_written != body_data_size)
3375 write_error = -2;
3376 }
3377 }
3378 else if (opt.body_file && body_data_size != 0)
3379 {
3380 if (warc_tmp != NULL)
3381 /* Remember end of headers / start of payload */
3382 warc_payload_offset = ftello (warc_tmp);
3383
3384 write_error = body_file_send (sock, opt.body_file, body_data_size, warc_tmp);
3385 }
3386 }
3387
3388 if (write_error < 0)
3389 {
3390 CLOSE_INVALIDATE (sock);
3391
3392 if (warc_tmp != NULL)
3393 fclose (warc_tmp);
3394
3395 if (write_error == -2)
3396 retval = WARC_TMP_FWRITEERR;
3397 else
3398 retval = WRITEFAILED;
3399 goto cleanup;
3400 }
3401 logprintf (LOG_VERBOSE, _("%s request sent, awaiting response... "),
3402 proxy ? "Proxy" : "HTTP");
3403 contlen = -1;
3404 contrange = 0;
3405 *dt &= ~RETROKF;
3406
3407
3408 if (warc_enabled)
3409 {
3410 bool warc_result;
3411
3412 /* Generate a timestamp and uuid for this request. */
3413 warc_timestamp (warc_timestamp_str, sizeof (warc_timestamp_str));
3414 warc_uuid_str (warc_request_uuid, sizeof (warc_request_uuid));
3415
3416 /* Create a request record and store it in the WARC file. */
3417 warc_result = warc_write_request_record (u->url, warc_timestamp_str,
3418 warc_request_uuid, warc_ip,
3419 warc_tmp, warc_payload_offset);
3420 if (! warc_result)
3421 {
3422 CLOSE_INVALIDATE (sock);
3423 retval = WARC_ERR;
3424 goto cleanup;
3425 }
3426
3427 /* warc_write_request_record has also closed warc_tmp. */
3428 }
3429
3430 /* Repeat while we receive a 10x response code. */
3431 {
3432 bool _repeat;
3433
3434 do
3435 {
3436 head = read_http_response_head (sock);
3437 if (!head)
3438 {
3439 if (errno == 0)
3440 {
3441 logputs (LOG_NOTQUIET, _("No data received.\n"));
3442 CLOSE_INVALIDATE (sock);
3443 retval = HEOF;
3444 }
3445 else
3446 {
3447 logprintf (LOG_NOTQUIET, _("Read error (%s) in headers.\n"),
3448 fd_errstr (sock));
3449 CLOSE_INVALIDATE (sock);
3450 retval = HERR;
3451 }
3452 goto cleanup;
3453 }
3454 DEBUGP (("\n---response begin---\n%s---response end---\n", head));
3455
3456 resp = resp_new (head);
3457
3458 /* Check for status line. */
3459 xfree (message);
3460 statcode = resp_status (resp, &message);
3461 if (statcode < 0)
3462 {
3463 char *tms = datetime_str (time (NULL));
3464 logprintf (LOG_VERBOSE, "%d\n", statcode);
3465 logprintf (LOG_NOTQUIET, _("%s ERROR %d: %s.\n"), tms, statcode,
3466 quotearg_style (escape_quoting_style,
3467 _("Malformed status line")));
3468 CLOSE_INVALIDATE (sock);
3469 retval = HERR;
3470 goto cleanup;
3471 }
3472
3473 if (H_10X (statcode))
3474 {
3475 xfree (head);
3476 resp_free (&resp);
3477 _repeat = true;
3478 DEBUGP (("Ignoring response\n"));
3479 }
3480 else
3481 {
3482 _repeat = false;
3483 }
3484 }
3485 while (_repeat);
3486 }
3487
3488 xfree (hs->message);
3489 hs->message = xstrdup (message);
3490 if (!opt.server_response)
3491 logprintf (LOG_VERBOSE, "%2d %s\n", statcode,
3492 message ? quotearg_style (escape_quoting_style, message) : "");
3493 else
3494 {
3495 logprintf (LOG_VERBOSE, "\n");
3496 print_server_response (resp, " ");
3497 }
3498
3499 if (!opt.ignore_length
3500 && resp_header_copy (resp, "Content-Length", hdrval, sizeof (hdrval)))
3501 {
3502 wgint parsed;
3503 errno = 0;
3504 parsed = str_to_wgint (hdrval, NULL, 10);
3505 if (parsed == WGINT_MAX && errno == ERANGE)
3506 {
3507 /* Out of range.
3508 #### If Content-Length is out of range, it most likely
3509 means that the file is larger than 2G and that we're
3510 compiled without LFS. In that case we should probably
3511 refuse to even attempt to download the file. */
3512 contlen = -1;
3513 }
3514 else if (parsed < 0)
3515 {
3516 /* Negative Content-Length; nonsensical, so we can't
3517 assume any information about the content to receive. */
3518 contlen = -1;
3519 }
3520 else
3521 contlen = parsed;
3522 }
3523
3524 /* Check for keep-alive related responses. */
3525 if (!inhibit_keep_alive)
3526 {
3527 if (resp_header_copy (resp, "Connection", hdrval, sizeof (hdrval)))
3528 {
3529 if (0 == c_strcasecmp (hdrval, "Close"))
3530 keep_alive = false;
3531 }
3532 }
3533
3534 chunked_transfer_encoding = false;
3535 if (resp_header_copy (resp, "Transfer-Encoding", hdrval, sizeof (hdrval))
3536 && 0 == c_strcasecmp (hdrval, "chunked"))
3537 chunked_transfer_encoding = true;
3538
3539 /* Handle (possibly multiple instances of) the Set-Cookie header. */
3540 if (opt.cookies)
3541 {
3542 int scpos;
3543 const char *scbeg, *scend;
3544 /* The jar should have been created by now. */
3545 assert (wget_cookie_jar != NULL);
3546 for (scpos = 0;
3547 (scpos = resp_header_locate (resp, "Set-Cookie", scpos,
3548 &scbeg, &scend)) != -1;
3549 ++scpos)
3550 {
3551 char buf[1024], *set_cookie;
3552 size_t len = scend - scbeg;
3553
3554 if (len < sizeof (buf))
3555 set_cookie = buf;
3556 else
3557 set_cookie = xmalloc (len + 1);
3558
3559 memcpy (set_cookie, scbeg, len);
3560 set_cookie[len] = 0;
3561
3562 cookie_handle_set_cookie (wget_cookie_jar, u->host, u->port,
3563 u->path, set_cookie);
3564
3565 if (set_cookie != buf)
3566 xfree (set_cookie);
3567 }
3568 }
3569
3570 if (keep_alive)
3571 /* The server has promised that it will not close the connection
3572 when we're done. This means that we can register it. */
3573 register_persistent (conn->host, conn->port, sock, using_ssl);
3574
3575 #ifdef HAVE_METALINK
3576 /* We need to check for the Metalink data in the very first response
3577 we get from the server (before redirections, authorization, etc.). */
3578 if (metalink)
3579 {
3580 hs->metalink = metalink_from_http (resp, hs, u);
3581 /* Bugfix: hs->local_file is NULL (opt.content_disposition). */
3582 if (!hs->local_file && hs->metalink && hs->metalink->origin)
3583 hs->local_file = xstrdup (hs->metalink->origin);
3584 xfree (hs->message);
3585 retval = RETR_WITH_METALINK;
3586 CLOSE_FINISH (sock);
3587 goto cleanup;
3588 }
3589 #endif
3590
3591 if (statcode == HTTP_STATUS_UNAUTHORIZED)
3592 {
3593 /* Authorization is required. */
3594 uerr_t auth_err = RETROK;
3595 bool retry;
3596 /* Normally we are not interested in the response body.
3597 But if we are writing a WARC file we are: we like to keep everything. */
3598 if (warc_enabled)
3599 {
3600 int _err;
3601 type = resp_header_strdup (resp, "Content-Type");
3602 _err = read_response_body (hs, sock, NULL, contlen, 0,
3603 chunked_transfer_encoding,
3604 u->url, warc_timestamp_str,
3605 warc_request_uuid, warc_ip, type,
3606 statcode, head);
3607 xfree (type);
3608
3609 if (_err != RETRFINISHED || hs->res < 0)
3610 {
3611 CLOSE_INVALIDATE (sock);
3612 retval = _err;
3613 goto cleanup;
3614 }
3615 else
3616 CLOSE_FINISH (sock);
3617 }
3618 else
3619 {
3620 /* Since WARC is disabled, we are not interested in the response body. */
3621 if (keep_alive && !head_only
3622 && skip_short_body (sock, contlen, chunked_transfer_encoding))
3623 CLOSE_FINISH (sock);
3624 else
3625 CLOSE_INVALIDATE (sock);
3626 }
3627
3628 pconn.authorized = false;
3629
3630 {
3631 auth_err = check_auth (u, user, passwd, resp, req,
3632 &ntlm_seen, &retry,
3633 &basic_auth_finished,
3634 &auth_finished);
3635 if (auth_err == RETROK && retry)
3636 {
3637 resp_free (&resp);
3638 xfree (message);
3639 xfree (head);
3640 goto retry_with_auth;
3641 }
3642 }
3643 if (auth_err == RETROK)
3644 retval = AUTHFAILED;
3645 else
3646 retval = auth_err;
3647 goto cleanup;
3648 }
3649 else /* statcode != HTTP_STATUS_UNAUTHORIZED */
3650 {
3651 /* Kludge: if NTLM is used, mark the TCP connection as authorized. */
3652 if (ntlm_seen)
3653 pconn.authorized = true;
3654 }
3655
3656 {
3657 uerr_t ret = check_file_output (u, hs, resp, hdrval, sizeof hdrval);
3658 if (ret != RETROK)
3659 {
3660 retval = ret;
3661 goto cleanup;
3662 }
3663 }
3664
3665 hs->statcode = statcode;
3666 xfree (hs->error);
3667 if (statcode == -1)
3668 hs->error = xstrdup (_("Malformed status line"));
3669 else if (!message || !*message)
3670 hs->error = xstrdup (_("(no description)"));
3671 else
3672 hs->error = xstrdup (message);
3673
3674 #ifdef HAVE_HSTS
3675 if (opt.hsts && hsts_store)
3676 {
3677 hsts_params = resp_header_strdup (resp, "Strict-Transport-Security");
3678 if (parse_strict_transport_security (hsts_params, &max_age, &include_subdomains))
3679 {
3680 /* process strict transport security */
3681 if (hsts_store_entry (hsts_store, u->scheme, u->host, u->port, max_age, include_subdomains))
3682 DEBUGP(("Added new HSTS host: %s:%u (max-age: %lu, includeSubdomains: %s)\n",
3683 u->host,
3684 (unsigned) u->port,
3685 (unsigned long) max_age,
3686 (include_subdomains ? "true" : "false")));
3687 else
3688 DEBUGP(("Updated HSTS host: %s:%u (max-age: %lu, includeSubdomains: %s)\n",
3689 u->host,
3690 (unsigned) u->port,
3691 (unsigned long) max_age,
3692 (include_subdomains ? "true" : "false")));
3693 }
3694 xfree (hsts_params);
3695 }
3696 #endif
3697
3698 type = resp_header_strdup (resp, "Content-Type");
3699 if (type)
3700 {
3701 char *tmp = strchr (type, ';');
3702 if (tmp)
3703 {
3704 #ifdef ENABLE_IRI
3705 /* sXXXav: only needed if IRI support is enabled */
3706 char *tmp2 = tmp + 1;
3707 #endif
3708
3709 while (tmp > type && c_isspace (tmp[-1]))
3710 --tmp;
3711 *tmp = '\0';
3712
3713 #ifdef ENABLE_IRI
3714 /* Try to get remote encoding if needed */
3715 if (opt.enable_iri && !opt.encoding_remote)
3716 {
3717 tmp = parse_charset (tmp2);
3718 if (tmp)
3719 set_content_encoding (iri, tmp);
3720 xfree (tmp);
3721 }
3722 #endif
3723 }
3724 }
3725 xfree (hs->newloc);
3726 hs->newloc = resp_header_strdup (resp, "Location");
3727 xfree (hs->remote_time);
3728 hs->remote_time = resp_header_strdup (resp, "Last-Modified");
3729 if (!hs->remote_time) // now look for the Wayback Machine's timestamp
3730 hs->remote_time = resp_header_strdup (resp, "X-Archive-Orig-last-modified");
3731
3732 if (resp_header_copy (resp, "Content-Range", hdrval, sizeof (hdrval)))
3733 {
3734 wgint first_byte_pos, last_byte_pos, entity_length;
3735 if (parse_content_range (hdrval, &first_byte_pos, &last_byte_pos,
3736 &entity_length))
3737 {
3738 contrange = first_byte_pos;
3739 contlen = last_byte_pos - first_byte_pos + 1;
3740 }
3741 }
3742
3743 if (resp_header_copy (resp, "Content-Encoding", hdrval, sizeof (hdrval)))
3744 {
3745 hs->local_encoding = ENC_INVALID;
3746
3747 switch (hdrval[0])
3748 {
3749 case 'b': case 'B':
3750 if (0 == c_strcasecmp(hdrval, "br"))
3751 hs->local_encoding = ENC_BROTLI;
3752 break;
3753 case 'c': case 'C':
3754 if (0 == c_strcasecmp(hdrval, "compress"))
3755 hs->local_encoding = ENC_COMPRESS;
3756 break;
3757 case 'd': case 'D':
3758 if (0 == c_strcasecmp(hdrval, "deflate"))
3759 hs->local_encoding = ENC_DEFLATE;
3760 break;
3761 case 'g': case 'G':
3762 if (0 == c_strcasecmp(hdrval, "gzip"))
3763 hs->local_encoding = ENC_GZIP;
3764 break;
3765 case 'i': case 'I':
3766 if (0 == c_strcasecmp(hdrval, "identity"))
3767 hs->local_encoding = ENC_NONE;
3768 break;
3769 case 'x': case 'X':
3770 if (0 == c_strcasecmp(hdrval, "x-compress"))
3771 hs->local_encoding = ENC_COMPRESS;
3772 else if (0 == c_strcasecmp(hdrval, "x-gzip"))
3773 hs->local_encoding = ENC_GZIP;
3774 break;
3775 case '\0':
3776 hs->local_encoding = ENC_NONE;
3777 }
3778
3779 if (hs->local_encoding == ENC_INVALID)
3780 {
3781 DEBUGP (("Unrecognized Content-Encoding: %s\n", hdrval));
3782 hs->local_encoding = ENC_NONE;
3783 }
3784 #ifdef HAVE_LIBZ
3785 else if (hs->local_encoding == ENC_GZIP
3786 && opt.compression != compression_none)
3787 {
3788 const char *p;
3789
3790 /* Make sure the Content-Type is not gzip before decompressing */
3791 if (type)
3792 {
3793 p = strchr (type, '/');
3794 if (p == NULL)
3795 {
3796 hs->remote_encoding = ENC_GZIP;
3797 hs->local_encoding = ENC_NONE;
3798 }
3799 else
3800 {
3801 p++;
3802 if (c_tolower(p[0]) == 'x' && p[1] == '-')
3803 p += 2;
3804 if (0 != c_strcasecmp (p, "gzip"))
3805 {
3806 hs->remote_encoding = ENC_GZIP;
3807 hs->local_encoding = ENC_NONE;
3808 }
3809 }
3810 }
3811 else
3812 {
3813 hs->remote_encoding = ENC_GZIP;
3814 hs->local_encoding = ENC_NONE;
3815 }
3816
3817 /* don't uncompress if a file ends with '.gz' or '.tgz' */
3818 if (hs->remote_encoding == ENC_GZIP
3819 && (p = strrchr(u->file, '.'))
3820 && (c_strcasecmp(p, ".gz") == 0 || c_strcasecmp(p, ".tgz") == 0))
3821 {
3822 DEBUGP (("Enabling broken server workaround. Will not decompress this GZip file.\n"));
3823 hs->remote_encoding = ENC_NONE;
3824 }
3825 }
3826 #endif
3827 }
3828
3829 /* 20x responses are counted among successful by default. */
3830 if (H_20X (statcode))
3831 *dt |= RETROKF;
3832
3833 if (statcode == HTTP_STATUS_NO_CONTENT)
3834 {
3835 /* 204 response has no body (RFC 2616, 4.3) */
3836
3837 /* In case the caller cares to look... */
3838 hs->len = 0;
3839 hs->res = 0;
3840 hs->restval = 0;
3841
3842 CLOSE_FINISH (sock);
3843
3844 retval = RETRFINISHED;
3845 goto cleanup;
3846 }
3847
3848 /* Return if redirected. */
3849 if (H_REDIRECTED (statcode) || statcode == HTTP_STATUS_MULTIPLE_CHOICES)
3850 {
3851 /* RFC2068 says that in case of the 300 (multiple choices)
3852 response, the server can output a preferred URL through
3853 `Location' header; otherwise, the request should be treated
3854 like GET. So, if the location is set, it will be a
3855 redirection; otherwise, just proceed normally. */
3856 if (statcode == HTTP_STATUS_MULTIPLE_CHOICES && !hs->newloc)
3857 *dt |= RETROKF;
3858 else
3859 {
3860 logprintf (LOG_VERBOSE,
3861 _("Location: %s%s\n"),
3862 hs->newloc ? escnonprint_uri (hs->newloc) : _("unspecified"),
3863 hs->newloc ? _(" [following]") : "");
3864
3865 /* In case the caller cares to look... */
3866 hs->len = 0;
3867 hs->res = 0;
3868 hs->restval = 0;
3869
3870 /* Normally we are not interested in the response body of a redirect.
3871 But if we are writing a WARC file we are: we like to keep everything. */
3872 if (warc_enabled)
3873 {
3874 int _err = read_response_body (hs, sock, NULL, contlen, 0,
3875 chunked_transfer_encoding,
3876 u->url, warc_timestamp_str,
3877 warc_request_uuid, warc_ip, type,
3878 statcode, head);
3879
3880 if (_err != RETRFINISHED || hs->res < 0)
3881 {
3882 CLOSE_INVALIDATE (sock);
3883 retval = _err;
3884 goto cleanup;
3885 }
3886 else
3887 CLOSE_FINISH (sock);
3888 }
3889 else
3890 {
3891 /* Since WARC is disabled, we are not interested in the response body. */
3892 if (keep_alive && !head_only
3893 && skip_short_body (sock, contlen, chunked_transfer_encoding))
3894 CLOSE_FINISH (sock);
3895 else
3896 CLOSE_INVALIDATE (sock);
3897 }
3898
3899 /* From RFC2616: The status codes 303 and 307 have
3900 been added for servers that wish to make unambiguously
3901 clear which kind of reaction is expected of the client.
3902
3903 A 307 should be redirected using the same method,
3904 in other words, a POST should be preserved and not
3905 converted to a GET in that case.
3906
3907 With strict adherence to RFC2616, POST requests are not
3908 converted to a GET request on 301 Permanent Redirect
3909 or 302 Temporary Redirect.
3910
3911 A switch may be provided later based on the HTTPbis draft
3912 that allows clients to convert POST requests to GET
3913 requests on 301 and 302 response codes. */
3914 switch (statcode)
3915 {
3916 case HTTP_STATUS_TEMPORARY_REDIRECT:
3917 case HTTP_STATUS_PERMANENT_REDIRECT:
3918 retval = NEWLOCATION_KEEP_POST;
3919 goto cleanup;
3920 case HTTP_STATUS_MOVED_PERMANENTLY:
3921 if (opt.method && c_strcasecmp (opt.method, "post") != 0)
3922 {
3923 retval = NEWLOCATION_KEEP_POST;
3924 goto cleanup;
3925 }
3926 break;
3927 case HTTP_STATUS_MOVED_TEMPORARILY:
3928 if (opt.method && c_strcasecmp (opt.method, "post") != 0)
3929 {
3930 retval = NEWLOCATION_KEEP_POST;
3931 goto cleanup;
3932 }
3933 break;
3934 }
3935 retval = NEWLOCATION;
3936 goto cleanup;
3937 }
3938 }
3939
3940 if (cond_get)
3941 {
3942 if (statcode == HTTP_STATUS_NOT_MODIFIED)
3943 {
3944 logprintf (LOG_VERBOSE,
3945 _ ("File %s not modified on server. Omitting download.\n\n"),
3946 quote (hs->local_file));
3947 *dt |= RETROKF;
3948 CLOSE_FINISH (sock);
3949 retval = RETRUNNEEDED;
3950 goto cleanup;
3951 }
3952 }
3953
3954 set_content_type (dt, type);
3955
3956 if (opt.adjust_extension)
3957 {
3958 const char *encoding_ext = NULL;
3959 switch (hs->local_encoding)
3960 {
3961 case ENC_INVALID:
3962 case ENC_NONE:
3963 break;
3964 case ENC_BROTLI:
3965 encoding_ext = ".br";
3966 break;
3967 case ENC_COMPRESS:
3968 encoding_ext = ".Z";
3969 break;
3970 case ENC_DEFLATE:
3971 encoding_ext = ".zlib";
3972 break;
3973 case ENC_GZIP:
3974 encoding_ext = ".gz";
3975 break;
3976 default:
3977 DEBUGP (("No extension found for encoding %d\n",
3978 hs->local_encoding));
3979 }
3980 if (encoding_ext != NULL)
3981 {
3982 char *file_ext = strrchr (hs->local_file, '.');
3983 /* strip Content-Encoding extension (it will be re-added later) */
3984 if (file_ext != NULL && 0 == strcasecmp (file_ext, encoding_ext))
3985 *file_ext = '\0';
3986 }
3987 if (*dt & TEXTHTML)
3988 /* -E / --adjust-extension / adjust_extension = on was specified,
3989 and this is a text/html file. If some case-insensitive
3990 variation on ".htm[l]" isn't already the file's suffix,
3991 tack on ".html". */
3992 {
3993 ensure_extension (hs, ".html", dt);
3994 }
3995 else if (*dt & TEXTCSS)
3996 {
3997 ensure_extension (hs, ".css", dt);
3998 }
3999 if (encoding_ext != NULL)
4000 {
4001 ensure_extension (hs, encoding_ext, dt);
4002 }
4003 }
4004
4005 if (cond_get)
4006 {
4007 /* Handle the case when server ignores If-Modified-Since header. */
4008 if (statcode == HTTP_STATUS_OK && hs->remote_time)
4009 {
4010 time_t tmr = http_atotm (hs->remote_time);
4011
4012 /* Check if the local file is up-to-date based on Last-Modified header
4013 and content length. */
4014 if (tmr != (time_t) - 1 && tmr <= hs->orig_file_tstamp
4015 && (contlen == -1 || contlen == hs->orig_file_size))
4016 {
4017 logprintf (LOG_VERBOSE,
4018 _("Server ignored If-Modified-Since header for file %s.\n"
4019 "You might want to add --no-if-modified-since option."
4020 "\n\n"),
4021 quote (hs->local_file));
4022 *dt |= RETROKF;
4023 CLOSE_INVALIDATE (sock);
4024 retval = RETRUNNEEDED;
4025 goto cleanup;
4026 }
4027 }
4028 }
4029
4030 if (statcode == HTTP_STATUS_RANGE_NOT_SATISFIABLE
4031 || (!opt.timestamping && hs->restval > 0 && statcode == HTTP_STATUS_OK
4032 && contrange == 0 && contlen >= 0 && hs->restval >= contlen))
4033 {
4034 /* If `-c' is in use and the file has been fully downloaded (or
4035 the remote file has shrunk), Wget effectively requests bytes
4036 after the end of file and the server response with 416
4037 (or 200 with a <= Content-Length. */
4038 logputs (LOG_VERBOSE, _("\
4039 \n The file is already fully retrieved; nothing to do.\n\n"));
4040 /* In case the caller inspects. */
4041 hs->len = contlen;
4042 hs->res = 0;
4043 /* Mark as successfully retrieved. */
4044 *dt |= RETROKF;
4045
4046 /* Try to maintain the keep-alive connection. It is often cheaper to
4047 * consume some bytes which have already been sent than to negotiate
4048 * a new connection. However, if the body is too large, or we don't
4049 * care about keep-alive, then simply terminate the connection */
4050 if (keep_alive &&
4051 skip_short_body (sock, contlen, chunked_transfer_encoding))
4052 CLOSE_FINISH (sock);
4053 else
4054 CLOSE_INVALIDATE (sock);
4055 retval = RETRUNNEEDED;
4056 goto cleanup;
4057 }
4058 if ((contrange != 0 && contrange != hs->restval)
4059 || (H_PARTIAL (statcode) && !contrange && hs->restval))
4060 {
4061 /* The Range request was somehow misunderstood by the server.
4062 Bail out. */
4063 CLOSE_INVALIDATE (sock);
4064 retval = RANGEERR;
4065 goto cleanup;
4066 }
4067 if (contlen == -1)
4068 hs->contlen = -1;
4069 /* If the response is gzipped, the uncompressed size is unknown. */
4070 else if (hs->remote_encoding == ENC_GZIP)
4071 hs->contlen = -1;
4072 else
4073 hs->contlen = contlen + contrange;
4074
4075 if (opt.verbose)
4076 {
4077 if (*dt & RETROKF)
4078 {
4079 /* No need to print this output if the body won't be
4080 downloaded at all, or if the original server response is
4081 printed. */
4082 logputs (LOG_VERBOSE, _("Length: "));
4083 if (contlen != -1)
4084 {
4085 logputs (LOG_VERBOSE, number_to_static_string (contlen + contrange));
4086 if (contlen + contrange >= 1024)
4087 logprintf (LOG_VERBOSE, " (%s)",
4088 human_readable (contlen + contrange, 10, 1));
4089 if (contrange)
4090 {
4091 if (contlen >= 1024)
4092 logprintf (LOG_VERBOSE, _(", %s (%s) remaining"),
4093 number_to_static_string (contlen),
4094 human_readable (contlen, 10, 1));
4095 else
4096 logprintf (LOG_VERBOSE, _(", %s remaining"),
4097 number_to_static_string (contlen));
4098 }
4099 }
4100 else
4101 logputs (LOG_VERBOSE,
4102 opt.ignore_length ? _("ignored") : _("unspecified"));
4103 if (type)
4104 logprintf (LOG_VERBOSE, " [%s]\n", quotearg_style (escape_quoting_style, type));
4105 else
4106 logputs (LOG_VERBOSE, "\n");
4107 }
4108 }
4109
4110 /* Return if we have no intention of further downloading. */
4111 if ((!(*dt & RETROKF) && !opt.content_on_error) || head_only || (opt.spider && !opt.recursive))
4112 {
4113 /* In case the caller cares to look... */
4114 hs->len = 0;
4115 hs->res = 0;
4116 hs->restval = 0;
4117
4118 /* Normally we are not interested in the response body of a error responses.
4119 But if we are writing a WARC file we are: we like to keep everything. */
4120 if (warc_enabled)
4121 {
4122 int _err = read_response_body (hs, sock, NULL, contlen, 0,
4123 chunked_transfer_encoding,
4124 u->url, warc_timestamp_str,
4125 warc_request_uuid, warc_ip, type,
4126 statcode, head);
4127
4128 if (_err != RETRFINISHED || hs->res < 0)
4129 {
4130 CLOSE_INVALIDATE (sock);
4131 retval = _err;
4132 goto cleanup;
4133 }
4134
4135 CLOSE_FINISH (sock);
4136 }
4137 else
4138 {
4139 /* Since WARC is disabled, we are not interested in the response body. */
4140 if (head_only)
4141 /* Pre-1.10 Wget used CLOSE_INVALIDATE here. Now we trust the
4142 servers not to send body in response to a HEAD request, and
4143 those that do will likely be caught by test_socket_open.
4144 If not, they can be worked around using
4145 `--no-http-keep-alive'. */
4146 CLOSE_FINISH (sock);
4147 else if (opt.spider && !opt.recursive)
4148 /* we just want to see if the page exists - no downloading required */
4149 CLOSE_INVALIDATE (sock);
4150 else if (keep_alive
4151 && skip_short_body (sock, contlen, chunked_transfer_encoding))
4152 /* Successfully skipped the body; also keep using the socket. */
4153 CLOSE_FINISH (sock);
4154 else
4155 CLOSE_INVALIDATE (sock);
4156 }
4157
4158 if (statcode == HTTP_STATUS_GATEWAY_TIMEOUT)
4159 retval = GATEWAYTIMEOUT;
4160 else
4161 retval = RETRFINISHED;
4162
4163 goto cleanup;
4164 }
4165
4166 err = open_output_stream (hs, count, &fp);
4167 if (err != RETROK)
4168 {
4169 CLOSE_INVALIDATE (sock);
4170 retval = err;
4171 goto cleanup;
4172 }
4173
4174 #ifdef ENABLE_XATTR
4175 if (opt.enable_xattr)
4176 {
4177 if (original_url != u)
4178 set_file_metadata (u, original_url, fp);
4179 else
4180 set_file_metadata (u, NULL, fp);
4181 }
4182 #endif
4183
4184 err = read_response_body (hs, sock, fp, contlen, contrange,
4185 chunked_transfer_encoding,
4186 u->url, warc_timestamp_str,
4187 warc_request_uuid, warc_ip, type,
4188 statcode, head);
4189
4190 if (hs->res >= 0)
4191 CLOSE_FINISH (sock);
4192 else
4193 CLOSE_INVALIDATE (sock);
4194
4195 if (!output_stream)
4196 fclose (fp);
4197
4198 retval = err;
4199
4200 cleanup:
4201 xfree (head);
4202 xfree (type);
4203 xfree (message);
4204 resp_free (&resp);
4205 request_free (&req);
4206
4207 return retval;
4208 }
4209
4210 /* Check whether the supplied HTTP status code is among those
4211 listed for the --retry-on-http-error option. */
4212 static bool
check_retry_on_http_error(const int statcode)4213 check_retry_on_http_error (const int statcode)
4214 {
4215 const char *tok = opt.retry_on_http_error;
4216 while (tok && *tok)
4217 {
4218 if (atoi (tok) == statcode)
4219 return true;
4220 if ((tok = strchr (tok, ',')))
4221 ++tok;
4222 }
4223 return false;
4224 }
4225
4226 /* The genuine HTTP loop! This is the part where the retrieval is
4227 retried, and retried, and retried, and... */
4228 uerr_t
http_loop(const struct url * u,struct url * original_url,char ** newloc,char ** local_file,const char * referer,int * dt,struct url * proxy,struct iri * iri)4229 http_loop (const struct url *u, struct url *original_url, char **newloc,
4230 char **local_file, const char *referer, int *dt, struct url *proxy,
4231 struct iri *iri)
4232 {
4233 int count;
4234 bool got_head = false; /* used for time-stamping and filename detection */
4235 bool time_came_from_head = false;
4236 bool got_name = false;
4237 char *tms;
4238 const char *tmrate;
4239 uerr_t err, ret = TRYLIMEXC;
4240 time_t tmr = -1; /* remote time-stamp */
4241 struct http_stat hstat; /* HTTP status */
4242 struct stat st;
4243 bool send_head_first = true;
4244 bool force_full_retrieve = false;
4245
4246
4247 /* If we are writing to a WARC file: always retrieve the whole file. */
4248 if (opt.warc_filename != NULL)
4249 force_full_retrieve = true;
4250
4251
4252 /* Assert that no value for *LOCAL_FILE was passed. */
4253 assert (local_file == NULL || *local_file == NULL);
4254
4255 /* Set LOCAL_FILE parameter. */
4256 if (local_file && opt.output_document)
4257 *local_file = HYPHENP (opt.output_document) ? NULL : xstrdup (opt.output_document);
4258
4259 /* Reset NEWLOC parameter. */
4260 *newloc = NULL;
4261
4262 /* This used to be done in main, but it's a better idea to do it
4263 here so that we don't go through the hoops if we're just using
4264 FTP or whatever. */
4265 if (opt.cookies)
4266 load_cookies ();
4267
4268 /* Warn on (likely bogus) wildcard usage in HTTP. */
4269 if (opt.ftp_glob && has_wildcards_p (u->path))
4270 logputs (LOG_VERBOSE, _("Warning: wildcards not supported in HTTP.\n"));
4271
4272 /* Setup hstat struct. */
4273 xzero (hstat);
4274 hstat.referer = referer;
4275
4276 if (opt.output_document)
4277 {
4278 hstat.local_file = xstrdup (opt.output_document);
4279 got_name = true;
4280 }
4281 else if (!opt.content_disposition)
4282 {
4283 hstat.local_file =
4284 url_file_name (opt.trustservernames ? u : original_url, NULL);
4285 got_name = true;
4286 }
4287
4288 if (got_name && file_exists_p (hstat.local_file, NULL) && opt.noclobber && !opt.output_document)
4289 {
4290 /* If opt.noclobber is turned on and file already exists, do not
4291 retrieve the file. But if the output_document was given, then this
4292 test was already done and the file didn't exist. Hence the !opt.output_document */
4293 get_file_flags (hstat.local_file, dt);
4294 ret = RETROK;
4295 goto exit;
4296 }
4297
4298 /* Reset the counter. */
4299 count = 0;
4300
4301 /* Reset the document type. */
4302 *dt = 0;
4303
4304 /* Skip preliminary HEAD request if we're not in spider mode. */
4305 if (!opt.spider)
4306 send_head_first = false;
4307
4308 /* Send preliminary HEAD request if --content-disposition and -c are used
4309 together. */
4310 if (opt.content_disposition && opt.always_rest)
4311 send_head_first = true;
4312
4313 #ifdef HAVE_METALINK
4314 if (opt.metalink_over_http)
4315 {
4316 *dt |= METALINK_METADATA;
4317 send_head_first = true;
4318 }
4319 #endif
4320
4321 if (opt.timestamping)
4322 {
4323 /* Use conditional get request if requested
4324 * and if timestamp is known at this moment. */
4325 if (opt.if_modified_since && !send_head_first && got_name && file_exists_p (hstat.local_file, NULL))
4326 {
4327 *dt |= IF_MODIFIED_SINCE;
4328 {
4329 uerr_t timestamp_err = set_file_timestamp (&hstat);
4330 if (timestamp_err != RETROK)
4331 return timestamp_err;
4332 }
4333 }
4334 /* Send preliminary HEAD request if -N is given and we have existing
4335 * destination file or content disposition is enabled. */
4336 else if (opt.content_disposition || file_exists_p (hstat.local_file, NULL))
4337 send_head_first = true;
4338 }
4339
4340 /* THE loop */
4341 do
4342 {
4343 /* Increment the pass counter. */
4344 ++count;
4345 sleep_between_retrievals (count);
4346
4347 /* Get the current time string. */
4348 tms = datetime_str (time (NULL));
4349
4350 if (opt.spider && !got_head)
4351 logprintf (LOG_VERBOSE,
4352 _("Spider mode enabled. Check if remote file exists.\n"));
4353
4354 /* Print fetch message, if opt.verbose. */
4355 if (opt.verbose)
4356 {
4357 char *hurl = url_string (u, URL_AUTH_HIDE_PASSWD);
4358
4359 if (count > 1)
4360 {
4361 char tmp[256];
4362 sprintf (tmp, _("(try:%2d)"), count);
4363 logprintf (LOG_NOTQUIET, "--%s-- %s %s\n",
4364 tms, tmp, hurl);
4365 }
4366 else
4367 {
4368 logprintf (LOG_NOTQUIET, "--%s-- %s\n",
4369 tms, hurl);
4370 }
4371
4372 #ifdef WINDOWS
4373 ws_changetitle (hurl);
4374 #endif
4375 xfree (hurl);
4376 }
4377
4378 /* Default document type is empty. However, if spider mode is
4379 on or time-stamping is employed, HEAD_ONLY commands is
4380 encoded within *dt. */
4381 if (send_head_first && !got_head)
4382 *dt |= HEAD_ONLY;
4383 else
4384 *dt &= ~HEAD_ONLY;
4385
4386 /* Decide whether or not to restart. */
4387 if (force_full_retrieve)
4388 hstat.restval = hstat.len;
4389 else if (opt.start_pos >= 0)
4390 hstat.restval = opt.start_pos;
4391 else if (opt.always_rest
4392 && got_name
4393 && stat (hstat.local_file, &st) == 0
4394 && S_ISREG (st.st_mode))
4395 /* When -c is used, continue from on-disk size. (Can't use
4396 hstat.len even if count>1 because we don't want a failed
4397 first attempt to clobber existing data.) */
4398 hstat.restval = st.st_size;
4399 else if (count > 1)
4400 {
4401 /* otherwise, continue where the previous try left off */
4402 if (hstat.len < hstat.restval)
4403 hstat.restval -= hstat.len;
4404 else
4405 hstat.restval = hstat.len;
4406 }
4407 else
4408 hstat.restval = 0;
4409
4410 /* Decide whether to send the no-cache directive. We send it in
4411 two cases:
4412 a) we're using a proxy, and we're past our first retrieval.
4413 Some proxies are notorious for caching incomplete data, so
4414 we require a fresh get.
4415 b) caching is explicitly inhibited. */
4416 if ((proxy && count > 1) /* a */
4417 || !opt.allow_cache) /* b */
4418 *dt |= SEND_NOCACHE;
4419 else
4420 *dt &= ~SEND_NOCACHE;
4421
4422 /* Try fetching the document, or at least its head. */
4423 err = gethttp (u, original_url, &hstat, dt, proxy, iri, count);
4424
4425 /* Time? */
4426 tms = datetime_str (time (NULL));
4427
4428 /* Get the new location (with or without the redirection). */
4429 if (hstat.newloc)
4430 *newloc = xstrdup (hstat.newloc);
4431
4432 switch (err)
4433 {
4434 case HERR: case HEOF: case CONSOCKERR:
4435 case CONERROR: case READERR: case WRITEFAILED:
4436 case RANGEERR: case FOPEN_EXCL_ERR: case GATEWAYTIMEOUT:
4437 /* Non-fatal errors continue executing the loop, which will
4438 bring them to "while" statement at the end, to judge
4439 whether the number of tries was exceeded. */
4440 printwhat (count, opt.ntry);
4441 continue;
4442 case FWRITEERR: case FOPENERR:
4443 /* Another fatal error. */
4444 logputs (LOG_VERBOSE, "\n");
4445 logprintf (LOG_NOTQUIET, _("Cannot write to %s (%s).\n"),
4446 quote (hstat.local_file), strerror (errno));
4447 ret = err;
4448 goto exit;
4449 case HOSTERR:
4450 /* Fatal unless option set otherwise. */
4451 if ( opt.retry_on_host_error )
4452 {
4453 printwhat (count, opt.ntry);
4454 continue;
4455 }
4456 ret = err;
4457 goto exit;
4458 case CONIMPOSSIBLE: case PROXERR: case SSLINITFAILED:
4459 case CONTNOTSUPPORTED: case VERIFCERTERR: case FILEBADFILE:
4460 case UNKNOWNATTR:
4461 /* Fatal errors just return from the function. */
4462 ret = err;
4463 goto exit;
4464 case ATTRMISSING:
4465 /* A missing attribute in a Header is a fatal Protocol error. */
4466 logputs (LOG_VERBOSE, "\n");
4467 logprintf (LOG_NOTQUIET, _("Required attribute missing from Header received.\n"));
4468 ret = err;
4469 goto exit;
4470 case AUTHFAILED:
4471 logputs (LOG_VERBOSE, "\n");
4472 logprintf (LOG_NOTQUIET, _("Username/Password Authentication Failed.\n"));
4473 ret = err;
4474 goto exit;
4475 case WARC_ERR:
4476 /* A fatal WARC error. */
4477 logputs (LOG_VERBOSE, "\n");
4478 logprintf (LOG_NOTQUIET, _("Cannot write to WARC file.\n"));
4479 ret = err;
4480 goto exit;
4481 case WARC_TMP_FOPENERR: case WARC_TMP_FWRITEERR:
4482 /* A fatal WARC error. */
4483 logputs (LOG_VERBOSE, "\n");
4484 logprintf (LOG_NOTQUIET, _("Cannot write to temporary WARC file.\n"));
4485 ret = err;
4486 goto exit;
4487 case CONSSLERR:
4488 /* Another fatal error. */
4489 logprintf (LOG_NOTQUIET, _("Unable to establish SSL connection.\n"));
4490 ret = err;
4491 goto exit;
4492 case UNLINKERR:
4493 /* Another fatal error. */
4494 logputs (LOG_VERBOSE, "\n");
4495 logprintf (LOG_NOTQUIET, _("Cannot unlink %s (%s).\n"),
4496 quote (hstat.local_file), strerror (errno));
4497 ret = err;
4498 goto exit;
4499 case NEWLOCATION:
4500 case NEWLOCATION_KEEP_POST:
4501 /* Return the new location to the caller. */
4502 if (!*newloc)
4503 {
4504 logprintf (LOG_NOTQUIET,
4505 _("ERROR: Redirection (%d) without location.\n"),
4506 hstat.statcode);
4507 ret = WRONGCODE;
4508 }
4509 else
4510 {
4511 ret = err;
4512 }
4513 goto exit;
4514 case RETRUNNEEDED:
4515 /* The file was already fully retrieved. */
4516 ret = RETROK;
4517 goto exit;
4518 case RETRFINISHED:
4519 /* Deal with you later. */
4520 break;
4521 #ifdef HAVE_METALINK
4522 case RETR_WITH_METALINK:
4523 {
4524 if (hstat.metalink == NULL)
4525 {
4526 logputs (LOG_NOTQUIET,
4527 _("Could not find Metalink data in HTTP response. "
4528 "Downloading file using HTTP GET.\n"));
4529 *dt &= ~METALINK_METADATA;
4530 *dt &= ~HEAD_ONLY;
4531 got_head = true;
4532 continue;
4533 }
4534
4535 logputs (LOG_VERBOSE,
4536 _("Metalink headers found. "
4537 "Switching to Metalink mode.\n"));
4538
4539 ret = retrieve_from_metalink (hstat.metalink);
4540 goto exit;
4541 }
4542 break;
4543 #endif
4544 default:
4545 /* All possibilities should have been exhausted. */
4546 abort ();
4547 }
4548
4549 if (!(*dt & RETROKF))
4550 {
4551 char *hurl = NULL;
4552 if (!opt.verbose)
4553 {
4554 /* #### Ugly ugly ugly! */
4555 hurl = url_string (u, URL_AUTH_HIDE_PASSWD);
4556 logprintf (LOG_NONVERBOSE, "%s:\n", hurl);
4557 }
4558
4559 /* Fall back to GET if HEAD fails with a 500 or 501 error code. */
4560 if (*dt & HEAD_ONLY
4561 && (hstat.statcode == 500 || hstat.statcode == 501))
4562 {
4563 got_head = true;
4564 xfree (hurl);
4565 continue;
4566 }
4567 /* Maybe we should always keep track of broken links, not just in
4568 * spider mode.
4569 * Don't log error if it was UTF-8 encoded because we will try
4570 * once unencoded. */
4571 else if (opt.spider && !iri->utf8_encode)
4572 {
4573 /* #### Again: ugly ugly ugly! */
4574 if (!hurl)
4575 hurl = url_string (u, URL_AUTH_HIDE_PASSWD);
4576 nonexisting_url (hurl);
4577 logprintf (LOG_NOTQUIET, _("\
4578 Remote file does not exist -- broken link!!!\n"));
4579 }
4580 else if (check_retry_on_http_error (hstat.statcode))
4581 {
4582 printwhat (count, opt.ntry);
4583 xfree (hurl);
4584 continue;
4585 }
4586 else
4587 {
4588 logprintf (LOG_NOTQUIET, _("%s ERROR %d: %s.\n"),
4589 tms, hstat.statcode,
4590 quotearg_style (escape_quoting_style, hstat.error));
4591 }
4592 logputs (LOG_VERBOSE, "\n");
4593 ret = WRONGCODE;
4594 xfree (hurl);
4595 goto exit;
4596 }
4597
4598 /* Did we get the time-stamp? */
4599 if (!got_head || (opt.spider && !opt.recursive))
4600 {
4601 got_head = true; /* no more time-stamping */
4602
4603 if (opt.timestamping && !hstat.remote_time)
4604 {
4605 logputs (LOG_NOTQUIET, _("\
4606 Last-modified header missing -- time-stamps turned off.\n"));
4607 }
4608 else if (hstat.remote_time)
4609 {
4610 /* Convert the date-string into struct tm. */
4611 tmr = http_atotm (hstat.remote_time);
4612 if (tmr == (time_t) (-1))
4613 logputs (LOG_VERBOSE, _("\
4614 Last-modified header invalid -- time-stamp ignored.\n"));
4615 if (*dt & HEAD_ONLY)
4616 time_came_from_head = true;
4617 }
4618
4619 if (send_head_first)
4620 {
4621 /* The time-stamping section. */
4622 if (opt.timestamping)
4623 {
4624 if (hstat.orig_file_name) /* Perform the following
4625 checks only if the file
4626 we're supposed to
4627 download already exists. */
4628 {
4629 if (hstat.remote_time &&
4630 tmr != (time_t) (-1))
4631 {
4632 /* Now time-stamping can be used validly.
4633 Time-stamping means that if the sizes of
4634 the local and remote file match, and local
4635 file is newer than the remote file, it will
4636 not be retrieved. Otherwise, the normal
4637 download procedure is resumed. */
4638 if (hstat.orig_file_tstamp >= tmr)
4639 {
4640 if (hstat.contlen == -1
4641 || hstat.orig_file_size == hstat.contlen)
4642 {
4643 logprintf (LOG_VERBOSE, _("\
4644 Server file no newer than local file %s -- not retrieving.\n\n"),
4645 quote (hstat.orig_file_name));
4646 ret = RETROK;
4647 goto exit;
4648 }
4649 else
4650 {
4651 logprintf (LOG_VERBOSE, _("\
4652 The sizes do not match (local %s) -- retrieving.\n"),
4653 number_to_static_string (hstat.orig_file_size));
4654 }
4655 }
4656 else
4657 {
4658 force_full_retrieve = true;
4659 logputs (LOG_VERBOSE,
4660 _("Remote file is newer, retrieving.\n"));
4661 }
4662
4663 logputs (LOG_VERBOSE, "\n");
4664 }
4665 }
4666
4667 /* free_hstat (&hstat); */
4668 hstat.timestamp_checked = true;
4669 }
4670
4671 if (opt.spider)
4672 {
4673 bool finished = true;
4674 if (opt.recursive)
4675 {
4676 if ((*dt & TEXTHTML) || (*dt & TEXTCSS))
4677 {
4678 logputs (LOG_VERBOSE, _("\
4679 Remote file exists and could contain links to other resources -- retrieving.\n\n"));
4680 finished = false;
4681 }
4682 else
4683 {
4684 logprintf (LOG_VERBOSE, _("\
4685 Remote file exists but does not contain any link -- not retrieving.\n\n"));
4686 ret = RETROK; /* RETRUNNEEDED is not for caller. */
4687 }
4688 }
4689 else
4690 {
4691 if ((*dt & TEXTHTML) || (*dt & TEXTCSS))
4692 {
4693 logprintf (LOG_VERBOSE, _("\
4694 Remote file exists and could contain further links,\n\
4695 but recursion is disabled -- not retrieving.\n\n"));
4696 }
4697 else
4698 {
4699 logprintf (LOG_VERBOSE, _("\
4700 Remote file exists.\n\n"));
4701 }
4702 ret = RETROK; /* RETRUNNEEDED is not for caller. */
4703 }
4704
4705 if (finished)
4706 {
4707 logprintf (LOG_NONVERBOSE,
4708 _("%s URL: %s %2d %s\n"),
4709 tms, u->url, hstat.statcode,
4710 hstat.message ? quotearg_style (escape_quoting_style, hstat.message) : "");
4711 goto exit;
4712 }
4713 }
4714
4715 got_name = true;
4716 *dt &= ~HEAD_ONLY;
4717 count = 0; /* the retrieve count for HEAD is reset */
4718 continue;
4719 } /* send_head_first */
4720 } /* !got_head */
4721
4722 if (opt.useservertimestamps
4723 && (tmr != (time_t) (-1))
4724 && ((hstat.len == hstat.contlen) ||
4725 ((hstat.res == 0) && (hstat.contlen == -1))))
4726 {
4727 const char *fl = NULL;
4728 set_local_file (&fl, hstat.local_file);
4729 if (fl)
4730 {
4731 time_t newtmr = -1;
4732 /* Reparse time header, in case it's changed. */
4733 if (time_came_from_head
4734 && hstat.remote_time && hstat.remote_time[0])
4735 {
4736 newtmr = http_atotm (hstat.remote_time);
4737 if (newtmr != (time_t)-1)
4738 tmr = newtmr;
4739 }
4740 touch (fl, tmr);
4741 }
4742 }
4743 /* End of time-stamping section. */
4744
4745 tmrate = retr_rate (hstat.rd_size, hstat.dltime);
4746 total_download_time += hstat.dltime;
4747
4748 if (hstat.len == hstat.contlen)
4749 {
4750 if (*dt & RETROKF || opt.content_on_error)
4751 {
4752 bool write_to_stdout = (opt.output_document && HYPHENP (opt.output_document));
4753
4754 logprintf (LOG_VERBOSE,
4755 write_to_stdout
4756 ? _("%s (%s) - written to stdout %s[%s/%s]\n\n")
4757 : _("%s (%s) - %s saved [%s/%s]\n\n"),
4758 tms, tmrate,
4759 write_to_stdout ? "" : quote (hstat.local_file),
4760 number_to_static_string (hstat.len),
4761 number_to_static_string (hstat.contlen));
4762 logprintf (LOG_NONVERBOSE,
4763 "%s URL:%s [%s/%s] -> \"%s\" [%d]\n",
4764 tms, u->url,
4765 number_to_static_string (hstat.len),
4766 number_to_static_string (hstat.contlen),
4767 hstat.local_file, count);
4768 }
4769 ++numurls;
4770 total_downloaded_bytes += hstat.rd_size;
4771
4772 /* Remember that we downloaded the file for later ".orig" code. */
4773 if (*dt & ADDED_HTML_EXTENSION)
4774 downloaded_file (FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED, hstat.local_file);
4775 else
4776 downloaded_file (FILE_DOWNLOADED_NORMALLY, hstat.local_file);
4777
4778 ret = RETROK;
4779 goto exit;
4780 }
4781 else if (hstat.res == 0) /* No read error */
4782 {
4783 if (hstat.contlen == -1) /* We don't know how much we were supposed
4784 to get, so assume we succeeded. */
4785 {
4786 if (*dt & RETROKF || opt.content_on_error)
4787 {
4788 bool write_to_stdout = (opt.output_document && HYPHENP (opt.output_document));
4789
4790 logprintf (LOG_VERBOSE,
4791 write_to_stdout
4792 ? _("%s (%s) - written to stdout %s[%s]\n\n")
4793 : _("%s (%s) - %s saved [%s]\n\n"),
4794 tms, tmrate,
4795 write_to_stdout ? "" : quote (hstat.local_file),
4796 number_to_static_string (hstat.len));
4797 logprintf (LOG_NONVERBOSE,
4798 "%s URL:%s [%s] -> \"%s\" [%d]\n",
4799 tms, u->url, number_to_static_string (hstat.len),
4800 hstat.local_file, count);
4801 }
4802 ++numurls;
4803 total_downloaded_bytes += hstat.rd_size;
4804
4805 /* Remember that we downloaded the file for later ".orig" code. */
4806 if (*dt & ADDED_HTML_EXTENSION)
4807 downloaded_file (FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED, hstat.local_file);
4808 else
4809 downloaded_file (FILE_DOWNLOADED_NORMALLY, hstat.local_file);
4810
4811 ret = RETROK;
4812 goto exit;
4813 }
4814 else if (hstat.len < hstat.contlen) /* meaning we lost the
4815 connection too soon */
4816 {
4817 logprintf (LOG_VERBOSE,
4818 _("%s (%s) - Connection closed at byte %s. "),
4819 tms, tmrate, number_to_static_string (hstat.len));
4820 printwhat (count, opt.ntry);
4821 continue;
4822 }
4823 else if (hstat.len != hstat.restval)
4824 /* Getting here would mean reading more data than
4825 requested with content-length, which we never do. */
4826 abort ();
4827 else
4828 {
4829 /* Getting here probably means that the content-length was
4830 * _less_ than the original, local size. We should probably
4831 * truncate or re-read, or something. FIXME */
4832 ret = RETROK;
4833 goto exit;
4834 }
4835 }
4836 else /* from now on hstat.res can only be -1 */
4837 {
4838 if (hstat.contlen == -1)
4839 {
4840 logprintf (LOG_VERBOSE,
4841 _("%s (%s) - Read error at byte %s (%s)."),
4842 tms, tmrate, number_to_static_string (hstat.len),
4843 hstat.rderrmsg);
4844 printwhat (count, opt.ntry);
4845 continue;
4846 }
4847 else /* hstat.res == -1 and contlen is given */
4848 {
4849 logprintf (LOG_VERBOSE,
4850 _("%s (%s) - Read error at byte %s/%s (%s). "),
4851 tms, tmrate,
4852 number_to_static_string (hstat.len),
4853 number_to_static_string (hstat.contlen),
4854 hstat.rderrmsg);
4855 printwhat (count, opt.ntry);
4856 continue;
4857 }
4858 }
4859 /* not reached */
4860 }
4861 while (!opt.ntry || (count < opt.ntry));
4862
4863 exit:
4864 if ((ret == RETROK || opt.content_on_error) && local_file)
4865 {
4866 xfree (*local_file);
4867 /* Bugfix: Prevent SIGSEGV when hstat.local_file was left NULL
4868 (i.e. due to opt.content_disposition). */
4869 if (hstat.local_file)
4870 {
4871 *local_file = hstat.local_file;
4872 hstat.local_file = NULL;
4873 }
4874 }
4875 free_hstat (&hstat);
4876
4877 return ret;
4878 }
4879
/* Decide whether strptime() consumed a complete time specification.

   P is strptime()'s return value, i.e. a pointer to the first
   character it did not parse, or NULL when parsing failed.  After
   skipping any whitespace, the remainder is acceptable when it is
   empty, starts with `GMT', or starts with a sign followed by a digit
   (a numeric zone offset).

   In extended regexp parlance, the result is true if P matches
   "^ *(GMT|[+-][0-9]|$)" and false otherwise, including for a NULL P.  */
static bool
check_end (const char *p)
{
  const char *rest = p;

  if (rest == NULL)
    return false;

  for (; c_isspace (*rest); rest++)
    ;

  /* Nothing left over: the whole string was parsed.  */
  if (*rest == '\0')
    return true;

  /* strncmp stops at the terminator, so short tails are safe.  */
  if (strncmp (rest, "GMT", 3) == 0)
    return true;

  if ((rest[0] == '+' || rest[0] == '-') && c_isdigit (rest[1]))
    return true;

  return false;
}
4902
/* Convert the textual specification of time in TIME_STRING to the
   number of seconds since the Epoch.

   TIME_STRING can be in any of the three formats RFC2616 allows the
   HTTP servers to emit -- RFC1123-date, RFC850-date or asctime-date,
   as well as the time format used in the Set-Cookie header.
   Timezones are ignored, and should be GMT.

   Return the computed time_t representation, or -1 if the conversion
   fails (or if TIME_STRING is NULL).

   The LC_TIME locale is switched to "C" for the duration of the
   parse and restored afterwards.

   This function uses strptime with various string formats for parsing
   TIME_STRING.  This results in a parser that is not as lenient in
   interpreting TIME_STRING as I would like it to be.  Being based on
   strptime, it always allows shortened months, one-digit days, etc.,
   but due to the multitude of formats in which time can be
   represented, an ideal HTTP time parser would be even more
   forgiving.  It should completely ignore things like week days and
   concentrate only on the various forms of representing years,
   months, days, hours, minutes, and seconds.  For example, it would
   be nice if it accepted ISO 8601 out of the box.

   I've investigated free and PD code for this purpose, but none was
   usable.  getdate was big and unwieldy, and had potential copyright
   issues, or so I was informed.  Dr. Marcus Hennecke's atotm(),
   distributed with phttpd, is excellent, but we cannot use it because
   it is not assigned to the FSF.  So I stuck it with strptime.  */

time_t
http_atotm (const char *time_string)
{
  /* NOTE: Solaris strptime man page claims that %n and %t match white
     space, but that's not universally available.  Instead, we simply
     use ` ' to mean "skip all WS", which works under all strptime
     implementations I've tested.  */

  static const char *time_formats[] = {
    "%a, %d %b %Y %T",          /* rfc1123: Thu, 29 Jan 1998 22:12:57 */
    "%A, %d-%b-%y %T",          /* rfc850:  Thursday, 29-Jan-98 22:12:57 */
    "%a %b %d %T %Y",           /* asctime: Thu Jan 29 22:12:57 1998 */
    "%a, %d-%b-%Y %T"           /* cookies: Thu, 29-Jan-1998 22:12:57
                                   (used in Set-Cookie, defined in the
                                   Netscape cookie specification.) */
  };
  const char *oldlocale;
  char savedlocale[256];
  char *heaplocale = NULL;      /* copy of a name too long for savedlocale */
  const char *restore = NULL;   /* locale name to reinstate, if any */
  size_t i;
  time_t ret = (time_t) -1;

  /* Nothing to parse; bail out before touching the locale.  */
  if (!time_string)
    return ret;

  /* Solaris strptime fails to recognize English month names in
     non-English locales, which we work around by temporarily setting
     locale to C before invoking strptime.  The name returned by
     setlocale has to be copied because the next setlocale call may
     overwrite it.  */
  oldlocale = setlocale (LC_TIME, NULL);
  if (oldlocale && *oldlocale)
    {
      size_t l = strlen (oldlocale) + 1;
      if (l <= sizeof savedlocale)
        {
          memcpy (savedlocale, oldlocale, l);
          restore = savedlocale;
        }
      else
        {
          /* Too long for the stack buffer: keep a heap copy rather
             than giving up and leaving LC_TIME stuck at "C".  */
          heaplocale = xstrdup (oldlocale);
          restore = heaplocale;
        }
    }

  setlocale (LC_TIME, "C");

  for (i = 0; i < countof (time_formats); i++)
    {
      struct tm t;

      /* Some versions of strptime use the existing contents of struct
         tm to recalculate the date according to format.  Zero it out
         to prevent stack garbage from influencing strptime.  */
      xzero (t);

      if (check_end (strptime (time_string, time_formats[i], &t)))
        {
          ret = timegm (&t);
          break;
        }
    }

  /* Restore the previous locale. */
  if (restore)
    setlocale (LC_TIME, restore);
  xfree (heaplocale);

  return ret;
}
4990
4991 /* Authorization support: We support three authorization schemes:
4992
4993 * `Basic' scheme, consisting of base64-ing USER:PASSWORD string;
4994
4995 * `Digest' scheme, added by Junio Hamano <junio@twinsun.com>,
4996 consisting of answering to the server's challenge with the proper
4997 MD5 digests.
4998
4999 * `NTLM' ("NT Lan Manager") scheme, based on code written by Daniel
5000 Stenberg for libcurl. Like digest, NTLM is based on a
5001 challenge-response mechanism, but unlike digest, it is non-standard
5002 (authenticates TCP connections rather than requests), undocumented
5003 and Microsoft-specific. */
5004
/* Build the value of an authorization header for the `Basic' scheme:
   the literal "Basic " followed by the base64 encoding of
   "USER:PASSWD".

   Both intermediate strings live in 256-byte stack buffers when they
   fit and in xmalloc'ed storage otherwise.  The returned string is
   produced by concat_strings.  */

static char *
basic_authentication_encode (const char *user, const char *passwd)
{
  char plain_stack[256], b64_stack[256];
  const size_t plain_len = strlen (user) + 1 + strlen (passwd);
  char *plain = (plain_len < sizeof (plain_stack)
                 ? plain_stack
                 : xmalloc (plain_len + 1));
  char *b64 = (BASE64_LENGTH (plain_len) < sizeof (b64_stack)
               ? b64_stack
               : xmalloc (BASE64_LENGTH (plain_len) + 1));
  char *header;

  /* "user:passwd" is exactly plain_len characters plus the NUL.  */
  sprintf (plain, "%s:%s", user, passwd);
  wget_base64_encode (plain, plain_len, b64);

  header = concat_strings ("Basic ", b64, (char *) 0);

  /* Only heap-backed scratch buffers need releasing.  */
  if (b64 != b64_stack)
    xfree (b64);
  if (plain != plain_stack)
    xfree (plain);

  return header;
}
5039
/* Advance the pointer lvalue X past any run of characters for which
   c_isspace is true.  X is evaluated repeatedly, so it must be free
   of side effects.  */
#define SKIP_WS(x) do {                         \
  for (; c_isspace (*(x)); ++(x))               \
    ;                                           \
} while (0)
5044
5045 #ifdef ENABLE_DIGEST
5046 /* Dump the hexadecimal representation of HASH to BUF. HASH should be
5047 an array of 16 bytes containing the hash keys, and BUF should be a
5048 buffer of 33 writable characters (32 for hex digits plus one for
5049 zero termination). */
5050 static void
dump_hash(char * buf,const unsigned char * hash)5051 dump_hash (char *buf, const unsigned char *hash)
5052 {
5053 int i;
5054
5055 for (i = 0; i < MD5_DIGEST_SIZE; i++, hash++)
5056 {
5057 *buf++ = XNUM_TO_digit (*hash >> 4);
5058 *buf++ = XNUM_TO_digit (*hash & 0xf);
5059 }
5060 *buf = '\0';
5061 }
5062
5063 /* Take the line apart to find the challenge, and compose a digest
5064 authorization header. See RFC2069 section 2.1.2. */
5065 static char *
digest_authentication_encode(const char * au,const char * user,const char * passwd,const char * method,const char * path,uerr_t * auth_err)5066 digest_authentication_encode (const char *au, const char *user,
5067 const char *passwd, const char *method,
5068 const char *path, uerr_t *auth_err)
5069 {
5070 static char *realm, *opaque, *nonce, *qop, *algorithm;
5071 static struct {
5072 const char *name;
5073 char **variable;
5074 } options[] = {
5075 { "realm", &realm },
5076 { "opaque", &opaque },
5077 { "nonce", &nonce },
5078 { "qop", &qop },
5079 { "algorithm", &algorithm }
5080 };
5081 char cnonce[16] = "";
5082 char *res = NULL;
5083 int res_len;
5084 size_t res_size;
5085 param_token name, value;
5086
5087
5088 realm = opaque = nonce = algorithm = qop = NULL;
5089
5090 au += 6; /* skip over `Digest' */
5091 while (extract_param (&au, &name, &value, ',', NULL))
5092 {
5093 size_t i;
5094 size_t namelen = name.e - name.b;
5095 for (i = 0; i < countof (options); i++)
5096 if (namelen == strlen (options[i].name)
5097 && 0 == strncmp (name.b, options[i].name,
5098 namelen))
5099 {
5100 *options[i].variable = strdupdelim (value.b, value.e);
5101 break;
5102 }
5103 }
5104
5105 if (qop && strcmp (qop, "auth"))
5106 {
5107 logprintf (LOG_NOTQUIET, _("Unsupported quality of protection '%s'.\n"), qop);
5108 xfree (qop); /* force freeing mem and continue */
5109 }
5110 else if (algorithm && strcmp (algorithm,"MD5") && strcmp (algorithm,"MD5-sess"))
5111 {
5112 logprintf (LOG_NOTQUIET, _("Unsupported algorithm '%s'.\n"), algorithm);
5113 xfree (algorithm); /* force freeing mem and continue */
5114 }
5115
5116 if (!realm || !nonce || !user || !passwd || !path || !method)
5117 {
5118 *auth_err = ATTRMISSING;
5119 goto cleanup;
5120 }
5121
5122 /* Calculate the digest value. */
5123 {
5124 struct md5_ctx ctx;
5125 unsigned char hash[MD5_DIGEST_SIZE];
5126 char a1buf[MD5_DIGEST_SIZE * 2 + 1], a2buf[MD5_DIGEST_SIZE * 2 + 1];
5127 char response_digest[MD5_DIGEST_SIZE * 2 + 1];
5128
5129 /* A1BUF = H(user ":" realm ":" password) */
5130 md5_init_ctx (&ctx);
5131 md5_process_bytes ((unsigned char *)user, strlen (user), &ctx);
5132 md5_process_bytes ((unsigned char *)":", 1, &ctx);
5133 md5_process_bytes ((unsigned char *)realm, strlen (realm), &ctx);
5134 md5_process_bytes ((unsigned char *)":", 1, &ctx);
5135 md5_process_bytes ((unsigned char *)passwd, strlen (passwd), &ctx);
5136 md5_finish_ctx (&ctx, hash);
5137
5138 dump_hash (a1buf, hash);
5139
5140 if (algorithm && !strcmp (algorithm, "MD5-sess"))
5141 {
5142 /* A1BUF = H( H(user ":" realm ":" password) ":" nonce ":" cnonce ) */
5143 snprintf (cnonce, sizeof (cnonce), "%08x",
5144 (unsigned) random_number (INT_MAX));
5145
5146 md5_init_ctx (&ctx);
5147 /* md5_process_bytes (hash, MD5_DIGEST_SIZE, &ctx); */
5148 md5_process_bytes (a1buf, MD5_DIGEST_SIZE * 2, &ctx);
5149 md5_process_bytes ((unsigned char *)":", 1, &ctx);
5150 md5_process_bytes ((unsigned char *)nonce, strlen (nonce), &ctx);
5151 md5_process_bytes ((unsigned char *)":", 1, &ctx);
5152 md5_process_bytes ((unsigned char *)cnonce, strlen (cnonce), &ctx);
5153 md5_finish_ctx (&ctx, hash);
5154
5155 dump_hash (a1buf, hash);
5156 }
5157
5158 /* A2BUF = H(method ":" path) */
5159 md5_init_ctx (&ctx);
5160 md5_process_bytes ((unsigned char *)method, strlen (method), &ctx);
5161 md5_process_bytes ((unsigned char *)":", 1, &ctx);
5162 md5_process_bytes ((unsigned char *)path, strlen (path), &ctx);
5163 md5_finish_ctx (&ctx, hash);
5164 dump_hash (a2buf, hash);
5165
5166 if (qop && !strcmp (qop, "auth"))
5167 {
5168 /* RFC 2617 Digest Access Authentication */
5169 /* generate random hex string */
5170 if (!*cnonce)
5171 snprintf (cnonce, sizeof (cnonce), "%08x",
5172 (unsigned) random_number (INT_MAX));
5173
5174 /* RESPONSE_DIGEST = H(A1BUF ":" nonce ":" noncecount ":" clientnonce ":" qop ": " A2BUF) */
5175 md5_init_ctx (&ctx);
5176 md5_process_bytes ((unsigned char *)a1buf, MD5_DIGEST_SIZE * 2, &ctx);
5177 md5_process_bytes ((unsigned char *)":", 1, &ctx);
5178 md5_process_bytes ((unsigned char *)nonce, strlen (nonce), &ctx);
5179 md5_process_bytes ((unsigned char *)":", 1, &ctx);
5180 md5_process_bytes ((unsigned char *)"00000001", 8, &ctx); /* TODO: keep track of server nonce values */
5181 md5_process_bytes ((unsigned char *)":", 1, &ctx);
5182 md5_process_bytes ((unsigned char *)cnonce, strlen (cnonce), &ctx);
5183 md5_process_bytes ((unsigned char *)":", 1, &ctx);
5184 md5_process_bytes ((unsigned char *)qop, strlen (qop), &ctx);
5185 md5_process_bytes ((unsigned char *)":", 1, &ctx);
5186 md5_process_bytes ((unsigned char *)a2buf, MD5_DIGEST_SIZE * 2, &ctx);
5187 md5_finish_ctx (&ctx, hash);
5188 }
5189 else
5190 {
5191 /* RFC 2069 Digest Access Authentication */
5192 /* RESPONSE_DIGEST = H(A1BUF ":" nonce ":" A2BUF) */
5193 md5_init_ctx (&ctx);
5194 md5_process_bytes ((unsigned char *)a1buf, MD5_DIGEST_SIZE * 2, &ctx);
5195 md5_process_bytes ((unsigned char *)":", 1, &ctx);
5196 md5_process_bytes ((unsigned char *)nonce, strlen (nonce), &ctx);
5197 md5_process_bytes ((unsigned char *)":", 1, &ctx);
5198 md5_process_bytes ((unsigned char *)a2buf, MD5_DIGEST_SIZE * 2, &ctx);
5199 md5_finish_ctx (&ctx, hash);
5200 }
5201
5202 dump_hash (response_digest, hash);
5203
5204 res_size = strlen (user)
5205 + strlen (realm)
5206 + strlen (nonce)
5207 + strlen (path)
5208 + 2 * MD5_DIGEST_SIZE /*strlen (response_digest)*/
5209 + (opaque ? strlen (opaque) : 0)
5210 + (algorithm ? strlen (algorithm) : 0)
5211 + (qop ? 128: 0)
5212 + strlen (cnonce)
5213 + 128;
5214
5215 res = xmalloc (res_size);
5216
5217 if (qop && !strcmp (qop, "auth"))
5218 {
5219 res_len = snprintf (res, res_size, "Digest "\
5220 "username=\"%s\", realm=\"%s\", nonce=\"%s\", uri=\"%s\", response=\"%s\""\
5221 ", qop=auth, nc=00000001, cnonce=\"%s\"",
5222 user, realm, nonce, path, response_digest, cnonce);
5223
5224 }
5225 else
5226 {
5227 res_len = snprintf (res, res_size, "Digest "\
5228 "username=\"%s\", realm=\"%s\", nonce=\"%s\", uri=\"%s\", response=\"%s\"",
5229 user, realm, nonce, path, response_digest);
5230 }
5231
5232 if (opaque)
5233 {
5234 res_len += snprintf (res + res_len, res_size - res_len, ", opaque=\"%s\"", opaque);
5235 }
5236
5237 if (algorithm)
5238 {
5239 snprintf (res + res_len, res_size - res_len, ", algorithm=\"%s\"", algorithm);
5240 }
5241 }
5242
5243 cleanup:
5244 xfree (realm);
5245 xfree (opaque);
5246 xfree (nonce);
5247 xfree (qop);
5248 xfree (algorithm);
5249
5250 return res;
5251 }
5252 #endif /* ENABLE_DIGEST */
5253
/* Length of the string literal LITERAL, not counting the terminating
   \0 that sizeof includes.  The concatenation with an empty string
   makes the macro fail to compile when it is handed anything other
   than a string literal (for a char pointer, plain sizeof would
   silently yield the pointer size rather than the string length).  */
#define STRSIZE(literal) (sizeof ("" literal) - 1)
5257
/* Whether the chars in [b, e) begin with the string literal LITERAL
   and the match either reaches E exactly or is followed by
   whitespace.  The comparison is case-insensitive.

   B and E may be arbitrary pointer expressions (every use is
   parenthesized), but they are evaluated more than once and so must
   be free of side effects.  */
#define STARTS(literal, b, e)                                   \
  (((e) > (b))                                                  \
   && ((size_t) ((e) - (b))) >= STRSIZE (literal)               \
   && 0 == c_strncasecmp ((b), literal, STRSIZE (literal))      \
   && ((size_t) ((e) - (b)) == STRSIZE (literal)                \
       || c_isspace ((b)[STRSIZE (literal)])))
5267
/* Return true when the text in [HDRBEG, HDREND) starts with the name
   of an authentication scheme this build handles: "Basic" always,
   "Digest" when ENABLE_DIGEST is defined and "NTLM" when ENABLE_NTLM
   is defined.  Matching is done by STARTS.  */
static bool
known_authentication_scheme_p (const char *hdrbeg, const char *hdrend)
{
  if (STARTS ("Basic", hdrbeg, hdrend))
    return true;

#ifdef ENABLE_DIGEST
  if (STARTS ("Digest", hdrbeg, hdrend))
    return true;
#endif

#ifdef ENABLE_NTLM
  if (STARTS ("NTLM", hdrbeg, hdrend))
    return true;
#endif

  return false;
}
5280
5281 #undef STARTS
5282
5283 /* Create the HTTP authorization request header. When the
5284 `WWW-Authenticate' response header is seen, according to the
5285 authorization scheme specified in that header (`Basic' and `Digest'
5286 are supported by the current implementation), produce an
5287 appropriate HTTP authorization request header. */
5288 static char *
create_authorization_line(const char * au,const char * user,const char * passwd,const char * method,const char * path,bool * finished,uerr_t * auth_err)5289 create_authorization_line (const char *au, const char *user,
5290 const char *passwd, const char *method,
5291 const char *path, bool *finished, uerr_t *auth_err)
5292 {
5293 /* We are called only with known schemes, so we can dispatch on the
5294 first letter. */
5295 switch (c_toupper (*au))
5296 {
5297 case 'B': /* Basic */
5298 *finished = true;
5299 return basic_authentication_encode (user, passwd);
5300 #ifdef ENABLE_DIGEST
5301 case 'D': /* Digest */
5302 *finished = true;
5303 return digest_authentication_encode (au, user, passwd, method, path, auth_err);
5304 #endif
5305 #ifdef ENABLE_NTLM
5306 case 'N': /* NTLM */
5307 if (!ntlm_input (&pconn.ntlm, au))
5308 {
5309 *finished = true;
5310 return NULL;
5311 }
5312 return ntlm_output (&pconn.ntlm, user, passwd, finished);
5313 #endif
5314 default:
5315 /* We shouldn't get here -- this function should be only called
5316 with values approved by known_authentication_scheme_p. */
5317 abort ();
5318 }
5319 }
5320
5321 static void
load_cookies(void)5322 load_cookies (void)
5323 {
5324 if (!wget_cookie_jar)
5325 wget_cookie_jar = cookie_jar_new ();
5326 if (opt.cookies_input && !cookies_loaded_p)
5327 {
5328 cookie_jar_load (wget_cookie_jar, opt.cookies_input);
5329 cookies_loaded_p = true;
5330 }
5331 }
5332
5333 void
save_cookies(void)5334 save_cookies (void)
5335 {
5336 if (wget_cookie_jar)
5337 cookie_jar_save (wget_cookie_jar, opt.cookies_output);
5338 }
5339
5340 #if defined DEBUG_MALLOC || defined TESTING
5341 void
http_cleanup(void)5342 http_cleanup (void)
5343 {
5344 if (pconn_active)
5345 invalidate_persistent ();
5346
5347 if (wget_cookie_jar)
5348 {
5349 cookie_jar_delete (wget_cookie_jar);
5350 wget_cookie_jar = NULL;
5351 }
5352
5353 if (basic_authed_hosts)
5354 {
5355 hash_table_iterator iter;
5356 for (hash_table_iterate (basic_authed_hosts, &iter); hash_table_iter_next (&iter); )
5357 {
5358 xfree (iter.key);
5359 }
5360 hash_table_destroy (basic_authed_hosts);
5361 basic_authed_hosts = NULL;
5362 }
5363 }
5364 #endif
5365
5366 void
ensure_extension(struct http_stat * hs,const char * ext,int * dt)5367 ensure_extension (struct http_stat *hs, const char *ext, int *dt)
5368 {
5369 char *last_period_in_local_filename = strrchr (hs->local_file, '.');
5370 char shortext[8];
5371 int len;
5372 shortext[0] = '\0';
5373 len = strlen (ext);
5374 if (len == 5)
5375 {
5376 memcpy (shortext, ext, len - 1);
5377 shortext[len - 1] = '\0';
5378 }
5379
5380 if (last_period_in_local_filename == NULL
5381 || !(0 == strcasecmp (last_period_in_local_filename, shortext)
5382 || 0 == strcasecmp (last_period_in_local_filename, ext)))
5383 {
5384 int local_filename_len = strlen (hs->local_file);
5385 /* Resize the local file, allowing for ".html" preceded by
5386 optional ".NUMBER". */
5387 hs->local_file = xrealloc (hs->local_file,
5388 local_filename_len + 24 + len);
5389 strcpy (hs->local_file + local_filename_len, ext);
5390 /* If clobbering is not allowed and the file, as named,
5391 exists, tack on ".NUMBER.html" instead. */
5392 if (!ALLOW_CLOBBER && file_exists_p (hs->local_file, NULL))
5393 {
5394 int ext_num = 1;
5395 do
5396 sprintf (hs->local_file + local_filename_len,
5397 ".%d%s", ext_num++, ext);
5398 while (file_exists_p (hs->local_file, NULL));
5399 }
5400 *dt |= ADDED_HTML_EXTENSION;
5401 }
5402 }
5403
5404 #ifdef TESTING
5405
5406 const char *
test_parse_range_header(void)5407 test_parse_range_header (void)
5408 {
5409 unsigned i;
5410 static const struct {
5411 const char * rangehdr;
5412 const wgint firstbyte;
5413 const wgint lastbyte;
5414 const wgint length;
5415 const bool shouldPass;
5416 } test_array[] = {
5417 { "bytes 0-1000/1000", 0, 1000, 1000, false },
5418 { "bytes 0-999/1000", 0, 999, 1000, true },
5419 { "bytes 100-99/1000", 100, 99, 1000, false },
5420 { "bytes 100-100/1000", 100, 100, 1000, true },
5421 { "bytes 0-1000/100000000", 0, 1000, 100000000, true },
5422 { "bytes 1-999/1000", 1, 999, 1000, true },
5423 { "bytes 42-1233/1234", 42, 1233, 1234, true },
5424 { "bytes 42-1233/*", 42, 1233, -1, true },
5425 { "bytes 0-2147483648/2147483649", 0, 2147483648U, 2147483649U, true },
5426 { "bytes 2147483648-4294967296/4294967297", 2147483648U, 4294967296ULL, 4294967297ULL, true },
5427 };
5428
5429 wgint firstbyteptr[sizeof(wgint)];
5430 wgint lastbyteptr[sizeof(wgint)];
5431 wgint lengthptr[sizeof(wgint)];
5432 bool result;
5433 for (i = 0; i < countof (test_array); i++)
5434 {
5435 result = parse_content_range (test_array[i].rangehdr, firstbyteptr, lastbyteptr, lengthptr);
5436 #if 0
5437 printf ("%ld %ld\n", test_array[i].firstbyte, *firstbyteptr);
5438 printf ("%ld %ld\n", test_array[i].lastbyte, *lastbyteptr);
5439 printf ("%ld %ld\n", test_array[i].length, *lengthptr);
5440 printf ("\n");
5441 #endif
5442 mu_assert ("test_parse_range_header: False Negative", result == test_array[i].shouldPass);
5443 mu_assert ("test_parse_range_header: Bad parse", test_array[i].firstbyte == *firstbyteptr &&
5444 test_array[i].lastbyte == *lastbyteptr &&
5445 test_array[i].length == *lengthptr);
5446 }
5447
5448 return NULL;
5449 }
5450
/* Unit test for parse_content_disposition.  Each row of the table
   gives a header value, the file name expected from it (NULL when
   none) and the expected return value of the parser.  Returns NULL
   when every row behaves as expected, otherwise the message of the
   failed assertion (via mu_assert).  */
const char *
test_parse_content_disposition (void)
{
  unsigned i;
  static const struct {
    const char *hdrval;
    const char *filename;
    bool result;
  } test_array[] = {
    { "filename=\"file.ext\"", "file.ext", true },
    { "attachment; filename=\"file.ext\"", "file.ext", true },
    { "attachment; filename=\"file.ext\"; dummy", "file.ext", true },
    { "attachment", NULL, false },
    { "attachment; filename*=UTF-8'en-US'hello.txt", "hello.txt", true },
    { "attachment; filename*0=\"hello\"; filename*1=\"world.txt\"",
      "helloworld.txt", true },
    { "attachment; filename=\"A.ext\"; filename*=\"B.ext\"", "B.ext", true },
    { "attachment; filename*=\"A.ext\"; filename*0=\"B\"; filename*1=\"B.ext\"",
      "A.ext", true },
    { "filename**0=\"A\"; filename**1=\"A.ext\"; filename*0=\"B\";\
filename*1=\"B\"", "AA.ext", true },
  };

  for (i = 0; i < countof (test_array); ++i)
    {
      /* Start from NULL so the xfree below is well-defined even if
         the parser leaves the pointer untouched.  */
      char *filename = NULL;
      bool res = parse_content_disposition (test_array[i].hdrval, &filename);

      /* The name is only compared when the parser reported success;
         a NULL name on success counts as a failure rather than being
         handed to strcmp.  */
      bool ok = (res == test_array[i].result
                 && (!res
                     || (filename != NULL
                         && 0 == strcmp (test_array[i].filename, filename))));

      /* Release the name before asserting: mu_assert returns from
         this function on failure, which would otherwise leak it.  */
      xfree (filename);

      mu_assert ("test_parse_content_disposition: wrong result", ok);
    }

  return NULL;
}
5490
5491 #endif /* TESTING */
5492
5493 /*
5494 * vim: et sts=2 sw=2 cino+={s
5495 */
5496