/* Conversion of links to local files.
   Copyright (C) 2003-2011, 2014-2015, 2018-2021 Free Software
   Foundation, Inc.

This file is part of GNU Wget.

GNU Wget is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3 of the License, or
(at your option) any later version.

GNU Wget is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with Wget.  If not, see <http://www.gnu.org/licenses/>.

Additional permission under GNU GPL version 3 section 7

If you modify this program, or any covered work, by linking or
combining it with the OpenSSL project's OpenSSL library (or a
modified version of that library), containing parts covered by the
terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
grants you additional permission to convey the resulting work.
Corresponding Source for a non-source form of such a combination
shall include the source code for the parts of OpenSSL used as well
as that of the covered work.  */

#include "wget.h"

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <errno.h>
#include <assert.h>
#include "convert.h"
#include "url.h"
#include "recur.h"
#include "utils.h"
#include "hash.h"
#include "ptimer.h"
#include "res.h"
#include "html-url.h"
#include "css-url.h"
#include "iri.h"
#include "xstrndup.h"

static struct hash_table *dl_file_url_map;
struct hash_table *dl_url_file_map;
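
/* A sketch of how the two maps above relate (illustrative only):

     dl_file_url_map:  local file name -> URL it was downloaded from
     dl_url_file_map:  URL -> local file name

   dl_url_file_map may hold several URLs for one file, because
   redirections are registered as additional entries (see
   register_redirection below).  */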

/* Set of HTML/CSS files downloaded in this Wget run, used for link
   conversion after Wget is done.  */
struct hash_table *downloaded_html_set;
struct hash_table *downloaded_css_set;

static void convert_links (const char *, struct urlpos *);


static void
convert_links_in_hashtable (struct hash_table *downloaded_set,
                            int is_css,
                            int *file_count)
{
  int i, cnt = 0;
  char *arr[1024], **file_array;

  if (!downloaded_set || (cnt = hash_table_count (downloaded_set)) == 0)
    return;

  if (cnt <= (int) countof (arr))
    file_array = arr;
  else
    file_array = xmalloc (cnt * sizeof (arr[0]));

  string_set_to_array (downloaded_set, file_array);

  for (i = 0; i < cnt; i++)
    {
      struct urlpos *urls, *cur_url;
      char *url;
      char *file = file_array[i];

      /* Determine the URL of the file.  get_urls_{html,css} will need
         it.  */
      url = hash_table_get (dl_file_url_map, file);
      if (!url)
        {
          DEBUGP (("Apparently %s has been removed.\n", file));
          continue;
        }

      DEBUGP (("Scanning %s (from %s)\n", file, url));

      /* Parse the file...  */
      urls = is_css ? get_urls_css_file (file, url)
                    : get_urls_html (file, url, NULL, NULL);

      /* We don't respect meta_disallow_follow here because, even if
         the file is not followed, we might still want to convert the
         links that have been followed from other files.  */

      for (cur_url = urls; cur_url; cur_url = cur_url->next)
        {
          char *local_name;
          struct url *u;
          struct iri *pi;

          if (cur_url->link_base_p)
            {
              /* Base references have been resolved by our parser, so
                 we turn the base URL into an empty string.  (Perhaps
                 we should remove the tag entirely?)  */
              cur_url->convert = CO_NULLIFY_BASE;
              continue;
            }

          /* We decide the direction of conversion according to whether
             a URL was downloaded.  Downloaded URLs will be converted
             ABS2REL, whereas non-downloaded will be converted REL2ABS.  */

          pi = iri_new ();
          set_uri_encoding (pi, opt.locale, true);

          u = url_parse (cur_url->url->url, NULL, pi, true);
          if (!u)
            {
              /* Free the IRI allocated above; skipping here without
                 freeing it would leak it.  */
              iri_free (pi);
              continue;
            }

          local_name = hash_table_get (dl_url_file_map, u->url);

          /* Decide on the conversion type.  */
          if (local_name)
            {
              /* We've downloaded this URL.  Convert it to relative
                 form.  We do this even if the URL already is in
                 relative form, because our directory structure may
                 not be identical to that on the server (think `-nd',
                 `--cut-dirs', etc.).  If --convert-file-only was passed,
                 we only convert the basename portion of the URL.  */
              cur_url->convert = (opt.convert_file_only
                                  ? CO_CONVERT_BASENAME_ONLY
                                  : CO_CONVERT_TO_RELATIVE);
              cur_url->local_name = xstrdup (local_name);
              DEBUGP (("will convert url %s to local %s\n", u->url, local_name));
            }
          else
            {
              /* We haven't downloaded this URL.  If it's not already
                 complete (including a full host name), convert it to
                 that form, so it can be reached while browsing this
                 HTML locally.  */
              if (!cur_url->link_complete_p)
                cur_url->convert = CO_CONVERT_TO_COMPLETE;
              cur_url->local_name = NULL;
              DEBUGP (("will convert url %s to complete\n", u->url));
            }

          url_free (u);
          iri_free (pi);
        }

      /* Convert the links in the file.  */
      convert_links (file, urls);
      ++*file_count;

      /* Free the data.  */
      free_urlpos (urls);
    }

  if (file_array != arr)
    xfree (file_array);
}

/* This function is called when the retrieval is done to convert the
   links that have been downloaded.  It has to be called at the end of
   the retrieval, because only then does Wget know conclusively which
   URLs have been downloaded, and which not, so it can tell which
   direction to convert to.

   The "direction" means that the URLs to the files that have been
   downloaded get converted to the relative URL which will point to
   that file.  And the other URLs get converted to the remote URL on
   the server.

   All the downloaded HTML and CSS files are kept in
   downloaded_html_set and downloaded_css_set, and the downloaded URLs
   in dl_url_file_map.  All the information is extracted from these
   sets and maps.  */

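/* Illustrative sketch (not compiled): suppose "http://host/" was
   downloaded to "index.html" and registered via

     register_download ("http://host/", "index.html");

   while "http://host/away.html" was never fetched.  After
   convert_all_links(), a reference to the downloaded
   "http://host/img/logo.png" becomes the relative "img/logo.png",
   and the reference to "away.html" becomes the absolute
   "http://host/away.html", so both keep working when the mirror is
   browsed locally.  */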
void
convert_all_links (void)
{
  double secs;
  int file_count = 0;

  struct ptimer *timer = ptimer_new ();

  convert_links_in_hashtable (downloaded_html_set, 0, &file_count);
  convert_links_in_hashtable (downloaded_css_set, 1, &file_count);

  secs = ptimer_measure (timer);
  logprintf (LOG_VERBOSE, _("Converted links in %d files in %s seconds.\n"),
             file_count, print_decimal (secs));

  ptimer_destroy (timer);
}

static void write_backup_file (const char *, downloaded_file_t);
static const char *replace_plain (const char *, int, FILE *, const char *);
static const char *replace_attr (const char *, int, FILE *, const char *);
static const char *replace_attr_refresh_hack (const char *, int, FILE *,
                                              const char *, int);
static char *local_quote_string (const char *, bool);
static char *construct_relative (const char *, const char *);
static char *convert_basename (const char *, const struct urlpos *);

/* Change the links in one file.  LINKS is a list of links in the
   document, along with their positions and the desired direction of
   the conversion.  */
static void
convert_links (const char *file, struct urlpos *links)
{
  struct file_memory *fm;
  FILE *fp;
  const char *p;
  downloaded_file_t downloaded_file_return;

  struct urlpos *link;
  int to_url_count = 0, to_file_count = 0;

  logprintf (LOG_VERBOSE, _("Converting links in %s... "), file);

  {
    /* First we do a "dry run": go through the list LINKS and see
       whether any URL needs to be converted in the first place.  If
       not, just leave the file alone.  */
    int dry_count = 0;
    struct urlpos *dry;
    for (dry = links; dry; dry = dry->next)
      if (dry->convert != CO_NOCONVERT)
        ++dry_count;
    if (!dry_count)
      {
        logputs (LOG_VERBOSE, _("nothing to do.\n"));
        return;
      }
    logprintf (LOG_VERBOSE, _("%d.\n"), dry_count);
  }

  fm = wget_read_file (file);
  if (!fm)
    {
      logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
                 file, strerror (errno));
      return;
    }

  downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file);
  if (opt.backup_converted && downloaded_file_return)
    write_backup_file (file, downloaded_file_return);

  /* Before opening the file for writing, unlink the file.  This is
     important if the data in FM is mapped.  In such case, nulling the
     file, which is what fopen() below does, would make us read all
     zeroes from the mapped region.  */
  if (unlink (file) < 0 && errno != ENOENT)
    {
      logprintf (LOG_NOTQUIET, _("Unable to delete %s: %s\n"),
                 quote (file), strerror (errno));
      wget_read_file_free (fm);
      return;
    }
  /* Now open the file for writing.  */
  fp = fopen (file, "wb");
  if (!fp)
    {
      logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
                 file, strerror (errno));
      wget_read_file_free (fm);
      return;
    }

  /* Here we loop through all the URLs in file, replacing those of
     them that are downloaded with relative references.  */
  p = fm->content;
  for (link = links; link; link = link->next)
    {
      char *url_start = fm->content + link->pos;

      if (link->pos >= fm->length)
        {
          DEBUGP (("Something strange is going on.  Please investigate."));
          break;
        }
      /* If the URL is not to be converted, skip it.  */
      if (link->convert == CO_NOCONVERT)
        {
          DEBUGP (("Skipping %s at position %d.\n", link->url->url, link->pos));
          continue;
        }

      /* Echo the file contents, up to the offending URL's opening
         quote, to the outfile.  */
      fwrite (p, 1, url_start - p, fp);
      p = url_start;

      switch (link->convert)
        {
        case CO_CONVERT_TO_RELATIVE:
          /* Convert absolute URL to relative.  */
          if (link->local_name)
            {
              char *newname = construct_relative (file, link->local_name);
              char *quoted_newname = local_quote_string (newname,
                                                         link->link_css_p);

              if (link->link_css_p || link->link_noquote_html_p)
                p = replace_plain (p, link->size, fp, quoted_newname);
              else if (!link->link_refresh_p)
                p = replace_attr (p, link->size, fp, quoted_newname);
              else
                p = replace_attr_refresh_hack (p, link->size, fp, quoted_newname,
                                               link->refresh_timeout);

              DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
                       link->url->url, newname, link->pos, file));

              xfree (newname);
              xfree (quoted_newname);
              ++to_file_count;
            }
          break;
        case CO_CONVERT_BASENAME_ONLY:
          {
            char *newname = convert_basename (p, link);
            char *quoted_newname = local_quote_string (newname, link->link_css_p);

            if (link->link_css_p || link->link_noquote_html_p)
              p = replace_plain (p, link->size, fp, quoted_newname);
            else if (!link->link_refresh_p)
              p = replace_attr (p, link->size, fp, quoted_newname);
            else
              p = replace_attr_refresh_hack (p, link->size, fp, quoted_newname,
                                             link->refresh_timeout);

            DEBUGP (("Converted file part only: %s to %s at position %d in %s.\n",
                     link->url->url, newname, link->pos, file));

            xfree (newname);
            xfree (quoted_newname);
            ++to_file_count;

            break;
          }
        case CO_CONVERT_TO_COMPLETE:
          /* Convert the link to absolute URL.  */
          {
            char *newlink = link->url->url;
            char *quoted_newlink = html_quote_string (newlink);

            if (link->link_css_p || link->link_noquote_html_p)
              p = replace_plain (p, link->size, fp, newlink);
            else if (!link->link_refresh_p)
              p = replace_attr (p, link->size, fp, quoted_newlink);
            else
              p = replace_attr_refresh_hack (p, link->size, fp, quoted_newlink,
                                             link->refresh_timeout);

            DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
                     newlink, link->pos, file));

            xfree (quoted_newlink);
            ++to_url_count;
            break;
          }
        case CO_NULLIFY_BASE:
          /* Change the base href to "".  */
          p = replace_attr (p, link->size, fp, "");
          break;
        case CO_NOCONVERT:
          abort ();
          break;
        }
    }

  /* Output the rest of the file.  */
  if (p - fm->content < fm->length)
    fwrite (p, 1, fm->length - (p - fm->content), fp);
  fclose (fp);
  wget_read_file_free (fm);

  logprintf (LOG_VERBOSE, "%d-%d\n", to_file_count, to_url_count);
}

/* Construct and return a link that points from BASEFILE to LINKFILE.
   Both files should be local file names, BASEFILE of the referring
   file, and LINKFILE of the referred file.

   Examples:

   cr("foo", "bar")         -> "bar"
   cr("A/foo", "A/bar")     -> "bar"
   cr("A/foo", "A/B/bar")   -> "B/bar"
   cr("A/X/foo", "A/Y/bar") -> "../Y/bar"
   cr("X/", "Y/bar")        -> "../Y/bar" (trailing slash does matter in BASE)

   Both files should be absolute or relative, otherwise strange
   results might ensue.  The function makes no special efforts to
   handle "." and ".." in links, so make sure they're not there
   (e.g. using path_simplify).  */

static char *
construct_relative (const char *basefile, const char *linkfile)
{
  char *link;
  int basedirs;
  const char *b, *l;
  int i, start;

  /* First, skip the initial directory components common to both
     files.  */
  start = 0;
  for (b = basefile, l = linkfile; *b == *l && *b != '\0'; ++b, ++l)
    {
      if (*b == '/')
        start = (b - basefile) + 1;
    }
  basefile += start;
  linkfile += start;

  /* With common directories out of the way, the situation we have is
     as follows:
         b - b1/b2/[...]/bfile
         l - l1/l2/[...]/lfile

     The link we're constructing needs to be:
         lnk - ../../l1/l2/[...]/lfile

     Where the number of ".."'s equals the number of bN directory
     components in B.  */

  /* Count the directory components in B.  */
  basedirs = 0;
  for (b = basefile; *b; b++)
    {
      if (*b == '/')
        ++basedirs;
    }

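  /* If no ".." components are needed and LINKFILE contains a ':'
     before any '/', the bare result ("foo:bar") would be parsed by a
     browser as a URL scheme.  Prefix it with "./" so it stays a
     relative path.  */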
  if (!basedirs && (b = strpbrk (linkfile, "/:")) && *b == ':')
    {
      link = xmalloc (2 + strlen (linkfile) + 1);
      memcpy (link, "./", 2);
      strcpy (link + 2, linkfile);
    }
  else
    {
      /* Construct LINK as explained above.  */
      link = xmalloc (3 * basedirs + strlen (linkfile) + 1);
      for (i = 0; i < basedirs; i++)
        memcpy (link + 3 * i, "../", 3);
      strcpy (link + 3 * i, linkfile);
    }

  return link;
}

/* Construct and return a "transparent proxy" URL
   reflecting changes made by --adjust-extension to the file component
   (i.e., "basename") of the original URL, but leaving the "dirname"
   of the URL (protocol://hostname... portion) untouched.

   Think: populating a squid cache via a recursive wget scrape, where
   changing URLs to work locally with "file://..." is NOT desirable.

   Example:

   if
     p = "//foo.com/bar.cgi?xyz"
   and
     link->local_name = "docroot/foo.com/bar.cgi?xyz.css"
   then

     convert_basename (p, link);
   will return
     "//foo.com/bar.cgi?xyz.css"

   Essentially, we do s/$(basename orig_url)/$(basename link->local_name)/
   */
static char *
convert_basename (const char *p, const struct urlpos *link)
{
  int len = link->size;
  char *url = NULL;
  char *org_basename = NULL, *local_basename;
  char *result = NULL;

  if (*p == '"' || *p == '\'')
    {
      len -= 2;
      p++;
    }

  url = xstrndup (p, len);

  org_basename = strrchr (url, '/');
  if (org_basename)
    org_basename++;
  else
    org_basename = url;

  local_basename = link->local_name ? strrchr (link->local_name, '/') : NULL;
  if (local_basename)
    local_basename++;
  else
    local_basename = url;

  /*
   * If the basenames differ, graft the adjusted basename (local_basename)
   * onto the original URL.
   */
  if (strcmp (org_basename, local_basename) == 0)
    result = url;
  else
    {
      result = uri_merge (url, local_basename);
      xfree (url);
    }

  return result;
}

/* Used by write_backup_file to remember which files have been
   written.  */
static struct hash_table *converted_files;

static void
write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
{
  /* Rather than just writing over the original .html file with the
     converted version, save the former to *.orig.  Note we only do
     this for files we've _successfully_ downloaded, so we don't
     clobber .orig files sitting around from previous invocations.
     On VMS, use "_orig" instead of ".orig".  See "wget.h".  */

  if (!converted_files)
    converted_files = make_string_hash_table (0);

  /* We can get called twice on the same URL thanks to the
     convert_all_links() call in main.  If we write the .orig file
     each time in such a case, it'll end up containing the first-pass
     conversion, not the original file.  So, see if we've already been
     called on this file.  */
  if (!string_set_contains (converted_files, file))
    {
      /* Construct the backup filename as the original name plus ".orig".  */
      char buf[1024];
      size_t filename_len = strlen (file);
      char *filename_plus_orig_suffix;

      if (filename_len < sizeof (buf) - 5)
        filename_plus_orig_suffix = buf;
      else
        filename_plus_orig_suffix = xmalloc (filename_len + 5 + 1);

      /* TODO: hack this to work with css files */
      if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
        {
          /* Just write "orig" over "html".  We need to do it this way
             because when we're checking to see if we've downloaded the
             file before (to see if we can skip downloading it), we don't
             know if it's a text/html file.  Therefore we don't know yet
             at that stage that -E is going to cause us to tack on
             ".html", so we need to compare vs. the original URL plus
             ".orig", not the original URL plus ".html.orig".  */
          memcpy (filename_plus_orig_suffix, file, filename_len - 4);
          memcpy (filename_plus_orig_suffix + filename_len - 4, "orig", 5);
        }
      else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
        {
          /* Append ".orig" to the name.  */
          memcpy (filename_plus_orig_suffix, file, filename_len);
          strcpy (filename_plus_orig_suffix + filename_len, ORIG_SFX);
        }

      /* Rename <file> to <file>.orig before former gets written over.  */
      if (rename (file, filename_plus_orig_suffix) != 0)
        logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
                   file, filename_plus_orig_suffix, strerror (errno));

      if (filename_plus_orig_suffix != buf)
        xfree (filename_plus_orig_suffix);

      /* Remember that we've already written a .orig backup for this file.
         Note that we never free this memory since we need it till the
         convert_all_links() call, which is one of the last things the
         program does before terminating.  BTW, I'm not sure if it would be
         safe to just set 'converted_file_ptr->string' to 'file' below,
         rather than making a copy of the string...  Another note is that I
         thought I could just add a field to the urlpos structure saying
         that we'd written a .orig file for this URL, but that didn't work,
         so I had to make this separate list.
         -- Dan Harkless <wget@harkless.org>

         This [adding a field to the urlpos structure] didn't work
         because convert_file() is called from convert_all_links at
         the end of the retrieval with a freshly built new urlpos
         list.
         -- Hrvoje Niksic <hniksic@xemacs.org>
      */
      string_set_add (converted_files, file);
    }
}

static bool find_fragment (const char *, int, const char **, const char **);

/* Replace a string with NEW_TEXT.  Ignore quoting.  */
static const char *
replace_plain (const char *p, int size, FILE *fp, const char *new_text)
{
  fputs (new_text, fp);
  p += size;
  return p;
}

/* Replace an attribute's original text with NEW_TEXT.  */

static const char *
replace_attr (const char *p, int size, FILE *fp, const char *new_text)
{
  bool quote_flag = false;
  char quote_char = '\"';       /* use "..." for quoting, unless the
                                   original value is quoted, in which
                                   case reuse its quoting char.  */
  const char *frag_beg, *frag_end;

  /* Structure of our string is:
       "...old-contents..."
       <---    size    --->  (with quotes)
     OR:
       ...old-contents...
       <---   size   -->  (no quotes)   */

  if (*p == '\"' || *p == '\'')
    {
      quote_char = *p;
      quote_flag = true;
      ++p;
      size -= 2;                /* disregard opening and closing quote */
    }
  putc (quote_char, fp);
  fputs (new_text, fp);

  /* Look for fragment identifier, if any.  */
  if (find_fragment (p, size, &frag_beg, &frag_end))
    fwrite (frag_beg, 1, frag_end - frag_beg, fp);
  p += size;
  if (quote_flag)
    ++p;
  putc (quote_char, fp);

  return p;
}
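
/* Illustrative example (not compiled): given the attribute value
   '"http://host/page.html#sec2"' at P and NEW_TEXT "page.html",
   replace_attr emits "page.html#sec2" in the original quoting --
   the fragment found by find_fragment is carried over to the
   replacement.  */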

/* The same as REPLACE_ATTR, but used when replacing
   <meta http-equiv=refresh content="new_text"> because we need to
   prepend "timeout_value; URL=" to the new_text.  */

static const char *
replace_attr_refresh_hack (const char *p, int size, FILE *fp,
                           const char *new_text, int timeout)
{
  /* "0; URL=..." */
  char new_with_timeout[1024];

  if (((unsigned) snprintf (
           new_with_timeout, sizeof (new_with_timeout),
           "%d; URL=%s", timeout, new_text)) >= sizeof (new_with_timeout))
    {
      /* very unlikely fallback using heap memory */
      char *tmp = aprintf ("%d; URL=%s", timeout, new_text);
      const char *res = replace_attr (p, size, fp, tmp);
      xfree (tmp);
      return res;
    }

  return replace_attr (p, size, fp, new_with_timeout);
}
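
/* Illustrative example (not compiled): for a refresh link with a
   5-second timeout being converted to the local "index.html", the
   call

     replace_attr_refresh_hack (p, size, fp, "index.html", 5);

   writes the attribute value "5; URL=index.html".  */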

/* Find the first occurrence of '#' in [BEG, BEG+SIZE) that is not
   preceded by '&'.  If the character is not found, return false.  If
   the character is found, return true and set BP and EP to point to
   the beginning and end of the region.

   This is used for finding the fragment identifiers in URLs.  */

static bool
find_fragment (const char *beg, int size, const char **bp, const char **ep)
{
  const char *end = beg + size;
  bool saw_amp = false;
  for (; beg < end; beg++)
    {
      switch (*beg)
        {
        case '&':
          saw_amp = true;
          break;
        case '#':
          if (!saw_amp)
            {
              *bp = beg;
              *ep = end;
              return true;
            }
          /* fallthrough */
        default:
          saw_amp = false;
        }
    }
  return false;
}
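
/* Illustrative example (not compiled): in "page.html#sec2" the '#'
   starts a fragment, so

     find_fragment (s, strlen (s), &bp, &ep);

   returns true with BP at the '#'.  The '&' check exists so that a
   '#' inside an entity such as "&#32;" is not mistaken for a
   fragment separator.  */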

/* Quote FILE for use as local reference to an HTML file.

   We quote ? as %3F to avoid passing part of the file name as the
   parameter when browsing the converted file through HTTP.  However,
   it is safe to do this only when `--adjust-extension' is turned on.
   This is because converting "index.html?foo=bar" to
   "index.html%3Ffoo=bar" would break local browsing, as the latter
   isn't even recognized as an HTML file!  However, converting
   "index.html?foo=bar.html" to "index.html%3Ffoo=bar.html" should be
   safe for both local and HTTP-served browsing.

   We always quote "#" as "%23", "%" as "%25" and ";" as "%3B"
   because those characters have special meanings in URLs.  */

static char *
local_quote_string (const char *file, bool no_html_quote)
{
  const char *from;
  char *newname, *to, *res;
  char buf[1024];
  size_t tolen;

  char *any = strpbrk (file, "?#%;");
  if (!any)
    return no_html_quote ? strdup (file) : html_quote_string (file);

  /* Allocate space assuming the worst-case scenario, each character
     having to be quoted.  */
  tolen = 3 * strlen (file);
  if (tolen < sizeof (buf))
    to = newname = buf;
  else
    to = newname = xmalloc (tolen + 1);

  for (from = file; *from; from++)
    switch (*from)
      {
      case '%':
        *to++ = '%';
        *to++ = '2';
        *to++ = '5';
        break;
      case '#':
        *to++ = '%';
        *to++ = '2';
        *to++ = '3';
        break;
      case ';':
        *to++ = '%';
        *to++ = '3';
        *to++ = 'B';
        break;
      case '?':
        if (opt.adjust_extension)
          {
            *to++ = '%';
            *to++ = '3';
            *to++ = 'F';
            break;
          }
        /* fallthrough */
      default:
        *to++ = *from;
      }
  *to = '\0';

  if (newname == buf)
    return no_html_quote ? strdup (newname) : html_quote_string (newname);

  if (no_html_quote)
    return newname;

  res = html_quote_string (newname);
  xfree (newname);
  return res;
}
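
/* Illustrative example (not compiled): with --adjust-extension in
   effect,

     local_quote_string ("index.html?a=b#top", true);

   yields "index.html%3Fa=b%23top"; without --adjust-extension the
   '?' is left alone and only the '#' is quoted.  */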

/* Book-keeping code for dl_file_url_map, dl_url_file_map,
   downloaded_html_set, and downloaded_css_set.  Other code calls
   these functions to let us know that a file has been downloaded.  */

#define ENSURE_TABLES_EXIST do {                        \
  if (!dl_file_url_map)                                 \
    dl_file_url_map = make_string_hash_table (0);       \
  if (!dl_url_file_map)                                 \
    dl_url_file_map = make_string_hash_table (0);       \
} while (0)

/* Return true if S1 and S2 are the same, except for "/index.html".
   The cases in which it returns true are (substitute any substring
   for "foo"):

   m("foo/index.html", "foo/")  ==> 1
   m("foo/", "foo/index.html")  ==> 1
   m("foo", "foo/index.html")   ==> 1
   m("foo", "foo/")             ==> 1
   m("foo", "foo")              ==> 1  */

static bool
match_except_index (const char *s1, const char *s2)
{
  int i;
  const char *lng;

  /* Skip common substring.  */
  for (i = 0; *s1 && *s2 && *s1 == *s2; s1++, s2++, i++)
    ;
  if (i == 0)
    /* Strings differ at the very beginning -- bail out.  We need to
       check this explicitly to avoid `lng - 1' reading outside the
       array.  */
    return false;

  if (!*s1 && !*s2)
    /* Both strings hit EOF -- strings are equal.  */
    return true;
  else if (*s1 && *s2)
    /* Strings are randomly different, e.g. "/foo/bar" and "/foo/qux".  */
    return false;
  else if (*s1)
    /* S1 is the longer one.  */
    lng = s1;
  else
    /* S2 is the longer one.  */
    lng = s2;

  /* foo            */            /* foo/           */
  /* foo/index.html */  /* or */  /* foo/index.html */
  /*    ^           */            /*     ^          */

  if (*lng != '/')
    /* The right-hand case.  */
    --lng;

  if (*lng == '/' && *(lng + 1) == '\0')
    /* foo  */
    /* foo/ */
    return true;

  return 0 == strcmp (lng, "/index.html");
}

static int
dissociate_urls_from_file_mapper (void *key, void *value, void *arg)
{
  char *mapping_url = (char *)key;
  char *mapping_file = (char *)value;
  char *file = (char *)arg;

  if (0 == strcmp (mapping_file, file))
    {
      hash_table_remove (dl_url_file_map, mapping_url);
      xfree (mapping_url);
      xfree (mapping_file);
    }

  /* Continue mapping.  */
  return 0;
}

/* Remove all associations from various URLs to FILE from dl_url_file_map.  */

static void
dissociate_urls_from_file (const char *file)
{
  /* Can't use hash_table_iter_* because the table mutates while mapping.  */
  hash_table_for_each (dl_url_file_map, dissociate_urls_from_file_mapper,
                       (char *) file);
}

/* Register that URL has been successfully downloaded to FILE.  This
   is used by the link conversion code to convert references to URLs
   to references to local files.  It is also used to check if a URL
   has already been downloaded.  */

void
register_download (const char *url, const char *file)
{
  char *old_file, *old_url;

  ENSURE_TABLES_EXIST;

  /* With some forms of retrieval, two different URLs can end up
     downloaded to the same file, although that is neither likely nor
     particularly desirable.  If both are downloaded, the second
     download overwrites the first one.  When that happens, dissociate
     the old URL from the file name.  */

  if (hash_table_get_pair (dl_file_url_map, file, &old_file, &old_url))
    {
      if (0 == strcmp (url, old_url))
        /* We have somehow managed to download the same URL twice.
           Nothing to do.  */
        return;

      if (match_except_index (url, old_url)
          && !hash_table_contains (dl_url_file_map, url))
        /* The two URLs differ only in the "index.html" ending.  For
           example, one is "http://www.server.com/", and the other is
           "http://www.server.com/index.html".  Don't remove the old
           one, just add the new one as a non-canonical entry.  */
        goto url_only;

      hash_table_remove (dl_file_url_map, file);
      xfree (old_file);
      xfree (old_url);

      /* Remove all the URLs that point to this file.  Yes, there can
         be more than one such URL, because we store redirections as
         multiple entries in dl_url_file_map.  For example, if URL1
         redirects to URL2 which gets downloaded to FILE, we map both
         URL1 and URL2 to FILE in dl_url_file_map.  (dl_file_url_map
         only points to URL2.)  When another URL gets loaded to FILE,
         we want both URL1 and URL2 dissociated from it.

         This is a relatively expensive operation because it performs
         a linear search of the whole hash table, but it should be
         called very rarely, only when two URLs resolve to the same
         file name, *and* the "<file>.1" extensions are turned off.
         In other words, almost never.  */
      dissociate_urls_from_file (file);
    }

  hash_table_put (dl_file_url_map, xstrdup (file), xstrdup (url));

 url_only:
  /* A URL->FILE mapping is not possible without a FILE->URL mapping.
     If the latter were present, it should have been removed by the
     above `if'.  So we could write:

         assert (!hash_table_contains (dl_url_file_map, url));

     The above is correct when running in recursive mode where the
     same URL always resolves to the same file.  But if you do
     something like:

         wget URL URL

     then the first URL will resolve to "FILE", and the other to
     "FILE.1".  In that case, FILE.1 will not be found in
     dl_file_url_map, but URL will still point to FILE in
     dl_url_file_map.  */
  if (hash_table_get_pair (dl_url_file_map, url, &old_url, &old_file))
    {
      hash_table_remove (dl_url_file_map, url);
      xfree (old_url);
      xfree (old_file);
    }

  hash_table_put (dl_url_file_map, xstrdup (url), xstrdup (file));
}

/* Register that FROM has been redirected to TO.  This assumes that TO
   is successfully downloaded and already registered using
   register_download() above.  */

void
register_redirection (const char *from, const char *to)
{
  char *file;

  ENSURE_TABLES_EXIST;

  file = hash_table_get (dl_url_file_map, to);
  assert (file != NULL);
  if (!hash_table_contains (dl_url_file_map, from))
    hash_table_put (dl_url_file_map, xstrdup (from), xstrdup (file));
}
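
/* Illustrative example (not compiled): after

     register_download ("http://host/b.html", "b.html");
     register_redirection ("http://host/a.html", "http://host/b.html");

   dl_url_file_map maps both URLs to "b.html", while dl_file_url_map
   maps "b.html" back to "http://host/b.html" only -- which is why
   dissociate_urls_from_file above must scan the whole table.  */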

/* Register that the file has been deleted.  */

void
register_delete_file (const char *file)
{
  char *old_url, *old_file;

  ENSURE_TABLES_EXIST;

  if (!hash_table_get_pair (dl_file_url_map, file, &old_file, &old_url))
    return;

  hash_table_remove (dl_file_url_map, file);
  xfree (old_file);
  xfree (old_url);
  dissociate_urls_from_file (file);
}

/* Register that FILE is an HTML file that has been downloaded.  */

void
register_html (const char *file)
{
  if (!downloaded_html_set)
    downloaded_html_set = make_string_hash_table (0);
  string_set_add (downloaded_html_set, file);
}

/* Register that FILE is a CSS file that has been downloaded.  */

void
register_css (const char *file)
{
  if (!downloaded_css_set)
    downloaded_css_set = make_string_hash_table (0);
  string_set_add (downloaded_css_set, file);
}

/* Cleanup the data structures associated with this file.  */

#if defined DEBUG_MALLOC || defined TESTING
static void downloaded_files_free (void);

void
convert_cleanup (void)
{
  if (dl_file_url_map)
    {
      free_keys_and_values (dl_file_url_map);
      hash_table_destroy (dl_file_url_map);
      dl_file_url_map = NULL;
    }
  if (dl_url_file_map)
    {
      free_keys_and_values (dl_url_file_map);
      hash_table_destroy (dl_url_file_map);
      dl_url_file_map = NULL;
    }
  if (downloaded_html_set)
    string_set_free (downloaded_html_set);
  if (downloaded_css_set)
    string_set_free (downloaded_css_set);
  downloaded_files_free ();
  if (converted_files)
    string_set_free (converted_files);
}
#endif

/* Book-keeping code for downloaded files that enables extension
   hacks.  */

/* This table should really be merged with dl_file_url_map and
   downloaded_html_set.  This was originally a list, but I changed
   it to a hash table because it was actually taking a lot of time to
   find things in it.  */

static struct hash_table *downloaded_files_hash;

/* We're storing "modes" of type downloaded_file_t in the hash table.
   However, our hash tables only accept pointers for keys and values.
   So when we need a pointer, we use the address of a
   downloaded_file_t variable of static storage.  */

static downloaded_file_t *
downloaded_mode_to_ptr (downloaded_file_t mode)
{
  static downloaded_file_t
    v1 = FILE_NOT_ALREADY_DOWNLOADED,
    v2 = FILE_DOWNLOADED_NORMALLY,
    v3 = FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED,
    v4 = CHECK_FOR_FILE;

  switch (mode)
    {
    case FILE_NOT_ALREADY_DOWNLOADED:
      return &v1;
    case FILE_DOWNLOADED_NORMALLY:
      return &v2;
    case FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED:
      return &v3;
    case CHECK_FOR_FILE:
      return &v4;
    }
  return NULL;
}
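
/* Illustrative note (not compiled): the static addresses are stable
   and unique per mode, so a value stored with

     hash_table_put (downloaded_files_hash, xstrdup (file),
                     downloaded_mode_to_ptr (mode));

   can later be dereferenced to recover the original mode.  */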

/* Remembers which files have been downloaded.  In the standard case,
   should be called with mode == FILE_DOWNLOADED_NORMALLY for each
   file we actually download successfully (i.e. not for ones we have
   failures on or that we skip due to -N).

   When we've downloaded a file and tacked on a ".html" extension due
   to -E, call this function with
   FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
   FILE_DOWNLOADED_NORMALLY.

   If you just want to check if a file has been previously added
   without adding it, call with mode == CHECK_FOR_FILE.  Please be
   sure to call this function with local filenames, not remote
   URLs.  */

downloaded_file_t
downloaded_file (downloaded_file_t mode, const char *file)
{
  downloaded_file_t *ptr;

  if (mode == CHECK_FOR_FILE)
    {
      if (!downloaded_files_hash)
        return FILE_NOT_ALREADY_DOWNLOADED;
      ptr = hash_table_get (downloaded_files_hash, file);
      if (!ptr)
        return FILE_NOT_ALREADY_DOWNLOADED;
      return *ptr;
    }

  if (!downloaded_files_hash)
    downloaded_files_hash = make_string_hash_table (0);

  ptr = hash_table_get (downloaded_files_hash, file);
  if (ptr)
    return *ptr;

  ptr = downloaded_mode_to_ptr (mode);
  hash_table_put (downloaded_files_hash, xstrdup (file), ptr);

  return FILE_NOT_ALREADY_DOWNLOADED;
}

#if defined DEBUG_MALLOC || defined TESTING
static void
downloaded_files_free (void)
{
  if (downloaded_files_hash)
    {
      hash_table_iterator iter;
      for (hash_table_iterate (downloaded_files_hash, &iter);
           hash_table_iter_next (&iter);
           )
        xfree (iter.key);
      hash_table_destroy (downloaded_files_hash);
      downloaded_files_hash = NULL;
    }
}
#endif

/* The function returns the pointer to the malloc-ed quoted version of
   string s.  It will recognize and quote numeric and special graphic
   entities, as per RFC1866:

   `&' -> `&amp;'
   `<' -> `&lt;'
   `>' -> `&gt;'
   `"' -> `&quot;'
   SP -> `&#32;'

   No other entities are recognized or replaced.  */
char *
html_quote_string (const char *s)
{
  const char *b = s;
  char *p, *res;
  int i;

  /* Pass through the string, and count the new size.  */
  for (i = 0; *s; s++, i++)
    {
      if (*s == '&')
        i += 4;                 /* `amp;' */
      else if (*s == '<' || *s == '>')
        i += 3;                 /* `lt;' and `gt;' */
      else if (*s == '\"')
        i += 5;                 /* `quot;' */
      else if (*s == ' ')
        i += 4;                 /* #32; */
    }
  res = xmalloc (i + 1);
  s = b;
  for (p = res; *s; s++)
    {
      switch (*s)
        {
        case '&':
          *p++ = '&';
          *p++ = 'a';
          *p++ = 'm';
          *p++ = 'p';
          *p++ = ';';
          break;
        case '<': case '>':
          *p++ = '&';
          *p++ = (*s == '<' ? 'l' : 'g');
          *p++ = 't';
          *p++ = ';';
          break;
        case '\"':
          *p++ = '&';
          *p++ = 'q';
          *p++ = 'u';
          *p++ = 'o';
          *p++ = 't';
          *p++ = ';';
          break;
        case ' ':
          *p++ = '&';
          *p++ = '#';
          *p++ = '3';
          *p++ = '2';
          *p++ = ';';
          break;
        default:
          *p++ = *s;
        }
    }
  *p = '\0';
  return res;
}
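
/* Illustrative example (not compiled):

     char *q = html_quote_string ("a & b");

   yields "a&#32;&amp;&#32;b"; the caller owns and must xfree the
   result.  */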

/*
 * vim: et ts=2 sw=2
 */