1 /* Conversion of links to local files.
2    Copyright (C) 2003-2011, 2014-2015, 2018-2021 Free Software
3    Foundation, Inc.
4 
5 This file is part of GNU Wget.
6 
7 GNU Wget is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3 of the License, or
10  (at your option) any later version.
11 
12 GNU Wget is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 GNU General Public License for more details.
16 
17 You should have received a copy of the GNU General Public License
18 along with Wget.  If not, see <http://www.gnu.org/licenses/>.
19 
20 Additional permission under GNU GPL version 3 section 7
21 
22 If you modify this program, or any covered work, by linking or
23 combining it with the OpenSSL project's OpenSSL library (or a
24 modified version of that library), containing parts covered by the
25 terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
26 grants you additional permission to convey the resulting work.
27 Corresponding Source for a non-source form of such a combination
28 shall include the source code for the parts of OpenSSL used as well
29 as that of the covered work.  */
30 
31 #include "wget.h"
32 
33 #include <stdio.h>
34 #include <stdlib.h>
35 #include <string.h>
36 #include <unistd.h>
37 #include <errno.h>
38 #include <assert.h>
39 #include "convert.h"
40 #include "url.h"
41 #include "recur.h"
42 #include "utils.h"
43 #include "hash.h"
44 #include "ptimer.h"
45 #include "res.h"
46 #include "html-url.h"
47 #include "css-url.h"
48 #include "iri.h"
49 #include "xstrndup.h"
50 
51 static struct hash_table *dl_file_url_map;
52 struct hash_table *dl_url_file_map;
53 
54 /* Set of HTML/CSS files downloaded in this Wget run, used for link
55    conversion after Wget is done.  */
56 struct hash_table *downloaded_html_set;
57 struct hash_table *downloaded_css_set;
58 
59 static void convert_links (const char *, struct urlpos *);
60 
61 
62 static void
convert_links_in_hashtable(struct hash_table * downloaded_set,int is_css,int * file_count)63 convert_links_in_hashtable (struct hash_table *downloaded_set,
64                             int is_css,
65                             int *file_count)
66 {
67   int i, cnt = 0;
68   char *arr[1024], **file_array;
69 
70   if (!downloaded_set || (cnt = hash_table_count (downloaded_set)) == 0)
71     return;
72 
73   if (cnt <= (int) countof (arr))
74     file_array = arr;
75   else
76     file_array = xmalloc (cnt * sizeof (arr[0]));
77 
78   string_set_to_array (downloaded_set, file_array);
79 
80   for (i = 0; i < cnt; i++)
81     {
82       struct urlpos *urls, *cur_url;
83       char *url;
84       char *file = file_array[i];
85 
86       /* Determine the URL of the file.  get_urls_{html,css} will need
87          it.  */
88       url = hash_table_get (dl_file_url_map, file);
89       if (!url)
90         {
91           DEBUGP (("Apparently %s has been removed.\n", file));
92           continue;
93         }
94 
95       DEBUGP (("Scanning %s (from %s)\n", file, url));
96 
97       /* Parse the file...  */
98       urls = is_css ? get_urls_css_file (file, url) :
99                       get_urls_html (file, url, NULL, NULL);
100 
101       /* We don't respect meta_disallow_follow here because, even if
102          the file is not followed, we might still want to convert the
103          links that have been followed from other files.  */
104 
105       for (cur_url = urls; cur_url; cur_url = cur_url->next)
106         {
107           char *local_name;
108           struct url *u;
109           struct iri *pi;
110 
111           if (cur_url->link_base_p)
112             {
113               /* Base references have been resolved by our parser, so
114                  we turn the base URL into an empty string.  (Perhaps
115                  we should remove the tag entirely?)  */
116               cur_url->convert = CO_NULLIFY_BASE;
117               continue;
118             }
119 
120           /* We decide the direction of conversion according to whether
121              a URL was downloaded.  Downloaded URLs will be converted
122              ABS2REL, whereas non-downloaded will be converted REL2ABS.  */
123 
124           pi = iri_new ();
125           set_uri_encoding (pi, opt.locale, true);
126 
127           u = url_parse (cur_url->url->url, NULL, pi, true);
128           if (!u)
129               continue;
130 
131           local_name = hash_table_get (dl_url_file_map, u->url);
132 
133           /* Decide on the conversion type.  */
134           if (local_name)
135             {
136               /* We've downloaded this URL.  Convert it to relative
137                  form.  We do this even if the URL already is in
138                  relative form, because our directory structure may
139                  not be identical to that on the server (think `-nd',
140                  `--cut-dirs', etc.). If --convert-file-only was passed,
141                  we only convert the basename portion of the URL.  */
142               cur_url->convert = (opt.convert_file_only ? CO_CONVERT_BASENAME_ONLY : CO_CONVERT_TO_RELATIVE);
143               cur_url->local_name = xstrdup (local_name);
144               DEBUGP (("will convert url %s to local %s\n", u->url, local_name));
145             }
146           else
147             {
148               /* We haven't downloaded this URL.  If it's not already
149                  complete (including a full host name), convert it to
150                  that form, so it can be reached while browsing this
151                  HTML locally.  */
152               if (!cur_url->link_complete_p)
153                 cur_url->convert = CO_CONVERT_TO_COMPLETE;
154               cur_url->local_name = NULL;
155               DEBUGP (("will convert url %s to complete\n", u->url));
156             }
157 
158           url_free (u);
159           iri_free (pi);
160         }
161 
162       /* Convert the links in the file.  */
163       convert_links (file, urls);
164       ++*file_count;
165 
166       /* Free the data.  */
167       free_urlpos (urls);
168     }
169 
170   if (file_array != arr)
171     xfree (file_array);
172 }
173 
174 /* This function is called when the retrieval is done to convert the
175    links that have been downloaded.  It has to be called at the end of
176    the retrieval, because only then does Wget know conclusively which
177    URLs have been downloaded, and which not, so it can tell which
178    direction to convert to.
179 
180    The "direction" means that the URLs to the files that have been
181    downloaded get converted to the relative URL which will point to
182    that file.  And the other URLs get converted to the remote URL on
183    the server.
184 
185    All the downloaded HTMLs are kept in downloaded_html_files, and
186    downloaded URLs in urls_downloaded.  All the information is
187    extracted from these two lists.  */
188 
189 void
convert_all_links(void)190 convert_all_links (void)
191 {
192   double secs;
193   int file_count = 0;
194 
195   struct ptimer *timer = ptimer_new ();
196 
197   convert_links_in_hashtable (downloaded_html_set, 0, &file_count);
198   convert_links_in_hashtable (downloaded_css_set, 1, &file_count);
199 
200   secs = ptimer_measure (timer);
201   logprintf (LOG_VERBOSE, _("Converted links in %d files in %s seconds.\n"),
202              file_count, print_decimal (secs));
203 
204   ptimer_destroy (timer);
205 }
206 
207 static void write_backup_file (const char *, downloaded_file_t);
208 static const char *replace_plain (const char*, int, FILE*, const char *);
209 static const char *replace_attr (const char *, int, FILE *, const char *);
210 static const char *replace_attr_refresh_hack (const char *, int, FILE *,
211                                               const char *, int);
212 static char *local_quote_string (const char *, bool);
213 static char *construct_relative (const char *, const char *);
214 static char *convert_basename (const char *, const struct urlpos *);
215 
/* Change the links in one file.  LINKS is a list of links in the
   document, along with their positions and the desired direction of
   the conversion.

   FILE is rewritten in place: its contents are slurped (possibly
   mmap'ed) into memory, the on-disk file is unlinked and reopened
   for writing, and the bytes are echoed back out with each
   convertible link replaced on the way.  */
static void
convert_links (const char *file, struct urlpos *links)
{
  struct file_memory *fm;
  FILE *fp;
  const char *p;                /* read cursor into FM's contents */
  downloaded_file_t downloaded_file_return;

  struct urlpos *link;
  int to_url_count = 0, to_file_count = 0;  /* conversion statistics */

  logprintf (LOG_VERBOSE, _("Converting links in %s... "), file);

  {
    /* First we do a "dry run": go through the list L and see whether
       any URL needs to be converted in the first place.  If not, just
       leave the file alone.  */
    int dry_count = 0;
    struct urlpos *dry;
    for (dry = links; dry; dry = dry->next)
      if (dry->convert != CO_NOCONVERT)
        ++dry_count;
    if (!dry_count)
      {
        logputs (LOG_VERBOSE, _("nothing to do.\n"));
        return;
      }
    logprintf (LOG_VERBOSE, _("%d.\n"), dry_count);
  }

  fm = wget_read_file (file);
  if (!fm)
    {
      logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
                 file, strerror (errno));
      return;
    }

  downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file);
  if (opt.backup_converted && downloaded_file_return)
    /* Save a pristine copy (FILE.orig) before we clobber FILE.  */
    write_backup_file (file, downloaded_file_return);

  /* Before opening the file for writing, unlink the file.  This is
     important if the data in FM is mapped.  In such case, nulling the
     file, which is what fopen() below does, would make us read all
     zeroes from the mapped region.  */
  if (unlink (file) < 0 && errno != ENOENT)
    {
      logprintf (LOG_NOTQUIET, _("Unable to delete %s: %s\n"),
                 quote (file), strerror (errno));
      wget_read_file_free (fm);
      return;
    }
  /* Now open the file for writing.  */
  fp = fopen (file, "wb");
  if (!fp)
    {
      logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
                 file, strerror (errno));
      wget_read_file_free (fm);
      return;
    }

  /* Here we loop through all the URLs in file, replacing those of
     them that are downloaded with relative references.

     NOTE(review): the copy-through below assumes LINKS is ordered by
     ascending pos; an out-of-order entry would put url_start behind
     P.  Confirm the HTML/CSS parsers emit positions in order.  */
  p = fm->content;
  for (link = links; link; link = link->next)
    {
      /* NOTE(review): url_start is formed before the bounds check
         below; if pos could exceed length, computing the pointer is
         formally UB -- consider reordering.  */
      char *url_start = fm->content + link->pos;

      if (link->pos >= fm->length)
        {
          DEBUGP (("Something strange is going on.  Please investigate."));
          break;
        }
      /* If the URL is not to be converted, skip it.  */
      if (link->convert == CO_NOCONVERT)
        {
          DEBUGP (("Skipping %s at position %d.\n", link->url->url, link->pos));
          continue;
        }

      /* Echo the file contents, up to the offending URL's opening
         quote, to the outfile.  */
      fwrite (p, 1, url_start - p, fp);
      p = url_start;

      switch (link->convert)
        {
        case CO_CONVERT_TO_RELATIVE:
          /* Convert absolute URL to relative. */
          if (link->local_name) {
            char *newname = construct_relative (file, link->local_name);
            char *quoted_newname = local_quote_string (newname,
                                                       link->link_css_p);

            /* CSS and unquoted-HTML contexts take the text verbatim;
               <meta refresh> needs the "N; URL=..." form; everything
               else is a normal quoted attribute value.  */
            if (link->link_css_p || link->link_noquote_html_p)
              p = replace_plain (p, link->size, fp, quoted_newname);
            else if (!link->link_refresh_p)
              p = replace_attr (p, link->size, fp, quoted_newname);
            else
              p = replace_attr_refresh_hack (p, link->size, fp, quoted_newname,
                                             link->refresh_timeout);

            DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
                     link->url->url, newname, link->pos, file));

            xfree (newname);
            xfree (quoted_newname);
            ++to_file_count;
          }
          break;
        case CO_CONVERT_BASENAME_ONLY:
          {
            /* --convert-file-only: rewrite only the basename part.  */
            char *newname = convert_basename (p, link);
            char *quoted_newname = local_quote_string (newname, link->link_css_p);

            if (link->link_css_p || link->link_noquote_html_p)
              p = replace_plain (p, link->size, fp, quoted_newname);
            else if (!link->link_refresh_p)
              p = replace_attr (p, link->size, fp, quoted_newname);
            else
              p = replace_attr_refresh_hack (p, link->size, fp, quoted_newname,
                                             link->refresh_timeout);

            DEBUGP (("Converted file part only: %s to %s at position %d in %s.\n",
                     link->url->url, newname, link->pos, file));

            xfree (newname);
            xfree (quoted_newname);
            ++to_file_count;

            break;
          }
        case CO_CONVERT_TO_COMPLETE:
          /* Convert the link to absolute URL. */
          {
            char *newlink = link->url->url;
            char *quoted_newlink = html_quote_string (newlink);

            if (link->link_css_p || link->link_noquote_html_p)
              p = replace_plain (p, link->size, fp, newlink);
            else if (!link->link_refresh_p)
              p = replace_attr (p, link->size, fp, quoted_newlink);
            else
              p = replace_attr_refresh_hack (p, link->size, fp, quoted_newlink,
                                             link->refresh_timeout);

            DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
                     newlink, link->pos, file));

            xfree (quoted_newlink);
            ++to_url_count;
            break;
          }
        case CO_NULLIFY_BASE:
          /* Change the base href to "". */
          p = replace_attr (p, link->size, fp, "");
          break;
        case CO_NOCONVERT:
          abort ();             /* filtered out by the check above */
          break;
        }
    }

  /* Output the rest of the file. */
  if (p - fm->content < fm->length)
    fwrite (p, 1, fm->length - (p - fm->content), fp);
  fclose (fp);
  wget_read_file_free (fm);

  /* Summary: "<to-local>-<to-remote>" counts.  */
  logprintf (LOG_VERBOSE, "%d-%d\n", to_file_count, to_url_count);
}
392 
393 /* Construct and return a link that points from BASEFILE to LINKFILE.
394    Both files should be local file names, BASEFILE of the referrering
395    file, and LINKFILE of the referred file.
396 
397    Examples:
398 
399    cr("foo", "bar")         -> "bar"
400    cr("A/foo", "A/bar")     -> "bar"
401    cr("A/foo", "A/B/bar")   -> "B/bar"
402    cr("A/X/foo", "A/Y/bar") -> "../Y/bar"
403    cr("X/", "Y/bar")        -> "../Y/bar" (trailing slash does matter in BASE)
404 
405    Both files should be absolute or relative, otherwise strange
406    results might ensue.  The function makes no special efforts to
407    handle "." and ".." in links, so make sure they're not there
408    (e.g. using path_simplify).  */
409 
static char *
construct_relative (const char *basefile, const char *linkfile)
{
  char *result;
  int updirs;                  /* directory levels to climb out of */
  const char *b, *l;
  int idx, prefix_end;

  /* Strip the longest common directory prefix, i.e. everything up to
     and including the last '/' shared by both names.  */
  prefix_end = 0;
  b = basefile;
  l = linkfile;
  while (*b != '\0' && *b == *l)
    {
      if (*b == '/')
        prefix_end = (int) (b - basefile) + 1;
      ++b;
      ++l;
    }
  basefile += prefix_end;
  linkfile += prefix_end;

  /* Each remaining directory component of BASEFILE needs one "../"
     in front of LINKFILE.  */
  updirs = 0;
  for (b = basefile; *b; b++)
    if (*b == '/')
      ++updirs;

  if (!updirs && (b = strpbrk (linkfile, "/:")) && *b == ':')
    {
      /* No "../" needed, but the target begins with "name:...",
         which a browser would parse as a URI scheme -- shield it
         behind "./".  */
      result = xmalloc (2 + strlen (linkfile) + 1);
      memcpy (result, "./", 2);
      strcpy (result + 2, linkfile);
    }
  else
    {
      /* "../" once per remaining BASE directory, then the link.  */
      result = xmalloc (3 * updirs + strlen (linkfile) + 1);
      for (idx = 0; idx < updirs; idx++)
        memcpy (result + 3 * idx, "../", 3);
      strcpy (result + 3 * idx, linkfile);
    }

  return result;
}
465 
466 /* Construct and return a "transparent proxy" URL
467    reflecting changes made by --adjust-extension to the file component
468    (i.e., "basename") of the original URL, but leaving the "dirname"
469    of the URL (protocol://hostname... portion) untouched.
470 
471    Think: populating a squid cache via a recursive wget scrape, where
472    changing URLs to work locally with "file://..." is NOT desirable.
473 
474    Example:
475 
476    if
477                      p = "//foo.com/bar.cgi?xyz"
478    and
479       link->local_name = "docroot/foo.com/bar.cgi?xyz.css"
480    then
481 
482       new_construct_func(p, link);
483    will return
484       "//foo.com/bar.cgi?xyz.css"
485 
486    Essentially, we do s/$(basename orig_url)/$(basename link->local_name)/
487 */
static char *
convert_basename (const char *p, const struct urlpos *link)
{
  int len = link->size;         /* byte length of the original URL text */
  char *url = NULL;
  char *org_basename = NULL, *local_basename;
  char *result = NULL;

  /* Strip surrounding quotes, if present.  NOTE(review): only the
     opening quote is checked; a matching closing quote within the
     SIZE bytes is assumed -- confirm against the parsers.  */
  if (*p == '"' || *p == '\'')
    {
      len -= 2;
      p++;
    }

  url = xstrndup (p, len);

  /* Basename of the original URL: text after its last '/'.  */
  org_basename = strrchr (url, '/');
  if (org_basename)
    org_basename++;
  else
    org_basename = url;

  /* Basename of the local file (--adjust-extension may have renamed
     it).  NOTE(review): when local_name is NULL this falls back to
     the full URL, so the comparison below is basename vs. whole URL;
     confirm that is the intended degenerate behavior.  */
  local_basename = link->local_name ? strrchr (link->local_name, '/') : NULL;
  if (local_basename)
    local_basename++;
  else
    local_basename = url;

  /*
   * If the basenames differ, graft the adjusted basename (local_basename)
   * onto the original URL.
   */
  if (strcmp (org_basename, local_basename) == 0)
    result = url;               /* unchanged: hand back the copy as-is */
  else
    {
      result = uri_merge (url, local_basename);
      xfree (url);
    }

  return result;                /* caller owns the returned string */
}
530 
531 /* Used by write_backup_file to remember which files have been
532    written. */
533 static struct hash_table *converted_files;
534 
/* Rename FILE to its ".orig" backup (or, for -E downloads, replace
   the trailing "html" with "orig") before the converted version is
   written over it.  Runs at most once per file; repeat calls are
   remembered in CONVERTED_FILES.  */
static void
write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
{
  /* Rather than just writing over the original .html file with the
     converted version, save the former to *.orig.  Note we only do
     this for files we've _successfully_ downloaded, so we don't
     clobber .orig files sitting around from previous invocations.
     On VMS, use "_orig" instead of ".orig".  See "wget.h". */

  if (!converted_files)
    converted_files = make_string_hash_table (0);

  /* We can get called twice on the same URL thanks to the
     convert_all_links() call in main.  If we write the .orig file
     each time in such a case, it'll end up containing the first-pass
     conversion, not the original file.  So, see if we've already been
     called on this file. */
  if (!string_set_contains (converted_files, file))
    {
      /* Construct the backup filename as the original name plus ".orig". */
      char buf[1024];
      size_t filename_len = strlen (file);
      char *filename_plus_orig_suffix;

      /* Stack buffer when it fits ("+ 5" covers the 5-byte suffix
         written below); heap otherwise.  */
      if (filename_len < sizeof (buf) - 5)
        filename_plus_orig_suffix = buf;
      else
        filename_plus_orig_suffix = xmalloc (filename_len + 5 + 1);

      /* TODO: hack this to work with css files */
      if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
        {
          /* Just write "orig" over "html".  We need to do it this way
             because when we're checking to see if we've downloaded the
             file before (to see if we can skip downloading it), we don't
             know if it's a text/html file.  Therefore we don't know yet
             at that stage that -E is going to cause us to tack on
             ".html", so we need to compare vs. the original URL plus
             ".orig", not the original URL plus ".html.orig".

             NOTE(review): this assumes FILE ends in the 4 bytes
             "html" -- implied by the -E return value; confirm before
             extending to .css (see TODO above).  */
          memcpy (filename_plus_orig_suffix, file, filename_len - 4);
          memcpy (filename_plus_orig_suffix + filename_len - 4, "orig", 5);
        }
      else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
        {
          /* Append ".orig" to the name. */
          memcpy (filename_plus_orig_suffix, file, filename_len);
          strcpy (filename_plus_orig_suffix + filename_len, ORIG_SFX);
        }

      /* Rename <file> to <file>.orig before former gets written over. */
      if (rename (file, filename_plus_orig_suffix) != 0)
        logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
                   file, filename_plus_orig_suffix, strerror (errno));

      if (filename_plus_orig_suffix != buf)
        xfree (filename_plus_orig_suffix);

      /* Remember that we've already written a .orig backup for this file.
         Note that we never free this memory since we need it till the
         convert_all_links() call, which is one of the last things the
         program does before terminating.  BTW, I'm not sure if it would be
         safe to just set 'converted_file_ptr->string' to 'file' below,
         rather than making a copy of the string...  Another note is that I
         thought I could just add a field to the urlpos structure saying
         that we'd written a .orig file for this URL, but that didn't work,
         so I had to make this separate list.
         -- Dan Harkless <wget@harkless.org>

         This [adding a field to the urlpos structure] didn't work
         because convert_file() is called from convert_all_links at
         the end of the retrieval with a freshly built new urlpos
         list.
         -- Hrvoje Niksic <hniksic@xemacs.org>
      */
      string_set_add (converted_files, file);
    }
}
612 
613 static bool find_fragment (const char *, int, const char **, const char **);
614 
/* Write NEW_TEXT to FP in place of the SIZE bytes at P, with no
   quote or fragment handling, and return the position just past the
   replaced region.  */
static const char *
replace_plain (const char *p, int size, FILE *fp, const char *new_text)
{
  fputs (new_text, fp);
  return p + size;
}
623 
/* Replace an attribute's original text with NEW_TEXT.  The output is
   always written quoted; the original value's quote character is
   reused when present, '"' otherwise.  Any fragment ("#...") found in
   the original value is preserved after NEW_TEXT.  Returns the
   position just past the replaced span.  */

static const char *
replace_attr (const char *p, int size, FILE *fp, const char *new_text)
{
  bool quote_flag = false;
  char quote_char = '\"';       /* use "..." for quoting, unless the
                                   original value is quoted, in which
                                   case reuse its quoting char. */
  const char *frag_beg, *frag_end;

  /* Structure of our string is:
       "...old-contents..."
       <---    size    --->  (with quotes)
     OR:
       ...old-contents...
       <---    size   -->    (no quotes)   */

  if (*p == '\"' || *p == '\'')
    {
      quote_char = *p;
      quote_flag = true;
      ++p;
      size -= 2;                /* disregard opening and closing quote */
    }
  putc (quote_char, fp);        /* opening quote */
  fputs (new_text, fp);

  /* Look for fragment identifier, if any, in the ORIGINAL value and
     carry it over verbatim after NEW_TEXT.  */
  if (find_fragment (p, size, &frag_beg, &frag_end))
    fwrite (frag_beg, 1, frag_end - frag_beg, fp);
  p += size;                    /* skip the old value... */
  if (quote_flag)
    ++p;                        /* ...and its closing quote */
  putc (quote_char, fp);        /* closing quote */

  return p;
}
662 
/* Like replace_attr(), but for <meta http-equiv=refresh content=...>
   values, whose replacement must carry the timeout: the text written
   is "TIMEOUT; URL=NEW_TEXT".  */

static const char *
replace_attr_refresh_hack (const char *p, int size, FILE *fp,
                           const char *new_text, int timeout)
{
  char stack_text[1024];
  int needed = snprintf (stack_text, sizeof (stack_text),
                         "%d; URL=%s", timeout, new_text);

  if (needed < 0 || (unsigned) needed >= sizeof (stack_text))
    {
      /* Truncated (or snprintf failed) -- very unlikely; redo the
         formatting on the heap.  */
      char *heap_text = aprintf ("%d; URL=%s", timeout, new_text);
      const char *res = replace_attr (p, size, fp, heap_text);
      xfree (heap_text);
      return res;
    }

  return replace_attr (p, size, fp, stack_text);
}
687 
/* Find the first occurrence of '#' in [BEG, BEG+SIZE) that is not
   preceded by '&'.  If no such character is found, return false.  If
   it is found, return true and set BP and EP to point to the
   beginning and end of the fragment region.

   This is used for finding the fragment identifiers in URLs.  */
694 
static bool
find_fragment (const char *beg, int size, const char **bp, const char **ep)
{
  const char *end = beg + size;
  bool prev_was_amp = false;    /* '#' right after '&' is an entity, not
                                   a fragment */

  while (beg < end)
    {
      char c = *beg;

      if (c == '#' && !prev_was_amp)
        {
          /* Report the span from the '#' to the end of the value.  */
          *bp = beg;
          *ep = end;
          return true;
        }

      prev_was_amp = (c == '&');
      ++beg;
    }

  return false;
}
721 
722 /* Quote FILE for use as local reference to an HTML file.
723 
724    We quote ? as %3F to avoid passing part of the file name as the
725    parameter when browsing the converted file through HTTP.  However,
726    it is safe to do this only when `--adjust-extension' is turned on.
727    This is because converting "index.html?foo=bar" to
728    "index.html%3Ffoo=bar" would break local browsing, as the latter
729    isn't even recognized as an HTML file!  However, converting
730    "index.html?foo=bar.html" to "index.html%3Ffoo=bar.html" should be
731    safe for both local and HTTP-served browsing.
732 
733    We always quote "#" as "%23", "%" as "%25" and ";" as "%3B"
734    because those characters have special meanings in URLs.  */
735 
736 static char *
local_quote_string(const char * file,bool no_html_quote)737 local_quote_string (const char *file, bool no_html_quote)
738 {
739   const char *from;
740   char *newname, *to, *res;
741   char buf[1024];
742   size_t tolen;
743 
744   char *any = strpbrk (file, "?#%;");
745   if (!any)
746     return no_html_quote ? strdup (file) : html_quote_string (file);
747 
748   /* Allocate space assuming the worst-case scenario, each character
749      having to be quoted.  */
750   tolen = 3 * strlen (file);
751   if (tolen < sizeof (buf))
752     to = newname = buf;
753   else
754     to = newname = xmalloc (tolen + 1);
755 
756   for (from = file; *from; from++)
757     switch (*from)
758       {
759       case '%':
760         *to++ = '%';
761         *to++ = '2';
762         *to++ = '5';
763         break;
764       case '#':
765         *to++ = '%';
766         *to++ = '2';
767         *to++ = '3';
768         break;
769       case ';':
770         *to++ = '%';
771         *to++ = '3';
772         *to++ = 'B';
773         break;
774       case '?':
775         if (opt.adjust_extension)
776           {
777             *to++ = '%';
778             *to++ = '3';
779             *to++ = 'F';
780             break;
781           }
782         /* fallthrough */
783       default:
784         *to++ = *from;
785       }
786   *to = '\0';
787 
788   if (newname == buf)
789     return no_html_quote ? strdup (newname) : html_quote_string (newname);
790 
791   if (no_html_quote)
792     return newname;
793 
794   res = html_quote_string (newname);
795   xfree (newname);
796   return res;
797 }
798 
799 /* Book-keeping code for dl_file_url_map, dl_url_file_map,
800    downloaded_html_list, and downloaded_html_set.  Other code calls
801    these functions to let us know that a file has been downloaded.  */
802 
803 #define ENSURE_TABLES_EXIST do {                        \
804   if (!dl_file_url_map)                                 \
805     dl_file_url_map = make_string_hash_table (0);       \
806   if (!dl_url_file_map)                                 \
807     dl_url_file_map = make_string_hash_table (0);       \
808 } while (0)
809 
/* Return true if S1 and S2 are the same, except for "/index.html".
   The cases in which it returns true are (substitute any substring
   for "foo"):

   m("foo/index.html", "foo/")  ==> 1
   m("foo/", "foo/index.html")  ==> 1
   m("foo", "foo/index.html")   ==> 1
   m("foo", "foo/")             ==> 1
   m("foo", "foo")              ==> 1  */
819 
static bool
match_except_index (const char *s1, const char *s2)
{
  const char *tail;
  int common = 0;

  /* Advance past the common prefix.  */
  while (*s1 && *s2 && *s1 == *s2)
    {
      ++s1;
      ++s2;
      ++common;
    }

  if (common == 0)
    /* Strings differ at the very beginning -- bail out.  We need to
       check this explicitly to avoid reading before the start of
       either array below.  */
    return false;

  if (!*s1 && !*s2)
    return true;                /* identical strings */

  if (*s1 && *s2)
    return false;               /* they diverge before either ends */

  /* Exactly one string has a remainder; TAIL is that remainder.  */
  tail = *s1 ? s1 : s2;

  /* foo            */            /* foo/           */
  /* foo/index.html */  /* or */  /* foo/index.html */
  /*    ^           */            /*     ^          */
  if (*tail != '/')
    --tail;                     /* step back onto the '/' consumed by
                                   the prefix loop (safe: common > 0) */

  if (tail[0] == '/' && tail[1] == '\0')
    /* foo  vs  foo/ */
    return true;

  return strcmp (tail, "/index.html") == 0;
}
863 
864 static int
dissociate_urls_from_file_mapper(void * key,void * value,void * arg)865 dissociate_urls_from_file_mapper (void *key, void *value, void *arg)
866 {
867   char *mapping_url = (char *)key;
868   char *mapping_file = (char *)value;
869   char *file = (char *)arg;
870 
871   if (0 == strcmp (mapping_file, file))
872     {
873       hash_table_remove (dl_url_file_map, mapping_url);
874       xfree (mapping_url);
875       xfree (mapping_file);
876     }
877 
878   /* Continue mapping. */
879   return 0;
880 }
881 
882 /* Remove all associations from various URLs to FILE from dl_url_file_map. */
883 
884 static void
dissociate_urls_from_file(const char * file)885 dissociate_urls_from_file (const char *file)
886 {
887   /* Can't use hash_table_iter_* because the table mutates while mapping.  */
888   hash_table_for_each (dl_url_file_map, dissociate_urls_from_file_mapper,
889                        (char *) file);
890 }
891 
892 /* Register that URL has been successfully downloaded to FILE.  This
893    is used by the link conversion code to convert references to URLs
894    to references to local files.  It is also being used to check if a
895    URL has already been downloaded.  */
896 
897 void
register_download(const char * url,const char * file)898 register_download (const char *url, const char *file)
899 {
900   char *old_file, *old_url;
901 
902   ENSURE_TABLES_EXIST;
903 
904   /* With some forms of retrieval, it is possible, although not likely
905      or particularly desirable.  If both are downloaded, the second
906      download will override the first one.  When that happens,
907      dissociate the old file name from the URL.  */
908 
909   if (hash_table_get_pair (dl_file_url_map, file, &old_file, &old_url))
910     {
911       if (0 == strcmp (url, old_url))
912         /* We have somehow managed to download the same URL twice.
913            Nothing to do.  */
914         return;
915 
916       if (match_except_index (url, old_url)
917           && !hash_table_contains (dl_url_file_map, url))
918         /* The two URLs differ only in the "index.html" ending.  For
919            example, one is "http://www.server.com/", and the other is
920            "http://www.server.com/index.html".  Don't remove the old
921            one, just add the new one as a non-canonical entry.  */
922         goto url_only;
923 
924       hash_table_remove (dl_file_url_map, file);
925       xfree (old_file);
926       xfree (old_url);
927 
928       /* Remove all the URLs that point to this file.  Yes, there can
929          be more than one such URL, because we store redirections as
930          multiple entries in dl_url_file_map.  For example, if URL1
931          redirects to URL2 which gets downloaded to FILE, we map both
932          URL1 and URL2 to FILE in dl_url_file_map.  (dl_file_url_map
933          only points to URL2.)  When another URL gets loaded to FILE,
934          we want both URL1 and URL2 dissociated from it.
935 
936          This is a relatively expensive operation because it performs
937          a linear search of the whole hash table, but it should be
938          called very rarely, only when two URLs resolve to the same
939          file name, *and* the "<file>.1" extensions are turned off.
940          In other words, almost never.  */
941       dissociate_urls_from_file (file);
942     }
943 
944   hash_table_put (dl_file_url_map, xstrdup (file), xstrdup (url));
945 
946  url_only:
947   /* A URL->FILE mapping is not possible without a FILE->URL mapping.
948      If the latter were present, it should have been removed by the
949      above `if'.  So we could write:
950 
951          assert (!hash_table_contains (dl_url_file_map, url));
952 
953      The above is correct when running in recursive mode where the
954      same URL always resolves to the same file.  But if you do
955      something like:
956 
957          wget URL URL
958 
959      then the first URL will resolve to "FILE", and the other to
960      "FILE.1".  In that case, FILE.1 will not be found in
961      dl_file_url_map, but URL will still point to FILE in
962      dl_url_file_map.  */
963   if (hash_table_get_pair (dl_url_file_map, url, &old_url, &old_file))
964     {
965       hash_table_remove (dl_url_file_map, url);
966       xfree (old_url);
967       xfree (old_file);
968     }
969 
970   hash_table_put (dl_url_file_map, xstrdup (url), xstrdup (file));
971 }
972 
973 /* Register that FROM has been redirected to "TO".  This assumes that TO
974    is successfully downloaded and already registered using
975    register_download() above.  */
976 
977 void
register_redirection(const char * from,const char * to)978 register_redirection (const char *from, const char *to)
979 {
980   char *file;
981 
982   ENSURE_TABLES_EXIST;
983 
984   file = hash_table_get (dl_url_file_map, to);
985   assert (file != NULL);
986   if (!hash_table_contains (dl_url_file_map, from))
987     hash_table_put (dl_url_file_map, xstrdup (from), xstrdup (file));
988 }
989 
990 /* Register that the file has been deleted. */
991 
992 void
register_delete_file(const char * file)993 register_delete_file (const char *file)
994 {
995   char *old_url, *old_file;
996 
997   ENSURE_TABLES_EXIST;
998 
999   if (!hash_table_get_pair (dl_file_url_map, file, &old_file, &old_url))
1000     return;
1001 
1002   hash_table_remove (dl_file_url_map, file);
1003   xfree (old_file);
1004   xfree (old_url);
1005   dissociate_urls_from_file (file);
1006 }
1007 
1008 /* Register that FILE is an HTML file that has been downloaded. */
1009 
1010 void
register_html(const char * file)1011 register_html (const char *file)
1012 {
1013   if (!downloaded_html_set)
1014     downloaded_html_set = make_string_hash_table (0);
1015   string_set_add (downloaded_html_set, file);
1016 }
1017 
1018 /* Register that FILE is a CSS file that has been downloaded. */
1019 
1020 void
register_css(const char * file)1021 register_css (const char *file)
1022 {
1023   if (!downloaded_css_set)
1024     downloaded_css_set = make_string_hash_table (0);
1025   string_set_add (downloaded_css_set, file);
1026 }
1027 
1028 /* Cleanup the data structures associated with this file.  */
1029 
1030 #if defined DEBUG_MALLOC || defined TESTING
1031 static void downloaded_files_free (void);
1032 
1033 void
convert_cleanup(void)1034 convert_cleanup (void)
1035 {
1036   if (dl_file_url_map)
1037     {
1038       free_keys_and_values (dl_file_url_map);
1039       hash_table_destroy (dl_file_url_map);
1040       dl_file_url_map = NULL;
1041     }
1042   if (dl_url_file_map)
1043     {
1044       free_keys_and_values (dl_url_file_map);
1045       hash_table_destroy (dl_url_file_map);
1046       dl_url_file_map = NULL;
1047     }
1048   if (downloaded_html_set)
1049     string_set_free (downloaded_html_set);
1050   if (downloaded_css_set)
1051     string_set_free (downloaded_css_set);
1052   downloaded_files_free ();
1053   if (converted_files)
1054     string_set_free (converted_files);
1055 }
1056 #endif
1057 
1058 /* Book-keeping code for downloaded files that enables extension
1059    hacks.  */
1060 
1061 /* This table should really be merged with dl_file_url_map and
1062    downloaded_html_files.  This was originally a list, but I changed
1063    it to a hash table because it was actually taking a lot of time to
1064    find things in it.  */
1065 
1066 static struct hash_table *downloaded_files_hash;
1067 
1068 /* We're storing "modes" of type downloaded_file_t in the hash table.
1069    However, our hash tables only accept pointers for keys and values.
1070    So when we need a pointer, we use the address of a
1071    downloaded_file_t variable of static storage.  */
1072 
1073 static downloaded_file_t *
downloaded_mode_to_ptr(downloaded_file_t mode)1074 downloaded_mode_to_ptr (downloaded_file_t mode)
1075 {
1076   static downloaded_file_t
1077     v1 = FILE_NOT_ALREADY_DOWNLOADED,
1078     v2 = FILE_DOWNLOADED_NORMALLY,
1079     v3 = FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED,
1080     v4 = CHECK_FOR_FILE;
1081 
1082   switch (mode)
1083     {
1084     case FILE_NOT_ALREADY_DOWNLOADED:
1085       return &v1;
1086     case FILE_DOWNLOADED_NORMALLY:
1087       return &v2;
1088     case FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED:
1089       return &v3;
1090     case CHECK_FOR_FILE:
1091       return &v4;
1092     }
1093   return NULL;
1094 }
1095 
1096 /* Remembers which files have been downloaded.  In the standard case,
1097    should be called with mode == FILE_DOWNLOADED_NORMALLY for each
1098    file we actually download successfully (i.e. not for ones we have
1099    failures on or that we skip due to -N).
1100 
1101    When we've downloaded a file and tacked on a ".html" extension due
1102    to -E, call this function with
1103    FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
1104    FILE_DOWNLOADED_NORMALLY.
1105 
1106    If you just want to check if a file has been previously added
1107    without adding it, call with mode == CHECK_FOR_FILE.  Please be
1108    sure to call this function with local filenames, not remote
1109    URLs.  */
1110 
1111 downloaded_file_t
downloaded_file(downloaded_file_t mode,const char * file)1112 downloaded_file (downloaded_file_t mode, const char *file)
1113 {
1114   downloaded_file_t *ptr;
1115 
1116   if (mode == CHECK_FOR_FILE)
1117     {
1118       if (!downloaded_files_hash)
1119         return FILE_NOT_ALREADY_DOWNLOADED;
1120       ptr = hash_table_get (downloaded_files_hash, file);
1121       if (!ptr)
1122         return FILE_NOT_ALREADY_DOWNLOADED;
1123       return *ptr;
1124     }
1125 
1126   if (!downloaded_files_hash)
1127     downloaded_files_hash = make_string_hash_table (0);
1128 
1129   ptr = hash_table_get (downloaded_files_hash, file);
1130   if (ptr)
1131     return *ptr;
1132 
1133   ptr = downloaded_mode_to_ptr (mode);
1134   hash_table_put (downloaded_files_hash, xstrdup (file), ptr);
1135 
1136   return FILE_NOT_ALREADY_DOWNLOADED;
1137 }
1138 
1139 #if defined DEBUG_MALLOC || defined TESTING
1140 static void
downloaded_files_free(void)1141 downloaded_files_free (void)
1142 {
1143   if (downloaded_files_hash)
1144     {
1145       hash_table_iterator iter;
1146       for (hash_table_iterate (downloaded_files_hash, &iter);
1147            hash_table_iter_next (&iter);
1148            )
1149         xfree (iter.key);
1150       hash_table_destroy (downloaded_files_hash);
1151       downloaded_files_hash = NULL;
1152     }
1153 }
1154 #endif
1155 
1156 /* The function returns the pointer to the malloc-ed quoted version of
1157    string s.  It will recognize and quote numeric and special graphic
1158    entities, as per RFC1866:
1159 
1160    `&' -> `&amp;'
1161    `<' -> `&lt;'
1162    `>' -> `&gt;'
1163    `"' -> `&quot;'
1164    SP  -> `&#32;'
1165 
1166    No other entities are recognized or replaced.  */
1167 char *
html_quote_string(const char * s)1168 html_quote_string (const char *s)
1169 {
1170   const char *b = s;
1171   char *p, *res;
1172   int i;
1173 
1174   /* Pass through the string, and count the new size.  */
1175   for (i = 0; *s; s++, i++)
1176     {
1177       if (*s == '&')
1178         i += 4;                 /* `amp;' */
1179       else if (*s == '<' || *s == '>')
1180         i += 3;                 /* `lt;' and `gt;' */
1181       else if (*s == '\"')
1182         i += 5;                 /* `quot;' */
1183       else if (*s == ' ')
1184         i += 4;                 /* #32; */
1185     }
1186   res = xmalloc (i + 1);
1187   s = b;
1188   for (p = res; *s; s++)
1189     {
1190       switch (*s)
1191         {
1192         case '&':
1193           *p++ = '&';
1194           *p++ = 'a';
1195           *p++ = 'm';
1196           *p++ = 'p';
1197           *p++ = ';';
1198           break;
1199         case '<': case '>':
1200           *p++ = '&';
1201           *p++ = (*s == '<' ? 'l' : 'g');
1202           *p++ = 't';
1203           *p++ = ';';
1204           break;
1205         case '\"':
1206           *p++ = '&';
1207           *p++ = 'q';
1208           *p++ = 'u';
1209           *p++ = 'o';
1210           *p++ = 't';
1211           *p++ = ';';
1212           break;
1213         case ' ':
1214           *p++ = '&';
1215           *p++ = '#';
1216           *p++ = '3';
1217           *p++ = '2';
1218           *p++ = ';';
1219           break;
1220         default:
1221           *p++ = *s;
1222         }
1223     }
1224   *p = '\0';
1225   return res;
1226 }
1227 
1228 /*
1229  * vim: et ts=2 sw=2
1230  */
1231