1 /* Collect URLs from HTML source.
2    Copyright (C) 1998-2012, 2015, 2018-2021 Free Software Foundation,
3    Inc.
4 
5 This file is part of GNU Wget.
6 
7 GNU Wget is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3 of the License, or
10  (at your option) any later version.
11 
12 GNU Wget is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 GNU General Public License for more details.
16 
17 You should have received a copy of the GNU General Public License
18 along with Wget.  If not, see <http://www.gnu.org/licenses/>.
19 
20 Additional permission under GNU GPL version 3 section 7
21 
22 If you modify this program, or any covered work, by linking or
23 combining it with the OpenSSL project's OpenSSL library (or a
24 modified version of that library), containing parts covered by the
25 terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
26 grants you additional permission to convey the resulting work.
27 Corresponding Source for a non-source form of such a combination
28 shall include the source code for the parts of OpenSSL used as well
29 as that of the covered work.  */
30 
31 #include "wget.h"
32 
33 #include <stdio.h>
34 #include <string.h>
35 #include <stdlib.h>
36 #include <errno.h>
37 #include <assert.h>
38 
39 #include "exits.h"
40 #include "html-parse.h"
41 #include "url.h"
42 #include "utils.h"
43 #include "hash.h"
44 #include "convert.h"
45 #include "recur.h"
46 #include "html-url.h"
47 #include "css-url.h"
48 #include "c-strcase.h"
49 
/* Signature shared by every tag handler: the tag's id (one of the
   TAG_* constants below), the parsed tag, and the context of the
   current mapping run.  */
typedef void (*tag_handler_t) (int, struct taginfo *, struct map_context *);

/* Forward-declare a static function with the tag_handler_t signature,
   so that it can be referenced from known_tags[] before it is
   defined.  */
#define DECLARE_TAG_HANDLER(fun)                                \
  static void fun (int, struct taginfo *, struct map_context *)

DECLARE_TAG_HANDLER (tag_find_urls);
DECLARE_TAG_HANDLER (tag_handle_base);
DECLARE_TAG_HANDLER (tag_handle_form);
DECLARE_TAG_HANDLER (tag_handle_img);
DECLARE_TAG_HANDLER (tag_handle_link);
DECLARE_TAG_HANDLER (tag_handle_meta);
61 
/* Numeric ids for the tags we know about.  known_tags[] pairs each id
   with a tag name and a handler, and tag_url_attributes[] uses the ids
   to say which attributes of which tags hold URLs.  */
enum {
  TAG_A,
  TAG_APPLET,
  TAG_AREA,
  TAG_BASE,
  TAG_BGSOUND,
  TAG_BODY,
  TAG_EMBED,
  TAG_FIG,
  TAG_FORM,
  TAG_FRAME,
  TAG_IFRAME,
  TAG_IMG,
  TAG_INPUT,
  TAG_LAYER,
  TAG_LINK,
  TAG_META,
  TAG_OBJECT,
  TAG_OVERLAY,
  TAG_SCRIPT,
  TAG_TABLE,
  TAG_TD,
  TAG_TH,
  TAG_VIDEO,
  TAG_AUDIO,
  TAG_SOURCE
};
89 
/* The list of known tags and functions used for handling them.  Most
   tags are simply harvested for URLs.

   init_interesting registers a pointer to each entry in the
   interesting_tags hash table, keyed by NAME; collect_tags_mapper
   looks the tag up there and calls HANDLER with TAGID.  */
static struct known_tag {
  int tagid;                    /* one of the TAG_* constants */
  const char *name;             /* tag name, used as the hash key */
  tag_handler_t handler;        /* function invoked for this tag */
} known_tags[] = {
  { TAG_A,       "a",           tag_find_urls },
  { TAG_APPLET,  "applet",      tag_find_urls },
  { TAG_AREA,    "area",        tag_find_urls },
  { TAG_BASE,    "base",        tag_handle_base },
  { TAG_BGSOUND, "bgsound",     tag_find_urls },
  { TAG_BODY,    "body",        tag_find_urls },
  { TAG_EMBED,   "embed",       tag_find_urls },
  { TAG_FIG,     "fig",         tag_find_urls },
  { TAG_FORM,    "form",        tag_handle_form },
  { TAG_FRAME,   "frame",       tag_find_urls },
  { TAG_IFRAME,  "iframe",      tag_find_urls },
  { TAG_IMG,     "img",         tag_handle_img },
  { TAG_INPUT,   "input",       tag_find_urls },
  { TAG_LAYER,   "layer",       tag_find_urls },
  { TAG_LINK,    "link",        tag_handle_link },
  { TAG_META,    "meta",        tag_handle_meta },
  { TAG_OBJECT,  "object",      tag_find_urls },
  { TAG_OVERLAY, "overlay",     tag_find_urls },
  { TAG_SCRIPT,  "script",      tag_find_urls },
  { TAG_TABLE,   "table",       tag_find_urls },
  { TAG_TD,      "td",          tag_find_urls },
  { TAG_TH,      "th",          tag_find_urls },
  { TAG_VIDEO,   "video",       tag_find_urls },
  { TAG_AUDIO,   "audio",       tag_find_urls },
  { TAG_SOURCE,  "source",      tag_find_urls }
};
123 
/* tag_url_attributes documents which attributes of which tags contain
   URLs to harvest.  It is used by tag_find_urls.  */

/* Defines for the FLAGS. */

/* The link is "inline", i.e. needs to be retrieved for this document
   to be correctly rendered.  Inline links include inlined images,
   stylesheets, children frames, etc.  */
#define ATTR_INLINE     1

/* The link is expected to yield HTML contents.  It's important not to
   try to follow HTML obtained by following e.g. <img src="...">
   regardless of content-type.  Doing this causes infinite loops for
   "images" that return non-404 error pages with links to the same
   image.  */
#define ATTR_HTML       2

/* For tags handled by tag_find_urls: attributes that contain URLs to
   download.

   All entries belonging to one tag must be adjacent: tag_find_urls
   locates the first entry whose TAGID matches and then scans forward
   only while TAGID keeps matching.  For every URL found,
   tag_find_urls turns ATTR_INLINE into link_inline_p and ATTR_HTML
   into link_expect_html on the resulting urlpos.

   The TAG_IMG entries are reached through tag_handle_img, which
   delegates to tag_find_urls before handling "srcset" itself.  */
static struct {
  int tagid;                    /* one of the TAG_* constants */
  const char *attr_name;        /* attribute whose value is a URL */
  int flags;                    /* bitwise OR of ATTR_* flags */
} tag_url_attributes[] = {
  { TAG_A,              "href",         ATTR_HTML },
  { TAG_APPLET,         "code",         ATTR_INLINE },
  { TAG_AREA,           "href",         ATTR_HTML },
  { TAG_BGSOUND,        "src",          ATTR_INLINE },
  { TAG_BODY,           "background",   ATTR_INLINE },
  { TAG_EMBED,          "href",         ATTR_HTML },
  { TAG_EMBED,          "src",          ATTR_INLINE | ATTR_HTML },
  { TAG_FIG,            "src",          ATTR_INLINE },
  { TAG_FRAME,          "src",          ATTR_INLINE | ATTR_HTML },
  { TAG_IFRAME,         "src",          ATTR_INLINE | ATTR_HTML },
  { TAG_IMG,            "href",         ATTR_INLINE },
  { TAG_IMG,            "lowsrc",       ATTR_INLINE },
  { TAG_IMG,            "src",          ATTR_INLINE },
  { TAG_INPUT,          "src",          ATTR_INLINE },
  { TAG_LAYER,          "src",          ATTR_INLINE | ATTR_HTML },
  { TAG_OBJECT,         "data",         ATTR_INLINE },
  { TAG_OVERLAY,        "src",          ATTR_INLINE | ATTR_HTML },
  { TAG_SCRIPT,         "src",          ATTR_INLINE },
  { TAG_TABLE,          "background",   ATTR_INLINE },
  { TAG_TD,             "background",   ATTR_INLINE },
  { TAG_TH,             "background",   ATTR_INLINE },
  { TAG_VIDEO,          "src",          ATTR_INLINE },
  { TAG_VIDEO,          "poster",       ATTR_INLINE },
  { TAG_AUDIO,          "src",          ATTR_INLINE },
  { TAG_AUDIO,          "poster",       ATTR_INLINE },
  { TAG_SOURCE,         "src",          ATTR_INLINE }
};
175 
/* The lists of interesting tags and attributes are built dynamically,
   from the information above.  However, some places in the code refer
   to attributes not mentioned there.  We add them manually;
   init_interesting inserts each of these names into
   interesting_attributes.  */
static const char *additional_attributes[] = {
  "rel",                        /* used by tag_handle_link  */
  "type",                       /* used by tag_handle_link  */
  "http-equiv",                 /* used by tag_handle_meta  */
  "name",                       /* used by tag_handle_meta  */
  "content",                    /* used by tag_handle_meta  */
  "action",                     /* used by tag_handle_form  */
  "style",                      /* used by check_style_attr */
  "srcset",                     /* used by tag_handle_img */
};
189 
/* Case-insensitive hash tables built once by init_interesting:
   interesting_tags maps a tag name to its known_tags entry, and
   interesting_attributes holds the attribute names we look at.  */
static struct hash_table *interesting_tags;
static struct hash_table *interesting_attributes;

/* Holds the (last) charset found in 'http-equiv=content-type' meta
   tags.  Set by tag_handle_meta; consumed and released by
   get_urls_html_fm.  */
static char *meta_charset;
196 
197 static void
init_interesting(void)198 init_interesting (void)
199 {
200   /* Init the variables interesting_tags and interesting_attributes
201      that are used by the HTML parser to know which tags and
202      attributes we're interested in.  We initialize this only once,
203      for performance reasons.
204 
205      Here we also make sure that what we put in interesting_tags
206      matches the user's preferences as specified through --ignore-tags
207      and --follow-tags.  */
208 
209   size_t i;
210   interesting_tags = make_nocase_string_hash_table (countof (known_tags));
211 
212   /* First, add all the tags we know hot to handle, mapped to their
213      respective entries in known_tags.  */
214   for (i = 0; i < countof (known_tags); i++)
215     hash_table_put (interesting_tags, known_tags[i].name, known_tags + i);
216 
217   /* Then remove the tags ignored through --ignore-tags.  */
218   if (opt.ignore_tags)
219     {
220       char **ignored;
221       for (ignored = opt.ignore_tags; *ignored; ignored++)
222         hash_table_remove (interesting_tags, *ignored);
223     }
224 
225   /* If --follow-tags is specified, use only those tags.  */
226   if (opt.follow_tags)
227     {
228       /* Create a new table intersecting --follow-tags and known_tags,
229          and use it as interesting_tags.  */
230       struct hash_table *intersect = make_nocase_string_hash_table (0);
231       char **followed;
232       for (followed = opt.follow_tags; *followed; followed++)
233         {
234           struct known_tag *t = hash_table_get (interesting_tags, *followed);
235           if (!t)
236             continue;           /* ignore unknown --follow-tags entries. */
237           hash_table_put (intersect, *followed, t);
238         }
239       hash_table_destroy (interesting_tags);
240       interesting_tags = intersect;
241     }
242 
243   /* Add the attributes we care about. */
244   interesting_attributes = make_nocase_string_hash_table (10);
245   for (i = 0; i < countof (additional_attributes); i++)
246     hash_table_put (interesting_attributes, additional_attributes[i], "1");
247   for (i = 0; i < countof (tag_url_attributes); i++)
248     hash_table_put (interesting_attributes,
249                     tag_url_attributes[i].attr_name, "1");
250 }
251 
252 /* Find the value of attribute named NAME in the taginfo TAG.  If the
253    attribute is not present, return NULL.  If ATTRIND is non-NULL, the
254    index of the attribute in TAG will be stored there.  */
255 
256 static char *
find_attr(struct taginfo * tag,const char * name,int * attrind)257 find_attr (struct taginfo *tag, const char *name, int *attrind)
258 {
259   int i;
260   for (i = 0; i < tag->nattrs; i++)
261     if (!c_strcasecmp (tag->attrs[i].name, name))
262       {
263         if (attrind)
264           *attrind = i;
265         return tag->attrs[i].value;
266       }
267   return NULL;
268 }
269 
/* Used for calls to append_url.

   ATTR_POS yields the offset of the raw value of attribute ATTRIND of
   TAG from the start of CTX's text; ATTR_SIZE yields the size of that
   raw value.  Every macro argument is parenthesized so that arbitrary
   expressions (e.g. `&tag' or `i + 1') expand with the intended
   precedence.  */
#define ATTR_POS(tag, attrind, ctx) \
 ((tag)->attrs[(attrind)].value_raw_beginning - (ctx)->text)
#define ATTR_SIZE(tag, attrind) \
 ((tag)->attrs[(attrind)].value_raw_size)
275 
276 /* Append LINK_URI to the urlpos structure that is being built.
277 
278    LINK_URI will be merged with the current document base.
279 */
280 
281 struct urlpos *
append_url(const char * link_uri,int position,int size,struct map_context * ctx)282 append_url (const char *link_uri, int position, int size,
283             struct map_context *ctx)
284 {
285   int link_has_scheme = url_has_scheme (link_uri);
286   struct urlpos *newel;
287   const char *base = ctx->base ? ctx->base : ctx->parent_base;
288   struct url *url;
289 
290   struct iri *iri = iri_new ();
291   set_uri_encoding (iri, opt.locale, true);
292   iri->utf8_encode = true;
293 
294   if (!base)
295     {
296       DEBUGP (("%s: no base, merge will use \"%s\".\n",
297                ctx->document_file, link_uri));
298 
299       if (!link_has_scheme)
300         {
301           /* Base URL is unavailable, and the link does not have a
302              location attached to it -- we have to give up.  Since
303              this can only happen when using `--force-html -i', print
304              a warning.  */
305           logprintf (LOG_NOTQUIET,
306                      _("%s: Cannot resolve incomplete link %s.\n"),
307                      ctx->document_file, link_uri);
308           iri_free (iri);
309           return NULL;
310         }
311 
312       url = url_parse (link_uri, NULL, iri, false);
313       if (!url)
314         {
315           DEBUGP (("%s: link \"%s\" doesn't parse.\n",
316                    ctx->document_file, link_uri));
317           iri_free (iri);
318           return NULL;
319         }
320     }
321   else
322     {
323       /* Merge BASE with LINK_URI, but also make sure the result is
324          canonicalized, i.e. that "../" have been resolved.
325          (parse_url will do that for us.) */
326 
327       char *complete_uri = uri_merge (base, link_uri);
328 
329       DEBUGP (("%s: merge(%s, %s) -> %s\n",
330                quotearg_n_style (0, escape_quoting_style, ctx->document_file),
331                quote_n (1, base),
332                quote_n (2, link_uri),
333                quotearg_n_style (3, escape_quoting_style, complete_uri)));
334 
335       url = url_parse (complete_uri, NULL, iri, false);
336       if (!url)
337         {
338           DEBUGP (("%s: merged link \"%s\" doesn't parse.\n",
339                    ctx->document_file, complete_uri));
340           xfree (complete_uri);
341           iri_free (iri);
342           return NULL;
343         }
344       xfree (complete_uri);
345     }
346 
347   iri_free (iri);
348 
349   DEBUGP (("appending %s to urlpos.\n", quote (url->url)));
350 
351   newel = xnew0 (struct urlpos);
352   newel->url = url;
353   newel->pos = position;
354   newel->size = size;
355 
356   /* A URL is relative if the host is not named, and the name does not
357      start with `/'.  */
358   if (!link_has_scheme && *link_uri != '/')
359     newel->link_relative_p = 1;
360   else if (link_has_scheme)
361     newel->link_complete_p = 1;
362 
363   /* Append the new URL maintaining the order by position.  */
364   if (ctx->head == NULL)
365     ctx->head = newel;
366   else
367     {
368       struct urlpos *it, *prev = NULL;
369 
370       it = ctx->head;
371       while (it && position > it->pos)
372         {
373           prev = it;
374           it = it->next;
375         }
376 
377       newel->next = it;
378 
379       if (prev)
380         prev->next = newel;
381       else
382         ctx->head = newel;
383     }
384 
385   return newel;
386 }
387 
388 static void
check_style_attr(struct taginfo * tag,struct map_context * ctx)389 check_style_attr (struct taginfo *tag, struct map_context *ctx)
390 {
391   int attrind;
392   int raw_start;
393   int raw_len;
394   char *style = find_attr (tag, "style", &attrind);
395   if (!style)
396     return;
397 
398   /* raw pos and raw size include the quotes, skip them when they are
399      present.  */
400   raw_start = ATTR_POS (tag, attrind, ctx);
401   raw_len  = ATTR_SIZE (tag, attrind);
402   if( *(char *)(ctx->text + raw_start) == '\''
403       || *(char *)(ctx->text + raw_start) == '"')
404     {
405       raw_start += 1;
406       raw_len -= 2;
407     }
408 
409   if(raw_len <= 0)
410        return;
411 
412   get_urls_css (ctx, raw_start, raw_len);
413 }
414 
415 /* All the tag_* functions are called from collect_tags_mapper, as
416    specified by KNOWN_TAGS.  */
417 
418 /* Default tag handler: collect URLs from attributes specified for
419    this tag by tag_url_attributes.  */
420 
421 static void
tag_find_urls(int tagid,struct taginfo * tag,struct map_context * ctx)422 tag_find_urls (int tagid, struct taginfo *tag, struct map_context *ctx)
423 {
424   size_t i;
425   int attrind;
426   int first = -1;
427 
428   for (i = 0; i < countof (tag_url_attributes); i++)
429     if (tag_url_attributes[i].tagid == tagid)
430       {
431         /* We've found the index of tag_url_attributes where the
432            attributes of our tag begin.  */
433         first = i;
434         break;
435       }
436   assert (first != -1);
437 
438   /* Loop over the "interesting" attributes of this tag.  In this
439      example, it will loop over "src" and "lowsrc".
440 
441        <img src="foo.png" lowsrc="bar.png">
442 
443      This has to be done in the outer loop so that the attributes are
444      processed in the same order in which they appear in the page.
445      This is required when converting links.  */
446 
447   for (attrind = 0; attrind < tag->nattrs; attrind++)
448     {
449       /* Find whether TAG/ATTRIND is a combination that contains a
450          URL. */
451       char *link = tag->attrs[attrind].value;
452       const size_t size = countof (tag_url_attributes);
453 
454       /* If you're cringing at the inefficiency of the nested loops,
455          remember that they both iterate over a very small number of
456          items.  The worst-case inner loop is for the IMG tag, which
457          has three attributes.  */
458       for (i = first; i < size && tag_url_attributes[i].tagid == tagid; i++)
459         {
460           if (0 == strcasecmp (tag->attrs[attrind].name,
461                                tag_url_attributes[i].attr_name))
462             {
463               struct urlpos *up = append_url (link, ATTR_POS(tag,attrind,ctx),
464                                               ATTR_SIZE(tag,attrind), ctx);
465               if (up)
466                 {
467                   int flags = tag_url_attributes[i].flags;
468                   if (flags & ATTR_INLINE)
469                     up->link_inline_p = 1;
470                   if (flags & ATTR_HTML)
471                     up->link_expect_html = 1;
472                 }
473             }
474         }
475     }
476 }
477 
478 /* Handle the BASE tag, for <base href=...>. */
479 
480 static void
tag_handle_base(int tagid _GL_UNUSED,struct taginfo * tag,struct map_context * ctx)481 tag_handle_base (int tagid _GL_UNUSED, struct taginfo *tag, struct map_context *ctx)
482 {
483   struct urlpos *base_urlpos;
484   int attrind;
485   char *newbase = find_attr (tag, "href", &attrind);
486   if (!newbase)
487     return;
488 
489   base_urlpos = append_url (newbase, ATTR_POS(tag,attrind,ctx),
490                             ATTR_SIZE(tag,attrind), ctx);
491   if (!base_urlpos)
492     return;
493   base_urlpos->ignore_when_downloading = 1;
494   base_urlpos->link_base_p = 1;
495 
496   xfree (ctx->base);
497   if (ctx->parent_base)
498     ctx->base = uri_merge (ctx->parent_base, newbase);
499   else
500     ctx->base = xstrdup (newbase);
501 }
502 
503 /* Mark the URL found in <form action=...> for conversion. */
504 
505 static void
tag_handle_form(int tagid _GL_UNUSED,struct taginfo * tag,struct map_context * ctx)506 tag_handle_form (int tagid _GL_UNUSED, struct taginfo *tag, struct map_context *ctx)
507 {
508   int attrind;
509   char *action = find_attr (tag, "action", &attrind);
510 
511   if (action)
512     {
513       struct urlpos *up = append_url (action, ATTR_POS(tag,attrind,ctx),
514                                       ATTR_SIZE(tag,attrind), ctx);
515       if (up)
516         up->ignore_when_downloading = 1;
517     }
518 }
519 
520 /* Handle the LINK tag.  It requires special handling because how its
521    links will be followed in -p mode depends on the REL attribute.  */
522 
523 static void
tag_handle_link(int tagid _GL_UNUSED,struct taginfo * tag,struct map_context * ctx)524 tag_handle_link (int tagid _GL_UNUSED, struct taginfo *tag, struct map_context *ctx)
525 {
526   int attrind;
527   char *href = find_attr (tag, "href", &attrind);
528 
529   /* All <link href="..."> link references are external, except those
530      known not to be, such as style sheet and shortcut icon:
531 
532      <link rel="stylesheet" href="..."> or <link rel="alternate stylesheet" href="...">
533      <link rel="shortcut icon" href="..."> or <link rel="icon" href="...">
534   */
535   if (href)
536     {
537       struct urlpos *up = append_url (href, ATTR_POS(tag,attrind,ctx),
538                                       ATTR_SIZE(tag,attrind), ctx);
539       if (up)
540         {
541           char *rel = find_attr (tag, "rel", NULL);
542           if (rel)
543             {
544               if (0 == c_strcasecmp (rel, "stylesheet") || 0 == c_strcasecmp (rel, "alternate stylesheet"))
545                 {
546                   up->link_inline_p = 1;
547                   up->link_expect_css = 1;
548                 }
549               else if (0 == c_strcasecmp (rel, "shortcut icon") || 0 == c_strcasecmp (rel, "icon"))
550                 {
551                   up->link_inline_p = 1;
552                 }
553               else if (0 == c_strcasecmp (rel, "manifest"))
554                 {
555                   up->link_inline_p = 1;
556                 }
557               else
558                 {
559                   /* The external ones usually point to HTML pages, such as
560                      <link rel="next" href="...">
561                      except when the type attribute says otherwise:
562                      <link rel="alternate" type="application/rss+xml" href=".../?feed=rss2" />
563                   */
564                   char *type = find_attr (tag, "type", NULL);
565                   if (!type || c_strcasecmp (type, "text/html") == 0)
566                     up->link_expect_html = 1;
567                 }
568             }
569         }
570     }
571 }
572 
573 /* Handle the META tag.  This requires special handling because of the
574    refresh feature and because of robot exclusion.  */
575 
576 static void
tag_handle_meta(int tagid _GL_UNUSED,struct taginfo * tag,struct map_context * ctx)577 tag_handle_meta (int tagid _GL_UNUSED, struct taginfo *tag, struct map_context *ctx)
578 {
579   char *name = find_attr (tag, "name", NULL);
580   char *http_equiv = find_attr (tag, "http-equiv", NULL);
581 
582   if (http_equiv && 0 == c_strcasecmp (http_equiv, "refresh"))
583     {
584       /* Some pages use a META tag to specify that the page be
585          refreshed by a new page after a given number of seconds.  The
586          general format for this is:
587 
588            <meta http-equiv=Refresh content="NUMBER; URL=index2.html">
589 
590          So we just need to skip past the "NUMBER; URL=" garbage to
591          get to the URL.  */
592 
593       struct urlpos *entry;
594       int attrind;
595       int timeout;
596       char *p;
597 
598       char *refresh = find_attr (tag, "content", &attrind);
599       if (!refresh)
600         return;
601 
602       timeout = strtol(refresh, &p, 10);
603 
604       if (timeout < 0 || *p++ != ';')
605         return;
606 
607       while (c_isspace (*p))
608         ++p;
609       if (!(   c_toupper (*p)       == 'U'
610             && c_toupper (*(p + 1)) == 'R'
611             && c_toupper (*(p + 2)) == 'L'
612             &&          *(p + 3)  == '='))
613         return;
614       p += 4;
615       while (c_isspace (*p))
616         ++p;
617 
618       entry = append_url (p, ATTR_POS(tag,attrind,ctx),
619                           ATTR_SIZE(tag,attrind), ctx);
620       if (entry)
621         {
622           entry->link_refresh_p = 1;
623           entry->refresh_timeout = timeout;
624           entry->link_expect_html = 1;
625         }
626     }
627   else if (http_equiv && 0 == c_strcasecmp (http_equiv, "content-type"))
628     {
629       /* Handle stuff like:
630          <meta http-equiv="Content-Type" content="text/html; charset=CHARSET"> */
631 
632       char *mcharset;
633       char *content = find_attr (tag, "content", NULL);
634       if (!content)
635         return;
636 
637       mcharset = parse_charset (content);
638       if (!mcharset)
639         return;
640 
641       xfree (meta_charset);
642       meta_charset = mcharset;
643     }
644   else if (name && 0 == c_strcasecmp (name, "robots"))
645     {
646       /* Handle stuff like:
647          <meta name="robots" content="index,nofollow"> */
648       char *content = find_attr (tag, "content", NULL);
649       if (!content)
650         return;
651       if (!c_strcasecmp (content, "none"))
652         ctx->nofollow = true;
653       else
654         {
655           while (*content)
656             {
657               char *end;
658               /* Skip any initial whitespace. */
659               content += strspn (content, " \f\n\r\t\v");
660               /* Find the next occurrence of ',' or whitespace,
661                * or the end of the string.  */
662               end = content + strcspn (content, ", \f\n\r\t\v");
663               if (!c_strncasecmp (content, "nofollow", end - content))
664                 ctx->nofollow = true;
665               /* Skip past the next comma, if any. */
666               if (*end == ',')
667                 ++end;
668               else
669                 {
670                   end = strchr (end, ',');
671                   if (end)
672                     ++end;
673                   else
674                     end = content + strlen (content);
675                 }
676               content = end;
677             }
678         }
679     }
680 }
681 
682 /* Handle the IMG tag.  This requires special handling for the srcset attr,
683    while the traditional src/lowsrc/href attributes can be handled generically.
684 */
685 
686 static void
tag_handle_img(int tagid,struct taginfo * tag,struct map_context * ctx)687 tag_handle_img (int tagid, struct taginfo *tag, struct map_context *ctx) {
688   int attrind;
689   char *srcset;
690 
691   /* Use the generic approach for the attributes without special syntax. */
692   tag_find_urls(tagid, tag, ctx);
693 
694   srcset = find_attr (tag, "srcset", &attrind);
695   if (srcset)
696     {
697       /* These are relative to the input text. */
698       int base_ind = ATTR_POS (tag,attrind,ctx);
699       int size = strlen (srcset);
700 
701       /* These are relative to srcset. */
702       int offset, url_start, url_end;
703 
704       /* Make sure to line up base_ind with srcset[0], not outside quotes. */
705       if (ctx->text[base_ind] == '"' || ctx->text[base_ind] == '\'')
706         ++base_ind;
707 
708       offset = 0;
709       while (offset < size)
710         {
711           bool has_descriptor = true;
712 
713           /* Skip over initial whitespace and commas. Note there is no \v
714             in HTML5 whitespace. */
715           url_start = offset + strspn (srcset + offset, " \f\n\r\t,");
716 
717           if (url_start == size)
718             return;
719 
720           /* URL is any non-whitespace chars (including commas) - but with
721              trailing commas removed. */
722           url_end = url_start + strcspn (srcset + url_start, " \f\n\r\t");
723           while ((url_end - 1) > url_start && srcset[url_end - 1] == ',')
724             {
725               has_descriptor = false;
726               --url_end;
727             }
728 
729           if (url_end > url_start)
730             {
731               char *url_text = strdupdelim (srcset + url_start,
732                                             srcset + url_end);
733               struct urlpos *up = append_url (url_text, base_ind + url_start,
734                                               url_end - url_start, ctx);
735               if (up)
736                 {
737                   up->link_inline_p = 1;
738                   up->link_noquote_html_p = 1;
739                 }
740               xfree (url_text);
741             }
742 
743           /* If the URL wasn't terminated by a , there may also be a descriptor
744              which we just skip. */
745           if (has_descriptor)
746             {
747               /* This is comma-terminated, except there may be one level of
748                  parentheses escaping that. */
749               bool in_paren = false;
750               for (offset = url_end; offset < size; ++offset)
751                 {
752                   char c = srcset[offset];
753                   if (c == '(')
754                     in_paren = true;
755                   else if (c == ')' && in_paren)
756                     in_paren = false;
757                   else if (c == ',' && !in_paren)
758                     break;
759                 }
760             }
761           else
762             offset = url_end;
763         }
764     }
765 }
766 
767 /* Dispatch the tag handler appropriate for the tag we're mapping
768    over.  See known_tags[] for definition of tag handlers.  */
769 
770 static void
collect_tags_mapper(struct taginfo * tag,void * arg)771 collect_tags_mapper (struct taginfo *tag, void *arg)
772 {
773   struct map_context *ctx = (struct map_context *)arg;
774 
775   /* Find the tag in our table of tags.  This must not fail because
776      map_html_tags only returns tags found in interesting_tags.
777 
778      I've changed this for now, I'm passing NULL as interesting_tags
779      to map_html_tags.  This way we can check all tags for a style
780      attribute.
781   */
782   struct known_tag *t = hash_table_get (interesting_tags, tag->name);
783 
784   if (t != NULL)
785     t->handler (t->tagid, tag, ctx);
786 
787   check_style_attr (tag, ctx);
788 
789   if (tag->end_tag_p && (0 == c_strcasecmp (tag->name, "style"))
790       && tag->contents_begin && tag->contents_end
791       && tag->contents_begin <= tag->contents_end)
792   {
793     /* parse contents */
794     get_urls_css (ctx, tag->contents_begin - ctx->text,
795                   tag->contents_end - tag->contents_begin);
796   }
797 }
798 
799 /* Analyze HTML tags FILE and construct a list of URLs referenced from
800    it.  It merges relative links in FILE with URL.  It is aware of
801    <base href=...> and does the right thing.  */
802 
803 struct urlpos *
get_urls_html_fm(const char * file,const struct file_memory * fm,const char * url,bool * meta_disallow_follow,struct iri * iri)804 get_urls_html_fm (const char *file, const struct file_memory *fm,
805                     const char *url, bool *meta_disallow_follow,
806                     struct iri *iri)
807 {
808   struct map_context ctx;
809   int flags;
810 
811   ctx.text = fm->content;
812   ctx.head = NULL;
813   ctx.base = NULL;
814   ctx.parent_base = url ? url : opt.base_href;
815   ctx.document_file = file;
816   ctx.nofollow = false;
817 
818   if (!interesting_tags)
819     init_interesting ();
820 
821   /* Specify MHT_TRIM_VALUES because of buggy HTML generators that
822      generate <a href=" foo"> instead of <a href="foo"> (browsers
823      ignore spaces as well.)  If you really mean space, use &32; or
824      %20.  MHT_TRIM_VALUES also causes squashing of embedded newlines,
825      e.g. in <img src="foo.[newline]html">.  Such newlines are also
826      ignored by IE and Mozilla and are presumably introduced by
827      writing HTML with editors that force word wrap.  */
828   flags = MHT_TRIM_VALUES;
829   if (opt.strict_comments)
830     flags |= MHT_STRICT_COMMENTS;
831 
832   /* the NULL here used to be interesting_tags */
833   map_html_tags (fm->content, fm->length, collect_tags_mapper, &ctx, flags,
834                  NULL, interesting_attributes);
835 
836 #ifdef ENABLE_IRI
837   /* Meta charset is only valid if there was no HTTP header Content-Type charset. */
838   /* This is true for HTTP 1.0 and 1.1. */
839   if (iri && !iri->content_encoding && meta_charset)
840     set_content_encoding (iri, meta_charset);
841 #endif
842   xfree (meta_charset);
843 
844   DEBUGP (("nofollow in %s: %d\n", file, ctx.nofollow));
845 
846   if (meta_disallow_follow)
847     *meta_disallow_follow = ctx.nofollow;
848 
849   xfree (ctx.base);
850   return ctx.head;
851 }
852 
853 struct urlpos *
get_urls_html(const char * file,const char * url,bool * meta_disallow_follow,struct iri * iri)854 get_urls_html (const char *file, const char *url, bool *meta_disallow_follow,
855                  struct iri *iri)
856 {
857   struct urlpos *urls;
858   struct file_memory *fm;
859 
860   fm = wget_read_file (file);
861   if (!fm)
862     {
863       logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
864       return NULL;
865     }
866   DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));
867 
868   urls = get_urls_html_fm (file, fm, url, meta_disallow_follow, iri);
869   wget_read_file_free (fm);
870   return urls;
871 }
872 
873 /* This doesn't really have anything to do with HTML, but it's similar
874    to get_urls_html, so we put it here.  */
875 
876 struct urlpos *
get_urls_file(const char * file)877 get_urls_file (const char *file)
878 {
879   struct file_memory *fm;
880   struct urlpos *head, *tail;
881   const char *text, *text_end;
882 
883   /* Load the file.  */
884   fm = wget_read_file (file);
885   if (!fm)
886     {
887       logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
888       return NULL;
889     }
890   DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));
891 
892   head = tail = NULL;
893   text = fm->content;
894   text_end = fm->content + fm->length;
895   while (text < text_end)
896     {
897       int up_error_code;
898       char *url_text;
899       char *new_url;
900       struct urlpos *entry;
901       struct url *url;
902 
903       const char *line_beg = text;
904       const char *line_end = memchr (text, '\n', text_end - text);
905       if (!line_end)
906         line_end = text_end;
907       else
908         ++line_end;
909       text = line_end;
910 
911       /* Strip whitespace from the beginning and end of line. */
912       while (line_beg < line_end && c_isspace (*line_beg))
913         ++line_beg;
914       while (line_end > line_beg && c_isspace (*(line_end - 1)))
915         --line_end;
916 
917       if (line_beg == line_end)
918         continue;
919 
920       /* The URL is in the [line_beg, line_end) region. */
921 
922       /* We must copy the URL to a zero-terminated string, and we
923          can't use alloca because we're in a loop.  *sigh*.  */
924       url_text = strdupdelim (line_beg, line_end);
925 
926       if (opt.base_href)
927         {
928           /* Merge opt.base_href with URL. */
929           char *merged = uri_merge (opt.base_href, url_text);
930           xfree (url_text);
931           url_text = merged;
932         }
933 
934       new_url = rewrite_shorthand_url (url_text);
935       if (new_url)
936         {
937           xfree (url_text);
938           url_text = new_url;
939         }
940 
941       url = url_parse (url_text, &up_error_code, NULL, false);
942       if (!url)
943         {
944           char *error = url_error (url_text, up_error_code);
945           logprintf (LOG_NOTQUIET, _("%s: Invalid URL %s: %s\n"),
946                      file, url_text, error);
947           xfree (url_text);
948           xfree (error);
949           inform_exit_status (URLERROR);
950           continue;
951         }
952       xfree (url_text);
953 
954       entry = xnew0 (struct urlpos);
955       entry->url = url;
956 
957       if (!head)
958         head = entry;
959       else
960         tail->next = entry;
961       tail = entry;
962     }
963   wget_read_file_free (fm);
964   return head;
965 }
966 
#if defined DEBUG_MALLOC || defined TESTING
/* Release the hash tables built by init_interesting.  The hash table
   keys and values are not allocated by this code, so only the tables
   themselves are destroyed.

   The pointers are reset to NULL afterwards: get_urls_html_fm decides
   whether to call init_interesting by testing interesting_tags, so a
   stale pointer would make a later parse use the destroyed tables.  */
void
cleanup_html_url (void)
{
  if (interesting_tags)
    {
      hash_table_destroy (interesting_tags);
      interesting_tags = NULL;
    }
  if (interesting_attributes)
    {
      hash_table_destroy (interesting_attributes);
      interesting_attributes = NULL;
    }
}
#endif
979