1 /* Collect URLs from HTML source.
2 Copyright (C) 1998-2012, 2015, 2018-2021 Free Software Foundation,
3 Inc.
4
5 This file is part of GNU Wget.
6
7 GNU Wget is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3 of the License, or
10 (at your option) any later version.
11
12 GNU Wget is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with Wget. If not, see <http://www.gnu.org/licenses/>.
19
20 Additional permission under GNU GPL version 3 section 7
21
22 If you modify this program, or any covered work, by linking or
23 combining it with the OpenSSL project's OpenSSL library (or a
24 modified version of that library), containing parts covered by the
25 terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
26 grants you additional permission to convey the resulting work.
27 Corresponding Source for a non-source form of such a combination
28 shall include the source code for the parts of OpenSSL used as well
29 as that of the covered work. */
30
31 #include "wget.h"
32
33 #include <stdio.h>
34 #include <string.h>
35 #include <stdlib.h>
36 #include <errno.h>
37 #include <assert.h>
38
39 #include "exits.h"
40 #include "html-parse.h"
41 #include "url.h"
42 #include "utils.h"
43 #include "hash.h"
44 #include "convert.h"
45 #include "recur.h"
46 #include "html-url.h"
47 #include "css-url.h"
48 #include "c-strcase.h"
49
/* Signature shared by every tag handler: the numeric tag id, the
   parsed tag, and the context in which URLs are being collected.  */
typedef void (*tag_handler_t) (int tagid, struct taginfo *tag,
                               struct map_context *ctx);

/* Forward-declare a static handler with the tag_handler_t signature.  */
#define DECLARE_TAG_HANDLER(fun)                                \
  static void fun (int tagid, struct taginfo *tag,              \
                   struct map_context *ctx)

DECLARE_TAG_HANDLER (tag_find_urls);
DECLARE_TAG_HANDLER (tag_handle_base);
DECLARE_TAG_HANDLER (tag_handle_form);
DECLARE_TAG_HANDLER (tag_handle_img);
DECLARE_TAG_HANDLER (tag_handle_link);
DECLARE_TAG_HANDLER (tag_handle_meta);
61
/* Numeric identifiers for the tags listed in known_tags.  The values
   are used to match known_tags entries against tag_url_attributes
   entries.  */
enum {
  TAG_A,      TAG_APPLET,  TAG_AREA,   TAG_BASE,   TAG_BGSOUND,
  TAG_BODY,   TAG_EMBED,   TAG_FIG,    TAG_FORM,   TAG_FRAME,
  TAG_IFRAME, TAG_IMG,     TAG_INPUT,  TAG_LAYER,  TAG_LINK,
  TAG_META,   TAG_OBJECT,  TAG_OVERLAY, TAG_SCRIPT, TAG_TABLE,
  TAG_TD,     TAG_TH,      TAG_VIDEO,  TAG_AUDIO,  TAG_SOURCE
};
89
90 /* The list of known tags and functions used for handling them. Most
91 tags are simply harvested for URLs. */
92 static struct known_tag {
93 int tagid;
94 const char *name;
95 tag_handler_t handler;
96 } known_tags[] = {
97 { TAG_A, "a", tag_find_urls },
98 { TAG_APPLET, "applet", tag_find_urls },
99 { TAG_AREA, "area", tag_find_urls },
100 { TAG_BASE, "base", tag_handle_base },
101 { TAG_BGSOUND, "bgsound", tag_find_urls },
102 { TAG_BODY, "body", tag_find_urls },
103 { TAG_EMBED, "embed", tag_find_urls },
104 { TAG_FIG, "fig", tag_find_urls },
105 { TAG_FORM, "form", tag_handle_form },
106 { TAG_FRAME, "frame", tag_find_urls },
107 { TAG_IFRAME, "iframe", tag_find_urls },
108 { TAG_IMG, "img", tag_handle_img },
109 { TAG_INPUT, "input", tag_find_urls },
110 { TAG_LAYER, "layer", tag_find_urls },
111 { TAG_LINK, "link", tag_handle_link },
112 { TAG_META, "meta", tag_handle_meta },
113 { TAG_OBJECT, "object", tag_find_urls },
114 { TAG_OVERLAY, "overlay", tag_find_urls },
115 { TAG_SCRIPT, "script", tag_find_urls },
116 { TAG_TABLE, "table", tag_find_urls },
117 { TAG_TD, "td", tag_find_urls },
118 { TAG_TH, "th", tag_find_urls },
119 { TAG_VIDEO, "video", tag_find_urls },
120 { TAG_AUDIO, "audio", tag_find_urls },
121 { TAG_SOURCE, "source", tag_find_urls }
122 };
123
124 /* tag_url_attributes documents which attributes of which tags contain
125 URLs to harvest. It is used by tag_find_urls. */
126
/* Bit flags stored in the FLAGS member of tag_url_attributes.  */

/* ATTR_INLINE: the link is "inline" -- it has to be fetched for the
   document to render correctly.  Examples are inlined images,
   stylesheets and child frames.  */
#define ATTR_INLINE (1 << 0)

/* ATTR_HTML: the link is expected to lead to HTML.  Only such links
   may be followed as HTML; treating the result of e.g.
   <img src="..."> as HTML regardless of content-type causes infinite
   loops for "images" that return non-404 error pages linking back to
   the same image.  */
#define ATTR_HTML (1 << 1)
140
141 /* For tags handled by tag_find_urls: attributes that contain URLs to
142 download. */
143 static struct {
144 int tagid;
145 const char *attr_name;
146 int flags;
147 } tag_url_attributes[] = {
148 { TAG_A, "href", ATTR_HTML },
149 { TAG_APPLET, "code", ATTR_INLINE },
150 { TAG_AREA, "href", ATTR_HTML },
151 { TAG_BGSOUND, "src", ATTR_INLINE },
152 { TAG_BODY, "background", ATTR_INLINE },
153 { TAG_EMBED, "href", ATTR_HTML },
154 { TAG_EMBED, "src", ATTR_INLINE | ATTR_HTML },
155 { TAG_FIG, "src", ATTR_INLINE },
156 { TAG_FRAME, "src", ATTR_INLINE | ATTR_HTML },
157 { TAG_IFRAME, "src", ATTR_INLINE | ATTR_HTML },
158 { TAG_IMG, "href", ATTR_INLINE },
159 { TAG_IMG, "lowsrc", ATTR_INLINE },
160 { TAG_IMG, "src", ATTR_INLINE },
161 { TAG_INPUT, "src", ATTR_INLINE },
162 { TAG_LAYER, "src", ATTR_INLINE | ATTR_HTML },
163 { TAG_OBJECT, "data", ATTR_INLINE },
164 { TAG_OVERLAY, "src", ATTR_INLINE | ATTR_HTML },
165 { TAG_SCRIPT, "src", ATTR_INLINE },
166 { TAG_TABLE, "background", ATTR_INLINE },
167 { TAG_TD, "background", ATTR_INLINE },
168 { TAG_TH, "background", ATTR_INLINE },
169 { TAG_VIDEO, "src", ATTR_INLINE },
170 { TAG_VIDEO, "poster", ATTR_INLINE },
171 { TAG_AUDIO, "src", ATTR_INLINE },
172 { TAG_AUDIO, "poster", ATTR_INLINE },
173 { TAG_SOURCE, "src", ATTR_INLINE }
174 };
175
/* The set of interesting attributes is assembled at run time from
   tag_url_attributes.  The attributes below are looked up directly
   by individual handlers rather than through that table, so they
   are listed here and added to the set by hand.  */
static const char *additional_attributes[] = {
  /* tag_handle_link */
  "rel",
  "type",
  /* tag_handle_meta */
  "http-equiv",
  "name",
  "content",
  /* tag_handle_form */
  "action",
  /* check_style_attr */
  "style",
  /* tag_handle_img */
  "srcset",
};
189
/* Lookup tables built once by init_interesting: tag name -> entry of
   known_tags, and the set of attribute names worth parsing.  */
static struct hash_table *interesting_tags;
static struct hash_table *interesting_attributes;

/* Holds the most recent charset found in an
   'http-equiv=content-type' meta tag, or NULL if none was seen.  */
static char *meta_charset;
196
197 static void
init_interesting(void)198 init_interesting (void)
199 {
200 /* Init the variables interesting_tags and interesting_attributes
201 that are used by the HTML parser to know which tags and
202 attributes we're interested in. We initialize this only once,
203 for performance reasons.
204
205 Here we also make sure that what we put in interesting_tags
206 matches the user's preferences as specified through --ignore-tags
207 and --follow-tags. */
208
209 size_t i;
210 interesting_tags = make_nocase_string_hash_table (countof (known_tags));
211
212 /* First, add all the tags we know hot to handle, mapped to their
213 respective entries in known_tags. */
214 for (i = 0; i < countof (known_tags); i++)
215 hash_table_put (interesting_tags, known_tags[i].name, known_tags + i);
216
217 /* Then remove the tags ignored through --ignore-tags. */
218 if (opt.ignore_tags)
219 {
220 char **ignored;
221 for (ignored = opt.ignore_tags; *ignored; ignored++)
222 hash_table_remove (interesting_tags, *ignored);
223 }
224
225 /* If --follow-tags is specified, use only those tags. */
226 if (opt.follow_tags)
227 {
228 /* Create a new table intersecting --follow-tags and known_tags,
229 and use it as interesting_tags. */
230 struct hash_table *intersect = make_nocase_string_hash_table (0);
231 char **followed;
232 for (followed = opt.follow_tags; *followed; followed++)
233 {
234 struct known_tag *t = hash_table_get (interesting_tags, *followed);
235 if (!t)
236 continue; /* ignore unknown --follow-tags entries. */
237 hash_table_put (intersect, *followed, t);
238 }
239 hash_table_destroy (interesting_tags);
240 interesting_tags = intersect;
241 }
242
243 /* Add the attributes we care about. */
244 interesting_attributes = make_nocase_string_hash_table (10);
245 for (i = 0; i < countof (additional_attributes); i++)
246 hash_table_put (interesting_attributes, additional_attributes[i], "1");
247 for (i = 0; i < countof (tag_url_attributes); i++)
248 hash_table_put (interesting_attributes,
249 tag_url_attributes[i].attr_name, "1");
250 }
251
252 /* Find the value of attribute named NAME in the taginfo TAG. If the
253 attribute is not present, return NULL. If ATTRIND is non-NULL, the
254 index of the attribute in TAG will be stored there. */
255
256 static char *
find_attr(struct taginfo * tag,const char * name,int * attrind)257 find_attr (struct taginfo *tag, const char *name, int *attrind)
258 {
259 int i;
260 for (i = 0; i < tag->nattrs; i++)
261 if (!c_strcasecmp (tag->attrs[i].name, name))
262 {
263 if (attrind)
264 *attrind = i;
265 return tag->attrs[i].value;
266 }
267 return NULL;
268 }
269
/* Helpers for building the POSITION and SIZE arguments of append_url.

   ATTR_POS yields the offset of the raw value of attribute number
   ATTRIND of TAG, measured from the start of CTX's text.  ATTR_SIZE
   yields the length of that raw value.

   Every macro argument is parenthesized so that expressions such as
   `&tag' or `i + 1' can be passed safely.  */
#define ATTR_POS(tag, attrind, ctx)                                     \
  ((tag)->attrs[(attrind)].value_raw_beginning - (ctx)->text)
#define ATTR_SIZE(tag, attrind)                                         \
  ((tag)->attrs[(attrind)].value_raw_size)
275
276 /* Append LINK_URI to the urlpos structure that is being built.
277
278 LINK_URI will be merged with the current document base.
279 */
280
281 struct urlpos *
append_url(const char * link_uri,int position,int size,struct map_context * ctx)282 append_url (const char *link_uri, int position, int size,
283 struct map_context *ctx)
284 {
285 int link_has_scheme = url_has_scheme (link_uri);
286 struct urlpos *newel;
287 const char *base = ctx->base ? ctx->base : ctx->parent_base;
288 struct url *url;
289
290 struct iri *iri = iri_new ();
291 set_uri_encoding (iri, opt.locale, true);
292 iri->utf8_encode = true;
293
294 if (!base)
295 {
296 DEBUGP (("%s: no base, merge will use \"%s\".\n",
297 ctx->document_file, link_uri));
298
299 if (!link_has_scheme)
300 {
301 /* Base URL is unavailable, and the link does not have a
302 location attached to it -- we have to give up. Since
303 this can only happen when using `--force-html -i', print
304 a warning. */
305 logprintf (LOG_NOTQUIET,
306 _("%s: Cannot resolve incomplete link %s.\n"),
307 ctx->document_file, link_uri);
308 iri_free (iri);
309 return NULL;
310 }
311
312 url = url_parse (link_uri, NULL, iri, false);
313 if (!url)
314 {
315 DEBUGP (("%s: link \"%s\" doesn't parse.\n",
316 ctx->document_file, link_uri));
317 iri_free (iri);
318 return NULL;
319 }
320 }
321 else
322 {
323 /* Merge BASE with LINK_URI, but also make sure the result is
324 canonicalized, i.e. that "../" have been resolved.
325 (parse_url will do that for us.) */
326
327 char *complete_uri = uri_merge (base, link_uri);
328
329 DEBUGP (("%s: merge(%s, %s) -> %s\n",
330 quotearg_n_style (0, escape_quoting_style, ctx->document_file),
331 quote_n (1, base),
332 quote_n (2, link_uri),
333 quotearg_n_style (3, escape_quoting_style, complete_uri)));
334
335 url = url_parse (complete_uri, NULL, iri, false);
336 if (!url)
337 {
338 DEBUGP (("%s: merged link \"%s\" doesn't parse.\n",
339 ctx->document_file, complete_uri));
340 xfree (complete_uri);
341 iri_free (iri);
342 return NULL;
343 }
344 xfree (complete_uri);
345 }
346
347 iri_free (iri);
348
349 DEBUGP (("appending %s to urlpos.\n", quote (url->url)));
350
351 newel = xnew0 (struct urlpos);
352 newel->url = url;
353 newel->pos = position;
354 newel->size = size;
355
356 /* A URL is relative if the host is not named, and the name does not
357 start with `/'. */
358 if (!link_has_scheme && *link_uri != '/')
359 newel->link_relative_p = 1;
360 else if (link_has_scheme)
361 newel->link_complete_p = 1;
362
363 /* Append the new URL maintaining the order by position. */
364 if (ctx->head == NULL)
365 ctx->head = newel;
366 else
367 {
368 struct urlpos *it, *prev = NULL;
369
370 it = ctx->head;
371 while (it && position > it->pos)
372 {
373 prev = it;
374 it = it->next;
375 }
376
377 newel->next = it;
378
379 if (prev)
380 prev->next = newel;
381 else
382 ctx->head = newel;
383 }
384
385 return newel;
386 }
387
388 static void
check_style_attr(struct taginfo * tag,struct map_context * ctx)389 check_style_attr (struct taginfo *tag, struct map_context *ctx)
390 {
391 int attrind;
392 int raw_start;
393 int raw_len;
394 char *style = find_attr (tag, "style", &attrind);
395 if (!style)
396 return;
397
398 /* raw pos and raw size include the quotes, skip them when they are
399 present. */
400 raw_start = ATTR_POS (tag, attrind, ctx);
401 raw_len = ATTR_SIZE (tag, attrind);
402 if( *(char *)(ctx->text + raw_start) == '\''
403 || *(char *)(ctx->text + raw_start) == '"')
404 {
405 raw_start += 1;
406 raw_len -= 2;
407 }
408
409 if(raw_len <= 0)
410 return;
411
412 get_urls_css (ctx, raw_start, raw_len);
413 }
414
415 /* All the tag_* functions are called from collect_tags_mapper, as
416 specified by KNOWN_TAGS. */
417
418 /* Default tag handler: collect URLs from attributes specified for
419 this tag by tag_url_attributes. */
420
421 static void
tag_find_urls(int tagid,struct taginfo * tag,struct map_context * ctx)422 tag_find_urls (int tagid, struct taginfo *tag, struct map_context *ctx)
423 {
424 size_t i;
425 int attrind;
426 int first = -1;
427
428 for (i = 0; i < countof (tag_url_attributes); i++)
429 if (tag_url_attributes[i].tagid == tagid)
430 {
431 /* We've found the index of tag_url_attributes where the
432 attributes of our tag begin. */
433 first = i;
434 break;
435 }
436 assert (first != -1);
437
438 /* Loop over the "interesting" attributes of this tag. In this
439 example, it will loop over "src" and "lowsrc".
440
441 <img src="foo.png" lowsrc="bar.png">
442
443 This has to be done in the outer loop so that the attributes are
444 processed in the same order in which they appear in the page.
445 This is required when converting links. */
446
447 for (attrind = 0; attrind < tag->nattrs; attrind++)
448 {
449 /* Find whether TAG/ATTRIND is a combination that contains a
450 URL. */
451 char *link = tag->attrs[attrind].value;
452 const size_t size = countof (tag_url_attributes);
453
454 /* If you're cringing at the inefficiency of the nested loops,
455 remember that they both iterate over a very small number of
456 items. The worst-case inner loop is for the IMG tag, which
457 has three attributes. */
458 for (i = first; i < size && tag_url_attributes[i].tagid == tagid; i++)
459 {
460 if (0 == strcasecmp (tag->attrs[attrind].name,
461 tag_url_attributes[i].attr_name))
462 {
463 struct urlpos *up = append_url (link, ATTR_POS(tag,attrind,ctx),
464 ATTR_SIZE(tag,attrind), ctx);
465 if (up)
466 {
467 int flags = tag_url_attributes[i].flags;
468 if (flags & ATTR_INLINE)
469 up->link_inline_p = 1;
470 if (flags & ATTR_HTML)
471 up->link_expect_html = 1;
472 }
473 }
474 }
475 }
476 }
477
478 /* Handle the BASE tag, for <base href=...>. */
479
480 static void
tag_handle_base(int tagid _GL_UNUSED,struct taginfo * tag,struct map_context * ctx)481 tag_handle_base (int tagid _GL_UNUSED, struct taginfo *tag, struct map_context *ctx)
482 {
483 struct urlpos *base_urlpos;
484 int attrind;
485 char *newbase = find_attr (tag, "href", &attrind);
486 if (!newbase)
487 return;
488
489 base_urlpos = append_url (newbase, ATTR_POS(tag,attrind,ctx),
490 ATTR_SIZE(tag,attrind), ctx);
491 if (!base_urlpos)
492 return;
493 base_urlpos->ignore_when_downloading = 1;
494 base_urlpos->link_base_p = 1;
495
496 xfree (ctx->base);
497 if (ctx->parent_base)
498 ctx->base = uri_merge (ctx->parent_base, newbase);
499 else
500 ctx->base = xstrdup (newbase);
501 }
502
503 /* Mark the URL found in <form action=...> for conversion. */
504
505 static void
tag_handle_form(int tagid _GL_UNUSED,struct taginfo * tag,struct map_context * ctx)506 tag_handle_form (int tagid _GL_UNUSED, struct taginfo *tag, struct map_context *ctx)
507 {
508 int attrind;
509 char *action = find_attr (tag, "action", &attrind);
510
511 if (action)
512 {
513 struct urlpos *up = append_url (action, ATTR_POS(tag,attrind,ctx),
514 ATTR_SIZE(tag,attrind), ctx);
515 if (up)
516 up->ignore_when_downloading = 1;
517 }
518 }
519
520 /* Handle the LINK tag. It requires special handling because how its
521 links will be followed in -p mode depends on the REL attribute. */
522
523 static void
tag_handle_link(int tagid _GL_UNUSED,struct taginfo * tag,struct map_context * ctx)524 tag_handle_link (int tagid _GL_UNUSED, struct taginfo *tag, struct map_context *ctx)
525 {
526 int attrind;
527 char *href = find_attr (tag, "href", &attrind);
528
529 /* All <link href="..."> link references are external, except those
530 known not to be, such as style sheet and shortcut icon:
531
532 <link rel="stylesheet" href="..."> or <link rel="alternate stylesheet" href="...">
533 <link rel="shortcut icon" href="..."> or <link rel="icon" href="...">
534 */
535 if (href)
536 {
537 struct urlpos *up = append_url (href, ATTR_POS(tag,attrind,ctx),
538 ATTR_SIZE(tag,attrind), ctx);
539 if (up)
540 {
541 char *rel = find_attr (tag, "rel", NULL);
542 if (rel)
543 {
544 if (0 == c_strcasecmp (rel, "stylesheet") || 0 == c_strcasecmp (rel, "alternate stylesheet"))
545 {
546 up->link_inline_p = 1;
547 up->link_expect_css = 1;
548 }
549 else if (0 == c_strcasecmp (rel, "shortcut icon") || 0 == c_strcasecmp (rel, "icon"))
550 {
551 up->link_inline_p = 1;
552 }
553 else if (0 == c_strcasecmp (rel, "manifest"))
554 {
555 up->link_inline_p = 1;
556 }
557 else
558 {
559 /* The external ones usually point to HTML pages, such as
560 <link rel="next" href="...">
561 except when the type attribute says otherwise:
562 <link rel="alternate" type="application/rss+xml" href=".../?feed=rss2" />
563 */
564 char *type = find_attr (tag, "type", NULL);
565 if (!type || c_strcasecmp (type, "text/html") == 0)
566 up->link_expect_html = 1;
567 }
568 }
569 }
570 }
571 }
572
573 /* Handle the META tag. This requires special handling because of the
574 refresh feature and because of robot exclusion. */
575
576 static void
tag_handle_meta(int tagid _GL_UNUSED,struct taginfo * tag,struct map_context * ctx)577 tag_handle_meta (int tagid _GL_UNUSED, struct taginfo *tag, struct map_context *ctx)
578 {
579 char *name = find_attr (tag, "name", NULL);
580 char *http_equiv = find_attr (tag, "http-equiv", NULL);
581
582 if (http_equiv && 0 == c_strcasecmp (http_equiv, "refresh"))
583 {
584 /* Some pages use a META tag to specify that the page be
585 refreshed by a new page after a given number of seconds. The
586 general format for this is:
587
588 <meta http-equiv=Refresh content="NUMBER; URL=index2.html">
589
590 So we just need to skip past the "NUMBER; URL=" garbage to
591 get to the URL. */
592
593 struct urlpos *entry;
594 int attrind;
595 int timeout;
596 char *p;
597
598 char *refresh = find_attr (tag, "content", &attrind);
599 if (!refresh)
600 return;
601
602 timeout = strtol(refresh, &p, 10);
603
604 if (timeout < 0 || *p++ != ';')
605 return;
606
607 while (c_isspace (*p))
608 ++p;
609 if (!( c_toupper (*p) == 'U'
610 && c_toupper (*(p + 1)) == 'R'
611 && c_toupper (*(p + 2)) == 'L'
612 && *(p + 3) == '='))
613 return;
614 p += 4;
615 while (c_isspace (*p))
616 ++p;
617
618 entry = append_url (p, ATTR_POS(tag,attrind,ctx),
619 ATTR_SIZE(tag,attrind), ctx);
620 if (entry)
621 {
622 entry->link_refresh_p = 1;
623 entry->refresh_timeout = timeout;
624 entry->link_expect_html = 1;
625 }
626 }
627 else if (http_equiv && 0 == c_strcasecmp (http_equiv, "content-type"))
628 {
629 /* Handle stuff like:
630 <meta http-equiv="Content-Type" content="text/html; charset=CHARSET"> */
631
632 char *mcharset;
633 char *content = find_attr (tag, "content", NULL);
634 if (!content)
635 return;
636
637 mcharset = parse_charset (content);
638 if (!mcharset)
639 return;
640
641 xfree (meta_charset);
642 meta_charset = mcharset;
643 }
644 else if (name && 0 == c_strcasecmp (name, "robots"))
645 {
646 /* Handle stuff like:
647 <meta name="robots" content="index,nofollow"> */
648 char *content = find_attr (tag, "content", NULL);
649 if (!content)
650 return;
651 if (!c_strcasecmp (content, "none"))
652 ctx->nofollow = true;
653 else
654 {
655 while (*content)
656 {
657 char *end;
658 /* Skip any initial whitespace. */
659 content += strspn (content, " \f\n\r\t\v");
660 /* Find the next occurrence of ',' or whitespace,
661 * or the end of the string. */
662 end = content + strcspn (content, ", \f\n\r\t\v");
663 if (!c_strncasecmp (content, "nofollow", end - content))
664 ctx->nofollow = true;
665 /* Skip past the next comma, if any. */
666 if (*end == ',')
667 ++end;
668 else
669 {
670 end = strchr (end, ',');
671 if (end)
672 ++end;
673 else
674 end = content + strlen (content);
675 }
676 content = end;
677 }
678 }
679 }
680 }
681
682 /* Handle the IMG tag. This requires special handling for the srcset attr,
683 while the traditional src/lowsrc/href attributes can be handled generically.
684 */
685
686 static void
tag_handle_img(int tagid,struct taginfo * tag,struct map_context * ctx)687 tag_handle_img (int tagid, struct taginfo *tag, struct map_context *ctx) {
688 int attrind;
689 char *srcset;
690
691 /* Use the generic approach for the attributes without special syntax. */
692 tag_find_urls(tagid, tag, ctx);
693
694 srcset = find_attr (tag, "srcset", &attrind);
695 if (srcset)
696 {
697 /* These are relative to the input text. */
698 int base_ind = ATTR_POS (tag,attrind,ctx);
699 int size = strlen (srcset);
700
701 /* These are relative to srcset. */
702 int offset, url_start, url_end;
703
704 /* Make sure to line up base_ind with srcset[0], not outside quotes. */
705 if (ctx->text[base_ind] == '"' || ctx->text[base_ind] == '\'')
706 ++base_ind;
707
708 offset = 0;
709 while (offset < size)
710 {
711 bool has_descriptor = true;
712
713 /* Skip over initial whitespace and commas. Note there is no \v
714 in HTML5 whitespace. */
715 url_start = offset + strspn (srcset + offset, " \f\n\r\t,");
716
717 if (url_start == size)
718 return;
719
720 /* URL is any non-whitespace chars (including commas) - but with
721 trailing commas removed. */
722 url_end = url_start + strcspn (srcset + url_start, " \f\n\r\t");
723 while ((url_end - 1) > url_start && srcset[url_end - 1] == ',')
724 {
725 has_descriptor = false;
726 --url_end;
727 }
728
729 if (url_end > url_start)
730 {
731 char *url_text = strdupdelim (srcset + url_start,
732 srcset + url_end);
733 struct urlpos *up = append_url (url_text, base_ind + url_start,
734 url_end - url_start, ctx);
735 if (up)
736 {
737 up->link_inline_p = 1;
738 up->link_noquote_html_p = 1;
739 }
740 xfree (url_text);
741 }
742
743 /* If the URL wasn't terminated by a , there may also be a descriptor
744 which we just skip. */
745 if (has_descriptor)
746 {
747 /* This is comma-terminated, except there may be one level of
748 parentheses escaping that. */
749 bool in_paren = false;
750 for (offset = url_end; offset < size; ++offset)
751 {
752 char c = srcset[offset];
753 if (c == '(')
754 in_paren = true;
755 else if (c == ')' && in_paren)
756 in_paren = false;
757 else if (c == ',' && !in_paren)
758 break;
759 }
760 }
761 else
762 offset = url_end;
763 }
764 }
765 }
766
767 /* Dispatch the tag handler appropriate for the tag we're mapping
768 over. See known_tags[] for definition of tag handlers. */
769
770 static void
collect_tags_mapper(struct taginfo * tag,void * arg)771 collect_tags_mapper (struct taginfo *tag, void *arg)
772 {
773 struct map_context *ctx = (struct map_context *)arg;
774
775 /* Find the tag in our table of tags. This must not fail because
776 map_html_tags only returns tags found in interesting_tags.
777
778 I've changed this for now, I'm passing NULL as interesting_tags
779 to map_html_tags. This way we can check all tags for a style
780 attribute.
781 */
782 struct known_tag *t = hash_table_get (interesting_tags, tag->name);
783
784 if (t != NULL)
785 t->handler (t->tagid, tag, ctx);
786
787 check_style_attr (tag, ctx);
788
789 if (tag->end_tag_p && (0 == c_strcasecmp (tag->name, "style"))
790 && tag->contents_begin && tag->contents_end
791 && tag->contents_begin <= tag->contents_end)
792 {
793 /* parse contents */
794 get_urls_css (ctx, tag->contents_begin - ctx->text,
795 tag->contents_end - tag->contents_begin);
796 }
797 }
798
799 /* Analyze HTML tags FILE and construct a list of URLs referenced from
800 it. It merges relative links in FILE with URL. It is aware of
801 <base href=...> and does the right thing. */
802
803 struct urlpos *
get_urls_html_fm(const char * file,const struct file_memory * fm,const char * url,bool * meta_disallow_follow,struct iri * iri)804 get_urls_html_fm (const char *file, const struct file_memory *fm,
805 const char *url, bool *meta_disallow_follow,
806 struct iri *iri)
807 {
808 struct map_context ctx;
809 int flags;
810
811 ctx.text = fm->content;
812 ctx.head = NULL;
813 ctx.base = NULL;
814 ctx.parent_base = url ? url : opt.base_href;
815 ctx.document_file = file;
816 ctx.nofollow = false;
817
818 if (!interesting_tags)
819 init_interesting ();
820
821 /* Specify MHT_TRIM_VALUES because of buggy HTML generators that
822 generate <a href=" foo"> instead of <a href="foo"> (browsers
823 ignore spaces as well.) If you really mean space, use &32; or
824 %20. MHT_TRIM_VALUES also causes squashing of embedded newlines,
825 e.g. in <img src="foo.[newline]html">. Such newlines are also
826 ignored by IE and Mozilla and are presumably introduced by
827 writing HTML with editors that force word wrap. */
828 flags = MHT_TRIM_VALUES;
829 if (opt.strict_comments)
830 flags |= MHT_STRICT_COMMENTS;
831
832 /* the NULL here used to be interesting_tags */
833 map_html_tags (fm->content, fm->length, collect_tags_mapper, &ctx, flags,
834 NULL, interesting_attributes);
835
836 #ifdef ENABLE_IRI
837 /* Meta charset is only valid if there was no HTTP header Content-Type charset. */
838 /* This is true for HTTP 1.0 and 1.1. */
839 if (iri && !iri->content_encoding && meta_charset)
840 set_content_encoding (iri, meta_charset);
841 #endif
842 xfree (meta_charset);
843
844 DEBUGP (("nofollow in %s: %d\n", file, ctx.nofollow));
845
846 if (meta_disallow_follow)
847 *meta_disallow_follow = ctx.nofollow;
848
849 xfree (ctx.base);
850 return ctx.head;
851 }
852
853 struct urlpos *
get_urls_html(const char * file,const char * url,bool * meta_disallow_follow,struct iri * iri)854 get_urls_html (const char *file, const char *url, bool *meta_disallow_follow,
855 struct iri *iri)
856 {
857 struct urlpos *urls;
858 struct file_memory *fm;
859
860 fm = wget_read_file (file);
861 if (!fm)
862 {
863 logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
864 return NULL;
865 }
866 DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));
867
868 urls = get_urls_html_fm (file, fm, url, meta_disallow_follow, iri);
869 wget_read_file_free (fm);
870 return urls;
871 }
872
873 /* This doesn't really have anything to do with HTML, but it's similar
874 to get_urls_html, so we put it here. */
875
876 struct urlpos *
get_urls_file(const char * file)877 get_urls_file (const char *file)
878 {
879 struct file_memory *fm;
880 struct urlpos *head, *tail;
881 const char *text, *text_end;
882
883 /* Load the file. */
884 fm = wget_read_file (file);
885 if (!fm)
886 {
887 logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
888 return NULL;
889 }
890 DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));
891
892 head = tail = NULL;
893 text = fm->content;
894 text_end = fm->content + fm->length;
895 while (text < text_end)
896 {
897 int up_error_code;
898 char *url_text;
899 char *new_url;
900 struct urlpos *entry;
901 struct url *url;
902
903 const char *line_beg = text;
904 const char *line_end = memchr (text, '\n', text_end - text);
905 if (!line_end)
906 line_end = text_end;
907 else
908 ++line_end;
909 text = line_end;
910
911 /* Strip whitespace from the beginning and end of line. */
912 while (line_beg < line_end && c_isspace (*line_beg))
913 ++line_beg;
914 while (line_end > line_beg && c_isspace (*(line_end - 1)))
915 --line_end;
916
917 if (line_beg == line_end)
918 continue;
919
920 /* The URL is in the [line_beg, line_end) region. */
921
922 /* We must copy the URL to a zero-terminated string, and we
923 can't use alloca because we're in a loop. *sigh*. */
924 url_text = strdupdelim (line_beg, line_end);
925
926 if (opt.base_href)
927 {
928 /* Merge opt.base_href with URL. */
929 char *merged = uri_merge (opt.base_href, url_text);
930 xfree (url_text);
931 url_text = merged;
932 }
933
934 new_url = rewrite_shorthand_url (url_text);
935 if (new_url)
936 {
937 xfree (url_text);
938 url_text = new_url;
939 }
940
941 url = url_parse (url_text, &up_error_code, NULL, false);
942 if (!url)
943 {
944 char *error = url_error (url_text, up_error_code);
945 logprintf (LOG_NOTQUIET, _("%s: Invalid URL %s: %s\n"),
946 file, url_text, error);
947 xfree (url_text);
948 xfree (error);
949 inform_exit_status (URLERROR);
950 continue;
951 }
952 xfree (url_text);
953
954 entry = xnew0 (struct urlpos);
955 entry->url = url;
956
957 if (!head)
958 head = entry;
959 else
960 tail->next = entry;
961 tail = entry;
962 }
963 wget_read_file_free (fm);
964 return head;
965 }
966
#if defined DEBUG_MALLOC || defined TESTING
/* Release the hash tables built by init_interesting.  The keys and
   values stored in them are not allocated by this file, so only the
   tables themselves are destroyed.

   The pointers are reset to NULL afterwards: get_urls_html_fm
   rebuilds the tables only when interesting_tags is NULL, so leaving
   a stale pointer behind would make a later call use freed
   memory.  */
void
cleanup_html_url (void)
{
  if (interesting_tags)
    {
      hash_table_destroy (interesting_tags);
      interesting_tags = NULL;
    }
  if (interesting_attributes)
    {
      hash_table_destroy (interesting_attributes);
      interesting_attributes = NULL;
    }
}
#endif
979