1 /*-
2 * Copyright 2021 Vsevolod Stakhov
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 #include "config.h"
17 #include "util.h"
18 #include "message.h"
19 #include "html.h"
20 #include "html_tags.h"
21 #include "html_block.hxx"
22 #include "html.hxx"
23 #include "libserver/css/css_value.hxx"
24 #include "libserver/css/css.hxx"
25
26 #include "url.h"
27 #include "contrib/libucl/khash.h"
28 #include "libmime/images.h"
29 #include "libutil/cxx/utf8_util.h"
30
31 #include "html_tag_defs.hxx"
32 #include "html_entities.hxx"
33 #include "html_tag.hxx"
34 #include "html_url.hxx"
35
36 #include <frozen/unordered_map.h>
37 #include <frozen/string.h>
38 #include <fmt/core.h>
39
40 #include <unicode/uversion.h>
41
42 namespace rspamd::html {
43
44 static const guint max_tags = 8192; /* Ignore tags if this maximum is reached */
45
46 static const html_tags_storage html_tags_defs;
47
48 auto html_components_map = frozen::make_unordered_map<frozen::string, html_component_type>(
49 {
50 {"name", html_component_type::RSPAMD_HTML_COMPONENT_NAME},
51 {"href", html_component_type::RSPAMD_HTML_COMPONENT_HREF},
52 {"src", html_component_type::RSPAMD_HTML_COMPONENT_HREF},
53 {"action", html_component_type::RSPAMD_HTML_COMPONENT_HREF},
54 {"color", html_component_type::RSPAMD_HTML_COMPONENT_COLOR},
55 {"bgcolor", html_component_type::RSPAMD_HTML_COMPONENT_BGCOLOR},
56 {"style", html_component_type::RSPAMD_HTML_COMPONENT_STYLE},
57 {"class", html_component_type::RSPAMD_HTML_COMPONENT_CLASS},
58 {"width", html_component_type::RSPAMD_HTML_COMPONENT_WIDTH},
59 {"height", html_component_type::RSPAMD_HTML_COMPONENT_HEIGHT},
60 {"size", html_component_type::RSPAMD_HTML_COMPONENT_SIZE},
61 {"rel", html_component_type::RSPAMD_HTML_COMPONENT_REL},
62 {"alt", html_component_type::RSPAMD_HTML_COMPONENT_ALT},
63 {"id", html_component_type::RSPAMD_HTML_COMPONENT_ID},
64 {"hidden", html_component_type::RSPAMD_HTML_COMPONENT_HIDDEN},
65 });
66
67 #define msg_debug_html(...) rspamd_conditional_debug_fast (NULL, NULL, \
68 rspamd_html_log_id, "html", pool->tag.uid, \
69 __FUNCTION__, \
70 __VA_ARGS__)
71
INIT_LOG_MODULE(html)72 INIT_LOG_MODULE(html)
73
74 /*
75 * This function is expected to be called on a closing tag to fill up all tags
76 * and return the current parent (meaning unclosed) tag
77 */
78 static auto
79 html_check_balance(struct html_content *hc,
80 struct html_tag *tag,
81 goffset tag_start_offset,
82 goffset tag_end_offset) -> html_tag *
83 {
84 /* As agreed, the closing tag has the last opening at the parent ptr */
85 auto *opening_tag = tag->parent;
86
87 auto calculate_content_length = [tag_start_offset, tag_end_offset](html_tag *t) {
88 auto opening_content_offset = t->content_offset;
89
90 if (t->flags & (CM_EMPTY)) {
91 /* Attach closing tag just at the opening tag */
92 t->closing.start = t->tag_start;
93 t->closing.end = t->content_offset;
94 }
95 else {
96
97 if (opening_content_offset <= tag_start_offset) {
98 t->closing.start = tag_start_offset;
99 t->closing.end = tag_end_offset;
100 }
101 else {
102
103 t->closing.start = t->content_offset;
104 t->closing.end = tag_end_offset;
105 }
106 }
107 };
108
109 auto balance_tag = [&]() -> html_tag * {
110 auto it = tag->parent;
111 auto found_pair = false;
112
113 for (; it != nullptr; it = it->parent) {
114 if (it->id == tag->id && !(it->flags & FL_CLOSED)) {
115 found_pair = true;
116 break;
117 }
118
119 }
120
121 /*
122 * If we have found a closing pair, then we need to close all tags and
123 * return the top-most tag
124 */
125 if (found_pair) {
126 for (it = tag->parent; it != nullptr; it = it->parent) {
127 it->flags |= FL_CLOSED;
128 /* Insert a virtual closing tag for all tags that are not closed */
129 calculate_content_length(it);
130 if (it->id == tag->id && !(it->flags & FL_CLOSED)) {
131 break;
132 }
133 }
134
135 return it;
136 }
137 else {
138 /*
139 * We have not found a pair, so this closing tag is bogus and should
140 * be ignored completely.
141 * Unfortunately, it also means that we need to insert another tag,
142 * as the current closing tag is unusable for that purposes.
143 *
144 * We assume that callee will recognise that and reconstruct the
145 * tag at the tag_end_closing state, so we return nullptr...
146 */
147
148 }
149
150 /* Tag must be ignored and reconstructed */
151 return nullptr;
152 };
153
154 if (opening_tag) {
155
156 if (opening_tag->id == tag->id) {
157 opening_tag->flags |= FL_CLOSED;
158
159 calculate_content_length(opening_tag);
160 /* All good */
161 return opening_tag->parent;
162 }
163 else {
164 return balance_tag();
165 }
166 }
167 else {
168 /*
169 * We have no opening tag
170 * There are two possibilities:
171 *
172 * 1) We have some block tag in hc->all_tags;
173 * 2) We have no tags
174 */
175
176 if (hc->all_tags.empty()) {
177 hc->all_tags.push_back(std::make_unique<html_tag>());
178 auto *vtag = hc->all_tags.back().get();
179 vtag->id = Tag_HTML;
180 vtag->flags = FL_VIRTUAL;
181 vtag->tag_start = 0;
182 vtag->content_offset = 0;
183 calculate_content_length(vtag);
184
185 if (!hc->root_tag) {
186 hc->root_tag = vtag;
187 }
188 else {
189 vtag->parent = hc->root_tag;
190 }
191
192 tag->parent = vtag;
193
194 /* Recursively call with a virtual <html> tag inserted */
195 return html_check_balance(hc, tag, tag_start_offset, tag_end_offset);
196 }
197 }
198
199 return nullptr;
200 }
201
202 auto
html_component_from_string(const std::string_view & st)203 html_component_from_string(const std::string_view &st) -> std::optional<html_component_type>
204 {
205 auto known_component_it = html_components_map.find(st);
206
207 if (known_component_it != html_components_map.end()) {
208 return known_component_it->second;
209 }
210 else {
211 return std::nullopt;
212 }
213 }
214
215 struct tag_content_parser_state {
216 int cur_state = 0;
217 std::string buf;
218 std::optional<html_component_type> cur_component;
219
resetrspamd::html::tag_content_parser_state220 void reset()
221 {
222 cur_state = 0;
223 buf.clear();
224 cur_component = std::nullopt;
225 }
226 };
227
228 static inline void
html_parse_tag_content(rspamd_mempool_t * pool,struct html_content * hc,struct html_tag * tag,const char * in,struct tag_content_parser_state & parser_env)229 html_parse_tag_content(rspamd_mempool_t *pool,
230 struct html_content *hc,
231 struct html_tag *tag,
232 const char *in,
233 struct tag_content_parser_state &parser_env)
234 {
235 enum tag_parser_state {
236 parse_start = 0,
237 parse_name,
238 parse_attr_name,
239 parse_equal,
240 parse_start_dquote,
241 parse_dqvalue,
242 parse_end_dquote,
243 parse_start_squote,
244 parse_sqvalue,
245 parse_end_squote,
246 parse_value,
247 spaces_before_eq,
248 spaces_after_eq,
249 spaces_after_param,
250 ignore_bad_tag,
251 tag_end,
252 slash_after_value,
253 slash_in_unqouted_value,
254 } state;
255
256 state = static_cast<enum tag_parser_state>(parser_env.cur_state);
257
258 /*
259 * Stores tag component if it doesn't exist, performing copy of the
260 * value + decoding of the entities
261 * Parser env is set to clear the current html attribute fields (saved_p and
262 * cur_component)
263 */
264 auto store_component_value = [&]() -> void {
265 if (parser_env.cur_component) {
266
267 if (parser_env.buf.empty()) {
268 tag->components.emplace_back(parser_env.cur_component.value(),
269 std::string_view{});
270 }
271 else {
272 /* We need to copy buf to a persistent storage */
273 auto *s = rspamd_mempool_alloc_buffer(pool, parser_env.buf.size());
274
275 if (parser_env.cur_component.value() == html_component_type::RSPAMD_HTML_COMPONENT_ID ||
276 parser_env.cur_component.value() == html_component_type::RSPAMD_HTML_COMPONENT_CLASS) {
277 /* Lowercase */
278 rspamd_str_copy_lc(parser_env.buf.data(), s, parser_env.buf.size());
279 }
280 else {
281 memcpy(s, parser_env.buf.data(), parser_env.buf.size());
282 }
283
284 auto sz = rspamd_html_decode_entitles_inplace(s, parser_env.buf.size());
285 tag->components.emplace_back(parser_env.cur_component.value(),
286 std::string_view{s, sz});
287 }
288 }
289
290 parser_env.buf.clear();
291 parser_env.cur_component = std::nullopt;
292 };
293
294 auto store_component_name = [&]() -> bool {
295 decode_html_entitles_inplace(parser_env.buf);
296 auto known_component_it = html_components_map.find(std::string_view{parser_env.buf});
297 parser_env.buf.clear();
298
299 if (known_component_it != html_components_map.end()) {
300 parser_env.cur_component = known_component_it->second;
301
302 return true;
303 }
304 else {
305 parser_env.cur_component = std::nullopt;
306 }
307
308 return false;
309 };
310
311 auto store_value_character = [&](bool lc) -> void {
312 auto c = lc ? g_ascii_tolower(*in) : *in;
313
314 if (c == '\0') {
315 /* Replace with u0FFD */
316 parser_env.buf.append(u8"\uFFFD");
317 }
318 else {
319 parser_env.buf.push_back(c);
320 }
321 };
322
323 switch (state) {
324 case parse_start:
325 if (!g_ascii_isalpha (*in) && !g_ascii_isspace (*in)) {
326 hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
327 state = ignore_bad_tag;
328 tag->id = N_TAGS;
329 tag->flags |= FL_BROKEN;
330 }
331 else if (g_ascii_isalpha (*in)) {
332 state = parse_name;
333 store_value_character(true);
334 }
335 break;
336
337 case parse_name:
338 if ((g_ascii_isspace (*in) || *in == '>' || *in == '/')) {
339 if (*in == '/') {
340 tag->flags |= FL_CLOSED;
341 }
342
343 if (parser_env.buf.empty()) {
344 hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
345 tag->id = N_TAGS;
346 tag->flags |= FL_BROKEN;
347 state = ignore_bad_tag;
348 }
349 else {
350 decode_html_entitles_inplace(parser_env.buf);
351 const auto *tag_def = rspamd::html::html_tags_defs.by_name(parser_env.buf);
352
353 if (tag_def == nullptr) {
354 hc->flags |= RSPAMD_HTML_FLAG_UNKNOWN_ELEMENTS;
355 /* Assign -hash to match closing tag if needed */
356 auto nhash = static_cast<std::int32_t>(std::hash<std::string>{}(parser_env.buf));
357 /* Always negative */
358 tag->id = static_cast<tag_id_t>(nhash | G_MININT32);
359 }
360 else {
361 tag->id = tag_def->id;
362 tag->flags = tag_def->flags;
363 }
364
365 parser_env.buf.clear();
366
367 state = spaces_after_param;
368 }
369 }
370 else {
371 store_value_character(true);
372 }
373 break;
374
375 case parse_attr_name:
376 if (*in == '=') {
377 if (!parser_env.buf.empty()) {
378 store_component_name();
379 }
380 state = parse_equal;
381 }
382 else if (g_ascii_isspace(*in)) {
383 store_component_name();
384 state = spaces_before_eq;
385 }
386 else if (*in == '/') {
387 store_component_name();
388 store_component_value();
389 state = slash_after_value;
390 }
391 else if (*in == '>') {
392 store_component_name();
393 store_component_value();
394 state = tag_end;
395 }
396 else {
397 if (*in == '"' || *in == '\'' || *in == '<') {
398 /* Should never be in attribute names but ignored */
399 tag->flags |= FL_BROKEN;
400 }
401
402 store_value_character(true);
403 }
404
405 break;
406
407 case spaces_before_eq:
408 if (*in == '=') {
409 state = parse_equal;
410 }
411 else if (!g_ascii_isspace (*in)) {
412 /*
413 * HTML defines that crap could still be restored and
414 * calculated somehow... So we have to follow this stupid behaviour
415 */
416 /*
417 * TODO: estimate what insane things do email clients in each case
418 */
419 if (*in == '>') {
420 /*
421 * Attribtute name followed by end of tag
422 * Should be okay (empty attribute). The rest is handled outside
423 * this automata.
424 */
425 store_component_value();
426 state = tag_end;
427 }
428 else if (*in == '"' || *in == '\'' || *in == '<') {
429 /* Attribute followed by quote... Missing '=' ? Dunno, need to test */
430 hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
431 tag->flags |= FL_BROKEN;
432 store_component_value();
433 store_value_character(true);
434 state = spaces_after_param;
435 }
436 else {
437 /* Empty attribute */
438 store_component_value();
439 store_value_character(true);
440 state = spaces_after_param;
441 }
442 }
443 break;
444
445 case spaces_after_eq:
446 if (*in == '"') {
447 state = parse_start_dquote;
448 }
449 else if (*in == '\'') {
450 state = parse_start_squote;
451 }
452 else if (!g_ascii_isspace (*in)) {
453 store_value_character(true);
454 state = parse_value;
455 }
456 break;
457
458 case parse_equal:
459 if (g_ascii_isspace (*in)) {
460 state = spaces_after_eq;
461 }
462 else if (*in == '"') {
463 state = parse_start_dquote;
464 }
465 else if (*in == '\'') {
466 state = parse_start_squote;
467 }
468 else {
469 store_value_character(true);
470 state = parse_value;
471 }
472 break;
473
474 case parse_start_dquote:
475 if (*in == '"') {
476 state = spaces_after_param;
477 }
478 else {
479 store_value_character(false);
480 state = parse_dqvalue;
481 }
482 break;
483
484 case parse_start_squote:
485 if (*in == '\'') {
486 state = spaces_after_param;
487 }
488 else {
489 store_value_character(false);
490 state = parse_sqvalue;
491 }
492 break;
493
494 case parse_dqvalue:
495 if (*in == '"') {
496 store_component_value();
497 state = parse_end_dquote;
498 }
499 else {
500 store_value_character(false);
501 }
502 break;
503
504 case parse_sqvalue:
505 if (*in == '\'') {
506 store_component_value();
507 state = parse_end_squote;
508 }
509 else {
510 store_value_character(false);
511 }
512
513 break;
514
515 case parse_value:
516 if (*in == '/') {
517 state = slash_in_unqouted_value;
518 }
519 else if (g_ascii_isspace (*in) || *in == '>' || *in == '"') {
520 store_component_value();
521 state = spaces_after_param;
522 }
523 else {
524 store_value_character(false);
525 }
526 break;
527
528 case parse_end_dquote:
529 case parse_end_squote:
530 if (g_ascii_isspace (*in)) {
531 state = spaces_after_param;
532 }
533 else if (*in == '/') {
534 store_component_value();
535 store_value_character(true);
536 state = slash_after_value;
537 }
538 else {
539 /* No space, proceed immediately to the attribute name */
540 state = parse_attr_name;
541 store_component_value();
542 store_value_character(true);
543 }
544 break;
545
546 case spaces_after_param:
547 if (!g_ascii_isspace (*in)) {
548 if (*in == '/') {
549 state = slash_after_value;
550 }
551 else if (*in == '=') {
552 /* Attributes cannot start with '=' */
553 tag->flags |= FL_BROKEN;
554 store_value_character(true);
555 state = parse_attr_name;
556 }
557 else {
558 store_value_character(true);
559 state = parse_attr_name;
560 }
561 }
562 break;
563 case slash_after_value:
564 if (*in == '>') {
565 tag->flags |= FL_CLOSED;
566 state = tag_end;
567 }
568 else if (!g_ascii_isspace(*in)) {
569 tag->flags |= FL_BROKEN;
570 state = parse_attr_name;
571 }
572 break;
573 case slash_in_unqouted_value:
574 if (*in == '>') {
575 /* That slash was in fact closing tag slash, wohoo */
576 tag->flags |= FL_CLOSED;
577 state = tag_end;
578 store_component_value();
579 }
580 else {
581 /* Welcome to the world of html, revert state and save missing / */
582 parser_env.buf.push_back('/');
583 store_value_character(false);
584 state = parse_value;
585 }
586 break;
587 case ignore_bad_tag:
588 case tag_end:
589 break;
590 }
591
592 parser_env.cur_state = state;
593 }
594
/*
 * Checks whether `st` looks like an absolute url, i.e. must not be resolved
 * against a base url.
 *
 * A string is considered absolute if it starts with a non-empty run of ASCII
 * alphanumeric characters followed by ':' and either
 *  - that run is `mailto` (compared case-insensitively, as url schemes are
 *    case-insensitive), or
 *  - the character after ':' is '/' or '\' (malformed urls are included).
 *
 * @param st href value to check
 * @return true if the value is an absolute url
 */
static inline auto
html_is_absolute_url(std::string_view st) -> bool
{
	/* ASCII-only, locale independent helpers (same ranges as g_ascii_*) */
	constexpr auto is_ascii_alnum = [](char c) -> bool {
		return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
	};
	constexpr auto ascii_lower = [](char c) -> char {
		return (c >= 'A' && c <= 'Z') ? static_cast<char>(c - 'A' + 'a') : c;
	};
	constexpr std::string_view mailto_scheme{"mailto"};

	const auto scheme_end = std::find_if(std::begin(st), std::end(st),
			[&](char c) { return !is_ascii_alnum(c); });

	/* Need a non-empty scheme that is terminated by a colon */
	if (scheme_end == std::begin(st) || scheme_end == std::end(st) || *scheme_end != ':') {
		return false;
	}

	const auto scheme = st.substr(0, std::distance(std::begin(st), scheme_end));

	if (scheme.size() == mailto_scheme.size() &&
		std::equal(scheme.begin(), scheme.end(), mailto_scheme.begin(),
				[&](char a, char b) { return ascii_lower(a) == b; })) {
		/* mailto urls have no slashes after the colon */
		return true;
	}

	const auto after_colon = std::next(scheme_end);

	/* Include even malformed urls */
	return after_colon != std::end(st) && (*after_colon == '/' || *after_colon == '\\');
}
619
620 static auto
html_process_url_tag(rspamd_mempool_t * pool,struct html_tag * tag,struct html_content * hc)621 html_process_url_tag(rspamd_mempool_t *pool,
622 struct html_tag *tag,
623 struct html_content *hc) -> std::optional<struct rspamd_url *>
624 {
625 auto found_href_maybe = tag->find_component(html_component_type::RSPAMD_HTML_COMPONENT_HREF);
626
627 if (found_href_maybe) {
628 /* Check base url */
629 auto &href_value = found_href_maybe.value();
630
631 if (hc && hc->base_url) {
632 /*
633 * Relative url cannot start from the following:
634 * schema://
635 * data:
636 * slash
637 */
638
639 if (!html_is_absolute_url(href_value)) {
640
641 if (href_value.size() >= sizeof("data:") &&
642 g_ascii_strncasecmp(href_value.data(), "data:", sizeof("data:") - 1) == 0) {
643 /* Image data url, never insert as url */
644 return std::nullopt;
645 }
646
647 /* Assume relative url */
648 auto need_slash = false;
649
650 auto orig_len = href_value.size();
651 auto len = orig_len + hc->base_url->urllen;
652
653 if (hc->base_url->datalen == 0) {
654 need_slash = true;
655 len++;
656 }
657
658 auto *buf = rspamd_mempool_alloc_buffer(pool, len + 1);
659 auto nlen = (std::size_t) rspamd_snprintf(buf, len + 1,
660 "%*s%s%*s",
661 (int) hc->base_url->urllen, hc->base_url->string,
662 need_slash ? "/" : "",
663 (gint) orig_len, href_value.data());
664 href_value = {buf, nlen};
665 }
666 else if (href_value.size() > 2 && href_value[0] == '/' && href_value[1] != '/') {
667 /* Relative to the hostname */
668 auto orig_len = href_value.size();
669 auto len = orig_len + hc->base_url->hostlen + hc->base_url->protocollen +
670 3 /* for :// */;
671 auto *buf = rspamd_mempool_alloc_buffer(pool, len + 1);
672 auto nlen = (std::size_t) rspamd_snprintf(buf, len + 1, "%*s://%*s/%*s",
673 (int) hc->base_url->protocollen, hc->base_url->string,
674 (int) hc->base_url->hostlen, rspamd_url_host_unsafe (hc->base_url),
675 (gint) orig_len, href_value.data());
676 href_value = {buf, nlen};
677 }
678 }
679
680 auto url = html_process_url(pool, href_value);
681
682 if (url && std::holds_alternative<std::monostate>(tag->extra)) {
683 tag->extra = url.value();
684 }
685
686 return url;
687 }
688
689 return std::nullopt;
690 }
691
692 struct rspamd_html_url_query_cbd {
693 rspamd_mempool_t *pool;
694 khash_t (rspamd_url_hash) *url_set;
695 struct rspamd_url *url;
696 GPtrArray *part_urls;
697 };
698
699 static gboolean
html_url_query_callback(struct rspamd_url * url,gsize start_offset,gsize end_offset,gpointer ud)700 html_url_query_callback(struct rspamd_url *url, gsize start_offset,
701 gsize end_offset, gpointer ud)
702 {
703 struct rspamd_html_url_query_cbd *cbd =
704 (struct rspamd_html_url_query_cbd *) ud;
705 rspamd_mempool_t *pool;
706
707 pool = cbd->pool;
708
709 if (url->protocol == PROTOCOL_MAILTO) {
710 if (url->userlen == 0) {
711 return FALSE;
712 }
713 }
714
715 msg_debug_html ("found url %s in query of url"
716 " %*s", url->string,
717 cbd->url->querylen, rspamd_url_query_unsafe(cbd->url));
718
719 url->flags |= RSPAMD_URL_FLAG_QUERY;
720
721 if (rspamd_url_set_add_or_increase(cbd->url_set, url, false)
722 && cbd->part_urls) {
723 g_ptr_array_add(cbd->part_urls, url);
724 }
725
726 return TRUE;
727 }
728
729 static void
html_process_query_url(rspamd_mempool_t * pool,struct rspamd_url * url,khash_t (rspamd_url_hash)* url_set,GPtrArray * part_urls)730 html_process_query_url(rspamd_mempool_t *pool, struct rspamd_url *url,
731 khash_t (rspamd_url_hash) *url_set,
732 GPtrArray *part_urls)
733 {
734 if (url->querylen > 0) {
735 struct rspamd_html_url_query_cbd qcbd;
736
737 qcbd.pool = pool;
738 qcbd.url_set = url_set;
739 qcbd.url = url;
740 qcbd.part_urls = part_urls;
741
742 rspamd_url_find_multiple(pool,
743 rspamd_url_query_unsafe (url), url->querylen,
744 RSPAMD_URL_FIND_ALL, NULL,
745 html_url_query_callback, &qcbd);
746 }
747
748 if (part_urls) {
749 g_ptr_array_add(part_urls, url);
750 }
751 }
752
753 static auto
html_process_data_image(rspamd_mempool_t * pool,struct html_image * img,std::string_view input)754 html_process_data_image(rspamd_mempool_t *pool,
755 struct html_image *img,
756 std::string_view input) -> void
757 {
758 /*
759 * Here, we do very basic processing of the data:
760 * detect if we have something like: `data:image/xxx;base64,yyyzzz==`
761 * We only parse base64 encoded data.
762 * We ignore content type so far
763 */
764 struct rspamd_image *parsed_image;
765 const gchar *semicolon_pos = input.data(),
766 *end = input.data() + input.size();
767
768 if ((semicolon_pos = (const gchar *) memchr(semicolon_pos, ';', end - semicolon_pos)) != NULL) {
769 if (end - semicolon_pos > sizeof("base64,")) {
770 if (memcmp(semicolon_pos + 1, "base64,", sizeof("base64,") - 1) == 0) {
771 const gchar *data_pos = semicolon_pos + sizeof("base64,");
772 gchar *decoded;
773 gsize encoded_len = end - data_pos, decoded_len;
774 rspamd_ftok_t inp;
775
776 decoded_len = (encoded_len / 4 * 3) + 12;
777 decoded = rspamd_mempool_alloc_buffer(pool, decoded_len);
778 rspamd_cryptobox_base64_decode(data_pos, encoded_len,
779 reinterpret_cast<guchar *>(decoded), &decoded_len);
780 inp.begin = decoded;
781 inp.len = decoded_len;
782
783 parsed_image = rspamd_maybe_process_image(pool, &inp);
784
785 if (parsed_image) {
786 msg_debug_html ("detected %s image of size %ud x %ud in data url",
787 rspamd_image_type_str(parsed_image->type),
788 parsed_image->width, parsed_image->height);
789 img->embedded_image = parsed_image;
790 }
791 }
792 }
793 else {
794 /* Nothing useful */
795 return;
796 }
797 }
798 }
799
800 static void
html_process_img_tag(rspamd_mempool_t * pool,struct html_tag * tag,struct html_content * hc,khash_t (rspamd_url_hash)* url_set,GPtrArray * part_urls)801 html_process_img_tag(rspamd_mempool_t *pool,
802 struct html_tag *tag,
803 struct html_content *hc,
804 khash_t (rspamd_url_hash) *url_set,
805 GPtrArray *part_urls)
806 {
807 struct html_image *img;
808
809 img = rspamd_mempool_alloc0_type (pool, struct html_image);
810 img->tag = tag;
811
812 for (const auto ¶m : tag->components) {
813
814 if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_HREF) {
815 /* Check base url */
816 const auto &href_value = param.value;
817
818 if (href_value.size() > 0) {
819 rspamd_ftok_t fstr;
820 fstr.begin = href_value.data();
821 fstr.len = href_value.size();
822 img->src = rspamd_mempool_ftokdup (pool, &fstr);
823
824 if (href_value.size() > sizeof("cid:") - 1 && memcmp(href_value.data(),
825 "cid:", sizeof("cid:") - 1) == 0) {
826 /* We have an embedded image */
827 img->src += sizeof("cid:") - 1;
828 img->flags |= RSPAMD_HTML_FLAG_IMAGE_EMBEDDED;
829 }
830 else {
831 if (href_value.size() > sizeof("data:") - 1 && memcmp(href_value.data(),
832 "data:", sizeof("data:") - 1) == 0) {
833 /* We have an embedded image in HTML tag */
834 img->flags |=
835 (RSPAMD_HTML_FLAG_IMAGE_EMBEDDED | RSPAMD_HTML_FLAG_IMAGE_DATA);
836 html_process_data_image(pool, img, href_value);
837 hc->flags |= RSPAMD_HTML_FLAG_HAS_DATA_URLS;
838 }
839 else {
840 img->flags |= RSPAMD_HTML_FLAG_IMAGE_EXTERNAL;
841 if (img->src) {
842
843 std::string_view cpy{href_value};
844 auto maybe_url = html_process_url(pool, cpy);
845
846 if (maybe_url) {
847 img->url = maybe_url.value();
848 struct rspamd_url *existing;
849
850 img->url->flags |= RSPAMD_URL_FLAG_IMAGE;
851 existing = rspamd_url_set_add_or_return(url_set,
852 img->url);
853
854 if (existing && existing != img->url) {
855 /*
856 * We have some other URL that could be
857 * found, e.g. from another part. However,
858 * we still want to set an image flag on it
859 */
860 existing->flags |= img->url->flags;
861 existing->count++;
862 }
863 else if (part_urls) {
864 /* New url */
865 g_ptr_array_add(part_urls, img->url);
866 }
867 }
868 }
869 }
870 }
871 }
872 }
873
874
875 if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_HEIGHT) {
876 unsigned long val;
877
878 rspamd_strtoul(param.value.data(), param.value.size(), &val);
879 img->height = val;
880 }
881
882 if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_WIDTH) {
883 unsigned long val;
884
885 rspamd_strtoul(param.value.data(), param.value.size(), &val);
886 img->width = val;
887 }
888
889 /* TODO: rework to css at some time */
890 if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_STYLE) {
891 if (img->height == 0) {
892 auto style_st = param.value;
893 auto pos = rspamd_substring_search_caseless(style_st.data(),
894 style_st.size(),
895 "height", sizeof("height") - 1);
896 if (pos != -1) {
897 auto substr = style_st.substr(pos + sizeof("height") - 1);
898
899 for (auto i = 0; i < substr.size(); i++) {
900 auto t = substr[i];
901 if (g_ascii_isdigit (t)) {
902 unsigned long val;
903 rspamd_strtoul(substr.data(),
904 substr.size(), &val);
905 img->height = val;
906 break;
907 }
908 else if (!g_ascii_isspace (t) && t != '=' && t != ':') {
909 /* Fallback */
910 break;
911 }
912 }
913 }
914 }
915 if (img->width == 0) {
916 auto style_st = param.value;
917 auto pos = rspamd_substring_search_caseless(style_st.data(),
918 style_st.size(),
919 "width", sizeof("width") - 1);
920 if (pos != -1) {
921 auto substr = style_st.substr(pos + sizeof("width") - 1);
922
923 for (auto i = 0; i < substr.size(); i++) {
924 auto t = substr[i];
925 if (g_ascii_isdigit (t)) {
926 unsigned long val;
927 rspamd_strtoul(substr.data(),
928 substr.size(), &val);
929 img->width = val;
930 break;
931 }
932 else if (!g_ascii_isspace (t) && t != '=' && t != ':') {
933 /* Fallback */
934 break;
935 }
936 }
937 }
938 }
939 }
940 }
941
942 if (img->embedded_image) {
943 if (img->height == 0) {
944 img->height = img->embedded_image->height;
945 }
946 if (img->width == 0) {
947 img->width = img->embedded_image->width;
948 }
949 }
950
951 hc->images.push_back(img);
952 tag->extra = img;
953 }
954
955 static auto
html_process_link_tag(rspamd_mempool_t * pool,struct html_tag * tag,struct html_content * hc,khash_t (rspamd_url_hash)* url_set,GPtrArray * part_urls)956 html_process_link_tag(rspamd_mempool_t *pool, struct html_tag *tag,
957 struct html_content *hc,
958 khash_t (rspamd_url_hash) *url_set,
959 GPtrArray *part_urls) -> void
960 {
961 auto found_rel_maybe = tag->find_component(html_component_type::RSPAMD_HTML_COMPONENT_REL);
962
963 if (found_rel_maybe) {
964 if (found_rel_maybe.value() == "icon") {
965 html_process_img_tag(pool, tag, hc, url_set, part_urls);
966 }
967 }
968 }
969
970 static auto
html_process_block_tag(rspamd_mempool_t * pool,struct html_tag * tag,struct html_content * hc)971 html_process_block_tag(rspamd_mempool_t *pool, struct html_tag *tag,
972 struct html_content *hc) -> void
973 {
974 std::optional<css::css_value> maybe_fgcolor, maybe_bgcolor;
975 bool hidden = false;
976
977 for (const auto ¶m : tag->components) {
978 if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_COLOR) {
979 maybe_fgcolor = css::css_value::maybe_color_from_string(param.value);
980 }
981
982 if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_BGCOLOR) {
983 maybe_bgcolor = css::css_value::maybe_color_from_string(param.value);
984 }
985
986 if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_STYLE) {
987 tag->block = rspamd::css::parse_css_declaration(pool, param.value);
988 }
989
990 if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_HIDDEN) {
991 hidden = true;
992 }
993 }
994
995 if (!tag->block) {
996 tag->block = html_block::undefined_html_block_pool(pool);
997 }
998
999 if (hidden) {
1000 tag->block->set_display(false);
1001 }
1002
1003 if (maybe_fgcolor) {
1004 tag->block->set_fgcolor(maybe_fgcolor->to_color().value());
1005 }
1006
1007 if (maybe_bgcolor) {
1008 tag->block->set_bgcolor(maybe_bgcolor->to_color().value());
1009 }
1010 }
1011
1012 static inline auto
html_append_parsed(struct html_content * hc,std::string_view data,bool transparent,std::size_t input_len,std::string & dest)1013 html_append_parsed(struct html_content *hc,
1014 std::string_view data,
1015 bool transparent,
1016 std::size_t input_len,
1017 std::string &dest) -> std::size_t
1018 {
1019 auto cur_offset = dest.size();
1020
1021 if (dest.size() > input_len) {
1022 /* Impossible case, refuse to append */
1023 return 0;
1024 }
1025
1026 if (data.size() > 0) {
1027 /* Handle multiple spaces at the begin */
1028
1029 if (cur_offset > 0) {
1030 auto last = dest.back();
1031 if (!g_ascii_isspace(last) && g_ascii_isspace(data.front())) {
1032 dest.append(" ");
1033 data = {data.data() + 1, data.size() - 1};
1034 cur_offset++;
1035 }
1036 }
1037
1038 if (data.find('\0') != std::string_view::npos) {
1039 auto replace_zero_func = [](const auto &input, auto &output) {
1040 const auto last = input.cend();
1041 for (auto it = input.cbegin(); it != last; ++it) {
1042 if (*it == '\0') {
1043 output.append(u8"\uFFFD");
1044 }
1045 else {
1046 output.push_back(*it);
1047 }
1048 }
1049 };
1050
1051 dest.reserve(dest.size() + data.size() + sizeof(u8"\uFFFD"));
1052 replace_zero_func(data, dest);
1053 hc->flags |= RSPAMD_HTML_FLAG_HAS_ZEROS;
1054 }
1055 else {
1056 dest.append(data);
1057 }
1058 }
1059
1060 auto nlen = decode_html_entitles_inplace(dest.data() + cur_offset,
1061 dest.size() - cur_offset, true);
1062
1063 dest.resize(nlen + cur_offset);
1064
1065 if (transparent) {
1066 /* Replace all visible characters with spaces */
1067 auto start = std::next(dest.begin(), cur_offset);
1068 std::replace_if(start, std::end(dest), [](const auto c) {
1069 return !g_ascii_isspace(c);
1070 }, ' ');
1071 }
1072
1073 return nlen;
1074 }
1075
1076 static auto
html_process_displayed_href_tag(rspamd_mempool_t * pool,struct html_content * hc,std::string_view data,const struct html_tag * cur_tag,GList ** exceptions,khash_t (rspamd_url_hash)* url_set,goffset dest_offset)1077 html_process_displayed_href_tag(rspamd_mempool_t *pool,
1078 struct html_content *hc,
1079 std::string_view data,
1080 const struct html_tag *cur_tag,
1081 GList **exceptions,
1082 khash_t (rspamd_url_hash) *url_set,
1083 goffset dest_offset) -> void
1084 {
1085
1086 if (std::holds_alternative<rspamd_url *>(cur_tag->extra)) {
1087 auto *url = std::get<rspamd_url *>(cur_tag->extra);
1088
1089 html_check_displayed_url(pool,
1090 exceptions, url_set,
1091 data,
1092 dest_offset,
1093 url);
1094 }
1095 }
1096
1097 static auto
html_append_tag_content(rspamd_mempool_t * pool,const gchar * start,gsize len,struct html_content * hc,html_tag * tag,GList ** exceptions,khash_t (rspamd_url_hash)* url_set)1098 html_append_tag_content(rspamd_mempool_t *pool,
1099 const gchar *start, gsize len,
1100 struct html_content *hc,
1101 html_tag *tag,
1102 GList **exceptions,
1103 khash_t (rspamd_url_hash) *url_set) -> goffset
1104 {
1105 auto is_visible = true, is_block = false, is_spaces = false, is_transparent = false;
1106 goffset next_tag_offset = tag->closing.end,
1107 initial_parsed_offset = hc->parsed.size(),
1108 initial_invisible_offset = hc->invisible.size();
1109
1110 auto calculate_final_tag_offsets = [&]() -> void {
1111 if (is_visible) {
1112 tag->content_offset = initial_parsed_offset;
1113 tag->closing.start = hc->parsed.size();
1114 }
1115 else {
1116 tag->content_offset = initial_invisible_offset;
1117 tag->closing.start = hc->invisible.size();
1118 }
1119 };
1120
1121 if (tag->closing.end == -1) {
1122 if (tag->closing.start != -1) {
1123 next_tag_offset = tag->closing.start;
1124 tag->closing.end = tag->closing.start;
1125 }
1126 else {
1127 next_tag_offset = tag->content_offset;
1128 tag->closing.end = tag->content_offset;
1129 }
1130 }
1131 if (tag->closing.start == -1) {
1132 tag->closing.start = tag->closing.end;
1133 }
1134
1135 auto append_margin = [&](char c) -> void {
1136 /* We do care about visible margins only */
1137 if (is_visible) {
1138 if (!hc->parsed.empty() && hc->parsed.back() != c && hc->parsed.back() != '\n') {
1139 if (hc->parsed.back() == ' ') {
1140 /* We also strip extra spaces at the end, but limiting the start */
1141 auto last = std::make_reverse_iterator(hc->parsed.begin() + initial_parsed_offset);
1142 auto first = std::find_if(hc->parsed.rbegin(), last,
1143 [](auto ch) -> auto {
1144 return ch != ' ';
1145 });
1146 hc->parsed.erase(first.base(), hc->parsed.end());
1147 g_assert(hc->parsed.size() >= initial_parsed_offset);
1148 }
1149 hc->parsed.push_back(c);
1150 }
1151 }
1152 };
1153
1154 if (tag->id == Tag_BR || tag->id == Tag_HR) {
1155
1156 if (!(tag->flags & FL_IGNORE)) {
1157 hc->parsed.append("\n");
1158 }
1159
1160 auto ret = tag->content_offset;
1161 calculate_final_tag_offsets();
1162
1163 return ret;
1164 }
1165 else if ((tag->id == Tag_HEAD && (tag->flags & FL_IGNORE)) || (tag->flags & CM_HEAD)) {
1166 auto ret = tag->closing.end;
1167 calculate_final_tag_offsets();
1168
1169 return ret;
1170 }
1171
1172 if ((tag->flags & (FL_COMMENT | FL_XML | FL_IGNORE | CM_HEAD))) {
1173 is_visible = false;
1174 }
1175 else {
1176 if (!tag->block) {
1177 is_visible = true;
1178 }
1179 else if (!tag->block->is_visible()) {
1180 if (!tag->block->is_transparent()) {
1181 is_visible = false;
1182 }
1183 else {
1184 if (tag->block->has_display() &&
1185 tag->block->display == css::css_display_value::DISPLAY_HIDDEN) {
1186 is_visible = false;
1187 }
1188 else {
1189 is_transparent = true;
1190 }
1191 }
1192 }
1193 else {
1194 if (tag->block->display == css::css_display_value::DISPLAY_BLOCK) {
1195 is_block = true;
1196 }
1197 else if (tag->block->display == css::css_display_value::DISPLAY_TABLE_ROW) {
1198 is_spaces = true;
1199 }
1200 }
1201 }
1202
1203 if (is_block) {
1204 append_margin('\n');
1205 }
1206 else if (is_spaces) {
1207 append_margin(' ');
1208 }
1209
1210 goffset cur_offset = tag->content_offset;
1211
1212 for (auto *cld : tag->children) {
1213 auto enclosed_start = cld->tag_start;
1214 goffset initial_part_len = enclosed_start - cur_offset;
1215
1216 if (initial_part_len > 0) {
1217 if (is_visible) {
1218 html_append_parsed(hc,
1219 {start + cur_offset, std::size_t(initial_part_len)},
1220 is_transparent, len, hc->parsed);
1221 }
1222 else {
1223 html_append_parsed(hc,
1224 {start + cur_offset, std::size_t(initial_part_len)},
1225 is_transparent, len, hc->invisible);
1226 }
1227 }
1228
1229 auto next_offset = html_append_tag_content(pool, start, len,
1230 hc, cld, exceptions, url_set);
1231
1232 /* Do not allow shifting back */
1233 if (next_offset > cur_offset) {
1234 cur_offset = next_offset;
1235 }
1236 }
1237
1238 if (cur_offset < tag->closing.start) {
1239 goffset final_part_len = tag->closing.start - cur_offset;
1240
1241 if (final_part_len > 0) {
1242 if (is_visible) {
1243 html_append_parsed(hc,
1244 {start + cur_offset, std::size_t(final_part_len)},
1245 is_transparent,
1246 len,
1247 hc->parsed);
1248 }
1249 else {
1250 html_append_parsed(hc,
1251 {start + cur_offset, std::size_t(final_part_len)},
1252 is_transparent,
1253 len,
1254 hc->invisible);
1255 }
1256 }
1257 }
1258 if (is_block) {
1259 append_margin('\n');
1260 }
1261 else if (is_spaces) {
1262 append_margin(' ');
1263 }
1264
1265 if (is_visible) {
1266 if (tag->id == Tag_A) {
1267 auto written_len = hc->parsed.size() - initial_parsed_offset;
1268 html_process_displayed_href_tag(pool, hc,
1269 {hc->parsed.data() + initial_parsed_offset, std::size_t(written_len)},
1270 tag, exceptions,
1271 url_set, initial_parsed_offset);
1272 }
1273 else if (tag->id == Tag_IMG) {
1274 /* Process ALT if presented */
1275 auto maybe_alt = tag->find_component(html_component_type::RSPAMD_HTML_COMPONENT_ALT);
1276
1277 if (maybe_alt) {
1278 if (!hc->parsed.empty() && !g_ascii_isspace (hc->parsed.back())) {
1279 /* Add a space */
1280 hc->parsed += ' ';
1281 }
1282
1283 hc->parsed.append(maybe_alt.value());
1284
1285 if (!hc->parsed.empty() && !g_ascii_isspace (hc->parsed.back())) {
1286 /* Add a space */
1287 hc->parsed += ' ';
1288 }
1289 }
1290 }
1291 }
1292 else {
1293 /* Invisible stuff */
1294 if (std::holds_alternative<rspamd_url *>(tag->extra)) {
1295 auto *url_enclosed = std::get<rspamd_url *>(tag->extra);
1296
1297 /*
1298 * TODO: when hash is fixed to include flags we need to remove and add
1299 * url to the hash set
1300 */
1301 if (url_enclosed) {
1302 url_enclosed->flags |= RSPAMD_URL_FLAG_INVISIBLE;
1303 }
1304 }
1305 }
1306
1307 calculate_final_tag_offsets();
1308
1309 return next_tag_offset;
1310 }
1311
1312 auto
html_process_input(rspamd_mempool_t * pool,GByteArray * in,GList ** exceptions,khash_t (rspamd_url_hash)* url_set,GPtrArray * part_urls,bool allow_css)1313 html_process_input(rspamd_mempool_t *pool,
1314 GByteArray *in,
1315 GList **exceptions,
1316 khash_t (rspamd_url_hash) *url_set,
1317 GPtrArray *part_urls,
1318 bool allow_css) -> html_content *
1319 {
1320 const gchar *p, *c, *end, *start;
1321 guchar t;
1322 auto closing = false;
1323 guint obrace = 0, ebrace = 0;
1324 struct rspamd_url *url = nullptr;
1325 gint href_offset = -1;
1326 struct html_tag *cur_tag = nullptr, *parent_tag = nullptr, cur_closing_tag;
1327 struct tag_content_parser_state content_parser_env;
1328
1329 enum {
1330 parse_start = 0,
1331 content_before_start,
1332 tag_begin,
1333 sgml_tag,
1334 xml_tag,
1335 compound_tag,
1336 comment_tag,
1337 comment_content,
1338 sgml_content,
1339 tag_content,
1340 tag_end_opening,
1341 tag_end_closing,
1342 html_text_content,
1343 xml_tag_end,
1344 tag_raw_text,
1345 tag_raw_text_less_than,
1346 tags_limit_overflow,
1347 } state = parse_start;
1348
1349 enum class html_document_state {
1350 doctype,
1351 head,
1352 body
1353 } html_document_state = html_document_state::doctype;
1354
1355 g_assert (in != NULL);
1356 g_assert (pool != NULL);
1357
1358 struct html_content *hc = new html_content;
1359 rspamd_mempool_add_destructor(pool, html_content::html_content_dtor, hc);
1360
1361 auto new_tag = [&](int flags = 0) -> struct html_tag * {
1362
1363 if (hc->all_tags.size() > rspamd::html::max_tags) {
1364 hc->flags |= RSPAMD_HTML_FLAG_TOO_MANY_TAGS;
1365
1366 return nullptr;
1367 }
1368
1369 hc->all_tags.emplace_back(std::make_unique<html_tag>());
1370 auto *ntag = hc->all_tags.back().get();
1371 ntag->tag_start = c - start;
1372 ntag->flags = flags;
1373
1374 if (cur_tag && !(cur_tag->flags & (CM_EMPTY | FL_CLOSED)) && cur_tag != &cur_closing_tag) {
1375 parent_tag = cur_tag;
1376 }
1377
1378 if (flags & FL_XML) {
1379 return ntag;
1380 }
1381
1382 return ntag;
1383 };
1384
1385 auto process_opening_tag = [&]() {
1386 if (cur_tag->id > Tag_UNKNOWN) {
1387 if (cur_tag->flags & CM_UNIQUE) {
1388 if (!hc->tags_seen[cur_tag->id]) {
1389 /* Duplicate tag has been found */
1390 hc->flags |= RSPAMD_HTML_FLAG_DUPLICATE_ELEMENTS;
1391 }
1392 }
1393 hc->tags_seen[cur_tag->id] = true;
1394 }
1395
1396 /* Shift to the first unclosed tag */
1397 auto *pt = parent_tag;
1398 while (pt && (pt->flags & FL_CLOSED)) {
1399 pt = pt->parent;
1400 }
1401
1402 if (pt) {
1403 cur_tag->parent = pt;
1404 g_assert(cur_tag->parent != cur_tag);
1405 g_assert(cur_tag->parent != &cur_closing_tag);
1406 parent_tag = pt;
1407 parent_tag->children.push_back(cur_tag);
1408 }
1409 else {
1410 if (hc->root_tag) {
1411 cur_tag->parent = hc->root_tag;
1412 g_assert(cur_tag->parent != cur_tag);
1413 hc->root_tag->children.push_back(cur_tag);
1414 parent_tag = hc->root_tag;
1415 }
1416 else {
1417 if (cur_tag->id == Tag_HTML) {
1418 hc->root_tag = cur_tag;
1419 }
1420 else {
1421 /* Insert a fake html tag */
1422 hc->all_tags.emplace_back(std::make_unique<html_tag>());
1423 auto *top_tag = hc->all_tags.back().get();
1424 top_tag->tag_start = 0;
1425 top_tag->flags = FL_VIRTUAL;
1426 top_tag->id = Tag_HTML;
1427 top_tag->content_offset = 0;
1428 top_tag->children.push_back(cur_tag);
1429 cur_tag->parent = top_tag;
1430 g_assert(cur_tag->parent != cur_tag);
1431 hc->root_tag = top_tag;
1432 parent_tag = top_tag;
1433 }
1434 }
1435 }
1436
1437 if (cur_tag->flags & FL_HREF && html_document_state == html_document_state::body) {
1438 auto maybe_url = html_process_url_tag(pool, cur_tag, hc);
1439
1440 if (maybe_url) {
1441 url = maybe_url.value();
1442
1443 if (url_set != NULL) {
1444 struct rspamd_url *maybe_existing =
1445 rspamd_url_set_add_or_return(url_set, maybe_url.value());
1446 if (maybe_existing == maybe_url.value()) {
1447 html_process_query_url(pool, url, url_set,
1448 part_urls);
1449 }
1450 else {
1451 url = maybe_existing;
1452 /* Replace extra as well */
1453 cur_tag->extra = maybe_existing;
1454 /* Increase count to avoid odd checks failure */
1455 url->count++;
1456 }
1457 }
1458 if (part_urls) {
1459 g_ptr_array_add(part_urls, url);
1460 }
1461
1462 href_offset = hc->parsed.size();
1463 }
1464 }
1465 else if (cur_tag->id == Tag_BASE) {
1466 /*
1467 * Base is allowed only within head tag but HTML is retarded
1468 */
1469 auto maybe_url = html_process_url_tag(pool, cur_tag, hc);
1470
1471 if (maybe_url) {
1472 msg_debug_html ("got valid base tag");
1473 cur_tag->extra = maybe_url.value();
1474 cur_tag->flags |= FL_HREF;
1475
1476 if (hc->base_url == nullptr) {
1477 hc->base_url = maybe_url.value();
1478 }
1479 else {
1480 msg_debug_html ("ignore redundant base tag");
1481 }
1482 }
1483 else {
1484 msg_debug_html ("got invalid base tag!");
1485 }
1486 }
1487
1488 if (cur_tag->id == Tag_IMG) {
1489 html_process_img_tag(pool, cur_tag, hc, url_set,
1490 part_urls);
1491 }
1492 else if (cur_tag->id == Tag_LINK) {
1493 html_process_link_tag(pool, cur_tag, hc, url_set,
1494 part_urls);
1495 }
1496
1497 if (!(cur_tag->flags & CM_EMPTY)) {
1498 html_process_block_tag(pool, cur_tag, hc);
1499 }
1500 else {
1501 /* Implicitly close */
1502 cur_tag->flags |= FL_CLOSED;
1503 }
1504
1505 if (cur_tag->flags & FL_CLOSED) {
1506 cur_tag->closing.end = cur_tag->content_offset;
1507 cur_tag->closing.start = cur_tag->tag_start;
1508
1509 cur_tag = parent_tag;
1510 }
1511 };
1512
1513 p = (const char *) in->data;
1514 c = p;
1515 end = p + in->len;
1516 start = c;
1517
1518 while (p < end) {
1519 t = *p;
1520
1521 switch (state) {
1522 case parse_start:
1523 if (t == '<') {
1524 state = tag_begin;
1525 }
1526 else {
1527 /* We have no starting tag, so assume that it's content */
1528 hc->flags |= RSPAMD_HTML_FLAG_BAD_START;
1529 cur_tag = new_tag();
1530 html_document_state = html_document_state::body;
1531
1532 if (cur_tag) {
1533 cur_tag->id = Tag_HTML;
1534 hc->root_tag = cur_tag;
1535 state = content_before_start;
1536 }
1537 else {
1538 state = tags_limit_overflow;
1539 }
1540 }
1541 break;
1542 case content_before_start:
1543 if (t == '<') {
1544 state = tag_begin;
1545 }
1546 else {
1547 p++;
1548 }
1549 break;
1550 case tag_begin:
1551 switch (t) {
1552 case '<':
1553 c = p;
1554 p++;
1555 closing = FALSE;
1556 break;
1557 case '!':
1558 cur_tag = new_tag(FL_XML | FL_CLOSED);
1559 if (cur_tag) {
1560 state = sgml_tag;
1561 }
1562 else {
1563 state = tags_limit_overflow;
1564 }
1565 p++;
1566 break;
1567 case '?':
1568 cur_tag = new_tag(FL_XML | FL_CLOSED);
1569 if (cur_tag) {
1570 state = xml_tag;
1571 }
1572 else {
1573 state = tags_limit_overflow;
1574 }
1575 hc->flags |= RSPAMD_HTML_FLAG_XML;
1576 p++;
1577 break;
1578 case '/':
1579 closing = TRUE;
1580 /* We fill fake closing tag to fill it with the content parser */
1581 cur_closing_tag.clear();
1582 /*
1583 * For closing tags, we need to find some corresponding opening tag.
1584 * However, at this point we have not even parsed a name, so we
1585 * can not assume anything about balancing, etc.
1586 *
1587 * So we need to ensure that:
1588 * 1) We have some opening tag in the chain cur_tag->parent...
1589 * 2) cur_tag is nullptr - okay, html is just brain damaged
1590 * 3) cur_tag must NOT be equal to cur_closing tag. It means that
1591 * we had some poor closing tag but we still need to find an opening
1592 * tag... Somewhere...
1593 */
1594
1595 if (cur_tag == &cur_closing_tag) {
1596 if (parent_tag != &cur_closing_tag) {
1597 cur_closing_tag.parent = parent_tag;
1598 }
1599 else {
1600 cur_closing_tag.parent = nullptr;
1601 }
1602 }
1603 else if (cur_tag && cur_tag->flags & FL_CLOSED) {
1604 /* Cur tag is already closed, we should find something else */
1605 auto *tmp = cur_tag;
1606 while (tmp) {
1607 tmp = tmp->parent;
1608
1609 if (tmp == nullptr || !(tmp->flags & FL_CLOSED)) {
1610 break;
1611 }
1612 }
1613
1614 cur_closing_tag.parent = tmp;
1615 }
1616 else {
1617 cur_closing_tag.parent = cur_tag;
1618 }
1619
1620 cur_tag = &cur_closing_tag;
1621 p++;
1622 break;
1623 case '>':
1624 /* Empty tag */
1625 hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
1626 state = html_text_content;
1627 continue;
1628 default:
1629 if (g_ascii_isalpha(t)) {
1630 state = tag_content;
1631 content_parser_env.reset();
1632
1633 if (!closing) {
1634 cur_tag = new_tag();
1635 }
1636
1637 if (cur_tag) {
1638 state = tag_content;
1639 }
1640 else {
1641 state = tags_limit_overflow;
1642 }
1643 }
1644 else {
1645 /* Wrong bad tag */
1646 state = html_text_content;
1647 }
1648 break;
1649 }
1650
1651 break;
1652
1653 case sgml_tag:
1654 switch (t) {
1655 case '[':
1656 state = compound_tag;
1657 obrace = 1;
1658 ebrace = 0;
1659 p++;
1660 break;
1661 case '-':
1662 cur_tag->flags |= FL_COMMENT;
1663 state = comment_tag;
1664 p++;
1665 break;
1666 default:
1667 state = sgml_content;
1668 break;
1669 }
1670
1671 break;
1672
1673 case xml_tag:
1674 if (t == '?') {
1675 state = xml_tag_end;
1676 }
1677 else if (t == '>') {
1678 /* Misformed xml tag */
1679 hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
1680 state = tag_end_opening;
1681 continue;
1682 }
1683 /* We efficiently ignore xml tags */
1684 p++;
1685 break;
1686
1687 case xml_tag_end:
1688 if (t == '>') {
1689 state = tag_end_opening;
1690 cur_tag->content_offset = p - start + 1;
1691 continue;
1692 }
1693 else {
1694 hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
1695 }
1696 p++;
1697 break;
1698
1699 case compound_tag:
1700 if (t == '[') {
1701 obrace++;
1702 }
1703 else if (t == ']') {
1704 ebrace++;
1705 }
1706 else if (t == '>' && obrace == ebrace) {
1707 state = tag_end_opening;
1708 cur_tag->content_offset = p - start + 1;
1709 continue;
1710 }
1711 p++;
1712 break;
1713
1714 case comment_tag:
1715 if (t != '-') {
1716 hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
1717 state = tag_end_opening;
1718 }
1719 else {
1720 p++;
1721 ebrace = 0;
1722 /*
1723 * https://www.w3.org/TR/2012/WD-html5-20120329/syntax.html#syntax-comments
1724 * ... the text must not start with a single
1725 * U+003E GREATER-THAN SIGN character (>),
1726 * nor start with a "-" (U+002D) character followed by
1727 * a U+003E GREATER-THAN SIGN (>) character,
1728 * nor contain two consecutive U+002D HYPHEN-MINUS
1729 * characters (--), nor end with a "-" (U+002D) character.
1730 */
1731 if (p[0] == '-' && p + 1 < end && p[1] == '>') {
1732 hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
1733 p++;
1734 state = tag_end_opening;
1735 }
1736 else if (*p == '>') {
1737 hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
1738 state = tag_end_opening;
1739 }
1740 else {
1741 state = comment_content;
1742 }
1743 }
1744 break;
1745
1746 case comment_content:
1747 if (t == '-') {
1748 ebrace++;
1749 }
1750 else if (t == '>' && ebrace >= 2) {
1751 cur_tag->content_offset = p - start + 1;
1752 state = tag_end_opening;
1753 continue;
1754 }
1755 else {
1756 ebrace = 0;
1757 }
1758
1759 p++;
1760 break;
1761
1762 case html_text_content:
1763 if (t != '<') {
1764 p++;
1765 }
1766 else {
1767 state = tag_begin;
1768 }
1769 break;
1770
1771 case tag_raw_text:
1772 if (t == '<') {
1773 c = p;
1774 state = tag_raw_text_less_than;
1775 }
1776 p ++;
1777 break;
1778 case tag_raw_text_less_than:
1779 if (t == '/') {
1780 /* Here are special things: we look for obrace and then ensure
1781 * that if there is any closing brace nearby
1782 * (we look maximum at 30 characters). We also need to ensure
1783 * that we have no special characters, such as punctuation marks and
1784 * so on.
1785 * Basically, we validate the input to be sane.
1786 * Since closing tags must not have attributes, these assumptions
1787 * seems to be reasonable enough for our toy parser.
1788 */
1789 gint cur_lookahead = 1;
1790 gint max_lookahead = MIN (end - p, 30);
1791 bool valid_closing_tag = true;
1792
1793 if (p + 1 < end && !g_ascii_isalpha (p[1])) {
1794 valid_closing_tag = false;
1795 }
1796 else {
1797 while (cur_lookahead < max_lookahead) {
1798 gchar tt = p[cur_lookahead];
1799 if (tt == '>') {
1800 break;
1801 }
1802 else if (tt < '\n' || tt == ',') {
1803 valid_closing_tag = false;
1804 break;
1805 }
1806 cur_lookahead ++;
1807 }
1808
1809 if (cur_lookahead == max_lookahead) {
1810 valid_closing_tag = false;
1811 }
1812 }
1813
1814 if (valid_closing_tag) {
1815 /* Shift back */
1816 p = c;
1817 state = tag_begin;
1818 }
1819 else {
1820 p ++;
1821 state = tag_raw_text;
1822 }
1823 }
1824 else {
1825 p ++;
1826 state = tag_raw_text;
1827 }
1828 break;
1829 case sgml_content:
1830 /* TODO: parse DOCTYPE here */
1831 if (t == '>') {
1832 cur_tag->content_offset = p - start + 1;
1833 state = tag_end_opening;
1834 }
1835 else {
1836 p++;
1837 }
1838 break;
1839
1840 case tag_content:
1841 html_parse_tag_content(pool, hc, cur_tag, p, content_parser_env);
1842
1843 if (t == '>') {
1844 if (closing) {
1845 cur_tag->closing.start = c - start;
1846 cur_tag->closing.end = p - start + 1;
1847
1848 closing = FALSE;
1849 state = tag_end_closing;
1850 }
1851 else {
1852 cur_tag->content_offset = p - start + 1;
1853 state = tag_end_opening;
1854 }
1855
1856
1857 continue;
1858 }
1859 p++;
1860 break;
1861
1862 case tag_end_opening:
1863 content_parser_env.reset();
1864 state = html_text_content;
1865
1866 if (cur_tag) {
1867 if (cur_tag->id == Tag_STYLE || cur_tag->id == Tag_NOSCRIPT || cur_tag->id == Tag_SCRIPT) {
1868 state = tag_raw_text;
1869 }
1870 if (html_document_state == html_document_state::doctype) {
1871 if (cur_tag->id == Tag_HEAD || (cur_tag->flags & CM_HEAD)) {
1872 html_document_state = html_document_state::head;
1873 cur_tag->flags |= FL_IGNORE;
1874 }
1875 else if (cur_tag->id != Tag_HTML) {
1876 html_document_state = html_document_state::body;
1877 }
1878 }
1879 else if (html_document_state == html_document_state::head) {
1880 if (!(cur_tag->flags & (CM_EMPTY | CM_HEAD))) {
1881 if (parent_tag && (parent_tag->id == Tag_HEAD || !(parent_tag->flags & CM_HEAD))) {
1882 /*
1883 * As by standard, we have to close the HEAD tag
1884 * and switch to the body state
1885 */
1886 parent_tag->flags |= FL_CLOSED;
1887 parent_tag->closing.start = cur_tag->tag_start;
1888 parent_tag->closing.end = cur_tag->content_offset;
1889
1890 html_document_state = html_document_state::body;
1891 }
1892 else if (cur_tag->id == Tag_BODY) {
1893 html_document_state = html_document_state::body;
1894 }
1895 else {
1896 /*
1897 * For propagation in something like
1898 * <title><p><a>ololo</a></p></title> - should be unprocessed
1899 */
1900 cur_tag->flags |= CM_HEAD;
1901 }
1902 }
1903 }
1904
1905 process_opening_tag();
1906 }
1907
1908 p++;
1909 c = p;
1910 break;
1911 case tag_end_closing: {
1912 if (cur_tag) {
1913
1914 if (cur_tag->flags & CM_EMPTY) {
1915 /* Ignore closing empty tags */
1916 cur_tag->flags |= FL_IGNORE;
1917 }
1918 if (html_document_state == html_document_state::doctype) {
1919
1920 }
1921 else if (html_document_state == html_document_state::head) {
1922 if (cur_tag->id == Tag_HEAD) {
1923 html_document_state = html_document_state::body;
1924 }
1925 }
1926
1927 /* cur_tag here is a closing tag */
1928 auto *next_cur_tag = html_check_balance(hc, cur_tag,
1929 c - start, p - start + 1);
1930
1931 if (cur_tag->id == Tag_STYLE && allow_css) {
1932 auto *opening_tag = cur_tag->parent;
1933
1934 if (opening_tag && opening_tag->id == Tag_STYLE &&
1935 (int)opening_tag->content_offset < opening_tag->closing.start) {
1936 auto ret_maybe = rspamd::css::parse_css(pool,
1937 {start + opening_tag->content_offset,
1938 opening_tag->closing.start - opening_tag->content_offset},
1939 std::move(hc->css_style));
1940
1941 if (!ret_maybe.has_value()) {
1942 if (ret_maybe.error().is_fatal()) {
1943 auto err_str = fmt::format(
1944 "cannot parse css (error code: {}): {}",
1945 static_cast<int>(ret_maybe.error().type),
1946 ret_maybe.error().description.value_or("unknown error"));
1947 msg_info_pool ("%*s", (int) err_str.size(), err_str.data());
1948 }
1949 }
1950 else {
1951 hc->css_style = ret_maybe.value();
1952 }
1953 }
1954 }
1955
1956 if (next_cur_tag != nullptr) {
1957 cur_tag = next_cur_tag;
1958 }
1959 else {
1960 /*
1961 * Here, we handle cases like <p>lala</b>...
1962 * So the tag </b> is bogus and unpaired
1963 * However, we need to exclude it from the output of <p> tag
1964 * To do that, we create a fake opening tag and insert that to
1965 * the current opening tag
1966 */
1967 auto *cur_opening_tag = cur_tag->parent;
1968
1969 while (cur_opening_tag && (cur_opening_tag->flags & FL_CLOSED)) {
1970 cur_opening_tag = cur_opening_tag->parent;
1971 }
1972
1973 if (!cur_opening_tag) {
1974 cur_opening_tag = hc->root_tag;
1975 }
1976
1977 auto &&vtag = std::make_unique<html_tag>();
1978 vtag->id = cur_tag->id;
1979 vtag->flags = FL_VIRTUAL | FL_CLOSED | cur_tag->flags;
1980 vtag->tag_start = cur_tag->closing.start;
1981 vtag->content_offset = p - start + 1;
1982 vtag->closing = cur_tag->closing;
1983 vtag->parent = cur_opening_tag;
1984 g_assert(vtag->parent != &cur_closing_tag);
1985 cur_opening_tag->children.push_back(vtag.get());
1986 hc->all_tags.emplace_back(std::move(vtag));
1987 cur_tag = cur_opening_tag;
1988 parent_tag = cur_tag->parent;
1989 g_assert(cur_tag->parent != &cur_closing_tag);
1990 }
1991 } /* if cur_tag != nullptr */
1992 state = html_text_content;
1993 p++;
1994 c = p;
1995 break;
1996 }
1997 case tags_limit_overflow:
1998 msg_warn_pool("tags limit of %d tags is reached at the position %d;"
1999 " ignoring the rest of the HTML content",
2000 (int) hc->all_tags.size(), (int) (p - start));
2001 c = p;
2002 p = end;
2003 break;
2004 }
2005 }
2006
2007 if (cur_tag && !(cur_tag->flags & FL_CLOSED) && cur_tag != &cur_closing_tag) {
2008 cur_closing_tag.parent = cur_tag;
2009 cur_closing_tag.id = cur_tag->id;
2010 cur_tag = &cur_closing_tag;
2011 html_check_balance(hc, cur_tag,
2012 end - start, end - start);
2013 }
2014
2015 /* Propagate styles */
2016 hc->traverse_block_tags([&hc, &pool](const html_tag *tag) -> bool {
2017
2018 if (hc->css_style) {
2019 auto *css_block = hc->css_style->check_tag_block(tag);
2020
2021 if (css_block) {
2022 if (tag->block) {
2023 tag->block->set_block(*css_block);
2024 }
2025 else {
2026 tag->block = css_block;
2027 }
2028 }
2029 }
2030 if (tag->block) {
2031 if (!tag->block->has_display()) {
2032 /* If we have no display field, we can check it by tag */
2033 if (tag->flags & CM_HEAD) {
2034 tag->block->set_display(css::css_display_value::DISPLAY_HIDDEN,
2035 html_block::set);
2036 }
2037 else if (tag->flags & (CM_BLOCK | CM_TABLE)) {
2038 tag->block->set_display(css::css_display_value::DISPLAY_BLOCK,
2039 html_block::implicit);
2040 }
2041 else if (tag->flags & CM_ROW) {
2042 tag->block->set_display(css::css_display_value::DISPLAY_TABLE_ROW,
2043 html_block::implicit);
2044 }
2045 else {
2046 tag->block->set_display(css::css_display_value::DISPLAY_INLINE,
2047 html_block::implicit);
2048 }
2049 }
2050
2051 tag->block->compute_visibility();
2052
2053 for (const auto *cld_tag : tag->children) {
2054
2055 if (cld_tag->block) {
2056 cld_tag->block->propagate_block(*tag->block);
2057 }
2058 else {
2059 cld_tag->block = rspamd_mempool_alloc0_type(pool, html_block);
2060 *cld_tag->block = *tag->block;
2061 }
2062 }
2063 }
2064 return true;
2065 }, html_content::traverse_type::PRE_ORDER);
2066
2067 /* Leftover before content */
2068 switch (state) {
2069 case tag_end_opening:
2070 if (cur_tag != nullptr) {
2071 process_opening_tag();
2072 }
2073 break;
2074 default:
2075 /* Do nothing */
2076 break;
2077 }
2078
2079 if (!hc->all_tags.empty() && hc->root_tag) {
2080 html_append_tag_content(pool, start, end - start, hc, hc->root_tag,
2081 exceptions, url_set);
2082 }
2083
2084 /* Leftover after content */
2085 switch (state) {
2086 case tag_end_opening:
2087 if (cur_tag != nullptr) {
2088 process_opening_tag();
2089 }
2090 break;
2091 case tags_limit_overflow:
2092 html_append_parsed(hc, {c, (std::size_t) (end - c)},
2093 false, end - start, hc->parsed);
2094 break;
2095 default:
2096 /* Do nothing */
2097 break;
2098 }
2099
2100 if (!hc->parsed.empty()) {
2101 /* Trim extra spaces at the at the end if needed */
2102 if (g_ascii_isspace(hc->parsed.back())) {
2103 auto last_it = std::end(hc->parsed);
2104
2105 /* Allow last newline */
2106 if (hc->parsed.back() == '\n') {
2107 --last_it;
2108 }
2109
2110 hc->parsed.erase(std::find_if(hc->parsed.rbegin(), hc->parsed.rend(),
2111 [](auto ch) -> auto {
2112 return !g_ascii_isspace(ch);
2113 }).base(),
2114 last_it);
2115 }
2116 }
2117
2118 return hc;
2119 }
2120
2121 static auto
html_find_image_by_cid(const html_content & hc,std::string_view cid)2122 html_find_image_by_cid(const html_content &hc, std::string_view cid)
2123 -> std::optional<const html_image *>
2124 {
2125 for (const auto *html_image : hc.images) {
2126 /* Filter embedded images */
2127 if (html_image->flags & RSPAMD_HTML_FLAG_IMAGE_EMBEDDED &&
2128 html_image->src != nullptr) {
2129 if (cid == html_image->src) {
2130 return html_image;
2131 }
2132 }
2133 }
2134
2135 return std::nullopt;
2136 }
2137
2138 auto
html_debug_structure(const html_content & hc)2139 html_debug_structure(const html_content &hc) -> std::string
2140 {
2141 std::string output;
2142
2143 if (hc.root_tag) {
2144 auto rec_functor = [&](const html_tag *t, int level, auto rec_functor) -> void {
2145 std::string pluses(level, '+');
2146
2147 if (!(t->flags & (FL_VIRTUAL | FL_IGNORE))) {
2148 if (t->flags & FL_XML) {
2149 output += fmt::format("{}xml;", pluses);
2150 }
2151 else {
2152 output += fmt::format("{}{};", pluses,
2153 html_tags_defs.name_by_id_safe(t->id));
2154 }
2155 level++;
2156 }
2157 for (const auto *cld : t->children) {
2158 rec_functor(cld, level, rec_functor);
2159 }
2160 };
2161
2162 rec_functor(hc.root_tag, 1, rec_functor);
2163 }
2164
2165 return output;
2166 }
2167
html_tag_by_name(const std::string_view & name)2168 auto html_tag_by_name(const std::string_view &name)
2169 -> std::optional<tag_id_t>
2170 {
2171 const auto *td = rspamd::html::html_tags_defs.by_name(name);
2172
2173 if (td != nullptr) {
2174 return td->id;
2175 }
2176
2177 return std::nullopt;
2178 }
2179
2180 auto
get_content(const struct html_content * hc) const2181 html_tag::get_content(const struct html_content *hc) const -> std::string_view
2182 {
2183 const std::string *dest = &hc->parsed;
2184
2185 if (block && !block->is_visible()) {
2186 dest = &hc->invisible;
2187 }
2188 const auto clen = get_content_length();
2189 if (content_offset < dest->size()) {
2190 if (dest->size() - content_offset >= clen) {
2191 return std::string_view{*dest}.substr(content_offset, clen);
2192 }
2193 else {
2194 return std::string_view{*dest}.substr(content_offset, dest->size() - content_offset);
2195 }
2196 }
2197
2198 return std::string_view{};
2199 }
2200
2201 }
2202
2203 void *
rspamd_html_process_part_full(rspamd_mempool_t * pool,GByteArray * in,GList ** exceptions,khash_t (rspamd_url_hash)* url_set,GPtrArray * part_urls,bool allow_css)2204 rspamd_html_process_part_full(rspamd_mempool_t *pool,
2205 GByteArray *in, GList **exceptions,
2206 khash_t (rspamd_url_hash) *url_set,
2207 GPtrArray *part_urls,
2208 bool allow_css)
2209 {
2210 return rspamd::html::html_process_input(pool, in, exceptions, url_set,
2211 part_urls, allow_css);
2212 }
2213
2214 void *
rspamd_html_process_part(rspamd_mempool_t * pool,GByteArray * in)2215 rspamd_html_process_part(rspamd_mempool_t *pool,
2216 GByteArray *in)
2217 {
2218 return rspamd_html_process_part_full (pool, in, NULL,
2219 NULL, NULL, FALSE);
2220 }
2221
2222 guint
rspamd_html_decode_entitles_inplace(gchar * s,gsize len)2223 rspamd_html_decode_entitles_inplace (gchar *s, gsize len)
2224 {
2225 return rspamd::html::decode_html_entitles_inplace(s, len);
2226 }
2227
2228 gint
rspamd_html_tag_by_name(const gchar * name)2229 rspamd_html_tag_by_name(const gchar *name)
2230 {
2231 const auto *td = rspamd::html::html_tags_defs.by_name(name);
2232
2233 if (td != nullptr) {
2234 return td->id;
2235 }
2236
2237 return -1;
2238 }
2239
2240 gboolean
rspamd_html_tag_seen(void * ptr,const gchar * tagname)2241 rspamd_html_tag_seen(void *ptr, const gchar *tagname)
2242 {
2243 gint id;
2244 auto *hc = rspamd::html::html_content::from_ptr(ptr);
2245
2246 g_assert (hc != NULL);
2247
2248 id = rspamd_html_tag_by_name(tagname);
2249
2250 if (id != -1) {
2251 return hc->tags_seen[id];
2252 }
2253
2254 return FALSE;
2255 }
2256
2257 const gchar *
rspamd_html_tag_by_id(gint id)2258 rspamd_html_tag_by_id(gint id)
2259 {
2260 const auto *td = rspamd::html::html_tags_defs.by_id(id);
2261
2262 if (td != nullptr) {
2263 return td->name.c_str();
2264 }
2265
2266 return nullptr;
2267 }
2268
2269 const gchar *
rspamd_html_tag_name(void * p,gsize * len)2270 rspamd_html_tag_name(void *p, gsize *len)
2271 {
2272 auto *tag = reinterpret_cast<rspamd::html::html_tag *>(p);
2273 auto tname = rspamd::html::html_tags_defs.name_by_id_safe(tag->id);
2274
2275 if (len) {
2276 *len = tname.size();
2277 }
2278
2279 return tname.data();
2280 }
2281
2282 struct html_image*
rspamd_html_find_embedded_image(void * html_content,const char * cid,gsize cid_len)2283 rspamd_html_find_embedded_image(void *html_content,
2284 const char *cid, gsize cid_len)
2285 {
2286 auto *hc = rspamd::html::html_content::from_ptr(html_content);
2287
2288 auto maybe_img = rspamd::html::html_find_image_by_cid(*hc, {cid, cid_len});
2289
2290 if (maybe_img) {
2291 return (html_image *)maybe_img.value();
2292 }
2293
2294 return nullptr;
2295 }
2296
2297 bool
rspamd_html_get_parsed_content(void * html_content,rspamd_ftok_t * dest)2298 rspamd_html_get_parsed_content(void *html_content, rspamd_ftok_t *dest)
2299 {
2300 auto *hc = rspamd::html::html_content::from_ptr(html_content);
2301
2302 dest->begin = hc->parsed.data();
2303 dest->len = hc->parsed.size();
2304
2305 return true;
2306 }
2307
2308 gsize
rspamd_html_get_tags_count(void * html_content)2309 rspamd_html_get_tags_count(void *html_content)
2310 {
2311 auto *hc = rspamd::html::html_content::from_ptr(html_content);
2312
2313 if (!hc) {
2314 return 0;
2315 }
2316
2317 return hc->all_tags.size();
2318 }