1 /*-
2 * Copyright 2021 Vsevolod Stakhov
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 #include "config.h"
17 #include "util.h"
18 #include "message.h"
19 #include "html.h"
20 #include "html_tags.h"
21 #include "html_block.hxx"
22 #include "html.hxx"
23 #include "libserver/css/css_value.hxx"
24 #include "libserver/css/css.hxx"
25
26 #include "url.h"
27 #include "contrib/libucl/khash.h"
28 #include "libmime/images.h"
29 #include "libutil/cxx/utf8_util.h"
30
31 #include "html_tag_defs.hxx"
32 #include "html_entities.hxx"
33 #include "html_tag.hxx"
34 #include "html_url.hxx"
35
36 #include <frozen/unordered_map.h>
37 #include <frozen/string.h>
38 #include <fmt/core.h>
39
40 #include <unicode/uversion.h>
41
42 namespace rspamd::html {
43
44 static const guint max_tags = 8192; /* Ignore tags if this maximum is reached */
45
46 static const html_tags_storage html_tags_defs;
47
48 auto html_components_map = frozen::make_unordered_map<frozen::string, html_component_type>(
49 {
50 {"name", html_component_type::RSPAMD_HTML_COMPONENT_NAME},
51 {"href", html_component_type::RSPAMD_HTML_COMPONENT_HREF},
52 {"src", html_component_type::RSPAMD_HTML_COMPONENT_HREF},
53 {"action", html_component_type::RSPAMD_HTML_COMPONENT_HREF},
54 {"color", html_component_type::RSPAMD_HTML_COMPONENT_COLOR},
55 {"bgcolor", html_component_type::RSPAMD_HTML_COMPONENT_BGCOLOR},
56 {"style", html_component_type::RSPAMD_HTML_COMPONENT_STYLE},
57 {"class", html_component_type::RSPAMD_HTML_COMPONENT_CLASS},
58 {"width", html_component_type::RSPAMD_HTML_COMPONENT_WIDTH},
59 {"height", html_component_type::RSPAMD_HTML_COMPONENT_HEIGHT},
60 {"size", html_component_type::RSPAMD_HTML_COMPONENT_SIZE},
61 {"rel", html_component_type::RSPAMD_HTML_COMPONENT_REL},
62 {"alt", html_component_type::RSPAMD_HTML_COMPONENT_ALT},
63 {"id", html_component_type::RSPAMD_HTML_COMPONENT_ID},
64 {"hidden", html_component_type::RSPAMD_HTML_COMPONENT_HIDDEN},
65 });
66
67 #define msg_debug_html(...) rspamd_conditional_debug_fast (NULL, NULL, \
68 rspamd_html_log_id, "html", pool->tag.uid, \
69 __FUNCTION__, \
70 __VA_ARGS__)
71
INIT_LOG_MODULE(html)72 INIT_LOG_MODULE(html)
73
74 /*
75 * This function is expected to be called on a closing tag to fill up all tags
76 * and return the current parent (meaning unclosed) tag
77 */
78 static auto
79 html_check_balance(struct html_content *hc,
80 struct html_tag *tag,
81 goffset tag_start_offset,
82 goffset tag_end_offset) -> html_tag *
83 {
84 /* As agreed, the closing tag has the last opening at the parent ptr */
85 auto *opening_tag = tag->parent;
86
87 auto calculate_content_length = [tag_start_offset, tag_end_offset](html_tag *t) {
88 auto opening_content_offset = t->content_offset;
89
90 if (t->flags & (CM_EMPTY)) {
91 /* Attach closing tag just at the opening tag */
92 t->closing.start = t->tag_start;
93 t->closing.end = t->content_offset;
94 }
95 else {
96
97 if (opening_content_offset <= tag_start_offset) {
98 t->closing.start = tag_start_offset;
99 t->closing.end = tag_end_offset;
100 }
101 else {
102
103 t->closing.start = t->content_offset;
104 t->closing.end = tag_end_offset;
105 }
106 }
107 };
108
109 auto balance_tag = [&]() -> html_tag * {
110 auto it = tag->parent;
111 auto found_pair = false;
112
113 for (; it != nullptr; it = it->parent) {
114 if (it->id == tag->id && !(it->flags & FL_CLOSED)) {
115 found_pair = true;
116 break;
117 }
118
119 }
120
121 /*
122 * If we have found a closing pair, then we need to close all tags and
123 * return the top-most tag
124 */
125 if (found_pair) {
126 for (it = tag->parent; it != nullptr; it = it->parent) {
127 it->flags |= FL_CLOSED;
128 /* Insert a virtual closing tag for all tags that are not closed */
129 calculate_content_length(it);
130 if (it->id == tag->id && !(it->flags & FL_CLOSED)) {
131 break;
132 }
133 }
134
135 return it;
136 }
137 else {
138 /*
139 * We have not found a pair, so this closing tag is bogus and should
140 * be ignored completely.
141 * Unfortunately, it also means that we need to insert another tag,
142 * as the current closing tag is unusable for that purposes.
143 *
144 * We assume that callee will recognise that and reconstruct the
145 * tag at the tag_end_closing state, so we return nullptr...
146 */
147
148 }
149
150 /* Tag must be ignored and reconstructed */
151 return nullptr;
152 };
153
154 if (opening_tag) {
155
156 if (opening_tag->id == tag->id) {
157 opening_tag->flags |= FL_CLOSED;
158
159 calculate_content_length(opening_tag);
160 /* All good */
161 return opening_tag->parent;
162 }
163 else {
164 return balance_tag();
165 }
166 }
167 else {
168 /*
169 * We have no opening tag
170 * There are two possibilities:
171 *
172 * 1) We have some block tag in hc->all_tags;
173 * 2) We have no tags
174 */
175
176 if (hc->all_tags.empty()) {
177 hc->all_tags.push_back(std::make_unique<html_tag>());
178 auto *vtag = hc->all_tags.back().get();
179 vtag->id = Tag_HTML;
180 vtag->flags = FL_VIRTUAL;
181 vtag->tag_start = 0;
182 vtag->content_offset = 0;
183 calculate_content_length(vtag);
184
185 if (!hc->root_tag) {
186 hc->root_tag = vtag;
187 }
188 else {
189 vtag->parent = hc->root_tag;
190 }
191
192 tag->parent = vtag;
193
194 /* Recursively call with a virtual <html> tag inserted */
195 return html_check_balance(hc, tag, tag_start_offset, tag_end_offset);
196 }
197 }
198
199 return nullptr;
200 }
201
202 auto
html_component_from_string(const std::string_view & st)203 html_component_from_string(const std::string_view &st) -> std::optional<html_component_type>
204 {
205 auto known_component_it = html_components_map.find(st);
206
207 if (known_component_it != html_components_map.end()) {
208 return known_component_it->second;
209 }
210 else {
211 return std::nullopt;
212 }
213 }
214
215 struct tag_content_parser_state {
216 int cur_state = 0;
217 std::string buf;
218 std::optional<html_component_type> cur_component;
219
resetrspamd::html::tag_content_parser_state220 void reset()
221 {
222 cur_state = 0;
223 buf.clear();
224 cur_component = std::nullopt;
225 }
226 };
227
228 static inline void
html_parse_tag_content(rspamd_mempool_t * pool,struct html_content * hc,struct html_tag * tag,const char * in,struct tag_content_parser_state & parser_env)229 html_parse_tag_content(rspamd_mempool_t *pool,
230 struct html_content *hc,
231 struct html_tag *tag,
232 const char *in,
233 struct tag_content_parser_state &parser_env)
234 {
235 enum tag_parser_state {
236 parse_start = 0,
237 parse_name,
238 parse_attr_name,
239 parse_equal,
240 parse_start_dquote,
241 parse_dqvalue,
242 parse_end_dquote,
243 parse_start_squote,
244 parse_sqvalue,
245 parse_end_squote,
246 parse_value,
247 spaces_before_eq,
248 spaces_after_eq,
249 spaces_after_param,
250 ignore_bad_tag,
251 tag_end,
252 slash_after_value,
253 slash_in_unqouted_value,
254 } state;
255
256 state = static_cast<enum tag_parser_state>(parser_env.cur_state);
257
258 /*
259 * Stores tag component if it doesn't exist, performing copy of the
260 * value + decoding of the entities
261 * Parser env is set to clear the current html attribute fields (saved_p and
262 * cur_component)
263 */
264 auto store_component_value = [&]() -> void {
265 if (parser_env.cur_component) {
266
267 if (parser_env.buf.empty()) {
268 tag->components.emplace_back(parser_env.cur_component.value(),
269 std::string_view{});
270 }
271 else {
272 /* We need to copy buf to a persistent storage */
273 auto *s = rspamd_mempool_alloc_buffer(pool, parser_env.buf.size());
274
275 if (parser_env.cur_component.value() == html_component_type::RSPAMD_HTML_COMPONENT_ID ||
276 parser_env.cur_component.value() == html_component_type::RSPAMD_HTML_COMPONENT_CLASS) {
277 /* Lowercase */
278 rspamd_str_copy_lc(parser_env.buf.data(), s, parser_env.buf.size());
279 }
280 else {
281 memcpy(s, parser_env.buf.data(), parser_env.buf.size());
282 }
283
284 auto sz = rspamd_html_decode_entitles_inplace(s, parser_env.buf.size());
285 tag->components.emplace_back(parser_env.cur_component.value(),
286 std::string_view{s, sz});
287 }
288 }
289
290 parser_env.buf.clear();
291 parser_env.cur_component = std::nullopt;
292 };
293
294 auto store_component_name = [&]() -> bool {
295 decode_html_entitles_inplace(parser_env.buf);
296 auto known_component_it = html_components_map.find(std::string_view{parser_env.buf});
297 parser_env.buf.clear();
298
299 if (known_component_it != html_components_map.end()) {
300 parser_env.cur_component = known_component_it->second;
301
302 return true;
303 }
304 else {
305 parser_env.cur_component = std::nullopt;
306 }
307
308 return false;
309 };
310
311 auto store_value_character = [&](bool lc) -> void {
312 auto c = lc ? g_ascii_tolower(*in) : *in;
313
314 if (c == '\0') {
315 /* Replace with u0FFD */
316 parser_env.buf.append(u8"\uFFFD");
317 }
318 else {
319 parser_env.buf.push_back(c);
320 }
321 };
322
323 switch (state) {
324 case parse_start:
325 if (!g_ascii_isalpha (*in) && !g_ascii_isspace (*in)) {
326 hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
327 state = ignore_bad_tag;
328 tag->id = N_TAGS;
329 tag->flags |= FL_BROKEN;
330 }
331 else if (g_ascii_isalpha (*in)) {
332 state = parse_name;
333 store_value_character(true);
334 }
335 break;
336
337 case parse_name:
338 if ((g_ascii_isspace (*in) || *in == '>' || *in == '/')) {
339 if (*in == '/') {
340 tag->flags |= FL_CLOSED;
341 }
342
343 if (parser_env.buf.empty()) {
344 hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
345 tag->id = N_TAGS;
346 tag->flags |= FL_BROKEN;
347 state = ignore_bad_tag;
348 }
349 else {
350 decode_html_entitles_inplace(parser_env.buf);
351 const auto *tag_def = rspamd::html::html_tags_defs.by_name(parser_env.buf);
352
353 if (tag_def == nullptr) {
354 hc->flags |= RSPAMD_HTML_FLAG_UNKNOWN_ELEMENTS;
355 /* Assign -hash to match closing tag if needed */
356 auto nhash = static_cast<std::int32_t>(std::hash<std::string>{}(parser_env.buf));
357 /* Always negative */
358 tag->id = static_cast<tag_id_t>(nhash | G_MININT32);
359 }
360 else {
361 tag->id = tag_def->id;
362 tag->flags = tag_def->flags;
363 }
364
365 parser_env.buf.clear();
366
367 state = spaces_after_param;
368 }
369 }
370 else {
371 store_value_character(true);
372 }
373 break;
374
375 case parse_attr_name:
376 if (*in == '=') {
377 if (!parser_env.buf.empty()) {
378 store_component_name();
379 }
380 state = parse_equal;
381 }
382 else if (g_ascii_isspace(*in)) {
383 store_component_name();
384 state = spaces_before_eq;
385 }
386 else if (*in == '/') {
387 store_component_name();
388 store_component_value();
389 state = slash_after_value;
390 }
391 else if (*in == '>') {
392 store_component_name();
393 store_component_value();
394 state = tag_end;
395 }
396 else {
397 if (*in == '"' || *in == '\'' || *in == '<') {
398 /* Should never be in attribute names but ignored */
399 tag->flags |= FL_BROKEN;
400 }
401
402 store_value_character(true);
403 }
404
405 break;
406
407 case spaces_before_eq:
408 if (*in == '=') {
409 state = parse_equal;
410 }
411 else if (!g_ascii_isspace (*in)) {
412 /*
413 * HTML defines that crap could still be restored and
414 * calculated somehow... So we have to follow this stupid behaviour
415 */
416 /*
417 * TODO: estimate what insane things do email clients in each case
418 */
419 if (*in == '>') {
420 /*
421 * Attribtute name followed by end of tag
422 * Should be okay (empty attribute). The rest is handled outside
423 * this automata.
424 */
425 store_component_value();
426 state = tag_end;
427 }
428 else if (*in == '"' || *in == '\'' || *in == '<') {
429 /* Attribute followed by quote... Missing '=' ? Dunno, need to test */
430 hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
431 tag->flags |= FL_BROKEN;
432 store_component_value();
433 store_value_character(true);
434 state = spaces_after_param;
435 }
436 else {
437 /* Empty attribute */
438 store_component_value();
439 store_value_character(true);
440 state = spaces_after_param;
441 }
442 }
443 break;
444
445 case spaces_after_eq:
446 if (*in == '"') {
447 state = parse_start_dquote;
448 }
449 else if (*in == '\'') {
450 state = parse_start_squote;
451 }
452 else if (!g_ascii_isspace (*in)) {
453 store_value_character(true);
454 state = parse_value;
455 }
456 break;
457
458 case parse_equal:
459 if (g_ascii_isspace (*in)) {
460 state = spaces_after_eq;
461 }
462 else if (*in == '"') {
463 state = parse_start_dquote;
464 }
465 else if (*in == '\'') {
466 state = parse_start_squote;
467 }
468 else {
469 store_value_character(true);
470 state = parse_value;
471 }
472 break;
473
474 case parse_start_dquote:
475 if (*in == '"') {
476 state = spaces_after_param;
477 }
478 else {
479 store_value_character(false);
480 state = parse_dqvalue;
481 }
482 break;
483
484 case parse_start_squote:
485 if (*in == '\'') {
486 state = spaces_after_param;
487 }
488 else {
489 store_value_character(false);
490 state = parse_sqvalue;
491 }
492 break;
493
494 case parse_dqvalue:
495 if (*in == '"') {
496 store_component_value();
497 state = parse_end_dquote;
498 }
499 else {
500 store_value_character(false);
501 }
502 break;
503
504 case parse_sqvalue:
505 if (*in == '\'') {
506 store_component_value();
507 state = parse_end_squote;
508 }
509 else {
510 store_value_character(false);
511 }
512
513 break;
514
515 case parse_value:
516 if (*in == '/') {
517 state = slash_in_unqouted_value;
518 }
519 else if (g_ascii_isspace (*in) || *in == '>' || *in == '"') {
520 store_component_value();
521 state = spaces_after_param;
522 }
523 else {
524 store_value_character(false);
525 }
526 break;
527
528 case parse_end_dquote:
529 case parse_end_squote:
530 if (g_ascii_isspace (*in)) {
531 state = spaces_after_param;
532 }
533 else if (*in == '/') {
534 store_component_value();
535 store_value_character(true);
536 state = slash_after_value;
537 }
538 else {
539 /* No space, proceed immediately to the attribute name */
540 state = parse_attr_name;
541 store_component_value();
542 store_value_character(true);
543 }
544 break;
545
546 case spaces_after_param:
547 if (!g_ascii_isspace (*in)) {
548 if (*in == '/') {
549 state = slash_after_value;
550 }
551 else if (*in == '=') {
552 /* Attributes cannot start with '=' */
553 tag->flags |= FL_BROKEN;
554 store_value_character(true);
555 state = parse_attr_name;
556 }
557 else {
558 store_value_character(true);
559 state = parse_attr_name;
560 }
561 }
562 break;
563 case slash_after_value:
564 if (*in == '>') {
565 tag->flags |= FL_CLOSED;
566 state = tag_end;
567 }
568 else if (!g_ascii_isspace(*in)) {
569 tag->flags |= FL_BROKEN;
570 state = parse_attr_name;
571 }
572 break;
573 case slash_in_unqouted_value:
574 if (*in == '>') {
575 /* That slash was in fact closing tag slash, wohoo */
576 tag->flags |= FL_CLOSED;
577 state = tag_end;
578 store_component_value();
579 }
580 else {
581 /* Welcome to the world of html, revert state and save missing / */
582 parser_env.buf.push_back('/');
583 store_value_character(false);
584 state = parse_value;
585 }
586 break;
587 case ignore_bad_tag:
588 case tag_end:
589 break;
590 }
591
592 parser_env.cur_state = state;
593 }
594
595 static inline auto
html_is_absolute_url(std::string_view st)596 html_is_absolute_url(std::string_view st) -> bool
597 {
598 auto alnum_pos = std::find_if(std::begin(st), std::end(st),
599 [](auto c) {return !g_ascii_isalnum(c);});
600
601 if (alnum_pos != std::end(st) && alnum_pos != std::begin(st)) {
602 if (*alnum_pos == ':') {
603 if (st.substr(0, std::distance(std::begin(st), alnum_pos)) == "mailto") {
604 return true;
605 }
606
607 std::advance(alnum_pos, 1);
608 if (alnum_pos != std::end(st)) {
609 /* Include even malformed urls */
610 if (*alnum_pos == '/' || *alnum_pos == '\\') {
611 return true;
612 }
613 }
614 }
615 }
616
617 return false;
618 }
619
620 static auto
html_process_url_tag(rspamd_mempool_t * pool,struct html_tag * tag,struct html_content * hc)621 html_process_url_tag(rspamd_mempool_t *pool,
622 struct html_tag *tag,
623 struct html_content *hc) -> std::optional<struct rspamd_url *>
624 {
625 auto found_href_maybe = tag->find_component(html_component_type::RSPAMD_HTML_COMPONENT_HREF);
626
627 if (found_href_maybe) {
628 /* Check base url */
629 auto &href_value = found_href_maybe.value();
630
631 if (hc && hc->base_url) {
632 /*
633 * Relative url cannot start from the following:
634 * schema://
635 * data:
636 * slash
637 */
638
639 if (!html_is_absolute_url(href_value)) {
640
641 if (href_value.size() >= sizeof("data:") &&
642 g_ascii_strncasecmp(href_value.data(), "data:", sizeof("data:") - 1) == 0) {
643 /* Image data url, never insert as url */
644 return std::nullopt;
645 }
646
647 /* Assume relative url */
648 auto need_slash = false;
649
650 auto orig_len = href_value.size();
651 auto len = orig_len + hc->base_url->urllen;
652
653 if (hc->base_url->datalen == 0) {
654 need_slash = true;
655 len++;
656 }
657
658 auto *buf = rspamd_mempool_alloc_buffer(pool, len + 1);
659 auto nlen = (std::size_t) rspamd_snprintf(buf, len + 1,
660 "%*s%s%*s",
661 (int) hc->base_url->urllen, hc->base_url->string,
662 need_slash ? "/" : "",
663 (gint) orig_len, href_value.data());
664 href_value = {buf, nlen};
665 }
666 else if (href_value.size() > 2 && href_value[0] == '/' && href_value[1] != '/') {
667 /* Relative to the hostname */
668 auto orig_len = href_value.size();
669 auto len = orig_len + hc->base_url->hostlen + hc->base_url->protocollen +
670 3 /* for :// */;
671 auto *buf = rspamd_mempool_alloc_buffer(pool, len + 1);
672 auto nlen = (std::size_t) rspamd_snprintf(buf, len + 1, "%*s://%*s/%*s",
673 (int) hc->base_url->protocollen, hc->base_url->string,
674 (int) hc->base_url->hostlen, rspamd_url_host_unsafe (hc->base_url),
675 (gint) orig_len, href_value.data());
676 href_value = {buf, nlen};
677 }
678 }
679
680 auto url = html_process_url(pool, href_value);
681
682 if (url && std::holds_alternative<std::monostate>(tag->extra)) {
683 tag->extra = url.value();
684 }
685
686 return url;
687 }
688
689 return std::nullopt;
690 }
691
692 struct rspamd_html_url_query_cbd {
693 rspamd_mempool_t *pool;
694 khash_t (rspamd_url_hash) *url_set;
695 struct rspamd_url *url;
696 GPtrArray *part_urls;
697 };
698
699 static gboolean
html_url_query_callback(struct rspamd_url * url,gsize start_offset,gsize end_offset,gpointer ud)700 html_url_query_callback(struct rspamd_url *url, gsize start_offset,
701 gsize end_offset, gpointer ud)
702 {
703 struct rspamd_html_url_query_cbd *cbd =
704 (struct rspamd_html_url_query_cbd *) ud;
705 rspamd_mempool_t *pool;
706
707 pool = cbd->pool;
708
709 if (url->protocol == PROTOCOL_MAILTO) {
710 if (url->userlen == 0) {
711 return FALSE;
712 }
713 }
714
715 msg_debug_html ("found url %s in query of url"
716 " %*s", url->string,
717 cbd->url->querylen, rspamd_url_query_unsafe(cbd->url));
718
719 url->flags |= RSPAMD_URL_FLAG_QUERY;
720
721 if (rspamd_url_set_add_or_increase(cbd->url_set, url, false)
722 && cbd->part_urls) {
723 g_ptr_array_add(cbd->part_urls, url);
724 }
725
726 return TRUE;
727 }
728
729 static void
html_process_query_url(rspamd_mempool_t * pool,struct rspamd_url * url,khash_t (rspamd_url_hash)* url_set,GPtrArray * part_urls)730 html_process_query_url(rspamd_mempool_t *pool, struct rspamd_url *url,
731 khash_t (rspamd_url_hash) *url_set,
732 GPtrArray *part_urls)
733 {
734 if (url->querylen > 0) {
735 struct rspamd_html_url_query_cbd qcbd;
736
737 qcbd.pool = pool;
738 qcbd.url_set = url_set;
739 qcbd.url = url;
740 qcbd.part_urls = part_urls;
741
742 rspamd_url_find_multiple(pool,
743 rspamd_url_query_unsafe (url), url->querylen,
744 RSPAMD_URL_FIND_ALL, NULL,
745 html_url_query_callback, &qcbd);
746 }
747
748 if (part_urls) {
749 g_ptr_array_add(part_urls, url);
750 }
751 }
752
753 static auto
html_process_data_image(rspamd_mempool_t * pool,struct html_image * img,std::string_view input)754 html_process_data_image(rspamd_mempool_t *pool,
755 struct html_image *img,
756 std::string_view input) -> void
757 {
758 /*
759 * Here, we do very basic processing of the data:
760 * detect if we have something like: `data:image/xxx;base64,yyyzzz==`
761 * We only parse base64 encoded data.
762 * We ignore content type so far
763 */
764 struct rspamd_image *parsed_image;
765 const gchar *semicolon_pos = input.data(),
766 *end = input.data() + input.size();
767
768 if ((semicolon_pos = (const gchar *) memchr(semicolon_pos, ';', end - semicolon_pos)) != NULL) {
769 if (end - semicolon_pos > sizeof("base64,")) {
770 if (memcmp(semicolon_pos + 1, "base64,", sizeof("base64,") - 1) == 0) {
771 const gchar *data_pos = semicolon_pos + sizeof("base64,");
772 gchar *decoded;
773 gsize encoded_len = end - data_pos, decoded_len;
774 rspamd_ftok_t inp;
775
776 decoded_len = (encoded_len / 4 * 3) + 12;
777 decoded = rspamd_mempool_alloc_buffer(pool, decoded_len);
778 rspamd_cryptobox_base64_decode(data_pos, encoded_len,
779 reinterpret_cast<guchar *>(decoded), &decoded_len);
780 inp.begin = decoded;
781 inp.len = decoded_len;
782
783 parsed_image = rspamd_maybe_process_image(pool, &inp);
784
785 if (parsed_image) {
786 msg_debug_html ("detected %s image of size %ud x %ud in data url",
787 rspamd_image_type_str(parsed_image->type),
788 parsed_image->width, parsed_image->height);
789 img->embedded_image = parsed_image;
790 }
791 }
792 }
793 else {
794 /* Nothing useful */
795 return;
796 }
797 }
798 }
799
800 static void
html_process_img_tag(rspamd_mempool_t * pool,struct html_tag * tag,struct html_content * hc,khash_t (rspamd_url_hash)* url_set,GPtrArray * part_urls)801 html_process_img_tag(rspamd_mempool_t *pool,
802 struct html_tag *tag,
803 struct html_content *hc,
804 khash_t (rspamd_url_hash) *url_set,
805 GPtrArray *part_urls)
806 {
807 struct html_image *img;
808
809 img = rspamd_mempool_alloc0_type (pool, struct html_image);
810 img->tag = tag;
811
812 for (const auto ¶m : tag->components) {
813
814 if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_HREF) {
815 /* Check base url */
816 const auto &href_value = param.value;
817
818 if (href_value.size() > 0) {
819 rspamd_ftok_t fstr;
820 fstr.begin = href_value.data();
821 fstr.len = href_value.size();
822 img->src = rspamd_mempool_ftokdup (pool, &fstr);
823
824 if (href_value.size() > sizeof("cid:") - 1 && memcmp(href_value.data(),
825 "cid:", sizeof("cid:") - 1) == 0) {
826 /* We have an embedded image */
827 img->src += sizeof("cid:") - 1;
828 img->flags |= RSPAMD_HTML_FLAG_IMAGE_EMBEDDED;
829 }
830 else {
831 if (href_value.size() > sizeof("data:") - 1 && memcmp(href_value.data(),
832 "data:", sizeof("data:") - 1) == 0) {
833 /* We have an embedded image in HTML tag */
834 img->flags |=
835 (RSPAMD_HTML_FLAG_IMAGE_EMBEDDED | RSPAMD_HTML_FLAG_IMAGE_DATA);
836 html_process_data_image(pool, img, href_value);
837 hc->flags |= RSPAMD_HTML_FLAG_HAS_DATA_URLS;
838 }
839 else {
840 img->flags |= RSPAMD_HTML_FLAG_IMAGE_EXTERNAL;
841 if (img->src) {
842
843 std::string_view cpy{href_value};
844 auto maybe_url = html_process_url(pool, cpy);
845
846 if (maybe_url) {
847 img->url = maybe_url.value();
848 struct rspamd_url *existing;
849
850 img->url->flags |= RSPAMD_URL_FLAG_IMAGE;
851 existing = rspamd_url_set_add_or_return(url_set,
852 img->url);
853
854 if (existing && existing != img->url) {
855 /*
856 * We have some other URL that could be
857 * found, e.g. from another part. However,
858 * we still want to set an image flag on it
859 */
860 existing->flags |= img->url->flags;
861 existing->count++;
862 }
863 else if (part_urls) {
864 /* New url */
865 g_ptr_array_add(part_urls, img->url);
866 }
867 }
868 }
869 }
870 }
871 }
872 }
873
874
875 if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_HEIGHT) {
876 unsigned long val;
877
878 rspamd_strtoul(param.value.data(), param.value.size(), &val);
879 img->height = val;
880 }
881
882 if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_WIDTH) {
883 unsigned long val;
884
885 rspamd_strtoul(param.value.data(), param.value.size(), &val);
886 img->width = val;
887 }
888
889 /* TODO: rework to css at some time */
890 if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_STYLE) {
891 if (img->height == 0) {
892 auto style_st = param.value;
893 auto pos = rspamd_substring_search_caseless(style_st.data(),
894 style_st.size(),
895 "height", sizeof("height") - 1);
896 if (pos != -1) {
897 auto substr = style_st.substr(pos + sizeof("height") - 1);
898
899 for (auto i = 0; i < substr.size(); i++) {
900 auto t = substr[i];
901 if (g_ascii_isdigit (t)) {
902 unsigned long val;
903 rspamd_strtoul(substr.data(),
904 substr.size(), &val);
905 img->height = val;
906 break;
907 }
908 else if (!g_ascii_isspace (t) && t != '=' && t != ':') {
909 /* Fallback */
910 break;
911 }
912 }
913 }
914 }
915 if (img->width == 0) {
916 auto style_st = param.value;
917 auto pos = rspamd_substring_search_caseless(style_st.data(),
918 style_st.size(),
919 "width", sizeof("width") - 1);
920 if (pos != -1) {
921 auto substr = style_st.substr(pos + sizeof("width") - 1);
922
923 for (auto i = 0; i < substr.size(); i++) {
924 auto t = substr[i];
925 if (g_ascii_isdigit (t)) {
926 unsigned long val;
927 rspamd_strtoul(substr.data(),
928 substr.size(), &val);
929 img->width = val;
930 break;
931 }
932 else if (!g_ascii_isspace (t) && t != '=' && t != ':') {
933 /* Fallback */
934 break;
935 }
936 }
937 }
938 }
939 }
940 }
941
942 if (img->embedded_image) {
943 if (img->height == 0) {
944 img->height = img->embedded_image->height;
945 }
946 if (img->width == 0) {
947 img->width = img->embedded_image->width;
948 }
949 }
950
951 hc->images.push_back(img);
952 tag->extra = img;
953 }
954
955 static auto
html_process_link_tag(rspamd_mempool_t * pool,struct html_tag * tag,struct html_content * hc,khash_t (rspamd_url_hash)* url_set,GPtrArray * part_urls)956 html_process_link_tag(rspamd_mempool_t *pool, struct html_tag *tag,
957 struct html_content *hc,
958 khash_t (rspamd_url_hash) *url_set,
959 GPtrArray *part_urls) -> void
960 {
961 auto found_rel_maybe = tag->find_component(html_component_type::RSPAMD_HTML_COMPONENT_REL);
962
963 if (found_rel_maybe) {
964 if (found_rel_maybe.value() == "icon") {
965 html_process_img_tag(pool, tag, hc, url_set, part_urls);
966 }
967 }
968 }
969
970 static auto
html_process_block_tag(rspamd_mempool_t * pool,struct html_tag * tag,struct html_content * hc)971 html_process_block_tag(rspamd_mempool_t *pool, struct html_tag *tag,
972 struct html_content *hc) -> void
973 {
974 std::optional<css::css_value> maybe_fgcolor, maybe_bgcolor;
975 bool hidden = false;
976
977 for (const auto ¶m : tag->components) {
978 if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_COLOR) {
979 maybe_fgcolor = css::css_value::maybe_color_from_string(param.value);
980 }
981
982 if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_BGCOLOR) {
983 maybe_bgcolor = css::css_value::maybe_color_from_string(param.value);
984 }
985
986 if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_STYLE) {
987 tag->block = rspamd::css::parse_css_declaration(pool, param.value);
988 }
989
990 if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_HIDDEN) {
991 hidden = true;
992 }
993 }
994
995 if (!tag->block) {
996 tag->block = html_block::undefined_html_block_pool(pool);
997 }
998
999 if (hidden) {
1000 tag->block->set_display(false);
1001 }
1002
1003 if (maybe_fgcolor) {
1004 tag->block->set_fgcolor(maybe_fgcolor->to_color().value());
1005 }
1006
1007 if (maybe_bgcolor) {
1008 tag->block->set_bgcolor(maybe_bgcolor->to_color().value());
1009 }
1010 }
1011
1012 static inline auto
html_append_parsed(struct html_content * hc,std::string_view data,bool transparent,std::size_t input_len,std::string & dest)1013 html_append_parsed(struct html_content *hc,
1014 std::string_view data,
1015 bool transparent,
1016 std::size_t input_len,
1017 std::string &dest) -> std::size_t
1018 {
1019 auto cur_offset = dest.size();
1020
1021 if (dest.size() > input_len) {
1022 /* Impossible case, refuse to append */
1023 return 0;
1024 }
1025
1026 if (data.size() > 0) {
1027 /* Handle multiple spaces at the begin */
1028
1029 if (cur_offset > 0) {
1030 auto last = dest.back();
1031 if (!g_ascii_isspace(last) && g_ascii_isspace(data.front())) {
1032 dest.append(" ");
1033 data = {data.data() + 1, data.size() - 1};
1034 cur_offset++;
1035 }
1036 }
1037
1038 if (data.find('\0') != std::string_view::npos) {
1039 auto replace_zero_func = [](const auto &input, auto &output) {
1040 const auto last = input.cend();
1041 for (auto it = input.cbegin(); it != last; ++it) {
1042 if (*it == '\0') {
1043 output.append(u8"\uFFFD");
1044 }
1045 else {
1046 output.push_back(*it);
1047 }
1048 }
1049 };
1050
1051 dest.reserve(dest.size() + data.size() + sizeof(u8"\uFFFD"));
1052 replace_zero_func(data, dest);
1053 hc->flags |= RSPAMD_HTML_FLAG_HAS_ZEROS;
1054 }
1055 else {
1056 dest.append(data);
1057 }
1058 }
1059
1060 auto nlen = decode_html_entitles_inplace(dest.data() + cur_offset,
1061 dest.size() - cur_offset, true);
1062
1063 dest.resize(nlen + cur_offset);
1064
1065 if (transparent) {
1066 /* Replace all visible characters with spaces */
1067 auto start = std::next(dest.begin(), cur_offset);
1068 std::replace_if(start, std::end(dest), [](const auto c) {
1069 return !g_ascii_isspace(c);
1070 }, ' ');
1071 }
1072
1073 return nlen;
1074 }
1075
1076 static auto
html_process_displayed_href_tag(rspamd_mempool_t * pool,struct html_content * hc,std::string_view data,const struct html_tag * cur_tag,GList ** exceptions,khash_t (rspamd_url_hash)* url_set,goffset dest_offset)1077 html_process_displayed_href_tag(rspamd_mempool_t *pool,
1078 struct html_content *hc,
1079 std::string_view data,
1080 const struct html_tag *cur_tag,
1081 GList **exceptions,
1082 khash_t (rspamd_url_hash) *url_set,
1083 goffset dest_offset) -> void
1084 {
1085
1086 if (std::holds_alternative<rspamd_url *>(cur_tag->extra)) {
1087 auto *url = std::get<rspamd_url *>(cur_tag->extra);
1088
1089 html_check_displayed_url(pool,
1090 exceptions, url_set,
1091 data,
1092 dest_offset,
1093 url);
1094 }
1095 }
1096
1097 static auto
html_append_tag_content(rspamd_mempool_t * pool,const gchar * start,gsize len,struct html_content * hc,html_tag * tag,GList ** exceptions,khash_t (rspamd_url_hash)* url_set)1098 html_append_tag_content(rspamd_mempool_t *pool,
1099 const gchar *start, gsize len,
1100 struct html_content *hc,
1101 html_tag *tag,
1102 GList **exceptions,
1103 khash_t (rspamd_url_hash) *url_set) -> goffset
1104 {
1105 auto is_visible = true, is_block = false, is_spaces = false, is_transparent = false;
1106 goffset next_tag_offset = tag->closing.end,
1107 initial_parsed_offset = hc->parsed.size(),
1108 initial_invisible_offset = hc->invisible.size();
1109
1110 auto calculate_final_tag_offsets = [&]() -> void {
1111 if (is_visible) {
1112 tag->content_offset = initial_parsed_offset;
1113 tag->closing.start = hc->parsed.size();
1114 }
1115 else {
1116 tag->content_offset = initial_invisible_offset;
1117 tag->closing.start = hc->invisible.size();
1118 }
1119 };
1120
1121 if (tag->closing.end == -1) {
1122 if (tag->closing.start != -1) {
1123 next_tag_offset = tag->closing.start;
1124 tag->closing.end = tag->closing.start;
1125 }
1126 else {
1127 next_tag_offset = tag->content_offset;
1128 tag->closing.end = tag->content_offset;
1129 }
1130 }
1131 if (tag->closing.start == -1) {
1132 tag->closing.start = tag->closing.end;
1133 }
1134
1135 auto append_margin = [&](char c) -> void {
1136 /* We do care about visible margins only */
1137 if (is_visible) {
1138 if (!hc->parsed.empty() && hc->parsed.back() != c && hc->parsed.back() != '\n') {
1139 if (hc->parsed.back() == ' ') {
1140 /* We also strip extra spaces at the end, but limiting the start */
1141 auto last = std::make_reverse_iterator(hc->parsed.begin() + initial_parsed_offset);
1142 auto first = std::find_if(hc->parsed.rbegin(), last,
1143 [](auto ch) -> auto {
1144 return ch != ' ';
1145 });
1146 hc->parsed.erase(first.base(), hc->parsed.end());
1147 g_assert(hc->parsed.size() >= initial_parsed_offset);
1148 }
1149 hc->parsed.push_back(c);
1150 }
1151 }
1152 };
1153
1154 if (tag->id == Tag_BR || tag->id == Tag_HR) {
1155
1156 if (!(tag->flags & FL_IGNORE)) {
1157 hc->parsed.append("\n");
1158 }
1159
1160 auto ret = tag->content_offset;
1161 calculate_final_tag_offsets();
1162
1163 return ret;
1164 }
1165 else if ((tag->id == Tag_HEAD && (tag->flags & FL_IGNORE)) || (tag->flags & CM_HEAD)) {
1166 auto ret = tag->closing.end;
1167 calculate_final_tag_offsets();
1168
1169 return ret;
1170 }
1171
1172 if ((tag->flags & (FL_COMMENT | FL_XML | FL_IGNORE | CM_HEAD))) {
1173 is_visible = false;
1174 }
1175 else {
1176 if (!tag->block) {
1177 is_visible = true;
1178 }
1179 else if (!tag->block->is_visible()) {
1180 if (!tag->block->is_transparent()) {
1181 is_visible = false;
1182 }
1183 else {
1184 if (tag->block->has_display() &&
1185 tag->block->display == css::css_display_value::DISPLAY_HIDDEN) {
1186 is_visible = false;
1187 }
1188 else {
1189 is_transparent = true;
1190 }
1191 }
1192 }
1193 else {
1194 if (tag->block->display == css::css_display_value::DISPLAY_BLOCK) {
1195 is_block = true;
1196 }
1197 else if (tag->block->display == css::css_display_value::DISPLAY_TABLE_ROW) {
1198 is_spaces = true;
1199 }
1200 }
1201 }
1202
1203 if (is_block) {
1204 append_margin('\n');
1205 }
1206 else if (is_spaces) {
1207 append_margin(' ');
1208 }
1209
1210 goffset cur_offset = tag->content_offset;
1211
1212 for (auto *cld : tag->children) {
1213 auto enclosed_start = cld->tag_start;
1214 goffset initial_part_len = enclosed_start - cur_offset;
1215
1216 if (initial_part_len > 0) {
1217 if (is_visible) {
1218 html_append_parsed(hc,
1219 {start + cur_offset, std::size_t(initial_part_len)},
1220 is_transparent, len, hc->parsed);
1221 }
1222 else {
1223 html_append_parsed(hc,
1224 {start + cur_offset, std::size_t(initial_part_len)},
1225 is_transparent, len, hc->invisible);
1226 }
1227 }
1228
1229 auto next_offset = html_append_tag_content(pool, start, len,
1230 hc, cld, exceptions, url_set);
1231
1232 /* Do not allow shifting back */
1233 if (next_offset > cur_offset) {
1234 cur_offset = next_offset;
1235 }
1236 }
1237
1238 if (cur_offset < tag->closing.start) {
1239 goffset final_part_len = tag->closing.start - cur_offset;
1240
1241 if (final_part_len > 0) {
1242 if (is_visible) {
1243 html_append_parsed(hc,
1244 {start + cur_offset, std::size_t(final_part_len)},
1245 is_transparent,
1246 len,
1247 hc->parsed);
1248 }
1249 else {
1250 html_append_parsed(hc,
1251 {start + cur_offset, std::size_t(final_part_len)},
1252 is_transparent,
1253 len,
1254 hc->invisible);
1255 }
1256 }
1257 }
1258 if (is_block) {
1259 append_margin('\n');
1260 }
1261 else if (is_spaces) {
1262 append_margin(' ');
1263 }
1264
1265 if (is_visible) {
1266 if (tag->id == Tag_A) {
1267 auto written_len = hc->parsed.size() - initial_parsed_offset;
1268 html_process_displayed_href_tag(pool, hc,
1269 {hc->parsed.data() + initial_parsed_offset, std::size_t(written_len)},
1270 tag, exceptions,
1271 url_set, initial_parsed_offset);
1272 }
1273 else if (tag->id == Tag_IMG) {
1274 /* Process ALT if presented */
1275 auto maybe_alt = tag->find_component(html_component_type::RSPAMD_HTML_COMPONENT_ALT);
1276
1277 if (maybe_alt) {
1278 if (!hc->parsed.empty() && !g_ascii_isspace (hc->parsed.back())) {
1279 /* Add a space */
1280 hc->parsed += ' ';
1281 }
1282
1283 hc->parsed.append(maybe_alt.value());
1284
1285 if (!hc->parsed.empty() && !g_ascii_isspace (hc->parsed.back())) {
1286 /* Add a space */
1287 hc->parsed += ' ';
1288 }
1289 }
1290 }
1291 }
1292 else {
1293 /* Invisible stuff */
1294 if (std::holds_alternative<rspamd_url *>(tag->extra)) {
1295 auto *url_enclosed = std::get<rspamd_url *>(tag->extra);
1296
1297 /*
1298 * TODO: when hash is fixed to include flags we need to remove and add
1299 * url to the hash set
1300 */
1301 if (url_enclosed) {
1302 url_enclosed->flags |= RSPAMD_URL_FLAG_INVISIBLE;
1303 }
1304 }
1305 }
1306
1307 calculate_final_tag_offsets();
1308
1309 return next_tag_offset;
1310 }
1311
1312 auto
html_process_input(rspamd_mempool_t * pool,GByteArray * in,GList ** exceptions,khash_t (rspamd_url_hash)* url_set,GPtrArray * part_urls,bool allow_css)1313 html_process_input(rspamd_mempool_t *pool,
1314 GByteArray *in,
1315 GList **exceptions,
1316 khash_t (rspamd_url_hash) *url_set,
1317 GPtrArray *part_urls,
1318 bool allow_css) -> html_content *
1319 {
1320 const gchar *p, *c, *end, *start;
1321 guchar t;
1322 auto closing = false;
1323 guint obrace = 0, ebrace = 0;
1324 struct rspamd_url *url = nullptr;
1325 gint href_offset = -1;
1326 struct html_tag *cur_tag = nullptr, *parent_tag = nullptr, cur_closing_tag;
1327 struct tag_content_parser_state content_parser_env;
1328
1329 enum {
1330 parse_start = 0,
1331 content_before_start,
1332 tag_begin,
1333 sgml_tag,
1334 xml_tag,
1335 compound_tag,
1336 comment_tag,
1337 comment_content,
1338 sgml_content,
1339 tag_content,
1340 tag_end_opening,
1341 tag_end_closing,
1342 html_text_content,
1343 xml_tag_end,
1344 tag_raw_text,
1345 tag_raw_text_less_than,
1346 tags_limit_overflow,
1347 } state = parse_start;
1348
1349 enum class html_document_state {
1350 doctype,
1351 head,
1352 body
1353 } html_document_state = html_document_state::doctype;
1354
1355 g_assert (in != NULL);
1356 g_assert (pool != NULL);
1357
1358 struct html_content *hc = new html_content;
1359 rspamd_mempool_add_destructor(pool, html_content::html_content_dtor, hc);
1360
1361 auto new_tag = [&](int flags = 0) -> struct html_tag * {
1362
1363 if (hc->all_tags.size() > rspamd::html::max_tags) {
1364 hc->flags |= RSPAMD_HTML_FLAG_TOO_MANY_TAGS;
1365
1366 return nullptr;
1367 }
1368
1369 hc->all_tags.emplace_back(std::make_unique<html_tag>());
1370 auto *ntag = hc->all_tags.back().get();
1371 ntag->tag_start = c - start;
1372 ntag->flags = flags;
1373
1374 if (cur_tag && !(cur_tag->flags & (CM_EMPTY | FL_CLOSED)) && cur_tag != &cur_closing_tag) {
1375 parent_tag = cur_tag;
1376 }
1377
1378 if (flags & FL_XML) {
1379 return ntag;
1380 }
1381
1382 return ntag;
1383 };
1384
1385 auto process_opening_tag = [&]() {
1386 if (cur_tag->id > Tag_UNKNOWN) {
1387 if (cur_tag->flags & CM_UNIQUE) {
1388 if (!hc->tags_seen[cur_tag->id]) {
1389 /* Duplicate tag has been found */
1390 hc->flags |= RSPAMD_HTML_FLAG_DUPLICATE_ELEMENTS;
1391 }
1392 }
1393 hc->tags_seen[cur_tag->id] = true;
1394 }
1395
1396 /* Shift to the first unclosed tag */
1397 auto *pt = parent_tag;
1398 while (pt && (pt->flags & FL_CLOSED)) {
1399 pt = pt->parent;
1400 }
1401
1402 if (pt) {
1403 g_assert(cur_tag != pt);
1404 cur_tag->parent = pt;
1405 g_assert(cur_tag->parent != &cur_closing_tag);
1406 parent_tag = pt;
1407 parent_tag->children.push_back(cur_tag);
1408 }
1409 else {
1410 if (hc->root_tag) {
1411 if (cur_tag != hc->root_tag) {
1412 cur_tag->parent = hc->root_tag;
1413 g_assert(cur_tag->parent != cur_tag);
1414 hc->root_tag->children.push_back(cur_tag);
1415 parent_tag = hc->root_tag;
1416 }
1417 }
1418 else {
1419 if (cur_tag->id == Tag_HTML) {
1420 hc->root_tag = cur_tag;
1421 }
1422 else {
1423 /* Insert a fake html tag */
1424 hc->all_tags.emplace_back(std::make_unique<html_tag>());
1425 auto *top_tag = hc->all_tags.back().get();
1426 top_tag->tag_start = 0;
1427 top_tag->flags = FL_VIRTUAL;
1428 top_tag->id = Tag_HTML;
1429 top_tag->content_offset = 0;
1430 top_tag->children.push_back(cur_tag);
1431 cur_tag->parent = top_tag;
1432 g_assert(cur_tag->parent != cur_tag);
1433 hc->root_tag = top_tag;
1434 parent_tag = top_tag;
1435 }
1436 }
1437 }
1438
1439 if (cur_tag->flags & FL_HREF && html_document_state == html_document_state::body) {
1440 auto maybe_url = html_process_url_tag(pool, cur_tag, hc);
1441
1442 if (maybe_url) {
1443 url = maybe_url.value();
1444
1445 if (url_set != NULL) {
1446 struct rspamd_url *maybe_existing =
1447 rspamd_url_set_add_or_return(url_set, maybe_url.value());
1448 if (maybe_existing == maybe_url.value()) {
1449 html_process_query_url(pool, url, url_set,
1450 part_urls);
1451 }
1452 else {
1453 url = maybe_existing;
1454 /* Replace extra as well */
1455 cur_tag->extra = maybe_existing;
1456 /* Increase count to avoid odd checks failure */
1457 url->count++;
1458 }
1459 }
1460 if (part_urls) {
1461 g_ptr_array_add(part_urls, url);
1462 }
1463
1464 href_offset = hc->parsed.size();
1465 }
1466 }
1467 else if (cur_tag->id == Tag_BASE) {
1468 /*
1469 * Base is allowed only within head tag but HTML is retarded
1470 */
1471 auto maybe_url = html_process_url_tag(pool, cur_tag, hc);
1472
1473 if (maybe_url) {
1474 msg_debug_html ("got valid base tag");
1475 cur_tag->extra = maybe_url.value();
1476 cur_tag->flags |= FL_HREF;
1477
1478 if (hc->base_url == nullptr) {
1479 hc->base_url = maybe_url.value();
1480 }
1481 else {
1482 msg_debug_html ("ignore redundant base tag");
1483 }
1484 }
1485 else {
1486 msg_debug_html ("got invalid base tag!");
1487 }
1488 }
1489
1490 if (cur_tag->id == Tag_IMG) {
1491 html_process_img_tag(pool, cur_tag, hc, url_set,
1492 part_urls);
1493 }
1494 else if (cur_tag->id == Tag_LINK) {
1495 html_process_link_tag(pool, cur_tag, hc, url_set,
1496 part_urls);
1497 }
1498
1499 if (!(cur_tag->flags & CM_EMPTY)) {
1500 html_process_block_tag(pool, cur_tag, hc);
1501 }
1502 else {
1503 /* Implicitly close */
1504 cur_tag->flags |= FL_CLOSED;
1505 }
1506
1507 if (cur_tag->flags & FL_CLOSED) {
1508 cur_tag->closing.end = cur_tag->content_offset;
1509 cur_tag->closing.start = cur_tag->tag_start;
1510
1511 cur_tag = parent_tag;
1512 }
1513 };
1514
1515 p = (const char *) in->data;
1516 c = p;
1517 end = p + in->len;
1518 start = c;
1519
1520 while (p < end) {
1521 t = *p;
1522
1523 switch (state) {
1524 case parse_start:
1525 if (t == '<') {
1526 state = tag_begin;
1527 }
1528 else {
1529 /* We have no starting tag, so assume that it's content */
1530 hc->flags |= RSPAMD_HTML_FLAG_BAD_START;
1531 cur_tag = new_tag();
1532 html_document_state = html_document_state::body;
1533
1534 if (cur_tag) {
1535 cur_tag->id = Tag_HTML;
1536 hc->root_tag = cur_tag;
1537 state = content_before_start;
1538 }
1539 else {
1540 state = tags_limit_overflow;
1541 }
1542 }
1543 break;
1544 case content_before_start:
1545 if (t == '<') {
1546 state = tag_begin;
1547 }
1548 else {
1549 p++;
1550 }
1551 break;
1552 case tag_begin:
1553 switch (t) {
1554 case '<':
1555 c = p;
1556 p++;
1557 closing = FALSE;
1558 break;
1559 case '!':
1560 cur_tag = new_tag(FL_XML | FL_CLOSED);
1561 if (cur_tag) {
1562 state = sgml_tag;
1563 }
1564 else {
1565 state = tags_limit_overflow;
1566 }
1567 p++;
1568 break;
1569 case '?':
1570 cur_tag = new_tag(FL_XML | FL_CLOSED);
1571 if (cur_tag) {
1572 state = xml_tag;
1573 }
1574 else {
1575 state = tags_limit_overflow;
1576 }
1577 hc->flags |= RSPAMD_HTML_FLAG_XML;
1578 p++;
1579 break;
1580 case '/':
1581 closing = TRUE;
1582 /* We fill fake closing tag to fill it with the content parser */
1583 cur_closing_tag.clear();
1584 /*
1585 * For closing tags, we need to find some corresponding opening tag.
1586 * However, at this point we have not even parsed a name, so we
1587 * can not assume anything about balancing, etc.
1588 *
1589 * So we need to ensure that:
1590 * 1) We have some opening tag in the chain cur_tag->parent...
1591 * 2) cur_tag is nullptr - okay, html is just brain damaged
1592 * 3) cur_tag must NOT be equal to cur_closing tag. It means that
1593 * we had some poor closing tag but we still need to find an opening
1594 * tag... Somewhere...
1595 */
1596
1597 if (cur_tag == &cur_closing_tag) {
1598 if (parent_tag != &cur_closing_tag) {
1599 cur_closing_tag.parent = parent_tag;
1600 }
1601 else {
1602 cur_closing_tag.parent = nullptr;
1603 }
1604 }
1605 else if (cur_tag && cur_tag->flags & FL_CLOSED) {
1606 /* Cur tag is already closed, we should find something else */
1607 auto *tmp = cur_tag;
1608 while (tmp) {
1609 tmp = tmp->parent;
1610
1611 if (tmp == nullptr || !(tmp->flags & FL_CLOSED)) {
1612 break;
1613 }
1614 }
1615
1616 cur_closing_tag.parent = tmp;
1617 }
1618 else {
1619 cur_closing_tag.parent = cur_tag;
1620 }
1621
1622 cur_tag = &cur_closing_tag;
1623 p++;
1624 break;
1625 case '>':
1626 /* Empty tag */
1627 hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
1628 state = html_text_content;
1629 continue;
1630 default:
1631 if (g_ascii_isalpha(t)) {
1632 state = tag_content;
1633 content_parser_env.reset();
1634
1635 if (!closing) {
1636 cur_tag = new_tag();
1637 }
1638
1639 if (cur_tag) {
1640 state = tag_content;
1641 }
1642 else {
1643 state = tags_limit_overflow;
1644 }
1645 }
1646 else {
1647 /* Wrong bad tag */
1648 state = html_text_content;
1649 }
1650 break;
1651 }
1652
1653 break;
1654
1655 case sgml_tag:
1656 switch (t) {
1657 case '[':
1658 state = compound_tag;
1659 obrace = 1;
1660 ebrace = 0;
1661 p++;
1662 break;
1663 case '-':
1664 cur_tag->flags |= FL_COMMENT;
1665 state = comment_tag;
1666 p++;
1667 break;
1668 default:
1669 state = sgml_content;
1670 break;
1671 }
1672
1673 break;
1674
1675 case xml_tag:
1676 if (t == '?') {
1677 state = xml_tag_end;
1678 }
1679 else if (t == '>') {
1680 /* Misformed xml tag */
1681 hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
1682 state = tag_end_opening;
1683 continue;
1684 }
1685 /* We efficiently ignore xml tags */
1686 p++;
1687 break;
1688
1689 case xml_tag_end:
1690 if (t == '>') {
1691 state = tag_end_opening;
1692 cur_tag->content_offset = p - start + 1;
1693 continue;
1694 }
1695 else {
1696 hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
1697 }
1698 p++;
1699 break;
1700
1701 case compound_tag:
1702 if (t == '[') {
1703 obrace++;
1704 }
1705 else if (t == ']') {
1706 ebrace++;
1707 }
1708 else if (t == '>' && obrace == ebrace) {
1709 state = tag_end_opening;
1710 cur_tag->content_offset = p - start + 1;
1711 continue;
1712 }
1713 p++;
1714 break;
1715
1716 case comment_tag:
1717 if (t != '-') {
1718 hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
1719 state = tag_end_opening;
1720 }
1721 else {
1722 p++;
1723 ebrace = 0;
1724 /*
1725 * https://www.w3.org/TR/2012/WD-html5-20120329/syntax.html#syntax-comments
1726 * ... the text must not start with a single
1727 * U+003E GREATER-THAN SIGN character (>),
1728 * nor start with a "-" (U+002D) character followed by
1729 * a U+003E GREATER-THAN SIGN (>) character,
1730 * nor contain two consecutive U+002D HYPHEN-MINUS
1731 * characters (--), nor end with a "-" (U+002D) character.
1732 */
1733 if (p[0] == '-' && p + 1 < end && p[1] == '>') {
1734 hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
1735 p++;
1736 state = tag_end_opening;
1737 }
1738 else if (*p == '>') {
1739 hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
1740 state = tag_end_opening;
1741 }
1742 else {
1743 state = comment_content;
1744 }
1745 }
1746 break;
1747
1748 case comment_content:
1749 if (t == '-') {
1750 ebrace++;
1751 }
1752 else if (t == '>' && ebrace >= 2) {
1753 cur_tag->content_offset = p - start + 1;
1754 state = tag_end_opening;
1755 continue;
1756 }
1757 else {
1758 ebrace = 0;
1759 }
1760
1761 p++;
1762 break;
1763
1764 case html_text_content:
1765 if (t != '<') {
1766 p++;
1767 }
1768 else {
1769 state = tag_begin;
1770 }
1771 break;
1772
1773 case tag_raw_text:
1774 if (t == '<') {
1775 c = p;
1776 state = tag_raw_text_less_than;
1777 }
1778 p ++;
1779 break;
1780 case tag_raw_text_less_than:
1781 if (t == '/') {
1782 /* Here are special things: we look for obrace and then ensure
1783 * that if there is any closing brace nearby
1784 * (we look maximum at 30 characters). We also need to ensure
1785 * that we have no special characters, such as punctuation marks and
1786 * so on.
1787 * Basically, we validate the input to be sane.
1788 * Since closing tags must not have attributes, these assumptions
1789 * seems to be reasonable enough for our toy parser.
1790 */
1791 gint cur_lookahead = 1;
1792 gint max_lookahead = MIN (end - p, 30);
1793 bool valid_closing_tag = true;
1794
1795 if (p + 1 < end && !g_ascii_isalpha (p[1])) {
1796 valid_closing_tag = false;
1797 }
1798 else {
1799 while (cur_lookahead < max_lookahead) {
1800 gchar tt = p[cur_lookahead];
1801 if (tt == '>') {
1802 break;
1803 }
1804 else if (tt < '\n' || tt == ',') {
1805 valid_closing_tag = false;
1806 break;
1807 }
1808 cur_lookahead ++;
1809 }
1810
1811 if (cur_lookahead == max_lookahead) {
1812 valid_closing_tag = false;
1813 }
1814 }
1815
1816 if (valid_closing_tag) {
1817 /* Shift back */
1818 p = c;
1819 state = tag_begin;
1820 }
1821 else {
1822 p ++;
1823 state = tag_raw_text;
1824 }
1825 }
1826 else {
1827 p ++;
1828 state = tag_raw_text;
1829 }
1830 break;
1831 case sgml_content:
1832 /* TODO: parse DOCTYPE here */
1833 if (t == '>') {
1834 cur_tag->content_offset = p - start + 1;
1835 state = tag_end_opening;
1836 }
1837 else {
1838 p++;
1839 }
1840 break;
1841
1842 case tag_content:
1843 html_parse_tag_content(pool, hc, cur_tag, p, content_parser_env);
1844
1845 if (t == '>') {
1846 if (closing) {
1847 cur_tag->closing.start = c - start;
1848 cur_tag->closing.end = p - start + 1;
1849
1850 closing = FALSE;
1851 state = tag_end_closing;
1852 }
1853 else {
1854 cur_tag->content_offset = p - start + 1;
1855 state = tag_end_opening;
1856 }
1857
1858
1859 continue;
1860 }
1861 p++;
1862 break;
1863
1864 case tag_end_opening:
1865 content_parser_env.reset();
1866 state = html_text_content;
1867
1868 if (cur_tag) {
1869 if (cur_tag->id == Tag_STYLE || cur_tag->id == Tag_NOSCRIPT || cur_tag->id == Tag_SCRIPT) {
1870 state = tag_raw_text;
1871 }
1872 if (html_document_state == html_document_state::doctype) {
1873 if (cur_tag->id == Tag_HEAD || (cur_tag->flags & CM_HEAD)) {
1874 html_document_state = html_document_state::head;
1875 cur_tag->flags |= FL_IGNORE;
1876 }
1877 else if (cur_tag->id != Tag_HTML) {
1878 html_document_state = html_document_state::body;
1879 }
1880 }
1881 else if (html_document_state == html_document_state::head) {
1882 if (!(cur_tag->flags & (CM_EMPTY | CM_HEAD))) {
1883 if (parent_tag && (parent_tag->id == Tag_HEAD || !(parent_tag->flags & CM_HEAD))) {
1884 /*
1885 * As by standard, we have to close the HEAD tag
1886 * and switch to the body state
1887 */
1888 parent_tag->flags |= FL_CLOSED;
1889 parent_tag->closing.start = cur_tag->tag_start;
1890 parent_tag->closing.end = cur_tag->content_offset;
1891
1892 html_document_state = html_document_state::body;
1893 }
1894 else if (cur_tag->id == Tag_BODY) {
1895 html_document_state = html_document_state::body;
1896 }
1897 else {
1898 /*
1899 * For propagation in something like
1900 * <title><p><a>ololo</a></p></title> - should be unprocessed
1901 */
1902 cur_tag->flags |= CM_HEAD;
1903 }
1904 }
1905 }
1906
1907 process_opening_tag();
1908 }
1909
1910 p++;
1911 c = p;
1912 break;
1913 case tag_end_closing: {
1914 if (cur_tag) {
1915
1916 if (cur_tag->flags & CM_EMPTY) {
1917 /* Ignore closing empty tags */
1918 cur_tag->flags |= FL_IGNORE;
1919 }
1920 if (html_document_state == html_document_state::doctype) {
1921
1922 }
1923 else if (html_document_state == html_document_state::head) {
1924 if (cur_tag->id == Tag_HEAD) {
1925 html_document_state = html_document_state::body;
1926 }
1927 }
1928
1929 /* cur_tag here is a closing tag */
1930 auto *next_cur_tag = html_check_balance(hc, cur_tag,
1931 c - start, p - start + 1);
1932
1933 if (cur_tag->id == Tag_STYLE && allow_css) {
1934 auto *opening_tag = cur_tag->parent;
1935
1936 if (opening_tag && opening_tag->id == Tag_STYLE &&
1937 (int)opening_tag->content_offset < opening_tag->closing.start) {
1938 auto ret_maybe = rspamd::css::parse_css(pool,
1939 {start + opening_tag->content_offset,
1940 opening_tag->closing.start - opening_tag->content_offset},
1941 std::move(hc->css_style));
1942
1943 if (!ret_maybe.has_value()) {
1944 if (ret_maybe.error().is_fatal()) {
1945 auto err_str = fmt::format(
1946 "cannot parse css (error code: {}): {}",
1947 static_cast<int>(ret_maybe.error().type),
1948 ret_maybe.error().description.value_or("unknown error"));
1949 msg_info_pool ("%*s", (int) err_str.size(), err_str.data());
1950 }
1951 }
1952 else {
1953 hc->css_style = ret_maybe.value();
1954 }
1955 }
1956 }
1957
1958 if (next_cur_tag != nullptr) {
1959 cur_tag = next_cur_tag;
1960 }
1961 else {
1962 /*
1963 * Here, we handle cases like <p>lala</b>...
1964 * So the tag </b> is bogus and unpaired
1965 * However, we need to exclude it from the output of <p> tag
1966 * To do that, we create a fake opening tag and insert that to
1967 * the current opening tag
1968 */
1969 auto *cur_opening_tag = cur_tag->parent;
1970
1971 while (cur_opening_tag && (cur_opening_tag->flags & FL_CLOSED)) {
1972 cur_opening_tag = cur_opening_tag->parent;
1973 }
1974
1975 if (!cur_opening_tag) {
1976 cur_opening_tag = hc->root_tag;
1977 }
1978
1979 auto &&vtag = std::make_unique<html_tag>();
1980 vtag->id = cur_tag->id;
1981 vtag->flags = FL_VIRTUAL | FL_CLOSED | cur_tag->flags;
1982 vtag->tag_start = cur_tag->closing.start;
1983 vtag->content_offset = p - start + 1;
1984 vtag->closing = cur_tag->closing;
1985 vtag->parent = cur_opening_tag;
1986 g_assert(vtag->parent != &cur_closing_tag);
1987 cur_opening_tag->children.push_back(vtag.get());
1988 hc->all_tags.emplace_back(std::move(vtag));
1989 cur_tag = cur_opening_tag;
1990 parent_tag = cur_tag->parent;
1991 g_assert(cur_tag->parent != &cur_closing_tag);
1992 }
1993 } /* if cur_tag != nullptr */
1994 state = html_text_content;
1995 p++;
1996 c = p;
1997 break;
1998 }
1999 case tags_limit_overflow:
2000 msg_warn_pool("tags limit of %d tags is reached at the position %d;"
2001 " ignoring the rest of the HTML content",
2002 (int) hc->all_tags.size(), (int) (p - start));
2003 c = p;
2004 p = end;
2005 break;
2006 }
2007 }
2008
2009 if (cur_tag && !(cur_tag->flags & FL_CLOSED) && cur_tag != &cur_closing_tag) {
2010 cur_closing_tag.parent = cur_tag;
2011 cur_closing_tag.id = cur_tag->id;
2012 cur_tag = &cur_closing_tag;
2013 html_check_balance(hc, cur_tag,
2014 end - start, end - start);
2015 }
2016
2017 /* Propagate styles */
2018 hc->traverse_block_tags([&hc, &pool](const html_tag *tag) -> bool {
2019
2020 if (hc->css_style && tag->id > Tag_UNKNOWN && tag->id < Tag_MAX) {
2021 auto *css_block = hc->css_style->check_tag_block(tag);
2022
2023 if (css_block) {
2024 if (tag->block) {
2025 tag->block->set_block(*css_block);
2026 }
2027 else {
2028 tag->block = css_block;
2029 }
2030 }
2031 }
2032 if (tag->block) {
2033 if (!tag->block->has_display()) {
2034 /* If we have no display field, we can check it by tag */
2035 if (tag->flags & CM_HEAD) {
2036 tag->block->set_display(css::css_display_value::DISPLAY_HIDDEN,
2037 html_block::set);
2038 }
2039 else if (tag->flags & (CM_BLOCK | CM_TABLE)) {
2040 tag->block->set_display(css::css_display_value::DISPLAY_BLOCK,
2041 html_block::implicit);
2042 }
2043 else if (tag->flags & CM_ROW) {
2044 tag->block->set_display(css::css_display_value::DISPLAY_TABLE_ROW,
2045 html_block::implicit);
2046 }
2047 else {
2048 tag->block->set_display(css::css_display_value::DISPLAY_INLINE,
2049 html_block::implicit);
2050 }
2051 }
2052
2053 tag->block->compute_visibility();
2054
2055 for (const auto *cld_tag : tag->children) {
2056
2057 if (cld_tag->block) {
2058 cld_tag->block->propagate_block(*tag->block);
2059 }
2060 else {
2061 cld_tag->block = rspamd_mempool_alloc0_type(pool, html_block);
2062 *cld_tag->block = *tag->block;
2063 }
2064 }
2065 }
2066 return true;
2067 }, html_content::traverse_type::PRE_ORDER);
2068
2069 /* Leftover before content */
2070 switch (state) {
2071 case tag_end_opening:
2072 if (cur_tag != nullptr) {
2073 process_opening_tag();
2074 }
2075 break;
2076 default:
2077 /* Do nothing */
2078 break;
2079 }
2080
2081 if (!hc->all_tags.empty() && hc->root_tag) {
2082 html_append_tag_content(pool, start, end - start, hc, hc->root_tag,
2083 exceptions, url_set);
2084 }
2085
2086 /* Leftover after content */
2087 switch (state) {
2088 case tags_limit_overflow:
2089 html_append_parsed(hc, {c, (std::size_t) (end - c)},
2090 false, end - start, hc->parsed);
2091 break;
2092 default:
2093 /* Do nothing */
2094 break;
2095 }
2096
2097 if (!hc->parsed.empty()) {
2098 /* Trim extra spaces at the at the end if needed */
2099 if (g_ascii_isspace(hc->parsed.back())) {
2100 auto last_it = std::end(hc->parsed);
2101
2102 /* Allow last newline */
2103 if (hc->parsed.back() == '\n') {
2104 --last_it;
2105 }
2106
2107 hc->parsed.erase(std::find_if(hc->parsed.rbegin(), hc->parsed.rend(),
2108 [](auto ch) -> auto {
2109 return !g_ascii_isspace(ch);
2110 }).base(),
2111 last_it);
2112 }
2113 }
2114
2115 return hc;
2116 }
2117
2118 static auto
html_find_image_by_cid(const html_content & hc,std::string_view cid)2119 html_find_image_by_cid(const html_content &hc, std::string_view cid)
2120 -> std::optional<const html_image *>
2121 {
2122 for (const auto *html_image : hc.images) {
2123 /* Filter embedded images */
2124 if (html_image->flags & RSPAMD_HTML_FLAG_IMAGE_EMBEDDED &&
2125 html_image->src != nullptr) {
2126 if (cid == html_image->src) {
2127 return html_image;
2128 }
2129 }
2130 }
2131
2132 return std::nullopt;
2133 }
2134
2135 auto
html_debug_structure(const html_content & hc)2136 html_debug_structure(const html_content &hc) -> std::string
2137 {
2138 std::string output;
2139
2140 if (hc.root_tag) {
2141 auto rec_functor = [&](const html_tag *t, int level, auto rec_functor) -> void {
2142 std::string pluses(level, '+');
2143
2144 if (!(t->flags & (FL_VIRTUAL | FL_IGNORE))) {
2145 if (t->flags & FL_XML) {
2146 output += fmt::format("{}xml;", pluses);
2147 }
2148 else {
2149 output += fmt::format("{}{};", pluses,
2150 html_tags_defs.name_by_id_safe(t->id));
2151 }
2152 level++;
2153 }
2154 for (const auto *cld : t->children) {
2155 rec_functor(cld, level, rec_functor);
2156 }
2157 };
2158
2159 rec_functor(hc.root_tag, 1, rec_functor);
2160 }
2161
2162 return output;
2163 }
2164
html_tag_by_name(const std::string_view & name)2165 auto html_tag_by_name(const std::string_view &name)
2166 -> std::optional<tag_id_t>
2167 {
2168 const auto *td = rspamd::html::html_tags_defs.by_name(name);
2169
2170 if (td != nullptr) {
2171 return td->id;
2172 }
2173
2174 return std::nullopt;
2175 }
2176
2177 auto
get_content(const struct html_content * hc) const2178 html_tag::get_content(const struct html_content *hc) const -> std::string_view
2179 {
2180 const std::string *dest = &hc->parsed;
2181
2182 if (block && !block->is_visible()) {
2183 dest = &hc->invisible;
2184 }
2185 const auto clen = get_content_length();
2186 if (content_offset < dest->size()) {
2187 if (dest->size() - content_offset >= clen) {
2188 return std::string_view{*dest}.substr(content_offset, clen);
2189 }
2190 else {
2191 return std::string_view{*dest}.substr(content_offset, dest->size() - content_offset);
2192 }
2193 }
2194
2195 return std::string_view{};
2196 }
2197
2198 }
2199
2200 void *
rspamd_html_process_part_full(rspamd_mempool_t * pool,GByteArray * in,GList ** exceptions,khash_t (rspamd_url_hash)* url_set,GPtrArray * part_urls,bool allow_css)2201 rspamd_html_process_part_full(rspamd_mempool_t *pool,
2202 GByteArray *in, GList **exceptions,
2203 khash_t (rspamd_url_hash) *url_set,
2204 GPtrArray *part_urls,
2205 bool allow_css)
2206 {
2207 return rspamd::html::html_process_input(pool, in, exceptions, url_set,
2208 part_urls, allow_css);
2209 }
2210
2211 void *
rspamd_html_process_part(rspamd_mempool_t * pool,GByteArray * in)2212 rspamd_html_process_part(rspamd_mempool_t *pool,
2213 GByteArray *in)
2214 {
2215 return rspamd_html_process_part_full (pool, in, NULL,
2216 NULL, NULL, FALSE);
2217 }
2218
2219 guint
rspamd_html_decode_entitles_inplace(gchar * s,gsize len)2220 rspamd_html_decode_entitles_inplace (gchar *s, gsize len)
2221 {
2222 return rspamd::html::decode_html_entitles_inplace(s, len);
2223 }
2224
2225 gint
rspamd_html_tag_by_name(const gchar * name)2226 rspamd_html_tag_by_name(const gchar *name)
2227 {
2228 const auto *td = rspamd::html::html_tags_defs.by_name(name);
2229
2230 if (td != nullptr) {
2231 return td->id;
2232 }
2233
2234 return -1;
2235 }
2236
2237 gboolean
rspamd_html_tag_seen(void * ptr,const gchar * tagname)2238 rspamd_html_tag_seen(void *ptr, const gchar *tagname)
2239 {
2240 gint id;
2241 auto *hc = rspamd::html::html_content::from_ptr(ptr);
2242
2243 g_assert (hc != NULL);
2244
2245 id = rspamd_html_tag_by_name(tagname);
2246
2247 if (id != -1) {
2248 return hc->tags_seen[id];
2249 }
2250
2251 return FALSE;
2252 }
2253
2254 const gchar *
rspamd_html_tag_by_id(gint id)2255 rspamd_html_tag_by_id(gint id)
2256 {
2257 if (id > Tag_UNKNOWN && id < Tag_MAX) {
2258 const auto *td = rspamd::html::html_tags_defs.by_id(id);
2259
2260 if (td != nullptr) {
2261 return td->name.c_str();
2262 }
2263 }
2264
2265 return nullptr;
2266 }
2267
2268 const gchar *
rspamd_html_tag_name(void * p,gsize * len)2269 rspamd_html_tag_name(void *p, gsize *len)
2270 {
2271 auto *tag = reinterpret_cast<rspamd::html::html_tag *>(p);
2272 auto tname = rspamd::html::html_tags_defs.name_by_id_safe(tag->id);
2273
2274 if (len) {
2275 *len = tname.size();
2276 }
2277
2278 return tname.data();
2279 }
2280
2281 struct html_image*
rspamd_html_find_embedded_image(void * html_content,const char * cid,gsize cid_len)2282 rspamd_html_find_embedded_image(void *html_content,
2283 const char *cid, gsize cid_len)
2284 {
2285 auto *hc = rspamd::html::html_content::from_ptr(html_content);
2286
2287 auto maybe_img = rspamd::html::html_find_image_by_cid(*hc, {cid, cid_len});
2288
2289 if (maybe_img) {
2290 return (html_image *)maybe_img.value();
2291 }
2292
2293 return nullptr;
2294 }
2295
2296 bool
rspamd_html_get_parsed_content(void * html_content,rspamd_ftok_t * dest)2297 rspamd_html_get_parsed_content(void *html_content, rspamd_ftok_t *dest)
2298 {
2299 auto *hc = rspamd::html::html_content::from_ptr(html_content);
2300
2301 dest->begin = hc->parsed.data();
2302 dest->len = hc->parsed.size();
2303
2304 return true;
2305 }
2306
2307 gsize
rspamd_html_get_tags_count(void * html_content)2308 rspamd_html_get_tags_count(void *html_content)
2309 {
2310 auto *hc = rspamd::html::html_content::from_ptr(html_content);
2311
2312 if (!hc) {
2313 return 0;
2314 }
2315
2316 return hc->all_tags.size();
2317 }