1 /*-
2  * Copyright 2021 Vsevolod Stakhov
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *   http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 #include "config.h"
17 #include "util.h"
18 #include "message.h"
19 #include "html.h"
20 #include "html_tags.h"
21 #include "html_block.hxx"
22 #include "html.hxx"
23 #include "libserver/css/css_value.hxx"
24 #include "libserver/css/css.hxx"
25 
26 #include "url.h"
27 #include "contrib/libucl/khash.h"
28 #include "libmime/images.h"
29 #include "libutil/cxx/utf8_util.h"
30 
31 #include "html_tag_defs.hxx"
32 #include "html_entities.hxx"
33 #include "html_tag.hxx"
34 #include "html_url.hxx"
35 
36 #include <frozen/unordered_map.h>
37 #include <frozen/string.h>
38 #include <fmt/core.h>
39 
40 #include <unicode/uversion.h>
41 
42 namespace rspamd::html {
43 
/* Upper bound on the number of tags handled for a single document */
static const guint max_tags = 8192; /* Ignore tags if this maximum is reached */

/*
 * Storage of the tag definitions; html_parse_tag_content() resolves a parsed
 * tag name to its id and flags through html_tags_defs.by_name()
 */
static const html_tags_storage html_tags_defs;

/*
 * Compile-time map from an attribute name to the component type stored in
 * html_tag::components. Attributes that are not listed here are not stored.
 *
 * All keys are lower case: the tag parser lower-cases attribute names before
 * looking them up. Note that `href`, `src` and `action` all map to the same
 * RSPAMD_HTML_COMPONENT_HREF type.
 */
auto html_components_map = frozen::make_unordered_map<frozen::string, html_component_type>(
		{
				{"name",    html_component_type::RSPAMD_HTML_COMPONENT_NAME},
				{"href",    html_component_type::RSPAMD_HTML_COMPONENT_HREF},
				{"src",     html_component_type::RSPAMD_HTML_COMPONENT_HREF},
				{"action",  html_component_type::RSPAMD_HTML_COMPONENT_HREF},
				{"color",   html_component_type::RSPAMD_HTML_COMPONENT_COLOR},
				{"bgcolor", html_component_type::RSPAMD_HTML_COMPONENT_BGCOLOR},
				{"style",   html_component_type::RSPAMD_HTML_COMPONENT_STYLE},
				{"class",   html_component_type::RSPAMD_HTML_COMPONENT_CLASS},
				{"width",   html_component_type::RSPAMD_HTML_COMPONENT_WIDTH},
				{"height",  html_component_type::RSPAMD_HTML_COMPONENT_HEIGHT},
				{"size",    html_component_type::RSPAMD_HTML_COMPONENT_SIZE},
				{"rel",     html_component_type::RSPAMD_HTML_COMPONENT_REL},
				{"alt",     html_component_type::RSPAMD_HTML_COMPONENT_ALT},
				{"id",      html_component_type::RSPAMD_HTML_COMPONENT_ID},
				{"hidden",  html_component_type::RSPAMD_HTML_COMPONENT_HIDDEN},
		});
66 
/*
 * Debug logging helper for the "html" log module.
 *
 * The expansion reads `pool->tag.uid`, so a variable named `pool` must be in
 * scope at every place where this macro is used.
 */
#define msg_debug_html(...)  rspamd_conditional_debug_fast (NULL, NULL, \
        rspamd_html_log_id, "html", pool->tag.uid, \
        __FUNCTION__, \
        __VA_ARGS__)

/*
 * NOTE(review): presumably this is what provides `rspamd_html_log_id` used by
 * msg_debug_html above -- confirm against the definition of INIT_LOG_MODULE
 */
INIT_LOG_MODULE(html)
73 
74 /*
75  * This function is expected to be called on a closing tag to fill up all tags
76  * and return the current parent (meaning unclosed) tag
77  */
78 static auto
79 html_check_balance(struct html_content *hc,
80 				   struct html_tag *tag,
81 				   goffset tag_start_offset,
82 				   goffset tag_end_offset) -> html_tag *
83 {
84 	/* As agreed, the closing tag has the last opening at the parent ptr */
85 	auto *opening_tag = tag->parent;
86 
87 	auto calculate_content_length = [tag_start_offset, tag_end_offset](html_tag *t) {
88 		auto opening_content_offset = t->content_offset;
89 
90 		if (t->flags & (CM_EMPTY)) {
91 			/* Attach closing tag just at the opening tag */
92 			t->closing.start = t->tag_start;
93 			t->closing.end = t->content_offset;
94 		}
95 		else {
96 
97 			if (opening_content_offset <= tag_start_offset) {
98 				t->closing.start = tag_start_offset;
99 				t->closing.end = tag_end_offset;
100 			}
101 			else {
102 
103 				t->closing.start = t->content_offset;
104 				t->closing.end = tag_end_offset;
105 			}
106 		}
107 	};
108 
109 	auto balance_tag = [&]() -> html_tag * {
110 		auto it = tag->parent;
111 		auto found_pair = false;
112 
113 		for (; it != nullptr; it = it->parent) {
114 			if (it->id == tag->id && !(it->flags & FL_CLOSED)) {
115 				found_pair = true;
116 				break;
117 			}
118 
119 		}
120 
121 		/*
122 		 * If we have found a closing pair, then we need to close all tags and
123 		 * return the top-most tag
124 		 */
125 		if (found_pair) {
126 			for (it = tag->parent; it != nullptr; it = it->parent) {
127 				it->flags |= FL_CLOSED;
128 				/* Insert a virtual closing tag for all tags that are not closed */
129 				calculate_content_length(it);
130 				if (it->id == tag->id && !(it->flags & FL_CLOSED)) {
131 					break;
132 				}
133 			}
134 
135 			return it;
136 		}
137 		else {
138 			/*
139 			 * We have not found a pair, so this closing tag is bogus and should
140 			 * be ignored completely.
141 			 * Unfortunately, it also means that we need to insert another tag,
142 			 * as the current closing tag is unusable for that purposes.
143 			 *
144 			 * We assume that callee will recognise that and reconstruct the
145 			 * tag at the tag_end_closing state, so we return nullptr...
146 			 */
147 
148 		}
149 
150 		/* Tag must be ignored and reconstructed */
151 		return nullptr;
152 	};
153 
154 	if (opening_tag) {
155 
156 		if (opening_tag->id == tag->id) {
157 			opening_tag->flags |= FL_CLOSED;
158 
159 			calculate_content_length(opening_tag);
160 			/* All good */
161 			return opening_tag->parent;
162 		}
163 		else {
164 			return balance_tag();
165 		}
166 	}
167 	else {
168 		/*
169 		 * We have no opening tag
170 		 * There are two possibilities:
171 		 *
172 		 * 1) We have some block tag in hc->all_tags;
173 		 * 2) We have no tags
174 		 */
175 
176 		if (hc->all_tags.empty()) {
177 			hc->all_tags.push_back(std::make_unique<html_tag>());
178 			auto *vtag = hc->all_tags.back().get();
179 			vtag->id = Tag_HTML;
180 			vtag->flags = FL_VIRTUAL;
181 			vtag->tag_start = 0;
182 			vtag->content_offset = 0;
183 			calculate_content_length(vtag);
184 
185 			if (!hc->root_tag) {
186 				hc->root_tag = vtag;
187 			}
188 			else {
189 				vtag->parent = hc->root_tag;
190 			}
191 
192 			tag->parent = vtag;
193 
194 			/* Recursively call with a virtual <html> tag inserted */
195 			return html_check_balance(hc, tag, tag_start_offset, tag_end_offset);
196 		}
197 	}
198 
199 	return nullptr;
200 }
201 
202 auto
html_component_from_string(const std::string_view & st)203 html_component_from_string(const std::string_view &st) -> std::optional<html_component_type>
204 {
205 	auto known_component_it = html_components_map.find(st);
206 
207 	if (known_component_it != html_components_map.end()) {
208 		return known_component_it->second;
209 	}
210 	else {
211 		return std::nullopt;
212 	}
213 }
214 
215 struct tag_content_parser_state {
216 	int cur_state = 0;
217 	std::string buf;
218 	std::optional<html_component_type> cur_component;
219 
resetrspamd::html::tag_content_parser_state220 	void reset()
221 	{
222 		cur_state = 0;
223 		buf.clear();
224 		cur_component = std::nullopt;
225 	}
226 };
227 
/*
 * Feeds one character of a tag's content to the tag parsing automaton.
 *
 * pool       - memory pool used to store the attribute values persistently
 * hc         - document being parsed; only its flags are updated here
 * tag        - the tag being parsed: receives id, flags and components
 * in         - pointer to the character to process; only `*in` is read
 * parser_env - automaton state carried between calls: the state is loaded from
 *              parser_env.cur_state on entry and written back on exit
 *
 * Tag and attribute names are lower-cased while being collected; attribute
 * values are collected as is (apart from the first character of an unquoted
 * value, which is lower-cased as well).
 */
static inline void
html_parse_tag_content(rspamd_mempool_t *pool,
					   struct html_content *hc,
					   struct html_tag *tag,
					   const char *in,
					   struct tag_content_parser_state &parser_env)
{
	enum tag_parser_state {
		parse_start = 0,
		parse_name,
		parse_attr_name,
		parse_equal,
		parse_start_dquote,
		parse_dqvalue,
		parse_end_dquote,
		parse_start_squote,
		parse_sqvalue,
		parse_end_squote,
		parse_value,
		spaces_before_eq,
		spaces_after_eq,
		spaces_after_param,
		ignore_bad_tag,
		tag_end,
		slash_after_value,
		slash_in_unqouted_value,
	} state;

	/* Resume from where the previous call has stopped */
	state = static_cast<enum tag_parser_state>(parser_env.cur_state);

	/*
	 * Stores tag component if the current attribute is a known one, performing
	 * copy of the value + decoding of the entities.
	 * Parser env is always cleared afterwards (buf and cur_component), even if
	 * nothing has been stored.
	 */
	auto store_component_value = [&]() -> void {
		if (parser_env.cur_component) {

			if (parser_env.buf.empty()) {
				/* Known attribute without a value */
				tag->components.emplace_back(parser_env.cur_component.value(),
						std::string_view{});
			}
			else {
				/* We need to copy buf to a persistent storage */
				auto *s = rspamd_mempool_alloc_buffer(pool, parser_env.buf.size());

				if (parser_env.cur_component.value() == html_component_type::RSPAMD_HTML_COMPONENT_ID ||
						parser_env.cur_component.value() == html_component_type::RSPAMD_HTML_COMPONENT_CLASS) {
					/* Lowercase */
					rspamd_str_copy_lc(parser_env.buf.data(), s, parser_env.buf.size());
				}
				else {
					memcpy(s, parser_env.buf.data(), parser_env.buf.size());
				}

				/* The stored view covers the decoded length only */
				auto sz = rspamd_html_decode_entitles_inplace(s, parser_env.buf.size());
				tag->components.emplace_back(parser_env.cur_component.value(),
						std::string_view{s, sz});
			}
		}

		parser_env.buf.clear();
		parser_env.cur_component = std::nullopt;
	};

	/*
	 * Treats buf as an attribute name: decodes entities in it, looks it up in
	 * html_components_map and remembers the result in cur_component.
	 * Returns true if the attribute is a known one; buf is cleared in any case.
	 */
	auto store_component_name = [&]() -> bool {
		decode_html_entitles_inplace(parser_env.buf);
		auto known_component_it = html_components_map.find(std::string_view{parser_env.buf});
		parser_env.buf.clear();

		if (known_component_it != html_components_map.end()) {
			parser_env.cur_component = known_component_it->second;

			return true;
		}
		else {
			parser_env.cur_component = std::nullopt;
		}

		return false;
	};

	/* Appends the current character to buf, lower-casing it if `lc` is set */
	auto store_value_character = [&](bool lc) -> void {
		auto c = lc ? g_ascii_tolower(*in) : *in;

		if (c == '\0') {
			/* Replace NUL with U+FFFD (the replacement character) */
			parser_env.buf.append(u8"\uFFFD");
		}
		else {
			parser_env.buf.push_back(c);
		}
	};

	switch (state) {
	case parse_start:
		/* Leading spaces are skipped, anything but a letter makes the tag bad */
		if (!g_ascii_isalpha (*in) && !g_ascii_isspace (*in)) {
			hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
			state = ignore_bad_tag;
			tag->id = N_TAGS;
			tag->flags |= FL_BROKEN;
		}
		else if (g_ascii_isalpha (*in)) {
			state = parse_name;
			store_value_character(true);
		}
		break;

	case parse_name:
		if ((g_ascii_isspace (*in) || *in == '>' || *in == '/')) {
			if (*in == '/') {
				tag->flags |= FL_CLOSED;
			}

			if (parser_env.buf.empty()) {
				hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
				tag->id = N_TAGS;
				tag->flags |= FL_BROKEN;
				state = ignore_bad_tag;
			}
			else {
				decode_html_entitles_inplace(parser_env.buf);
				const auto *tag_def = rspamd::html::html_tags_defs.by_name(parser_env.buf);

				if (tag_def == nullptr) {
					hc->flags |= RSPAMD_HTML_FLAG_UNKNOWN_ELEMENTS;
					/* Assign -hash to match closing tag if needed */
					auto nhash = static_cast<std::int32_t>(std::hash<std::string>{}(parser_env.buf));
					/* Always negative */
					tag->id = static_cast<tag_id_t>(nhash | G_MININT32);
				}
				else {
					tag->id = tag_def->id;
					/*
					 * NOTE(review): this is an assignment, so it drops the flags
					 * set on the tag before, including FL_CLOSED set a few lines
					 * above for `/` -- confirm that this is intended
					 */
					tag->flags = tag_def->flags;
				}

				parser_env.buf.clear();

				state = spaces_after_param;
			}
		}
		else {
			store_value_character(true);
		}
		break;

	case parse_attr_name:
		if (*in == '=') {
			if (!parser_env.buf.empty()) {
				store_component_name();
			}
			state = parse_equal;
		}
		else if (g_ascii_isspace(*in)) {
			store_component_name();
			state = spaces_before_eq;
		}
		else if (*in == '/') {
			/* Attribute without a value followed by a slash */
			store_component_name();
			store_component_value();
			state = slash_after_value;
		}
		else if (*in == '>') {
			/* Attribute without a value at the end of the tag */
			store_component_name();
			store_component_value();
			state = tag_end;
		}
		else {
			if (*in == '"' || *in == '\'' || *in == '<') {
				/* Should never be in attribute names but ignored */
				tag->flags |= FL_BROKEN;
			}

			store_value_character(true);
		}

		break;

	case spaces_before_eq:
		if (*in == '=') {
			state = parse_equal;
		}
		else if (!g_ascii_isspace (*in)) {
			/*
			 * HTML defines that crap could still be restored and
			 * calculated somehow... So we have to follow this stupid behaviour
			 */
			/*
			 * TODO: estimate what insane things do email clients in each case
			 */
			if (*in == '>') {
				/*
				 * Attribute name followed by end of tag
				 * Should be okay (empty attribute). The rest is handled outside
				 * this automata.
				 */
				store_component_value();
				state = tag_end;
			}
			else if (*in == '"' || *in == '\'' || *in == '<') {
				/* Attribute followed by quote... Missing '=' ? Dunno, need to test */
				hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
				tag->flags |= FL_BROKEN;
				store_component_value();
				store_value_character(true);
				state = spaces_after_param;
			}
			else {
				/* Empty attribute */
				store_component_value();
				store_value_character(true);
				state = spaces_after_param;
			}
		}
		break;

	case spaces_after_eq:
		if (*in == '"') {
			state = parse_start_dquote;
		}
		else if (*in == '\'') {
			state = parse_start_squote;
		}
		else if (!g_ascii_isspace (*in)) {
			store_value_character(true);
			state = parse_value;
		}
		break;

	case parse_equal:
		if (g_ascii_isspace (*in)) {
			state = spaces_after_eq;
		}
		else if (*in == '"') {
			state = parse_start_dquote;
		}
		else if (*in == '\'') {
			state = parse_start_squote;
		}
		else {
			store_value_character(true);
			state = parse_value;
		}
		break;

	case parse_start_dquote:
		if (*in == '"') {
			/*
			 * NOTE(review): an empty quoted value ("") gets here without
			 * store_component_value() being called, so no component is stored
			 * for it and cur_component stays set -- confirm that this is intended
			 */
			state = spaces_after_param;
		}
		else {
			store_value_character(false);
			state = parse_dqvalue;
		}
		break;

	case parse_start_squote:
		if (*in == '\'') {
			/* NOTE(review): same as for the empty double quoted value above */
			state = spaces_after_param;
		}
		else {
			store_value_character(false);
			state = parse_sqvalue;
		}
		break;

	case parse_dqvalue:
		if (*in == '"') {
			store_component_value();
			state = parse_end_dquote;
		}
		else {
			store_value_character(false);
		}
		break;

	case parse_sqvalue:
		if (*in == '\'') {
			store_component_value();
			state = parse_end_squote;
		}
		else {
			store_value_character(false);
		}

		break;

	case parse_value:
		if (*in == '/') {
			/* Either a part of the value or the closing slash, decided later */
			state = slash_in_unqouted_value;
		}
		else if (g_ascii_isspace (*in) || *in == '>' || *in == '"') {
			store_component_value();
			state = spaces_after_param;
		}
		else {
			store_value_character(false);
		}
		break;

	case parse_end_dquote:
	case parse_end_squote:
		/* The value itself has been stored when the closing quote was met */
		if (g_ascii_isspace (*in)) {
			state = spaces_after_param;
		}
		else if (*in == '/') {
			store_component_value();
			store_value_character(true);
			state = slash_after_value;
		}
		else {
			/* No space, proceed immediately to the attribute name */
			state = parse_attr_name;
			store_component_value();
			store_value_character(true);
		}
		break;

	case spaces_after_param:
		if (!g_ascii_isspace (*in)) {
			if (*in == '/') {
				state = slash_after_value;
			}
			else if (*in == '=') {
				/* Attributes cannot start with '=' */
				tag->flags |= FL_BROKEN;
				store_value_character(true);
				state = parse_attr_name;
			}
			else {
				store_value_character(true);
				state = parse_attr_name;
			}
		}
		break;
	case slash_after_value:
		if (*in == '>') {
			tag->flags |= FL_CLOSED;
			state = tag_end;
		}
		else if (!g_ascii_isspace(*in)) {
			/*
			 * NOTE(review): the current character is not appended to buf
			 * here, so the attribute name parsed next lacks its first
			 * character -- confirm that this is intended
			 */
			tag->flags |= FL_BROKEN;
			state = parse_attr_name;
		}
		break;
	case slash_in_unqouted_value:
		if (*in == '>') {
			/* That slash was in fact closing tag slash, wohoo */
			tag->flags |= FL_CLOSED;
			state = tag_end;
			store_component_value();
		}
		else {
			/* Welcome to the world of html, revert state and save missing / */
			parser_env.buf.push_back('/');
			store_value_character(false);
			state = parse_value;
		}
		break;
	case ignore_bad_tag:
	case tag_end:
		/* Terminal states: the input is ignored */
		break;
	}

	parser_env.cur_state = state;
}
594 
/*
 * Checks whether `st` looks like an absolute url, which must not be resolved
 * against the base url of a document.
 *
 * The string must start with a non-empty run of ASCII letters and digits (the
 * scheme) followed by a colon. It is considered to be absolute when:
 *  - the scheme is `mailto`, compared case-insensitively as url schemes are
 *    case-insensitive (so `MAILTO:` is handled in the same way as `mailto:`);
 *  - or the colon is followed by a slash or by a backslash (the latter is
 *    accepted to include even malformed urls).
 *
 * Returns false for everything else, including an empty string, strings that
 * start with a slash, and `scheme:` forms without a slash such as `data:`.
 */
static inline auto
html_is_absolute_url(std::string_view st) -> bool
{
	/* Locale independent ASCII helpers */
	constexpr auto is_ascii_alnum = [](char c) -> bool {
		return (c >= '0' && c <= '9') ||
			   (c >= 'a' && c <= 'z') ||
			   (c >= 'A' && c <= 'Z');
	};
	constexpr auto ascii_lower = [](char c) -> char {
		return (c >= 'A' && c <= 'Z') ? static_cast<char>(c - 'A' + 'a') : c;
	};

	const auto scheme_end = std::find_if_not(st.begin(), st.end(), is_ascii_alnum);

	if (scheme_end == st.begin() || scheme_end == st.end() || *scheme_end != ':') {
		/* No scheme at all, or no colon after it */
		return false;
	}

	const auto scheme = st.substr(0, std::distance(st.begin(), scheme_end));
	constexpr std::string_view mailto{"mailto"};

	if (scheme.size() == mailto.size() &&
		std::equal(scheme.begin(), scheme.end(), mailto.begin(),
				[ascii_lower](char lhs, char rhs) { return ascii_lower(lhs) == rhs; })) {
		return true;
	}

	const auto after_colon = std::next(scheme_end);

	/* Include even malformed urls */
	return after_colon != st.end() && (*after_colon == '/' || *after_colon == '\\');
}
619 
620 static auto
html_process_url_tag(rspamd_mempool_t * pool,struct html_tag * tag,struct html_content * hc)621 html_process_url_tag(rspamd_mempool_t *pool,
622 					 struct html_tag *tag,
623 					 struct html_content *hc) -> std::optional<struct rspamd_url *>
624 {
625 	auto found_href_maybe = tag->find_component(html_component_type::RSPAMD_HTML_COMPONENT_HREF);
626 
627 	if (found_href_maybe) {
628 		/* Check base url */
629 		auto &href_value = found_href_maybe.value();
630 
631 		if (hc && hc->base_url) {
632 			/*
633 			 * Relative url cannot start from the following:
634 			 * schema://
635 			 * data:
636 			 * slash
637 			 */
638 
639 			if (!html_is_absolute_url(href_value)) {
640 
641 				if (href_value.size() >= sizeof("data:") &&
642 					g_ascii_strncasecmp(href_value.data(), "data:", sizeof("data:") - 1) == 0) {
643 					/* Image data url, never insert as url */
644 					return std::nullopt;
645 				}
646 
647 				/* Assume relative url */
648 				auto need_slash = false;
649 
650 				auto orig_len = href_value.size();
651 				auto len = orig_len + hc->base_url->urllen;
652 
653 				if (hc->base_url->datalen == 0) {
654 					need_slash = true;
655 					len++;
656 				}
657 
658 				auto *buf = rspamd_mempool_alloc_buffer(pool, len + 1);
659 				auto nlen = (std::size_t) rspamd_snprintf(buf, len + 1,
660 						"%*s%s%*s",
661 						(int) hc->base_url->urllen, hc->base_url->string,
662 						need_slash ? "/" : "",
663 						(gint) orig_len, href_value.data());
664 				href_value = {buf, nlen};
665 			}
666 			else if (href_value.size() > 2 && href_value[0] == '/' && href_value[1] != '/') {
667 				/* Relative to the hostname */
668 				auto orig_len = href_value.size();
669 				auto len = orig_len + hc->base_url->hostlen + hc->base_url->protocollen +
670 						   3 /* for :// */;
671 				auto *buf = rspamd_mempool_alloc_buffer(pool, len + 1);
672 				auto nlen = (std::size_t) rspamd_snprintf(buf, len + 1, "%*s://%*s/%*s",
673 						(int) hc->base_url->protocollen, hc->base_url->string,
674 						(int) hc->base_url->hostlen, rspamd_url_host_unsafe (hc->base_url),
675 						(gint) orig_len, href_value.data());
676 				href_value = {buf, nlen};
677 			}
678 		}
679 
680 		auto url = html_process_url(pool, href_value);
681 
682 		if (url && std::holds_alternative<std::monostate>(tag->extra)) {
683 			tag->extra = url.value();
684 		}
685 
686 		return url;
687 	}
688 
689 	return std::nullopt;
690 }
691 
/*
 * User data passed to html_url_query_callback() while the query part of an
 * url is being scanned for nested urls (see html_process_query_url()).
 */
struct rspamd_html_url_query_cbd {
	/* Memory pool; the callback needs it for debug logging */
	rspamd_mempool_t *pool;
	/* Set of urls where the urls found in the query are added */
	khash_t (rspamd_url_hash) *url_set;
	/* The url whose query is being scanned */
	struct rspamd_url *url;
	/* Array that receives the urls newly added to the set, may be NULL */
	GPtrArray *part_urls;
};
698 
699 static gboolean
html_url_query_callback(struct rspamd_url * url,gsize start_offset,gsize end_offset,gpointer ud)700 html_url_query_callback(struct rspamd_url *url, gsize start_offset,
701 						gsize end_offset, gpointer ud)
702 {
703 	struct rspamd_html_url_query_cbd *cbd =
704 			(struct rspamd_html_url_query_cbd *) ud;
705 	rspamd_mempool_t *pool;
706 
707 	pool = cbd->pool;
708 
709 	if (url->protocol == PROTOCOL_MAILTO) {
710 		if (url->userlen == 0) {
711 			return FALSE;
712 		}
713 	}
714 
715 	msg_debug_html ("found url %s in query of url"
716 					" %*s", url->string,
717 			cbd->url->querylen, rspamd_url_query_unsafe(cbd->url));
718 
719 	url->flags |= RSPAMD_URL_FLAG_QUERY;
720 
721 	if (rspamd_url_set_add_or_increase(cbd->url_set, url, false)
722 		&& cbd->part_urls) {
723 		g_ptr_array_add(cbd->part_urls, url);
724 	}
725 
726 	return TRUE;
727 }
728 
729 static void
html_process_query_url(rspamd_mempool_t * pool,struct rspamd_url * url,khash_t (rspamd_url_hash)* url_set,GPtrArray * part_urls)730 html_process_query_url(rspamd_mempool_t *pool, struct rspamd_url *url,
731 					   khash_t (rspamd_url_hash) *url_set,
732 					   GPtrArray *part_urls)
733 {
734 	if (url->querylen > 0) {
735 		struct rspamd_html_url_query_cbd qcbd;
736 
737 		qcbd.pool = pool;
738 		qcbd.url_set = url_set;
739 		qcbd.url = url;
740 		qcbd.part_urls = part_urls;
741 
742 		rspamd_url_find_multiple(pool,
743 				rspamd_url_query_unsafe (url), url->querylen,
744 				RSPAMD_URL_FIND_ALL, NULL,
745 				html_url_query_callback, &qcbd);
746 	}
747 
748 	if (part_urls) {
749 		g_ptr_array_add(part_urls, url);
750 	}
751 }
752 
753 static auto
html_process_data_image(rspamd_mempool_t * pool,struct html_image * img,std::string_view input)754 html_process_data_image(rspamd_mempool_t *pool,
755 						struct html_image *img,
756 						std::string_view input) -> void
757 {
758 	/*
759 	 * Here, we do very basic processing of the data:
760 	 * detect if we have something like: `data:image/xxx;base64,yyyzzz==`
761 	 * We only parse base64 encoded data.
762 	 * We ignore content type so far
763 	 */
764 	struct rspamd_image *parsed_image;
765 	const gchar *semicolon_pos = input.data(),
766 			*end = input.data() + input.size();
767 
768 	if ((semicolon_pos = (const gchar *) memchr(semicolon_pos, ';', end - semicolon_pos)) != NULL) {
769 		if (end - semicolon_pos > sizeof("base64,")) {
770 			if (memcmp(semicolon_pos + 1, "base64,", sizeof("base64,") - 1) == 0) {
771 				const gchar *data_pos = semicolon_pos + sizeof("base64,");
772 				gchar *decoded;
773 				gsize encoded_len = end - data_pos, decoded_len;
774 				rspamd_ftok_t inp;
775 
776 				decoded_len = (encoded_len / 4 * 3) + 12;
777 				decoded = rspamd_mempool_alloc_buffer(pool, decoded_len);
778 				rspamd_cryptobox_base64_decode(data_pos, encoded_len,
779 						reinterpret_cast<guchar *>(decoded), &decoded_len);
780 				inp.begin = decoded;
781 				inp.len = decoded_len;
782 
783 				parsed_image = rspamd_maybe_process_image(pool, &inp);
784 
785 				if (parsed_image) {
786 					msg_debug_html ("detected %s image of size %ud x %ud in data url",
787 							rspamd_image_type_str(parsed_image->type),
788 							parsed_image->width, parsed_image->height);
789 					img->embedded_image = parsed_image;
790 				}
791 			}
792 		}
793 		else {
794 			/* Nothing useful */
795 			return;
796 		}
797 	}
798 }
799 
800 static void
html_process_img_tag(rspamd_mempool_t * pool,struct html_tag * tag,struct html_content * hc,khash_t (rspamd_url_hash)* url_set,GPtrArray * part_urls)801 html_process_img_tag(rspamd_mempool_t *pool,
802 					 struct html_tag *tag,
803 					 struct html_content *hc,
804 					 khash_t (rspamd_url_hash) *url_set,
805 					 GPtrArray *part_urls)
806 {
807 	struct html_image *img;
808 
809 	img = rspamd_mempool_alloc0_type (pool, struct html_image);
810 	img->tag = tag;
811 
812 	for (const auto &param : tag->components) {
813 
814 		if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_HREF) {
815 			/* Check base url */
816 			const auto &href_value = param.value;
817 
818 			if (href_value.size() > 0) {
819 				rspamd_ftok_t fstr;
820 				fstr.begin = href_value.data();
821 				fstr.len = href_value.size();
822 				img->src = rspamd_mempool_ftokdup (pool, &fstr);
823 
824 				if (href_value.size() > sizeof("cid:") - 1 && memcmp(href_value.data(),
825 						"cid:", sizeof("cid:") - 1) == 0) {
826 					/* We have an embedded image */
827 					img->src += sizeof("cid:") - 1;
828 					img->flags |= RSPAMD_HTML_FLAG_IMAGE_EMBEDDED;
829 				}
830 				else {
831 					if (href_value.size() > sizeof("data:") - 1 && memcmp(href_value.data(),
832 							"data:", sizeof("data:") - 1) == 0) {
833 						/* We have an embedded image in HTML tag */
834 						img->flags |=
835 								(RSPAMD_HTML_FLAG_IMAGE_EMBEDDED | RSPAMD_HTML_FLAG_IMAGE_DATA);
836 						html_process_data_image(pool, img, href_value);
837 						hc->flags |= RSPAMD_HTML_FLAG_HAS_DATA_URLS;
838 					}
839 					else {
840 						img->flags |= RSPAMD_HTML_FLAG_IMAGE_EXTERNAL;
841 						if (img->src) {
842 
843 							std::string_view cpy{href_value};
844 							auto maybe_url = html_process_url(pool, cpy);
845 
846 							if (maybe_url) {
847 								img->url = maybe_url.value();
848 								struct rspamd_url *existing;
849 
850 								img->url->flags |= RSPAMD_URL_FLAG_IMAGE;
851 								existing = rspamd_url_set_add_or_return(url_set,
852 										img->url);
853 
854 								if (existing && existing != img->url) {
855 									/*
856 									 * We have some other URL that could be
857 									 * found, e.g. from another part. However,
858 									 * we still want to set an image flag on it
859 									 */
860 									existing->flags |= img->url->flags;
861 									existing->count++;
862 								}
863 								else if (part_urls) {
864 									/* New url */
865 									g_ptr_array_add(part_urls, img->url);
866 								}
867 							}
868 						}
869 					}
870 				}
871 			}
872 		}
873 
874 
875 		if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_HEIGHT) {
876 			unsigned long val;
877 
878 			rspamd_strtoul(param.value.data(), param.value.size(), &val);
879 			img->height = val;
880 		}
881 
882 		if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_WIDTH) {
883 			unsigned long val;
884 
885 			rspamd_strtoul(param.value.data(), param.value.size(), &val);
886 			img->width = val;
887 		}
888 
889 		/* TODO: rework to css at some time */
890 		if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_STYLE) {
891 			if (img->height == 0) {
892 				auto style_st = param.value;
893 				auto pos = rspamd_substring_search_caseless(style_st.data(),
894 						style_st.size(),
895 						"height", sizeof("height") - 1);
896 				if (pos != -1) {
897 					auto substr = style_st.substr(pos + sizeof("height") - 1);
898 
899 					for (auto i = 0; i < substr.size(); i++) {
900 						auto t = substr[i];
901 						if (g_ascii_isdigit (t)) {
902 							unsigned long val;
903 							rspamd_strtoul(substr.data(),
904 									substr.size(), &val);
905 							img->height = val;
906 							break;
907 						}
908 						else if (!g_ascii_isspace (t) && t != '=' && t != ':') {
909 							/* Fallback */
910 							break;
911 						}
912 					}
913 				}
914 			}
915 			if (img->width == 0) {
916 				auto style_st = param.value;
917 				auto pos = rspamd_substring_search_caseless(style_st.data(),
918 						style_st.size(),
919 						"width", sizeof("width") - 1);
920 				if (pos != -1) {
921 					auto substr = style_st.substr(pos + sizeof("width") - 1);
922 
923 					for (auto i = 0; i < substr.size(); i++) {
924 						auto t = substr[i];
925 						if (g_ascii_isdigit (t)) {
926 							unsigned long val;
927 							rspamd_strtoul(substr.data(),
928 									substr.size(), &val);
929 							img->width = val;
930 							break;
931 						}
932 						else if (!g_ascii_isspace (t) && t != '=' && t != ':') {
933 							/* Fallback */
934 							break;
935 						}
936 					}
937 				}
938 			}
939 		}
940 	}
941 
942 	if (img->embedded_image) {
943 		if (img->height == 0) {
944 			img->height = img->embedded_image->height;
945 		}
946 		if (img->width == 0) {
947 			img->width = img->embedded_image->width;
948 		}
949 	}
950 
951 	hc->images.push_back(img);
952 	tag->extra = img;
953 }
954 
955 static auto
html_process_link_tag(rspamd_mempool_t * pool,struct html_tag * tag,struct html_content * hc,khash_t (rspamd_url_hash)* url_set,GPtrArray * part_urls)956 html_process_link_tag(rspamd_mempool_t *pool, struct html_tag *tag,
957 					  struct html_content *hc,
958 					  khash_t (rspamd_url_hash) *url_set,
959 					  GPtrArray *part_urls) -> void
960 {
961 	auto found_rel_maybe = tag->find_component(html_component_type::RSPAMD_HTML_COMPONENT_REL);
962 
963 	if (found_rel_maybe) {
964 		if (found_rel_maybe.value() == "icon") {
965 			html_process_img_tag(pool, tag, hc, url_set, part_urls);
966 		}
967 	}
968 }
969 
970 static auto
html_process_block_tag(rspamd_mempool_t * pool,struct html_tag * tag,struct html_content * hc)971 html_process_block_tag(rspamd_mempool_t *pool, struct html_tag *tag,
972 					   struct html_content *hc) -> void
973 {
974 	std::optional<css::css_value> maybe_fgcolor, maybe_bgcolor;
975 	bool hidden = false;
976 
977 	for (const auto &param : tag->components) {
978 		if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_COLOR) {
979 			maybe_fgcolor = css::css_value::maybe_color_from_string(param.value);
980 		}
981 
982 		if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_BGCOLOR) {
983 			maybe_bgcolor = css::css_value::maybe_color_from_string(param.value);
984 		}
985 
986 		if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_STYLE) {
987 			tag->block = rspamd::css::parse_css_declaration(pool, param.value);
988 		}
989 
990 		if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_HIDDEN) {
991 			hidden = true;
992 		}
993 	}
994 
995 	if (!tag->block) {
996 		tag->block = html_block::undefined_html_block_pool(pool);
997 	}
998 
999 	if (hidden) {
1000 		tag->block->set_display(false);
1001 	}
1002 
1003 	if (maybe_fgcolor) {
1004 		tag->block->set_fgcolor(maybe_fgcolor->to_color().value());
1005 	}
1006 
1007 	if (maybe_bgcolor) {
1008 		tag->block->set_bgcolor(maybe_bgcolor->to_color().value());
1009 	}
1010 }
1011 
1012 static inline auto
html_append_parsed(struct html_content * hc,std::string_view data,bool transparent,std::size_t input_len,std::string & dest)1013 html_append_parsed(struct html_content *hc,
1014 				   std::string_view data,
1015 				   bool transparent,
1016 				   std::size_t input_len,
1017 				   std::string &dest) -> std::size_t
1018 {
1019 	auto cur_offset = dest.size();
1020 
1021 	if (dest.size() > input_len) {
1022 		/* Impossible case, refuse to append */
1023 		return 0;
1024 	}
1025 
1026 	if (data.size() > 0) {
1027 		/* Handle multiple spaces at the begin */
1028 
1029 		if (cur_offset > 0) {
1030 			auto last = dest.back();
1031 			if (!g_ascii_isspace(last) && g_ascii_isspace(data.front())) {
1032 				dest.append(" ");
1033 				data = {data.data() + 1, data.size() - 1};
1034 				cur_offset++;
1035 			}
1036 		}
1037 
1038 		if (data.find('\0') != std::string_view::npos) {
1039 			auto replace_zero_func = [](const auto &input, auto &output) {
1040 				const auto last = input.cend();
1041 				for (auto it = input.cbegin(); it != last; ++it) {
1042 					if (*it == '\0') {
1043 						output.append(u8"\uFFFD");
1044 					}
1045 					else {
1046 						output.push_back(*it);
1047 					}
1048 				}
1049 			};
1050 
1051 			dest.reserve(dest.size() + data.size() + sizeof(u8"\uFFFD"));
1052 			replace_zero_func(data, dest);
1053 			hc->flags |= RSPAMD_HTML_FLAG_HAS_ZEROS;
1054 		}
1055 		else {
1056 			dest.append(data);
1057 		}
1058 	}
1059 
1060 	auto nlen = decode_html_entitles_inplace(dest.data() + cur_offset,
1061 			dest.size() - cur_offset, true);
1062 
1063 	dest.resize(nlen + cur_offset);
1064 
1065 	if (transparent) {
1066 		/* Replace all visible characters with spaces */
1067 		auto start = std::next(dest.begin(), cur_offset);
1068 		std::replace_if(start, std::end(dest), [](const auto c) {
1069 			return !g_ascii_isspace(c);
1070 		}, ' ');
1071 	}
1072 
1073 	return nlen;
1074 }
1075 
1076 static auto
html_process_displayed_href_tag(rspamd_mempool_t * pool,struct html_content * hc,std::string_view data,const struct html_tag * cur_tag,GList ** exceptions,khash_t (rspamd_url_hash)* url_set,goffset dest_offset)1077 html_process_displayed_href_tag(rspamd_mempool_t *pool,
1078 								struct html_content *hc,
1079 								std::string_view data,
1080 								const struct html_tag *cur_tag,
1081 								GList **exceptions,
1082 								khash_t (rspamd_url_hash) *url_set,
1083 								goffset dest_offset) -> void
1084 {
1085 
1086 	if (std::holds_alternative<rspamd_url *>(cur_tag->extra)) {
1087 		auto *url = std::get<rspamd_url *>(cur_tag->extra);
1088 
1089 		html_check_displayed_url(pool,
1090 				exceptions, url_set,
1091 				data,
1092 				dest_offset,
1093 				url);
1094 	}
1095 }
1096 
1097 static auto
html_append_tag_content(rspamd_mempool_t * pool,const gchar * start,gsize len,struct html_content * hc,html_tag * tag,GList ** exceptions,khash_t (rspamd_url_hash)* url_set)1098 html_append_tag_content(rspamd_mempool_t *pool,
1099 						const gchar *start, gsize len,
1100 						struct html_content *hc,
1101 						html_tag *tag,
1102 						GList **exceptions,
1103 						khash_t (rspamd_url_hash) *url_set) -> goffset
1104 {
1105 	auto is_visible = true, is_block = false, is_spaces = false, is_transparent = false;
1106 	goffset next_tag_offset = tag->closing.end,
1107 			initial_parsed_offset = hc->parsed.size(),
1108 			initial_invisible_offset = hc->invisible.size();
1109 
1110 	auto calculate_final_tag_offsets = [&]() -> void {
1111 		if (is_visible) {
1112 			tag->content_offset = initial_parsed_offset;
1113 			tag->closing.start = hc->parsed.size();
1114 		}
1115 		else {
1116 			tag->content_offset = initial_invisible_offset;
1117 			tag->closing.start = hc->invisible.size();
1118 		}
1119 	};
1120 
1121 	if (tag->closing.end == -1) {
1122 		if (tag->closing.start != -1) {
1123 			next_tag_offset = tag->closing.start;
1124 			tag->closing.end = tag->closing.start;
1125 		}
1126 		else {
1127 			next_tag_offset = tag->content_offset;
1128 			tag->closing.end = tag->content_offset;
1129 		}
1130 	}
1131 	if (tag->closing.start == -1) {
1132 		tag->closing.start = tag->closing.end;
1133 	}
1134 
1135 	auto append_margin = [&](char c) -> void {
1136 		/* We do care about visible margins only */
1137 		if (is_visible) {
1138 			if (!hc->parsed.empty() && hc->parsed.back() != c && hc->parsed.back() != '\n') {
1139 				if (hc->parsed.back() == ' ') {
1140 					/* We also strip extra spaces at the end, but limiting the start */
1141 					auto last = std::make_reverse_iterator(hc->parsed.begin() + initial_parsed_offset);
1142 					auto first = std::find_if(hc->parsed.rbegin(), last,
1143 							[](auto ch) -> auto {
1144 								return ch != ' ';
1145 							});
1146 					hc->parsed.erase(first.base(), hc->parsed.end());
1147 					g_assert(hc->parsed.size() >= initial_parsed_offset);
1148 				}
1149 				hc->parsed.push_back(c);
1150 			}
1151 		}
1152 	};
1153 
1154 	if (tag->id == Tag_BR || tag->id == Tag_HR) {
1155 
1156 		if (!(tag->flags & FL_IGNORE)) {
1157 			hc->parsed.append("\n");
1158 		}
1159 
1160 		auto ret = tag->content_offset;
1161 		calculate_final_tag_offsets();
1162 
1163 		return ret;
1164 	}
1165 	else if ((tag->id == Tag_HEAD && (tag->flags & FL_IGNORE)) || (tag->flags & CM_HEAD)) {
1166 		auto ret = tag->closing.end;
1167 		calculate_final_tag_offsets();
1168 
1169 		return ret;
1170 	}
1171 
1172 	if ((tag->flags & (FL_COMMENT | FL_XML | FL_IGNORE | CM_HEAD))) {
1173 		is_visible = false;
1174 	}
1175 	else {
1176 		if (!tag->block) {
1177 			is_visible = true;
1178 		}
1179 		else if (!tag->block->is_visible()) {
1180 			if (!tag->block->is_transparent()) {
1181 				is_visible = false;
1182 			}
1183 			else {
1184 				if (tag->block->has_display() &&
1185 					tag->block->display == css::css_display_value::DISPLAY_HIDDEN) {
1186 					is_visible = false;
1187 				}
1188 				else {
1189 					is_transparent = true;
1190 				}
1191 			}
1192 		}
1193 		else {
1194 			if (tag->block->display == css::css_display_value::DISPLAY_BLOCK) {
1195 				is_block = true;
1196 			}
1197 			else if (tag->block->display == css::css_display_value::DISPLAY_TABLE_ROW) {
1198 				is_spaces = true;
1199 			}
1200 		}
1201 	}
1202 
1203 	if (is_block) {
1204 		append_margin('\n');
1205 	}
1206 	else if (is_spaces) {
1207 		append_margin(' ');
1208 	}
1209 
1210 	goffset cur_offset = tag->content_offset;
1211 
1212 	for (auto *cld : tag->children) {
1213 		auto enclosed_start = cld->tag_start;
1214 		goffset initial_part_len = enclosed_start - cur_offset;
1215 
1216 		if (initial_part_len > 0) {
1217 			if (is_visible) {
1218 				html_append_parsed(hc,
1219 						{start + cur_offset, std::size_t(initial_part_len)},
1220 						is_transparent, len, hc->parsed);
1221 			}
1222 			else {
1223 				html_append_parsed(hc,
1224 						{start + cur_offset, std::size_t(initial_part_len)},
1225 						is_transparent, len, hc->invisible);
1226 			}
1227 		}
1228 
1229 		auto next_offset = html_append_tag_content(pool, start, len,
1230 				hc, cld, exceptions, url_set);
1231 
1232 		/* Do not allow shifting back */
1233 		if (next_offset > cur_offset) {
1234 			cur_offset = next_offset;
1235 		}
1236 	}
1237 
1238 	if (cur_offset < tag->closing.start) {
1239 		goffset final_part_len = tag->closing.start - cur_offset;
1240 
1241 		if (final_part_len > 0) {
1242 			if (is_visible) {
1243 				html_append_parsed(hc,
1244 						{start + cur_offset, std::size_t(final_part_len)},
1245 						is_transparent,
1246 						len,
1247 						hc->parsed);
1248 			}
1249 			else {
1250 				html_append_parsed(hc,
1251 						{start + cur_offset, std::size_t(final_part_len)},
1252 						is_transparent,
1253 						len,
1254 						hc->invisible);
1255 			}
1256 		}
1257 	}
1258 	if (is_block) {
1259 		append_margin('\n');
1260 	}
1261 	else if (is_spaces) {
1262 		append_margin(' ');
1263 	}
1264 
1265 	if (is_visible) {
1266 		if (tag->id == Tag_A) {
1267 			auto written_len = hc->parsed.size() - initial_parsed_offset;
1268 			html_process_displayed_href_tag(pool, hc,
1269 					{hc->parsed.data() + initial_parsed_offset, std::size_t(written_len)},
1270 					tag, exceptions,
1271 					url_set, initial_parsed_offset);
1272 		}
1273 		else if (tag->id == Tag_IMG) {
1274 			/* Process ALT if presented */
1275 			auto maybe_alt = tag->find_component(html_component_type::RSPAMD_HTML_COMPONENT_ALT);
1276 
1277 			if (maybe_alt) {
1278 				if (!hc->parsed.empty() && !g_ascii_isspace (hc->parsed.back())) {
1279 					/* Add a space */
1280 					hc->parsed += ' ';
1281 				}
1282 
1283 				hc->parsed.append(maybe_alt.value());
1284 
1285 				if (!hc->parsed.empty() && !g_ascii_isspace (hc->parsed.back())) {
1286 					/* Add a space */
1287 					hc->parsed += ' ';
1288 				}
1289 			}
1290 		}
1291 	}
1292 	else {
1293 		/* Invisible stuff */
1294 		if (std::holds_alternative<rspamd_url *>(tag->extra)) {
1295 			auto *url_enclosed = std::get<rspamd_url *>(tag->extra);
1296 
1297 			/*
1298 			 * TODO: when hash is fixed to include flags we need to remove and add
1299 			 * url to the hash set
1300 			 */
1301 			if (url_enclosed) {
1302 				url_enclosed->flags |= RSPAMD_URL_FLAG_INVISIBLE;
1303 			}
1304 		}
1305 	}
1306 
1307 	calculate_final_tag_offsets();
1308 
1309 	return next_tag_offset;
1310 }
1311 
1312 auto
html_process_input(rspamd_mempool_t * pool,GByteArray * in,GList ** exceptions,khash_t (rspamd_url_hash)* url_set,GPtrArray * part_urls,bool allow_css)1313 html_process_input(rspamd_mempool_t *pool,
1314 				   GByteArray *in,
1315 				   GList **exceptions,
1316 				   khash_t (rspamd_url_hash) *url_set,
1317 				   GPtrArray *part_urls,
1318 				   bool allow_css) -> html_content *
1319 {
1320 	const gchar *p, *c, *end, *start;
1321 	guchar t;
1322 	auto closing = false;
1323 	guint obrace = 0, ebrace = 0;
1324 	struct rspamd_url *url = nullptr;
1325 	gint href_offset = -1;
1326 	struct html_tag *cur_tag = nullptr, *parent_tag = nullptr, cur_closing_tag;
1327 	struct tag_content_parser_state content_parser_env;
1328 
1329 	enum {
1330 		parse_start = 0,
1331 		content_before_start,
1332 		tag_begin,
1333 		sgml_tag,
1334 		xml_tag,
1335 		compound_tag,
1336 		comment_tag,
1337 		comment_content,
1338 		sgml_content,
1339 		tag_content,
1340 		tag_end_opening,
1341 		tag_end_closing,
1342 		html_text_content,
1343 		xml_tag_end,
1344 		tag_raw_text,
1345 		tag_raw_text_less_than,
1346 		tags_limit_overflow,
1347 	} state = parse_start;
1348 
1349 	enum class html_document_state {
1350 		doctype,
1351 		head,
1352 		body
1353 	} html_document_state = html_document_state::doctype;
1354 
1355 	g_assert (in != NULL);
1356 	g_assert (pool != NULL);
1357 
1358 	struct html_content *hc = new html_content;
1359 	rspamd_mempool_add_destructor(pool, html_content::html_content_dtor, hc);
1360 
1361 	auto new_tag = [&](int flags = 0) -> struct html_tag * {
1362 
1363 		if (hc->all_tags.size() > rspamd::html::max_tags) {
1364 			hc->flags |= RSPAMD_HTML_FLAG_TOO_MANY_TAGS;
1365 
1366 			return nullptr;
1367 		}
1368 
1369 		hc->all_tags.emplace_back(std::make_unique<html_tag>());
1370 		auto *ntag = hc->all_tags.back().get();
1371 		ntag->tag_start = c - start;
1372 		ntag->flags = flags;
1373 
1374 		if (cur_tag && !(cur_tag->flags & (CM_EMPTY | FL_CLOSED)) && cur_tag != &cur_closing_tag) {
1375 			parent_tag = cur_tag;
1376 		}
1377 
1378 		if (flags & FL_XML) {
1379 			return ntag;
1380 		}
1381 
1382 		return ntag;
1383 	};
1384 
1385 	auto process_opening_tag = [&]() {
1386 		if (cur_tag->id > Tag_UNKNOWN) {
1387 			if (cur_tag->flags & CM_UNIQUE) {
1388 				if (!hc->tags_seen[cur_tag->id]) {
1389 					/* Duplicate tag has been found */
1390 					hc->flags |= RSPAMD_HTML_FLAG_DUPLICATE_ELEMENTS;
1391 				}
1392 			}
1393 			hc->tags_seen[cur_tag->id] = true;
1394 		}
1395 
1396 		/* Shift to the first unclosed tag */
1397 		auto *pt = parent_tag;
1398 		while (pt && (pt->flags & FL_CLOSED)) {
1399 			pt = pt->parent;
1400 		}
1401 
1402 		if (pt) {
1403 			g_assert(cur_tag != pt);
1404 			cur_tag->parent = pt;
1405 			g_assert(cur_tag->parent != &cur_closing_tag);
1406 			parent_tag = pt;
1407 			parent_tag->children.push_back(cur_tag);
1408 		}
1409 		else {
1410 			if (hc->root_tag) {
1411 				if (cur_tag != hc->root_tag) {
1412 					cur_tag->parent = hc->root_tag;
1413 					g_assert(cur_tag->parent != cur_tag);
1414 					hc->root_tag->children.push_back(cur_tag);
1415 					parent_tag = hc->root_tag;
1416 				}
1417 			}
1418 			else {
1419 				if (cur_tag->id == Tag_HTML) {
1420 					hc->root_tag = cur_tag;
1421 				}
1422 				else {
1423 					/* Insert a fake html tag */
1424 					hc->all_tags.emplace_back(std::make_unique<html_tag>());
1425 					auto *top_tag = hc->all_tags.back().get();
1426 					top_tag->tag_start = 0;
1427 					top_tag->flags = FL_VIRTUAL;
1428 					top_tag->id = Tag_HTML;
1429 					top_tag->content_offset = 0;
1430 					top_tag->children.push_back(cur_tag);
1431 					cur_tag->parent = top_tag;
1432 					g_assert(cur_tag->parent != cur_tag);
1433 					hc->root_tag = top_tag;
1434 					parent_tag = top_tag;
1435 				}
1436 			}
1437 		}
1438 
1439 		if (cur_tag->flags & FL_HREF && html_document_state == html_document_state::body) {
1440 			auto maybe_url = html_process_url_tag(pool, cur_tag, hc);
1441 
1442 			if (maybe_url) {
1443 				url = maybe_url.value();
1444 
1445 				if (url_set != NULL) {
1446 					struct rspamd_url *maybe_existing =
1447 							rspamd_url_set_add_or_return(url_set, maybe_url.value());
1448 					if (maybe_existing == maybe_url.value()) {
1449 						html_process_query_url(pool, url, url_set,
1450 								part_urls);
1451 					}
1452 					else {
1453 						url = maybe_existing;
1454 						/* Replace extra as well */
1455 						cur_tag->extra = maybe_existing;
1456 						/* Increase count to avoid odd checks failure */
1457 						url->count++;
1458 					}
1459 				}
1460 				if (part_urls) {
1461 					g_ptr_array_add(part_urls, url);
1462 				}
1463 
1464 				href_offset = hc->parsed.size();
1465 			}
1466 		}
1467 		else if (cur_tag->id == Tag_BASE) {
1468 			/*
1469 			 * Base is allowed only within head tag but HTML is retarded
1470 			 */
1471 			auto maybe_url = html_process_url_tag(pool, cur_tag, hc);
1472 
1473 			if (maybe_url) {
1474 				msg_debug_html ("got valid base tag");
1475 				cur_tag->extra = maybe_url.value();
1476 				cur_tag->flags |= FL_HREF;
1477 
1478 				if (hc->base_url == nullptr) {
1479 					hc->base_url = maybe_url.value();
1480 				}
1481 				else {
1482 					msg_debug_html ("ignore redundant base tag");
1483 				}
1484 			}
1485 			else {
1486 				msg_debug_html ("got invalid base tag!");
1487 			}
1488 		}
1489 
1490 		if (cur_tag->id == Tag_IMG) {
1491 			html_process_img_tag(pool, cur_tag, hc, url_set,
1492 					part_urls);
1493 		}
1494 		else if (cur_tag->id == Tag_LINK) {
1495 			html_process_link_tag(pool, cur_tag, hc, url_set,
1496 					part_urls);
1497 		}
1498 
1499 		if (!(cur_tag->flags & CM_EMPTY)) {
1500 			html_process_block_tag(pool, cur_tag, hc);
1501 		}
1502 		else {
1503 			/* Implicitly close */
1504 			cur_tag->flags |= FL_CLOSED;
1505 		}
1506 
1507 		if (cur_tag->flags & FL_CLOSED) {
1508 			cur_tag->closing.end = cur_tag->content_offset;
1509 			cur_tag->closing.start = cur_tag->tag_start;
1510 
1511 			cur_tag = parent_tag;
1512 		}
1513 	};
1514 
1515 	p = (const char *) in->data;
1516 	c = p;
1517 	end = p + in->len;
1518 	start = c;
1519 
1520 	while (p < end) {
1521 		t = *p;
1522 
1523 		switch (state) {
1524 		case parse_start:
1525 			if (t == '<') {
1526 				state = tag_begin;
1527 			}
1528 			else {
1529 				/* We have no starting tag, so assume that it's content */
1530 				hc->flags |= RSPAMD_HTML_FLAG_BAD_START;
1531 				cur_tag = new_tag();
1532 				html_document_state = html_document_state::body;
1533 
1534 				if (cur_tag) {
1535 					cur_tag->id = Tag_HTML;
1536 					hc->root_tag = cur_tag;
1537 					state = content_before_start;
1538 				}
1539 				else {
1540 					state = tags_limit_overflow;
1541 				}
1542 			}
1543 			break;
1544 		case content_before_start:
1545 			if (t == '<') {
1546 				state = tag_begin;
1547 			}
1548 			else {
1549 				p++;
1550 			}
1551 			break;
1552 		case tag_begin:
1553 			switch (t) {
1554 			case '<':
1555 				c = p;
1556 				p++;
1557 				closing = FALSE;
1558 				break;
1559 			case '!':
1560 				cur_tag = new_tag(FL_XML | FL_CLOSED);
1561 				if (cur_tag) {
1562 					state = sgml_tag;
1563 				}
1564 				else {
1565 					state = tags_limit_overflow;
1566 				}
1567 				p++;
1568 				break;
1569 			case '?':
1570 				cur_tag = new_tag(FL_XML | FL_CLOSED);
1571 				if (cur_tag) {
1572 					state = xml_tag;
1573 				}
1574 				else {
1575 					state = tags_limit_overflow;
1576 				}
1577 				hc->flags |= RSPAMD_HTML_FLAG_XML;
1578 				p++;
1579 				break;
1580 			case '/':
1581 				closing = TRUE;
1582 				/* We fill fake closing tag to fill it with the content parser */
1583 				cur_closing_tag.clear();
1584 				/*
1585 				 * For closing tags, we need to find some corresponding opening tag.
1586 				 * However, at this point we have not even parsed a name, so we
1587 				 * can not assume anything about balancing, etc.
1588 				 *
1589 				 * So we need to ensure that:
1590 				 * 1) We have some opening tag in the chain cur_tag->parent...
1591 				 * 2) cur_tag is nullptr - okay, html is just brain damaged
1592 				 * 3) cur_tag must NOT be equal to cur_closing tag. It means that
1593 				 * we had some poor closing tag but we still need to find an opening
1594 				 * tag... Somewhere...
1595 				 */
1596 
1597 				if (cur_tag == &cur_closing_tag) {
1598 					if (parent_tag != &cur_closing_tag) {
1599 						cur_closing_tag.parent = parent_tag;
1600 					}
1601 					else {
1602 						cur_closing_tag.parent = nullptr;
1603 					}
1604 				}
1605 				else if (cur_tag && cur_tag->flags & FL_CLOSED) {
1606 					/* Cur tag is already closed, we should find something else */
1607 					auto *tmp = cur_tag;
1608 					while (tmp) {
1609 						tmp = tmp->parent;
1610 
1611 						if (tmp == nullptr || !(tmp->flags & FL_CLOSED)) {
1612 							break;
1613 						}
1614 					}
1615 
1616 					cur_closing_tag.parent = tmp;
1617 				}
1618 				else {
1619 					cur_closing_tag.parent = cur_tag;
1620 				}
1621 
1622 				cur_tag = &cur_closing_tag;
1623 				p++;
1624 				break;
1625 			case '>':
1626 				/* Empty tag */
1627 				hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
1628 				state = html_text_content;
1629 				continue;
1630 			default:
1631 				if (g_ascii_isalpha(t)) {
1632 					state = tag_content;
1633 					content_parser_env.reset();
1634 
1635 					if (!closing) {
1636 						cur_tag = new_tag();
1637 					}
1638 
1639 					if (cur_tag) {
1640 						state = tag_content;
1641 					}
1642 					else {
1643 						state = tags_limit_overflow;
1644 					}
1645 				}
1646 				else {
1647 					/* Wrong bad tag */
1648 					state = html_text_content;
1649 				}
1650 				break;
1651 			}
1652 
1653 			break;
1654 
1655 		case sgml_tag:
1656 			switch (t) {
1657 			case '[':
1658 				state = compound_tag;
1659 				obrace = 1;
1660 				ebrace = 0;
1661 				p++;
1662 				break;
1663 			case '-':
1664 				cur_tag->flags |= FL_COMMENT;
1665 				state = comment_tag;
1666 				p++;
1667 				break;
1668 			default:
1669 				state = sgml_content;
1670 				break;
1671 			}
1672 
1673 			break;
1674 
1675 		case xml_tag:
1676 			if (t == '?') {
1677 				state = xml_tag_end;
1678 			}
1679 			else if (t == '>') {
1680 				/* Misformed xml tag */
1681 				hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
1682 				state = tag_end_opening;
1683 				continue;
1684 			}
1685 			/* We efficiently ignore xml tags */
1686 			p++;
1687 			break;
1688 
1689 		case xml_tag_end:
1690 			if (t == '>') {
1691 				state = tag_end_opening;
1692 				cur_tag->content_offset = p - start + 1;
1693 				continue;
1694 			}
1695 			else {
1696 				hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
1697 			}
1698 			p++;
1699 			break;
1700 
1701 		case compound_tag:
1702 			if (t == '[') {
1703 				obrace++;
1704 			}
1705 			else if (t == ']') {
1706 				ebrace++;
1707 			}
1708 			else if (t == '>' && obrace == ebrace) {
1709 				state = tag_end_opening;
1710 				cur_tag->content_offset = p - start + 1;
1711 				continue;
1712 			}
1713 			p++;
1714 			break;
1715 
1716 		case comment_tag:
1717 			if (t != '-') {
1718 				hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
1719 				state = tag_end_opening;
1720 			}
1721 			else {
1722 				p++;
1723 				ebrace = 0;
1724 				/*
1725 				 * https://www.w3.org/TR/2012/WD-html5-20120329/syntax.html#syntax-comments
1726 				 *  ... the text must not start with a single
1727 				 *  U+003E GREATER-THAN SIGN character (>),
1728 				 *  nor start with a "-" (U+002D) character followed by
1729 				 *  a U+003E GREATER-THAN SIGN (>) character,
1730 				 *  nor contain two consecutive U+002D HYPHEN-MINUS
1731 				 *  characters (--), nor end with a "-" (U+002D) character.
1732 				 */
1733 				if (p[0] == '-' && p + 1 < end && p[1] == '>') {
1734 					hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
1735 					p++;
1736 					state = tag_end_opening;
1737 				}
1738 				else if (*p == '>') {
1739 					hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
1740 					state = tag_end_opening;
1741 				}
1742 				else {
1743 					state = comment_content;
1744 				}
1745 			}
1746 			break;
1747 
1748 		case comment_content:
1749 			if (t == '-') {
1750 				ebrace++;
1751 			}
1752 			else if (t == '>' && ebrace >= 2) {
1753 				cur_tag->content_offset = p - start + 1;
1754 				state = tag_end_opening;
1755 				continue;
1756 			}
1757 			else {
1758 				ebrace = 0;
1759 			}
1760 
1761 			p++;
1762 			break;
1763 
1764 		case html_text_content:
1765 			if (t != '<') {
1766 				p++;
1767 			}
1768 			else {
1769 				state = tag_begin;
1770 			}
1771 			break;
1772 
1773 		case tag_raw_text:
1774 			if (t == '<') {
1775 				c = p;
1776 				state = tag_raw_text_less_than;
1777 			}
1778 			p ++;
1779 			break;
1780 		case tag_raw_text_less_than:
1781 			if (t == '/') {
1782 				/* Here are special things: we look for obrace and then ensure
1783 				 * that if there is any closing brace nearby
1784 				 * (we look maximum at 30 characters). We also need to ensure
1785 				 * that we have no special characters, such as punctuation marks and
1786 				 * so on.
1787 				 * Basically, we validate the input to be sane.
1788 				 * Since closing tags must not have attributes, these assumptions
1789 				 * seems to be reasonable enough for our toy parser.
1790 				 */
1791 				gint cur_lookahead = 1;
1792 				gint max_lookahead = MIN (end - p, 30);
1793 				bool valid_closing_tag = true;
1794 
1795 				if (p + 1 < end && !g_ascii_isalpha (p[1])) {
1796 					valid_closing_tag = false;
1797 				}
1798 				else {
1799 					while (cur_lookahead < max_lookahead) {
1800 						gchar tt = p[cur_lookahead];
1801 						if (tt == '>') {
1802 							break;
1803 						}
1804 						else if (tt < '\n' || tt == ',') {
1805 							valid_closing_tag = false;
1806 							break;
1807 						}
1808 						cur_lookahead ++;
1809 					}
1810 
1811 					if (cur_lookahead == max_lookahead) {
1812 						valid_closing_tag = false;
1813 					}
1814 				}
1815 
1816 				if (valid_closing_tag) {
1817 					/* Shift back */
1818 					p = c;
1819 					state = tag_begin;
1820 				}
1821 				else {
1822 					p ++;
1823 					state = tag_raw_text;
1824 				}
1825 			}
1826 			else {
1827 				p ++;
1828 				state = tag_raw_text;
1829 			}
1830 			break;
1831 		case sgml_content:
1832 			/* TODO: parse DOCTYPE here */
1833 			if (t == '>') {
1834 				cur_tag->content_offset = p - start + 1;
1835 				state = tag_end_opening;
1836 			}
1837 			else {
1838 				p++;
1839 			}
1840 			break;
1841 
1842 		case tag_content:
1843 			html_parse_tag_content(pool, hc, cur_tag, p, content_parser_env);
1844 
1845 			if (t == '>') {
1846 				if (closing) {
1847 					cur_tag->closing.start = c - start;
1848 					cur_tag->closing.end = p - start + 1;
1849 
1850 					closing = FALSE;
1851 					state = tag_end_closing;
1852 				}
1853 				else {
1854 					cur_tag->content_offset = p - start + 1;
1855 					state = tag_end_opening;
1856 				}
1857 
1858 
1859 				continue;
1860 			}
1861 			p++;
1862 			break;
1863 
1864 		case tag_end_opening:
1865 			content_parser_env.reset();
1866 			state = html_text_content;
1867 
1868 			if (cur_tag) {
1869 				if (cur_tag->id == Tag_STYLE || cur_tag->id == Tag_NOSCRIPT || cur_tag->id == Tag_SCRIPT) {
1870 					state = tag_raw_text;
1871 				}
1872 				if (html_document_state == html_document_state::doctype) {
1873 					if (cur_tag->id == Tag_HEAD || (cur_tag->flags & CM_HEAD)) {
1874 						html_document_state = html_document_state::head;
1875 						cur_tag->flags |= FL_IGNORE;
1876 					}
1877 					else if (cur_tag->id != Tag_HTML) {
1878 						html_document_state = html_document_state::body;
1879 					}
1880 				}
1881 				else if (html_document_state == html_document_state::head) {
1882 					if (!(cur_tag->flags & (CM_EMPTY | CM_HEAD))) {
1883 						if (parent_tag && (parent_tag->id == Tag_HEAD || !(parent_tag->flags & CM_HEAD))) {
1884 							/*
1885 							 * As by standard, we have to close the HEAD tag
1886 							 * and switch to the body state
1887 							 */
1888 							parent_tag->flags |= FL_CLOSED;
1889 							parent_tag->closing.start = cur_tag->tag_start;
1890 							parent_tag->closing.end = cur_tag->content_offset;
1891 
1892 							html_document_state = html_document_state::body;
1893 						}
1894 						else if (cur_tag->id == Tag_BODY) {
1895 							html_document_state = html_document_state::body;
1896 						}
1897 						else {
1898 							/*
1899 							 * For propagation in something like
1900 							 * <title><p><a>ololo</a></p></title> - should be unprocessed
1901 							 */
1902 							cur_tag->flags |= CM_HEAD;
1903 						}
1904 					}
1905 				}
1906 
1907 				process_opening_tag();
1908 			}
1909 
1910 			p++;
1911 			c = p;
1912 			break;
1913 		case tag_end_closing: {
1914 			if (cur_tag) {
1915 
1916 				if (cur_tag->flags & CM_EMPTY) {
1917 					/* Ignore closing empty tags */
1918 					cur_tag->flags |= FL_IGNORE;
1919 				}
1920 				if (html_document_state == html_document_state::doctype) {
1921 
1922 				}
1923 				else if (html_document_state == html_document_state::head) {
1924 					if (cur_tag->id == Tag_HEAD) {
1925 						html_document_state = html_document_state::body;
1926 					}
1927 				}
1928 
1929 				/* cur_tag here is a closing tag */
1930 				auto *next_cur_tag = html_check_balance(hc, cur_tag,
1931 						c - start, p - start + 1);
1932 
1933 				if (cur_tag->id == Tag_STYLE && allow_css) {
1934 					auto *opening_tag = cur_tag->parent;
1935 
1936 					if (opening_tag && opening_tag->id == Tag_STYLE &&
1937 						(int)opening_tag->content_offset < opening_tag->closing.start) {
1938 						auto ret_maybe = rspamd::css::parse_css(pool,
1939 								{start + opening_tag->content_offset,
1940 								 opening_tag->closing.start - opening_tag->content_offset},
1941 								std::move(hc->css_style));
1942 
1943 						if (!ret_maybe.has_value()) {
1944 							if (ret_maybe.error().is_fatal()) {
1945 								auto err_str = fmt::format(
1946 										"cannot parse css (error code: {}): {}",
1947 										static_cast<int>(ret_maybe.error().type),
1948 										ret_maybe.error().description.value_or("unknown error"));
1949 								msg_info_pool ("%*s", (int) err_str.size(), err_str.data());
1950 							}
1951 						}
1952 						else {
1953 							hc->css_style = ret_maybe.value();
1954 						}
1955 					}
1956 				}
1957 
1958 				if (next_cur_tag != nullptr) {
1959 					cur_tag = next_cur_tag;
1960 				}
1961 				else {
1962 					/*
1963 					 * Here, we handle cases like <p>lala</b>...
1964 					 * So the tag </b> is bogus and unpaired
1965 					 * However, we need to exclude it from the output of <p> tag
1966 					 * To do that, we create a fake opening tag and insert that to
1967 					 * the current opening tag
1968 					 */
1969 					auto *cur_opening_tag = cur_tag->parent;
1970 
1971 					while (cur_opening_tag && (cur_opening_tag->flags & FL_CLOSED)) {
1972 						cur_opening_tag = cur_opening_tag->parent;
1973 					}
1974 
1975 					if (!cur_opening_tag) {
1976 						cur_opening_tag = hc->root_tag;
1977 					}
1978 
1979 					auto &&vtag = std::make_unique<html_tag>();
1980 					vtag->id = cur_tag->id;
1981 					vtag->flags = FL_VIRTUAL | FL_CLOSED | cur_tag->flags;
1982 					vtag->tag_start = cur_tag->closing.start;
1983 					vtag->content_offset = p - start + 1;
1984 					vtag->closing = cur_tag->closing;
1985 					vtag->parent = cur_opening_tag;
1986 					g_assert(vtag->parent != &cur_closing_tag);
1987 					cur_opening_tag->children.push_back(vtag.get());
1988 					hc->all_tags.emplace_back(std::move(vtag));
1989 					cur_tag = cur_opening_tag;
1990 					parent_tag = cur_tag->parent;
1991 					g_assert(cur_tag->parent != &cur_closing_tag);
1992 				}
1993 			} /* if cur_tag != nullptr */
1994 			state = html_text_content;
1995 			p++;
1996 			c = p;
1997 			break;
1998 		}
1999 		case tags_limit_overflow:
2000 			msg_warn_pool("tags limit of %d tags is reached at the position %d;"
2001 						  " ignoring the rest of the HTML content",
2002 					(int) hc->all_tags.size(), (int) (p - start));
2003 			c = p;
2004 			p = end;
2005 			break;
2006 		}
2007 	}
2008 
2009 	if (cur_tag && !(cur_tag->flags & FL_CLOSED) && cur_tag != &cur_closing_tag) {
2010 		cur_closing_tag.parent = cur_tag;
2011 		cur_closing_tag.id = cur_tag->id;
2012 		cur_tag = &cur_closing_tag;
2013 		html_check_balance(hc, cur_tag,
2014 				end - start, end - start);
2015 	}
2016 
2017 	/* Propagate styles */
2018 	hc->traverse_block_tags([&hc, &pool](const html_tag *tag) -> bool {
2019 
2020 		if (hc->css_style && tag->id > Tag_UNKNOWN && tag->id < Tag_MAX) {
2021 			auto *css_block = hc->css_style->check_tag_block(tag);
2022 
2023 			if (css_block) {
2024 				if (tag->block) {
2025 					tag->block->set_block(*css_block);
2026 				}
2027 				else {
2028 					tag->block = css_block;
2029 				}
2030 			}
2031 		}
2032 		if (tag->block) {
2033 			if (!tag->block->has_display()) {
2034 				/* If we have no display field, we can check it by tag */
2035 				if (tag->flags & CM_HEAD) {
2036 					tag->block->set_display(css::css_display_value::DISPLAY_HIDDEN,
2037 							html_block::set);
2038 				}
2039 				else if (tag->flags & (CM_BLOCK | CM_TABLE)) {
2040 					tag->block->set_display(css::css_display_value::DISPLAY_BLOCK,
2041 							html_block::implicit);
2042 				}
2043 				else if (tag->flags & CM_ROW) {
2044 					tag->block->set_display(css::css_display_value::DISPLAY_TABLE_ROW,
2045 							html_block::implicit);
2046 				}
2047 				else {
2048 					tag->block->set_display(css::css_display_value::DISPLAY_INLINE,
2049 							html_block::implicit);
2050 				}
2051 			}
2052 
2053 			tag->block->compute_visibility();
2054 
2055 			for (const auto *cld_tag : tag->children) {
2056 
2057 				if (cld_tag->block) {
2058 					cld_tag->block->propagate_block(*tag->block);
2059 				}
2060 				else {
2061 					cld_tag->block = rspamd_mempool_alloc0_type(pool, html_block);
2062 					*cld_tag->block = *tag->block;
2063 				}
2064 			}
2065 		}
2066 		return true;
2067 	}, html_content::traverse_type::PRE_ORDER);
2068 
2069 	/* Leftover before content */
2070 	switch (state) {
2071 	case tag_end_opening:
2072 		if (cur_tag != nullptr) {
2073 			process_opening_tag();
2074 		}
2075 		break;
2076 	default:
2077 		/* Do nothing */
2078 		break;
2079 	}
2080 
2081 	if (!hc->all_tags.empty() && hc->root_tag) {
2082 		html_append_tag_content(pool, start, end - start, hc, hc->root_tag,
2083 				exceptions, url_set);
2084 	}
2085 
2086 	/* Leftover after content */
2087 	switch (state) {
2088 	case tags_limit_overflow:
2089 		html_append_parsed(hc, {c, (std::size_t) (end - c)},
2090 				false, end - start, hc->parsed);
2091 		break;
2092 	default:
2093 		/* Do nothing */
2094 		break;
2095 	}
2096 
2097 	if (!hc->parsed.empty()) {
2098 		/* Trim extra spaces at the at the end if needed */
2099 		if (g_ascii_isspace(hc->parsed.back())) {
2100 			auto last_it = std::end(hc->parsed);
2101 
2102 			/* Allow last newline */
2103 			if (hc->parsed.back() == '\n') {
2104 				--last_it;
2105 			}
2106 
2107 			hc->parsed.erase(std::find_if(hc->parsed.rbegin(), hc->parsed.rend(),
2108 					[](auto ch) -> auto {
2109 						return !g_ascii_isspace(ch);
2110 					}).base(),
2111 					last_it);
2112 		}
2113 	}
2114 
2115 	return hc;
2116 }
2117 
2118 static auto
html_find_image_by_cid(const html_content & hc,std::string_view cid)2119 html_find_image_by_cid(const html_content &hc, std::string_view cid)
2120 -> std::optional<const html_image *>
2121 {
2122 	for (const auto *html_image : hc.images) {
2123 		/* Filter embedded images */
2124 		if (html_image->flags & RSPAMD_HTML_FLAG_IMAGE_EMBEDDED &&
2125 			html_image->src != nullptr) {
2126 			if (cid == html_image->src) {
2127 				return html_image;
2128 			}
2129 		}
2130 	}
2131 
2132 	return std::nullopt;
2133 }
2134 
2135 auto
html_debug_structure(const html_content & hc)2136 html_debug_structure(const html_content &hc) -> std::string
2137 {
2138 	std::string output;
2139 
2140 	if (hc.root_tag) {
2141 		auto rec_functor = [&](const html_tag *t, int level, auto rec_functor) -> void {
2142 			std::string pluses(level, '+');
2143 
2144 			if (!(t->flags & (FL_VIRTUAL | FL_IGNORE))) {
2145 				if (t->flags & FL_XML) {
2146 					output += fmt::format("{}xml;", pluses);
2147 				}
2148 				else {
2149 					output += fmt::format("{}{};", pluses,
2150 							html_tags_defs.name_by_id_safe(t->id));
2151 				}
2152 				level++;
2153 			}
2154 			for (const auto *cld : t->children) {
2155 				rec_functor(cld, level, rec_functor);
2156 			}
2157 		};
2158 
2159 		rec_functor(hc.root_tag, 1, rec_functor);
2160 	}
2161 
2162 	return output;
2163 }
2164 
html_tag_by_name(const std::string_view & name)2165 auto html_tag_by_name(const std::string_view &name)
2166 -> std::optional<tag_id_t>
2167 {
2168 	const auto *td = rspamd::html::html_tags_defs.by_name(name);
2169 
2170 	if (td != nullptr) {
2171 		return td->id;
2172 	}
2173 
2174 	return std::nullopt;
2175 }
2176 
2177 auto
get_content(const struct html_content * hc) const2178 html_tag::get_content(const struct html_content *hc) const -> std::string_view
2179 {
2180 	const std::string *dest = &hc->parsed;
2181 
2182 	if (block && !block->is_visible()) {
2183 		dest = &hc->invisible;
2184 	}
2185 	const auto clen = get_content_length();
2186 	if (content_offset < dest->size()) {
2187 		if (dest->size() - content_offset >= clen) {
2188 			return std::string_view{*dest}.substr(content_offset, clen);
2189 		}
2190 		else {
2191 			return std::string_view{*dest}.substr(content_offset, dest->size() - content_offset);
2192 		}
2193 	}
2194 
2195 	return std::string_view{};
2196 }
2197 
2198 }
2199 
2200 void *
rspamd_html_process_part_full(rspamd_mempool_t * pool,GByteArray * in,GList ** exceptions,khash_t (rspamd_url_hash)* url_set,GPtrArray * part_urls,bool allow_css)2201 rspamd_html_process_part_full(rspamd_mempool_t *pool,
2202 							  GByteArray *in, GList **exceptions,
2203 							  khash_t (rspamd_url_hash) *url_set,
2204 							  GPtrArray *part_urls,
2205 							  bool allow_css)
2206 {
2207 	return rspamd::html::html_process_input(pool, in, exceptions, url_set,
2208 			part_urls, allow_css);
2209 }
2210 
2211 void *
rspamd_html_process_part(rspamd_mempool_t * pool,GByteArray * in)2212 rspamd_html_process_part(rspamd_mempool_t *pool,
2213 						 GByteArray *in)
2214 {
2215 	return rspamd_html_process_part_full (pool, in, NULL,
2216 			NULL, NULL, FALSE);
2217 }
2218 
2219 guint
rspamd_html_decode_entitles_inplace(gchar * s,gsize len)2220 rspamd_html_decode_entitles_inplace (gchar *s, gsize len)
2221 {
2222 	return rspamd::html::decode_html_entitles_inplace(s, len);
2223 }
2224 
2225 gint
rspamd_html_tag_by_name(const gchar * name)2226 rspamd_html_tag_by_name(const gchar *name)
2227 {
2228 	const auto *td = rspamd::html::html_tags_defs.by_name(name);
2229 
2230 	if (td != nullptr) {
2231 		return td->id;
2232 	}
2233 
2234 	return -1;
2235 }
2236 
2237 gboolean
rspamd_html_tag_seen(void * ptr,const gchar * tagname)2238 rspamd_html_tag_seen(void *ptr, const gchar *tagname)
2239 {
2240 	gint id;
2241 	auto *hc = rspamd::html::html_content::from_ptr(ptr);
2242 
2243 	g_assert (hc != NULL);
2244 
2245 	id = rspamd_html_tag_by_name(tagname);
2246 
2247 	if (id != -1) {
2248 		return hc->tags_seen[id];
2249 	}
2250 
2251 	return FALSE;
2252 }
2253 
2254 const gchar *
rspamd_html_tag_by_id(gint id)2255 rspamd_html_tag_by_id(gint id)
2256 {
2257 	if (id > Tag_UNKNOWN && id < Tag_MAX) {
2258 		const auto *td = rspamd::html::html_tags_defs.by_id(id);
2259 
2260 		if (td != nullptr) {
2261 			return td->name.c_str();
2262 		}
2263 	}
2264 
2265 	return nullptr;
2266 }
2267 
2268 const gchar *
rspamd_html_tag_name(void * p,gsize * len)2269 rspamd_html_tag_name(void *p, gsize *len)
2270 {
2271 	auto *tag = reinterpret_cast<rspamd::html::html_tag *>(p);
2272 	auto tname = rspamd::html::html_tags_defs.name_by_id_safe(tag->id);
2273 
2274 	if (len) {
2275 		*len = tname.size();
2276 	}
2277 
2278 	return tname.data();
2279 }
2280 
2281 struct html_image*
rspamd_html_find_embedded_image(void * html_content,const char * cid,gsize cid_len)2282 rspamd_html_find_embedded_image(void *html_content,
2283 								const char *cid, gsize cid_len)
2284 {
2285 	auto *hc = rspamd::html::html_content::from_ptr(html_content);
2286 
2287 	auto maybe_img = rspamd::html::html_find_image_by_cid(*hc, {cid, cid_len});
2288 
2289 	if (maybe_img) {
2290 		return (html_image *)maybe_img.value();
2291 	}
2292 
2293 	return nullptr;
2294 }
2295 
2296 bool
rspamd_html_get_parsed_content(void * html_content,rspamd_ftok_t * dest)2297 rspamd_html_get_parsed_content(void *html_content, rspamd_ftok_t *dest)
2298 {
2299 	auto *hc = rspamd::html::html_content::from_ptr(html_content);
2300 
2301 	dest->begin = hc->parsed.data();
2302 	dest->len = hc->parsed.size();
2303 
2304 	return true;
2305 }
2306 
2307 gsize
rspamd_html_get_tags_count(void * html_content)2308 rspamd_html_get_tags_count(void *html_content)
2309 {
2310 	auto *hc = rspamd::html::html_content::from_ptr(html_content);
2311 
2312 	if (!hc) {
2313 		return 0;
2314 	}
2315 
2316 	return hc->all_tags.size();
2317 }