1 /*-
2  * Copyright 2021 Vsevolod Stakhov
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *   http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 #include "config.h"
17 #include "util.h"
18 #include "message.h"
19 #include "html.h"
20 #include "html_tags.h"
21 #include "html_block.hxx"
22 #include "html.hxx"
23 #include "libserver/css/css_value.hxx"
24 #include "libserver/css/css.hxx"
25 
26 #include "url.h"
27 #include "contrib/libucl/khash.h"
28 #include "libmime/images.h"
29 #include "libutil/cxx/utf8_util.h"
30 
31 #include "html_tag_defs.hxx"
32 #include "html_entities.hxx"
33 #include "html_tag.hxx"
34 #include "html_url.hxx"
35 
36 #include <frozen/unordered_map.h>
37 #include <frozen/string.h>
38 #include <fmt/core.h>
39 
40 #include <unicode/uversion.h>
41 
42 namespace rspamd::html {
43 
44 static const guint max_tags = 8192; /* Ignore tags if this maximum is reached */
45 
46 static const html_tags_storage html_tags_defs;
47 
48 auto html_components_map = frozen::make_unordered_map<frozen::string, html_component_type>(
49 		{
50 				{"name",    html_component_type::RSPAMD_HTML_COMPONENT_NAME},
51 				{"href",    html_component_type::RSPAMD_HTML_COMPONENT_HREF},
52 				{"src",     html_component_type::RSPAMD_HTML_COMPONENT_HREF},
53 				{"action",  html_component_type::RSPAMD_HTML_COMPONENT_HREF},
54 				{"color",   html_component_type::RSPAMD_HTML_COMPONENT_COLOR},
55 				{"bgcolor", html_component_type::RSPAMD_HTML_COMPONENT_BGCOLOR},
56 				{"style",   html_component_type::RSPAMD_HTML_COMPONENT_STYLE},
57 				{"class",   html_component_type::RSPAMD_HTML_COMPONENT_CLASS},
58 				{"width",   html_component_type::RSPAMD_HTML_COMPONENT_WIDTH},
59 				{"height",  html_component_type::RSPAMD_HTML_COMPONENT_HEIGHT},
60 				{"size",    html_component_type::RSPAMD_HTML_COMPONENT_SIZE},
61 				{"rel",     html_component_type::RSPAMD_HTML_COMPONENT_REL},
62 				{"alt",     html_component_type::RSPAMD_HTML_COMPONENT_ALT},
63 				{"id",      html_component_type::RSPAMD_HTML_COMPONENT_ID},
64 				{"hidden",  html_component_type::RSPAMD_HTML_COMPONENT_HIDDEN},
65 		});
66 
67 #define msg_debug_html(...)  rspamd_conditional_debug_fast (NULL, NULL, \
68         rspamd_html_log_id, "html", pool->tag.uid, \
69         __FUNCTION__, \
70         __VA_ARGS__)
71 
INIT_LOG_MODULE(html)72 INIT_LOG_MODULE(html)
73 
74 /*
75  * This function is expected to be called on a closing tag to fill up all tags
76  * and return the current parent (meaning unclosed) tag
77  */
78 static auto
79 html_check_balance(struct html_content *hc,
80 				   struct html_tag *tag,
81 				   goffset tag_start_offset,
82 				   goffset tag_end_offset) -> html_tag *
83 {
84 	/* As agreed, the closing tag has the last opening at the parent ptr */
85 	auto *opening_tag = tag->parent;
86 
87 	auto calculate_content_length = [tag_start_offset, tag_end_offset](html_tag *t) {
88 		auto opening_content_offset = t->content_offset;
89 
90 		if (t->flags & (CM_EMPTY)) {
91 			/* Attach closing tag just at the opening tag */
92 			t->closing.start = t->tag_start;
93 			t->closing.end = t->content_offset;
94 		}
95 		else {
96 
97 			if (opening_content_offset <= tag_start_offset) {
98 				t->closing.start = tag_start_offset;
99 				t->closing.end = tag_end_offset;
100 			}
101 			else {
102 
103 				t->closing.start = t->content_offset;
104 				t->closing.end = tag_end_offset;
105 			}
106 		}
107 	};
108 
109 	auto balance_tag = [&]() -> html_tag * {
110 		auto it = tag->parent;
111 		auto found_pair = false;
112 
113 		for (; it != nullptr; it = it->parent) {
114 			if (it->id == tag->id && !(it->flags & FL_CLOSED)) {
115 				found_pair = true;
116 				break;
117 			}
118 
119 		}
120 
121 		/*
122 		 * If we have found a closing pair, then we need to close all tags and
123 		 * return the top-most tag
124 		 */
125 		if (found_pair) {
126 			for (it = tag->parent; it != nullptr; it = it->parent) {
127 				it->flags |= FL_CLOSED;
128 				/* Insert a virtual closing tag for all tags that are not closed */
129 				calculate_content_length(it);
130 				if (it->id == tag->id && !(it->flags & FL_CLOSED)) {
131 					break;
132 				}
133 			}
134 
135 			return it;
136 		}
137 		else {
138 			/*
139 			 * We have not found a pair, so this closing tag is bogus and should
140 			 * be ignored completely.
141 			 * Unfortunately, it also means that we need to insert another tag,
142 			 * as the current closing tag is unusable for that purposes.
143 			 *
144 			 * We assume that callee will recognise that and reconstruct the
145 			 * tag at the tag_end_closing state, so we return nullptr...
146 			 */
147 
148 		}
149 
150 		/* Tag must be ignored and reconstructed */
151 		return nullptr;
152 	};
153 
154 	if (opening_tag) {
155 
156 		if (opening_tag->id == tag->id) {
157 			opening_tag->flags |= FL_CLOSED;
158 
159 			calculate_content_length(opening_tag);
160 			/* All good */
161 			return opening_tag->parent;
162 		}
163 		else {
164 			return balance_tag();
165 		}
166 	}
167 	else {
168 		/*
169 		 * We have no opening tag
170 		 * There are two possibilities:
171 		 *
172 		 * 1) We have some block tag in hc->all_tags;
173 		 * 2) We have no tags
174 		 */
175 
176 		if (hc->all_tags.empty()) {
177 			hc->all_tags.push_back(std::make_unique<html_tag>());
178 			auto *vtag = hc->all_tags.back().get();
179 			vtag->id = Tag_HTML;
180 			vtag->flags = FL_VIRTUAL;
181 			vtag->tag_start = 0;
182 			vtag->content_offset = 0;
183 			calculate_content_length(vtag);
184 
185 			if (!hc->root_tag) {
186 				hc->root_tag = vtag;
187 			}
188 			else {
189 				vtag->parent = hc->root_tag;
190 			}
191 
192 			tag->parent = vtag;
193 
194 			/* Recursively call with a virtual <html> tag inserted */
195 			return html_check_balance(hc, tag, tag_start_offset, tag_end_offset);
196 		}
197 	}
198 
199 	return nullptr;
200 }
201 
202 auto
html_component_from_string(const std::string_view & st)203 html_component_from_string(const std::string_view &st) -> std::optional<html_component_type>
204 {
205 	auto known_component_it = html_components_map.find(st);
206 
207 	if (known_component_it != html_components_map.end()) {
208 		return known_component_it->second;
209 	}
210 	else {
211 		return std::nullopt;
212 	}
213 }
214 
215 struct tag_content_parser_state {
216 	int cur_state = 0;
217 	std::string buf;
218 	std::optional<html_component_type> cur_component;
219 
resetrspamd::html::tag_content_parser_state220 	void reset()
221 	{
222 		cur_state = 0;
223 		buf.clear();
224 		cur_component = std::nullopt;
225 	}
226 };
227 
228 static inline void
html_parse_tag_content(rspamd_mempool_t * pool,struct html_content * hc,struct html_tag * tag,const char * in,struct tag_content_parser_state & parser_env)229 html_parse_tag_content(rspamd_mempool_t *pool,
230 					   struct html_content *hc,
231 					   struct html_tag *tag,
232 					   const char *in,
233 					   struct tag_content_parser_state &parser_env)
234 {
235 	enum tag_parser_state {
236 		parse_start = 0,
237 		parse_name,
238 		parse_attr_name,
239 		parse_equal,
240 		parse_start_dquote,
241 		parse_dqvalue,
242 		parse_end_dquote,
243 		parse_start_squote,
244 		parse_sqvalue,
245 		parse_end_squote,
246 		parse_value,
247 		spaces_before_eq,
248 		spaces_after_eq,
249 		spaces_after_param,
250 		ignore_bad_tag,
251 		tag_end,
252 		slash_after_value,
253 		slash_in_unqouted_value,
254 	} state;
255 
256 	state = static_cast<enum tag_parser_state>(parser_env.cur_state);
257 
258 	/*
259 	 * Stores tag component if it doesn't exist, performing copy of the
260 	 * value + decoding of the entities
261 	 * Parser env is set to clear the current html attribute fields (saved_p and
262 	 * cur_component)
263 	 */
264 	auto store_component_value = [&]() -> void {
265 		if (parser_env.cur_component) {
266 
267 			if (parser_env.buf.empty()) {
268 				tag->components.emplace_back(parser_env.cur_component.value(),
269 						std::string_view{});
270 			}
271 			else {
272 				/* We need to copy buf to a persistent storage */
273 				auto *s = rspamd_mempool_alloc_buffer(pool, parser_env.buf.size());
274 
275 				if (parser_env.cur_component.value() == html_component_type::RSPAMD_HTML_COMPONENT_ID ||
276 						parser_env.cur_component.value() == html_component_type::RSPAMD_HTML_COMPONENT_CLASS) {
277 					/* Lowercase */
278 					rspamd_str_copy_lc(parser_env.buf.data(), s, parser_env.buf.size());
279 				}
280 				else {
281 					memcpy(s, parser_env.buf.data(), parser_env.buf.size());
282 				}
283 
284 				auto sz = rspamd_html_decode_entitles_inplace(s, parser_env.buf.size());
285 				tag->components.emplace_back(parser_env.cur_component.value(),
286 						std::string_view{s, sz});
287 			}
288 		}
289 
290 		parser_env.buf.clear();
291 		parser_env.cur_component = std::nullopt;
292 	};
293 
294 	auto store_component_name = [&]() -> bool {
295 		decode_html_entitles_inplace(parser_env.buf);
296 		auto known_component_it = html_components_map.find(std::string_view{parser_env.buf});
297 		parser_env.buf.clear();
298 
299 		if (known_component_it != html_components_map.end()) {
300 			parser_env.cur_component = known_component_it->second;
301 
302 			return true;
303 		}
304 		else {
305 			parser_env.cur_component = std::nullopt;
306 		}
307 
308 		return false;
309 	};
310 
311 	auto store_value_character = [&](bool lc) -> void {
312 		auto c = lc ? g_ascii_tolower(*in) : *in;
313 
314 		if (c == '\0') {
315 			/* Replace with u0FFD */
316 			parser_env.buf.append(u8"\uFFFD");
317 		}
318 		else {
319 			parser_env.buf.push_back(c);
320 		}
321 	};
322 
323 	switch (state) {
324 	case parse_start:
325 		if (!g_ascii_isalpha (*in) && !g_ascii_isspace (*in)) {
326 			hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
327 			state = ignore_bad_tag;
328 			tag->id = N_TAGS;
329 			tag->flags |= FL_BROKEN;
330 		}
331 		else if (g_ascii_isalpha (*in)) {
332 			state = parse_name;
333 			store_value_character(true);
334 		}
335 		break;
336 
337 	case parse_name:
338 		if ((g_ascii_isspace (*in) || *in == '>' || *in == '/')) {
339 			if (*in == '/') {
340 				tag->flags |= FL_CLOSED;
341 			}
342 
343 			if (parser_env.buf.empty()) {
344 				hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
345 				tag->id = N_TAGS;
346 				tag->flags |= FL_BROKEN;
347 				state = ignore_bad_tag;
348 			}
349 			else {
350 				decode_html_entitles_inplace(parser_env.buf);
351 				const auto *tag_def = rspamd::html::html_tags_defs.by_name(parser_env.buf);
352 
353 				if (tag_def == nullptr) {
354 					hc->flags |= RSPAMD_HTML_FLAG_UNKNOWN_ELEMENTS;
355 					/* Assign -hash to match closing tag if needed */
356 					auto nhash = static_cast<std::int32_t>(std::hash<std::string>{}(parser_env.buf));
357 					/* Always negative */
358 					tag->id = static_cast<tag_id_t>(nhash | G_MININT32);
359 				}
360 				else {
361 					tag->id = tag_def->id;
362 					tag->flags = tag_def->flags;
363 				}
364 
365 				parser_env.buf.clear();
366 
367 				state = spaces_after_param;
368 			}
369 		}
370 		else {
371 			store_value_character(true);
372 		}
373 		break;
374 
375 	case parse_attr_name:
376 		if (*in == '=') {
377 			if (!parser_env.buf.empty()) {
378 				store_component_name();
379 			}
380 			state = parse_equal;
381 		}
382 		else if (g_ascii_isspace(*in)) {
383 			store_component_name();
384 			state = spaces_before_eq;
385 		}
386 		else if (*in == '/') {
387 			store_component_name();
388 			store_component_value();
389 			state = slash_after_value;
390 		}
391 		else if (*in == '>') {
392 			store_component_name();
393 			store_component_value();
394 			state = tag_end;
395 		}
396 		else {
397 			if (*in == '"' || *in == '\'' || *in == '<') {
398 				/* Should never be in attribute names but ignored */
399 				tag->flags |= FL_BROKEN;
400 			}
401 
402 			store_value_character(true);
403 		}
404 
405 		break;
406 
407 	case spaces_before_eq:
408 		if (*in == '=') {
409 			state = parse_equal;
410 		}
411 		else if (!g_ascii_isspace (*in)) {
412 			/*
413 			 * HTML defines that crap could still be restored and
414 			 * calculated somehow... So we have to follow this stupid behaviour
415 			 */
416 			/*
417 			 * TODO: estimate what insane things do email clients in each case
418 			 */
419 			if (*in == '>') {
420 				/*
421 				 * Attribtute name followed by end of tag
422 				 * Should be okay (empty attribute). The rest is handled outside
423 				 * this automata.
424 				 */
425 				store_component_value();
426 				state = tag_end;
427 			}
428 			else if (*in == '"' || *in == '\'' || *in == '<') {
429 				/* Attribute followed by quote... Missing '=' ? Dunno, need to test */
430 				hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
431 				tag->flags |= FL_BROKEN;
432 				store_component_value();
433 				store_value_character(true);
434 				state = spaces_after_param;
435 			}
436 			else {
437 				/* Empty attribute */
438 				store_component_value();
439 				store_value_character(true);
440 				state = spaces_after_param;
441 			}
442 		}
443 		break;
444 
445 	case spaces_after_eq:
446 		if (*in == '"') {
447 			state = parse_start_dquote;
448 		}
449 		else if (*in == '\'') {
450 			state = parse_start_squote;
451 		}
452 		else if (!g_ascii_isspace (*in)) {
453 			store_value_character(true);
454 			state = parse_value;
455 		}
456 		break;
457 
458 	case parse_equal:
459 		if (g_ascii_isspace (*in)) {
460 			state = spaces_after_eq;
461 		}
462 		else if (*in == '"') {
463 			state = parse_start_dquote;
464 		}
465 		else if (*in == '\'') {
466 			state = parse_start_squote;
467 		}
468 		else {
469 			store_value_character(true);
470 			state = parse_value;
471 		}
472 		break;
473 
474 	case parse_start_dquote:
475 		if (*in == '"') {
476 			state = spaces_after_param;
477 		}
478 		else {
479 			store_value_character(false);
480 			state = parse_dqvalue;
481 		}
482 		break;
483 
484 	case parse_start_squote:
485 		if (*in == '\'') {
486 			state = spaces_after_param;
487 		}
488 		else {
489 			store_value_character(false);
490 			state = parse_sqvalue;
491 		}
492 		break;
493 
494 	case parse_dqvalue:
495 		if (*in == '"') {
496 			store_component_value();
497 			state = parse_end_dquote;
498 		}
499 		else {
500 			store_value_character(false);
501 		}
502 		break;
503 
504 	case parse_sqvalue:
505 		if (*in == '\'') {
506 			store_component_value();
507 			state = parse_end_squote;
508 		}
509 		else {
510 			store_value_character(false);
511 		}
512 
513 		break;
514 
515 	case parse_value:
516 		if (*in == '/') {
517 			state = slash_in_unqouted_value;
518 		}
519 		else if (g_ascii_isspace (*in) || *in == '>' || *in == '"') {
520 			store_component_value();
521 			state = spaces_after_param;
522 		}
523 		else {
524 			store_value_character(false);
525 		}
526 		break;
527 
528 	case parse_end_dquote:
529 	case parse_end_squote:
530 		if (g_ascii_isspace (*in)) {
531 			state = spaces_after_param;
532 		}
533 		else if (*in == '/') {
534 			store_component_value();
535 			store_value_character(true);
536 			state = slash_after_value;
537 		}
538 		else {
539 			/* No space, proceed immediately to the attribute name */
540 			state = parse_attr_name;
541 			store_component_value();
542 			store_value_character(true);
543 		}
544 		break;
545 
546 	case spaces_after_param:
547 		if (!g_ascii_isspace (*in)) {
548 			if (*in == '/') {
549 				state = slash_after_value;
550 			}
551 			else if (*in == '=') {
552 				/* Attributes cannot start with '=' */
553 				tag->flags |= FL_BROKEN;
554 				store_value_character(true);
555 				state = parse_attr_name;
556 			}
557 			else {
558 				store_value_character(true);
559 				state = parse_attr_name;
560 			}
561 		}
562 		break;
563 	case slash_after_value:
564 		if (*in == '>') {
565 			tag->flags |= FL_CLOSED;
566 			state = tag_end;
567 		}
568 		else if (!g_ascii_isspace(*in)) {
569 			tag->flags |= FL_BROKEN;
570 			state = parse_attr_name;
571 		}
572 		break;
573 	case slash_in_unqouted_value:
574 		if (*in == '>') {
575 			/* That slash was in fact closing tag slash, wohoo */
576 			tag->flags |= FL_CLOSED;
577 			state = tag_end;
578 			store_component_value();
579 		}
580 		else {
581 			/* Welcome to the world of html, revert state and save missing / */
582 			parser_env.buf.push_back('/');
583 			store_value_character(false);
584 			state = parse_value;
585 		}
586 		break;
587 	case ignore_bad_tag:
588 	case tag_end:
589 		break;
590 	}
591 
592 	parser_env.cur_state = state;
593 }
594 
595 static inline auto
html_is_absolute_url(std::string_view st)596 html_is_absolute_url(std::string_view st) -> bool
597 {
598 	auto alnum_pos = std::find_if(std::begin(st), std::end(st),
599 			[](auto c) {return !g_ascii_isalnum(c);});
600 
601 	if (alnum_pos != std::end(st) && alnum_pos != std::begin(st)) {
602 		if (*alnum_pos == ':') {
603 			if (st.substr(0, std::distance(std::begin(st), alnum_pos)) == "mailto") {
604 				return true;
605 			}
606 
607 			std::advance(alnum_pos, 1);
608 			if (alnum_pos != std::end(st)) {
609 				/* Include even malformed urls */
610 				if (*alnum_pos == '/' || *alnum_pos == '\\') {
611 					return true;
612 				}
613 			}
614 		}
615 	}
616 
617 	return false;
618 }
619 
620 static auto
html_process_url_tag(rspamd_mempool_t * pool,struct html_tag * tag,struct html_content * hc)621 html_process_url_tag(rspamd_mempool_t *pool,
622 					 struct html_tag *tag,
623 					 struct html_content *hc) -> std::optional<struct rspamd_url *>
624 {
625 	auto found_href_maybe = tag->find_component(html_component_type::RSPAMD_HTML_COMPONENT_HREF);
626 
627 	if (found_href_maybe) {
628 		/* Check base url */
629 		auto &href_value = found_href_maybe.value();
630 
631 		if (hc && hc->base_url) {
632 			/*
633 			 * Relative url cannot start from the following:
634 			 * schema://
635 			 * data:
636 			 * slash
637 			 */
638 
639 			if (!html_is_absolute_url(href_value)) {
640 
641 				if (href_value.size() >= sizeof("data:") &&
642 					g_ascii_strncasecmp(href_value.data(), "data:", sizeof("data:") - 1) == 0) {
643 					/* Image data url, never insert as url */
644 					return std::nullopt;
645 				}
646 
647 				/* Assume relative url */
648 				auto need_slash = false;
649 
650 				auto orig_len = href_value.size();
651 				auto len = orig_len + hc->base_url->urllen;
652 
653 				if (hc->base_url->datalen == 0) {
654 					need_slash = true;
655 					len++;
656 				}
657 
658 				auto *buf = rspamd_mempool_alloc_buffer(pool, len + 1);
659 				auto nlen = (std::size_t) rspamd_snprintf(buf, len + 1,
660 						"%*s%s%*s",
661 						(int) hc->base_url->urllen, hc->base_url->string,
662 						need_slash ? "/" : "",
663 						(gint) orig_len, href_value.data());
664 				href_value = {buf, nlen};
665 			}
666 			else if (href_value.size() > 2 && href_value[0] == '/' && href_value[1] != '/') {
667 				/* Relative to the hostname */
668 				auto orig_len = href_value.size();
669 				auto len = orig_len + hc->base_url->hostlen + hc->base_url->protocollen +
670 						   3 /* for :// */;
671 				auto *buf = rspamd_mempool_alloc_buffer(pool, len + 1);
672 				auto nlen = (std::size_t) rspamd_snprintf(buf, len + 1, "%*s://%*s/%*s",
673 						(int) hc->base_url->protocollen, hc->base_url->string,
674 						(int) hc->base_url->hostlen, rspamd_url_host_unsafe (hc->base_url),
675 						(gint) orig_len, href_value.data());
676 				href_value = {buf, nlen};
677 			}
678 		}
679 
680 		auto url = html_process_url(pool, href_value);
681 
682 		if (url && std::holds_alternative<std::monostate>(tag->extra)) {
683 			tag->extra = url.value();
684 		}
685 
686 		return url;
687 	}
688 
689 	return std::nullopt;
690 }
691 
692 struct rspamd_html_url_query_cbd {
693 	rspamd_mempool_t *pool;
694 	khash_t (rspamd_url_hash) *url_set;
695 	struct rspamd_url *url;
696 	GPtrArray *part_urls;
697 };
698 
699 static gboolean
html_url_query_callback(struct rspamd_url * url,gsize start_offset,gsize end_offset,gpointer ud)700 html_url_query_callback(struct rspamd_url *url, gsize start_offset,
701 						gsize end_offset, gpointer ud)
702 {
703 	struct rspamd_html_url_query_cbd *cbd =
704 			(struct rspamd_html_url_query_cbd *) ud;
705 	rspamd_mempool_t *pool;
706 
707 	pool = cbd->pool;
708 
709 	if (url->protocol == PROTOCOL_MAILTO) {
710 		if (url->userlen == 0) {
711 			return FALSE;
712 		}
713 	}
714 
715 	msg_debug_html ("found url %s in query of url"
716 					" %*s", url->string,
717 			cbd->url->querylen, rspamd_url_query_unsafe(cbd->url));
718 
719 	url->flags |= RSPAMD_URL_FLAG_QUERY;
720 
721 	if (rspamd_url_set_add_or_increase(cbd->url_set, url, false)
722 		&& cbd->part_urls) {
723 		g_ptr_array_add(cbd->part_urls, url);
724 	}
725 
726 	return TRUE;
727 }
728 
729 static void
html_process_query_url(rspamd_mempool_t * pool,struct rspamd_url * url,khash_t (rspamd_url_hash)* url_set,GPtrArray * part_urls)730 html_process_query_url(rspamd_mempool_t *pool, struct rspamd_url *url,
731 					   khash_t (rspamd_url_hash) *url_set,
732 					   GPtrArray *part_urls)
733 {
734 	if (url->querylen > 0) {
735 		struct rspamd_html_url_query_cbd qcbd;
736 
737 		qcbd.pool = pool;
738 		qcbd.url_set = url_set;
739 		qcbd.url = url;
740 		qcbd.part_urls = part_urls;
741 
742 		rspamd_url_find_multiple(pool,
743 				rspamd_url_query_unsafe (url), url->querylen,
744 				RSPAMD_URL_FIND_ALL, NULL,
745 				html_url_query_callback, &qcbd);
746 	}
747 
748 	if (part_urls) {
749 		g_ptr_array_add(part_urls, url);
750 	}
751 }
752 
753 static auto
html_process_data_image(rspamd_mempool_t * pool,struct html_image * img,std::string_view input)754 html_process_data_image(rspamd_mempool_t *pool,
755 						struct html_image *img,
756 						std::string_view input) -> void
757 {
758 	/*
759 	 * Here, we do very basic processing of the data:
760 	 * detect if we have something like: `data:image/xxx;base64,yyyzzz==`
761 	 * We only parse base64 encoded data.
762 	 * We ignore content type so far
763 	 */
764 	struct rspamd_image *parsed_image;
765 	const gchar *semicolon_pos = input.data(),
766 			*end = input.data() + input.size();
767 
768 	if ((semicolon_pos = (const gchar *) memchr(semicolon_pos, ';', end - semicolon_pos)) != NULL) {
769 		if (end - semicolon_pos > sizeof("base64,")) {
770 			if (memcmp(semicolon_pos + 1, "base64,", sizeof("base64,") - 1) == 0) {
771 				const gchar *data_pos = semicolon_pos + sizeof("base64,");
772 				gchar *decoded;
773 				gsize encoded_len = end - data_pos, decoded_len;
774 				rspamd_ftok_t inp;
775 
776 				decoded_len = (encoded_len / 4 * 3) + 12;
777 				decoded = rspamd_mempool_alloc_buffer(pool, decoded_len);
778 				rspamd_cryptobox_base64_decode(data_pos, encoded_len,
779 						reinterpret_cast<guchar *>(decoded), &decoded_len);
780 				inp.begin = decoded;
781 				inp.len = decoded_len;
782 
783 				parsed_image = rspamd_maybe_process_image(pool, &inp);
784 
785 				if (parsed_image) {
786 					msg_debug_html ("detected %s image of size %ud x %ud in data url",
787 							rspamd_image_type_str(parsed_image->type),
788 							parsed_image->width, parsed_image->height);
789 					img->embedded_image = parsed_image;
790 				}
791 			}
792 		}
793 		else {
794 			/* Nothing useful */
795 			return;
796 		}
797 	}
798 }
799 
800 static void
html_process_img_tag(rspamd_mempool_t * pool,struct html_tag * tag,struct html_content * hc,khash_t (rspamd_url_hash)* url_set,GPtrArray * part_urls)801 html_process_img_tag(rspamd_mempool_t *pool,
802 					 struct html_tag *tag,
803 					 struct html_content *hc,
804 					 khash_t (rspamd_url_hash) *url_set,
805 					 GPtrArray *part_urls)
806 {
807 	struct html_image *img;
808 
809 	img = rspamd_mempool_alloc0_type (pool, struct html_image);
810 	img->tag = tag;
811 
812 	for (const auto &param : tag->components) {
813 
814 		if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_HREF) {
815 			/* Check base url */
816 			const auto &href_value = param.value;
817 
818 			if (href_value.size() > 0) {
819 				rspamd_ftok_t fstr;
820 				fstr.begin = href_value.data();
821 				fstr.len = href_value.size();
822 				img->src = rspamd_mempool_ftokdup (pool, &fstr);
823 
824 				if (href_value.size() > sizeof("cid:") - 1 && memcmp(href_value.data(),
825 						"cid:", sizeof("cid:") - 1) == 0) {
826 					/* We have an embedded image */
827 					img->src += sizeof("cid:") - 1;
828 					img->flags |= RSPAMD_HTML_FLAG_IMAGE_EMBEDDED;
829 				}
830 				else {
831 					if (href_value.size() > sizeof("data:") - 1 && memcmp(href_value.data(),
832 							"data:", sizeof("data:") - 1) == 0) {
833 						/* We have an embedded image in HTML tag */
834 						img->flags |=
835 								(RSPAMD_HTML_FLAG_IMAGE_EMBEDDED | RSPAMD_HTML_FLAG_IMAGE_DATA);
836 						html_process_data_image(pool, img, href_value);
837 						hc->flags |= RSPAMD_HTML_FLAG_HAS_DATA_URLS;
838 					}
839 					else {
840 						img->flags |= RSPAMD_HTML_FLAG_IMAGE_EXTERNAL;
841 						if (img->src) {
842 
843 							std::string_view cpy{href_value};
844 							auto maybe_url = html_process_url(pool, cpy);
845 
846 							if (maybe_url) {
847 								img->url = maybe_url.value();
848 								struct rspamd_url *existing;
849 
850 								img->url->flags |= RSPAMD_URL_FLAG_IMAGE;
851 								existing = rspamd_url_set_add_or_return(url_set,
852 										img->url);
853 
854 								if (existing && existing != img->url) {
855 									/*
856 									 * We have some other URL that could be
857 									 * found, e.g. from another part. However,
858 									 * we still want to set an image flag on it
859 									 */
860 									existing->flags |= img->url->flags;
861 									existing->count++;
862 								}
863 								else if (part_urls) {
864 									/* New url */
865 									g_ptr_array_add(part_urls, img->url);
866 								}
867 							}
868 						}
869 					}
870 				}
871 			}
872 		}
873 
874 
875 		if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_HEIGHT) {
876 			unsigned long val;
877 
878 			rspamd_strtoul(param.value.data(), param.value.size(), &val);
879 			img->height = val;
880 		}
881 
882 		if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_WIDTH) {
883 			unsigned long val;
884 
885 			rspamd_strtoul(param.value.data(), param.value.size(), &val);
886 			img->width = val;
887 		}
888 
889 		/* TODO: rework to css at some time */
890 		if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_STYLE) {
891 			if (img->height == 0) {
892 				auto style_st = param.value;
893 				auto pos = rspamd_substring_search_caseless(style_st.data(),
894 						style_st.size(),
895 						"height", sizeof("height") - 1);
896 				if (pos != -1) {
897 					auto substr = style_st.substr(pos + sizeof("height") - 1);
898 
899 					for (auto i = 0; i < substr.size(); i++) {
900 						auto t = substr[i];
901 						if (g_ascii_isdigit (t)) {
902 							unsigned long val;
903 							rspamd_strtoul(substr.data(),
904 									substr.size(), &val);
905 							img->height = val;
906 							break;
907 						}
908 						else if (!g_ascii_isspace (t) && t != '=' && t != ':') {
909 							/* Fallback */
910 							break;
911 						}
912 					}
913 				}
914 			}
915 			if (img->width == 0) {
916 				auto style_st = param.value;
917 				auto pos = rspamd_substring_search_caseless(style_st.data(),
918 						style_st.size(),
919 						"width", sizeof("width") - 1);
920 				if (pos != -1) {
921 					auto substr = style_st.substr(pos + sizeof("width") - 1);
922 
923 					for (auto i = 0; i < substr.size(); i++) {
924 						auto t = substr[i];
925 						if (g_ascii_isdigit (t)) {
926 							unsigned long val;
927 							rspamd_strtoul(substr.data(),
928 									substr.size(), &val);
929 							img->width = val;
930 							break;
931 						}
932 						else if (!g_ascii_isspace (t) && t != '=' && t != ':') {
933 							/* Fallback */
934 							break;
935 						}
936 					}
937 				}
938 			}
939 		}
940 	}
941 
942 	if (img->embedded_image) {
943 		if (img->height == 0) {
944 			img->height = img->embedded_image->height;
945 		}
946 		if (img->width == 0) {
947 			img->width = img->embedded_image->width;
948 		}
949 	}
950 
951 	hc->images.push_back(img);
952 	tag->extra = img;
953 }
954 
955 static auto
html_process_link_tag(rspamd_mempool_t * pool,struct html_tag * tag,struct html_content * hc,khash_t (rspamd_url_hash)* url_set,GPtrArray * part_urls)956 html_process_link_tag(rspamd_mempool_t *pool, struct html_tag *tag,
957 					  struct html_content *hc,
958 					  khash_t (rspamd_url_hash) *url_set,
959 					  GPtrArray *part_urls) -> void
960 {
961 	auto found_rel_maybe = tag->find_component(html_component_type::RSPAMD_HTML_COMPONENT_REL);
962 
963 	if (found_rel_maybe) {
964 		if (found_rel_maybe.value() == "icon") {
965 			html_process_img_tag(pool, tag, hc, url_set, part_urls);
966 		}
967 	}
968 }
969 
970 static auto
html_process_block_tag(rspamd_mempool_t * pool,struct html_tag * tag,struct html_content * hc)971 html_process_block_tag(rspamd_mempool_t *pool, struct html_tag *tag,
972 					   struct html_content *hc) -> void
973 {
974 	std::optional<css::css_value> maybe_fgcolor, maybe_bgcolor;
975 	bool hidden = false;
976 
977 	for (const auto &param : tag->components) {
978 		if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_COLOR) {
979 			maybe_fgcolor = css::css_value::maybe_color_from_string(param.value);
980 		}
981 
982 		if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_BGCOLOR) {
983 			maybe_bgcolor = css::css_value::maybe_color_from_string(param.value);
984 		}
985 
986 		if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_STYLE) {
987 			tag->block = rspamd::css::parse_css_declaration(pool, param.value);
988 		}
989 
990 		if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_HIDDEN) {
991 			hidden = true;
992 		}
993 	}
994 
995 	if (!tag->block) {
996 		tag->block = html_block::undefined_html_block_pool(pool);
997 	}
998 
999 	if (hidden) {
1000 		tag->block->set_display(false);
1001 	}
1002 
1003 	if (maybe_fgcolor) {
1004 		tag->block->set_fgcolor(maybe_fgcolor->to_color().value());
1005 	}
1006 
1007 	if (maybe_bgcolor) {
1008 		tag->block->set_bgcolor(maybe_bgcolor->to_color().value());
1009 	}
1010 }
1011 
1012 static inline auto
html_append_parsed(struct html_content * hc,std::string_view data,bool transparent,std::size_t input_len,std::string & dest)1013 html_append_parsed(struct html_content *hc,
1014 				   std::string_view data,
1015 				   bool transparent,
1016 				   std::size_t input_len,
1017 				   std::string &dest) -> std::size_t
1018 {
1019 	auto cur_offset = dest.size();
1020 
1021 	if (dest.size() > input_len) {
1022 		/* Impossible case, refuse to append */
1023 		return 0;
1024 	}
1025 
1026 	if (data.size() > 0) {
1027 		/* Handle multiple spaces at the begin */
1028 
1029 		if (cur_offset > 0) {
1030 			auto last = dest.back();
1031 			if (!g_ascii_isspace(last) && g_ascii_isspace(data.front())) {
1032 				dest.append(" ");
1033 				data = {data.data() + 1, data.size() - 1};
1034 				cur_offset++;
1035 			}
1036 		}
1037 
1038 		if (data.find('\0') != std::string_view::npos) {
1039 			auto replace_zero_func = [](const auto &input, auto &output) {
1040 				const auto last = input.cend();
1041 				for (auto it = input.cbegin(); it != last; ++it) {
1042 					if (*it == '\0') {
1043 						output.append(u8"\uFFFD");
1044 					}
1045 					else {
1046 						output.push_back(*it);
1047 					}
1048 				}
1049 			};
1050 
1051 			dest.reserve(dest.size() + data.size() + sizeof(u8"\uFFFD"));
1052 			replace_zero_func(data, dest);
1053 			hc->flags |= RSPAMD_HTML_FLAG_HAS_ZEROS;
1054 		}
1055 		else {
1056 			dest.append(data);
1057 		}
1058 	}
1059 
1060 	auto nlen = decode_html_entitles_inplace(dest.data() + cur_offset,
1061 			dest.size() - cur_offset, true);
1062 
1063 	dest.resize(nlen + cur_offset);
1064 
1065 	if (transparent) {
1066 		/* Replace all visible characters with spaces */
1067 		auto start = std::next(dest.begin(), cur_offset);
1068 		std::replace_if(start, std::end(dest), [](const auto c) {
1069 			return !g_ascii_isspace(c);
1070 		}, ' ');
1071 	}
1072 
1073 	return nlen;
1074 }
1075 
1076 static auto
html_process_displayed_href_tag(rspamd_mempool_t * pool,struct html_content * hc,std::string_view data,const struct html_tag * cur_tag,GList ** exceptions,khash_t (rspamd_url_hash)* url_set,goffset dest_offset)1077 html_process_displayed_href_tag(rspamd_mempool_t *pool,
1078 								struct html_content *hc,
1079 								std::string_view data,
1080 								const struct html_tag *cur_tag,
1081 								GList **exceptions,
1082 								khash_t (rspamd_url_hash) *url_set,
1083 								goffset dest_offset) -> void
1084 {
1085 
1086 	if (std::holds_alternative<rspamd_url *>(cur_tag->extra)) {
1087 		auto *url = std::get<rspamd_url *>(cur_tag->extra);
1088 
1089 		html_check_displayed_url(pool,
1090 				exceptions, url_set,
1091 				data,
1092 				dest_offset,
1093 				url);
1094 	}
1095 }
1096 
1097 static auto
html_append_tag_content(rspamd_mempool_t * pool,const gchar * start,gsize len,struct html_content * hc,html_tag * tag,GList ** exceptions,khash_t (rspamd_url_hash)* url_set)1098 html_append_tag_content(rspamd_mempool_t *pool,
1099 						const gchar *start, gsize len,
1100 						struct html_content *hc,
1101 						html_tag *tag,
1102 						GList **exceptions,
1103 						khash_t (rspamd_url_hash) *url_set) -> goffset
1104 {
1105 	auto is_visible = true, is_block = false, is_spaces = false, is_transparent = false;
1106 	goffset next_tag_offset = tag->closing.end,
1107 			initial_parsed_offset = hc->parsed.size(),
1108 			initial_invisible_offset = hc->invisible.size();
1109 
1110 	auto calculate_final_tag_offsets = [&]() -> void {
1111 		if (is_visible) {
1112 			tag->content_offset = initial_parsed_offset;
1113 			tag->closing.start = hc->parsed.size();
1114 		}
1115 		else {
1116 			tag->content_offset = initial_invisible_offset;
1117 			tag->closing.start = hc->invisible.size();
1118 		}
1119 	};
1120 
1121 	if (tag->closing.end == -1) {
1122 		if (tag->closing.start != -1) {
1123 			next_tag_offset = tag->closing.start;
1124 			tag->closing.end = tag->closing.start;
1125 		}
1126 		else {
1127 			next_tag_offset = tag->content_offset;
1128 			tag->closing.end = tag->content_offset;
1129 		}
1130 	}
1131 	if (tag->closing.start == -1) {
1132 		tag->closing.start = tag->closing.end;
1133 	}
1134 
1135 	auto append_margin = [&](char c) -> void {
1136 		/* We do care about visible margins only */
1137 		if (is_visible) {
1138 			if (!hc->parsed.empty() && hc->parsed.back() != c && hc->parsed.back() != '\n') {
1139 				if (hc->parsed.back() == ' ') {
1140 					/* We also strip extra spaces at the end, but limiting the start */
1141 					auto last = std::make_reverse_iterator(hc->parsed.begin() + initial_parsed_offset);
1142 					auto first = std::find_if(hc->parsed.rbegin(), last,
1143 							[](auto ch) -> auto {
1144 								return ch != ' ';
1145 							});
1146 					hc->parsed.erase(first.base(), hc->parsed.end());
1147 					g_assert(hc->parsed.size() >= initial_parsed_offset);
1148 				}
1149 				hc->parsed.push_back(c);
1150 			}
1151 		}
1152 	};
1153 
1154 	if (tag->id == Tag_BR || tag->id == Tag_HR) {
1155 
1156 		if (!(tag->flags & FL_IGNORE)) {
1157 			hc->parsed.append("\n");
1158 		}
1159 
1160 		auto ret = tag->content_offset;
1161 		calculate_final_tag_offsets();
1162 
1163 		return ret;
1164 	}
1165 	else if ((tag->id == Tag_HEAD && (tag->flags & FL_IGNORE)) || (tag->flags & CM_HEAD)) {
1166 		auto ret = tag->closing.end;
1167 		calculate_final_tag_offsets();
1168 
1169 		return ret;
1170 	}
1171 
1172 	if ((tag->flags & (FL_COMMENT | FL_XML | FL_IGNORE | CM_HEAD))) {
1173 		is_visible = false;
1174 	}
1175 	else {
1176 		if (!tag->block) {
1177 			is_visible = true;
1178 		}
1179 		else if (!tag->block->is_visible()) {
1180 			if (!tag->block->is_transparent()) {
1181 				is_visible = false;
1182 			}
1183 			else {
1184 				if (tag->block->has_display() &&
1185 					tag->block->display == css::css_display_value::DISPLAY_HIDDEN) {
1186 					is_visible = false;
1187 				}
1188 				else {
1189 					is_transparent = true;
1190 				}
1191 			}
1192 		}
1193 		else {
1194 			if (tag->block->display == css::css_display_value::DISPLAY_BLOCK) {
1195 				is_block = true;
1196 			}
1197 			else if (tag->block->display == css::css_display_value::DISPLAY_TABLE_ROW) {
1198 				is_spaces = true;
1199 			}
1200 		}
1201 	}
1202 
1203 	if (is_block) {
1204 		append_margin('\n');
1205 	}
1206 	else if (is_spaces) {
1207 		append_margin(' ');
1208 	}
1209 
1210 	goffset cur_offset = tag->content_offset;
1211 
1212 	for (auto *cld : tag->children) {
1213 		auto enclosed_start = cld->tag_start;
1214 		goffset initial_part_len = enclosed_start - cur_offset;
1215 
1216 		if (initial_part_len > 0) {
1217 			if (is_visible) {
1218 				html_append_parsed(hc,
1219 						{start + cur_offset, std::size_t(initial_part_len)},
1220 						is_transparent, len, hc->parsed);
1221 			}
1222 			else {
1223 				html_append_parsed(hc,
1224 						{start + cur_offset, std::size_t(initial_part_len)},
1225 						is_transparent, len, hc->invisible);
1226 			}
1227 		}
1228 
1229 		auto next_offset = html_append_tag_content(pool, start, len,
1230 				hc, cld, exceptions, url_set);
1231 
1232 		/* Do not allow shifting back */
1233 		if (next_offset > cur_offset) {
1234 			cur_offset = next_offset;
1235 		}
1236 	}
1237 
1238 	if (cur_offset < tag->closing.start) {
1239 		goffset final_part_len = tag->closing.start - cur_offset;
1240 
1241 		if (final_part_len > 0) {
1242 			if (is_visible) {
1243 				html_append_parsed(hc,
1244 						{start + cur_offset, std::size_t(final_part_len)},
1245 						is_transparent,
1246 						len,
1247 						hc->parsed);
1248 			}
1249 			else {
1250 				html_append_parsed(hc,
1251 						{start + cur_offset, std::size_t(final_part_len)},
1252 						is_transparent,
1253 						len,
1254 						hc->invisible);
1255 			}
1256 		}
1257 	}
1258 	if (is_block) {
1259 		append_margin('\n');
1260 	}
1261 	else if (is_spaces) {
1262 		append_margin(' ');
1263 	}
1264 
1265 	if (is_visible) {
1266 		if (tag->id == Tag_A) {
1267 			auto written_len = hc->parsed.size() - initial_parsed_offset;
1268 			html_process_displayed_href_tag(pool, hc,
1269 					{hc->parsed.data() + initial_parsed_offset, std::size_t(written_len)},
1270 					tag, exceptions,
1271 					url_set, initial_parsed_offset);
1272 		}
1273 		else if (tag->id == Tag_IMG) {
1274 			/* Process ALT if presented */
1275 			auto maybe_alt = tag->find_component(html_component_type::RSPAMD_HTML_COMPONENT_ALT);
1276 
1277 			if (maybe_alt) {
1278 				if (!hc->parsed.empty() && !g_ascii_isspace (hc->parsed.back())) {
1279 					/* Add a space */
1280 					hc->parsed += ' ';
1281 				}
1282 
1283 				hc->parsed.append(maybe_alt.value());
1284 
1285 				if (!hc->parsed.empty() && !g_ascii_isspace (hc->parsed.back())) {
1286 					/* Add a space */
1287 					hc->parsed += ' ';
1288 				}
1289 			}
1290 		}
1291 	}
1292 	else {
1293 		/* Invisible stuff */
1294 		if (std::holds_alternative<rspamd_url *>(tag->extra)) {
1295 			auto *url_enclosed = std::get<rspamd_url *>(tag->extra);
1296 
1297 			/*
1298 			 * TODO: when hash is fixed to include flags we need to remove and add
1299 			 * url to the hash set
1300 			 */
1301 			if (url_enclosed) {
1302 				url_enclosed->flags |= RSPAMD_URL_FLAG_INVISIBLE;
1303 			}
1304 		}
1305 	}
1306 
1307 	calculate_final_tag_offsets();
1308 
1309 	return next_tag_offset;
1310 }
1311 
1312 auto
html_process_input(rspamd_mempool_t * pool,GByteArray * in,GList ** exceptions,khash_t (rspamd_url_hash)* url_set,GPtrArray * part_urls,bool allow_css)1313 html_process_input(rspamd_mempool_t *pool,
1314 				   GByteArray *in,
1315 				   GList **exceptions,
1316 				   khash_t (rspamd_url_hash) *url_set,
1317 				   GPtrArray *part_urls,
1318 				   bool allow_css) -> html_content *
1319 {
1320 	const gchar *p, *c, *end, *start;
1321 	guchar t;
1322 	auto closing = false;
1323 	guint obrace = 0, ebrace = 0;
1324 	struct rspamd_url *url = nullptr;
1325 	gint href_offset = -1;
1326 	struct html_tag *cur_tag = nullptr, *parent_tag = nullptr, cur_closing_tag;
1327 	struct tag_content_parser_state content_parser_env;
1328 
1329 	enum {
1330 		parse_start = 0,
1331 		content_before_start,
1332 		tag_begin,
1333 		sgml_tag,
1334 		xml_tag,
1335 		compound_tag,
1336 		comment_tag,
1337 		comment_content,
1338 		sgml_content,
1339 		tag_content,
1340 		tag_end_opening,
1341 		tag_end_closing,
1342 		html_text_content,
1343 		xml_tag_end,
1344 		tag_raw_text,
1345 		tag_raw_text_less_than,
1346 		tags_limit_overflow,
1347 	} state = parse_start;
1348 
1349 	enum class html_document_state {
1350 		doctype,
1351 		head,
1352 		body
1353 	} html_document_state = html_document_state::doctype;
1354 
1355 	g_assert (in != NULL);
1356 	g_assert (pool != NULL);
1357 
1358 	struct html_content *hc = new html_content;
1359 	rspamd_mempool_add_destructor(pool, html_content::html_content_dtor, hc);
1360 
1361 	auto new_tag = [&](int flags = 0) -> struct html_tag * {
1362 
1363 		if (hc->all_tags.size() > rspamd::html::max_tags) {
1364 			hc->flags |= RSPAMD_HTML_FLAG_TOO_MANY_TAGS;
1365 
1366 			return nullptr;
1367 		}
1368 
1369 		hc->all_tags.emplace_back(std::make_unique<html_tag>());
1370 		auto *ntag = hc->all_tags.back().get();
1371 		ntag->tag_start = c - start;
1372 		ntag->flags = flags;
1373 
1374 		if (cur_tag && !(cur_tag->flags & (CM_EMPTY | FL_CLOSED)) && cur_tag != &cur_closing_tag) {
1375 			parent_tag = cur_tag;
1376 		}
1377 
1378 		if (flags & FL_XML) {
1379 			return ntag;
1380 		}
1381 
1382 		return ntag;
1383 	};
1384 
1385 	auto process_opening_tag = [&]() {
1386 		if (cur_tag->id > Tag_UNKNOWN) {
1387 			if (cur_tag->flags & CM_UNIQUE) {
1388 				if (!hc->tags_seen[cur_tag->id]) {
1389 					/* Duplicate tag has been found */
1390 					hc->flags |= RSPAMD_HTML_FLAG_DUPLICATE_ELEMENTS;
1391 				}
1392 			}
1393 			hc->tags_seen[cur_tag->id] = true;
1394 		}
1395 
1396 		/* Shift to the first unclosed tag */
1397 		auto *pt = parent_tag;
1398 		while (pt && (pt->flags & FL_CLOSED)) {
1399 			pt = pt->parent;
1400 		}
1401 
1402 		if (pt) {
1403 			cur_tag->parent = pt;
1404 			g_assert(cur_tag->parent != cur_tag);
1405 			g_assert(cur_tag->parent != &cur_closing_tag);
1406 			parent_tag = pt;
1407 			parent_tag->children.push_back(cur_tag);
1408 		}
1409 		else {
1410 			if (hc->root_tag) {
1411 				cur_tag->parent = hc->root_tag;
1412 				g_assert(cur_tag->parent != cur_tag);
1413 				hc->root_tag->children.push_back(cur_tag);
1414 				parent_tag = hc->root_tag;
1415 			}
1416 			else {
1417 				if (cur_tag->id == Tag_HTML) {
1418 					hc->root_tag = cur_tag;
1419 				}
1420 				else {
1421 					/* Insert a fake html tag */
1422 					hc->all_tags.emplace_back(std::make_unique<html_tag>());
1423 					auto *top_tag = hc->all_tags.back().get();
1424 					top_tag->tag_start = 0;
1425 					top_tag->flags = FL_VIRTUAL;
1426 					top_tag->id = Tag_HTML;
1427 					top_tag->content_offset = 0;
1428 					top_tag->children.push_back(cur_tag);
1429 					cur_tag->parent = top_tag;
1430 					g_assert(cur_tag->parent != cur_tag);
1431 					hc->root_tag = top_tag;
1432 					parent_tag = top_tag;
1433 				}
1434 			}
1435 		}
1436 
1437 		if (cur_tag->flags & FL_HREF && html_document_state == html_document_state::body) {
1438 			auto maybe_url = html_process_url_tag(pool, cur_tag, hc);
1439 
1440 			if (maybe_url) {
1441 				url = maybe_url.value();
1442 
1443 				if (url_set != NULL) {
1444 					struct rspamd_url *maybe_existing =
1445 							rspamd_url_set_add_or_return(url_set, maybe_url.value());
1446 					if (maybe_existing == maybe_url.value()) {
1447 						html_process_query_url(pool, url, url_set,
1448 								part_urls);
1449 					}
1450 					else {
1451 						url = maybe_existing;
1452 						/* Replace extra as well */
1453 						cur_tag->extra = maybe_existing;
1454 						/* Increase count to avoid odd checks failure */
1455 						url->count++;
1456 					}
1457 				}
1458 				if (part_urls) {
1459 					g_ptr_array_add(part_urls, url);
1460 				}
1461 
1462 				href_offset = hc->parsed.size();
1463 			}
1464 		}
1465 		else if (cur_tag->id == Tag_BASE) {
1466 			/*
1467 			 * Base is allowed only within head tag but HTML is retarded
1468 			 */
1469 			auto maybe_url = html_process_url_tag(pool, cur_tag, hc);
1470 
1471 			if (maybe_url) {
1472 				msg_debug_html ("got valid base tag");
1473 				cur_tag->extra = maybe_url.value();
1474 				cur_tag->flags |= FL_HREF;
1475 
1476 				if (hc->base_url == nullptr) {
1477 					hc->base_url = maybe_url.value();
1478 				}
1479 				else {
1480 					msg_debug_html ("ignore redundant base tag");
1481 				}
1482 			}
1483 			else {
1484 				msg_debug_html ("got invalid base tag!");
1485 			}
1486 		}
1487 
1488 		if (cur_tag->id == Tag_IMG) {
1489 			html_process_img_tag(pool, cur_tag, hc, url_set,
1490 					part_urls);
1491 		}
1492 		else if (cur_tag->id == Tag_LINK) {
1493 			html_process_link_tag(pool, cur_tag, hc, url_set,
1494 					part_urls);
1495 		}
1496 
1497 		if (!(cur_tag->flags & CM_EMPTY)) {
1498 			html_process_block_tag(pool, cur_tag, hc);
1499 		}
1500 		else {
1501 			/* Implicitly close */
1502 			cur_tag->flags |= FL_CLOSED;
1503 		}
1504 
1505 		if (cur_tag->flags & FL_CLOSED) {
1506 			cur_tag->closing.end = cur_tag->content_offset;
1507 			cur_tag->closing.start = cur_tag->tag_start;
1508 
1509 			cur_tag = parent_tag;
1510 		}
1511 	};
1512 
1513 	p = (const char *) in->data;
1514 	c = p;
1515 	end = p + in->len;
1516 	start = c;
1517 
1518 	while (p < end) {
1519 		t = *p;
1520 
1521 		switch (state) {
1522 		case parse_start:
1523 			if (t == '<') {
1524 				state = tag_begin;
1525 			}
1526 			else {
1527 				/* We have no starting tag, so assume that it's content */
1528 				hc->flags |= RSPAMD_HTML_FLAG_BAD_START;
1529 				cur_tag = new_tag();
1530 				html_document_state = html_document_state::body;
1531 
1532 				if (cur_tag) {
1533 					cur_tag->id = Tag_HTML;
1534 					hc->root_tag = cur_tag;
1535 					state = content_before_start;
1536 				}
1537 				else {
1538 					state = tags_limit_overflow;
1539 				}
1540 			}
1541 			break;
1542 		case content_before_start:
1543 			if (t == '<') {
1544 				state = tag_begin;
1545 			}
1546 			else {
1547 				p++;
1548 			}
1549 			break;
1550 		case tag_begin:
1551 			switch (t) {
1552 			case '<':
1553 				c = p;
1554 				p++;
1555 				closing = FALSE;
1556 				break;
1557 			case '!':
1558 				cur_tag = new_tag(FL_XML | FL_CLOSED);
1559 				if (cur_tag) {
1560 					state = sgml_tag;
1561 				}
1562 				else {
1563 					state = tags_limit_overflow;
1564 				}
1565 				p++;
1566 				break;
1567 			case '?':
1568 				cur_tag = new_tag(FL_XML | FL_CLOSED);
1569 				if (cur_tag) {
1570 					state = xml_tag;
1571 				}
1572 				else {
1573 					state = tags_limit_overflow;
1574 				}
1575 				hc->flags |= RSPAMD_HTML_FLAG_XML;
1576 				p++;
1577 				break;
1578 			case '/':
1579 				closing = TRUE;
1580 				/* We fill fake closing tag to fill it with the content parser */
1581 				cur_closing_tag.clear();
1582 				/*
1583 				 * For closing tags, we need to find some corresponding opening tag.
1584 				 * However, at this point we have not even parsed a name, so we
1585 				 * can not assume anything about balancing, etc.
1586 				 *
1587 				 * So we need to ensure that:
1588 				 * 1) We have some opening tag in the chain cur_tag->parent...
1589 				 * 2) cur_tag is nullptr - okay, html is just brain damaged
1590 				 * 3) cur_tag must NOT be equal to cur_closing tag. It means that
1591 				 * we had some poor closing tag but we still need to find an opening
1592 				 * tag... Somewhere...
1593 				 */
1594 
1595 				if (cur_tag == &cur_closing_tag) {
1596 					if (parent_tag != &cur_closing_tag) {
1597 						cur_closing_tag.parent = parent_tag;
1598 					}
1599 					else {
1600 						cur_closing_tag.parent = nullptr;
1601 					}
1602 				}
1603 				else if (cur_tag && cur_tag->flags & FL_CLOSED) {
1604 					/* Cur tag is already closed, we should find something else */
1605 					auto *tmp = cur_tag;
1606 					while (tmp) {
1607 						tmp = tmp->parent;
1608 
1609 						if (tmp == nullptr || !(tmp->flags & FL_CLOSED)) {
1610 							break;
1611 						}
1612 					}
1613 
1614 					cur_closing_tag.parent = tmp;
1615 				}
1616 				else {
1617 					cur_closing_tag.parent = cur_tag;
1618 				}
1619 
1620 				cur_tag = &cur_closing_tag;
1621 				p++;
1622 				break;
1623 			case '>':
1624 				/* Empty tag */
1625 				hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
1626 				state = html_text_content;
1627 				continue;
1628 			default:
1629 				if (g_ascii_isalpha(t)) {
1630 					state = tag_content;
1631 					content_parser_env.reset();
1632 
1633 					if (!closing) {
1634 						cur_tag = new_tag();
1635 					}
1636 
1637 					if (cur_tag) {
1638 						state = tag_content;
1639 					}
1640 					else {
1641 						state = tags_limit_overflow;
1642 					}
1643 				}
1644 				else {
1645 					/* Wrong bad tag */
1646 					state = html_text_content;
1647 				}
1648 				break;
1649 			}
1650 
1651 			break;
1652 
1653 		case sgml_tag:
1654 			switch (t) {
1655 			case '[':
1656 				state = compound_tag;
1657 				obrace = 1;
1658 				ebrace = 0;
1659 				p++;
1660 				break;
1661 			case '-':
1662 				cur_tag->flags |= FL_COMMENT;
1663 				state = comment_tag;
1664 				p++;
1665 				break;
1666 			default:
1667 				state = sgml_content;
1668 				break;
1669 			}
1670 
1671 			break;
1672 
1673 		case xml_tag:
1674 			if (t == '?') {
1675 				state = xml_tag_end;
1676 			}
1677 			else if (t == '>') {
1678 				/* Misformed xml tag */
1679 				hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
1680 				state = tag_end_opening;
1681 				continue;
1682 			}
1683 			/* We efficiently ignore xml tags */
1684 			p++;
1685 			break;
1686 
1687 		case xml_tag_end:
1688 			if (t == '>') {
1689 				state = tag_end_opening;
1690 				cur_tag->content_offset = p - start + 1;
1691 				continue;
1692 			}
1693 			else {
1694 				hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
1695 			}
1696 			p++;
1697 			break;
1698 
1699 		case compound_tag:
1700 			if (t == '[') {
1701 				obrace++;
1702 			}
1703 			else if (t == ']') {
1704 				ebrace++;
1705 			}
1706 			else if (t == '>' && obrace == ebrace) {
1707 				state = tag_end_opening;
1708 				cur_tag->content_offset = p - start + 1;
1709 				continue;
1710 			}
1711 			p++;
1712 			break;
1713 
1714 		case comment_tag:
1715 			if (t != '-') {
1716 				hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
1717 				state = tag_end_opening;
1718 			}
1719 			else {
1720 				p++;
1721 				ebrace = 0;
1722 				/*
1723 				 * https://www.w3.org/TR/2012/WD-html5-20120329/syntax.html#syntax-comments
1724 				 *  ... the text must not start with a single
1725 				 *  U+003E GREATER-THAN SIGN character (>),
1726 				 *  nor start with a "-" (U+002D) character followed by
1727 				 *  a U+003E GREATER-THAN SIGN (>) character,
1728 				 *  nor contain two consecutive U+002D HYPHEN-MINUS
1729 				 *  characters (--), nor end with a "-" (U+002D) character.
1730 				 */
1731 				if (p[0] == '-' && p + 1 < end && p[1] == '>') {
1732 					hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
1733 					p++;
1734 					state = tag_end_opening;
1735 				}
1736 				else if (*p == '>') {
1737 					hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
1738 					state = tag_end_opening;
1739 				}
1740 				else {
1741 					state = comment_content;
1742 				}
1743 			}
1744 			break;
1745 
1746 		case comment_content:
1747 			if (t == '-') {
1748 				ebrace++;
1749 			}
1750 			else if (t == '>' && ebrace >= 2) {
1751 				cur_tag->content_offset = p - start + 1;
1752 				state = tag_end_opening;
1753 				continue;
1754 			}
1755 			else {
1756 				ebrace = 0;
1757 			}
1758 
1759 			p++;
1760 			break;
1761 
1762 		case html_text_content:
1763 			if (t != '<') {
1764 				p++;
1765 			}
1766 			else {
1767 				state = tag_begin;
1768 			}
1769 			break;
1770 
1771 		case tag_raw_text:
1772 			if (t == '<') {
1773 				c = p;
1774 				state = tag_raw_text_less_than;
1775 			}
1776 			p ++;
1777 			break;
1778 		case tag_raw_text_less_than:
1779 			if (t == '/') {
1780 				/* Here are special things: we look for obrace and then ensure
1781 				 * that if there is any closing brace nearby
1782 				 * (we look maximum at 30 characters). We also need to ensure
1783 				 * that we have no special characters, such as punctuation marks and
1784 				 * so on.
1785 				 * Basically, we validate the input to be sane.
1786 				 * Since closing tags must not have attributes, these assumptions
1787 				 * seems to be reasonable enough for our toy parser.
1788 				 */
1789 				gint cur_lookahead = 1;
1790 				gint max_lookahead = MIN (end - p, 30);
1791 				bool valid_closing_tag = true;
1792 
1793 				if (p + 1 < end && !g_ascii_isalpha (p[1])) {
1794 					valid_closing_tag = false;
1795 				}
1796 				else {
1797 					while (cur_lookahead < max_lookahead) {
1798 						gchar tt = p[cur_lookahead];
1799 						if (tt == '>') {
1800 							break;
1801 						}
1802 						else if (tt < '\n' || tt == ',') {
1803 							valid_closing_tag = false;
1804 							break;
1805 						}
1806 						cur_lookahead ++;
1807 					}
1808 
1809 					if (cur_lookahead == max_lookahead) {
1810 						valid_closing_tag = false;
1811 					}
1812 				}
1813 
1814 				if (valid_closing_tag) {
1815 					/* Shift back */
1816 					p = c;
1817 					state = tag_begin;
1818 				}
1819 				else {
1820 					p ++;
1821 					state = tag_raw_text;
1822 				}
1823 			}
1824 			else {
1825 				p ++;
1826 				state = tag_raw_text;
1827 			}
1828 			break;
1829 		case sgml_content:
1830 			/* TODO: parse DOCTYPE here */
1831 			if (t == '>') {
1832 				cur_tag->content_offset = p - start + 1;
1833 				state = tag_end_opening;
1834 			}
1835 			else {
1836 				p++;
1837 			}
1838 			break;
1839 
1840 		case tag_content:
1841 			html_parse_tag_content(pool, hc, cur_tag, p, content_parser_env);
1842 
1843 			if (t == '>') {
1844 				if (closing) {
1845 					cur_tag->closing.start = c - start;
1846 					cur_tag->closing.end = p - start + 1;
1847 
1848 					closing = FALSE;
1849 					state = tag_end_closing;
1850 				}
1851 				else {
1852 					cur_tag->content_offset = p - start + 1;
1853 					state = tag_end_opening;
1854 				}
1855 
1856 
1857 				continue;
1858 			}
1859 			p++;
1860 			break;
1861 
1862 		case tag_end_opening:
1863 			content_parser_env.reset();
1864 			state = html_text_content;
1865 
1866 			if (cur_tag) {
1867 				if (cur_tag->id == Tag_STYLE || cur_tag->id == Tag_NOSCRIPT || cur_tag->id == Tag_SCRIPT) {
1868 					state = tag_raw_text;
1869 				}
1870 				if (html_document_state == html_document_state::doctype) {
1871 					if (cur_tag->id == Tag_HEAD || (cur_tag->flags & CM_HEAD)) {
1872 						html_document_state = html_document_state::head;
1873 						cur_tag->flags |= FL_IGNORE;
1874 					}
1875 					else if (cur_tag->id != Tag_HTML) {
1876 						html_document_state = html_document_state::body;
1877 					}
1878 				}
1879 				else if (html_document_state == html_document_state::head) {
1880 					if (!(cur_tag->flags & (CM_EMPTY | CM_HEAD))) {
1881 						if (parent_tag && (parent_tag->id == Tag_HEAD || !(parent_tag->flags & CM_HEAD))) {
1882 							/*
1883 							 * As by standard, we have to close the HEAD tag
1884 							 * and switch to the body state
1885 							 */
1886 							parent_tag->flags |= FL_CLOSED;
1887 							parent_tag->closing.start = cur_tag->tag_start;
1888 							parent_tag->closing.end = cur_tag->content_offset;
1889 
1890 							html_document_state = html_document_state::body;
1891 						}
1892 						else if (cur_tag->id == Tag_BODY) {
1893 							html_document_state = html_document_state::body;
1894 						}
1895 						else {
1896 							/*
1897 							 * For propagation in something like
1898 							 * <title><p><a>ololo</a></p></title> - should be unprocessed
1899 							 */
1900 							cur_tag->flags |= CM_HEAD;
1901 						}
1902 					}
1903 				}
1904 
1905 				process_opening_tag();
1906 			}
1907 
1908 			p++;
1909 			c = p;
1910 			break;
1911 		case tag_end_closing: {
1912 			if (cur_tag) {
1913 
1914 				if (cur_tag->flags & CM_EMPTY) {
1915 					/* Ignore closing empty tags */
1916 					cur_tag->flags |= FL_IGNORE;
1917 				}
1918 				if (html_document_state == html_document_state::doctype) {
1919 
1920 				}
1921 				else if (html_document_state == html_document_state::head) {
1922 					if (cur_tag->id == Tag_HEAD) {
1923 						html_document_state = html_document_state::body;
1924 					}
1925 				}
1926 
1927 				/* cur_tag here is a closing tag */
1928 				auto *next_cur_tag = html_check_balance(hc, cur_tag,
1929 						c - start, p - start + 1);
1930 
1931 				if (cur_tag->id == Tag_STYLE && allow_css) {
1932 					auto *opening_tag = cur_tag->parent;
1933 
1934 					if (opening_tag && opening_tag->id == Tag_STYLE &&
1935 						(int)opening_tag->content_offset < opening_tag->closing.start) {
1936 						auto ret_maybe = rspamd::css::parse_css(pool,
1937 								{start + opening_tag->content_offset,
1938 								 opening_tag->closing.start - opening_tag->content_offset},
1939 								std::move(hc->css_style));
1940 
1941 						if (!ret_maybe.has_value()) {
1942 							if (ret_maybe.error().is_fatal()) {
1943 								auto err_str = fmt::format(
1944 										"cannot parse css (error code: {}): {}",
1945 										static_cast<int>(ret_maybe.error().type),
1946 										ret_maybe.error().description.value_or("unknown error"));
1947 								msg_info_pool ("%*s", (int) err_str.size(), err_str.data());
1948 							}
1949 						}
1950 						else {
1951 							hc->css_style = ret_maybe.value();
1952 						}
1953 					}
1954 				}
1955 
1956 				if (next_cur_tag != nullptr) {
1957 					cur_tag = next_cur_tag;
1958 				}
1959 				else {
1960 					/*
1961 					 * Here, we handle cases like <p>lala</b>...
1962 					 * So the tag </b> is bogus and unpaired
1963 					 * However, we need to exclude it from the output of <p> tag
1964 					 * To do that, we create a fake opening tag and insert that to
1965 					 * the current opening tag
1966 					 */
1967 					auto *cur_opening_tag = cur_tag->parent;
1968 
1969 					while (cur_opening_tag && (cur_opening_tag->flags & FL_CLOSED)) {
1970 						cur_opening_tag = cur_opening_tag->parent;
1971 					}
1972 
1973 					if (!cur_opening_tag) {
1974 						cur_opening_tag = hc->root_tag;
1975 					}
1976 
1977 					auto &&vtag = std::make_unique<html_tag>();
1978 					vtag->id = cur_tag->id;
1979 					vtag->flags = FL_VIRTUAL | FL_CLOSED | cur_tag->flags;
1980 					vtag->tag_start = cur_tag->closing.start;
1981 					vtag->content_offset = p - start + 1;
1982 					vtag->closing = cur_tag->closing;
1983 					vtag->parent = cur_opening_tag;
1984 					g_assert(vtag->parent != &cur_closing_tag);
1985 					cur_opening_tag->children.push_back(vtag.get());
1986 					hc->all_tags.emplace_back(std::move(vtag));
1987 					cur_tag = cur_opening_tag;
1988 					parent_tag = cur_tag->parent;
1989 					g_assert(cur_tag->parent != &cur_closing_tag);
1990 				}
1991 			} /* if cur_tag != nullptr */
1992 			state = html_text_content;
1993 			p++;
1994 			c = p;
1995 			break;
1996 		}
1997 		case tags_limit_overflow:
1998 			msg_warn_pool("tags limit of %d tags is reached at the position %d;"
1999 						  " ignoring the rest of the HTML content",
2000 					(int) hc->all_tags.size(), (int) (p - start));
2001 			c = p;
2002 			p = end;
2003 			break;
2004 		}
2005 	}
2006 
2007 	if (cur_tag && !(cur_tag->flags & FL_CLOSED) && cur_tag != &cur_closing_tag) {
2008 		cur_closing_tag.parent = cur_tag;
2009 		cur_closing_tag.id = cur_tag->id;
2010 		cur_tag = &cur_closing_tag;
2011 		html_check_balance(hc, cur_tag,
2012 				end - start, end - start);
2013 	}
2014 
2015 	/* Propagate styles */
2016 	hc->traverse_block_tags([&hc, &pool](const html_tag *tag) -> bool {
2017 
2018 		if (hc->css_style) {
2019 			auto *css_block = hc->css_style->check_tag_block(tag);
2020 
2021 			if (css_block) {
2022 				if (tag->block) {
2023 					tag->block->set_block(*css_block);
2024 				}
2025 				else {
2026 					tag->block = css_block;
2027 				}
2028 			}
2029 		}
2030 		if (tag->block) {
2031 			if (!tag->block->has_display()) {
2032 				/* If we have no display field, we can check it by tag */
2033 				if (tag->flags & CM_HEAD) {
2034 					tag->block->set_display(css::css_display_value::DISPLAY_HIDDEN,
2035 							html_block::set);
2036 				}
2037 				else if (tag->flags & (CM_BLOCK | CM_TABLE)) {
2038 					tag->block->set_display(css::css_display_value::DISPLAY_BLOCK,
2039 							html_block::implicit);
2040 				}
2041 				else if (tag->flags & CM_ROW) {
2042 					tag->block->set_display(css::css_display_value::DISPLAY_TABLE_ROW,
2043 							html_block::implicit);
2044 				}
2045 				else {
2046 					tag->block->set_display(css::css_display_value::DISPLAY_INLINE,
2047 							html_block::implicit);
2048 				}
2049 			}
2050 
2051 			tag->block->compute_visibility();
2052 
2053 			for (const auto *cld_tag : tag->children) {
2054 
2055 				if (cld_tag->block) {
2056 					cld_tag->block->propagate_block(*tag->block);
2057 				}
2058 				else {
2059 					cld_tag->block = rspamd_mempool_alloc0_type(pool, html_block);
2060 					*cld_tag->block = *tag->block;
2061 				}
2062 			}
2063 		}
2064 		return true;
2065 	}, html_content::traverse_type::PRE_ORDER);
2066 
2067 	/* Leftover before content */
2068 	switch (state) {
2069 	case tag_end_opening:
2070 		if (cur_tag != nullptr) {
2071 			process_opening_tag();
2072 		}
2073 		break;
2074 	default:
2075 		/* Do nothing */
2076 		break;
2077 	}
2078 
2079 	if (!hc->all_tags.empty() && hc->root_tag) {
2080 		html_append_tag_content(pool, start, end - start, hc, hc->root_tag,
2081 				exceptions, url_set);
2082 	}
2083 
2084 	/* Leftover after content */
2085 	switch (state) {
2086 	case tag_end_opening:
2087 		if (cur_tag != nullptr) {
2088 			process_opening_tag();
2089 		}
2090 		break;
2091 	case tags_limit_overflow:
2092 		html_append_parsed(hc, {c, (std::size_t) (end - c)},
2093 				false, end - start, hc->parsed);
2094 		break;
2095 	default:
2096 		/* Do nothing */
2097 		break;
2098 	}
2099 
2100 	if (!hc->parsed.empty()) {
2101 		/* Trim extra spaces at the at the end if needed */
2102 		if (g_ascii_isspace(hc->parsed.back())) {
2103 			auto last_it = std::end(hc->parsed);
2104 
2105 			/* Allow last newline */
2106 			if (hc->parsed.back() == '\n') {
2107 				--last_it;
2108 			}
2109 
2110 			hc->parsed.erase(std::find_if(hc->parsed.rbegin(), hc->parsed.rend(),
2111 					[](auto ch) -> auto {
2112 						return !g_ascii_isspace(ch);
2113 					}).base(),
2114 					last_it);
2115 		}
2116 	}
2117 
2118 	return hc;
2119 }
2120 
2121 static auto
html_find_image_by_cid(const html_content & hc,std::string_view cid)2122 html_find_image_by_cid(const html_content &hc, std::string_view cid)
2123 -> std::optional<const html_image *>
2124 {
2125 	for (const auto *html_image : hc.images) {
2126 		/* Filter embedded images */
2127 		if (html_image->flags & RSPAMD_HTML_FLAG_IMAGE_EMBEDDED &&
2128 			html_image->src != nullptr) {
2129 			if (cid == html_image->src) {
2130 				return html_image;
2131 			}
2132 		}
2133 	}
2134 
2135 	return std::nullopt;
2136 }
2137 
2138 auto
html_debug_structure(const html_content & hc)2139 html_debug_structure(const html_content &hc) -> std::string
2140 {
2141 	std::string output;
2142 
2143 	if (hc.root_tag) {
2144 		auto rec_functor = [&](const html_tag *t, int level, auto rec_functor) -> void {
2145 			std::string pluses(level, '+');
2146 
2147 			if (!(t->flags & (FL_VIRTUAL | FL_IGNORE))) {
2148 				if (t->flags & FL_XML) {
2149 					output += fmt::format("{}xml;", pluses);
2150 				}
2151 				else {
2152 					output += fmt::format("{}{};", pluses,
2153 							html_tags_defs.name_by_id_safe(t->id));
2154 				}
2155 				level++;
2156 			}
2157 			for (const auto *cld : t->children) {
2158 				rec_functor(cld, level, rec_functor);
2159 			}
2160 		};
2161 
2162 		rec_functor(hc.root_tag, 1, rec_functor);
2163 	}
2164 
2165 	return output;
2166 }
2167 
html_tag_by_name(const std::string_view & name)2168 auto html_tag_by_name(const std::string_view &name)
2169 -> std::optional<tag_id_t>
2170 {
2171 	const auto *td = rspamd::html::html_tags_defs.by_name(name);
2172 
2173 	if (td != nullptr) {
2174 		return td->id;
2175 	}
2176 
2177 	return std::nullopt;
2178 }
2179 
2180 auto
get_content(const struct html_content * hc) const2181 html_tag::get_content(const struct html_content *hc) const -> std::string_view
2182 {
2183 	const std::string *dest = &hc->parsed;
2184 
2185 	if (block && !block->is_visible()) {
2186 		dest = &hc->invisible;
2187 	}
2188 	const auto clen = get_content_length();
2189 	if (content_offset < dest->size()) {
2190 		if (dest->size() - content_offset >= clen) {
2191 			return std::string_view{*dest}.substr(content_offset, clen);
2192 		}
2193 		else {
2194 			return std::string_view{*dest}.substr(content_offset, dest->size() - content_offset);
2195 		}
2196 	}
2197 
2198 	return std::string_view{};
2199 }
2200 
2201 }
2202 
2203 void *
rspamd_html_process_part_full(rspamd_mempool_t * pool,GByteArray * in,GList ** exceptions,khash_t (rspamd_url_hash)* url_set,GPtrArray * part_urls,bool allow_css)2204 rspamd_html_process_part_full(rspamd_mempool_t *pool,
2205 							  GByteArray *in, GList **exceptions,
2206 							  khash_t (rspamd_url_hash) *url_set,
2207 							  GPtrArray *part_urls,
2208 							  bool allow_css)
2209 {
2210 	return rspamd::html::html_process_input(pool, in, exceptions, url_set,
2211 			part_urls, allow_css);
2212 }
2213 
2214 void *
rspamd_html_process_part(rspamd_mempool_t * pool,GByteArray * in)2215 rspamd_html_process_part(rspamd_mempool_t *pool,
2216 						 GByteArray *in)
2217 {
2218 	return rspamd_html_process_part_full (pool, in, NULL,
2219 			NULL, NULL, FALSE);
2220 }
2221 
2222 guint
rspamd_html_decode_entitles_inplace(gchar * s,gsize len)2223 rspamd_html_decode_entitles_inplace (gchar *s, gsize len)
2224 {
2225 	return rspamd::html::decode_html_entitles_inplace(s, len);
2226 }
2227 
2228 gint
rspamd_html_tag_by_name(const gchar * name)2229 rspamd_html_tag_by_name(const gchar *name)
2230 {
2231 	const auto *td = rspamd::html::html_tags_defs.by_name(name);
2232 
2233 	if (td != nullptr) {
2234 		return td->id;
2235 	}
2236 
2237 	return -1;
2238 }
2239 
2240 gboolean
rspamd_html_tag_seen(void * ptr,const gchar * tagname)2241 rspamd_html_tag_seen(void *ptr, const gchar *tagname)
2242 {
2243 	gint id;
2244 	auto *hc = rspamd::html::html_content::from_ptr(ptr);
2245 
2246 	g_assert (hc != NULL);
2247 
2248 	id = rspamd_html_tag_by_name(tagname);
2249 
2250 	if (id != -1) {
2251 		return hc->tags_seen[id];
2252 	}
2253 
2254 	return FALSE;
2255 }
2256 
2257 const gchar *
rspamd_html_tag_by_id(gint id)2258 rspamd_html_tag_by_id(gint id)
2259 {
2260 	const auto *td = rspamd::html::html_tags_defs.by_id(id);
2261 
2262 	if (td != nullptr) {
2263 		return td->name.c_str();
2264 	}
2265 
2266 	return nullptr;
2267 }
2268 
2269 const gchar *
rspamd_html_tag_name(void * p,gsize * len)2270 rspamd_html_tag_name(void *p, gsize *len)
2271 {
2272 	auto *tag = reinterpret_cast<rspamd::html::html_tag *>(p);
2273 	auto tname = rspamd::html::html_tags_defs.name_by_id_safe(tag->id);
2274 
2275 	if (len) {
2276 		*len = tname.size();
2277 	}
2278 
2279 	return tname.data();
2280 }
2281 
2282 struct html_image*
rspamd_html_find_embedded_image(void * html_content,const char * cid,gsize cid_len)2283 rspamd_html_find_embedded_image(void *html_content,
2284 								const char *cid, gsize cid_len)
2285 {
2286 	auto *hc = rspamd::html::html_content::from_ptr(html_content);
2287 
2288 	auto maybe_img = rspamd::html::html_find_image_by_cid(*hc, {cid, cid_len});
2289 
2290 	if (maybe_img) {
2291 		return (html_image *)maybe_img.value();
2292 	}
2293 
2294 	return nullptr;
2295 }
2296 
2297 bool
rspamd_html_get_parsed_content(void * html_content,rspamd_ftok_t * dest)2298 rspamd_html_get_parsed_content(void *html_content, rspamd_ftok_t *dest)
2299 {
2300 	auto *hc = rspamd::html::html_content::from_ptr(html_content);
2301 
2302 	dest->begin = hc->parsed.data();
2303 	dest->len = hc->parsed.size();
2304 
2305 	return true;
2306 }
2307 
2308 gsize
rspamd_html_get_tags_count(void * html_content)2309 rspamd_html_get_tags_count(void *html_content)
2310 {
2311 	auto *hc = rspamd::html::html_content::from_ptr(html_content);
2312 
2313 	if (!hc) {
2314 		return 0;
2315 	}
2316 
2317 	return hc->all_tags.size();
2318 }