1 /*-
2  * Copyright 2021 Vsevolod Stakhov
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *   http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef RSPAMD_HTML_HXX
18 #define RSPAMD_HTML_HXX
19 #pragma once
20 
21 #include "config.h"
22 #include "libserver/url.h"
23 #include "libserver/html/html_tag.hxx"
24 #include "libserver/html/html.h"
25 #include "libserver/html/html_tags.h"
26 
27 
28 #include <vector>
29 #include <memory>
30 #include <string>
31 #include "function2/function2.hpp"
32 
33 namespace rspamd::css {
34 /* Forward declaration */
35 class css_style_sheet;
36 }
37 
38 namespace rspamd::html {
39 
40 struct html_block;
41 
42 struct html_content {
43 	struct rspamd_url *base_url = nullptr;
44 	struct html_tag *root_tag = nullptr;
45 	gint flags = 0;
46 	std::vector<bool> tags_seen;
47 	std::vector<html_image *> images;
48 	std::vector<std::unique_ptr<struct html_tag>> all_tags;
49 	std::string parsed;
50 	std::string invisible;
51 	std::shared_ptr<css::css_style_sheet> css_style;
52 
53 	/* Preallocate and reserve all internal structures */
html_contentrspamd::html::html_content54 	html_content() {
55 		tags_seen.resize(Tag_MAX, false);
56 		all_tags.reserve(128);
57 		parsed.reserve(256);
58 	}
59 
html_content_dtorrspamd::html::html_content60 	static void html_content_dtor(void *ptr) {
61 		delete html_content::from_ptr(ptr);
62 	}
63 
from_ptrrspamd::html::html_content64 	static auto from_ptr(void *ptr) -> html_content * {
65 		return static_cast<html_content* >(ptr);
66 	}
67 
68 	enum class traverse_type {
69 		PRE_ORDER,
70 		POST_ORDER
71 	};
traverse_block_tagsrspamd::html::html_content72 	auto traverse_block_tags(fu2::function<bool(const html_tag *)> &&func,
73 					traverse_type how = traverse_type::PRE_ORDER) const -> bool {
74 
75 		if (root_tag == nullptr) {
76 			return false;
77 		}
78 
79 		auto rec_functor_pre_order = [&](const html_tag *root, auto &&rec) -> bool {
80 			if (func(root)) {
81 
82 				for (const auto *c : root->children) {
83 					if (!rec(c, rec)) {
84 						return false;
85 					}
86 				}
87 
88 				return true;
89 			}
90 			return false;
91 		};
92 		auto rec_functor_post_order = [&](const html_tag *root, auto &&rec) -> bool {
93 			for (const auto *c : root->children) {
94 				if (!rec(c, rec)) {
95 					return false;
96 				}
97 			}
98 
99 			return func(root);
100 		};
101 
102 		switch(how) {
103 		case traverse_type::PRE_ORDER:
104 			return rec_functor_pre_order(root_tag, rec_functor_pre_order);
105 		case traverse_type::POST_ORDER:
106 			return rec_functor_post_order(root_tag, rec_functor_post_order);
107 		default:
108 			RSPAMD_UNREACHABLE;
109 		}
110 	}
111 
traverse_all_tagsrspamd::html::html_content112 	auto traverse_all_tags(fu2::function<bool(const html_tag *)> &&func) const -> bool {
113 		for (const auto &tag : all_tags) {
114 			if (!(tag->flags & (FL_XML|FL_VIRTUAL))) {
115 				if (!func(tag.get())) {
116 					return false;
117 				}
118 			}
119 		}
120 
121 		return true;
122 	}
123 
124 private:
125 	~html_content() = default;
126 };
127 
128 
/**
 * Maps a tag name to its tag id.
 * NOTE(review): the definition is not in this header; presumably an empty
 * optional means the name is not a known tag - confirm against the source.
 */
129 auto html_tag_by_name(const std::string_view &name) -> std::optional<tag_id_t>;
/**
 * Entry point that yields an html_content for the input buffer `in`.
 * NOTE(review): only the signature is visible here. Presumably `pool` owns
 * the returned object (html_content has a private destructor and exposes
 * html_content_dtor() as a deleter), `exceptions`, `url_set` and `part_urls`
 * are output collections, and `allow_css` toggles CSS handling - verify all
 * of this against the definition before relying on it.
 */
130 auto html_process_input(rspamd_mempool_t *pool,
131 				   GByteArray *in,
132 				   GList **exceptions,
133 				   khash_t (rspamd_url_hash) *url_set,
134 				   GPtrArray *part_urls,
135 				   bool allow_css) -> html_content *;
/**
 * Produces a string from the given html_content.
 * NOTE(review): judging by the name this is a debugging dump of the tag
 * structure; the exact format is defined elsewhere - confirm before parsing it.
 */
136 auto html_debug_structure(const html_content &hc) -> std::string;
137 
138 }
139 
140 #endif //RSPAMD_HTML_HXX
141