1 /*-
2  * Copyright 2021 Vsevolod Stakhov
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *   http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef RSPAMD_HTML_TAG_HXX
18 #define RSPAMD_HTML_TAG_HXX
19 #pragma once
20 
#include <climits>
#include <optional>
#include <string_view>
#include <utility>
#include <variant>
#include <vector>
26 
27 #include "html_tags.h"
28 
29 struct rspamd_url;
30 struct html_image;
31 
32 namespace rspamd::html {
33 
34 struct html_content; /* Forward declaration */
35 
/**
 * Identifies the kind of a tag component (see html_tag_component::type).
 * Stored in a single byte; the values are consecutive and start at zero.
 */
enum class html_component_type : std::uint8_t {
	RSPAMD_HTML_COMPONENT_NAME = 0,
	RSPAMD_HTML_COMPONENT_HREF = 1,
	RSPAMD_HTML_COMPONENT_COLOR = 2,
	RSPAMD_HTML_COMPONENT_BGCOLOR = 3,
	RSPAMD_HTML_COMPONENT_STYLE = 4,
	RSPAMD_HTML_COMPONENT_CLASS = 5,
	RSPAMD_HTML_COMPONENT_WIDTH = 6,
	RSPAMD_HTML_COMPONENT_HEIGHT = 7,
	RSPAMD_HTML_COMPONENT_SIZE = 8,
	RSPAMD_HTML_COMPONENT_REL = 9,
	RSPAMD_HTML_COMPONENT_ALT = 10,
	RSPAMD_HTML_COMPONENT_ID = 11,
	RSPAMD_HTML_COMPONENT_HIDDEN = 12,
};
51 
/*
 * Public tags flags.
 *
 * Each flag is a single bit placed at CM_USER_SHIFT or above, so the flags
 * can be or-ed together with the CM_* values kept in html_tag::flags.
 * All flags are unsigned: the highest one may legitimately end up in bit 31
 * of the 32 bit flags word, which a signed `1 << n` cannot represent as a
 * positive value.
 */
/* XML tag */
#define FL_XML          (1u << CM_USER_SHIFT)
/* Fully closed tag (e.g. <a attrs />) */
#define FL_CLOSED       (1u << (CM_USER_SHIFT + 1))
#define FL_BROKEN       (1u << (CM_USER_SHIFT + 2))
#define FL_IGNORE       (1u << (CM_USER_SHIFT + 3))
#define FL_BLOCK        (1u << (CM_USER_SHIFT + 4))
#define FL_HREF         (1u << (CM_USER_SHIFT + 5))
#define FL_COMMENT      (1u << (CM_USER_SHIFT + 6))
#define FL_VIRTUAL      (1u << (CM_USER_SHIFT + 7))
63 
/**
 * Returns component type from a string
 * @param st string to look up (presumably a component name; the accepted
 * spellings are decided by the definition, which is not part of this header)
 * @return the matching html_component_type, or an empty optional
 * (presumably when the string is not recognised - confirm against the definition)
 */
auto html_component_from_string(const std::string_view &st) -> std::optional<html_component_type>;
70 
/*
 * Optional extra payload of a tag: nothing (std::monostate), a pointer to
 * struct rspamd_url or a pointer to struct html_image. Only raw pointers are
 * held, so this type says nothing about who owns the pointed-to objects.
 */
using html_tag_extra_t = std::variant<std::monostate, struct rspamd_url *, struct html_image *>;
72 struct html_tag_component {
73 	html_component_type type;
74 	std::string_view value;
75 
html_tag_componentrspamd::html::html_tag_component76 	html_tag_component(html_component_type type, std::string_view value)
77 		: type(type), value(value) {}
78 };
79 
80 /* Pairing closing tag representation */
/**
 * Representation of the closing tag that is paired with an opening one.
 * Both positions hold -1 until they are assigned.
 */
struct html_closing_tag {
	int start = -1;
	int end = -1;

	/* Puts both positions back to their initial -1 value */
	auto clear() -> void
	{
		start = -1;
		end = -1;
	}
};
89 
90 struct html_tag {
91 	unsigned int tag_start = 0;
92 	unsigned int content_offset = 0;
93 	std::uint32_t flags = 0;
94 	tag_id_t id = Tag_UNKNOWN;
95 	html_closing_tag closing;
96 
97 	std::vector<html_tag_component> components;
98 
99 	html_tag_extra_t extra;
100 	mutable struct html_block *block = nullptr;
101 	std::vector<struct html_tag *> children;
102 	struct html_tag *parent;
103 
find_componentrspamd::html::html_tag104 	auto find_component(html_component_type what) const -> std::optional<std::string_view>
105 	{
106 		for (const auto &comp : components) {
107 			if (comp.type == what) {
108 				return comp.value;
109 			}
110 		}
111 
112 		return std::nullopt;
113 	}
114 
find_componentrspamd::html::html_tag115 	auto find_component(std::optional<html_component_type> what) const -> std::optional<std::string_view>
116 	{
117 		if (what) {
118 			return find_component(what.value());
119 		}
120 
121 		return std::nullopt;
122 	}
123 
clearrspamd::html::html_tag124 	auto clear(void) -> void {
125 		id = Tag_UNKNOWN;
126 		tag_start = content_offset = 0;
127 		extra = std::monostate{};
128 		components.clear();
129 		flags = 0;
130 		block = nullptr;
131 		children.clear();
132 		closing.clear();
133 	}
134 
get_content_lengthrspamd::html::html_tag135 	constexpr auto get_content_length() const -> std::size_t {
136 		if (flags & (FL_IGNORE|CM_HEAD)) {
137 			return 0;
138 		}
139 		if (closing.start > content_offset) {
140 			return closing.start - content_offset;
141 		}
142 
143 		return 0;
144 	}
145 
146 	auto get_content(const struct html_content *hc) const -> std::string_view;
147 };
148 
149 static_assert(CM_USER_SHIFT + 7 < sizeof(html_tag::flags) * NBBY);
150 
151 }
152 
153 #endif //RSPAMD_HTML_TAG_HXX
154