1 /*- 2 * Copyright 2021 Vsevolod Stakhov 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef RSPAMD_HTML_TAG_HXX 18 #define RSPAMD_HTML_TAG_HXX 19 #pragma once 20 21 #include <utility> 22 #include <string_view> 23 #include <variant> 24 #include <vector> 25 #include <optional> 26 27 #include "html_tags.h" 28 29 struct rspamd_url; 30 struct html_image; 31 32 namespace rspamd::html { 33 34 struct html_content; /* Forward declaration */ 35 36 enum class html_component_type : std::uint8_t { 37 RSPAMD_HTML_COMPONENT_NAME = 0, 38 RSPAMD_HTML_COMPONENT_HREF, 39 RSPAMD_HTML_COMPONENT_COLOR, 40 RSPAMD_HTML_COMPONENT_BGCOLOR, 41 RSPAMD_HTML_COMPONENT_STYLE, 42 RSPAMD_HTML_COMPONENT_CLASS, 43 RSPAMD_HTML_COMPONENT_WIDTH, 44 RSPAMD_HTML_COMPONENT_HEIGHT, 45 RSPAMD_HTML_COMPONENT_SIZE, 46 RSPAMD_HTML_COMPONENT_REL, 47 RSPAMD_HTML_COMPONENT_ALT, 48 RSPAMD_HTML_COMPONENT_ID, 49 RSPAMD_HTML_COMPONENT_HIDDEN, 50 }; 51 52 /* Public tags flags */ 53 /* XML tag */ 54 #define FL_XML (1u << CM_USER_SHIFT) 55 /* Fully closed tag (e.g. <a attrs />) */ 56 #define FL_CLOSED (1 << (CM_USER_SHIFT + 1)) 57 #define FL_BROKEN (1 << (CM_USER_SHIFT + 2)) 58 #define FL_IGNORE (1 << (CM_USER_SHIFT + 3)) 59 #define FL_BLOCK (1 << (CM_USER_SHIFT + 4)) 60 #define FL_HREF (1 << (CM_USER_SHIFT + 5)) 61 #define FL_COMMENT (1 << (CM_USER_SHIFT + 6)) 62 #define FL_VIRTUAL (1 << (CM_USER_SHIFT + 7)) 63 64 /** 65 * Returns component type from a string 66 * @param st 67 * @return 68 */ 69 auto html_component_from_string(const std::string_view &st) -> std::optional<html_component_type>; 70 71 using html_tag_extra_t = std::variant<std::monostate, struct rspamd_url *, struct html_image *>; 72 struct html_tag_component { 73 html_component_type type; 74 std::string_view value; 75 html_tag_componentrspamd::html::html_tag_component76 html_tag_component(html_component_type type, std::string_view value) 77 : type(type), value(value) {} 78 }; 79 80 /* Pairing closing tag representation */ 81 struct html_closing_tag { 82 int start = -1; 83 int end = -1; 84 clearrspamd::html::html_closing_tag85 auto clear() -> void { 86 start = end = -1; 87 } 88 }; 89 90 struct html_tag { 91 unsigned int tag_start = 0; 92 unsigned int content_offset = 0; 93 std::uint32_t flags = 0; 94 tag_id_t id = Tag_UNKNOWN; 95 html_closing_tag closing; 96 97 std::vector<html_tag_component> components; 98 99 html_tag_extra_t extra; 100 mutable struct html_block *block = nullptr; 101 std::vector<struct html_tag *> children; 102 struct html_tag *parent; 103 find_componentrspamd::html::html_tag104 auto find_component(html_component_type what) const -> std::optional<std::string_view> 105 { 106 for (const auto &comp : components) { 107 if (comp.type == what) { 108 return comp.value; 109 } 110 } 111 112 return std::nullopt; 113 } 114 find_componentrspamd::html::html_tag115 auto find_component(std::optional<html_component_type> what) const -> std::optional<std::string_view> 116 { 117 if (what) { 118 return find_component(what.value()); 119 } 120 121 return std::nullopt; 122 } 123 clearrspamd::html::html_tag124 auto clear(void) -> void { 125 id = Tag_UNKNOWN; 126 tag_start = content_offset = 0; 127 extra = std::monostate{}; 128 components.clear(); 129 flags = 0; 130 block = nullptr; 131 children.clear(); 132 closing.clear(); 133 } 134 get_content_lengthrspamd::html::html_tag135 constexpr auto get_content_length() const -> std::size_t { 136 if (flags & (FL_IGNORE|CM_HEAD)) { 137 return 0; 138 } 139 if (closing.start > content_offset) { 140 return closing.start - content_offset; 141 } 142 143 return 0; 144 } 145 146 auto get_content(const struct html_content *hc) const -> std::string_view; 147 }; 148 149 static_assert(CM_USER_SHIFT + 7 < sizeof(html_tag::flags) * NBBY); 150 151 } 152 153 #endif //RSPAMD_HTML_TAG_HXX 154