1 /*- 2 * Copyright 2021 Vsevolod Stakhov 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef RSPAMD_HTML_HXX 18 #define RSPAMD_HTML_HXX 19 #pragma once 20 21 #include "config.h" 22 #include "libserver/url.h" 23 #include "libserver/html/html_tag.hxx" 24 #include "libserver/html/html.h" 25 #include "libserver/html/html_tags.h" 26 27 28 #include <vector> 29 #include <memory> 30 #include <string> 31 #include "function2/function2.hpp" 32 33 namespace rspamd::css { 34 /* Forward declaration */ 35 class css_style_sheet; 36 } 37 38 namespace rspamd::html { 39 40 struct html_block; 41 42 struct html_content { 43 struct rspamd_url *base_url = nullptr; 44 struct html_tag *root_tag = nullptr; 45 gint flags = 0; 46 std::vector<bool> tags_seen; 47 std::vector<html_image *> images; 48 std::vector<std::unique_ptr<struct html_tag>> all_tags; 49 std::string parsed; 50 std::string invisible; 51 std::shared_ptr<css::css_style_sheet> css_style; 52 53 /* Preallocate and reserve all internal structures */ html_contentrspamd::html::html_content54 html_content() { 55 tags_seen.resize(Tag_MAX, false); 56 all_tags.reserve(128); 57 parsed.reserve(256); 58 } 59 html_content_dtorrspamd::html::html_content60 static void html_content_dtor(void *ptr) { 61 delete html_content::from_ptr(ptr); 62 } 63 from_ptrrspamd::html::html_content64 static auto from_ptr(void *ptr) -> html_content * { 65 return static_cast<html_content* >(ptr); 66 } 67 68 enum class traverse_type { 69 PRE_ORDER, 70 POST_ORDER 71 }; traverse_block_tagsrspamd::html::html_content72 auto traverse_block_tags(fu2::function<bool(const html_tag *)> &&func, 73 traverse_type how = traverse_type::PRE_ORDER) const -> bool { 74 75 if (root_tag == nullptr) { 76 return false; 77 } 78 79 auto rec_functor_pre_order = [&](const html_tag *root, auto &&rec) -> bool { 80 if (func(root)) { 81 82 for (const auto *c : root->children) { 83 if (!rec(c, rec)) { 84 return false; 85 } 86 } 87 88 return true; 89 } 90 return false; 91 }; 92 auto rec_functor_post_order = [&](const html_tag *root, auto &&rec) -> bool { 93 for (const auto *c : root->children) { 94 if (!rec(c, rec)) { 95 return false; 96 } 97 } 98 99 return func(root); 100 }; 101 102 switch(how) { 103 case traverse_type::PRE_ORDER: 104 return rec_functor_pre_order(root_tag, rec_functor_pre_order); 105 case traverse_type::POST_ORDER: 106 return rec_functor_post_order(root_tag, rec_functor_post_order); 107 default: 108 RSPAMD_UNREACHABLE; 109 } 110 } 111 traverse_all_tagsrspamd::html::html_content112 auto traverse_all_tags(fu2::function<bool(const html_tag *)> &&func) const -> bool { 113 for (const auto &tag : all_tags) { 114 if (!(tag->flags & (FL_XML|FL_VIRTUAL))) { 115 if (!func(tag.get())) { 116 return false; 117 } 118 } 119 } 120 121 return true; 122 } 123 124 private: 125 ~html_content() = default; 126 }; 127 128 129 auto html_tag_by_name(const std::string_view &name) -> std::optional<tag_id_t>; 130 auto html_process_input(rspamd_mempool_t *pool, 131 GByteArray *in, 132 GList **exceptions, 133 khash_t (rspamd_url_hash) *url_set, 134 GPtrArray *part_urls, 135 bool allow_css) -> html_content *; 136 auto html_debug_structure(const html_content &hc) -> std::string; 137 138 } 139 140 #endif //RSPAMD_HTML_HXX 141