1 /* 2 Copyright (C) 2015-2017 Alexander Borisov 3 4 This library is free software; you can redistribute it and/or 5 modify it under the terms of the GNU Lesser General Public 6 License as published by the Free Software Foundation; either 7 version 2.1 of the License, or (at your option) any later version. 8 9 This library is distributed in the hope that it will be useful, 10 but WITHOUT ANY WARRANTY; without even the implied warranty of 11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 Lesser General Public License for more details. 13 14 You should have received a copy of the GNU Lesser General Public 15 License along with this library; if not, write to the Free Software 16 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 17 18 Author: lex.borisov@gmail.com (Alexander Borisov) 19 */ 20 21 #ifndef MyHTML_TREE_H 22 #define MyHTML_TREE_H 23 #pragma once 24 25 #ifdef __cplusplus 26 extern "C" { 27 #endif 28 29 #include <myhtml/myosi.h> 30 #include <myhtml/myhtml.h> 31 #include <myhtml/mystring.h> 32 #include <myhtml/token.h> 33 #include <myhtml/stream.h> 34 #include <mycore/thread_queue.h> 35 #include <mycore/utils/mcsync.h> 36 #include <mycore/utils/mchar_async.h> 37 #include <mycore/utils/mcobject.h> 38 #include <mycore/utils/mcobject_async.h> 39 40 #define myhtml_tree_get(tree, attr) tree->attr 41 #define myhtml_tree_set(tree, attr) tree->attr 42 43 #define myhtml_tree_token_current(tree) myhtml_tree_get(tree, token_current) 44 #define myhtml_tree_token_attr_current(tree) myhtml_tree_get(tree, attr_current) 45 46 #define myhtml_tree_node_get(tree, node_id, attr) tree->nodes[node_id].attr 47 48 #define myhtml_tree_node_callback_insert(tree, node) \ 49 if(tree->callback_tree_node_insert) \ 50 tree->callback_tree_node_insert(tree, node, tree->callback_tree_node_insert_ctx) 51 52 #define myhtml_tree_node_callback_remove(tree, node) \ 53 if(tree->callback_tree_node_remove) \ 54 tree->callback_tree_node_remove(tree, node, tree->callback_tree_node_remove_ctx) 55 56 enum myhtml_tree_node_type { 57 MyHTML_TYPE_NONE = 0, 58 MyHTML_TYPE_BLOCK = 1, 59 MyHTML_TYPE_INLINE = 2, 60 MyHTML_TYPE_TABLE = 3, 61 MyHTML_TYPE_META = 4, 62 MyHTML_TYPE_COMMENT = 5 63 }; 64 65 enum myhtml_close_type { 66 MyHTML_CLOSE_TYPE_NONE = 0, 67 MyHTML_CLOSE_TYPE_NOW = 1, 68 MyHTML_CLOSE_TYPE_SELF = 2, 69 MyHTML_CLOSE_TYPE_BLOCK = 3 70 }; 71 72 enum myhtml_tree_node_flags { 73 MyHTML_TREE_NODE_UNDEF = 0, 74 MyHTML_TREE_NODE_PARSER_INSERTED = 1, 75 MyHTML_TREE_NODE_BLOCKING = 2 76 }; 77 78 struct myhtml_tree_node { 79 enum myhtml_tree_node_flags flags; 80 81 myhtml_tag_id_t tag_id; 82 enum myhtml_namespace ns; 83 84 myhtml_tree_node_t* prev; 85 myhtml_tree_node_t* next; 86 myhtml_tree_node_t* child; 87 myhtml_tree_node_t* parent; 88 89 myhtml_tree_node_t* last_child; 90 91 myhtml_token_node_t* token; 92 void* data; 93 94 myhtml_tree_t* tree; 95 }; 96 97 enum myhtml_tree_compat_mode { 98 MyHTML_TREE_COMPAT_MODE_NO_QUIRKS = 0x00, 99 MyHTML_TREE_COMPAT_MODE_QUIRKS = 0x01, 100 MyHTML_TREE_COMPAT_MODE_LIMITED_QUIRKS = 0x02 101 }; 102 103 enum myhtml_tree_doctype_id { 104 MyHTML_TREE_DOCTYPE_ID_NAME = 0x00, 105 MyHTML_TREE_DOCTYPE_ID_SYSTEM = 0x01, 106 MyHTML_TREE_DOCTYPE_ID_PUBLIC = 0x02 107 }; 108 109 enum myhtml_tree_insertion_mode { 110 MyHTML_TREE_INSERTION_MODE_DEFAULT = 0x00, 111 MyHTML_TREE_INSERTION_MODE_BEFORE = 0x01, 112 MyHTML_TREE_INSERTION_MODE_AFTER = 0x02 113 }; 114 115 struct myhtml_async_args { 116 size_t mchar_node_id; 117 }; 118 119 struct myhtml_tree_doctype { 120 bool is_html; 121 char* attr_name; 122 char* attr_public; 123 char* attr_system; 124 }; 125 126 struct myhtml_tree_list { 127 myhtml_tree_node_t** list; 128 volatile size_t length; 129 size_t size; 130 }; 131 132 struct myhtml_tree_token_list { 133 myhtml_token_node_t** list; 134 size_t length; 135 size_t size; 136 }; 137 138 struct myhtml_tree_insertion_list { 139 enum myhtml_insertion_mode* list; 140 size_t length; 141 size_t size; 142 }; 143 144 struct myhtml_tree_temp_tag_name { 145 char *data; 146 size_t length; 147 size_t size; 148 }; 149 150 struct myhtml_tree_special_token { 151 myhtml_token_node_t *token; 152 myhtml_namespace_t ns; 153 } 154 typedef myhtml_tree_special_token_t; 155 156 struct myhtml_tree_special_token_list { 157 myhtml_tree_special_token_t *list; 158 size_t length; 159 size_t size; 160 } 161 typedef myhtml_tree_special_token_list_t; 162 163 struct myhtml_tree_temp_stream { 164 struct myhtml_tree_temp_tag_name** data; 165 size_t length; 166 size_t size; 167 168 myencoding_result_t res; 169 struct myhtml_tree_temp_tag_name* current; 170 }; 171 172 struct myhtml_tree { 173 // ref 174 myhtml_t* myhtml; 175 mchar_async_t* mchar; 176 myhtml_token_t* token; 177 mcobject_async_t* tree_obj; 178 mcsync_t* sync; 179 mythread_queue_list_entry_t* queue_entry; 180 mythread_queue_t* queue; 181 myhtml_tag_t* tags; 182 void* modest; 183 void* context; 184 185 // init id's 186 size_t mcasync_rules_token_id; 187 size_t mcasync_rules_attr_id; 188 size_t mcasync_tree_id; 189 /* 190 * mchar_node_id 191 * for rules, or if single mode, 192 * or for main thread only after parsing 193 */ 194 size_t mchar_node_id; 195 myhtml_token_attr_t* attr_current; 196 myhtml_tag_id_t tmp_tag_id; 197 myhtml_token_node_t* current_token_node; 198 mythread_queue_node_t* current_qnode; 199 200 mcobject_t* mcobject_incoming_buf; 201 mycore_incoming_buffer_t* incoming_buf; 202 mycore_incoming_buffer_t* incoming_buf_first; 203 204 // ref for nodes 205 myhtml_tree_node_t* document; 206 myhtml_tree_node_t* fragment; 207 myhtml_tree_node_t* node_head; 208 myhtml_tree_node_t* node_html; 209 myhtml_tree_node_t* node_body; 210 myhtml_tree_node_t* node_form; 211 myhtml_tree_doctype_t doctype; 212 213 // for build tree 214 myhtml_tree_list_t* active_formatting; 215 myhtml_tree_list_t* open_elements; 216 myhtml_tree_list_t* other_elements; 217 myhtml_tree_token_list_t* token_list; 218 myhtml_tree_insertion_list_t* template_insertion; 219 myhtml_async_args_t* async_args; 220 myhtml_stream_buffer_t* stream_buffer; 221 myhtml_token_node_t* volatile token_last_done; 222 223 // for detect namespace out of tree builder 224 myhtml_token_node_t* token_namespace; 225 226 // tree params 227 enum myhtml_tokenizer_state state; 228 enum myhtml_tokenizer_state state_of_builder; 229 enum myhtml_insertion_mode insert_mode; 230 enum myhtml_insertion_mode orig_insert_mode; 231 enum myhtml_tree_compat_mode compat_mode; 232 volatile enum myhtml_tree_flags flags; 233 volatile myhtml_tree_parse_flags_t parse_flags; 234 bool foster_parenting; 235 size_t global_offset; 236 mystatus_t tokenizer_status; 237 238 myencoding_t encoding; 239 myencoding_t encoding_usereq; 240 myhtml_tree_temp_tag_name_t temp_tag_name; 241 242 /* callback */ 243 myhtml_callback_token_f callback_before_token; 244 myhtml_callback_token_f callback_after_token; 245 246 void* callback_before_token_ctx; 247 void* callback_after_token_ctx; 248 249 myhtml_callback_tree_node_f callback_tree_node_insert; 250 myhtml_callback_tree_node_f callback_tree_node_remove; 251 252 void* callback_tree_node_insert_ctx; 253 void* callback_tree_node_remove_ctx; 254 }; 255 256 // base 257 myhtml_tree_t * myhtml_tree_create(void); 258 mystatus_t myhtml_tree_init(myhtml_tree_t* tree, myhtml_t* myhtml); 259 void myhtml_tree_clean(myhtml_tree_t* tree); 260 void myhtml_tree_clean_all(myhtml_tree_t* tree); 261 myhtml_tree_t * myhtml_tree_destroy(myhtml_tree_t* tree); 262 263 /* parse flags */ 264 myhtml_tree_parse_flags_t myhtml_tree_parse_flags(myhtml_tree_t* tree); 265 void myhtml_tree_parse_flags_set(myhtml_tree_t* tree, myhtml_tree_parse_flags_t flags); 266 267 myhtml_t * myhtml_tree_get_myhtml(myhtml_tree_t* tree); 268 myhtml_tag_t * myhtml_tree_get_tag(myhtml_tree_t* tree); 269 myhtml_tree_node_t * myhtml_tree_get_document(myhtml_tree_t* tree); 270 myhtml_tree_node_t * myhtml_tree_get_node_html(myhtml_tree_t* tree); 271 myhtml_tree_node_t * myhtml_tree_get_node_head(myhtml_tree_t* tree); 272 myhtml_tree_node_t * myhtml_tree_get_node_body(myhtml_tree_t* tree); 273 274 mchar_async_t * myhtml_tree_get_mchar(myhtml_tree_t* tree); 275 size_t myhtml_tree_get_mchar_node_id(myhtml_tree_t* tree); 276 277 // list 278 myhtml_tree_list_t * myhtml_tree_list_init(void); 279 void myhtml_tree_list_clean(myhtml_tree_list_t* list); 280 myhtml_tree_list_t * myhtml_tree_list_destroy(myhtml_tree_list_t* list, bool destroy_self); 281 282 void myhtml_tree_list_append(myhtml_tree_list_t* list, myhtml_tree_node_t* node); 283 void myhtml_tree_list_append_after_index(myhtml_tree_list_t* list, myhtml_tree_node_t* node, size_t index); 284 void myhtml_tree_list_insert_by_index(myhtml_tree_list_t* list, myhtml_tree_node_t* node, size_t index); 285 myhtml_tree_node_t * myhtml_tree_list_current_node(myhtml_tree_list_t* list); 286 287 // token list 288 myhtml_tree_token_list_t * myhtml_tree_token_list_init(void); 289 void myhtml_tree_token_list_clean(myhtml_tree_token_list_t* list); 290 myhtml_tree_token_list_t * myhtml_tree_token_list_destroy(myhtml_tree_token_list_t* list, bool destroy_self); 291 292 void myhtml_tree_token_list_append(myhtml_tree_token_list_t* list, myhtml_token_node_t* token); 293 void myhtml_tree_token_list_append_after_index(myhtml_tree_token_list_t* list, myhtml_token_node_t* token, size_t index); 294 myhtml_token_node_t * myhtml_tree_token_list_current_node(myhtml_tree_token_list_t* list); 295 296 // active formatting 297 myhtml_tree_list_t * myhtml_tree_active_formatting_init(myhtml_tree_t* tree); 298 void myhtml_tree_active_formatting_clean(myhtml_tree_t* tree); 299 myhtml_tree_list_t * myhtml_tree_active_formatting_destroy(myhtml_tree_t* tree); 300 301 bool myhtml_tree_active_formatting_is_marker(myhtml_tree_t* tree, myhtml_tree_node_t* idx); 302 myhtml_tree_node_t* myhtml_tree_active_formatting_between_last_marker(myhtml_tree_t* tree, myhtml_tag_id_t tag_idx, size_t* return_idx); 303 304 void myhtml_tree_active_formatting_append(myhtml_tree_t* tree, myhtml_tree_node_t* node); 305 void myhtml_tree_active_formatting_append_with_check(myhtml_tree_t* tree, myhtml_tree_node_t* node); 306 void myhtml_tree_active_formatting_pop(myhtml_tree_t* tree); 307 void myhtml_tree_active_formatting_remove(myhtml_tree_t* tree, myhtml_tree_node_t* node); 308 void myhtml_tree_active_formatting_remove_by_index(myhtml_tree_t* tree, size_t idx); 309 310 void myhtml_tree_active_formatting_reconstruction(myhtml_tree_t* tree); 311 void myhtml_tree_active_formatting_up_to_last_marker(myhtml_tree_t* tree); 312 313 bool myhtml_tree_active_formatting_find(myhtml_tree_t* tree, myhtml_tree_node_t* idx, size_t* return_idx); 314 myhtml_tree_node_t* myhtml_tree_active_formatting_current_node(myhtml_tree_t* tree); 315 316 // open elements 317 myhtml_tree_list_t * myhtml_tree_open_elements_init(myhtml_tree_t* tree); 318 void myhtml_tree_open_elements_clean(myhtml_tree_t* tree); 319 myhtml_tree_list_t * myhtml_tree_open_elements_destroy(myhtml_tree_t* tree); 320 321 myhtml_tree_node_t* myhtml_tree_current_node(myhtml_tree_t* tree); 322 myhtml_tree_node_t * myhtml_tree_adjusted_current_node(myhtml_tree_t* tree); 323 324 void myhtml_tree_open_elements_append(myhtml_tree_t* tree, myhtml_tree_node_t* node); 325 void myhtml_tree_open_elements_append_after_index(myhtml_tree_t* tree, myhtml_tree_node_t* node, size_t index); 326 void myhtml_tree_open_elements_pop(myhtml_tree_t* tree); 327 void myhtml_tree_open_elements_pop_until(myhtml_tree_t* tree, myhtml_tag_id_t tag_idx, myhtml_namespace_t mynamespace, bool is_exclude); 328 void myhtml_tree_open_elements_pop_until_by_node(myhtml_tree_t* tree, myhtml_tree_node_t* node_idx, bool is_exclude); 329 void myhtml_tree_open_elements_pop_until_by_index(myhtml_tree_t* tree, size_t idx, bool is_exclude); 330 void myhtml_tree_open_elements_remove(myhtml_tree_t* tree, myhtml_tree_node_t* node); 331 332 bool myhtml_tree_open_elements_find(myhtml_tree_t* tree, myhtml_tree_node_t* idx, size_t* pos); 333 bool myhtml_tree_open_elements_find_reverse(myhtml_tree_t* tree, myhtml_tree_node_t* idx, size_t* pos); 334 myhtml_tree_node_t * myhtml_tree_open_elements_find_by_tag_idx(myhtml_tree_t* tree, myhtml_tag_id_t tag_idx, myhtml_namespace_t mynamespace, size_t* return_index); 335 myhtml_tree_node_t * myhtml_tree_open_elements_find_by_tag_idx_reverse(myhtml_tree_t* tree, myhtml_tag_id_t tag_idx, myhtml_namespace_t mynamespace, size_t* return_index); 336 myhtml_tree_node_t * myhtml_tree_element_in_scope(myhtml_tree_t* tree, myhtml_tag_id_t tag_idx, myhtml_namespace_t mynamespace, enum myhtml_tag_categories category); 337 bool myhtml_tree_element_in_scope_by_node(myhtml_tree_node_t* node, enum myhtml_tag_categories category); 338 void myhtml_tree_generate_implied_end_tags(myhtml_tree_t* tree, myhtml_tag_id_t exclude_tag_idx, myhtml_namespace_t mynamespace); 339 void myhtml_tree_generate_all_implied_end_tags(myhtml_tree_t* tree, myhtml_tag_id_t exclude_tag_idx, myhtml_namespace_t mynamespace); 340 myhtml_tree_node_t * myhtml_tree_appropriate_place_inserting(myhtml_tree_t* tree, myhtml_tree_node_t* override_target, enum myhtml_tree_insertion_mode* mode); 341 myhtml_tree_node_t * myhtml_tree_appropriate_place_inserting_in_tree(myhtml_tree_node_t* target, enum myhtml_tree_insertion_mode* mode); 342 343 // template insertion 344 myhtml_tree_insertion_list_t * myhtml_tree_template_insertion_init(myhtml_tree_t* tree); 345 void myhtml_tree_template_insertion_clean(myhtml_tree_t* tree); 346 myhtml_tree_insertion_list_t * myhtml_tree_template_insertion_destroy(myhtml_tree_t* tree); 347 348 void myhtml_tree_template_insertion_append(myhtml_tree_t* tree, enum myhtml_insertion_mode insert_mode); 349 void myhtml_tree_template_insertion_pop(myhtml_tree_t* tree); 350 351 void myhtml_tree_reset_insertion_mode_appropriately(myhtml_tree_t* tree); 352 353 bool myhtml_tree_adoption_agency_algorithm(myhtml_tree_t* tree, myhtml_token_node_t* token, myhtml_tag_id_t subject_tag_idx); 354 size_t myhtml_tree_template_insertion_length(myhtml_tree_t* tree); 355 356 // other for a tree 357 myhtml_tree_node_t * myhtml_tree_node_create(myhtml_tree_t* tree); 358 void myhtml_tree_node_delete(myhtml_tree_node_t* node); 359 void myhtml_tree_node_delete_recursive(myhtml_tree_node_t* node); 360 void myhtml_tree_node_clean(myhtml_tree_node_t* tree_node); 361 void myhtml_tree_node_free(myhtml_tree_node_t* node); 362 myhtml_tree_node_t * myhtml_tree_node_clone(myhtml_tree_node_t* node); 363 364 void myhtml_tree_node_add_child(myhtml_tree_node_t* root, myhtml_tree_node_t* node); 365 void myhtml_tree_node_insert_before(myhtml_tree_node_t* root, myhtml_tree_node_t* node); 366 void myhtml_tree_node_insert_after(myhtml_tree_node_t* root, myhtml_tree_node_t* node); 367 void myhtml_tree_node_insert_by_mode(myhtml_tree_node_t* adjusted_location, myhtml_tree_node_t* node, enum myhtml_tree_insertion_mode mode); 368 myhtml_tree_node_t * myhtml_tree_node_remove(myhtml_tree_node_t* node); 369 370 myhtml_tree_node_t * myhtml_tree_node_insert_html_element(myhtml_tree_t* tree, myhtml_token_node_t* token); 371 myhtml_tree_node_t * myhtml_tree_node_insert_foreign_element(myhtml_tree_t* tree, myhtml_token_node_t* token); 372 myhtml_tree_node_t * myhtml_tree_node_insert_by_token(myhtml_tree_t* tree, myhtml_token_node_t* token, myhtml_namespace_t ns); 373 myhtml_tree_node_t * myhtml_tree_node_insert(myhtml_tree_t* tree, myhtml_tag_id_t tag_idx, myhtml_namespace_t ns); 374 myhtml_tree_node_t * myhtml_tree_node_insert_by_node(myhtml_tree_t* tree, myhtml_tree_node_t* idx); 375 myhtml_tree_node_t * myhtml_tree_node_insert_comment(myhtml_tree_t* tree, myhtml_token_node_t* token, myhtml_tree_node_t* parent); 376 myhtml_tree_node_t * myhtml_tree_node_insert_doctype(myhtml_tree_t* tree, myhtml_token_node_t* token); 377 myhtml_tree_node_t * myhtml_tree_node_insert_root(myhtml_tree_t* tree, myhtml_token_node_t* token, myhtml_namespace_t ns); 378 myhtml_tree_node_t * myhtml_tree_node_insert_text(myhtml_tree_t* tree, myhtml_token_node_t* token); 379 myhtml_tree_node_t * myhtml_tree_node_find_parent_by_tag_id(myhtml_tree_node_t* node, myhtml_tag_id_t tag_id); 380 381 // other 382 void myhtml_tree_wait_for_last_done_token(myhtml_tree_t* tree, myhtml_token_node_t* token_for_wait); 383 384 void myhtml_tree_tags_close_p(myhtml_tree_t* tree, myhtml_token_node_t* token); 385 myhtml_tree_node_t * myhtml_tree_generic_raw_text_element_parsing_algorithm(myhtml_tree_t* tree, myhtml_token_node_t* token_node); 386 void myhtml_tree_clear_stack_back_table_context(myhtml_tree_t* tree); 387 void myhtml_tree_clear_stack_back_table_body_context(myhtml_tree_t* tree); 388 void myhtml_tree_clear_stack_back_table_row_context(myhtml_tree_t* tree); 389 void myhtml_tree_close_cell(myhtml_tree_t* tree, myhtml_tree_node_t* tr_or_th_node, myhtml_token_node_t* token); 390 391 bool myhtml_tree_is_mathml_integration_point(myhtml_tree_t* tree, myhtml_tree_node_t* node); 392 bool myhtml_tree_is_html_integration_point(myhtml_tree_t* tree, myhtml_tree_node_t* node); 393 394 // temp tag name 395 mystatus_t myhtml_tree_temp_tag_name_init(myhtml_tree_temp_tag_name_t* temp_tag_name); 396 void myhtml_tree_temp_tag_name_clean(myhtml_tree_temp_tag_name_t* temp_tag_name); 397 myhtml_tree_temp_tag_name_t * myhtml_tree_temp_tag_name_destroy(myhtml_tree_temp_tag_name_t* temp_tag_name, bool self_destroy); 398 mystatus_t myhtml_tree_temp_tag_name_append(myhtml_tree_temp_tag_name_t* temp_tag_name, const char* name, size_t name_len); 399 mystatus_t myhtml_tree_temp_tag_name_append_one(myhtml_tree_temp_tag_name_t* temp_tag_name, const char name); 400 401 /* special tonek list */ 402 mystatus_t myhtml_tree_special_list_init(myhtml_tree_special_token_list_t* special); 403 mystatus_t myhtml_tree_special_list_append(myhtml_tree_special_token_list_t* special, myhtml_token_node_t *token, myhtml_namespace_t ns); 404 size_t myhtml_tree_special_list_length(myhtml_tree_special_token_list_t* special); 405 myhtml_tree_special_token_t * myhtml_tree_special_list_get_last(myhtml_tree_special_token_list_t* special); 406 size_t myhtml_tree_special_list_pop(myhtml_tree_special_token_list_t* special); 407 408 /* incoming buffer */ 409 mycore_incoming_buffer_t * myhtml_tree_incoming_buffer_first(myhtml_tree_t *tree); 410 const char * myhtml_tree_incomming_buffer_make_data(myhtml_tree_t *tree, size_t begin, size_t length); 411 412 #ifdef __cplusplus 413 } /* extern "C" */ 414 #endif 415 416 #endif /* myhtml_tree_h */ 417 418 419