1 // Copyright 2010 Google Inc. All Rights Reserved. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 // 15 // Author: jdtang@google.com (Jonathan Tang) 16 // 17 // Error types, enums, and handling functions. 18 19 #ifndef GUMBO_ERROR_H_ 20 #define GUMBO_ERROR_H_ 21 #ifdef _MSC_VER 22 #define _CRT_SECURE_NO_WARNINGS 23 #endif 24 #include <stdint.h> 25 26 #include "gumbo.h" 27 #include "insertion_mode.h" 28 #include "string_buffer.h" 29 #include "token_type.h" 30 31 #ifdef __cplusplus 32 extern "C" { 33 #endif 34 35 struct GumboInternalParser; 36 37 typedef enum { 38 GUMBO_ERR_UTF8_INVALID, 39 GUMBO_ERR_UTF8_TRUNCATED, 40 GUMBO_ERR_UTF8_NULL, 41 GUMBO_ERR_NUMERIC_CHAR_REF_NO_DIGITS, 42 GUMBO_ERR_NUMERIC_CHAR_REF_WITHOUT_SEMICOLON, 43 GUMBO_ERR_NUMERIC_CHAR_REF_INVALID, 44 GUMBO_ERR_NAMED_CHAR_REF_WITHOUT_SEMICOLON, 45 GUMBO_ERR_NAMED_CHAR_REF_INVALID, 46 GUMBO_ERR_TAG_STARTS_WITH_QUESTION, 47 GUMBO_ERR_TAG_EOF, 48 GUMBO_ERR_TAG_INVALID, 49 GUMBO_ERR_CLOSE_TAG_EMPTY, 50 GUMBO_ERR_CLOSE_TAG_EOF, 51 GUMBO_ERR_CLOSE_TAG_INVALID, 52 GUMBO_ERR_SCRIPT_EOF, 53 GUMBO_ERR_ATTR_NAME_EOF, 54 GUMBO_ERR_ATTR_NAME_INVALID, 55 GUMBO_ERR_ATTR_DOUBLE_QUOTE_EOF, 56 GUMBO_ERR_ATTR_SINGLE_QUOTE_EOF, 57 GUMBO_ERR_ATTR_UNQUOTED_EOF, 58 GUMBO_ERR_ATTR_UNQUOTED_RIGHT_BRACKET, 59 GUMBO_ERR_ATTR_UNQUOTED_EQUALS, 60 GUMBO_ERR_ATTR_AFTER_EOF, 61 GUMBO_ERR_ATTR_AFTER_INVALID, 62 GUMBO_ERR_DUPLICATE_ATTR, 63 GUMBO_ERR_SOLIDUS_EOF, 64 GUMBO_ERR_SOLIDUS_INVALID, 65 GUMBO_ERR_DASHES_OR_DOCTYPE, 66 GUMBO_ERR_COMMENT_EOF, 67 GUMBO_ERR_COMMENT_INVALID, 68 GUMBO_ERR_COMMENT_BANG_AFTER_DOUBLE_DASH, 69 GUMBO_ERR_COMMENT_DASH_AFTER_DOUBLE_DASH, 70 GUMBO_ERR_COMMENT_SPACE_AFTER_DOUBLE_DASH, 71 GUMBO_ERR_COMMENT_END_BANG_EOF, 72 GUMBO_ERR_DOCTYPE_EOF, 73 GUMBO_ERR_DOCTYPE_INVALID, 74 GUMBO_ERR_DOCTYPE_SPACE, 75 GUMBO_ERR_DOCTYPE_RIGHT_BRACKET, 76 GUMBO_ERR_DOCTYPE_SPACE_OR_RIGHT_BRACKET, 77 GUMBO_ERR_DOCTYPE_END, 78 GUMBO_ERR_PARSER, 79 GUMBO_ERR_UNACKNOWLEDGED_SELF_CLOSING_TAG, 80 } GumboErrorType; 81 82 // Additional data for duplicated attributes. 83 typedef struct GumboInternalDuplicateAttrError { 84 // The name of the attribute. Owned by this struct. 85 const char* name; 86 87 // The (0-based) index within the attributes vector of the original 88 // occurrence. 89 unsigned int original_index; 90 91 // The (0-based) index where the new occurrence would be. 92 unsigned int new_index; 93 } GumboDuplicateAttrError; 94 95 // A simplified representation of the tokenizer state, designed to be more 96 // useful to clients of this library than the internal representation. This 97 // condenses the actual states used in the tokenizer state machine into a few 98 // values that will be familiar to users of HTML. 99 typedef enum { 100 GUMBO_ERR_TOKENIZER_DATA, 101 GUMBO_ERR_TOKENIZER_CHAR_REF, 102 GUMBO_ERR_TOKENIZER_RCDATA, 103 GUMBO_ERR_TOKENIZER_RAWTEXT, 104 GUMBO_ERR_TOKENIZER_PLAINTEXT, 105 GUMBO_ERR_TOKENIZER_SCRIPT, 106 GUMBO_ERR_TOKENIZER_TAG, 107 GUMBO_ERR_TOKENIZER_SELF_CLOSING_TAG, 108 GUMBO_ERR_TOKENIZER_ATTR_NAME, 109 GUMBO_ERR_TOKENIZER_ATTR_VALUE, 110 GUMBO_ERR_TOKENIZER_MARKUP_DECLARATION, 111 GUMBO_ERR_TOKENIZER_COMMENT, 112 GUMBO_ERR_TOKENIZER_DOCTYPE, 113 GUMBO_ERR_TOKENIZER_CDATA, 114 } GumboTokenizerErrorState; 115 116 // Additional data for tokenizer errors. 117 // This records the current state and codepoint encountered - this is usually 118 // enough to reconstruct what went wrong and provide a friendly error message. 119 typedef struct GumboInternalTokenizerError { 120 // The bad codepoint encountered. 121 int codepoint; 122 123 // The state that the tokenizer was in at the time. 124 GumboTokenizerErrorState state; 125 } GumboTokenizerError; 126 127 // Additional data for parse errors. 128 typedef struct GumboInternalParserError { 129 // The type of input token that resulted in this error. 130 GumboTokenType input_type; 131 132 // The HTML tag of the input token. TAG_UNKNOWN if this was not a tag token. 133 GumboTag input_tag; 134 135 // The insertion mode that the parser was in at the time. 136 GumboInsertionMode parser_state; 137 138 // The tag stack at the point of the error. Note that this is an GumboVector 139 // of GumboTag's *stored by value* - cast the void* to an GumboTag directly to 140 // get at the tag. 141 GumboVector /* GumboTag */ tag_stack; 142 } GumboParserError; 143 144 // The overall error struct representing an error in decoding/tokenizing/parsing 145 // the HTML. This contains an enumerated type flag, a source position, and then 146 // a union of fields containing data specific to the error. 147 typedef struct GumboInternalError { 148 // The type of error. 149 GumboErrorType type; 150 151 // The position within the source file where the error occurred. 152 GumboSourcePosition position; 153 154 // A pointer to the byte within the original source file text where the error 155 // occurred (note that this is not the same as position.offset, as that gives 156 // character-based instead of byte-based offsets). 157 const char* original_text; 158 159 // Type-specific error information. 160 union { 161 // The code point we encountered, for: 162 // * GUMBO_ERR_UTF8_INVALID 163 // * GUMBO_ERR_UTF8_TRUNCATED 164 // * GUMBO_ERR_NUMERIC_CHAR_REF_WITHOUT_SEMICOLON 165 // * GUMBO_ERR_NUMERIC_CHAR_REF_INVALID 166 uint64_t codepoint; 167 168 // Tokenizer errors. 169 GumboTokenizerError tokenizer; 170 171 // Short textual data, for: 172 // * GUMBO_ERR_NAMED_CHAR_REF_WITHOUT_SEMICOLON 173 // * GUMBO_ERR_NAMED_CHAR_REF_INVALID 174 GumboStringPiece text; 175 176 // Duplicate attribute data, for GUMBO_ERR_DUPLICATE_ATTR. 177 GumboDuplicateAttrError duplicate_attr; 178 179 // Parser state, for GUMBO_ERR_PARSER and 180 // GUMBO_ERR_UNACKNOWLEDGE_SELF_CLOSING_TAG. 181 struct GumboInternalParserError parser; 182 } v; 183 } GumboError; 184 185 // Adds a new error to the parser's error list, and returns a pointer to it so 186 // that clients can fill out the rest of its fields. May return NULL if we're 187 // already over the max_errors field specified in GumboOptions. 188 GumboError* gumbo_add_error(struct GumboInternalParser* parser); 189 190 // Initializes the errors vector in the parser. 191 void gumbo_init_errors(struct GumboInternalParser* errors); 192 193 // Frees all the errors in the 'errors_' field of the parser. 194 void gumbo_destroy_errors(struct GumboInternalParser* errors); 195 196 // Frees the memory used for a single GumboError. 197 void gumbo_error_destroy(struct GumboInternalParser* parser, GumboError* error); 198 199 // Prints an error to a string. This fills an empty GumboStringBuffer with a 200 // freshly-allocated buffer containing the error message text. The caller is 201 // responsible for deleting the buffer. (Note that the buffer is allocated with 202 // the allocator specified in the GumboParser config and hence should be freed 203 // by gumbo_parser_deallocate().) 204 void gumbo_error_to_string(struct GumboInternalParser* parser, 205 const GumboError* error, GumboStringBuffer* output); 206 207 // Prints a caret diagnostic to a string. This fills an empty GumboStringBuffer 208 // with a freshly-allocated buffer containing the error message text. The 209 // caller is responsible for deleting the buffer. (Note that the buffer is 210 // allocated with the allocator specified in the GumboParser config and hence 211 // should be freed by gumbo_parser_deallocate().) 212 void gumbo_caret_diagnostic_to_string(struct GumboInternalParser* parser, 213 const GumboError* error, const char* source_text, 214 GumboStringBuffer* output); 215 216 // Like gumbo_caret_diagnostic_to_string, but prints the text to stdout instead 217 // of writing to a string. 218 void gumbo_print_caret_diagnostic(struct GumboInternalParser* parser, 219 const GumboError* error, const char* source_text); 220 221 #ifdef __cplusplus 222 } 223 #endif 224 225 #endif // GUMBO_ERROR_H_ 226