1 // Copyright 2010 Google Inc. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //     http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 //
15 // Author: jdtang@google.com (Jonathan Tang)
16 //
17 // Error types, enums, and handling functions.
18 
19 #ifndef GUMBO_ERROR_H_
20 #define GUMBO_ERROR_H_
// On MSVC, opt out of the CRT "secure" deprecation warnings.  The definition
// is skipped when the macro already exists (e.g. supplied on the compiler
// command line) so that including this header never triggers a macro
// redefinition warning.
#if defined(_MSC_VER) && !defined(_CRT_SECURE_NO_WARNINGS)
#define _CRT_SECURE_NO_WARNINGS
#endif
24 #include <stdint.h>
25 
26 #include "gumbo.h"
27 #include "insertion_mode.h"
28 #include "string_buffer.h"
29 #include "token_type.h"
30 
31 #ifdef __cplusplus
32 extern "C" {
33 #endif
34 
35 struct GumboInternalParser;
36 
// Identifies the kind of error recorded in a GumboError (it is the type of
// the GumboError.type field).  Enumerators are numbered consecutively from 0
// in the order listed.  Where GumboError documents a specific member of its
// `v` union for an error type, that member is noted next to the enumerator.
typedef enum {
  GUMBO_ERR_UTF8_INVALID = 0,                    // payload: v.codepoint
  GUMBO_ERR_UTF8_TRUNCATED,                      // payload: v.codepoint
  GUMBO_ERR_UTF8_NULL,

  GUMBO_ERR_NUMERIC_CHAR_REF_NO_DIGITS,
  GUMBO_ERR_NUMERIC_CHAR_REF_WITHOUT_SEMICOLON,  // payload: v.codepoint
  GUMBO_ERR_NUMERIC_CHAR_REF_INVALID,            // payload: v.codepoint
  GUMBO_ERR_NAMED_CHAR_REF_WITHOUT_SEMICOLON,    // payload: v.text
  GUMBO_ERR_NAMED_CHAR_REF_INVALID,              // payload: v.text

  GUMBO_ERR_TAG_STARTS_WITH_QUESTION,
  GUMBO_ERR_TAG_EOF,
  GUMBO_ERR_TAG_INVALID,
  GUMBO_ERR_CLOSE_TAG_EMPTY,
  GUMBO_ERR_CLOSE_TAG_EOF,
  GUMBO_ERR_CLOSE_TAG_INVALID,
  GUMBO_ERR_SCRIPT_EOF,

  GUMBO_ERR_ATTR_NAME_EOF,
  GUMBO_ERR_ATTR_NAME_INVALID,
  GUMBO_ERR_ATTR_DOUBLE_QUOTE_EOF,
  GUMBO_ERR_ATTR_SINGLE_QUOTE_EOF,
  GUMBO_ERR_ATTR_UNQUOTED_EOF,
  GUMBO_ERR_ATTR_UNQUOTED_RIGHT_BRACKET,
  GUMBO_ERR_ATTR_UNQUOTED_EQUALS,
  GUMBO_ERR_ATTR_AFTER_EOF,
  GUMBO_ERR_ATTR_AFTER_INVALID,
  GUMBO_ERR_DUPLICATE_ATTR,                      // payload: v.duplicate_attr

  GUMBO_ERR_SOLIDUS_EOF,
  GUMBO_ERR_SOLIDUS_INVALID,

  GUMBO_ERR_DASHES_OR_DOCTYPE,
  GUMBO_ERR_COMMENT_EOF,
  GUMBO_ERR_COMMENT_INVALID,
  GUMBO_ERR_COMMENT_BANG_AFTER_DOUBLE_DASH,
  GUMBO_ERR_COMMENT_DASH_AFTER_DOUBLE_DASH,
  GUMBO_ERR_COMMENT_SPACE_AFTER_DOUBLE_DASH,
  GUMBO_ERR_COMMENT_END_BANG_EOF,

  GUMBO_ERR_DOCTYPE_EOF,
  GUMBO_ERR_DOCTYPE_INVALID,
  GUMBO_ERR_DOCTYPE_SPACE,
  GUMBO_ERR_DOCTYPE_RIGHT_BRACKET,
  GUMBO_ERR_DOCTYPE_SPACE_OR_RIGHT_BRACKET,
  GUMBO_ERR_DOCTYPE_END,

  GUMBO_ERR_PARSER,                              // payload: v.parser
  GUMBO_ERR_UNACKNOWLEDGED_SELF_CLOSING_TAG,     // payload: v.parser
} GumboErrorType;
81 
// Extra information recorded when an attribute is duplicated.
typedef struct GumboInternalDuplicateAttrError {
  // Name of the duplicated attribute.  This struct owns the string.
  const char* name;

  // 0-based position, within the attributes vector, of the original
  // occurrence of the attribute.
  unsigned int original_index;

  // 0-based position at which the new occurrence would have been placed.
  unsigned int new_index;
} GumboDuplicateAttrError;
94 
// A coarse, client-facing view of the tokenizer state.  The states actually
// used by the tokenizer state machine are condensed into this small set of
// values, chosen to be familiar to users of HTML and therefore more useful to
// clients of this library than the internal representation.  Enumerators are
// numbered consecutively from 0 in the order listed.
typedef enum {
  GUMBO_ERR_TOKENIZER_DATA = 0,
  GUMBO_ERR_TOKENIZER_CHAR_REF,
  GUMBO_ERR_TOKENIZER_RCDATA,
  GUMBO_ERR_TOKENIZER_RAWTEXT,
  GUMBO_ERR_TOKENIZER_PLAINTEXT,
  GUMBO_ERR_TOKENIZER_SCRIPT,
  GUMBO_ERR_TOKENIZER_TAG,
  GUMBO_ERR_TOKENIZER_SELF_CLOSING_TAG,
  GUMBO_ERR_TOKENIZER_ATTR_NAME,
  GUMBO_ERR_TOKENIZER_ATTR_VALUE,
  GUMBO_ERR_TOKENIZER_MARKUP_DECLARATION,
  GUMBO_ERR_TOKENIZER_COMMENT,
  GUMBO_ERR_TOKENIZER_DOCTYPE,
  GUMBO_ERR_TOKENIZER_CDATA,
} GumboTokenizerErrorState;
115 
116 // Additional data for tokenizer errors.
117 // This records the current state and codepoint encountered - this is usually
118 // enough to reconstruct what went wrong and provide a friendly error message.
119 typedef struct GumboInternalTokenizerError {
120   // The bad codepoint encountered.
121   int codepoint;
122 
123   // The state that the tokenizer was in at the time.
124   GumboTokenizerErrorState state;
125 } GumboTokenizerError;
126 
127 // Additional data for parse errors.
128 typedef struct GumboInternalParserError {
129   // The type of input token that resulted in this error.
130   GumboTokenType input_type;
131 
132   // The HTML tag of the input token.  TAG_UNKNOWN if this was not a tag token.
133   GumboTag input_tag;
134 
135   // The insertion mode that the parser was in at the time.
136   GumboInsertionMode parser_state;
137 
138   // The tag stack at the point of the error.  Note that this is an GumboVector
139   // of GumboTag's *stored by value* - cast the void* to an GumboTag directly to
140   // get at the tag.
141   GumboVector /* GumboTag */ tag_stack;
142 } GumboParserError;
143 
144 // The overall error struct representing an error in decoding/tokenizing/parsing
145 // the HTML.  This contains an enumerated type flag, a source position, and then
146 // a union of fields containing data specific to the error.
147 typedef struct GumboInternalError {
148   // The type of error.
149   GumboErrorType type;
150 
151   // The position within the source file where the error occurred.
152   GumboSourcePosition position;
153 
154   // A pointer to the byte within the original source file text where the error
155   // occurred (note that this is not the same as position.offset, as that gives
156   // character-based instead of byte-based offsets).
157   const char* original_text;
158 
159   // Type-specific error information.
160   union {
161     // The code point we encountered, for:
162     // * GUMBO_ERR_UTF8_INVALID
163     // * GUMBO_ERR_UTF8_TRUNCATED
164     // * GUMBO_ERR_NUMERIC_CHAR_REF_WITHOUT_SEMICOLON
165     // * GUMBO_ERR_NUMERIC_CHAR_REF_INVALID
166     uint64_t codepoint;
167 
168     // Tokenizer errors.
169     GumboTokenizerError tokenizer;
170 
171     // Short textual data, for:
172     // * GUMBO_ERR_NAMED_CHAR_REF_WITHOUT_SEMICOLON
173     // * GUMBO_ERR_NAMED_CHAR_REF_INVALID
174     GumboStringPiece text;
175 
176     // Duplicate attribute data, for GUMBO_ERR_DUPLICATE_ATTR.
177     GumboDuplicateAttrError duplicate_attr;
178 
179     // Parser state, for GUMBO_ERR_PARSER and
180     // GUMBO_ERR_UNACKNOWLEDGE_SELF_CLOSING_TAG.
181     struct GumboInternalParserError parser;
182   } v;
183 } GumboError;
184 
185 // Adds a new error to the parser's error list, and returns a pointer to it so
186 // that clients can fill out the rest of its fields.  May return NULL if we're
187 // already over the max_errors field specified in GumboOptions.
188 GumboError* gumbo_add_error(struct GumboInternalParser* parser);
189 
// Initializes the errors vector held by the given parser.
//
// The argument is the parser itself (not an error list), so the parameter is
// named `parser` like in the other declarations in this header.
void gumbo_init_errors(struct GumboInternalParser* parser);
192 
// Frees every error stored in the 'errors_' field of the given parser.
//
// The argument is the parser itself (not an error list), so the parameter is
// named `parser` like in the other declarations in this header.
void gumbo_destroy_errors(struct GumboInternalParser* parser);
195 
196 // Frees the memory used for a single GumboError.
197 void gumbo_error_destroy(struct GumboInternalParser* parser, GumboError* error);
198 
199 // Prints an error to a string.  This fills an empty GumboStringBuffer with a
200 // freshly-allocated buffer containing the error message text.  The caller is
201 // responsible for deleting the buffer.  (Note that the buffer is allocated with
202 // the allocator specified in the GumboParser config and hence should be freed
203 // by gumbo_parser_deallocate().)
204 void gumbo_error_to_string(struct GumboInternalParser* parser,
205     const GumboError* error, GumboStringBuffer* output);
206 
207 // Prints a caret diagnostic to a string.  This fills an empty GumboStringBuffer
208 // with a freshly-allocated buffer containing the error message text.  The
209 // caller is responsible for deleting the buffer.  (Note that the buffer is
210 // allocated with the allocator specified in the GumboParser config and hence
211 // should be freed by gumbo_parser_deallocate().)
212 void gumbo_caret_diagnostic_to_string(struct GumboInternalParser* parser,
213     const GumboError* error, const char* source_text,
214     GumboStringBuffer* output);
215 
216 // Like gumbo_caret_diagnostic_to_string, but prints the text to stdout instead
217 // of writing to a string.
218 void gumbo_print_caret_diagnostic(struct GumboInternalParser* parser,
219     const GumboError* error, const char* source_text);
220 
221 #ifdef __cplusplus
222 }
223 #endif
224 
225 #endif  // GUMBO_ERROR_H_
226