1 // Copyright 2010 Google Inc. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 //
15 // Author: jdtang@google.com (Jonathan Tang)
16 //
17 // Coding conventions specific to this file:
18 //
19 // 1. Functions that fill in a token should be named emit_*, and should be
20 // followed immediately by a return from the tokenizer (true if no error
21 // occurred, false if an error occurred). Sometimes the emit functions
22 // themselves return a boolean so that they can be combined with the return
23 // statement; in this case, they should match this convention.
24 // 2. Functions that shuffle data from temporaries to final API structures
25 // should be named finish_*, and be called just before the tokenizer exits the
26 // state that accumulates the temporary.
27 // 3. All internal data structures should be kept in an initialized state from
28 // tokenizer creation onwards, ready to accept input. When a buffer's flushed
29 // and reset, it should be deallocated and immediately reinitialized.
30 // 4. Make sure there are appropriate break statements following each state.
31 // 5. Assertions on the state of the temporary and tag buffers are usually a
32 // good idea, and should go at the entry point of each state when added.
// 6. Statement order within states goes:
//    1. Add parse errors, if appropriate.
//    2. Call finish_* functions to build up tag state.
//    3. Switch to new state. Set _reconsume flag if appropriate.
//    4. Perform any other temporary buffer manipulation.
//    5. Emit tokens.
//    6. Return/break.
40 // This order ensures that we can verify that every emit is followed by a
41 // return, ensures that the correct state is recorded with any parse errors, and
42 // prevents parse error position from being messed up by possible mark/resets in
43 // temporary buffer manipulation.
44
45 #include "tokenizer.h"
46
47 #include <assert.h>
48 #include <stdbool.h>
49 #include <string.h>
50
51 #include "attribute.h"
52 #include "char_ref.h"
53 #include "error.h"
54 #include "gumbo.h"
55 #include "parser.h"
56 #include "string_buffer.h"
57 #include "string_piece.h"
58 #include "token_type.h"
59 #include "tokenizer_states.h"
60 #include "utf8.h"
61 #include "util.h"
62 #include "vector.h"
63
64 // Compared against _script_data_buffer to determine if we're in double-escaped
65 // script mode.
66 const GumboStringPiece kScriptTag = {"script", 6};
67
// The outcome reported by each individual tokenizer state after it has looked
// at the current input character.
typedef enum {
  // Return false (error) from the tokenizer.
  RETURN_ERROR = 0,
  // Return true (success) from the tokenizer.
  RETURN_SUCCESS = 1,
  // Proceed to the next character and continue lexing.
  NEXT_CHAR = 2
} StateResult;
74
75 // This is a struct containing state necessary to build up a tag token,
76 // character by character.
77 typedef struct GumboInternalTagState {
78 // A buffer to accumulate characters for various GumboStringPiece fields.
79 GumboStringBuffer _buffer;
80
81 // A pointer to the start of the original text corresponding to the contents
82 // of the buffer.
83 const char* _original_text;
84
85 // The current tag enum, computed once the tag name state has finished so that
86 // the buffer can be re-used for building up attributes.
87 GumboTag _tag;
88
89 // The starting location of the text in the buffer.
90 GumboSourcePosition _start_pos;
91
92 // The current list of attributes. This is copied (and ownership of its data
93 // transferred) to the GumboStartTag token upon completion of the tag. New
94 // attributes are added as soon as their attribute name state is complete, and
95 // values are filled in by operating on _attributes.data[attributes.length-1].
96 GumboVector /* GumboAttribute */ _attributes;
97
98 // If true, the next attribute value to be finished should be dropped. This
99 // happens if a duplicate attribute name is encountered - we want to consume
100 // the attribute value, but shouldn't overwrite the existing value.
101 bool _drop_next_attr_value;
102
103 // The state that caused the tokenizer to switch into a character reference in
104 // attribute value state. This is used to set the additional allowed
105 // character, and is switched back to on completion. Initialized as the
106 // tokenizer enters the character reference state.
107 GumboTokenizerEnum _attr_value_state;
108
109 // The last start tag to have been emitted by the tokenizer. This is
110 // necessary to check for appropriate end tags.
111 GumboTag _last_start_tag;
112
113 // If true, then this is a start tag. If false, it's an end tag. This is
114 // necessary to generate the appropriate token type at tag-closing time.
115 bool _is_start_tag;
116
117 // If true, then this tag is "self-closing" and doesn't have an end tag.
118 bool _is_self_closing;
119 } GumboTagState;
120
121 // This is the main tokenizer state struct, containing all state used by in
122 // tokenizing the input stream.
123 typedef struct GumboInternalTokenizerState {
124 // The current lexer state. Starts in GUMBO_LEX_DATA.
125 GumboTokenizerEnum _state;
126
127 // A flag indicating whether the current input character needs to reconsumed
128 // in another state, or whether the next input character should be read for
129 // the next iteration of the state loop. This is set when the spec reads
130 // "Reconsume the current input character in..."
131 bool _reconsume_current_input;
132
133 // A flag indicating whether the current node is a foreign element. This is
134 // set by gumbo_tokenizer_set_is_current_node_foreign and checked in the
135 // markup declaration state.
136 bool _is_current_node_foreign;
137
138 // A flag indicating whether the tokenizer is in a CDATA section. If so, then
139 // text tokens emitted will be GUMBO_TOKEN_CDATA.
140 bool _is_in_cdata;
141
142 // Certain states (notably character references) may emit two character tokens
143 // at once, but the contract for lex() fills in only one token at a time. The
144 // extra character is buffered here, and then this is checked on entry to
145 // lex(). If a character is stored here, it's immediately emitted and control
146 // returns from the lexer. kGumboNoChar is used to represent 'no character
147 // stored.'
148 //
149 // Note that characters emitted through this mechanism will have their source
150 // position marked as the character under the mark, i.e. multiple characters
151 // may be emitted with the same position. This is desirable for character
152 // references, but unsuitable for many other cases. Use the _temporary_buffer
153 // mechanism if the buffered characters must have their original positions in
154 // the document.
155 int _buffered_emit_char;
156
157 // A temporary buffer to accumulate characters, as described by the "temporary
158 // buffer" phrase in the tokenizer spec. We use this in a somewhat unorthodox
159 // way: we record the specific character to go into the buffer, which may
160 // sometimes be a lowercased version of the actual input character. However,
161 // we *also* use utf8iterator_mark() to record the position at tag start.
162 // When we start flushing the temporary buffer, we set _temporary_buffer_emit
163 // to the start of it, and then increment it for each call to the tokenizer.
164 // We also call utf8iterator_reset(), and utf8iterator_next() through the
165 // input stream, so that tokens emitted by emit_char have the correct position
166 // and original text.
167 GumboStringBuffer _temporary_buffer;
168
169 // The current cursor position we're emitting from within
170 // _temporary_buffer.data. NULL whenever we're not flushing the buffer.
171 const char* _temporary_buffer_emit;
172
173 // The temporary buffer is also used by the spec to check whether we should
174 // enter the script data double escaped state, but we can't use the same
175 // buffer for both because we have to flush out "<s" as emits while still
176 // maintaining the context that will eventually become "script". This is a
177 // separate buffer that's used in place of the temporary buffer for states
178 // that may enter the script data double escape start state.
179 GumboStringBuffer _script_data_buffer;
180
181 // Pointer to the beginning of the current token in the original buffer; used
182 // to record the original text.
183 const char* _token_start;
184
185 // GumboSourcePosition recording the source location of the start of the
186 // current token.
187 GumboSourcePosition _token_start_pos;
188
189 // Current tag state.
190 GumboTagState _tag_state;
191
192 // Doctype state. We use the temporary buffer to accumulate characters (it's
193 // not used for anything else in the doctype states), and then freshly
194 // allocate the strings in the doctype token, then copy it over on emit.
195 GumboTokenDocType _doc_type_state;
196
197 // The UTF8Iterator over the tokenizer input.
198 Utf8Iterator _input;
199 } GumboTokenizerState;
200
201 // Adds an ERR_UNEXPECTED_CODE_POINT parse error to the parser's error struct.
tokenizer_add_parse_error(GumboParser * parser,GumboErrorType type)202 static void tokenizer_add_parse_error(
203 GumboParser* parser, GumboErrorType type) {
204 GumboError* error = gumbo_add_error(parser);
205 if (!error) {
206 return;
207 }
208 GumboTokenizerState* tokenizer = parser->_tokenizer_state;
209 utf8iterator_get_position(&tokenizer->_input, &error->position);
210 error->original_text = utf8iterator_get_char_pointer(&tokenizer->_input);
211 error->type = type;
212 error->v.tokenizer.codepoint = utf8iterator_current(&tokenizer->_input);
213 switch (tokenizer->_state) {
214 case GUMBO_LEX_DATA:
215 error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_DATA;
216 break;
217 case GUMBO_LEX_CHAR_REF_IN_DATA:
218 case GUMBO_LEX_CHAR_REF_IN_RCDATA:
219 case GUMBO_LEX_CHAR_REF_IN_ATTR_VALUE:
220 error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_CHAR_REF;
221 break;
222 case GUMBO_LEX_RCDATA:
223 case GUMBO_LEX_RCDATA_LT:
224 case GUMBO_LEX_RCDATA_END_TAG_OPEN:
225 case GUMBO_LEX_RCDATA_END_TAG_NAME:
226 error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_RCDATA;
227 break;
228 case GUMBO_LEX_RAWTEXT:
229 case GUMBO_LEX_RAWTEXT_LT:
230 case GUMBO_LEX_RAWTEXT_END_TAG_OPEN:
231 case GUMBO_LEX_RAWTEXT_END_TAG_NAME:
232 error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_RAWTEXT;
233 break;
234 case GUMBO_LEX_PLAINTEXT:
235 error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_PLAINTEXT;
236 break;
237 case GUMBO_LEX_SCRIPT:
238 case GUMBO_LEX_SCRIPT_LT:
239 case GUMBO_LEX_SCRIPT_END_TAG_OPEN:
240 case GUMBO_LEX_SCRIPT_END_TAG_NAME:
241 case GUMBO_LEX_SCRIPT_ESCAPED_START:
242 case GUMBO_LEX_SCRIPT_ESCAPED_START_DASH:
243 case GUMBO_LEX_SCRIPT_ESCAPED:
244 case GUMBO_LEX_SCRIPT_ESCAPED_DASH:
245 case GUMBO_LEX_SCRIPT_ESCAPED_DASH_DASH:
246 case GUMBO_LEX_SCRIPT_ESCAPED_LT:
247 case GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_OPEN:
248 case GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_NAME:
249 case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_START:
250 case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED:
251 case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH:
252 case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH_DASH:
253 case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_LT:
254 case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_END:
255 error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_SCRIPT;
256 break;
257 case GUMBO_LEX_TAG_OPEN:
258 case GUMBO_LEX_END_TAG_OPEN:
259 case GUMBO_LEX_TAG_NAME:
260 case GUMBO_LEX_BEFORE_ATTR_NAME:
261 error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_TAG;
262 break;
263 case GUMBO_LEX_SELF_CLOSING_START_TAG:
264 error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_SELF_CLOSING_TAG;
265 break;
266 case GUMBO_LEX_ATTR_NAME:
267 case GUMBO_LEX_AFTER_ATTR_NAME:
268 case GUMBO_LEX_BEFORE_ATTR_VALUE:
269 error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_ATTR_NAME;
270 break;
271 case GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED:
272 case GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED:
273 case GUMBO_LEX_ATTR_VALUE_UNQUOTED:
274 case GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED:
275 error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_ATTR_VALUE;
276 break;
277 case GUMBO_LEX_BOGUS_COMMENT:
278 case GUMBO_LEX_COMMENT_START:
279 case GUMBO_LEX_COMMENT_START_DASH:
280 case GUMBO_LEX_COMMENT:
281 case GUMBO_LEX_COMMENT_END_DASH:
282 case GUMBO_LEX_COMMENT_END:
283 case GUMBO_LEX_COMMENT_END_BANG:
284 error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_COMMENT;
285 break;
286 case GUMBO_LEX_MARKUP_DECLARATION:
287 case GUMBO_LEX_DOCTYPE:
288 case GUMBO_LEX_BEFORE_DOCTYPE_NAME:
289 case GUMBO_LEX_DOCTYPE_NAME:
290 case GUMBO_LEX_AFTER_DOCTYPE_NAME:
291 case GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_KEYWORD:
292 case GUMBO_LEX_BEFORE_DOCTYPE_PUBLIC_ID:
293 case GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED:
294 case GUMBO_LEX_DOCTYPE_PUBLIC_ID_SINGLE_QUOTED:
295 case GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID:
296 case GUMBO_LEX_BETWEEN_DOCTYPE_PUBLIC_SYSTEM_ID:
297 case GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_KEYWORD:
298 case GUMBO_LEX_BEFORE_DOCTYPE_SYSTEM_ID:
299 case GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED:
300 case GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED:
301 case GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID:
302 case GUMBO_LEX_BOGUS_DOCTYPE:
303 error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_DOCTYPE;
304 break;
305 case GUMBO_LEX_CDATA:
306 error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_CDATA;
307 break;
308 }
309 }
310
// Returns true iff c is an ASCII letter (A-Z or a-z).
//
// The ISO C isupper/islower functions are deliberately avoided: they depend on
// the program's locale, whereas the behavior of the HTML5 spec is independent
// of the locale the program is run in.
static bool is_alpha(int c) {
  if (c >= 'a' && c <= 'z') {
    return true;
  }
  return c >= 'A' && c <= 'Z';
}
317
// Maps an ASCII uppercase letter to its lowercase counterpart; every other
// value is returned unchanged.
static int ensure_lowercase(int c) {
  if (c < 'A' || c > 'Z') {
    return c;
  }
  return c + 0x20;
}
321
get_char_token_type(bool is_in_cdata,int c)322 static GumboTokenType get_char_token_type(bool is_in_cdata, int c) {
323 if (is_in_cdata && c > 0) {
324 return GUMBO_TOKEN_CDATA;
325 }
326
327 switch (c) {
328 case '\t':
329 case '\n':
330 case '\r':
331 case '\f':
332 case ' ':
333 return GUMBO_TOKEN_WHITESPACE;
334 case 0:
335 gumbo_debug("Emitted null byte.\n");
336 return GUMBO_TOKEN_NULL;
337 case -1:
338 return GUMBO_TOKEN_EOF;
339 default:
340 return GUMBO_TOKEN_CHARACTER;
341 }
342 }
343
344 // Starts recording characters in the temporary buffer.
345 // Because this needs to reset the utf8iterator_mark to the beginning of the
346 // text that will eventually be emitted, it needs to be called a couple of
347 // states before the spec says "Set the temporary buffer to the empty string".
348 // In general, this should be called whenever there's a transition to a
349 // "less-than sign state". The initial < and possibly / then need to be
350 // appended to the temporary buffer, their presence needs to be accounted for in
351 // states that compare the temporary buffer against a literal value, and
352 // spec stanzas that say "emit a < and / character token along with a character
353 // token for each character in the temporary buffer" need to be adjusted to
354 // account for the presence of the < and / inside the temporary buffer.
clear_temporary_buffer(GumboParser * parser)355 static void clear_temporary_buffer(GumboParser* parser) {
356 GumboTokenizerState* tokenizer = parser->_tokenizer_state;
357 assert(!tokenizer->_temporary_buffer_emit);
358 utf8iterator_mark(&tokenizer->_input);
359 gumbo_string_buffer_clear(parser, &tokenizer->_temporary_buffer);
360 // The temporary buffer and script data buffer are the same object in the
361 // spec, so the script data buffer should be cleared as well.
362 gumbo_string_buffer_clear(parser, &tokenizer->_script_data_buffer);
363 }
364
365 // Appends a codepoint to the temporary buffer.
append_char_to_temporary_buffer(GumboParser * parser,int codepoint)366 static void append_char_to_temporary_buffer(
367 GumboParser* parser, int codepoint) {
368 gumbo_string_buffer_append_codepoint(
369 parser, codepoint, &parser->_tokenizer_state->_temporary_buffer);
370 }
371
372 // Checks to see if the temporary buffer equals a certain string.
373 // Make sure this remains side-effect free; it's used in assertions.
374 #ifndef NDEBUG
temporary_buffer_equals(GumboParser * parser,const char * text)375 static bool temporary_buffer_equals(GumboParser* parser, const char* text) {
376 GumboStringBuffer* buffer = &parser->_tokenizer_state->_temporary_buffer;
377 // TODO(jdtang): See if the extra strlen is a performance problem, and replace
378 // it with an explicit sizeof(literal) if necessary. I don't think it will
379 // be, as this is only used in a couple of rare states.
380 int text_len = strlen(text);
381 return text_len == buffer->length &&
382 memcmp(buffer->data, text, text_len) == 0;
383 }
384 #endif
385
// Puts the tokenizer's doctype state back into its "no doctype seen" form:
// all three strings NULL and all three flags false.
//
// The strings start out NULL so that no memory is leaked if a doctype token is
// never seen. When a doctype token does appear they are reset to freshly
// allocated empty strings, which gives client code a uniform interface without
// null checks. Ownership passes to the doctype token when it is emitted.
static void doc_type_state_init(GumboParser* parser) {
  GumboTokenDocType* doctype = &parser->_tokenizer_state->_doc_type_state;

  doctype->name = NULL;
  doctype->public_identifier = NULL;
  doctype->system_identifier = NULL;

  doctype->has_public_identifier = false;
  doctype->has_system_identifier = false;
  doctype->force_quirks = false;
}
401
402 // Sets the token original_text and position to the current iterator position.
403 // This is necessary because [CDATA[ sections may include text that is ignored
404 // by the tokenizer.
reset_token_start_point(GumboTokenizerState * tokenizer)405 static void reset_token_start_point(GumboTokenizerState* tokenizer) {
406 tokenizer->_token_start = utf8iterator_get_char_pointer(&tokenizer->_input);
407 utf8iterator_get_position(&tokenizer->_input, &tokenizer->_token_start_pos);
408 }
409
410 // Sets the tag buffer original text and start point to the current iterator
411 // position. This is necessary because attribute names & values may have
412 // whitespace preceeding them, and so we can't assume that the actual token
413 // starting point was the end of the last tag buffer usage.
reset_tag_buffer_start_point(GumboParser * parser)414 static void reset_tag_buffer_start_point(GumboParser* parser) {
415 GumboTokenizerState* tokenizer = parser->_tokenizer_state;
416 GumboTagState* tag_state = &tokenizer->_tag_state;
417
418 utf8iterator_get_position(&tokenizer->_input, &tag_state->_start_pos);
419 tag_state->_original_text = utf8iterator_get_char_pointer(&tokenizer->_input);
420 }
421
422 // Moves the temporary buffer contents over to the specified output string,
423 // and clears the temporary buffer.
finish_temporary_buffer(GumboParser * parser,const char ** output)424 static void finish_temporary_buffer(GumboParser* parser, const char** output) {
425 GumboTokenizerState* tokenizer = parser->_tokenizer_state;
426 *output =
427 gumbo_string_buffer_to_string(parser, &tokenizer->_temporary_buffer);
428 clear_temporary_buffer(parser);
429 }
430
431 // Advances the iterator past the end of the token, and then fills in the
432 // relevant position fields. It's assumed that after every emit, the tokenizer
433 // will immediately return (letting the tree-construction stage read the filled
434 // in Token). Thus, it's safe to advance the input stream here, since it will
435 // bypass the advance at the bottom of the state machine loop.
436 //
437 // Since this advances the iterator and resets the current input, make sure to
438 // call it after you've recorded any other data you need for the token.
finish_token(GumboParser * parser,GumboToken * token)439 static void finish_token(GumboParser* parser, GumboToken* token) {
440 GumboTokenizerState* tokenizer = parser->_tokenizer_state;
441 if (!tokenizer->_reconsume_current_input) {
442 utf8iterator_next(&tokenizer->_input);
443 }
444
445 token->position = tokenizer->_token_start_pos;
446 token->original_text.data = tokenizer->_token_start;
447 reset_token_start_point(tokenizer);
448 token->original_text.length =
449 tokenizer->_token_start - token->original_text.data;
450 if (token->original_text.length > 0 &&
451 token->original_text.data[token->original_text.length - 1] == '\r') {
452 // The UTF8 iterator will ignore carriage returns in the input stream, which
453 // means that the next token may start one past a \r character. The pointer
454 // arithmetic above results in that \r being appended to the original text
455 // of the preceding token, so we have to adjust its length here to chop the
456 // \r off.
457 --token->original_text.length;
458 }
459 }
460
461 // Records the doctype public ID, assumed to be in the temporary buffer.
462 // Convenience method that also sets has_public_identifier to true.
finish_doctype_public_id(GumboParser * parser)463 static void finish_doctype_public_id(GumboParser* parser) {
464 GumboTokenDocType* doc_type_state =
465 &parser->_tokenizer_state->_doc_type_state;
466 gumbo_parser_deallocate(parser, (void*) doc_type_state->public_identifier);
467 finish_temporary_buffer(parser, &doc_type_state->public_identifier);
468 doc_type_state->has_public_identifier = true;
469 }
470
471 // Records the doctype system ID, assumed to be in the temporary buffer.
472 // Convenience method that also sets has_system_identifier to true.
finish_doctype_system_id(GumboParser * parser)473 static void finish_doctype_system_id(GumboParser* parser) {
474 GumboTokenDocType* doc_type_state =
475 &parser->_tokenizer_state->_doc_type_state;
476 gumbo_parser_deallocate(parser, (void*) doc_type_state->system_identifier);
477 finish_temporary_buffer(parser, &doc_type_state->system_identifier);
478 doc_type_state->has_system_identifier = true;
479 }
480
481 // Writes a single specified character to the output token.
emit_char(GumboParser * parser,int c,GumboToken * output)482 static void emit_char(GumboParser* parser, int c, GumboToken* output) {
483 output->type = get_char_token_type(parser->_tokenizer_state->_is_in_cdata, c);
484 output->v.character = c;
485 finish_token(parser, output);
486 }
487
488 // Writes a replacement character token and records a parse error.
489 // Always returns RETURN_ERROR, per gumbo_lex return value.
emit_replacement_char(GumboParser * parser,GumboToken * output)490 static StateResult emit_replacement_char(
491 GumboParser* parser, GumboToken* output) {
492 // In all cases, this is because of a null byte in the input stream.
493 tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
494 emit_char(parser, kUtf8ReplacementChar, output);
495 return RETURN_ERROR;
496 }
497
498 // Writes an EOF character token. Always returns RETURN_SUCCESS.
emit_eof(GumboParser * parser,GumboToken * output)499 static StateResult emit_eof(GumboParser* parser, GumboToken* output) {
500 emit_char(parser, -1, output);
501 return RETURN_SUCCESS;
502 }
503
504 // Writes the current input character out as a character token.
505 // Always returns RETURN_SUCCESS.
emit_current_char(GumboParser * parser,GumboToken * output)506 static bool emit_current_char(GumboParser* parser, GumboToken* output) {
507 emit_char(
508 parser, utf8iterator_current(&parser->_tokenizer_state->_input), output);
509 return RETURN_SUCCESS;
510 }
511
512 // Writes out a doctype token, copying it from the tokenizer state.
emit_doctype(GumboParser * parser,GumboToken * output)513 static void emit_doctype(GumboParser* parser, GumboToken* output) {
514 output->type = GUMBO_TOKEN_DOCTYPE;
515 output->v.doc_type = parser->_tokenizer_state->_doc_type_state;
516 finish_token(parser, output);
517 doc_type_state_init(parser);
518 }
519
520 // Debug-only function that explicitly sets the attribute vector data to NULL so
521 // it can be asserted on tag creation, verifying that there are no memory leaks.
mark_tag_state_as_empty(GumboTagState * tag_state)522 static void mark_tag_state_as_empty(GumboTagState* tag_state) {
523 #ifndef NDEBUG
524 tag_state->_attributes = kGumboEmptyVector;
525 #endif
526 }
527
528 // Writes out the current tag as a start or end tag token.
529 // Always returns RETURN_SUCCESS.
emit_current_tag(GumboParser * parser,GumboToken * output)530 static StateResult emit_current_tag(GumboParser* parser, GumboToken* output) {
531 GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
532 if (tag_state->_is_start_tag) {
533 output->type = GUMBO_TOKEN_START_TAG;
534 output->v.start_tag.tag = tag_state->_tag;
535 output->v.start_tag.attributes = tag_state->_attributes;
536 output->v.start_tag.is_self_closing = tag_state->_is_self_closing;
537 tag_state->_last_start_tag = tag_state->_tag;
538 mark_tag_state_as_empty(tag_state);
539 gumbo_debug(
540 "Emitted start tag %s.\n", gumbo_normalized_tagname(tag_state->_tag));
541 } else {
542 output->type = GUMBO_TOKEN_END_TAG;
543 output->v.end_tag = tag_state->_tag;
544 // In end tags, ownership of the attributes vector is not transferred to the
545 // token, but it's still initialized as normal, so it must be manually
546 // deallocated. There may also be attributes to destroy, in certain broken
547 // cases like </div</th> (the "th" is an attribute there).
548 for (unsigned int i = 0; i < tag_state->_attributes.length; ++i) {
549 gumbo_destroy_attribute(parser, tag_state->_attributes.data[i]);
550 }
551 gumbo_parser_deallocate(parser, tag_state->_attributes.data);
552 mark_tag_state_as_empty(tag_state);
553 gumbo_debug(
554 "Emitted end tag %s.\n", gumbo_normalized_tagname(tag_state->_tag));
555 }
556 gumbo_string_buffer_destroy(parser, &tag_state->_buffer);
557 finish_token(parser, output);
558 gumbo_debug("Original text = %.*s.\n", output->original_text.length,
559 output->original_text.data);
560 assert(output->original_text.length >= 2);
561 assert(output->original_text.data[0] == '<');
562 assert(output->original_text.data[output->original_text.length - 1] == '>');
563 return RETURN_SUCCESS;
564 }
565
566 // In some states, we speculatively start a tag, but don't know whether it'll be
567 // emitted as tag token or as a series of character tokens until we finish it.
568 // We need to abandon the tag we'd started & free its memory in that case to
569 // avoid a memory leak.
abandon_current_tag(GumboParser * parser)570 static void abandon_current_tag(GumboParser* parser) {
571 GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
572 for (unsigned int i = 0; i < tag_state->_attributes.length; ++i) {
573 gumbo_destroy_attribute(parser, tag_state->_attributes.data[i]);
574 }
575 gumbo_parser_deallocate(parser, tag_state->_attributes.data);
576 mark_tag_state_as_empty(tag_state);
577 gumbo_string_buffer_destroy(parser, &tag_state->_buffer);
578 gumbo_debug("Abandoning current tag.\n");
579 }
580
581 // Wraps the consume_char_ref function to handle its output and make the
582 // appropriate TokenizerState modifications. Returns RETURN_ERROR if a parse
583 // error occurred, RETURN_SUCCESS otherwise.
emit_char_ref(GumboParser * parser,int additional_allowed_char,bool is_in_attribute,GumboToken * output)584 static StateResult emit_char_ref(GumboParser* parser,
585 int additional_allowed_char, bool is_in_attribute, GumboToken* output) {
586 GumboTokenizerState* tokenizer = parser->_tokenizer_state;
587 OneOrTwoCodepoints char_ref;
588 bool status = consume_char_ref(
589 parser, &tokenizer->_input, additional_allowed_char, false, &char_ref);
590 if (char_ref.first != kGumboNoChar) {
591 // consume_char_ref ends with the iterator pointing at the next character,
592 // so we need to be sure not advance it again before reading the next token.
593 tokenizer->_reconsume_current_input = true;
594 emit_char(parser, char_ref.first, output);
595 tokenizer->_buffered_emit_char = char_ref.second;
596 } else {
597 emit_char(parser, '&', output);
598 }
599 return status ? RETURN_SUCCESS : RETURN_ERROR;
600 }
601
602 // Emits a comment token. Comments use the temporary buffer to accumulate their
603 // data, and then it's copied over and released to the 'text' field of the
604 // GumboToken union. Always returns RETURN_SUCCESS.
emit_comment(GumboParser * parser,GumboToken * output)605 static StateResult emit_comment(GumboParser* parser, GumboToken* output) {
606 output->type = GUMBO_TOKEN_COMMENT;
607 finish_temporary_buffer(parser, &output->v.text);
608 finish_token(parser, output);
609 return RETURN_SUCCESS;
610 }
611
612 // Checks to see we should be flushing accumulated characters in the temporary
613 // buffer, and fills the output token with the next output character if so.
614 // Returns true if a character has been emitted and the tokenizer should
615 // immediately return, false if we're at the end of the temporary buffer and
616 // should resume normal operation.
maybe_emit_from_temporary_buffer(GumboParser * parser,GumboToken * output)617 static bool maybe_emit_from_temporary_buffer(
618 GumboParser* parser, GumboToken* output) {
619 GumboTokenizerState* tokenizer = parser->_tokenizer_state;
620 const char* c = tokenizer->_temporary_buffer_emit;
621 GumboStringBuffer* buffer = &tokenizer->_temporary_buffer;
622
623 if (!c || c >= buffer->data + buffer->length) {
624 tokenizer->_temporary_buffer_emit = NULL;
625 return false;
626 }
627
628 assert(*c == utf8iterator_current(&tokenizer->_input));
629 // emit_char also advances the input stream. We need to do some juggling of
630 // the _reconsume_current_input flag to get the proper behavior when emitting
631 // previous tokens. Basically, _reconsume_current_input should *never* be set
632 // when emitting anything from the temporary buffer, since those characters
633 // have already been advanced past. However, it should be preserved so that
634 // when the *next* character is encountered again, the tokenizer knows not to
635 // advance past it.
636 bool saved_reconsume_state = tokenizer->_reconsume_current_input;
637 tokenizer->_reconsume_current_input = false;
638 emit_char(parser, *c, output);
639 ++tokenizer->_temporary_buffer_emit;
640 tokenizer->_reconsume_current_input = saved_reconsume_state;
641 return true;
642 }
643
644 // Sets up the tokenizer to begin flushing the temporary buffer.
645 // This resets the input iterator stream to the start of the last tag, sets up
646 // _temporary_buffer_emit, and then (if the temporary buffer is non-empty) emits
647 // the first character in it. It returns true if a character was emitted, false
648 // otherwise.
emit_temporary_buffer(GumboParser * parser,GumboToken * output)649 static bool emit_temporary_buffer(GumboParser* parser, GumboToken* output) {
650 GumboTokenizerState* tokenizer = parser->_tokenizer_state;
651 assert(tokenizer->_temporary_buffer.data);
652 utf8iterator_reset(&tokenizer->_input);
653 tokenizer->_temporary_buffer_emit = tokenizer->_temporary_buffer.data;
654 return maybe_emit_from_temporary_buffer(parser, output);
655 }
656
657 // Appends a codepoint to the current tag buffer. If
658 // reinitilize_position_on_first is set, this also initializes the tag buffer
659 // start point; the only time you would *not* want to pass true for this
660 // parameter is if you want the original_text to include character (like an
661 // opening quote) that doesn't appear in the value.
append_char_to_tag_buffer(GumboParser * parser,int codepoint,bool reinitilize_position_on_first)662 static void append_char_to_tag_buffer(
663 GumboParser* parser, int codepoint, bool reinitilize_position_on_first) {
664 GumboStringBuffer* buffer = &parser->_tokenizer_state->_tag_state._buffer;
665 if (buffer->length == 0 && reinitilize_position_on_first) {
666 reset_tag_buffer_start_point(parser);
667 }
668 gumbo_string_buffer_append_codepoint(parser, codepoint, buffer);
669 }
670
671 // (Re-)initialize the tag buffer. This also resets the original_text pointer
672 // and _start_pos field to point to the current position.
initialize_tag_buffer(GumboParser * parser)673 static void initialize_tag_buffer(GumboParser* parser) {
674 GumboTokenizerState* tokenizer = parser->_tokenizer_state;
675 GumboTagState* tag_state = &tokenizer->_tag_state;
676
677 gumbo_string_buffer_init(parser, &tag_state->_buffer);
678 reset_tag_buffer_start_point(parser);
679 }
680
681 // Initializes the tag_state to start a new tag, keeping track of the opening
682 // positions and original text. Takes a boolean indicating whether this is a
683 // start or end tag.
start_new_tag(GumboParser * parser,bool is_start_tag)684 static void start_new_tag(GumboParser* parser, bool is_start_tag) {
685 GumboTokenizerState* tokenizer = parser->_tokenizer_state;
686 GumboTagState* tag_state = &tokenizer->_tag_state;
687 int c = utf8iterator_current(&tokenizer->_input);
688 assert(is_alpha(c));
689 c = ensure_lowercase(c);
690 assert(is_alpha(c));
691
692 initialize_tag_buffer(parser);
693 gumbo_string_buffer_append_codepoint(parser, c, &tag_state->_buffer);
694
695 assert(tag_state->_attributes.data == NULL);
696 // Initial size chosen by statistical analysis of a corpus of 60k webpages.
697 // 99.5% of elements have 0 attributes, 93% of the remainder have 1. These
698 // numbers are a bit higher for more modern websites (eg. ~45% = 0, ~40% = 1
699 // for the HTML5 Spec), but still have basically 99% of nodes with <= 2 attrs.
700 gumbo_vector_init(parser, 1, &tag_state->_attributes);
701 tag_state->_drop_next_attr_value = false;
702 tag_state->_is_start_tag = is_start_tag;
703 tag_state->_is_self_closing = false;
704 gumbo_debug("Starting new tag.\n");
705 }
706
707 // Fills in the specified char* with the contents of the tag buffer.
copy_over_tag_buffer(GumboParser * parser,const char ** output)708 static void copy_over_tag_buffer(GumboParser* parser, const char** output) {
709 GumboTokenizerState* tokenizer = parser->_tokenizer_state;
710 GumboTagState* tag_state = &tokenizer->_tag_state;
711 *output = gumbo_string_buffer_to_string(parser, &tag_state->_buffer);
712 }
713
714 // Fills in:
715 // * The original_text GumboStringPiece with the portion of the original
716 // buffer that corresponds to the tag buffer.
717 // * The start_pos GumboSourcePosition with the start position of the tag
718 // buffer.
719 // * The end_pos GumboSourcePosition with the current source position.
copy_over_original_tag_text(GumboParser * parser,GumboStringPiece * original_text,GumboSourcePosition * start_pos,GumboSourcePosition * end_pos)720 static void copy_over_original_tag_text(GumboParser* parser,
721 GumboStringPiece* original_text, GumboSourcePosition* start_pos,
722 GumboSourcePosition* end_pos) {
723 GumboTokenizerState* tokenizer = parser->_tokenizer_state;
724 GumboTagState* tag_state = &tokenizer->_tag_state;
725
726 original_text->data = tag_state->_original_text;
727 original_text->length = utf8iterator_get_char_pointer(&tokenizer->_input) -
728 tag_state->_original_text;
729 if (original_text->data[original_text->length - 1] == '\r') {
730 // Since \r is skipped by the UTF-8 iterator, it can sometimes end up
731 // appended to the end of original text even when it's really the first part
732 // of the next character. If we detect this situation, shrink the length of
733 // the original text by 1 to remove the carriage return.
734 --original_text->length;
735 }
736 *start_pos = tag_state->_start_pos;
737 utf8iterator_get_position(&tokenizer->_input, end_pos);
738 }
739
740 // Releases and then re-initializes the tag buffer.
reinitialize_tag_buffer(GumboParser * parser)741 static void reinitialize_tag_buffer(GumboParser* parser) {
742 gumbo_parser_deallocate(
743 parser, parser->_tokenizer_state->_tag_state._buffer.data);
744 initialize_tag_buffer(parser);
745 }
746
747 // Moves some data from the temporary buffer over the the tag-based fields in
748 // TagState.
finish_tag_name(GumboParser * parser)749 static void finish_tag_name(GumboParser* parser) {
750 GumboTokenizerState* tokenizer = parser->_tokenizer_state;
751 GumboTagState* tag_state = &tokenizer->_tag_state;
752
753 tag_state->_tag =
754 gumbo_tagn_enum(tag_state->_buffer.data, tag_state->_buffer.length);
755 reinitialize_tag_buffer(parser);
756 }
757
758 // Adds an ERR_DUPLICATE_ATTR parse error to the parser's error struct.
add_duplicate_attr_error(GumboParser * parser,const char * attr_name,int original_index,int new_index)759 static void add_duplicate_attr_error(GumboParser* parser, const char* attr_name,
760 int original_index, int new_index) {
761 GumboError* error = gumbo_add_error(parser);
762 if (!error) {
763 return;
764 }
765 GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
766 error->type = GUMBO_ERR_DUPLICATE_ATTR;
767 error->position = tag_state->_start_pos;
768 error->original_text = tag_state->_original_text;
769 error->v.duplicate_attr.original_index = original_index;
770 error->v.duplicate_attr.new_index = new_index;
771 copy_over_tag_buffer(parser, &error->v.duplicate_attr.name);
772 reinitialize_tag_buffer(parser);
773 }
774
775 // Creates a new attribute in the current tag, copying the current tag buffer to
776 // the attribute's name. The attribute's value starts out as the empty string
777 // (following the "Boolean attributes" section of the spec) and is only
778 // overwritten on finish_attribute_value(). If the attribute has already been
779 // specified, the new attribute is dropped, a parse error is added, and the
780 // function returns false. Otherwise, this returns true.
finish_attribute_name(GumboParser * parser)781 static bool finish_attribute_name(GumboParser* parser) {
782 GumboTokenizerState* tokenizer = parser->_tokenizer_state;
783 GumboTagState* tag_state = &tokenizer->_tag_state;
784 // May've been set by a previous attribute without a value; reset it here.
785 tag_state->_drop_next_attr_value = false;
786 assert(tag_state->_attributes.data);
787 assert(tag_state->_attributes.capacity);
788
789 GumboVector* /* GumboAttribute* */ attributes = &tag_state->_attributes;
790 for (unsigned int i = 0; i < attributes->length; ++i) {
791 GumboAttribute* attr = attributes->data[i];
792 if (strlen(attr->name) == tag_state->_buffer.length &&
793 memcmp(attr->name, tag_state->_buffer.data,
794 tag_state->_buffer.length) == 0) {
795 // Identical attribute; bail.
796 add_duplicate_attr_error(parser, attr->name, i, attributes->length);
797 tag_state->_drop_next_attr_value = true;
798 return false;
799 }
800 }
801
802 GumboAttribute* attr = gumbo_parser_allocate(parser, sizeof(GumboAttribute));
803 attr->attr_namespace = GUMBO_ATTR_NAMESPACE_NONE;
804 copy_over_tag_buffer(parser, &attr->name);
805 copy_over_original_tag_text(
806 parser, &attr->original_name, &attr->name_start, &attr->name_end);
807 attr->value = gumbo_copy_stringz(parser, "");
808 copy_over_original_tag_text(
809 parser, &attr->original_value, &attr->name_start, &attr->name_end);
810 gumbo_vector_add(parser, attr, attributes);
811 reinitialize_tag_buffer(parser);
812 return true;
813 }
814
815 // Finishes an attribute value. This sets the value of the most recently added
816 // attribute to the current contents of the tag buffer.
finish_attribute_value(GumboParser * parser)817 static void finish_attribute_value(GumboParser* parser) {
818 GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
819 if (tag_state->_drop_next_attr_value) {
820 // Duplicate attribute name detected in an earlier state, so we have to
821 // ignore the value.
822 tag_state->_drop_next_attr_value = false;
823 reinitialize_tag_buffer(parser);
824 return;
825 }
826
827 GumboAttribute* attr =
828 tag_state->_attributes.data[tag_state->_attributes.length - 1];
829 gumbo_parser_deallocate(parser, (void*) attr->value);
830 copy_over_tag_buffer(parser, &attr->value);
831 copy_over_original_tag_text(
832 parser, &attr->original_value, &attr->value_start, &attr->value_end);
833 reinitialize_tag_buffer(parser);
834 }
835
836 // Returns true if the current end tag matches the last start tag emitted.
is_appropriate_end_tag(GumboParser * parser)837 static bool is_appropriate_end_tag(GumboParser* parser) {
838 GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
839 assert(!tag_state->_is_start_tag);
840 return tag_state->_last_start_tag != GUMBO_TAG_LAST &&
841 tag_state->_last_start_tag == gumbo_tagn_enum(tag_state->_buffer.data,
842 tag_state->_buffer.length);
843 }
844
// Allocates the tokenizer state, attaches it to the parser, and puts every
// field into its initial configuration: data state, flags cleared, buffers
// initialized, and the input iterator positioned over text/text_length.
void gumbo_tokenizer_state_init(
    GumboParser* parser, const char* text, size_t text_length) {
  GumboTokenizerState* state =
      gumbo_parser_allocate(parser, sizeof(GumboTokenizerState));
  parser->_tokenizer_state = state;

  // Scalar fields.
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
  state->_reconsume_current_input = false;
  state->_is_current_node_foreign = false;
  state->_is_in_cdata = false;
  state->_buffered_emit_char = kGumboNoChar;
  state->_temporary_buffer_emit = NULL;
  state->_token_start = text;

  // Tag state.
  state->_tag_state._last_start_tag = GUMBO_TAG_LAST;
  mark_tag_state_as_empty(&state->_tag_state);

  // Scratch buffers.
  gumbo_string_buffer_init(parser, &state->_temporary_buffer);
  gumbo_string_buffer_init(parser, &state->_script_data_buffer);

  // Input iterator and derived start position.
  utf8iterator_init(parser, text, text_length, &state->_input);
  utf8iterator_get_position(&state->_input, &state->_token_start_pos);
  doc_type_state_init(parser);
}
868
// Tears down the tokenizer state: destroys the temporary and script-data
// buffers and then frees the state struct itself. The doctype fields are
// expected to be NULL already by the time this runs.
void gumbo_tokenizer_state_destroy(GumboParser* parser) {
  GumboTokenizerState* state = parser->_tokenizer_state;

  assert(state->_doc_type_state.name == NULL);
  assert(state->_doc_type_state.public_identifier == NULL);
  assert(state->_doc_type_state.system_identifier == NULL);

  gumbo_string_buffer_destroy(parser, &state->_temporary_buffer);
  gumbo_string_buffer_destroy(parser, &state->_script_data_buffer);
  gumbo_parser_deallocate(parser, state);
}
878
// Records `state` as the tokenizer's current lexer state.
void gumbo_tokenizer_set_state(GumboParser* parser, GumboTokenizerEnum state) {
  GumboTokenizerState* tokenizer = parser->_tokenizer_state;
  tokenizer->_state = state;
}
882
// Stores the is_foreign flag in the tokenizer state, emitting a debug message
// only when the stored value actually changes.
void gumbo_tokenizer_set_is_current_node_foreign(
    GumboParser* parser, bool is_foreign) {
  GumboTokenizerState* tokenizer = parser->_tokenizer_state;
  const bool previous = tokenizer->_is_current_node_foreign;
  if (previous != is_foreign) {
    gumbo_debug("Toggling is_current_node_foreign to %s.\n",
        is_foreign ? "true" : "false");
  }
  tokenizer->_is_current_node_foreign = is_foreign;
}
891
892 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#data-state
handle_data_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)893 static StateResult handle_data_state(GumboParser* parser,
894 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
895 switch (c) {
896 case '&':
897 gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_DATA);
898 // The char_ref machinery expects to be on the & so it can mark that
899 // and return to it if the text isn't a char ref, so we need to
900 // reconsume it.
901 tokenizer->_reconsume_current_input = true;
902 return NEXT_CHAR;
903 case '<':
904 gumbo_tokenizer_set_state(parser, GUMBO_LEX_TAG_OPEN);
905 clear_temporary_buffer(parser);
906 append_char_to_temporary_buffer(parser, '<');
907 return NEXT_CHAR;
908 case '\0':
909 tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
910 emit_char(parser, c, output);
911 return RETURN_ERROR;
912 default:
913 return emit_current_char(parser, output);
914 }
915 }
916
917 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#character-reference-in-data-state
handle_char_ref_in_data_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)918 static StateResult handle_char_ref_in_data_state(GumboParser* parser,
919 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
920 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
921 return emit_char_ref(parser, ' ', false, output);
922 }
923
924 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rcdata-state
handle_rcdata_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)925 static StateResult handle_rcdata_state(GumboParser* parser,
926 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
927 switch (c) {
928 case '&':
929 gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_RCDATA);
930 tokenizer->_reconsume_current_input = true;
931 return NEXT_CHAR;
932 case '<':
933 gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA_LT);
934 clear_temporary_buffer(parser);
935 append_char_to_temporary_buffer(parser, '<');
936 return NEXT_CHAR;
937 case '\0':
938 return emit_replacement_char(parser, output);
939 case -1:
940 return emit_eof(parser, output);
941 default:
942 return emit_current_char(parser, output);
943 }
944 }
945
946 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#character-reference-in-rcdata-state
handle_char_ref_in_rcdata_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)947 static StateResult handle_char_ref_in_rcdata_state(GumboParser* parser,
948 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
949 gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA);
950 return emit_char_ref(parser, ' ', false, output);
951 }
952
953 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rawtext-state
handle_rawtext_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)954 static StateResult handle_rawtext_state(GumboParser* parser,
955 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
956 switch (c) {
957 case '<':
958 gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT_LT);
959 clear_temporary_buffer(parser);
960 append_char_to_temporary_buffer(parser, '<');
961 return NEXT_CHAR;
962 case '\0':
963 return emit_replacement_char(parser, output);
964 case -1:
965 return emit_eof(parser, output);
966 default:
967 return emit_current_char(parser, output);
968 }
969 }
970
971 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-state
handle_script_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)972 static StateResult handle_script_state(GumboParser* parser,
973 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
974 switch (c) {
975 case '<':
976 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_LT);
977 clear_temporary_buffer(parser);
978 append_char_to_temporary_buffer(parser, '<');
979 return NEXT_CHAR;
980 case '\0':
981 return emit_replacement_char(parser, output);
982 case -1:
983 return emit_eof(parser, output);
984 default:
985 return emit_current_char(parser, output);
986 }
987 }
988
989 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#plaintext-state
handle_plaintext_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)990 static StateResult handle_plaintext_state(GumboParser* parser,
991 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
992 switch (c) {
993 case '\0':
994 return emit_replacement_char(parser, output);
995 case -1:
996 return emit_eof(parser, output);
997 default:
998 return emit_current_char(parser, output);
999 }
1000 }
1001
1002 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#tag-open-state
handle_tag_open_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1003 static StateResult handle_tag_open_state(GumboParser* parser,
1004 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1005 assert(temporary_buffer_equals(parser, "<"));
1006 switch (c) {
1007 case '!':
1008 gumbo_tokenizer_set_state(parser, GUMBO_LEX_MARKUP_DECLARATION);
1009 clear_temporary_buffer(parser);
1010 return NEXT_CHAR;
1011 case '/':
1012 gumbo_tokenizer_set_state(parser, GUMBO_LEX_END_TAG_OPEN);
1013 append_char_to_temporary_buffer(parser, '/');
1014 return NEXT_CHAR;
1015 case '?':
1016 gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_COMMENT);
1017 clear_temporary_buffer(parser);
1018 append_char_to_temporary_buffer(parser, '?');
1019 tokenizer_add_parse_error(parser, GUMBO_ERR_TAG_STARTS_WITH_QUESTION);
1020 return NEXT_CHAR;
1021 default:
1022 if (is_alpha(c)) {
1023 gumbo_tokenizer_set_state(parser, GUMBO_LEX_TAG_NAME);
1024 start_new_tag(parser, true);
1025 return NEXT_CHAR;
1026 } else {
1027 tokenizer_add_parse_error(parser, GUMBO_ERR_TAG_INVALID);
1028 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1029 emit_temporary_buffer(parser, output);
1030 return RETURN_ERROR;
1031 }
1032 }
1033 }
1034
1035 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#end-tag-open-state
handle_end_tag_open_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1036 static StateResult handle_end_tag_open_state(GumboParser* parser,
1037 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1038 assert(temporary_buffer_equals(parser, "</"));
1039 switch (c) {
1040 case '>':
1041 tokenizer_add_parse_error(parser, GUMBO_ERR_CLOSE_TAG_EMPTY);
1042 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1043 return NEXT_CHAR;
1044 case -1:
1045 tokenizer_add_parse_error(parser, GUMBO_ERR_CLOSE_TAG_EOF);
1046 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1047 return emit_temporary_buffer(parser, output);
1048 default:
1049 if (is_alpha(c)) {
1050 gumbo_tokenizer_set_state(parser, GUMBO_LEX_TAG_NAME);
1051 start_new_tag(parser, false);
1052 } else {
1053 tokenizer_add_parse_error(parser, GUMBO_ERR_CLOSE_TAG_INVALID);
1054 gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_COMMENT);
1055 clear_temporary_buffer(parser);
1056 append_char_to_temporary_buffer(parser, c);
1057 }
1058 return NEXT_CHAR;
1059 }
1060 }
1061
1062 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#tag-name-state
handle_tag_name_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1063 static StateResult handle_tag_name_state(GumboParser* parser,
1064 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1065 switch (c) {
1066 case '\t':
1067 case '\n':
1068 case '\f':
1069 case ' ':
1070 finish_tag_name(parser);
1071 gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1072 return NEXT_CHAR;
1073 case '/':
1074 finish_tag_name(parser);
1075 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1076 return NEXT_CHAR;
1077 case '>':
1078 finish_tag_name(parser);
1079 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1080 return emit_current_tag(parser, output);
1081 case '\0':
1082 tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1083 append_char_to_tag_buffer(parser, kUtf8ReplacementChar, true);
1084 return NEXT_CHAR;
1085 case -1:
1086 tokenizer_add_parse_error(parser, GUMBO_ERR_TAG_EOF);
1087 abandon_current_tag(parser);
1088 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1089 return NEXT_CHAR;
1090 default:
1091 append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1092 return NEXT_CHAR;
1093 }
1094 }
1095
1096 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#rcdata-less-than-sign-state
handle_rcdata_lt_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1097 static StateResult handle_rcdata_lt_state(GumboParser* parser,
1098 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1099 assert(temporary_buffer_equals(parser, "<"));
1100 if (c == '/') {
1101 gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA_END_TAG_OPEN);
1102 append_char_to_temporary_buffer(parser, '/');
1103 return NEXT_CHAR;
1104 } else {
1105 gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA);
1106 tokenizer->_reconsume_current_input = true;
1107 return emit_temporary_buffer(parser, output);
1108 }
1109 }
1110
1111 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#rcdata-end-tag-open-state
handle_rcdata_end_tag_open_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1112 static StateResult handle_rcdata_end_tag_open_state(GumboParser* parser,
1113 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1114 assert(temporary_buffer_equals(parser, "</"));
1115 if (is_alpha(c)) {
1116 gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA_END_TAG_NAME);
1117 start_new_tag(parser, false);
1118 append_char_to_temporary_buffer(parser, c);
1119 return NEXT_CHAR;
1120 } else {
1121 gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA);
1122 return emit_temporary_buffer(parser, output);
1123 }
1124 return true;
1125 }
1126
1127 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#rcdata-end-tag-name-state
handle_rcdata_end_tag_name_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1128 static StateResult handle_rcdata_end_tag_name_state(GumboParser* parser,
1129 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1130 assert(tokenizer->_temporary_buffer.length >= 2);
1131 if (is_alpha(c)) {
1132 append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1133 append_char_to_temporary_buffer(parser, c);
1134 return NEXT_CHAR;
1135 } else if (is_appropriate_end_tag(parser)) {
1136 switch (c) {
1137 case '\t':
1138 case '\n':
1139 case '\f':
1140 case ' ':
1141 finish_tag_name(parser);
1142 gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1143 return NEXT_CHAR;
1144 case '/':
1145 finish_tag_name(parser);
1146 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1147 return NEXT_CHAR;
1148 case '>':
1149 finish_tag_name(parser);
1150 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1151 return emit_current_tag(parser, output);
1152 }
1153 }
1154 gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA);
1155 abandon_current_tag(parser);
1156 return emit_temporary_buffer(parser, output);
1157 }
1158
1159 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rawtext-less-than-sign-state
handle_rawtext_lt_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1160 static StateResult handle_rawtext_lt_state(GumboParser* parser,
1161 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1162 assert(temporary_buffer_equals(parser, "<"));
1163 if (c == '/') {
1164 gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT_END_TAG_OPEN);
1165 append_char_to_temporary_buffer(parser, '/');
1166 return NEXT_CHAR;
1167 } else {
1168 gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT);
1169 tokenizer->_reconsume_current_input = true;
1170 return emit_temporary_buffer(parser, output);
1171 }
1172 }
1173
1174 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rawtext-end-tag-open-state
handle_rawtext_end_tag_open_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1175 static StateResult handle_rawtext_end_tag_open_state(GumboParser* parser,
1176 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1177 assert(temporary_buffer_equals(parser, "</"));
1178 if (is_alpha(c)) {
1179 gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT_END_TAG_NAME);
1180 start_new_tag(parser, false);
1181 append_char_to_temporary_buffer(parser, c);
1182 return NEXT_CHAR;
1183 } else {
1184 gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT);
1185 return emit_temporary_buffer(parser, output);
1186 }
1187 }
1188
1189 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rawtext-end-tag-name-state
handle_rawtext_end_tag_name_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1190 static StateResult handle_rawtext_end_tag_name_state(GumboParser* parser,
1191 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1192 assert(tokenizer->_temporary_buffer.length >= 2);
1193 gumbo_debug("Last end tag: %*s\n", (int) tokenizer->_tag_state._buffer.length,
1194 tokenizer->_tag_state._buffer.data);
1195 if (is_alpha(c)) {
1196 append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1197 append_char_to_temporary_buffer(parser, c);
1198 return NEXT_CHAR;
1199 } else if (is_appropriate_end_tag(parser)) {
1200 gumbo_debug("Is an appropriate end tag.\n");
1201 switch (c) {
1202 case '\t':
1203 case '\n':
1204 case '\f':
1205 case ' ':
1206 finish_tag_name(parser);
1207 gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1208 return NEXT_CHAR;
1209 case '/':
1210 finish_tag_name(parser);
1211 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1212 return NEXT_CHAR;
1213 case '>':
1214 finish_tag_name(parser);
1215 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1216 return emit_current_tag(parser, output);
1217 }
1218 }
1219 gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT);
1220 abandon_current_tag(parser);
1221 return emit_temporary_buffer(parser, output);
1222 }
1223
1224 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-less-than-sign-state
handle_script_lt_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1225 static StateResult handle_script_lt_state(GumboParser* parser,
1226 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1227 assert(temporary_buffer_equals(parser, "<"));
1228 if (c == '/') {
1229 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_END_TAG_OPEN);
1230 append_char_to_temporary_buffer(parser, '/');
1231 return NEXT_CHAR;
1232 } else if (c == '!') {
1233 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_START);
1234 append_char_to_temporary_buffer(parser, '!');
1235 return emit_temporary_buffer(parser, output);
1236 } else {
1237 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1238 tokenizer->_reconsume_current_input = true;
1239 return emit_temporary_buffer(parser, output);
1240 }
1241 }
1242
1243 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-end-tag-open-state
handle_script_end_tag_open_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1244 static StateResult handle_script_end_tag_open_state(GumboParser* parser,
1245 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1246 assert(temporary_buffer_equals(parser, "</"));
1247 if (is_alpha(c)) {
1248 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_END_TAG_NAME);
1249 start_new_tag(parser, false);
1250 append_char_to_temporary_buffer(parser, c);
1251 return NEXT_CHAR;
1252 } else {
1253 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1254 return emit_temporary_buffer(parser, output);
1255 }
1256 }
1257
1258 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-end-tag-name-state
handle_script_end_tag_name_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1259 static StateResult handle_script_end_tag_name_state(GumboParser* parser,
1260 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1261 assert(tokenizer->_temporary_buffer.length >= 2);
1262 if (is_alpha(c)) {
1263 append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1264 append_char_to_temporary_buffer(parser, c);
1265 return NEXT_CHAR;
1266 } else if (is_appropriate_end_tag(parser)) {
1267 switch (c) {
1268 case '\t':
1269 case '\n':
1270 case '\f':
1271 case ' ':
1272 finish_tag_name(parser);
1273 gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1274 return NEXT_CHAR;
1275 case '/':
1276 finish_tag_name(parser);
1277 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1278 return NEXT_CHAR;
1279 case '>':
1280 finish_tag_name(parser);
1281 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1282 return emit_current_tag(parser, output);
1283 }
1284 }
1285 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1286 abandon_current_tag(parser);
1287 return emit_temporary_buffer(parser, output);
1288 }
1289
1290 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escape-start-state
handle_script_escaped_start_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1291 static StateResult handle_script_escaped_start_state(GumboParser* parser,
1292 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1293 if (c == '-') {
1294 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_START_DASH);
1295 return emit_current_char(parser, output);
1296 } else {
1297 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1298 tokenizer->_reconsume_current_input = true;
1299 return NEXT_CHAR;
1300 }
1301 }
1302
1303 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escape-start-dash-state
handle_script_escaped_start_dash_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1304 static StateResult handle_script_escaped_start_dash_state(GumboParser* parser,
1305 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1306 if (c == '-') {
1307 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_DASH_DASH);
1308 return emit_current_char(parser, output);
1309 } else {
1310 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1311 tokenizer->_reconsume_current_input = true;
1312 return NEXT_CHAR;
1313 }
1314 }
1315
1316 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-state
handle_script_escaped_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1317 static StateResult handle_script_escaped_state(GumboParser* parser,
1318 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1319 switch (c) {
1320 case '-':
1321 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_DASH);
1322 return emit_current_char(parser, output);
1323 case '<':
1324 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_LT);
1325 clear_temporary_buffer(parser);
1326 append_char_to_temporary_buffer(parser, c);
1327 return NEXT_CHAR;
1328 case '\0':
1329 return emit_replacement_char(parser, output);
1330 case -1:
1331 tokenizer_add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
1332 return emit_eof(parser, output);
1333 default:
1334 return emit_current_char(parser, output);
1335 }
1336 }
1337
1338 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-dash-state
handle_script_escaped_dash_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1339 static StateResult handle_script_escaped_dash_state(GumboParser* parser,
1340 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1341 switch (c) {
1342 case '-':
1343 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_DASH_DASH);
1344 return emit_current_char(parser, output);
1345 case '<':
1346 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_LT);
1347 clear_temporary_buffer(parser);
1348 append_char_to_temporary_buffer(parser, c);
1349 return NEXT_CHAR;
1350 case '\0':
1351 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1352 return emit_replacement_char(parser, output);
1353 case -1:
1354 tokenizer_add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
1355 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1356 return NEXT_CHAR;
1357 default:
1358 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1359 return emit_current_char(parser, output);
1360 }
1361 }
1362
1363 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-dash-dash-state
handle_script_escaped_dash_dash_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1364 static StateResult handle_script_escaped_dash_dash_state(GumboParser* parser,
1365 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1366 switch (c) {
1367 case '-':
1368 return emit_current_char(parser, output);
1369 case '<':
1370 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_LT);
1371 clear_temporary_buffer(parser);
1372 append_char_to_temporary_buffer(parser, c);
1373 return NEXT_CHAR;
1374 case '>':
1375 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1376 return emit_current_char(parser, output);
1377 case '\0':
1378 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1379 return emit_replacement_char(parser, output);
1380 case -1:
1381 tokenizer_add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
1382 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1383 return NEXT_CHAR;
1384 default:
1385 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1386 return emit_current_char(parser, output);
1387 }
1388 }
1389
1390 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-less-than-sign-state
handle_script_escaped_lt_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1391 static StateResult handle_script_escaped_lt_state(GumboParser* parser,
1392 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1393 assert(temporary_buffer_equals(parser, "<"));
1394 assert(!tokenizer->_script_data_buffer.length);
1395 if (c == '/') {
1396 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_OPEN);
1397 append_char_to_temporary_buffer(parser, c);
1398 return NEXT_CHAR;
1399 } else if (is_alpha(c)) {
1400 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_START);
1401 append_char_to_temporary_buffer(parser, c);
1402 gumbo_string_buffer_append_codepoint(
1403 parser, ensure_lowercase(c), &tokenizer->_script_data_buffer);
1404 return emit_temporary_buffer(parser, output);
1405 } else {
1406 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1407 return emit_temporary_buffer(parser, output);
1408 }
1409 }
1410
1411 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-end-tag-open-state
handle_script_escaped_end_tag_open_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1412 static StateResult handle_script_escaped_end_tag_open_state(GumboParser* parser,
1413 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1414 assert(temporary_buffer_equals(parser, "</"));
1415 if (is_alpha(c)) {
1416 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_NAME);
1417 start_new_tag(parser, false);
1418 append_char_to_temporary_buffer(parser, c);
1419 return NEXT_CHAR;
1420 } else {
1421 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1422 return emit_temporary_buffer(parser, output);
1423 }
1424 }
1425
1426 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-end-tag-name-state
handle_script_escaped_end_tag_name_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1427 static StateResult handle_script_escaped_end_tag_name_state(GumboParser* parser,
1428 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1429 assert(tokenizer->_temporary_buffer.length >= 2);
1430 if (is_alpha(c)) {
1431 append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1432 append_char_to_temporary_buffer(parser, c);
1433 return NEXT_CHAR;
1434 } else if (is_appropriate_end_tag(parser)) {
1435 switch (c) {
1436 case '\t':
1437 case '\n':
1438 case '\f':
1439 case ' ':
1440 finish_tag_name(parser);
1441 gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1442 return NEXT_CHAR;
1443 case '/':
1444 finish_tag_name(parser);
1445 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1446 return NEXT_CHAR;
1447 case '>':
1448 finish_tag_name(parser);
1449 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1450 return emit_current_tag(parser, output);
1451 }
1452 }
1453 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1454 abandon_current_tag(parser);
1455 return emit_temporary_buffer(parser, output);
1456 }
1457
1458 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escape-start-state
handle_script_double_escaped_start_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1459 static StateResult handle_script_double_escaped_start_state(GumboParser* parser,
1460 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1461 switch (c) {
1462 case '\t':
1463 case '\n':
1464 case '\f':
1465 case ' ':
1466 case '/':
1467 case '>':
1468 gumbo_tokenizer_set_state(
1469 parser, gumbo_string_equals(&kScriptTag,
1470 (GumboStringPiece*) &tokenizer->_script_data_buffer)
1471 ? GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED
1472 : GUMBO_LEX_SCRIPT_ESCAPED);
1473 return emit_current_char(parser, output);
1474 default:
1475 if (is_alpha(c)) {
1476 gumbo_string_buffer_append_codepoint(
1477 parser, ensure_lowercase(c), &tokenizer->_script_data_buffer);
1478 return emit_current_char(parser, output);
1479 } else {
1480 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1481 tokenizer->_reconsume_current_input = true;
1482 return NEXT_CHAR;
1483 }
1484 }
1485 }
1486
1487 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escaped-state
handle_script_double_escaped_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1488 static StateResult handle_script_double_escaped_state(GumboParser* parser,
1489 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1490 switch (c) {
1491 case '-':
1492 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH);
1493 return emit_current_char(parser, output);
1494 case '<':
1495 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_LT);
1496 return emit_current_char(parser, output);
1497 case '\0':
1498 return emit_replacement_char(parser, output);
1499 case -1:
1500 tokenizer_add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
1501 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1502 return NEXT_CHAR;
1503 default:
1504 return emit_current_char(parser, output);
1505 }
1506 }
1507
1508 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escaped-dash-state
handle_script_double_escaped_dash_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1509 static StateResult handle_script_double_escaped_dash_state(GumboParser* parser,
1510 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1511 switch (c) {
1512 case '-':
1513 gumbo_tokenizer_set_state(
1514 parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH_DASH);
1515 return emit_current_char(parser, output);
1516 case '<':
1517 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_LT);
1518 return emit_current_char(parser, output);
1519 case '\0':
1520 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1521 return emit_replacement_char(parser, output);
1522 case -1:
1523 tokenizer_add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
1524 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1525 return NEXT_CHAR;
1526 default:
1527 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1528 return emit_current_char(parser, output);
1529 }
1530 }
1531
1532 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escaped-dash-dash-state
handle_script_double_escaped_dash_dash_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1533 static StateResult handle_script_double_escaped_dash_dash_state(
1534 GumboParser* parser, GumboTokenizerState* tokenizer, int c,
1535 GumboToken* output) {
1536 switch (c) {
1537 case '-':
1538 return emit_current_char(parser, output);
1539 case '<':
1540 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_LT);
1541 return emit_current_char(parser, output);
1542 case '>':
1543 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1544 return emit_current_char(parser, output);
1545 case '\0':
1546 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1547 return emit_replacement_char(parser, output);
1548 case -1:
1549 tokenizer_add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
1550 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1551 return NEXT_CHAR;
1552 default:
1553 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1554 return emit_current_char(parser, output);
1555 }
1556 }
1557
1558 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escaped-less-than-sign-state
handle_script_double_escaped_lt_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1559 static StateResult handle_script_double_escaped_lt_state(GumboParser* parser,
1560 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1561 if (c == '/') {
1562 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_END);
1563 gumbo_string_buffer_clear(parser, &tokenizer->_script_data_buffer);
1564 return emit_current_char(parser, output);
1565 } else {
1566 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1567 tokenizer->_reconsume_current_input = true;
1568 return NEXT_CHAR;
1569 }
1570 }
1571
1572 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escape-end-state
handle_script_double_escaped_end_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1573 static StateResult handle_script_double_escaped_end_state(GumboParser* parser,
1574 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1575 switch (c) {
1576 case '\t':
1577 case '\n':
1578 case '\f':
1579 case ' ':
1580 case '/':
1581 case '>':
1582 gumbo_tokenizer_set_state(
1583 parser, gumbo_string_equals(&kScriptTag,
1584 (GumboStringPiece*) &tokenizer->_script_data_buffer)
1585 ? GUMBO_LEX_SCRIPT_ESCAPED
1586 : GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1587 return emit_current_char(parser, output);
1588 default:
1589 if (is_alpha(c)) {
1590 gumbo_string_buffer_append_codepoint(
1591 parser, ensure_lowercase(c), &tokenizer->_script_data_buffer);
1592 return emit_current_char(parser, output);
1593 } else {
1594 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1595 tokenizer->_reconsume_current_input = true;
1596 return NEXT_CHAR;
1597 }
1598 }
1599 }
1600
1601 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#before-attribute-name-state
handle_before_attr_name_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1602 static StateResult handle_before_attr_name_state(GumboParser* parser,
1603 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1604 switch (c) {
1605 case '\t':
1606 case '\n':
1607 case '\f':
1608 case ' ':
1609 return NEXT_CHAR;
1610 case '/':
1611 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1612 return NEXT_CHAR;
1613 case '>':
1614 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1615 return emit_current_tag(parser, output);
1616 case '\0':
1617 tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1618 gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_NAME);
1619 append_char_to_temporary_buffer(parser, 0xfffd);
1620 return NEXT_CHAR;
1621 case -1:
1622 tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_NAME_EOF);
1623 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1624 abandon_current_tag(parser);
1625 return NEXT_CHAR;
1626 case '"':
1627 case '\'':
1628 case '<':
1629 case '=':
1630 tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_NAME_INVALID);
1631 // Fall through.
1632 default:
1633 gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_NAME);
1634 append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1635 return NEXT_CHAR;
1636 }
1637 }
1638
1639 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#attribute-name-state
handle_attr_name_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1640 static StateResult handle_attr_name_state(GumboParser* parser,
1641 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1642 switch (c) {
1643 case '\t':
1644 case '\n':
1645 case '\f':
1646 case ' ':
1647 finish_attribute_name(parser);
1648 gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_ATTR_NAME);
1649 return NEXT_CHAR;
1650 case '/':
1651 finish_attribute_name(parser);
1652 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1653 return NEXT_CHAR;
1654 case '=':
1655 finish_attribute_name(parser);
1656 gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_VALUE);
1657 return NEXT_CHAR;
1658 case '>':
1659 finish_attribute_name(parser);
1660 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1661 return emit_current_tag(parser, output);
1662 case '\0':
1663 tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1664 append_char_to_tag_buffer(parser, kUtf8ReplacementChar, true);
1665 return NEXT_CHAR;
1666 case -1:
1667 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1668 abandon_current_tag(parser);
1669 tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_NAME_EOF);
1670 return NEXT_CHAR;
1671 case '"':
1672 case '\'':
1673 case '<':
1674 tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_NAME_INVALID);
1675 // Fall through.
1676 default:
1677 append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1678 return NEXT_CHAR;
1679 }
1680 }
1681
1682 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#after-attribute-name-state
handle_after_attr_name_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1683 static StateResult handle_after_attr_name_state(GumboParser* parser,
1684 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1685 switch (c) {
1686 case '\t':
1687 case '\n':
1688 case '\f':
1689 case ' ':
1690 return NEXT_CHAR;
1691 case '/':
1692 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1693 return NEXT_CHAR;
1694 case '=':
1695 gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_VALUE);
1696 return NEXT_CHAR;
1697 case '>':
1698 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1699 return emit_current_tag(parser, output);
1700 case '\0':
1701 tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1702 gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_NAME);
1703 append_char_to_temporary_buffer(parser, 0xfffd);
1704 return NEXT_CHAR;
1705 case -1:
1706 tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_NAME_EOF);
1707 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1708 abandon_current_tag(parser);
1709 return NEXT_CHAR;
1710 case '"':
1711 case '\'':
1712 case '<':
1713 tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_NAME_INVALID);
1714 // Fall through.
1715 default:
1716 gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_NAME);
1717 append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1718 return NEXT_CHAR;
1719 }
1720 }
1721
1722 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#before-attribute-value-state
handle_before_attr_value_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1723 static StateResult handle_before_attr_value_state(GumboParser* parser,
1724 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1725 switch (c) {
1726 case '\t':
1727 case '\n':
1728 case '\f':
1729 case ' ':
1730 return NEXT_CHAR;
1731 case '"':
1732 gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED);
1733 reset_tag_buffer_start_point(parser);
1734 return NEXT_CHAR;
1735 case '&':
1736 gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_UNQUOTED);
1737 tokenizer->_reconsume_current_input = true;
1738 return NEXT_CHAR;
1739 case '\'':
1740 gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED);
1741 reset_tag_buffer_start_point(parser);
1742 return NEXT_CHAR;
1743 case '\0':
1744 tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1745 gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_UNQUOTED);
1746 append_char_to_tag_buffer(parser, kUtf8ReplacementChar, true);
1747 return NEXT_CHAR;
1748 case -1:
1749 tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_EOF);
1750 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1751 abandon_current_tag(parser);
1752 tokenizer->_reconsume_current_input = true;
1753 return NEXT_CHAR;
1754 case '>':
1755 tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_RIGHT_BRACKET);
1756 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1757 emit_current_tag(parser, output);
1758 return RETURN_ERROR;
1759 case '<':
1760 case '=':
1761 case '`':
1762 tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_EQUALS);
1763 // Fall through.
1764 default:
1765 gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_UNQUOTED);
1766 append_char_to_tag_buffer(parser, c, true);
1767 return NEXT_CHAR;
1768 }
1769 }
1770
1771 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#attribute-value-double-quoted-state
handle_attr_value_double_quoted_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1772 static StateResult handle_attr_value_double_quoted_state(GumboParser* parser,
1773 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1774 switch (c) {
1775 case '"':
1776 gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED);
1777 return NEXT_CHAR;
1778 case '&':
1779 tokenizer->_tag_state._attr_value_state = tokenizer->_state;
1780 gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_ATTR_VALUE);
1781 tokenizer->_reconsume_current_input = true;
1782 return NEXT_CHAR;
1783 case '\0':
1784 tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1785 append_char_to_tag_buffer(parser, kUtf8ReplacementChar, false);
1786 return NEXT_CHAR;
1787 case -1:
1788 tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_DOUBLE_QUOTE_EOF);
1789 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1790 abandon_current_tag(parser);
1791 tokenizer->_reconsume_current_input = true;
1792 return NEXT_CHAR;
1793 default:
1794 append_char_to_tag_buffer(parser, c, false);
1795 return NEXT_CHAR;
1796 }
1797 }
1798
1799 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#attribute-value-single-quoted-state
handle_attr_value_single_quoted_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1800 static StateResult handle_attr_value_single_quoted_state(GumboParser* parser,
1801 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1802 switch (c) {
1803 case '\'':
1804 gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED);
1805 return NEXT_CHAR;
1806 case '&':
1807 tokenizer->_tag_state._attr_value_state = tokenizer->_state;
1808 gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_ATTR_VALUE);
1809 tokenizer->_reconsume_current_input = true;
1810 return NEXT_CHAR;
1811 case '\0':
1812 tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1813 append_char_to_tag_buffer(parser, kUtf8ReplacementChar, false);
1814 return NEXT_CHAR;
1815 case -1:
1816 tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_SINGLE_QUOTE_EOF);
1817 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1818 abandon_current_tag(parser);
1819 tokenizer->_reconsume_current_input = true;
1820 return NEXT_CHAR;
1821 default:
1822 append_char_to_tag_buffer(parser, c, false);
1823 return NEXT_CHAR;
1824 }
1825 }
1826
1827 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#attribute-value-unquoted-state
handle_attr_value_unquoted_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1828 static StateResult handle_attr_value_unquoted_state(GumboParser* parser,
1829 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1830 switch (c) {
1831 case '\t':
1832 case '\n':
1833 case '\f':
1834 case ' ':
1835 gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1836 finish_attribute_value(parser);
1837 return NEXT_CHAR;
1838 case '&':
1839 tokenizer->_tag_state._attr_value_state = tokenizer->_state;
1840 gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_ATTR_VALUE);
1841 tokenizer->_reconsume_current_input = true;
1842 return NEXT_CHAR;
1843 case '>':
1844 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1845 finish_attribute_value(parser);
1846 return emit_current_tag(parser, output);
1847 case '\0':
1848 tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1849 append_char_to_tag_buffer(parser, kUtf8ReplacementChar, true);
1850 return NEXT_CHAR;
1851 case -1:
1852 tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_EOF);
1853 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1854 tokenizer->_reconsume_current_input = true;
1855 abandon_current_tag(parser);
1856 return NEXT_CHAR;
1857 case '<':
1858 case '=':
1859 case '"':
1860 case '\'':
1861 case '`':
1862 tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_EQUALS);
1863 // Fall through.
1864 default:
1865 append_char_to_tag_buffer(parser, c, true);
1866 return NEXT_CHAR;
1867 }
1868 }
1869
1870 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#character-reference-in-attribute-value-state
handle_char_ref_in_attr_value_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1871 static StateResult handle_char_ref_in_attr_value_state(GumboParser* parser,
1872 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1873 OneOrTwoCodepoints char_ref;
1874 int allowed_char;
1875 bool is_unquoted = false;
1876 switch (tokenizer->_tag_state._attr_value_state) {
1877 case GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED:
1878 allowed_char = '"';
1879 break;
1880 case GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED:
1881 allowed_char = '\'';
1882 break;
1883 case GUMBO_LEX_ATTR_VALUE_UNQUOTED:
1884 allowed_char = '>';
1885 is_unquoted = true;
1886 break;
1887 default:
1888 // -Wmaybe-uninitialized is a little overzealous here, and doesn't
1889 // get that the assert(0) means this codepath will never happen.
1890 allowed_char = ' ';
1891 assert(0);
1892 }
1893
1894 // Ignore the status, since we don't have a convenient way of signalling that
1895 // a parser error has occurred when the error occurs in the middle of a
1896 // multi-state token. We'd need a flag inside the TokenizerState to do this,
1897 // but that's a low priority fix.
1898 consume_char_ref(parser, &tokenizer->_input, allowed_char, true, &char_ref);
1899 if (char_ref.first != kGumboNoChar) {
1900 tokenizer->_reconsume_current_input = true;
1901 append_char_to_tag_buffer(parser, char_ref.first, is_unquoted);
1902 if (char_ref.second != kGumboNoChar) {
1903 append_char_to_tag_buffer(parser, char_ref.second, is_unquoted);
1904 }
1905 } else {
1906 append_char_to_tag_buffer(parser, '&', is_unquoted);
1907 }
1908 gumbo_tokenizer_set_state(parser, tokenizer->_tag_state._attr_value_state);
1909 return NEXT_CHAR;
1910 }
1911
1912 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#after-attribute-value-quoted-state
handle_after_attr_value_quoted_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1913 static StateResult handle_after_attr_value_quoted_state(GumboParser* parser,
1914 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1915 finish_attribute_value(parser);
1916 switch (c) {
1917 case '\t':
1918 case '\n':
1919 case '\f':
1920 case ' ':
1921 gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1922 return NEXT_CHAR;
1923 case '/':
1924 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1925 return NEXT_CHAR;
1926 case '>':
1927 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1928 return emit_current_tag(parser, output);
1929 case -1:
1930 tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_AFTER_EOF);
1931 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1932 abandon_current_tag(parser);
1933 tokenizer->_reconsume_current_input = true;
1934 return NEXT_CHAR;
1935 default:
1936 tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_AFTER_INVALID);
1937 gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1938 tokenizer->_reconsume_current_input = true;
1939 return NEXT_CHAR;
1940 }
1941 }
1942
1943 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#self-closing-start-tag-state
handle_self_closing_start_tag_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1944 static StateResult handle_self_closing_start_tag_state(GumboParser* parser,
1945 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1946 switch (c) {
1947 case '>':
1948 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1949 tokenizer->_tag_state._is_self_closing = true;
1950 return emit_current_tag(parser, output);
1951 case -1:
1952 tokenizer_add_parse_error(parser, GUMBO_ERR_SOLIDUS_EOF);
1953 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1954 abandon_current_tag(parser);
1955 return NEXT_CHAR;
1956 default:
1957 tokenizer_add_parse_error(parser, GUMBO_ERR_SOLIDUS_INVALID);
1958 gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1959 tokenizer->_reconsume_current_input = true;
1960 return NEXT_CHAR;
1961 }
1962 }
1963
1964 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#bogus-comment-state
handle_bogus_comment_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1965 static StateResult handle_bogus_comment_state(GumboParser* parser,
1966 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1967 while (c != '>' && c != -1) {
1968 if (c == '\0') {
1969 c = 0xFFFD;
1970 }
1971 append_char_to_temporary_buffer(parser, c);
1972 utf8iterator_next(&tokenizer->_input);
1973 c = utf8iterator_current(&tokenizer->_input);
1974 }
1975 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1976 return emit_comment(parser, output);
1977 }
1978
1979 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#markup-declaration-open-state
handle_markup_declaration_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1980 static StateResult handle_markup_declaration_state(GumboParser* parser,
1981 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1982 if (utf8iterator_maybe_consume_match(
1983 &tokenizer->_input, "--", sizeof("--") - 1, true)) {
1984 gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_START);
1985 tokenizer->_reconsume_current_input = true;
1986 } else if (utf8iterator_maybe_consume_match(
1987 &tokenizer->_input, "DOCTYPE", sizeof("DOCTYPE") - 1, false)) {
1988 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DOCTYPE);
1989 tokenizer->_reconsume_current_input = true;
1990 // If we get here, we know we'll eventually emit a doctype token, so now is
1991 // the time to initialize the doctype strings. (Not in doctype_state_init,
1992 // since then they'll leak if ownership never gets transferred to the
1993 // doctype token.
1994 tokenizer->_doc_type_state.name = gumbo_copy_stringz(parser, "");
1995 tokenizer->_doc_type_state.public_identifier =
1996 gumbo_copy_stringz(parser, "");
1997 tokenizer->_doc_type_state.system_identifier =
1998 gumbo_copy_stringz(parser, "");
1999 } else if (tokenizer->_is_current_node_foreign &&
2000 utf8iterator_maybe_consume_match(
2001 &tokenizer->_input, "[CDATA[", sizeof("[CDATA[") - 1, true)) {
2002 gumbo_tokenizer_set_state(parser, GUMBO_LEX_CDATA);
2003 tokenizer->_is_in_cdata = true;
2004 tokenizer->_reconsume_current_input = true;
2005 } else {
2006 tokenizer_add_parse_error(parser, GUMBO_ERR_DASHES_OR_DOCTYPE);
2007 gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_COMMENT);
2008 tokenizer->_reconsume_current_input = true;
2009 clear_temporary_buffer(parser);
2010 }
2011 return NEXT_CHAR;
2012 }
2013
2014 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-start-state
handle_comment_start_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2015 static StateResult handle_comment_start_state(GumboParser* parser,
2016 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2017 switch (c) {
2018 case '-':
2019 gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_START_DASH);
2020 return NEXT_CHAR;
2021 case '\0':
2022 tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2023 gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2024 append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2025 return NEXT_CHAR;
2026 case '>':
2027 tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_INVALID);
2028 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2029 emit_comment(parser, output);
2030 return RETURN_ERROR;
2031 case -1:
2032 tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_EOF);
2033 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2034 emit_comment(parser, output);
2035 return RETURN_ERROR;
2036 default:
2037 gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2038 append_char_to_temporary_buffer(parser, c);
2039 return NEXT_CHAR;
2040 }
2041 }
2042
2043 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-start-dash-state
handle_comment_start_dash_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2044 static StateResult handle_comment_start_dash_state(GumboParser* parser,
2045 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2046 switch (c) {
2047 case '-':
2048 gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END);
2049 return NEXT_CHAR;
2050 case '\0':
2051 tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2052 gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2053 append_char_to_temporary_buffer(parser, '-');
2054 append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2055 return NEXT_CHAR;
2056 case '>':
2057 tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_INVALID);
2058 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2059 emit_comment(parser, output);
2060 return RETURN_ERROR;
2061 case -1:
2062 tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_EOF);
2063 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2064 emit_comment(parser, output);
2065 return RETURN_ERROR;
2066 default:
2067 gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2068 append_char_to_temporary_buffer(parser, '-');
2069 append_char_to_temporary_buffer(parser, c);
2070 return NEXT_CHAR;
2071 }
2072 }
2073
2074 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-state
handle_comment_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2075 static StateResult handle_comment_state(GumboParser* parser,
2076 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2077 switch (c) {
2078 case '-':
2079 gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END_DASH);
2080 return NEXT_CHAR;
2081 case '\0':
2082 tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2083 append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2084 return NEXT_CHAR;
2085 case -1:
2086 tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_EOF);
2087 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2088 emit_comment(parser, output);
2089 return RETURN_ERROR;
2090 default:
2091 append_char_to_temporary_buffer(parser, c);
2092 return NEXT_CHAR;
2093 }
2094 }
2095
2096 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-end-dash-state
handle_comment_end_dash_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2097 static StateResult handle_comment_end_dash_state(GumboParser* parser,
2098 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2099 switch (c) {
2100 case '-':
2101 gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END);
2102 return NEXT_CHAR;
2103 case '\0':
2104 tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2105 gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2106 append_char_to_temporary_buffer(parser, '-');
2107 append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2108 return NEXT_CHAR;
2109 case -1:
2110 tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_EOF);
2111 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2112 emit_comment(parser, output);
2113 return RETURN_ERROR;
2114 default:
2115 gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2116 append_char_to_temporary_buffer(parser, '-');
2117 append_char_to_temporary_buffer(parser, c);
2118 return NEXT_CHAR;
2119 }
2120 }
2121
2122 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-end-state
handle_comment_end_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2123 static StateResult handle_comment_end_state(GumboParser* parser,
2124 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2125 switch (c) {
2126 case '>':
2127 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2128 return emit_comment(parser, output);
2129 case '\0':
2130 tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2131 gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2132 append_char_to_temporary_buffer(parser, '-');
2133 append_char_to_temporary_buffer(parser, '-');
2134 append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2135 return NEXT_CHAR;
2136 case '!':
2137 tokenizer_add_parse_error(
2138 parser, GUMBO_ERR_COMMENT_BANG_AFTER_DOUBLE_DASH);
2139 gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END_BANG);
2140 return NEXT_CHAR;
2141 case '-':
2142 tokenizer_add_parse_error(
2143 parser, GUMBO_ERR_COMMENT_DASH_AFTER_DOUBLE_DASH);
2144 append_char_to_temporary_buffer(parser, '-');
2145 return NEXT_CHAR;
2146 case -1:
2147 tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2148 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2149 emit_comment(parser, output);
2150 return RETURN_ERROR;
2151 default:
2152 tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_INVALID);
2153 gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2154 append_char_to_temporary_buffer(parser, '-');
2155 append_char_to_temporary_buffer(parser, '-');
2156 append_char_to_temporary_buffer(parser, c);
2157 return NEXT_CHAR;
2158 }
2159 }
2160
2161 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-end-bang-state
handle_comment_end_bang_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2162 static StateResult handle_comment_end_bang_state(GumboParser* parser,
2163 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2164 switch (c) {
2165 case '-':
2166 gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END_DASH);
2167 append_char_to_temporary_buffer(parser, '-');
2168 append_char_to_temporary_buffer(parser, '-');
2169 append_char_to_temporary_buffer(parser, '!');
2170 return NEXT_CHAR;
2171 case '>':
2172 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2173 return emit_comment(parser, output);
2174 case '\0':
2175 tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2176 gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2177 append_char_to_temporary_buffer(parser, '-');
2178 append_char_to_temporary_buffer(parser, '-');
2179 append_char_to_temporary_buffer(parser, '!');
2180 append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2181 return NEXT_CHAR;
2182 case -1:
2183 tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_END_BANG_EOF);
2184 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2185 emit_comment(parser, output);
2186 return RETURN_ERROR;
2187 default:
2188 gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2189 append_char_to_temporary_buffer(parser, '-');
2190 append_char_to_temporary_buffer(parser, '-');
2191 append_char_to_temporary_buffer(parser, '!');
2192 append_char_to_temporary_buffer(parser, c);
2193 return NEXT_CHAR;
2194 }
2195 }
2196
2197 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#doctype-state
handle_doctype_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2198 static StateResult handle_doctype_state(GumboParser* parser,
2199 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2200 assert(!tokenizer->_temporary_buffer.length);
2201 switch (c) {
2202 case '\t':
2203 case '\n':
2204 case '\f':
2205 case ' ':
2206 gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_NAME);
2207 return NEXT_CHAR;
2208 case -1:
2209 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2210 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2211 tokenizer->_doc_type_state.force_quirks = true;
2212 emit_doctype(parser, output);
2213 return RETURN_ERROR;
2214 default:
2215 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_SPACE);
2216 gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_NAME);
2217 tokenizer->_reconsume_current_input = true;
2218 tokenizer->_doc_type_state.force_quirks = true;
2219 return NEXT_CHAR;
2220 }
2221 }
2222
2223 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#before-doctype-name-state
handle_before_doctype_name_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2224 static StateResult handle_before_doctype_name_state(GumboParser* parser,
2225 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2226 switch (c) {
2227 case '\t':
2228 case '\n':
2229 case '\f':
2230 case ' ':
2231 return NEXT_CHAR;
2232 case '\0':
2233 tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2234 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DOCTYPE_NAME);
2235 tokenizer->_doc_type_state.force_quirks = true;
2236 append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2237 return NEXT_CHAR;
2238 case '>':
2239 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_RIGHT_BRACKET);
2240 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2241 tokenizer->_doc_type_state.force_quirks = true;
2242 emit_doctype(parser, output);
2243 return RETURN_ERROR;
2244 case -1:
2245 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2246 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2247 tokenizer->_doc_type_state.force_quirks = true;
2248 emit_doctype(parser, output);
2249 return RETURN_ERROR;
2250 default:
2251 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DOCTYPE_NAME);
2252 tokenizer->_doc_type_state.force_quirks = false;
2253 append_char_to_temporary_buffer(parser, ensure_lowercase(c));
2254 return NEXT_CHAR;
2255 }
2256 }
2257
2258 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#doctype-name-state
handle_doctype_name_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2259 static StateResult handle_doctype_name_state(GumboParser* parser,
2260 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2261 switch (c) {
2262 case '\t':
2263 case '\n':
2264 case '\f':
2265 case ' ':
2266 gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_NAME);
2267 gumbo_parser_deallocate(parser, (void*) tokenizer->_doc_type_state.name);
2268 finish_temporary_buffer(parser, &tokenizer->_doc_type_state.name);
2269 return NEXT_CHAR;
2270 case '>':
2271 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2272 gumbo_parser_deallocate(parser, (void*) tokenizer->_doc_type_state.name);
2273 finish_temporary_buffer(parser, &tokenizer->_doc_type_state.name);
2274 emit_doctype(parser, output);
2275 return RETURN_SUCCESS;
2276 case '\0':
2277 tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2278 append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2279 return NEXT_CHAR;
2280 case -1:
2281 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2282 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2283 tokenizer->_doc_type_state.force_quirks = true;
2284 gumbo_parser_deallocate(parser, (void*) tokenizer->_doc_type_state.name);
2285 finish_temporary_buffer(parser, &tokenizer->_doc_type_state.name);
2286 emit_doctype(parser, output);
2287 return RETURN_ERROR;
2288 default:
2289 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DOCTYPE_NAME);
2290 tokenizer->_doc_type_state.force_quirks = false;
2291 append_char_to_temporary_buffer(parser, ensure_lowercase(c));
2292 return NEXT_CHAR;
2293 }
2294 }
2295
2296 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-name-state
handle_after_doctype_name_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2297 static StateResult handle_after_doctype_name_state(GumboParser* parser,
2298 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2299 switch (c) {
2300 case '\t':
2301 case '\n':
2302 case '\f':
2303 case ' ':
2304 return NEXT_CHAR;
2305 case '>':
2306 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2307 emit_doctype(parser, output);
2308 return RETURN_SUCCESS;
2309 case -1:
2310 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2311 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2312 tokenizer->_doc_type_state.force_quirks = true;
2313 emit_doctype(parser, output);
2314 return RETURN_ERROR;
2315 default:
2316 if (utf8iterator_maybe_consume_match(
2317 &tokenizer->_input, "PUBLIC", sizeof("PUBLIC") - 1, false)) {
2318 gumbo_tokenizer_set_state(
2319 parser, GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_KEYWORD);
2320 tokenizer->_reconsume_current_input = true;
2321 } else if (utf8iterator_maybe_consume_match(&tokenizer->_input, "SYSTEM",
2322 sizeof("SYSTEM") - 1, false)) {
2323 gumbo_tokenizer_set_state(
2324 parser, GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_KEYWORD);
2325 tokenizer->_reconsume_current_input = true;
2326 } else {
2327 tokenizer_add_parse_error(
2328 parser, GUMBO_ERR_DOCTYPE_SPACE_OR_RIGHT_BRACKET);
2329 gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2330 tokenizer->_doc_type_state.force_quirks = true;
2331 }
2332 return NEXT_CHAR;
2333 }
2334 }
2335
2336 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-public-keyword-state
handle_after_doctype_public_keyword_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2337 static StateResult handle_after_doctype_public_keyword_state(
2338 GumboParser* parser, GumboTokenizerState* tokenizer, int c,
2339 GumboToken* output) {
2340 switch (c) {
2341 case '\t':
2342 case '\n':
2343 case '\f':
2344 case ' ':
2345 gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_PUBLIC_ID);
2346 return NEXT_CHAR;
2347 case '"':
2348 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2349 assert(temporary_buffer_equals(parser, ""));
2350 gumbo_tokenizer_set_state(
2351 parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED);
2352 return NEXT_CHAR;
2353 case '\'':
2354 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2355 assert(temporary_buffer_equals(parser, ""));
2356 gumbo_tokenizer_set_state(
2357 parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_SINGLE_QUOTED);
2358 return NEXT_CHAR;
2359 case '>':
2360 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_RIGHT_BRACKET);
2361 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2362 tokenizer->_doc_type_state.force_quirks = true;
2363 emit_doctype(parser, output);
2364 return RETURN_ERROR;
2365 case -1:
2366 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2367 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2368 tokenizer->_doc_type_state.force_quirks = true;
2369 emit_doctype(parser, output);
2370 return RETURN_ERROR;
2371 default:
2372 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2373 gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2374 tokenizer->_doc_type_state.force_quirks = true;
2375 emit_doctype(parser, output);
2376 return RETURN_ERROR;
2377 }
2378 }
2379
2380 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#before-doctype-public-identifier-state
handle_before_doctype_public_id_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2381 static StateResult handle_before_doctype_public_id_state(GumboParser* parser,
2382 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2383 switch (c) {
2384 case '\t':
2385 case '\n':
2386 case '\f':
2387 case ' ':
2388 return NEXT_CHAR;
2389 case '"':
2390 assert(temporary_buffer_equals(parser, ""));
2391 gumbo_tokenizer_set_state(
2392 parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED);
2393 return NEXT_CHAR;
2394 case '\'':
2395 assert(temporary_buffer_equals(parser, ""));
2396 gumbo_tokenizer_set_state(
2397 parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_SINGLE_QUOTED);
2398 return NEXT_CHAR;
2399 case '>':
2400 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2401 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2402 tokenizer->_doc_type_state.force_quirks = true;
2403 emit_doctype(parser, output);
2404 return RETURN_ERROR;
2405 case -1:
2406 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2407 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2408 tokenizer->_doc_type_state.force_quirks = true;
2409 emit_doctype(parser, output);
2410 return RETURN_ERROR;
2411 default:
2412 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2413 gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2414 tokenizer->_doc_type_state.force_quirks = true;
2415 emit_doctype(parser, output);
2416 return RETURN_ERROR;
2417 }
2418 }
2419
2420 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#doctype-public-identifier-(double-quoted)-state
handle_doctype_public_id_double_quoted_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2421 static StateResult handle_doctype_public_id_double_quoted_state(
2422 GumboParser* parser, GumboTokenizerState* tokenizer, int c,
2423 GumboToken* output) {
2424 switch (c) {
2425 case '"':
2426 gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID);
2427 finish_doctype_public_id(parser);
2428 return NEXT_CHAR;
2429 case '\0':
2430 tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2431 append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2432 return NEXT_CHAR;
2433 case '>':
2434 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2435 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2436 tokenizer->_doc_type_state.force_quirks = true;
2437 finish_doctype_public_id(parser);
2438 emit_doctype(parser, output);
2439 return RETURN_ERROR;
2440 case -1:
2441 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2442 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2443 tokenizer->_doc_type_state.force_quirks = true;
2444 finish_doctype_public_id(parser);
2445 emit_doctype(parser, output);
2446 return RETURN_ERROR;
2447 default:
2448 append_char_to_temporary_buffer(parser, c);
2449 return NEXT_CHAR;
2450 }
2451 }
2452
2453 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#doctype-public-identifier-(single-quoted)-state
handle_doctype_public_id_single_quoted_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2454 static StateResult handle_doctype_public_id_single_quoted_state(
2455 GumboParser* parser, GumboTokenizerState* tokenizer, int c,
2456 GumboToken* output) {
2457 switch (c) {
2458 case '\'':
2459 gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID);
2460 finish_doctype_public_id(parser);
2461 return NEXT_CHAR;
2462 case '\0':
2463 tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2464 append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2465 return NEXT_CHAR;
2466 case '>':
2467 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2468 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2469 tokenizer->_doc_type_state.force_quirks = true;
2470 finish_doctype_public_id(parser);
2471 emit_doctype(parser, output);
2472 return RETURN_ERROR;
2473 case -1:
2474 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2475 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2476 tokenizer->_doc_type_state.force_quirks = true;
2477 finish_doctype_public_id(parser);
2478 emit_doctype(parser, output);
2479 return RETURN_ERROR;
2480 default:
2481 append_char_to_temporary_buffer(parser, c);
2482 return NEXT_CHAR;
2483 }
2484 }
2485
2486 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-public-identifier-state
handle_after_doctype_public_id_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2487 static StateResult handle_after_doctype_public_id_state(GumboParser* parser,
2488 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2489 switch (c) {
2490 case '\t':
2491 case '\n':
2492 case '\f':
2493 case ' ':
2494 gumbo_tokenizer_set_state(
2495 parser, GUMBO_LEX_BETWEEN_DOCTYPE_PUBLIC_SYSTEM_ID);
2496 return NEXT_CHAR;
2497 case '>':
2498 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2499 emit_doctype(parser, output);
2500 return RETURN_SUCCESS;
2501 case '"':
2502 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2503 assert(temporary_buffer_equals(parser, ""));
2504 gumbo_tokenizer_set_state(
2505 parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED);
2506 return NEXT_CHAR;
2507 case '\'':
2508 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2509 assert(temporary_buffer_equals(parser, ""));
2510 gumbo_tokenizer_set_state(
2511 parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
2512 return NEXT_CHAR;
2513 case -1:
2514 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2515 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2516 tokenizer->_reconsume_current_input = true;
2517 tokenizer->_doc_type_state.force_quirks = true;
2518 emit_doctype(parser, output);
2519 return RETURN_ERROR;
2520 default:
2521 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2522 gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2523 tokenizer->_doc_type_state.force_quirks = true;
2524 return NEXT_CHAR;
2525 }
2526 }
2527
2528 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#between-doctype-public-and-system-identifiers-state
handle_between_doctype_public_system_id_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2529 static StateResult handle_between_doctype_public_system_id_state(
2530 GumboParser* parser, GumboTokenizerState* tokenizer, int c,
2531 GumboToken* output) {
2532 switch (c) {
2533 case '\t':
2534 case '\n':
2535 case '\f':
2536 case ' ':
2537 return NEXT_CHAR;
2538 case '>':
2539 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2540 emit_doctype(parser, output);
2541 return RETURN_SUCCESS;
2542 case '"':
2543 assert(temporary_buffer_equals(parser, ""));
2544 gumbo_tokenizer_set_state(
2545 parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED);
2546 return NEXT_CHAR;
2547 case '\'':
2548 assert(temporary_buffer_equals(parser, ""));
2549 gumbo_tokenizer_set_state(
2550 parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
2551 return NEXT_CHAR;
2552 case -1:
2553 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2554 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2555 tokenizer->_doc_type_state.force_quirks = true;
2556 emit_doctype(parser, output);
2557 return RETURN_ERROR;
2558 default:
2559 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2560 gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2561 tokenizer->_doc_type_state.force_quirks = true;
2562 emit_doctype(parser, output);
2563 return RETURN_ERROR;
2564 }
2565 }
2566
2567 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-system-keyword-state
handle_after_doctype_system_keyword_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2568 static StateResult handle_after_doctype_system_keyword_state(
2569 GumboParser* parser, GumboTokenizerState* tokenizer, int c,
2570 GumboToken* output) {
2571 switch (c) {
2572 case '\t':
2573 case '\n':
2574 case '\f':
2575 case ' ':
2576 gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_SYSTEM_ID);
2577 return NEXT_CHAR;
2578 case '"':
2579 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2580 assert(temporary_buffer_equals(parser, ""));
2581 gumbo_tokenizer_set_state(
2582 parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED);
2583 return NEXT_CHAR;
2584 case '\'':
2585 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2586 assert(temporary_buffer_equals(parser, ""));
2587 gumbo_tokenizer_set_state(
2588 parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
2589 return NEXT_CHAR;
2590 case '>':
2591 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2592 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2593 tokenizer->_doc_type_state.force_quirks = true;
2594 emit_doctype(parser, output);
2595 return RETURN_ERROR;
2596 case -1:
2597 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2598 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2599 tokenizer->_doc_type_state.force_quirks = true;
2600 emit_doctype(parser, output);
2601 return RETURN_ERROR;
2602 default:
2603 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2604 gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2605 tokenizer->_doc_type_state.force_quirks = true;
2606 return NEXT_CHAR;
2607 }
2608 }
2609
2610 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#before-doctype-system-identifier-state
handle_before_doctype_system_id_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2611 static StateResult handle_before_doctype_system_id_state(GumboParser* parser,
2612 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2613 switch (c) {
2614 case '\t':
2615 case '\n':
2616 case '\f':
2617 case ' ':
2618 return NEXT_CHAR;
2619 case '"':
2620 assert(temporary_buffer_equals(parser, ""));
2621 gumbo_tokenizer_set_state(
2622 parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED);
2623 return NEXT_CHAR;
2624 case '\'':
2625 assert(temporary_buffer_equals(parser, ""));
2626 gumbo_tokenizer_set_state(
2627 parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
2628 return NEXT_CHAR;
2629 case '>':
2630 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2631 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2632 tokenizer->_doc_type_state.force_quirks = true;
2633 emit_doctype(parser, output);
2634 return RETURN_ERROR;
2635 case -1:
2636 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2637 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2638 tokenizer->_doc_type_state.force_quirks = true;
2639 emit_doctype(parser, output);
2640 return RETURN_ERROR;
2641 default:
2642 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2643 gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2644 tokenizer->_doc_type_state.force_quirks = true;
2645 return NEXT_CHAR;
2646 }
2647 }
2648
2649 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#doctype-system-identifier-(double-quoted)-state
handle_doctype_system_id_double_quoted_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2650 static StateResult handle_doctype_system_id_double_quoted_state(
2651 GumboParser* parser, GumboTokenizerState* tokenizer, int c,
2652 GumboToken* output) {
2653 switch (c) {
2654 case '"':
2655 gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID);
2656 finish_doctype_system_id(parser);
2657 return NEXT_CHAR;
2658 case '\0':
2659 tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2660 append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2661 return NEXT_CHAR;
2662 case '>':
2663 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2664 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2665 tokenizer->_doc_type_state.force_quirks = true;
2666 finish_doctype_system_id(parser);
2667 emit_doctype(parser, output);
2668 return RETURN_ERROR;
2669 case -1:
2670 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2671 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2672 tokenizer->_doc_type_state.force_quirks = true;
2673 finish_doctype_system_id(parser);
2674 emit_doctype(parser, output);
2675 return RETURN_ERROR;
2676 default:
2677 append_char_to_temporary_buffer(parser, c);
2678 return NEXT_CHAR;
2679 }
2680 }
2681
2682 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#doctype-system-identifier-(single-quoted)-state
handle_doctype_system_id_single_quoted_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2683 static StateResult handle_doctype_system_id_single_quoted_state(
2684 GumboParser* parser, GumboTokenizerState* tokenizer, int c,
2685 GumboToken* output) {
2686 switch (c) {
2687 case '\'':
2688 gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID);
2689 finish_doctype_system_id(parser);
2690 return NEXT_CHAR;
2691 case '\0':
2692 tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2693 append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2694 return NEXT_CHAR;
2695 case '>':
2696 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2697 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2698 tokenizer->_doc_type_state.force_quirks = true;
2699 finish_doctype_system_id(parser);
2700 emit_doctype(parser, output);
2701 return RETURN_ERROR;
2702 case -1:
2703 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2704 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2705 tokenizer->_doc_type_state.force_quirks = true;
2706 finish_doctype_system_id(parser);
2707 emit_doctype(parser, output);
2708 return RETURN_ERROR;
2709 default:
2710 append_char_to_temporary_buffer(parser, c);
2711 return NEXT_CHAR;
2712 }
2713 }
2714
2715 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-system-identifier-state
handle_after_doctype_system_id_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2716 static StateResult handle_after_doctype_system_id_state(GumboParser* parser,
2717 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2718 switch (c) {
2719 case '\t':
2720 case '\n':
2721 case '\f':
2722 case ' ':
2723 return NEXT_CHAR;
2724 case '>':
2725 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2726 emit_doctype(parser, output);
2727 return RETURN_SUCCESS;
2728 case -1:
2729 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2730 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2731 tokenizer->_doc_type_state.force_quirks = true;
2732 emit_doctype(parser, output);
2733 return RETURN_ERROR;
2734 default:
2735 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2736 gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2737 return NEXT_CHAR;
2738 }
2739 }
2740
2741 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#bogus-doctype-state
handle_bogus_doctype_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2742 static StateResult handle_bogus_doctype_state(GumboParser* parser,
2743 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2744 if (c == '>' || c == -1) {
2745 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2746 emit_doctype(parser, output);
2747 return RETURN_ERROR;
2748 }
2749 return NEXT_CHAR;
2750 }
2751
2752 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#cdata-section-state
handle_cdata_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2753 static StateResult handle_cdata_state(GumboParser* parser,
2754 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2755 if (c == -1 || utf8iterator_maybe_consume_match(
2756 &tokenizer->_input, "]]>", sizeof("]]>") - 1, true)) {
2757 tokenizer->_reconsume_current_input = true;
2758 reset_token_start_point(tokenizer);
2759 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2760 tokenizer->_is_in_cdata = false;
2761 return NEXT_CHAR;
2762 } else {
2763 return emit_current_char(parser, output);
2764 }
2765 }
2766
2767 typedef StateResult (*GumboLexerStateFunction)(
2768 GumboParser*, GumboTokenizerState*, int, GumboToken*);
2769
2770 static GumboLexerStateFunction dispatch_table[] = {handle_data_state,
2771 handle_char_ref_in_data_state, handle_rcdata_state,
2772 handle_char_ref_in_rcdata_state, handle_rawtext_state, handle_script_state,
2773 handle_plaintext_state, handle_tag_open_state, handle_end_tag_open_state,
2774 handle_tag_name_state, handle_rcdata_lt_state,
2775 handle_rcdata_end_tag_open_state, handle_rcdata_end_tag_name_state,
2776 handle_rawtext_lt_state, handle_rawtext_end_tag_open_state,
2777 handle_rawtext_end_tag_name_state, handle_script_lt_state,
2778 handle_script_end_tag_open_state, handle_script_end_tag_name_state,
2779 handle_script_escaped_start_state, handle_script_escaped_start_dash_state,
2780 handle_script_escaped_state, handle_script_escaped_dash_state,
2781 handle_script_escaped_dash_dash_state, handle_script_escaped_lt_state,
2782 handle_script_escaped_end_tag_open_state,
2783 handle_script_escaped_end_tag_name_state,
2784 handle_script_double_escaped_start_state,
2785 handle_script_double_escaped_state, handle_script_double_escaped_dash_state,
2786 handle_script_double_escaped_dash_dash_state,
2787 handle_script_double_escaped_lt_state,
2788 handle_script_double_escaped_end_state, handle_before_attr_name_state,
2789 handle_attr_name_state, handle_after_attr_name_state,
2790 handle_before_attr_value_state, handle_attr_value_double_quoted_state,
2791 handle_attr_value_single_quoted_state, handle_attr_value_unquoted_state,
2792 handle_char_ref_in_attr_value_state, handle_after_attr_value_quoted_state,
2793 handle_self_closing_start_tag_state, handle_bogus_comment_state,
2794 handle_markup_declaration_state, handle_comment_start_state,
2795 handle_comment_start_dash_state, handle_comment_state,
2796 handle_comment_end_dash_state, handle_comment_end_state,
2797 handle_comment_end_bang_state, handle_doctype_state,
2798 handle_before_doctype_name_state, handle_doctype_name_state,
2799 handle_after_doctype_name_state, handle_after_doctype_public_keyword_state,
2800 handle_before_doctype_public_id_state,
2801 handle_doctype_public_id_double_quoted_state,
2802 handle_doctype_public_id_single_quoted_state,
2803 handle_after_doctype_public_id_state,
2804 handle_between_doctype_public_system_id_state,
2805 handle_after_doctype_system_keyword_state,
2806 handle_before_doctype_system_id_state,
2807 handle_doctype_system_id_double_quoted_state,
2808 handle_doctype_system_id_single_quoted_state,
2809 handle_after_doctype_system_id_state, handle_bogus_doctype_state,
2810 handle_cdata_state};
2811
// Produces the next token in *output.  Returns true when the token was
// produced without a parse error (a handler returned RETURN_SUCCESS, or a
// buffered character / temporary-buffer character was emitted) and false when
// a handler returned RETURN_ERROR.
bool gumbo_lex(GumboParser* parser, GumboToken* output) {
  // The spec requires that
  //
  //   1. tokens are handled by the parser as soon as they are emitted,
  //   2. some states (e.g. CDATA, or various error conditions) emit several
  //      tokens from the same state, and
  //   3. the tokenizer frequently reconsumes a character in a different
  //      state.
  //
  // Consequently all lexer state lives in the GumboTokenizer struct rather
  // than in locals of this function, so we can return with a token and later
  // resume in the same state on the same input.  The emit_* functions are
  // responsible for changing state (e.g. flushing the chardata buffer,
  // reading the next input character) so that this does not loop forever.
  GumboTokenizerState* tokenizer = parser->_tokenizer_state;

  // A character buffered by an earlier call is emitted first.
  if (tokenizer->_buffered_emit_char != kGumboNoChar) {
    // Set the reconsume flag around the emit so the input is not advanced,
    // then clear it again so the *next* character is not consumed twice.
    tokenizer->_reconsume_current_input = true;
    emit_char(parser, tokenizer->_buffered_emit_char, output);
    tokenizer->_reconsume_current_input = false;
    tokenizer->_buffered_emit_char = kGumboNoChar;
    return true;
  }

  if (maybe_emit_from_temporary_buffer(parser, output)) {
    return true;
  }

  for (;;) {
    assert(!tokenizer->_temporary_buffer_emit);
    assert(tokenizer->_buffered_emit_char == kGumboNoChar);

    const int current = utf8iterator_current(&tokenizer->_input);
    gumbo_debug("Lexing character '%c' (%d) in state %d.\n", current, current,
        tokenizer->_state);

    GumboLexerStateFunction handler = dispatch_table[tokenizer->_state];
    const StateResult outcome = handler(parser, tokenizer, current, output);

    // The reconsume flag must be cleared before any return, otherwise
    // certain states could loop forever on the next call.
    const bool reconsume = tokenizer->_reconsume_current_input;
    tokenizer->_reconsume_current_input = false;

    if (outcome == RETURN_SUCCESS) {
      return true;
    }
    if (outcome == RETURN_ERROR) {
      return false;
    }

    // The handler asked for more input: advance unless it requested that the
    // current character be reconsumed.
    if (!reconsume) {
      utf8iterator_next(&tokenizer->_input);
    }
  }
}
2868
// Releases the memory referenced by a token's payload through the parser's
// deallocator. The GumboToken struct itself is not freed. A null token is a
// no-op, and token types other than the three handled below are left
// untouched.
void gumbo_token_destroy(GumboParser* parser, GumboToken* token) {
  if (!token) {
    return;
  }

  switch (token->type) {
    case GUMBO_TOKEN_COMMENT:
      gumbo_parser_deallocate(parser, (void*) token->v.text);
      break;

    case GUMBO_TOKEN_DOCTYPE:
      // Name plus both identifiers.
      gumbo_parser_deallocate(parser, (void*) token->v.doc_type.name);
      gumbo_parser_deallocate(
          parser, (void*) token->v.doc_type.public_identifier);
      gumbo_parser_deallocate(
          parser, (void*) token->v.doc_type.system_identifier);
      break;

    case GUMBO_TOKEN_START_TAG: {
      // Destroy each attribute first, then the array that held them.
      unsigned int idx;
      for (idx = 0; idx < token->v.start_tag.attributes.length; ++idx) {
        GumboAttribute* attribute = token->v.start_tag.attributes.data[idx];
        if (!attribute) {
          // Slot may have been nulled out if this token was merged with
          // another.
          continue;
        }
        gumbo_destroy_attribute(parser, attribute);
      }
      gumbo_parser_deallocate(
          parser, (void*) token->v.start_tag.attributes.data);
      break;
    }

    default:
      break;
  }
}
2898