1 // Copyright 2010 Google Inc. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 //
15 // Author: jdtang@google.com (Jonathan Tang)
16 //
17 // Coding conventions specific to this file:
18 //
19 // 1. Functions that fill in a token should be named emit_*, and should be
20 // followed immediately by a return from the tokenizer (true if no error
21 // occurred, false if an error occurred). Sometimes the emit functions
22 // themselves return a boolean so that they can be combined with the return
23 // statement; in this case, they should match this convention.
24 // 2. Functions that shuffle data from temporaries to final API structures
25 // should be named finish_*, and be called just before the tokenizer exits the
26 // state that accumulates the temporary.
27 // 3. All internal data structures should be kept in an initialized state from
28 // tokenizer creation onwards, ready to accept input. When a buffer's flushed
29 // and reset, it should be deallocated and immediately reinitialized.
30 // 4. Make sure there are appropriate break statements following each state.
31 // 5. Assertions on the state of the temporary and tag buffers are usually a
32 // good idea, and should go at the entry point of each state when added.
// 6. Statement order within states goes:
//    1. Add parse errors, if appropriate.
//    2. Call finish_* functions to build up tag state.
//    3. Switch to new state. Set _reconsume flag if appropriate.
//    4. Perform any other temporary buffer manipulation.
//    5. Emit tokens.
//    6. Return/break.
40 // This order ensures that we can verify that every emit is followed by a
41 // return, ensures that the correct state is recorded with any parse errors, and
42 // prevents parse error position from being messed up by possible mark/resets in
43 // temporary buffer manipulation.
44
45 #include "tokenizer.h"
46
47 #include <assert.h>
48 #include <stdbool.h>
49 #include <string.h>
50
51 #include "attribute.h"
52 #include "char_ref.h"
53 #include "error.h"
54 #include "gumbo.h"
55 #include "parser.h"
56 #include "string_buffer.h"
57 #include "string_piece.h"
58 #include "token_type.h"
59 #include "tokenizer_states.h"
60 #include "utf8.h"
61 #include "util.h"
62 #include "vector.h"
63
// Evaluates its argument in a void context so that a variable which is only
// used in some build configurations does not trigger an unused-variable
// warning in the others.
#define AVOID_UNUSED_VARIABLE_WARNING(i) ((void) (i))
65
66 // Compared against _script_data_buffer to determine if we're in double-escaped
67 // script mode.
68 const GumboStringPiece kScriptTag = {"script", 6};
69
// The outcome reported by each individual state handler, telling the main
// lexer loop what to do next.
typedef enum {
  // Return false (error) from the tokenizer.
  RETURN_ERROR = 0,
  // Return true (success) from the tokenizer.
  RETURN_SUCCESS = 1,
  // Proceed to the next character and continue lexing.
  NEXT_CHAR = 2
} StateResult;
76
// State necessary to build up a tag token (start tag or end tag), character by
// character. One instance is embedded in GumboTokenizerState as _tag_state and
// is re-initialized by start_new_tag for each tag.
typedef struct GumboInternalTagState {
  // A buffer to accumulate characters for various GumboStringPiece fields.
  // Characters are added through append_char_to_tag_buffer, and the buffer is
  // (re-)initialized by initialize_tag_buffer.
  GumboStringBuffer _buffer;

  // A pointer to the start of the original text corresponding to the contents
  // of the buffer. Updated by reset_tag_buffer_start_point.
  const char* _original_text;

  // The current tag enum, computed once the tag name state has finished so that
  // the buffer can be re-used for building up attributes.
  GumboTag _tag;

  // The starting location of the text in the buffer. Updated together with
  // _original_text by reset_tag_buffer_start_point.
  GumboSourcePosition _start_pos;

  // The current list of attributes. This is copied (and ownership of its data
  // transferred) to the GumboStartTag token upon completion of the tag. New
  // attributes are added as soon as their attribute name state is complete, and
  // values are filled in by operating on
  // _attributes.data[_attributes.length - 1]. For end tags (and abandoned
  // tags) ownership is not transferred, so the attributes and the vector data
  // are destroyed by the tokenizer itself.
  GumboVector /* GumboAttribute */ _attributes;

  // If true, the next attribute value to be finished should be dropped. This
  // happens if a duplicate attribute name is encountered - we want to consume
  // the attribute value, but shouldn't overwrite the existing value.
  bool _drop_next_attr_value;

  // The state that caused the tokenizer to switch into a character reference in
  // attribute value state. This is used to set the additional allowed
  // character, and is switched back to on completion. Initialized as the
  // tokenizer enters the character reference state.
  GumboTokenizerEnum _attr_value_state;

  // The last start tag to have been emitted by the tokenizer. This is
  // necessary to check for appropriate end tags. Recorded by emit_current_tag
  // whenever a start tag token is emitted.
  GumboTag _last_start_tag;

  // If true, then this is a start tag. If false, it's an end tag. This is
  // necessary to generate the appropriate token type at tag-closing time.
  bool _is_start_tag;

  // If true, then this tag is "self-closing" and doesn't have an end tag.
  bool _is_self_closing;
} GumboTagState;
122
// The main tokenizer state struct, containing all state used while tokenizing
// the input stream. It is reached through parser->_tokenizer_state.
typedef struct GumboInternalTokenizerState {
  // The current lexer state. Starts in GUMBO_LEX_DATA.
  GumboTokenizerEnum _state;

  // A flag indicating whether the current input character needs to be
  // reconsumed in another state, or whether the next input character should be
  // read for the next iteration of the state loop. This is set when the spec
  // reads "Reconsume the current input character in..."
  bool _reconsume_current_input;

  // A flag indicating whether the current node is a foreign element. This is
  // set by gumbo_tokenizer_set_is_current_node_foreign and checked in the
  // markup declaration state.
  bool _is_current_node_foreign;

  // A flag indicating whether the tokenizer is in a CDATA section. If so, then
  // text tokens emitted will be GUMBO_TOKEN_CDATA.
  bool _is_in_cdata;

  // Certain states (notably character references) may emit two character tokens
  // at once, but the contract for lex() fills in only one token at a time. The
  // extra character is buffered here, and then this is checked on entry to
  // lex(). If a character is stored here, it's immediately emitted and control
  // returns from the lexer. kGumboNoChar is used to represent 'no character
  // stored.'
  //
  // Note that characters emitted through this mechanism will have their source
  // position marked as the character under the mark, i.e. multiple characters
  // may be emitted with the same position. This is desirable for character
  // references, but unsuitable for many other cases. Use the _temporary_buffer
  // mechanism if the buffered characters must have their original positions in
  // the document.
  int _buffered_emit_char;

  // A temporary buffer to accumulate characters, as described by the "temporary
  // buffer" phrase in the tokenizer spec. We use this in a somewhat unorthodox
  // way: we record the specific character to go into the buffer, which may
  // sometimes be a lowercased version of the actual input character. However,
  // we *also* use utf8iterator_mark() to record the position at tag start.
  // When we start flushing the temporary buffer, we set _temporary_buffer_emit
  // to the start of it, and then increment it for each call to the tokenizer.
  // We also call utf8iterator_reset(), and utf8iterator_next() through the
  // input stream, so that tokens emitted by emit_char have the correct position
  // and original text.
  GumboStringBuffer _temporary_buffer;

  // The current cursor position we're emitting from within
  // _temporary_buffer.data. NULL whenever we're not flushing the buffer.
  const char* _temporary_buffer_emit;

  // The temporary buffer is also used by the spec to check whether we should
  // enter the script data double escaped state, but we can't use the same
  // buffer for both because we have to flush out "<s" as emits while still
  // maintaining the context that will eventually become "script". This is a
  // separate buffer that's used in place of the temporary buffer for states
  // that may enter the script data double escape start state.
  GumboStringBuffer _script_data_buffer;

  // Pointer to the beginning of the current token in the original buffer; used
  // to record the original text.
  const char* _token_start;

  // GumboSourcePosition recording the source location of the start of the
  // current token.
  GumboSourcePosition _token_start_pos;

  // Current tag state.
  GumboTagState _tag_state;

  // Doctype state. We use the temporary buffer to accumulate characters (it's
  // not used for anything else in the doctype states), and then freshly
  // allocate the strings in the doctype token, then copy it over on emit.
  GumboTokenDocType _doc_type_state;

  // The UTF8Iterator over the tokenizer input.
  Utf8Iterator _input;
} GumboTokenizerState;
202
203 // Adds an ERR_UNEXPECTED_CODE_POINT parse error to the parser's error struct.
tokenizer_add_parse_error(GumboParser * parser,GumboErrorType type)204 static void tokenizer_add_parse_error(
205 GumboParser* parser, GumboErrorType type) {
206 GumboError* error = gumbo_add_error(parser);
207 if (!error) {
208 return;
209 }
210 GumboTokenizerState* tokenizer = parser->_tokenizer_state;
211 utf8iterator_get_position(&tokenizer->_input, &error->position);
212 error->original_text = utf8iterator_get_char_pointer(&tokenizer->_input);
213 error->type = type;
214 error->v.tokenizer.codepoint = utf8iterator_current(&tokenizer->_input);
215 switch (tokenizer->_state) {
216 case GUMBO_LEX_DATA:
217 error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_DATA;
218 break;
219 case GUMBO_LEX_CHAR_REF_IN_DATA:
220 case GUMBO_LEX_CHAR_REF_IN_RCDATA:
221 case GUMBO_LEX_CHAR_REF_IN_ATTR_VALUE:
222 error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_CHAR_REF;
223 break;
224 case GUMBO_LEX_RCDATA:
225 case GUMBO_LEX_RCDATA_LT:
226 case GUMBO_LEX_RCDATA_END_TAG_OPEN:
227 case GUMBO_LEX_RCDATA_END_TAG_NAME:
228 error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_RCDATA;
229 break;
230 case GUMBO_LEX_RAWTEXT:
231 case GUMBO_LEX_RAWTEXT_LT:
232 case GUMBO_LEX_RAWTEXT_END_TAG_OPEN:
233 case GUMBO_LEX_RAWTEXT_END_TAG_NAME:
234 error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_RAWTEXT;
235 break;
236 case GUMBO_LEX_PLAINTEXT:
237 error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_PLAINTEXT;
238 break;
239 case GUMBO_LEX_SCRIPT:
240 case GUMBO_LEX_SCRIPT_LT:
241 case GUMBO_LEX_SCRIPT_END_TAG_OPEN:
242 case GUMBO_LEX_SCRIPT_END_TAG_NAME:
243 case GUMBO_LEX_SCRIPT_ESCAPED_START:
244 case GUMBO_LEX_SCRIPT_ESCAPED_START_DASH:
245 case GUMBO_LEX_SCRIPT_ESCAPED:
246 case GUMBO_LEX_SCRIPT_ESCAPED_DASH:
247 case GUMBO_LEX_SCRIPT_ESCAPED_DASH_DASH:
248 case GUMBO_LEX_SCRIPT_ESCAPED_LT:
249 case GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_OPEN:
250 case GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_NAME:
251 case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_START:
252 case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED:
253 case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH:
254 case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH_DASH:
255 case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_LT:
256 case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_END:
257 error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_SCRIPT;
258 break;
259 case GUMBO_LEX_TAG_OPEN:
260 case GUMBO_LEX_END_TAG_OPEN:
261 case GUMBO_LEX_TAG_NAME:
262 case GUMBO_LEX_BEFORE_ATTR_NAME:
263 error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_TAG;
264 break;
265 case GUMBO_LEX_SELF_CLOSING_START_TAG:
266 error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_SELF_CLOSING_TAG;
267 break;
268 case GUMBO_LEX_ATTR_NAME:
269 case GUMBO_LEX_AFTER_ATTR_NAME:
270 case GUMBO_LEX_BEFORE_ATTR_VALUE:
271 error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_ATTR_NAME;
272 break;
273 case GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED:
274 case GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED:
275 case GUMBO_LEX_ATTR_VALUE_UNQUOTED:
276 case GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED:
277 error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_ATTR_VALUE;
278 break;
279 case GUMBO_LEX_BOGUS_COMMENT:
280 case GUMBO_LEX_COMMENT_START:
281 case GUMBO_LEX_COMMENT_START_DASH:
282 case GUMBO_LEX_COMMENT:
283 case GUMBO_LEX_COMMENT_END_DASH:
284 case GUMBO_LEX_COMMENT_END:
285 case GUMBO_LEX_COMMENT_END_BANG:
286 error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_COMMENT;
287 break;
288 case GUMBO_LEX_MARKUP_DECLARATION:
289 case GUMBO_LEX_DOCTYPE:
290 case GUMBO_LEX_BEFORE_DOCTYPE_NAME:
291 case GUMBO_LEX_DOCTYPE_NAME:
292 case GUMBO_LEX_AFTER_DOCTYPE_NAME:
293 case GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_KEYWORD:
294 case GUMBO_LEX_BEFORE_DOCTYPE_PUBLIC_ID:
295 case GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED:
296 case GUMBO_LEX_DOCTYPE_PUBLIC_ID_SINGLE_QUOTED:
297 case GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID:
298 case GUMBO_LEX_BETWEEN_DOCTYPE_PUBLIC_SYSTEM_ID:
299 case GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_KEYWORD:
300 case GUMBO_LEX_BEFORE_DOCTYPE_SYSTEM_ID:
301 case GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED:
302 case GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED:
303 case GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID:
304 case GUMBO_LEX_BOGUS_DOCTYPE:
305 error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_DOCTYPE;
306 break;
307 case GUMBO_LEX_CDATA:
308 error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_CDATA;
309 break;
310 }
311 }
312
get_char_token_type(bool is_in_cdata,int c)313 static GumboTokenType get_char_token_type(bool is_in_cdata, int c) {
314 if (is_in_cdata && c > 0) {
315 return GUMBO_TOKEN_CDATA;
316 }
317
318 switch (c) {
319 case '\t':
320 case '\n':
321 case '\r':
322 case '\f':
323 case ' ':
324 return GUMBO_TOKEN_WHITESPACE;
325 case 0:
326 gumbo_debug("Emitted null byte.\n");
327 return GUMBO_TOKEN_NULL;
328 case -1:
329 return GUMBO_TOKEN_EOF;
330 default:
331 return GUMBO_TOKEN_CHARACTER;
332 }
333 }
334
335 // Starts recording characters in the temporary buffer.
336 // Because this needs to reset the utf8iterator_mark to the beginning of the
337 // text that will eventually be emitted, it needs to be called a couple of
338 // states before the spec says "Set the temporary buffer to the empty string".
339 // In general, this should be called whenever there's a transition to a
340 // "less-than sign state". The initial < and possibly / then need to be
341 // appended to the temporary buffer, their presence needs to be accounted for in
342 // states that compare the temporary buffer against a literal value, and
343 // spec stanzas that say "emit a < and / character token along with a character
344 // token for each character in the temporary buffer" need to be adjusted to
345 // account for the presence of the < and / inside the temporary buffer.
clear_temporary_buffer(GumboParser * parser)346 static void clear_temporary_buffer(GumboParser* parser) {
347 GumboTokenizerState* tokenizer = parser->_tokenizer_state;
348 assert(!tokenizer->_temporary_buffer_emit);
349 utf8iterator_mark(&tokenizer->_input);
350 gumbo_string_buffer_clear(&tokenizer->_temporary_buffer);
351 // The temporary buffer and script data buffer are the same object in the
352 // spec, so the script data buffer should be cleared as well.
353 gumbo_string_buffer_clear(&tokenizer->_script_data_buffer);
354 }
355
356 // Appends a codepoint to the temporary buffer.
append_char_to_temporary_buffer(GumboParser * parser,int codepoint)357 static void append_char_to_temporary_buffer(
358 GumboParser* parser, int codepoint) {
359 gumbo_string_buffer_append_codepoint(
360 codepoint, &parser->_tokenizer_state->_temporary_buffer);
361 }
362
363 // Checks to see if the temporary buffer equals a certain string.
364 // Make sure this remains side-effect free; it's used in assertions.
365 #ifndef NDEBUG
temporary_buffer_equals(GumboParser * parser,const char * text)366 static bool temporary_buffer_equals(GumboParser* parser, const char* text) {
367 GumboStringBuffer* buffer = &parser->_tokenizer_state->_temporary_buffer;
368 // TODO(jdtang): See if the extra strlen is a performance problem, and replace
369 // it with an explicit sizeof(literal) if necessary. I don't think it will
370 // be, as this is only used in a couple of rare states.
371 size_t text_len = strlen(text);
372 return text_len == buffer->length &&
373 memcmp(buffer->data, text, text_len) == 0;
374 }
375 #endif
376
// Resets the tokenizer's pending doctype to its empty state: all flags false
// and all string pointers NULL. No memory is freed here.
static void doc_type_state_init(GumboParser* parser) {
  GumboTokenDocType* doctype = &parser->_tokenizer_state->_doc_type_state;

  doctype->force_quirks = false;
  doctype->has_public_identifier = false;
  doctype->has_system_identifier = false;

  // The strings start out NULL so that nothing is leaked if a doctype token is
  // never seen. Once a doctype token does show up they are replaced with
  // freshly-allocated empty strings, which gives client code a uniform
  // interface without null checks. Ownership passes to the doctype token when
  // it is emitted.
  doctype->name = NULL;
  doctype->public_identifier = NULL;
  doctype->system_identifier = NULL;
}
392
393 // Sets the token original_text and position to the current iterator position.
394 // This is necessary because [CDATA[ sections may include text that is ignored
395 // by the tokenizer.
reset_token_start_point(GumboTokenizerState * tokenizer)396 static void reset_token_start_point(GumboTokenizerState* tokenizer) {
397 tokenizer->_token_start = utf8iterator_get_char_pointer(&tokenizer->_input);
398 utf8iterator_get_position(&tokenizer->_input, &tokenizer->_token_start_pos);
399 }
400
401 // Sets the tag buffer original text and start point to the current iterator
402 // position. This is necessary because attribute names & values may have
403 // whitespace preceeding them, and so we can't assume that the actual token
404 // starting point was the end of the last tag buffer usage.
reset_tag_buffer_start_point(GumboParser * parser)405 static void reset_tag_buffer_start_point(GumboParser* parser) {
406 GumboTokenizerState* tokenizer = parser->_tokenizer_state;
407 GumboTagState* tag_state = &tokenizer->_tag_state;
408
409 utf8iterator_get_position(&tokenizer->_input, &tag_state->_start_pos);
410 tag_state->_original_text = utf8iterator_get_char_pointer(&tokenizer->_input);
411 }
412
413 // Moves the temporary buffer contents over to the specified output string,
414 // and clears the temporary buffer.
finish_temporary_buffer(GumboParser * parser,const char ** output)415 static void finish_temporary_buffer(GumboParser* parser, const char** output) {
416 GumboTokenizerState* tokenizer = parser->_tokenizer_state;
417 *output = gumbo_string_buffer_to_string(&tokenizer->_temporary_buffer);
418 clear_temporary_buffer(parser);
419 }
420
421 // Advances the iterator past the end of the token, and then fills in the
422 // relevant position fields. It's assumed that after every emit, the tokenizer
423 // will immediately return (letting the tree-construction stage read the filled
424 // in Token). Thus, it's safe to advance the input stream here, since it will
425 // bypass the advance at the bottom of the state machine loop.
426 //
427 // Since this advances the iterator and resets the current input, make sure to
428 // call it after you've recorded any other data you need for the token.
finish_token(GumboParser * parser,GumboToken * token)429 static void finish_token(GumboParser* parser, GumboToken* token) {
430 GumboTokenizerState* tokenizer = parser->_tokenizer_state;
431 if (!tokenizer->_reconsume_current_input) {
432 utf8iterator_next(&tokenizer->_input);
433 }
434
435 token->position = tokenizer->_token_start_pos;
436 token->original_text.data = tokenizer->_token_start;
437 reset_token_start_point(tokenizer);
438 token->original_text.length =
439 tokenizer->_token_start - token->original_text.data;
440 if (token->original_text.length > 0 &&
441 token->original_text.data[token->original_text.length - 1] == '\r') {
442 // The UTF8 iterator will ignore carriage returns in the input stream, which
443 // means that the next token may start one past a \r character. The pointer
444 // arithmetic above results in that \r being appended to the original text
445 // of the preceding token, so we have to adjust its length here to chop the
446 // \r off.
447 --token->original_text.length;
448 }
449 }
450
451 // Records the doctype public ID, assumed to be in the temporary buffer.
452 // Convenience method that also sets has_public_identifier to true.
finish_doctype_public_id(GumboParser * parser)453 static void finish_doctype_public_id(GumboParser* parser) {
454 GumboTokenDocType* doc_type_state =
455 &parser->_tokenizer_state->_doc_type_state;
456 gumbo_free((void*) doc_type_state->public_identifier);
457 finish_temporary_buffer(parser, &doc_type_state->public_identifier);
458 doc_type_state->has_public_identifier = true;
459 }
460
461 // Records the doctype system ID, assumed to be in the temporary buffer.
462 // Convenience method that also sets has_system_identifier to true.
finish_doctype_system_id(GumboParser * parser)463 static void finish_doctype_system_id(GumboParser* parser) {
464 GumboTokenDocType* doc_type_state =
465 &parser->_tokenizer_state->_doc_type_state;
466 gumbo_free((void*) doc_type_state->system_identifier);
467 finish_temporary_buffer(parser, &doc_type_state->system_identifier);
468 doc_type_state->has_system_identifier = true;
469 }
470
471 // Writes a single specified character to the output token.
emit_char(GumboParser * parser,int c,GumboToken * output)472 static void emit_char(GumboParser* parser, int c, GumboToken* output) {
473 output->type = get_char_token_type(parser->_tokenizer_state->_is_in_cdata, c);
474 output->v.character = c;
475 finish_token(parser, output);
476 }
477
478 // Writes a replacement character token and records a parse error.
479 // Always returns RETURN_ERROR, per gumbo_lex return value.
emit_replacement_char(GumboParser * parser,GumboToken * output)480 static StateResult emit_replacement_char(
481 GumboParser* parser, GumboToken* output) {
482 // In all cases, this is because of a null byte in the input stream.
483 tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
484 emit_char(parser, kUtf8ReplacementChar, output);
485 return RETURN_ERROR;
486 }
487
488 // Writes an EOF character token. Always returns RETURN_SUCCESS.
emit_eof(GumboParser * parser,GumboToken * output)489 static StateResult emit_eof(GumboParser* parser, GumboToken* output) {
490 emit_char(parser, -1, output);
491 return RETURN_SUCCESS;
492 }
493
494 // Writes the current input character out as a character token.
495 // Always returns RETURN_SUCCESS.
emit_current_char(GumboParser * parser,GumboToken * output)496 static bool emit_current_char(GumboParser* parser, GumboToken* output) {
497 emit_char(
498 parser, utf8iterator_current(&parser->_tokenizer_state->_input), output);
499 return RETURN_SUCCESS;
500 }
501
502 // Writes out a doctype token, copying it from the tokenizer state.
emit_doctype(GumboParser * parser,GumboToken * output)503 static void emit_doctype(GumboParser* parser, GumboToken* output) {
504 output->type = GUMBO_TOKEN_DOCTYPE;
505 output->v.doc_type = parser->_tokenizer_state->_doc_type_state;
506 finish_token(parser, output);
507 doc_type_state_init(parser);
508 }
509
510 // Debug-only function that explicitly sets the attribute vector data to NULL so
511 // it can be asserted on tag creation, verifying that there are no memory leaks.
mark_tag_state_as_empty(GumboTagState * tag_state)512 static void mark_tag_state_as_empty(GumboTagState* tag_state) {
513 #ifndef NDEBUG
514 tag_state->_attributes = kGumboEmptyVector;
515 #else
516 AVOID_UNUSED_VARIABLE_WARNING(tag_state);
517 #endif
518 }
519
520 // Writes out the current tag as a start or end tag token.
521 // Always returns RETURN_SUCCESS.
emit_current_tag(GumboParser * parser,GumboToken * output)522 static StateResult emit_current_tag(GumboParser* parser, GumboToken* output) {
523 GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
524 if (tag_state->_is_start_tag) {
525 output->type = GUMBO_TOKEN_START_TAG;
526 output->v.start_tag.tag = tag_state->_tag;
527 output->v.start_tag.attributes = tag_state->_attributes;
528 output->v.start_tag.is_self_closing = tag_state->_is_self_closing;
529 tag_state->_last_start_tag = tag_state->_tag;
530 mark_tag_state_as_empty(tag_state);
531 gumbo_debug(
532 "Emitted start tag %s.\n", gumbo_normalized_tagname(tag_state->_tag));
533 } else {
534 output->type = GUMBO_TOKEN_END_TAG;
535 output->v.end_tag = tag_state->_tag;
536 // In end tags, ownership of the attributes vector is not transferred to the
537 // token, but it's still initialized as normal, so it must be manually
538 // deallocated. There may also be attributes to destroy, in certain broken
539 // cases like </div</th> (the "th" is an attribute there).
540 for (unsigned int i = 0; i < tag_state->_attributes.length; ++i) {
541 gumbo_destroy_attribute(tag_state->_attributes.data[i]);
542 }
543 gumbo_free(tag_state->_attributes.data);
544 mark_tag_state_as_empty(tag_state);
545 gumbo_debug(
546 "Emitted end tag %s.\n", gumbo_normalized_tagname(tag_state->_tag));
547 }
548 gumbo_string_buffer_destroy(&tag_state->_buffer);
549 finish_token(parser, output);
550 gumbo_debug("Original text = %.*s.\n", output->original_text.length,
551 output->original_text.data);
552 assert(output->original_text.length >= 2);
553 assert(output->original_text.data[0] == '<');
554 assert(output->original_text.data[output->original_text.length - 1] == '>');
555 return RETURN_SUCCESS;
556 }
557
558 // In some states, we speculatively start a tag, but don't know whether it'll be
559 // emitted as tag token or as a series of character tokens until we finish it.
560 // We need to abandon the tag we'd started & free its memory in that case to
561 // avoid a memory leak.
abandon_current_tag(GumboParser * parser)562 static void abandon_current_tag(GumboParser* parser) {
563 GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
564 for (unsigned int i = 0; i < tag_state->_attributes.length; ++i) {
565 gumbo_destroy_attribute(tag_state->_attributes.data[i]);
566 }
567 gumbo_free(tag_state->_attributes.data);
568 mark_tag_state_as_empty(tag_state);
569 gumbo_string_buffer_destroy(&tag_state->_buffer);
570 gumbo_debug("Abandoning current tag.\n");
571 }
572
573 // Wraps the consume_char_ref function to handle its output and make the
574 // appropriate TokenizerState modifications. Returns RETURN_ERROR if a parse
575 // error occurred, RETURN_SUCCESS otherwise.
emit_char_ref(GumboParser * parser,int additional_allowed_char,GumboToken * output)576 static StateResult emit_char_ref(
577 GumboParser* parser, int additional_allowed_char, GumboToken* output) {
578 GumboTokenizerState* tokenizer = parser->_tokenizer_state;
579 OneOrTwoCodepoints char_ref;
580 bool status = consume_char_ref(
581 parser, &tokenizer->_input, additional_allowed_char, false, &char_ref);
582 if (char_ref.first != kGumboNoChar) {
583 // consume_char_ref ends with the iterator pointing at the next character,
584 // so we need to be sure not advance it again before reading the next token.
585 tokenizer->_reconsume_current_input = true;
586 emit_char(parser, char_ref.first, output);
587 tokenizer->_buffered_emit_char = char_ref.second;
588 } else {
589 emit_char(parser, '&', output);
590 }
591 return status ? RETURN_SUCCESS : RETURN_ERROR;
592 }
593
594 // Emits a comment token. Comments use the temporary buffer to accumulate their
595 // data, and then it's copied over and released to the 'text' field of the
596 // GumboToken union. Always returns RETURN_SUCCESS.
emit_comment(GumboParser * parser,GumboToken * output)597 static StateResult emit_comment(GumboParser* parser, GumboToken* output) {
598 output->type = GUMBO_TOKEN_COMMENT;
599 finish_temporary_buffer(parser, &output->v.text);
600 finish_token(parser, output);
601 return RETURN_SUCCESS;
602 }
603
604 // Checks to see we should be flushing accumulated characters in the temporary
605 // buffer, and fills the output token with the next output character if so.
606 // Returns true if a character has been emitted and the tokenizer should
607 // immediately return, false if we're at the end of the temporary buffer and
608 // should resume normal operation.
maybe_emit_from_temporary_buffer(GumboParser * parser,GumboToken * output)609 static bool maybe_emit_from_temporary_buffer(
610 GumboParser* parser, GumboToken* output) {
611 GumboTokenizerState* tokenizer = parser->_tokenizer_state;
612 const char* c = tokenizer->_temporary_buffer_emit;
613 GumboStringBuffer* buffer = &tokenizer->_temporary_buffer;
614
615 if (!c || c >= buffer->data + buffer->length) {
616 tokenizer->_temporary_buffer_emit = NULL;
617 return false;
618 }
619
620 assert(*c == utf8iterator_current(&tokenizer->_input));
621 // emit_char also advances the input stream. We need to do some juggling of
622 // the _reconsume_current_input flag to get the proper behavior when emitting
623 // previous tokens. Basically, _reconsume_current_input should *never* be set
624 // when emitting anything from the temporary buffer, since those characters
625 // have already been advanced past. However, it should be preserved so that
626 // when the *next* character is encountered again, the tokenizer knows not to
627 // advance past it.
628 bool saved_reconsume_state = tokenizer->_reconsume_current_input;
629 tokenizer->_reconsume_current_input = false;
630 emit_char(parser, *c, output);
631 ++tokenizer->_temporary_buffer_emit;
632 tokenizer->_reconsume_current_input = saved_reconsume_state;
633 return true;
634 }
635
636 // Sets up the tokenizer to begin flushing the temporary buffer.
637 // This resets the input iterator stream to the start of the last tag, sets up
638 // _temporary_buffer_emit, and then (if the temporary buffer is non-empty) emits
639 // the first character in it. It returns true if a character was emitted, false
640 // otherwise.
emit_temporary_buffer(GumboParser * parser,GumboToken * output)641 static bool emit_temporary_buffer(GumboParser* parser, GumboToken* output) {
642 GumboTokenizerState* tokenizer = parser->_tokenizer_state;
643 assert(tokenizer->_temporary_buffer.data);
644 utf8iterator_reset(&tokenizer->_input);
645 tokenizer->_temporary_buffer_emit = tokenizer->_temporary_buffer.data;
646 return maybe_emit_from_temporary_buffer(parser, output);
647 }
648
649 // Appends a codepoint to the current tag buffer. If
650 // reinitilize_position_on_first is set, this also initializes the tag buffer
651 // start point; the only time you would *not* want to pass true for this
652 // parameter is if you want the original_text to include character (like an
653 // opening quote) that doesn't appear in the value.
append_char_to_tag_buffer(GumboParser * parser,int codepoint,bool reinitilize_position_on_first)654 static void append_char_to_tag_buffer(
655 GumboParser* parser, int codepoint, bool reinitilize_position_on_first) {
656 GumboStringBuffer* buffer = &parser->_tokenizer_state->_tag_state._buffer;
657 if (buffer->length == 0 && reinitilize_position_on_first) {
658 reset_tag_buffer_start_point(parser);
659 }
660 gumbo_string_buffer_append_codepoint(codepoint, buffer);
661 }
662
663 // (Re-)initialize the tag buffer. This also resets the original_text pointer
664 // and _start_pos field to point to the current position.
initialize_tag_buffer(GumboParser * parser)665 static void initialize_tag_buffer(GumboParser* parser) {
666 GumboTokenizerState* tokenizer = parser->_tokenizer_state;
667 GumboTagState* tag_state = &tokenizer->_tag_state;
668
669 gumbo_string_buffer_init(&tag_state->_buffer);
670 reset_tag_buffer_start_point(parser);
671 }
672
673 // Initializes the tag_state to start a new tag, keeping track of the opening
674 // positions and original text. Takes a boolean indicating whether this is a
675 // start or end tag.
start_new_tag(GumboParser * parser,bool is_start_tag)676 static void start_new_tag(GumboParser* parser, bool is_start_tag) {
677 GumboTokenizerState* tokenizer = parser->_tokenizer_state;
678 GumboTagState* tag_state = &tokenizer->_tag_state;
679 int c = utf8iterator_current(&tokenizer->_input);
680 assert(gumbo_isalpha(c));
681 c = gumbo_tolower(c);
682 assert(gumbo_isalpha(c));
683
684 initialize_tag_buffer(parser);
685 gumbo_string_buffer_append_codepoint(c, &tag_state->_buffer);
686
687 assert(tag_state->_attributes.data == NULL);
688 // Initial size chosen by statistical analysis of a corpus of 60k webpages.
689 // 99.5% of elements have 0 attributes, 93% of the remainder have 1. These
690 // numbers are a bit higher for more modern websites (eg. ~45% = 0, ~40% = 1
691 // for the HTML5 Spec), but still have basically 99% of nodes with <= 2 attrs.
692 gumbo_vector_init(2, &tag_state->_attributes);
693 tag_state->_drop_next_attr_value = false;
694 tag_state->_is_start_tag = is_start_tag;
695 tag_state->_is_self_closing = false;
696 gumbo_debug("Starting new tag.\n");
697 }
698
699 // Fills in the specified char* with the contents of the tag buffer.
copy_over_tag_buffer(GumboParser * parser,const char ** output)700 static void copy_over_tag_buffer(GumboParser* parser, const char** output) {
701 GumboTokenizerState* tokenizer = parser->_tokenizer_state;
702 GumboTagState* tag_state = &tokenizer->_tag_state;
703 *output = gumbo_string_buffer_to_string(&tag_state->_buffer);
704 }
705
706 // Fills in:
707 // * The original_text GumboStringPiece with the portion of the original
708 // buffer that corresponds to the tag buffer.
709 // * The start_pos GumboSourcePosition with the start position of the tag
710 // buffer.
711 // * The end_pos GumboSourcePosition with the current source position.
copy_over_original_tag_text(GumboParser * parser,GumboStringPiece * original_text,GumboSourcePosition * start_pos,GumboSourcePosition * end_pos)712 static void copy_over_original_tag_text(GumboParser* parser,
713 GumboStringPiece* original_text, GumboSourcePosition* start_pos,
714 GumboSourcePosition* end_pos) {
715 GumboTokenizerState* tokenizer = parser->_tokenizer_state;
716 GumboTagState* tag_state = &tokenizer->_tag_state;
717
718 original_text->data = tag_state->_original_text;
719 original_text->length = utf8iterator_get_char_pointer(&tokenizer->_input) -
720 tag_state->_original_text;
721 if (original_text->data[original_text->length - 1] == '\r') {
722 // Since \r is skipped by the UTF-8 iterator, it can sometimes end up
723 // appended to the end of original text even when it's really the first part
724 // of the next character. If we detect this situation, shrink the length of
725 // the original text by 1 to remove the carriage return.
726 --original_text->length;
727 }
728 *start_pos = tag_state->_start_pos;
729 utf8iterator_get_position(&tokenizer->_input, end_pos);
730 }
731
732 // Releases and then re-initializes the tag buffer.
reinitialize_tag_buffer(GumboParser * parser)733 static void reinitialize_tag_buffer(GumboParser* parser) {
734 gumbo_free(parser->_tokenizer_state->_tag_state._buffer.data);
735 initialize_tag_buffer(parser);
736 }
737
738 // Moves some data from the temporary buffer over the the tag-based fields in
739 // TagState.
finish_tag_name(GumboParser * parser)740 static void finish_tag_name(GumboParser* parser) {
741 GumboTokenizerState* tokenizer = parser->_tokenizer_state;
742 GumboTagState* tag_state = &tokenizer->_tag_state;
743
744 tag_state->_tag =
745 gumbo_tagn_enum(tag_state->_buffer.data, tag_state->_buffer.length);
746 reinitialize_tag_buffer(parser);
747 }
748
749 // Adds an ERR_DUPLICATE_ATTR parse error to the parser's error struct.
add_duplicate_attr_error(GumboParser * parser,int original_index,int new_index)750 static void add_duplicate_attr_error(
751 GumboParser* parser, int original_index, int new_index) {
752 GumboError* error = gumbo_add_error(parser);
753 if (!error) {
754 return;
755 }
756 GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
757 error->type = GUMBO_ERR_DUPLICATE_ATTR;
758 error->position = tag_state->_start_pos;
759 error->original_text = tag_state->_original_text;
760 error->v.duplicate_attr.original_index = original_index;
761 error->v.duplicate_attr.new_index = new_index;
762 copy_over_tag_buffer(parser, &error->v.duplicate_attr.name);
763 reinitialize_tag_buffer(parser);
764 }
765
766 // Creates a new attribute in the current tag, copying the current tag buffer to
767 // the attribute's name. The attribute's value starts out as the empty string
768 // (following the "Boolean attributes" section of the spec) and is only
769 // overwritten on finish_attribute_value(). If the attribute has already been
770 // specified, the new attribute is dropped, a parse error is added, and the
771 // function returns false. Otherwise, this returns true.
finish_attribute_name(GumboParser * parser)772 static bool finish_attribute_name(GumboParser* parser) {
773 GumboTokenizerState* tokenizer = parser->_tokenizer_state;
774 GumboTagState* tag_state = &tokenizer->_tag_state;
775 // May've been set by a previous attribute without a value; reset it here.
776 tag_state->_drop_next_attr_value = false;
777 assert(tag_state->_attributes.data);
778 assert(tag_state->_attributes.capacity);
779
780 GumboVector* /* GumboAttribute* */ attributes = &tag_state->_attributes;
781 for (unsigned int i = 0; i < attributes->length; ++i) {
782 GumboAttribute* attr = attributes->data[i];
783 if (strlen(attr->name) == tag_state->_buffer.length &&
784 memcmp(attr->name, tag_state->_buffer.data,
785 tag_state->_buffer.length) == 0) {
786 // Identical attribute; bail.
787 add_duplicate_attr_error(parser, i, attributes->length);
788 tag_state->_drop_next_attr_value = true;
789 return false;
790 }
791 }
792
793 GumboAttribute* attr = gumbo_malloc(sizeof(GumboAttribute));
794 attr->attr_namespace = GUMBO_ATTR_NAMESPACE_NONE;
795 copy_over_tag_buffer(parser, &attr->name);
796 copy_over_original_tag_text(
797 parser, &attr->original_name, &attr->name_start, &attr->name_end);
798 attr->value = gumbo_strdup("");
799 copy_over_original_tag_text(
800 parser, &attr->original_value, &attr->name_start, &attr->name_end);
801 gumbo_vector_add(attr, attributes);
802 reinitialize_tag_buffer(parser);
803 return true;
804 }
805
806 // Finishes an attribute value. This sets the value of the most recently added
807 // attribute to the current contents of the tag buffer.
finish_attribute_value(GumboParser * parser)808 static void finish_attribute_value(GumboParser* parser) {
809 GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
810 if (tag_state->_drop_next_attr_value) {
811 // Duplicate attribute name detected in an earlier state, so we have to
812 // ignore the value.
813 tag_state->_drop_next_attr_value = false;
814 reinitialize_tag_buffer(parser);
815 return;
816 }
817
818 GumboAttribute* attr =
819 tag_state->_attributes.data[tag_state->_attributes.length - 1];
820 gumbo_free((void*) attr->value);
821 copy_over_tag_buffer(parser, &attr->value);
822 copy_over_original_tag_text(
823 parser, &attr->original_value, &attr->value_start, &attr->value_end);
824 reinitialize_tag_buffer(parser);
825 }
826
827 // Returns true if the current end tag matches the last start tag emitted.
is_appropriate_end_tag(GumboParser * parser)828 static bool is_appropriate_end_tag(GumboParser* parser) {
829 GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
830 assert(!tag_state->_is_start_tag);
831 return tag_state->_last_start_tag != GUMBO_TAG_LAST &&
832 tag_state->_last_start_tag == gumbo_tagn_enum(tag_state->_buffer.data,
833 tag_state->_buffer.length);
834 }
835
// Allocates the tokenizer state, attaches it to the parser, and puts it in
// its initial configuration: data state, no pending reconsume, empty
// temporary/script-data buffers, no recorded start tag, and the input
// iterator positioned over text[0 .. text_length).
void gumbo_tokenizer_state_init(
    GumboParser* parser, const char* text, size_t text_length) {
  GumboTokenizerState* state = gumbo_malloc(sizeof(GumboTokenizerState));
  // Must be attached before gumbo_tokenizer_set_state(), which goes through
  // parser->_tokenizer_state.
  parser->_tokenizer_state = state;
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);

  // Flags.
  state->_reconsume_current_input = false;
  state->_is_current_node_foreign = false;
  state->_is_in_cdata = false;
  state->_tag_state._last_start_tag = GUMBO_TAG_LAST;

  // Temporary buffer and its emit cursor.
  state->_buffered_emit_char = kGumboNoChar;
  gumbo_string_buffer_init(&state->_temporary_buffer);
  state->_temporary_buffer_emit = NULL;

  mark_tag_state_as_empty(&state->_tag_state);

  // Script data buffer, token start bookkeeping, and the input iterator.
  gumbo_string_buffer_init(&state->_script_data_buffer);
  state->_token_start = text;
  utf8iterator_init(parser, text, text_length, &state->_input);
  utf8iterator_get_position(&state->_input, &state->_token_start_pos);
  doc_type_state_init(parser);
}
858
// Tears down the tokenizer state created by gumbo_tokenizer_state_init():
// destroys the temporary and script-data buffers and frees the state itself.
// The doctype temporaries are expected to have been released already.
void gumbo_tokenizer_state_destroy(GumboParser* parser) {
  GumboTokenizerState* state = parser->_tokenizer_state;
  assert(state->_doc_type_state.name == NULL);
  assert(state->_doc_type_state.public_identifier == NULL);
  assert(state->_doc_type_state.system_identifier == NULL);
  gumbo_string_buffer_destroy(&state->_temporary_buffer);
  gumbo_string_buffer_destroy(&state->_script_data_buffer);
  gumbo_free(state);
}
868
// Switches the tokenizer's state machine to the given lexer state.
void gumbo_tokenizer_set_state(GumboParser* parser, GumboTokenizerEnum state) {
  GumboTokenizerState* tokenizer = parser->_tokenizer_state;
  tokenizer->_state = state;
}
872
// Records whether the current node is foreign. A debug message is logged
// only when the value actually changes.
void gumbo_tokenizer_set_is_current_node_foreign(
    GumboParser* parser, bool is_foreign) {
  GumboTokenizerState* tokenizer = parser->_tokenizer_state;
  if (tokenizer->_is_current_node_foreign != is_foreign) {
    gumbo_debug("Toggling is_current_node_foreign to %s.\n",
        is_foreign ? "true" : "false");
  }
  tokenizer->_is_current_node_foreign = is_foreign;
}
881
882 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#data-state
handle_data_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)883 static StateResult handle_data_state(GumboParser* parser,
884 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
885 switch (c) {
886 case '&':
887 gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_DATA);
888 // The char_ref machinery expects to be on the & so it can mark that
889 // and return to it if the text isn't a char ref, so we need to
890 // reconsume it.
891 tokenizer->_reconsume_current_input = true;
892 return NEXT_CHAR;
893 case '<':
894 gumbo_tokenizer_set_state(parser, GUMBO_LEX_TAG_OPEN);
895 clear_temporary_buffer(parser);
896 append_char_to_temporary_buffer(parser, '<');
897 return NEXT_CHAR;
898 case '\0':
899 tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
900 emit_char(parser, c, output);
901 return RETURN_ERROR;
902 default:
903 return emit_current_char(parser, output);
904 }
905 }
906
907 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#character-reference-in-data-state
handle_char_ref_in_data_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)908 static StateResult handle_char_ref_in_data_state(GumboParser* parser,
909 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
910 AVOID_UNUSED_VARIABLE_WARNING(tokenizer);
911 AVOID_UNUSED_VARIABLE_WARNING(c);
912 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
913 return emit_char_ref(parser, ' ', output);
914 }
915
916 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rcdata-state
handle_rcdata_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)917 static StateResult handle_rcdata_state(GumboParser* parser,
918 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
919 switch (c) {
920 case '&':
921 gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_RCDATA);
922 tokenizer->_reconsume_current_input = true;
923 return NEXT_CHAR;
924 case '<':
925 gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA_LT);
926 clear_temporary_buffer(parser);
927 append_char_to_temporary_buffer(parser, '<');
928 return NEXT_CHAR;
929 case '\0':
930 return emit_replacement_char(parser, output);
931 case -1:
932 return emit_eof(parser, output);
933 default:
934 return emit_current_char(parser, output);
935 }
936 }
937
938 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#character-reference-in-rcdata-state
handle_char_ref_in_rcdata_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)939 static StateResult handle_char_ref_in_rcdata_state(GumboParser* parser,
940 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
941 AVOID_UNUSED_VARIABLE_WARNING(tokenizer);
942 AVOID_UNUSED_VARIABLE_WARNING(c);
943 gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA);
944 return emit_char_ref(parser, ' ', output);
945 }
946
947 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rawtext-state
handle_rawtext_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)948 static StateResult handle_rawtext_state(GumboParser* parser,
949 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
950 AVOID_UNUSED_VARIABLE_WARNING(tokenizer);
951 switch (c) {
952 case '<':
953 gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT_LT);
954 clear_temporary_buffer(parser);
955 append_char_to_temporary_buffer(parser, '<');
956 return NEXT_CHAR;
957 case '\0':
958 return emit_replacement_char(parser, output);
959 case -1:
960 return emit_eof(parser, output);
961 default:
962 return emit_current_char(parser, output);
963 }
964 }
965
966 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-state
handle_script_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)967 static StateResult handle_script_state(GumboParser* parser,
968 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
969 AVOID_UNUSED_VARIABLE_WARNING(tokenizer);
970 switch (c) {
971 case '<':
972 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_LT);
973 clear_temporary_buffer(parser);
974 append_char_to_temporary_buffer(parser, '<');
975 return NEXT_CHAR;
976 case '\0':
977 return emit_replacement_char(parser, output);
978 case -1:
979 return emit_eof(parser, output);
980 default:
981 return emit_current_char(parser, output);
982 }
983 }
984
985 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#plaintext-state
handle_plaintext_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)986 static StateResult handle_plaintext_state(GumboParser* parser,
987 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
988 AVOID_UNUSED_VARIABLE_WARNING(tokenizer);
989 switch (c) {
990 case '\0':
991 return emit_replacement_char(parser, output);
992 case -1:
993 return emit_eof(parser, output);
994 default:
995 return emit_current_char(parser, output);
996 }
997 }
998
999 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#tag-open-state
handle_tag_open_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1000 static StateResult handle_tag_open_state(GumboParser* parser,
1001 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1002 AVOID_UNUSED_VARIABLE_WARNING(tokenizer);
1003 assert(temporary_buffer_equals(parser, "<"));
1004 switch (c) {
1005 case '!':
1006 gumbo_tokenizer_set_state(parser, GUMBO_LEX_MARKUP_DECLARATION);
1007 clear_temporary_buffer(parser);
1008 return NEXT_CHAR;
1009 case '/':
1010 gumbo_tokenizer_set_state(parser, GUMBO_LEX_END_TAG_OPEN);
1011 append_char_to_temporary_buffer(parser, '/');
1012 return NEXT_CHAR;
1013 case '?':
1014 gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_COMMENT);
1015 clear_temporary_buffer(parser);
1016 append_char_to_temporary_buffer(parser, '?');
1017 tokenizer_add_parse_error(parser, GUMBO_ERR_TAG_STARTS_WITH_QUESTION);
1018 return NEXT_CHAR;
1019 default:
1020 if (gumbo_isalpha(c)) {
1021 gumbo_tokenizer_set_state(parser, GUMBO_LEX_TAG_NAME);
1022 start_new_tag(parser, true);
1023 return NEXT_CHAR;
1024 } else {
1025 tokenizer_add_parse_error(parser, GUMBO_ERR_TAG_INVALID);
1026 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1027 emit_temporary_buffer(parser, output);
1028 return RETURN_ERROR;
1029 }
1030 }
1031 }
1032
1033 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#end-tag-open-state
handle_end_tag_open_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1034 static StateResult handle_end_tag_open_state(GumboParser* parser,
1035 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1036 AVOID_UNUSED_VARIABLE_WARNING(tokenizer);
1037 assert(temporary_buffer_equals(parser, "</"));
1038 switch (c) {
1039 case '>':
1040 tokenizer_add_parse_error(parser, GUMBO_ERR_CLOSE_TAG_EMPTY);
1041 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1042 return NEXT_CHAR;
1043 case -1:
1044 tokenizer_add_parse_error(parser, GUMBO_ERR_CLOSE_TAG_EOF);
1045 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1046 return emit_temporary_buffer(parser, output);
1047 default:
1048 if (gumbo_isalpha(c)) {
1049 gumbo_tokenizer_set_state(parser, GUMBO_LEX_TAG_NAME);
1050 start_new_tag(parser, false);
1051 } else {
1052 tokenizer_add_parse_error(parser, GUMBO_ERR_CLOSE_TAG_INVALID);
1053 gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_COMMENT);
1054 clear_temporary_buffer(parser);
1055 append_char_to_temporary_buffer(parser, c);
1056 }
1057 return NEXT_CHAR;
1058 }
1059 }
1060
1061 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#tag-name-state
handle_tag_name_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1062 static StateResult handle_tag_name_state(GumboParser* parser,
1063 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1064 AVOID_UNUSED_VARIABLE_WARNING(tokenizer);
1065 switch (c) {
1066 case '\t':
1067 case '\n':
1068 case '\f':
1069 case ' ':
1070 finish_tag_name(parser);
1071 gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1072 return NEXT_CHAR;
1073 case '/':
1074 finish_tag_name(parser);
1075 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1076 return NEXT_CHAR;
1077 case '>':
1078 finish_tag_name(parser);
1079 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1080 return emit_current_tag(parser, output);
1081 case '\0':
1082 tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1083 append_char_to_tag_buffer(parser, kUtf8ReplacementChar, true);
1084 return NEXT_CHAR;
1085 case -1:
1086 tokenizer_add_parse_error(parser, GUMBO_ERR_TAG_EOF);
1087 abandon_current_tag(parser);
1088 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1089 return NEXT_CHAR;
1090 default:
1091 append_char_to_tag_buffer(parser, gumbo_tolower(c), true);
1092 return NEXT_CHAR;
1093 }
1094 }
1095
1096 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#rcdata-less-than-sign-state
handle_rcdata_lt_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1097 static StateResult handle_rcdata_lt_state(GumboParser* parser,
1098 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1099 assert(temporary_buffer_equals(parser, "<"));
1100 if (c == '/') {
1101 gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA_END_TAG_OPEN);
1102 append_char_to_temporary_buffer(parser, '/');
1103 return NEXT_CHAR;
1104 } else {
1105 gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA);
1106 tokenizer->_reconsume_current_input = true;
1107 return emit_temporary_buffer(parser, output);
1108 }
1109 }
1110
1111 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#rcdata-end-tag-open-state
handle_rcdata_end_tag_open_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1112 static StateResult handle_rcdata_end_tag_open_state(GumboParser* parser,
1113 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1114 AVOID_UNUSED_VARIABLE_WARNING(tokenizer);
1115 assert(temporary_buffer_equals(parser, "</"));
1116 if (gumbo_isalpha(c)) {
1117 gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA_END_TAG_NAME);
1118 start_new_tag(parser, false);
1119 append_char_to_temporary_buffer(parser, c);
1120 return NEXT_CHAR;
1121 } else {
1122 gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA);
1123 return emit_temporary_buffer(parser, output);
1124 }
1125 return true;
1126 }
1127
1128 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#rcdata-end-tag-name-state
handle_rcdata_end_tag_name_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1129 static StateResult handle_rcdata_end_tag_name_state(GumboParser* parser,
1130 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1131 #ifndef NDEBUG
1132 assert(tokenizer->_temporary_buffer.length >= 2);
1133 #else
1134 AVOID_UNUSED_VARIABLE_WARNING(tokenizer);
1135 #endif
1136 if (gumbo_isalpha(c)) {
1137 append_char_to_tag_buffer(parser, gumbo_tolower(c), true);
1138 append_char_to_temporary_buffer(parser, c);
1139 return NEXT_CHAR;
1140 } else if (is_appropriate_end_tag(parser)) {
1141 switch (c) {
1142 case '\t':
1143 case '\n':
1144 case '\f':
1145 case ' ':
1146 finish_tag_name(parser);
1147 gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1148 return NEXT_CHAR;
1149 case '/':
1150 finish_tag_name(parser);
1151 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1152 return NEXT_CHAR;
1153 case '>':
1154 finish_tag_name(parser);
1155 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1156 return emit_current_tag(parser, output);
1157 }
1158 }
1159 gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA);
1160 abandon_current_tag(parser);
1161 return emit_temporary_buffer(parser, output);
1162 }
1163
1164 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rawtext-less-than-sign-state
handle_rawtext_lt_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1165 static StateResult handle_rawtext_lt_state(GumboParser* parser,
1166 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1167 assert(temporary_buffer_equals(parser, "<"));
1168 if (c == '/') {
1169 gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT_END_TAG_OPEN);
1170 append_char_to_temporary_buffer(parser, '/');
1171 return NEXT_CHAR;
1172 } else {
1173 gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT);
1174 tokenizer->_reconsume_current_input = true;
1175 return emit_temporary_buffer(parser, output);
1176 }
1177 }
1178
1179 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rawtext-end-tag-open-state
handle_rawtext_end_tag_open_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1180 static StateResult handle_rawtext_end_tag_open_state(GumboParser* parser,
1181 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1182 AVOID_UNUSED_VARIABLE_WARNING(tokenizer);
1183 assert(temporary_buffer_equals(parser, "</"));
1184 if (gumbo_isalpha(c)) {
1185 gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT_END_TAG_NAME);
1186 start_new_tag(parser, false);
1187 append_char_to_temporary_buffer(parser, c);
1188 return NEXT_CHAR;
1189 } else {
1190 gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT);
1191 return emit_temporary_buffer(parser, output);
1192 }
1193 }
1194
1195 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rawtext-end-tag-name-state
handle_rawtext_end_tag_name_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1196 static StateResult handle_rawtext_end_tag_name_state(GumboParser* parser,
1197 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1198 assert(tokenizer->_temporary_buffer.length >= 2);
1199 gumbo_debug("Last end tag: %*s\n", (int) tokenizer->_tag_state._buffer.length,
1200 tokenizer->_tag_state._buffer.data);
1201 if (gumbo_isalpha(c)) {
1202 append_char_to_tag_buffer(parser, gumbo_tolower(c), true);
1203 append_char_to_temporary_buffer(parser, c);
1204 return NEXT_CHAR;
1205 } else if (is_appropriate_end_tag(parser)) {
1206 gumbo_debug("Is an appropriate end tag.\n");
1207 switch (c) {
1208 case '\t':
1209 case '\n':
1210 case '\f':
1211 case ' ':
1212 finish_tag_name(parser);
1213 gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1214 return NEXT_CHAR;
1215 case '/':
1216 finish_tag_name(parser);
1217 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1218 return NEXT_CHAR;
1219 case '>':
1220 finish_tag_name(parser);
1221 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1222 return emit_current_tag(parser, output);
1223 }
1224 }
1225 gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT);
1226 abandon_current_tag(parser);
1227 return emit_temporary_buffer(parser, output);
1228 }
1229
1230 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-less-than-sign-state
handle_script_lt_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1231 static StateResult handle_script_lt_state(GumboParser* parser,
1232 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1233 assert(temporary_buffer_equals(parser, "<"));
1234 if (c == '/') {
1235 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_END_TAG_OPEN);
1236 append_char_to_temporary_buffer(parser, '/');
1237 return NEXT_CHAR;
1238 } else if (c == '!') {
1239 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_START);
1240 append_char_to_temporary_buffer(parser, '!');
1241 return emit_temporary_buffer(parser, output);
1242 } else {
1243 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1244 tokenizer->_reconsume_current_input = true;
1245 return emit_temporary_buffer(parser, output);
1246 }
1247 }
1248
1249 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-end-tag-open-state
handle_script_end_tag_open_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1250 static StateResult handle_script_end_tag_open_state(GumboParser* parser,
1251 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1252 AVOID_UNUSED_VARIABLE_WARNING(tokenizer);
1253 assert(temporary_buffer_equals(parser, "</"));
1254 if (gumbo_isalpha(c)) {
1255 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_END_TAG_NAME);
1256 start_new_tag(parser, false);
1257 append_char_to_temporary_buffer(parser, c);
1258 return NEXT_CHAR;
1259 } else {
1260 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1261 return emit_temporary_buffer(parser, output);
1262 }
1263 }
1264
1265 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-end-tag-name-state
handle_script_end_tag_name_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1266 static StateResult handle_script_end_tag_name_state(GumboParser* parser,
1267 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1268 #ifndef NDEBUG
1269 assert(tokenizer->_temporary_buffer.length >= 2);
1270 #else
1271 AVOID_UNUSED_VARIABLE_WARNING(tokenizer);
1272 #endif
1273 if (gumbo_isalpha(c)) {
1274 append_char_to_tag_buffer(parser, gumbo_tolower(c), true);
1275 append_char_to_temporary_buffer(parser, c);
1276 return NEXT_CHAR;
1277 } else if (is_appropriate_end_tag(parser)) {
1278 switch (c) {
1279 case '\t':
1280 case '\n':
1281 case '\f':
1282 case ' ':
1283 finish_tag_name(parser);
1284 gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1285 return NEXT_CHAR;
1286 case '/':
1287 finish_tag_name(parser);
1288 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1289 return NEXT_CHAR;
1290 case '>':
1291 finish_tag_name(parser);
1292 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1293 return emit_current_tag(parser, output);
1294 }
1295 }
1296 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1297 abandon_current_tag(parser);
1298 return emit_temporary_buffer(parser, output);
1299 }
1300
1301 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escape-start-state
handle_script_escaped_start_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1302 static StateResult handle_script_escaped_start_state(GumboParser* parser,
1303 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1304 if (c == '-') {
1305 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_START_DASH);
1306 return emit_current_char(parser, output);
1307 } else {
1308 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1309 tokenizer->_reconsume_current_input = true;
1310 return NEXT_CHAR;
1311 }
1312 }
1313
1314 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escape-start-dash-state
handle_script_escaped_start_dash_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1315 static StateResult handle_script_escaped_start_dash_state(GumboParser* parser,
1316 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1317 if (c == '-') {
1318 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_DASH_DASH);
1319 return emit_current_char(parser, output);
1320 } else {
1321 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1322 tokenizer->_reconsume_current_input = true;
1323 return NEXT_CHAR;
1324 }
1325 }
1326
1327 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-state
handle_script_escaped_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1328 static StateResult handle_script_escaped_state(GumboParser* parser,
1329 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1330 AVOID_UNUSED_VARIABLE_WARNING(tokenizer);
1331 switch (c) {
1332 case '-':
1333 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_DASH);
1334 return emit_current_char(parser, output);
1335 case '<':
1336 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_LT);
1337 clear_temporary_buffer(parser);
1338 append_char_to_temporary_buffer(parser, c);
1339 return NEXT_CHAR;
1340 case '\0':
1341 return emit_replacement_char(parser, output);
1342 case -1:
1343 tokenizer_add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
1344 return emit_eof(parser, output);
1345 default:
1346 return emit_current_char(parser, output);
1347 }
1348 }
1349
1350 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-dash-state
handle_script_escaped_dash_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1351 static StateResult handle_script_escaped_dash_state(GumboParser* parser,
1352 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1353 AVOID_UNUSED_VARIABLE_WARNING(tokenizer);
1354 switch (c) {
1355 case '-':
1356 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_DASH_DASH);
1357 return emit_current_char(parser, output);
1358 case '<':
1359 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_LT);
1360 clear_temporary_buffer(parser);
1361 append_char_to_temporary_buffer(parser, c);
1362 return NEXT_CHAR;
1363 case '\0':
1364 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1365 return emit_replacement_char(parser, output);
1366 case -1:
1367 tokenizer_add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
1368 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1369 return NEXT_CHAR;
1370 default:
1371 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1372 return emit_current_char(parser, output);
1373 }
1374 }
1375
1376 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-dash-dash-state
handle_script_escaped_dash_dash_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1377 static StateResult handle_script_escaped_dash_dash_state(GumboParser* parser,
1378 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1379 AVOID_UNUSED_VARIABLE_WARNING(tokenizer);
1380 switch (c) {
1381 case '-':
1382 return emit_current_char(parser, output);
1383 case '<':
1384 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_LT);
1385 clear_temporary_buffer(parser);
1386 append_char_to_temporary_buffer(parser, c);
1387 return NEXT_CHAR;
1388 case '>':
1389 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1390 return emit_current_char(parser, output);
1391 case '\0':
1392 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1393 return emit_replacement_char(parser, output);
1394 case -1:
1395 tokenizer_add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
1396 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1397 return NEXT_CHAR;
1398 default:
1399 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1400 return emit_current_char(parser, output);
1401 }
1402 }
1403
1404 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-less-than-sign-state
handle_script_escaped_lt_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1405 static StateResult handle_script_escaped_lt_state(GumboParser* parser,
1406 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1407 assert(temporary_buffer_equals(parser, "<"));
1408 assert(!tokenizer->_script_data_buffer.length);
1409 if (c == '/') {
1410 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_OPEN);
1411 append_char_to_temporary_buffer(parser, c);
1412 return NEXT_CHAR;
1413 } else if (gumbo_isalpha(c)) {
1414 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_START);
1415 append_char_to_temporary_buffer(parser, c);
1416 gumbo_string_buffer_append_codepoint(
1417 gumbo_tolower(c), &tokenizer->_script_data_buffer);
1418 return emit_temporary_buffer(parser, output);
1419 } else {
1420 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1421 return emit_temporary_buffer(parser, output);
1422 }
1423 }
1424
1425 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-end-tag-open-state
handle_script_escaped_end_tag_open_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1426 static StateResult handle_script_escaped_end_tag_open_state(GumboParser* parser,
1427 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1428 AVOID_UNUSED_VARIABLE_WARNING(tokenizer);
1429 assert(temporary_buffer_equals(parser, "</"));
1430 if (gumbo_isalpha(c)) {
1431 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_NAME);
1432 start_new_tag(parser, false);
1433 append_char_to_temporary_buffer(parser, c);
1434 return NEXT_CHAR;
1435 } else {
1436 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1437 return emit_temporary_buffer(parser, output);
1438 }
1439 }
1440
1441 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-end-tag-name-state
handle_script_escaped_end_tag_name_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1442 static StateResult handle_script_escaped_end_tag_name_state(GumboParser* parser,
1443 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1444 #ifndef NDEBUG
1445 assert(tokenizer->_temporary_buffer.length >= 2);
1446 #else
1447 AVOID_UNUSED_VARIABLE_WARNING(tokenizer);
1448 #endif
1449 if (gumbo_isalpha(c)) {
1450 append_char_to_tag_buffer(parser, gumbo_tolower(c), true);
1451 append_char_to_temporary_buffer(parser, c);
1452 return NEXT_CHAR;
1453 } else if (is_appropriate_end_tag(parser)) {
1454 switch (c) {
1455 case '\t':
1456 case '\n':
1457 case '\f':
1458 case ' ':
1459 finish_tag_name(parser);
1460 gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1461 return NEXT_CHAR;
1462 case '/':
1463 finish_tag_name(parser);
1464 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1465 return NEXT_CHAR;
1466 case '>':
1467 finish_tag_name(parser);
1468 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1469 return emit_current_tag(parser, output);
1470 }
1471 }
1472 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1473 abandon_current_tag(parser);
1474 return emit_temporary_buffer(parser, output);
1475 }
1476
1477 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escape-start-state
handle_script_double_escaped_start_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1478 static StateResult handle_script_double_escaped_start_state(GumboParser* parser,
1479 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1480 switch (c) {
1481 case '\t':
1482 case '\n':
1483 case '\f':
1484 case ' ':
1485 case '/':
1486 case '>':
1487 gumbo_tokenizer_set_state(parser,
1488 gumbo_string_equals(
1489 &kScriptTag, (GumboStringPiece*) &tokenizer->_script_data_buffer)
1490 ? GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED
1491 : GUMBO_LEX_SCRIPT_ESCAPED);
1492 return emit_current_char(parser, output);
1493 default:
1494 if (gumbo_isalpha(c)) {
1495 gumbo_string_buffer_append_codepoint(
1496 gumbo_tolower(c), &tokenizer->_script_data_buffer);
1497 return emit_current_char(parser, output);
1498 } else {
1499 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1500 tokenizer->_reconsume_current_input = true;
1501 return NEXT_CHAR;
1502 }
1503 }
1504 }
1505
1506 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escaped-state
handle_script_double_escaped_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1507 static StateResult handle_script_double_escaped_state(GumboParser* parser,
1508 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1509 AVOID_UNUSED_VARIABLE_WARNING(tokenizer);
1510 switch (c) {
1511 case '-':
1512 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH);
1513 return emit_current_char(parser, output);
1514 case '<':
1515 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_LT);
1516 return emit_current_char(parser, output);
1517 case '\0':
1518 return emit_replacement_char(parser, output);
1519 case -1:
1520 tokenizer_add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
1521 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1522 return NEXT_CHAR;
1523 default:
1524 return emit_current_char(parser, output);
1525 }
1526 }
1527
1528 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escaped-dash-state
handle_script_double_escaped_dash_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1529 static StateResult handle_script_double_escaped_dash_state(GumboParser* parser,
1530 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1531 AVOID_UNUSED_VARIABLE_WARNING(tokenizer);
1532 switch (c) {
1533 case '-':
1534 gumbo_tokenizer_set_state(
1535 parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH_DASH);
1536 return emit_current_char(parser, output);
1537 case '<':
1538 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_LT);
1539 return emit_current_char(parser, output);
1540 case '\0':
1541 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1542 return emit_replacement_char(parser, output);
1543 case -1:
1544 tokenizer_add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
1545 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1546 return NEXT_CHAR;
1547 default:
1548 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1549 return emit_current_char(parser, output);
1550 }
1551 }
1552
1553 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escaped-dash-dash-state
handle_script_double_escaped_dash_dash_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1554 static StateResult handle_script_double_escaped_dash_dash_state(
1555 GumboParser* parser, GumboTokenizerState* tokenizer, int c,
1556 GumboToken* output) {
1557 AVOID_UNUSED_VARIABLE_WARNING(tokenizer);
1558 switch (c) {
1559 case '-':
1560 return emit_current_char(parser, output);
1561 case '<':
1562 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_LT);
1563 return emit_current_char(parser, output);
1564 case '>':
1565 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1566 return emit_current_char(parser, output);
1567 case '\0':
1568 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1569 return emit_replacement_char(parser, output);
1570 case -1:
1571 tokenizer_add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
1572 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1573 return NEXT_CHAR;
1574 default:
1575 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1576 return emit_current_char(parser, output);
1577 }
1578 }
1579
1580 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escaped-less-than-sign-state
handle_script_double_escaped_lt_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1581 static StateResult handle_script_double_escaped_lt_state(GumboParser* parser,
1582 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1583 if (c == '/') {
1584 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_END);
1585 gumbo_string_buffer_destroy(&tokenizer->_script_data_buffer);
1586 gumbo_string_buffer_init(&tokenizer->_script_data_buffer);
1587 return emit_current_char(parser, output);
1588 } else {
1589 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1590 tokenizer->_reconsume_current_input = true;
1591 return NEXT_CHAR;
1592 }
1593 }
1594
1595 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escape-end-state
handle_script_double_escaped_end_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1596 static StateResult handle_script_double_escaped_end_state(GumboParser* parser,
1597 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1598 switch (c) {
1599 case '\t':
1600 case '\n':
1601 case '\f':
1602 case ' ':
1603 case '/':
1604 case '>':
1605 gumbo_tokenizer_set_state(parser,
1606 gumbo_string_equals(
1607 &kScriptTag, (GumboStringPiece*) &tokenizer->_script_data_buffer)
1608 ? GUMBO_LEX_SCRIPT_ESCAPED
1609 : GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1610 return emit_current_char(parser, output);
1611 default:
1612 if (gumbo_isalpha(c)) {
1613 gumbo_string_buffer_append_codepoint(
1614 gumbo_tolower(c), &tokenizer->_script_data_buffer);
1615 return emit_current_char(parser, output);
1616 } else {
1617 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1618 tokenizer->_reconsume_current_input = true;
1619 return NEXT_CHAR;
1620 }
1621 }
1622 }
1623
1624 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#before-attribute-name-state
handle_before_attr_name_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1625 static StateResult handle_before_attr_name_state(GumboParser* parser,
1626 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1627 AVOID_UNUSED_VARIABLE_WARNING(tokenizer);
1628 switch (c) {
1629 case '\t':
1630 case '\n':
1631 case '\f':
1632 case ' ':
1633 return NEXT_CHAR;
1634 case '/':
1635 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1636 return NEXT_CHAR;
1637 case '>':
1638 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1639 return emit_current_tag(parser, output);
1640 case '\0':
1641 tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1642 gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_NAME);
1643 append_char_to_temporary_buffer(parser, 0xfffd);
1644 return NEXT_CHAR;
1645 case -1:
1646 tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_NAME_EOF);
1647 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1648 abandon_current_tag(parser);
1649 return NEXT_CHAR;
1650 case '"':
1651 case '\'':
1652 case '<':
1653 case '=':
1654 tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_NAME_INVALID);
1655 // Fall through.
1656 default:
1657 gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_NAME);
1658 append_char_to_tag_buffer(parser, gumbo_tolower(c), true);
1659 return NEXT_CHAR;
1660 }
1661 }
1662
1663 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#attribute-name-state
handle_attr_name_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1664 static StateResult handle_attr_name_state(GumboParser* parser,
1665 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1666 AVOID_UNUSED_VARIABLE_WARNING(tokenizer);
1667 switch (c) {
1668 case '\t':
1669 case '\n':
1670 case '\f':
1671 case ' ':
1672 finish_attribute_name(parser);
1673 gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_ATTR_NAME);
1674 return NEXT_CHAR;
1675 case '/':
1676 finish_attribute_name(parser);
1677 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1678 return NEXT_CHAR;
1679 case '=':
1680 finish_attribute_name(parser);
1681 gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_VALUE);
1682 return NEXT_CHAR;
1683 case '>':
1684 finish_attribute_name(parser);
1685 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1686 return emit_current_tag(parser, output);
1687 case '\0':
1688 tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1689 append_char_to_tag_buffer(parser, kUtf8ReplacementChar, true);
1690 return NEXT_CHAR;
1691 case -1:
1692 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1693 abandon_current_tag(parser);
1694 tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_NAME_EOF);
1695 return NEXT_CHAR;
1696 case '"':
1697 case '\'':
1698 case '<':
1699 tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_NAME_INVALID);
1700 // Fall through.
1701 default:
1702 append_char_to_tag_buffer(parser, gumbo_tolower(c), true);
1703 return NEXT_CHAR;
1704 }
1705 }
1706
1707 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#after-attribute-name-state
handle_after_attr_name_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1708 static StateResult handle_after_attr_name_state(GumboParser* parser,
1709 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1710 AVOID_UNUSED_VARIABLE_WARNING(tokenizer);
1711 switch (c) {
1712 case '\t':
1713 case '\n':
1714 case '\f':
1715 case ' ':
1716 return NEXT_CHAR;
1717 case '/':
1718 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1719 return NEXT_CHAR;
1720 case '=':
1721 gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_VALUE);
1722 return NEXT_CHAR;
1723 case '>':
1724 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1725 return emit_current_tag(parser, output);
1726 case '\0':
1727 tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1728 gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_NAME);
1729 append_char_to_temporary_buffer(parser, 0xfffd);
1730 return NEXT_CHAR;
1731 case -1:
1732 tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_NAME_EOF);
1733 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1734 abandon_current_tag(parser);
1735 return NEXT_CHAR;
1736 case '"':
1737 case '\'':
1738 case '<':
1739 tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_NAME_INVALID);
1740 // Fall through.
1741 default:
1742 gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_NAME);
1743 append_char_to_tag_buffer(parser, gumbo_tolower(c), true);
1744 return NEXT_CHAR;
1745 }
1746 }
1747
1748 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#before-attribute-value-state
handle_before_attr_value_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1749 static StateResult handle_before_attr_value_state(GumboParser* parser,
1750 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1751 switch (c) {
1752 case '\t':
1753 case '\n':
1754 case '\f':
1755 case ' ':
1756 return NEXT_CHAR;
1757 case '"':
1758 gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED);
1759 reset_tag_buffer_start_point(parser);
1760 return NEXT_CHAR;
1761 case '&':
1762 gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_UNQUOTED);
1763 tokenizer->_reconsume_current_input = true;
1764 return NEXT_CHAR;
1765 case '\'':
1766 gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED);
1767 reset_tag_buffer_start_point(parser);
1768 return NEXT_CHAR;
1769 case '\0':
1770 tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1771 gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_UNQUOTED);
1772 append_char_to_tag_buffer(parser, kUtf8ReplacementChar, true);
1773 return NEXT_CHAR;
1774 case -1:
1775 tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_EOF);
1776 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1777 abandon_current_tag(parser);
1778 tokenizer->_reconsume_current_input = true;
1779 return NEXT_CHAR;
1780 case '>':
1781 tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_RIGHT_BRACKET);
1782 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1783 emit_current_tag(parser, output);
1784 return RETURN_ERROR;
1785 case '<':
1786 case '=':
1787 case '`':
1788 tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_EQUALS);
1789 // Fall through.
1790 default:
1791 gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_UNQUOTED);
1792 append_char_to_tag_buffer(parser, c, true);
1793 return NEXT_CHAR;
1794 }
1795 }
1796
1797 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#attribute-value-double-quoted-state
handle_attr_value_double_quoted_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1798 static StateResult handle_attr_value_double_quoted_state(GumboParser* parser,
1799 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1800 AVOID_UNUSED_VARIABLE_WARNING(output);
1801 switch (c) {
1802 case '"':
1803 gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED);
1804 return NEXT_CHAR;
1805 case '&':
1806 tokenizer->_tag_state._attr_value_state = tokenizer->_state;
1807 gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_ATTR_VALUE);
1808 tokenizer->_reconsume_current_input = true;
1809 return NEXT_CHAR;
1810 case '\0':
1811 tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1812 append_char_to_tag_buffer(parser, kUtf8ReplacementChar, false);
1813 return NEXT_CHAR;
1814 case -1:
1815 tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_DOUBLE_QUOTE_EOF);
1816 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1817 abandon_current_tag(parser);
1818 tokenizer->_reconsume_current_input = true;
1819 return NEXT_CHAR;
1820 default:
1821 append_char_to_tag_buffer(parser, c, false);
1822 return NEXT_CHAR;
1823 }
1824 }
1825
1826 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#attribute-value-single-quoted-state
handle_attr_value_single_quoted_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1827 static StateResult handle_attr_value_single_quoted_state(GumboParser* parser,
1828 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1829 AVOID_UNUSED_VARIABLE_WARNING(output);
1830 switch (c) {
1831 case '\'':
1832 gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED);
1833 return NEXT_CHAR;
1834 case '&':
1835 tokenizer->_tag_state._attr_value_state = tokenizer->_state;
1836 gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_ATTR_VALUE);
1837 tokenizer->_reconsume_current_input = true;
1838 return NEXT_CHAR;
1839 case '\0':
1840 tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1841 append_char_to_tag_buffer(parser, kUtf8ReplacementChar, false);
1842 return NEXT_CHAR;
1843 case -1:
1844 tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_SINGLE_QUOTE_EOF);
1845 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1846 abandon_current_tag(parser);
1847 tokenizer->_reconsume_current_input = true;
1848 return NEXT_CHAR;
1849 default:
1850 append_char_to_tag_buffer(parser, c, false);
1851 return NEXT_CHAR;
1852 }
1853 }
1854
1855 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#attribute-value-unquoted-state
handle_attr_value_unquoted_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1856 static StateResult handle_attr_value_unquoted_state(GumboParser* parser,
1857 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1858 switch (c) {
1859 case '\t':
1860 case '\n':
1861 case '\f':
1862 case ' ':
1863 gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1864 finish_attribute_value(parser);
1865 return NEXT_CHAR;
1866 case '&':
1867 tokenizer->_tag_state._attr_value_state = tokenizer->_state;
1868 gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_ATTR_VALUE);
1869 tokenizer->_reconsume_current_input = true;
1870 return NEXT_CHAR;
1871 case '>':
1872 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1873 finish_attribute_value(parser);
1874 return emit_current_tag(parser, output);
1875 case '\0':
1876 tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1877 append_char_to_tag_buffer(parser, kUtf8ReplacementChar, true);
1878 return NEXT_CHAR;
1879 case -1:
1880 tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_EOF);
1881 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1882 tokenizer->_reconsume_current_input = true;
1883 abandon_current_tag(parser);
1884 return NEXT_CHAR;
1885 case '<':
1886 case '=':
1887 case '"':
1888 case '\'':
1889 case '`':
1890 tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_EQUALS);
1891 // Fall through.
1892 default:
1893 append_char_to_tag_buffer(parser, c, true);
1894 return NEXT_CHAR;
1895 }
1896 }
1897
1898 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#character-reference-in-attribute-value-state
handle_char_ref_in_attr_value_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1899 static StateResult handle_char_ref_in_attr_value_state(GumboParser* parser,
1900 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1901 AVOID_UNUSED_VARIABLE_WARNING(output);
1902 AVOID_UNUSED_VARIABLE_WARNING(c);
1903 OneOrTwoCodepoints char_ref;
1904 int allowed_char;
1905 bool is_unquoted = false;
1906 switch (tokenizer->_tag_state._attr_value_state) {
1907 case GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED:
1908 allowed_char = '"';
1909 break;
1910 case GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED:
1911 allowed_char = '\'';
1912 break;
1913 case GUMBO_LEX_ATTR_VALUE_UNQUOTED:
1914 allowed_char = '>';
1915 is_unquoted = true;
1916 break;
1917 default:
1918 // -Wmaybe-uninitialized is a little overzealous here, and doesn't
1919 // get that the assert(0) means this codepath will never happen.
1920 allowed_char = ' ';
1921 assert(0);
1922 }
1923
1924 // Ignore the status, since we don't have a convenient way of signalling that
1925 // a parser error has occurred when the error occurs in the middle of a
1926 // multi-state token. We'd need a flag inside the TokenizerState to do this,
1927 // but that's a low priority fix.
1928 consume_char_ref(parser, &tokenizer->_input, allowed_char, true, &char_ref);
1929 if (char_ref.first != kGumboNoChar) {
1930 tokenizer->_reconsume_current_input = true;
1931 append_char_to_tag_buffer(parser, char_ref.first, is_unquoted);
1932 if (char_ref.second != kGumboNoChar) {
1933 append_char_to_tag_buffer(parser, char_ref.second, is_unquoted);
1934 }
1935 } else {
1936 append_char_to_tag_buffer(parser, '&', is_unquoted);
1937 }
1938 gumbo_tokenizer_set_state(parser, tokenizer->_tag_state._attr_value_state);
1939 return NEXT_CHAR;
1940 }
1941
1942 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#after-attribute-value-quoted-state
handle_after_attr_value_quoted_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1943 static StateResult handle_after_attr_value_quoted_state(GumboParser* parser,
1944 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1945 finish_attribute_value(parser);
1946 switch (c) {
1947 case '\t':
1948 case '\n':
1949 case '\f':
1950 case ' ':
1951 gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1952 return NEXT_CHAR;
1953 case '/':
1954 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1955 return NEXT_CHAR;
1956 case '>':
1957 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1958 return emit_current_tag(parser, output);
1959 case -1:
1960 tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_AFTER_EOF);
1961 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1962 abandon_current_tag(parser);
1963 tokenizer->_reconsume_current_input = true;
1964 return NEXT_CHAR;
1965 default:
1966 tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_AFTER_INVALID);
1967 gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1968 tokenizer->_reconsume_current_input = true;
1969 return NEXT_CHAR;
1970 }
1971 }
1972
1973 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#self-closing-start-tag-state
handle_self_closing_start_tag_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1974 static StateResult handle_self_closing_start_tag_state(GumboParser* parser,
1975 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1976 switch (c) {
1977 case '>':
1978 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1979 tokenizer->_tag_state._is_self_closing = true;
1980 return emit_current_tag(parser, output);
1981 case -1:
1982 tokenizer_add_parse_error(parser, GUMBO_ERR_SOLIDUS_EOF);
1983 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1984 abandon_current_tag(parser);
1985 return NEXT_CHAR;
1986 default:
1987 tokenizer_add_parse_error(parser, GUMBO_ERR_SOLIDUS_INVALID);
1988 gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1989 tokenizer->_reconsume_current_input = true;
1990 return NEXT_CHAR;
1991 }
1992 }
1993
1994 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#bogus-comment-state
handle_bogus_comment_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1995 static StateResult handle_bogus_comment_state(GumboParser* parser,
1996 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1997 while (c != '>' && c != -1) {
1998 if (c == '\0') {
1999 c = 0xFFFD;
2000 }
2001 append_char_to_temporary_buffer(parser, c);
2002 utf8iterator_next(&tokenizer->_input);
2003 c = utf8iterator_current(&tokenizer->_input);
2004 }
2005 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2006 return emit_comment(parser, output);
2007 }
2008
2009 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#markup-declaration-open-state
handle_markup_declaration_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2010 static StateResult handle_markup_declaration_state(GumboParser* parser,
2011 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2012 AVOID_UNUSED_VARIABLE_WARNING(output);
2013 AVOID_UNUSED_VARIABLE_WARNING(c);
2014 if (utf8iterator_maybe_consume_match(
2015 &tokenizer->_input, "--", sizeof("--") - 1, true)) {
2016 gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_START);
2017 tokenizer->_reconsume_current_input = true;
2018 } else if (utf8iterator_maybe_consume_match(
2019 &tokenizer->_input, "DOCTYPE", sizeof("DOCTYPE") - 1, false)) {
2020 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DOCTYPE);
2021 tokenizer->_reconsume_current_input = true;
2022 // If we get here, we know we'll eventually emit a doctype token, so now is
2023 // the time to initialize the doctype strings. (Not in doctype_state_init,
2024 // since then they'll leak if ownership never gets transferred to the
2025 // doctype token.
2026 tokenizer->_doc_type_state.name = gumbo_strdup("");
2027 tokenizer->_doc_type_state.public_identifier = gumbo_strdup("");
2028 tokenizer->_doc_type_state.system_identifier = gumbo_strdup("");
2029 } else if (tokenizer->_is_current_node_foreign &&
2030 utf8iterator_maybe_consume_match(
2031 &tokenizer->_input, "[CDATA[", sizeof("[CDATA[") - 1, true)) {
2032 gumbo_tokenizer_set_state(parser, GUMBO_LEX_CDATA);
2033 tokenizer->_is_in_cdata = true;
2034 tokenizer->_reconsume_current_input = true;
2035 } else {
2036 tokenizer_add_parse_error(parser, GUMBO_ERR_DASHES_OR_DOCTYPE);
2037 gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_COMMENT);
2038 tokenizer->_reconsume_current_input = true;
2039 clear_temporary_buffer(parser);
2040 }
2041 return NEXT_CHAR;
2042 }
2043
2044 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-start-state
handle_comment_start_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2045 static StateResult handle_comment_start_state(GumboParser* parser,
2046 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2047 AVOID_UNUSED_VARIABLE_WARNING(tokenizer);
2048 switch (c) {
2049 case '-':
2050 gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_START_DASH);
2051 return NEXT_CHAR;
2052 case '\0':
2053 tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2054 gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2055 append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2056 return NEXT_CHAR;
2057 case '>':
2058 tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_INVALID);
2059 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2060 emit_comment(parser, output);
2061 return RETURN_ERROR;
2062 case -1:
2063 tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_EOF);
2064 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2065 emit_comment(parser, output);
2066 return RETURN_ERROR;
2067 default:
2068 gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2069 append_char_to_temporary_buffer(parser, c);
2070 return NEXT_CHAR;
2071 }
2072 }
2073
2074 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-start-dash-state
handle_comment_start_dash_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2075 static StateResult handle_comment_start_dash_state(GumboParser* parser,
2076 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2077 AVOID_UNUSED_VARIABLE_WARNING(tokenizer);
2078 switch (c) {
2079 case '-':
2080 gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END);
2081 return NEXT_CHAR;
2082 case '\0':
2083 tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2084 gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2085 append_char_to_temporary_buffer(parser, '-');
2086 append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2087 return NEXT_CHAR;
2088 case '>':
2089 tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_INVALID);
2090 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2091 emit_comment(parser, output);
2092 return RETURN_ERROR;
2093 case -1:
2094 tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_EOF);
2095 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2096 emit_comment(parser, output);
2097 return RETURN_ERROR;
2098 default:
2099 gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2100 append_char_to_temporary_buffer(parser, '-');
2101 append_char_to_temporary_buffer(parser, c);
2102 return NEXT_CHAR;
2103 }
2104 }
2105
2106 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-state
handle_comment_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2107 static StateResult handle_comment_state(GumboParser* parser,
2108 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2109 AVOID_UNUSED_VARIABLE_WARNING(tokenizer);
2110 switch (c) {
2111 case '-':
2112 gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END_DASH);
2113 return NEXT_CHAR;
2114 case '\0':
2115 tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2116 append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2117 return NEXT_CHAR;
2118 case -1:
2119 tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_EOF);
2120 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2121 emit_comment(parser, output);
2122 return RETURN_ERROR;
2123 default:
2124 append_char_to_temporary_buffer(parser, c);
2125 return NEXT_CHAR;
2126 }
2127 }
2128
2129 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-end-dash-state
handle_comment_end_dash_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2130 static StateResult handle_comment_end_dash_state(GumboParser* parser,
2131 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2132 AVOID_UNUSED_VARIABLE_WARNING(tokenizer);
2133 switch (c) {
2134 case '-':
2135 gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END);
2136 return NEXT_CHAR;
2137 case '\0':
2138 tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2139 gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2140 append_char_to_temporary_buffer(parser, '-');
2141 append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2142 return NEXT_CHAR;
2143 case -1:
2144 tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_EOF);
2145 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2146 emit_comment(parser, output);
2147 return RETURN_ERROR;
2148 default:
2149 gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2150 append_char_to_temporary_buffer(parser, '-');
2151 append_char_to_temporary_buffer(parser, c);
2152 return NEXT_CHAR;
2153 }
2154 }
2155
2156 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-end-state
handle_comment_end_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2157 static StateResult handle_comment_end_state(GumboParser* parser,
2158 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2159 AVOID_UNUSED_VARIABLE_WARNING(tokenizer);
2160 switch (c) {
2161 case '>':
2162 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2163 return emit_comment(parser, output);
2164 case '\0':
2165 tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2166 gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2167 append_char_to_temporary_buffer(parser, '-');
2168 append_char_to_temporary_buffer(parser, '-');
2169 append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2170 return NEXT_CHAR;
2171 case '!':
2172 tokenizer_add_parse_error(
2173 parser, GUMBO_ERR_COMMENT_BANG_AFTER_DOUBLE_DASH);
2174 gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END_BANG);
2175 return NEXT_CHAR;
2176 case '-':
2177 tokenizer_add_parse_error(
2178 parser, GUMBO_ERR_COMMENT_DASH_AFTER_DOUBLE_DASH);
2179 append_char_to_temporary_buffer(parser, '-');
2180 return NEXT_CHAR;
2181 case -1:
2182 tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2183 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2184 emit_comment(parser, output);
2185 return RETURN_ERROR;
2186 default:
2187 tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_INVALID);
2188 gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2189 append_char_to_temporary_buffer(parser, '-');
2190 append_char_to_temporary_buffer(parser, '-');
2191 append_char_to_temporary_buffer(parser, c);
2192 return NEXT_CHAR;
2193 }
2194 }
2195
2196 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-end-bang-state
handle_comment_end_bang_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2197 static StateResult handle_comment_end_bang_state(GumboParser* parser,
2198 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2199 AVOID_UNUSED_VARIABLE_WARNING(tokenizer);
2200
2201 switch (c) {
2202 case '-':
2203 gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END_DASH);
2204 append_char_to_temporary_buffer(parser, '-');
2205 append_char_to_temporary_buffer(parser, '-');
2206 append_char_to_temporary_buffer(parser, '!');
2207 return NEXT_CHAR;
2208 case '>':
2209 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2210 return emit_comment(parser, output);
2211 case '\0':
2212 tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2213 gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2214 append_char_to_temporary_buffer(parser, '-');
2215 append_char_to_temporary_buffer(parser, '-');
2216 append_char_to_temporary_buffer(parser, '!');
2217 append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2218 return NEXT_CHAR;
2219 case -1:
2220 tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_END_BANG_EOF);
2221 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2222 emit_comment(parser, output);
2223 return RETURN_ERROR;
2224 default:
2225 gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2226 append_char_to_temporary_buffer(parser, '-');
2227 append_char_to_temporary_buffer(parser, '-');
2228 append_char_to_temporary_buffer(parser, '!');
2229 append_char_to_temporary_buffer(parser, c);
2230 return NEXT_CHAR;
2231 }
2232 }
2233
2234 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#doctype-state
handle_doctype_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2235 static StateResult handle_doctype_state(GumboParser* parser,
2236 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2237 assert(!tokenizer->_temporary_buffer.length);
2238 switch (c) {
2239 case '\t':
2240 case '\n':
2241 case '\f':
2242 case ' ':
2243 gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_NAME);
2244 return NEXT_CHAR;
2245 case -1:
2246 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2247 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2248 tokenizer->_doc_type_state.force_quirks = true;
2249 emit_doctype(parser, output);
2250 return RETURN_ERROR;
2251 default:
2252 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_SPACE);
2253 gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_NAME);
2254 tokenizer->_reconsume_current_input = true;
2255 tokenizer->_doc_type_state.force_quirks = true;
2256 return NEXT_CHAR;
2257 }
2258 }
2259
2260 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#before-doctype-name-state
handle_before_doctype_name_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2261 static StateResult handle_before_doctype_name_state(GumboParser* parser,
2262 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2263 switch (c) {
2264 case '\t':
2265 case '\n':
2266 case '\f':
2267 case ' ':
2268 return NEXT_CHAR;
2269 case '\0':
2270 tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2271 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DOCTYPE_NAME);
2272 tokenizer->_doc_type_state.force_quirks = true;
2273 append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2274 return NEXT_CHAR;
2275 case '>':
2276 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_RIGHT_BRACKET);
2277 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2278 tokenizer->_doc_type_state.force_quirks = true;
2279 emit_doctype(parser, output);
2280 return RETURN_ERROR;
2281 case -1:
2282 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2283 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2284 tokenizer->_doc_type_state.force_quirks = true;
2285 emit_doctype(parser, output);
2286 return RETURN_ERROR;
2287 default:
2288 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DOCTYPE_NAME);
2289 tokenizer->_doc_type_state.force_quirks = false;
2290 append_char_to_temporary_buffer(parser, gumbo_tolower(c));
2291 return NEXT_CHAR;
2292 }
2293 }
2294
2295 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#doctype-name-state
handle_doctype_name_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2296 static StateResult handle_doctype_name_state(GumboParser* parser,
2297 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2298 switch (c) {
2299 case '\t':
2300 case '\n':
2301 case '\f':
2302 case ' ':
2303 gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_NAME);
2304 gumbo_free((void*) tokenizer->_doc_type_state.name);
2305 finish_temporary_buffer(parser, &tokenizer->_doc_type_state.name);
2306 return NEXT_CHAR;
2307 case '>':
2308 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2309 gumbo_free((void*) tokenizer->_doc_type_state.name);
2310 finish_temporary_buffer(parser, &tokenizer->_doc_type_state.name);
2311 emit_doctype(parser, output);
2312 return RETURN_SUCCESS;
2313 case '\0':
2314 tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2315 append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2316 return NEXT_CHAR;
2317 case -1:
2318 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2319 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2320 tokenizer->_doc_type_state.force_quirks = true;
2321 gumbo_free((void*) tokenizer->_doc_type_state.name);
2322 finish_temporary_buffer(parser, &tokenizer->_doc_type_state.name);
2323 emit_doctype(parser, output);
2324 return RETURN_ERROR;
2325 default:
2326 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DOCTYPE_NAME);
2327 tokenizer->_doc_type_state.force_quirks = false;
2328 append_char_to_temporary_buffer(parser, gumbo_tolower(c));
2329 return NEXT_CHAR;
2330 }
2331 }
2332
2333 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-name-state
handle_after_doctype_name_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2334 static StateResult handle_after_doctype_name_state(GumboParser* parser,
2335 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2336 switch (c) {
2337 case '\t':
2338 case '\n':
2339 case '\f':
2340 case ' ':
2341 return NEXT_CHAR;
2342 case '>':
2343 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2344 emit_doctype(parser, output);
2345 return RETURN_SUCCESS;
2346 case -1:
2347 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2348 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2349 tokenizer->_doc_type_state.force_quirks = true;
2350 emit_doctype(parser, output);
2351 return RETURN_ERROR;
2352 default:
2353 if (utf8iterator_maybe_consume_match(
2354 &tokenizer->_input, "PUBLIC", sizeof("PUBLIC") - 1, false)) {
2355 gumbo_tokenizer_set_state(
2356 parser, GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_KEYWORD);
2357 tokenizer->_reconsume_current_input = true;
2358 } else if (utf8iterator_maybe_consume_match(&tokenizer->_input, "SYSTEM",
2359 sizeof("SYSTEM") - 1, false)) {
2360 gumbo_tokenizer_set_state(
2361 parser, GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_KEYWORD);
2362 tokenizer->_reconsume_current_input = true;
2363 } else {
2364 tokenizer_add_parse_error(
2365 parser, GUMBO_ERR_DOCTYPE_SPACE_OR_RIGHT_BRACKET);
2366 gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2367 tokenizer->_doc_type_state.force_quirks = true;
2368 }
2369 return NEXT_CHAR;
2370 }
2371 }
2372
2373 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-public-keyword-state
handle_after_doctype_public_keyword_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2374 static StateResult handle_after_doctype_public_keyword_state(
2375 GumboParser* parser, GumboTokenizerState* tokenizer, int c,
2376 GumboToken* output) {
2377 switch (c) {
2378 case '\t':
2379 case '\n':
2380 case '\f':
2381 case ' ':
2382 gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_PUBLIC_ID);
2383 return NEXT_CHAR;
2384 case '"':
2385 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2386 assert(temporary_buffer_equals(parser, ""));
2387 gumbo_tokenizer_set_state(
2388 parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED);
2389 return NEXT_CHAR;
2390 case '\'':
2391 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2392 assert(temporary_buffer_equals(parser, ""));
2393 gumbo_tokenizer_set_state(
2394 parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_SINGLE_QUOTED);
2395 return NEXT_CHAR;
2396 case '>':
2397 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_RIGHT_BRACKET);
2398 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2399 tokenizer->_doc_type_state.force_quirks = true;
2400 emit_doctype(parser, output);
2401 return RETURN_ERROR;
2402 case -1:
2403 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2404 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2405 tokenizer->_doc_type_state.force_quirks = true;
2406 emit_doctype(parser, output);
2407 return RETURN_ERROR;
2408 default:
2409 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2410 gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2411 tokenizer->_doc_type_state.force_quirks = true;
2412 emit_doctype(parser, output);
2413 return RETURN_ERROR;
2414 }
2415 }
2416
2417 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#before-doctype-public-identifier-state
handle_before_doctype_public_id_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2418 static StateResult handle_before_doctype_public_id_state(GumboParser* parser,
2419 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2420 switch (c) {
2421 case '\t':
2422 case '\n':
2423 case '\f':
2424 case ' ':
2425 return NEXT_CHAR;
2426 case '"':
2427 assert(temporary_buffer_equals(parser, ""));
2428 gumbo_tokenizer_set_state(
2429 parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED);
2430 return NEXT_CHAR;
2431 case '\'':
2432 assert(temporary_buffer_equals(parser, ""));
2433 gumbo_tokenizer_set_state(
2434 parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_SINGLE_QUOTED);
2435 return NEXT_CHAR;
2436 case '>':
2437 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2438 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2439 tokenizer->_doc_type_state.force_quirks = true;
2440 emit_doctype(parser, output);
2441 return RETURN_ERROR;
2442 case -1:
2443 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2444 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2445 tokenizer->_doc_type_state.force_quirks = true;
2446 emit_doctype(parser, output);
2447 return RETURN_ERROR;
2448 default:
2449 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2450 gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2451 tokenizer->_doc_type_state.force_quirks = true;
2452 emit_doctype(parser, output);
2453 return RETURN_ERROR;
2454 }
2455 }
2456
2457 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#doctype-public-identifier-(double-quoted)-state
handle_doctype_public_id_double_quoted_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2458 static StateResult handle_doctype_public_id_double_quoted_state(
2459 GumboParser* parser, GumboTokenizerState* tokenizer, int c,
2460 GumboToken* output) {
2461 switch (c) {
2462 case '"':
2463 gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID);
2464 finish_doctype_public_id(parser);
2465 return NEXT_CHAR;
2466 case '\0':
2467 tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2468 append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2469 return NEXT_CHAR;
2470 case '>':
2471 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2472 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2473 tokenizer->_doc_type_state.force_quirks = true;
2474 finish_doctype_public_id(parser);
2475 emit_doctype(parser, output);
2476 return RETURN_ERROR;
2477 case -1:
2478 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2479 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2480 tokenizer->_doc_type_state.force_quirks = true;
2481 finish_doctype_public_id(parser);
2482 emit_doctype(parser, output);
2483 return RETURN_ERROR;
2484 default:
2485 append_char_to_temporary_buffer(parser, c);
2486 return NEXT_CHAR;
2487 }
2488 }
2489
2490 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#doctype-public-identifier-(single-quoted)-state
handle_doctype_public_id_single_quoted_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2491 static StateResult handle_doctype_public_id_single_quoted_state(
2492 GumboParser* parser, GumboTokenizerState* tokenizer, int c,
2493 GumboToken* output) {
2494 switch (c) {
2495 case '\'':
2496 gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID);
2497 finish_doctype_public_id(parser);
2498 return NEXT_CHAR;
2499 case '\0':
2500 tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2501 append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2502 return NEXT_CHAR;
2503 case '>':
2504 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2505 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2506 tokenizer->_doc_type_state.force_quirks = true;
2507 finish_doctype_public_id(parser);
2508 emit_doctype(parser, output);
2509 return RETURN_ERROR;
2510 case -1:
2511 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2512 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2513 tokenizer->_doc_type_state.force_quirks = true;
2514 finish_doctype_public_id(parser);
2515 emit_doctype(parser, output);
2516 return RETURN_ERROR;
2517 default:
2518 append_char_to_temporary_buffer(parser, c);
2519 return NEXT_CHAR;
2520 }
2521 }
2522
2523 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-public-identifier-state
handle_after_doctype_public_id_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2524 static StateResult handle_after_doctype_public_id_state(GumboParser* parser,
2525 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2526 switch (c) {
2527 case '\t':
2528 case '\n':
2529 case '\f':
2530 case ' ':
2531 gumbo_tokenizer_set_state(
2532 parser, GUMBO_LEX_BETWEEN_DOCTYPE_PUBLIC_SYSTEM_ID);
2533 return NEXT_CHAR;
2534 case '>':
2535 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2536 emit_doctype(parser, output);
2537 return RETURN_SUCCESS;
2538 case '"':
2539 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2540 assert(temporary_buffer_equals(parser, ""));
2541 gumbo_tokenizer_set_state(
2542 parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED);
2543 return NEXT_CHAR;
2544 case '\'':
2545 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2546 assert(temporary_buffer_equals(parser, ""));
2547 gumbo_tokenizer_set_state(
2548 parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
2549 return NEXT_CHAR;
2550 case -1:
2551 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2552 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2553 tokenizer->_reconsume_current_input = true;
2554 tokenizer->_doc_type_state.force_quirks = true;
2555 emit_doctype(parser, output);
2556 return RETURN_ERROR;
2557 default:
2558 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2559 gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2560 tokenizer->_doc_type_state.force_quirks = true;
2561 return NEXT_CHAR;
2562 }
2563 }
2564
2565 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#between-doctype-public-and-system-identifiers-state
handle_between_doctype_public_system_id_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2566 static StateResult handle_between_doctype_public_system_id_state(
2567 GumboParser* parser, GumboTokenizerState* tokenizer, int c,
2568 GumboToken* output) {
2569 switch (c) {
2570 case '\t':
2571 case '\n':
2572 case '\f':
2573 case ' ':
2574 return NEXT_CHAR;
2575 case '>':
2576 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2577 emit_doctype(parser, output);
2578 return RETURN_SUCCESS;
2579 case '"':
2580 assert(temporary_buffer_equals(parser, ""));
2581 gumbo_tokenizer_set_state(
2582 parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED);
2583 return NEXT_CHAR;
2584 case '\'':
2585 assert(temporary_buffer_equals(parser, ""));
2586 gumbo_tokenizer_set_state(
2587 parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
2588 return NEXT_CHAR;
2589 case -1:
2590 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2591 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2592 tokenizer->_doc_type_state.force_quirks = true;
2593 emit_doctype(parser, output);
2594 return RETURN_ERROR;
2595 default:
2596 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2597 gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2598 tokenizer->_doc_type_state.force_quirks = true;
2599 emit_doctype(parser, output);
2600 return RETURN_ERROR;
2601 }
2602 }
2603
2604 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-system-keyword-state
handle_after_doctype_system_keyword_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2605 static StateResult handle_after_doctype_system_keyword_state(
2606 GumboParser* parser, GumboTokenizerState* tokenizer, int c,
2607 GumboToken* output) {
2608 switch (c) {
2609 case '\t':
2610 case '\n':
2611 case '\f':
2612 case ' ':
2613 gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_SYSTEM_ID);
2614 return NEXT_CHAR;
2615 case '"':
2616 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2617 assert(temporary_buffer_equals(parser, ""));
2618 gumbo_tokenizer_set_state(
2619 parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED);
2620 return NEXT_CHAR;
2621 case '\'':
2622 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2623 assert(temporary_buffer_equals(parser, ""));
2624 gumbo_tokenizer_set_state(
2625 parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
2626 return NEXT_CHAR;
2627 case '>':
2628 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2629 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2630 tokenizer->_doc_type_state.force_quirks = true;
2631 emit_doctype(parser, output);
2632 return RETURN_ERROR;
2633 case -1:
2634 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2635 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2636 tokenizer->_doc_type_state.force_quirks = true;
2637 emit_doctype(parser, output);
2638 return RETURN_ERROR;
2639 default:
2640 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2641 gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2642 tokenizer->_doc_type_state.force_quirks = true;
2643 return NEXT_CHAR;
2644 }
2645 }
2646
2647 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#before-doctype-system-identifier-state
handle_before_doctype_system_id_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2648 static StateResult handle_before_doctype_system_id_state(GumboParser* parser,
2649 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2650 switch (c) {
2651 case '\t':
2652 case '\n':
2653 case '\f':
2654 case ' ':
2655 return NEXT_CHAR;
2656 case '"':
2657 assert(temporary_buffer_equals(parser, ""));
2658 gumbo_tokenizer_set_state(
2659 parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED);
2660 return NEXT_CHAR;
2661 case '\'':
2662 assert(temporary_buffer_equals(parser, ""));
2663 gumbo_tokenizer_set_state(
2664 parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
2665 return NEXT_CHAR;
2666 case '>':
2667 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2668 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2669 tokenizer->_doc_type_state.force_quirks = true;
2670 emit_doctype(parser, output);
2671 return RETURN_ERROR;
2672 case -1:
2673 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2674 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2675 tokenizer->_doc_type_state.force_quirks = true;
2676 emit_doctype(parser, output);
2677 return RETURN_ERROR;
2678 default:
2679 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2680 gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2681 tokenizer->_doc_type_state.force_quirks = true;
2682 return NEXT_CHAR;
2683 }
2684 }
2685
2686 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#doctype-system-identifier-(double-quoted)-state
handle_doctype_system_id_double_quoted_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2687 static StateResult handle_doctype_system_id_double_quoted_state(
2688 GumboParser* parser, GumboTokenizerState* tokenizer, int c,
2689 GumboToken* output) {
2690 switch (c) {
2691 case '"':
2692 gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID);
2693 finish_doctype_system_id(parser);
2694 return NEXT_CHAR;
2695 case '\0':
2696 tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2697 append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2698 return NEXT_CHAR;
2699 case '>':
2700 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2701 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2702 tokenizer->_doc_type_state.force_quirks = true;
2703 finish_doctype_system_id(parser);
2704 emit_doctype(parser, output);
2705 return RETURN_ERROR;
2706 case -1:
2707 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2708 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2709 tokenizer->_doc_type_state.force_quirks = true;
2710 finish_doctype_system_id(parser);
2711 emit_doctype(parser, output);
2712 return RETURN_ERROR;
2713 default:
2714 append_char_to_temporary_buffer(parser, c);
2715 return NEXT_CHAR;
2716 }
2717 }
2718
2719 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#doctype-system-identifier-(single-quoted)-state
handle_doctype_system_id_single_quoted_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2720 static StateResult handle_doctype_system_id_single_quoted_state(
2721 GumboParser* parser, GumboTokenizerState* tokenizer, int c,
2722 GumboToken* output) {
2723 switch (c) {
2724 case '\'':
2725 gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID);
2726 finish_doctype_system_id(parser);
2727 return NEXT_CHAR;
2728 case '\0':
2729 tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2730 append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2731 return NEXT_CHAR;
2732 case '>':
2733 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2734 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2735 tokenizer->_doc_type_state.force_quirks = true;
2736 finish_doctype_system_id(parser);
2737 emit_doctype(parser, output);
2738 return RETURN_ERROR;
2739 case -1:
2740 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2741 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2742 tokenizer->_doc_type_state.force_quirks = true;
2743 finish_doctype_system_id(parser);
2744 emit_doctype(parser, output);
2745 return RETURN_ERROR;
2746 default:
2747 append_char_to_temporary_buffer(parser, c);
2748 return NEXT_CHAR;
2749 }
2750 }
2751
2752 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-system-identifier-state
handle_after_doctype_system_id_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2753 static StateResult handle_after_doctype_system_id_state(GumboParser* parser,
2754 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2755 switch (c) {
2756 case '\t':
2757 case '\n':
2758 case '\f':
2759 case ' ':
2760 return NEXT_CHAR;
2761 case '>':
2762 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2763 emit_doctype(parser, output);
2764 return RETURN_SUCCESS;
2765 case -1:
2766 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2767 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2768 tokenizer->_doc_type_state.force_quirks = true;
2769 emit_doctype(parser, output);
2770 return RETURN_ERROR;
2771 default:
2772 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2773 gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2774 return NEXT_CHAR;
2775 }
2776 }
2777
2778 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#bogus-doctype-state
handle_bogus_doctype_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2779 static StateResult handle_bogus_doctype_state(GumboParser* parser,
2780 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2781 AVOID_UNUSED_VARIABLE_WARNING(tokenizer);
2782 if (c == '>' || c == -1) {
2783 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2784 emit_doctype(parser, output);
2785 return RETURN_ERROR;
2786 }
2787 return NEXT_CHAR;
2788 }
2789
2790 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#cdata-section-state
handle_cdata_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2791 static StateResult handle_cdata_state(GumboParser* parser,
2792 GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2793 if (c == -1 || utf8iterator_maybe_consume_match(
2794 &tokenizer->_input, "]]>", sizeof("]]>") - 1, true)) {
2795 tokenizer->_reconsume_current_input = true;
2796 reset_token_start_point(tokenizer);
2797 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2798 tokenizer->_is_in_cdata = false;
2799 return NEXT_CHAR;
2800 } else {
2801 return emit_current_char(parser, output);
2802 }
2803 }
2804
2805 typedef StateResult (*GumboLexerStateFunction)(
2806 GumboParser*, GumboTokenizerState*, int, GumboToken*);
2807
2808 static GumboLexerStateFunction dispatch_table[] = {handle_data_state,
2809 handle_char_ref_in_data_state, handle_rcdata_state,
2810 handle_char_ref_in_rcdata_state, handle_rawtext_state, handle_script_state,
2811 handle_plaintext_state, handle_tag_open_state, handle_end_tag_open_state,
2812 handle_tag_name_state, handle_rcdata_lt_state,
2813 handle_rcdata_end_tag_open_state, handle_rcdata_end_tag_name_state,
2814 handle_rawtext_lt_state, handle_rawtext_end_tag_open_state,
2815 handle_rawtext_end_tag_name_state, handle_script_lt_state,
2816 handle_script_end_tag_open_state, handle_script_end_tag_name_state,
2817 handle_script_escaped_start_state, handle_script_escaped_start_dash_state,
2818 handle_script_escaped_state, handle_script_escaped_dash_state,
2819 handle_script_escaped_dash_dash_state, handle_script_escaped_lt_state,
2820 handle_script_escaped_end_tag_open_state,
2821 handle_script_escaped_end_tag_name_state,
2822 handle_script_double_escaped_start_state,
2823 handle_script_double_escaped_state, handle_script_double_escaped_dash_state,
2824 handle_script_double_escaped_dash_dash_state,
2825 handle_script_double_escaped_lt_state,
2826 handle_script_double_escaped_end_state, handle_before_attr_name_state,
2827 handle_attr_name_state, handle_after_attr_name_state,
2828 handle_before_attr_value_state, handle_attr_value_double_quoted_state,
2829 handle_attr_value_single_quoted_state, handle_attr_value_unquoted_state,
2830 handle_char_ref_in_attr_value_state, handle_after_attr_value_quoted_state,
2831 handle_self_closing_start_tag_state, handle_bogus_comment_state,
2832 handle_markup_declaration_state, handle_comment_start_state,
2833 handle_comment_start_dash_state, handle_comment_state,
2834 handle_comment_end_dash_state, handle_comment_end_state,
2835 handle_comment_end_bang_state, handle_doctype_state,
2836 handle_before_doctype_name_state, handle_doctype_name_state,
2837 handle_after_doctype_name_state, handle_after_doctype_public_keyword_state,
2838 handle_before_doctype_public_id_state,
2839 handle_doctype_public_id_double_quoted_state,
2840 handle_doctype_public_id_single_quoted_state,
2841 handle_after_doctype_public_id_state,
2842 handle_between_doctype_public_system_id_state,
2843 handle_after_doctype_system_keyword_state,
2844 handle_before_doctype_system_id_state,
2845 handle_doctype_system_id_double_quoted_state,
2846 handle_doctype_system_id_single_quoted_state,
2847 handle_after_doctype_system_id_state, handle_bogus_doctype_state,
2848 handle_cdata_state};
2849
// Runs the tokenizer until one token has been written to *output.
//
// Returns true when a state handler reports RETURN_SUCCESS (or when a
// buffered character / temporary-buffer character was emitted), and false
// when a state handler reports RETURN_ERROR.
bool gumbo_lex(GumboParser* parser, GumboToken* output) {
  // Nothing about the lexing position lives in locals of this function; it
  // all lives in GumboTokenizerState. The spec forces this on us because:
  //
  // 1. Tokens must be handled by the parser immediately upon emission.
  // 2. Some states (eg. CDATA, or various error conditions) emit several
  //    tokens from the same state.
  // 3. The tokenizer often reconsumes the same character in a different
  //    state.
  //
  // Keeping everything in the struct lets us return with one token and then
  // re-enter in the same state, on the same input, to produce the next one.
  // The emit_* functions are the ones that move things forward (eg. flushing
  // the chardata buffer, reading the next input character) so that re-entry
  // does not loop forever.
  GumboTokenizerState* tokenizer = parser->_tokenizer_state;

  const bool has_buffered_char = tokenizer->_buffered_emit_char != kGumboNoChar;
  if (has_buffered_char) {
    // Emit the pending character without advancing the input: the reconsume
    // flag is raised only for the duration of the emit_char call.
    tokenizer->_reconsume_current_input = true;
    emit_char(parser, tokenizer->_buffered_emit_char, output);
    // Lower the flag again right away so that the *next* character is not
    // consumed twice, and mark the buffered slot as empty.
    tokenizer->_reconsume_current_input = false;
    tokenizer->_buffered_emit_char = kGumboNoChar;
    return true;
  }

  // Characters still queued in the temporary buffer take priority over
  // reading new input.
  if (maybe_emit_from_temporary_buffer(parser, output)) {
    return true;
  }

  for (;;) {
    assert(!tokenizer->_temporary_buffer_emit);
    assert(tokenizer->_buffered_emit_char == kGumboNoChar);

    int current = utf8iterator_current(&tokenizer->_input);
    gumbo_debug("Lexing character '%c' (%d) in state %d.\n", current, current,
        tokenizer->_state);

    // Hand the character to the handler registered for the current state.
    StateResult outcome =
        dispatch_table[tokenizer->_state](parser, tokenizer, current, output);

    // Capture the handler's reconsume request and clear the flag *before*
    // any return below; leaving it set would allow certain infinite loop
    // states.
    bool reconsume = tokenizer->_reconsume_current_input;
    tokenizer->_reconsume_current_input = false;

    switch (outcome) {
      case RETURN_SUCCESS:
        return true;
      case RETURN_ERROR:
        return false;
      default:
        // No token was produced; keep lexing.
        break;
    }

    if (!reconsume) {
      utf8iterator_next(&tokenizer->_input);
    }
  }
}
2906
// Frees the heap data referenced by a token's payload. The GumboToken
// struct itself is not freed. A NULL token is accepted and ignored.
//
// What gets released depends on the token type:
//   - DOCTYPE:   name, public identifier and system identifier.
//   - START_TAG: every non-NULL attribute, then the attribute array itself.
//   - COMMENT:   the comment text.
// Any other token type is left untouched.
void gumbo_token_destroy(GumboToken* token) {
  if (!token) {
    return;
  }

  if (token->type == GUMBO_TOKEN_DOCTYPE) {
    gumbo_free((void*) token->v.doc_type.name);
    gumbo_free((void*) token->v.doc_type.public_identifier);
    gumbo_free((void*) token->v.doc_type.system_identifier);
  } else if (token->type == GUMBO_TOKEN_START_TAG) {
    unsigned int index = 0;
    while (index < token->v.start_tag.attributes.length) {
      GumboAttribute* attribute = token->v.start_tag.attributes.data[index];
      ++index;
      if (!attribute) {
        // The slot may have been nulled out if this token was merged with
        // another.
        continue;
      }
      gumbo_destroy_attribute(attribute);
    }
    gumbo_free((void*) token->v.start_tag.attributes.data);
  } else if (token->type == GUMBO_TOKEN_COMMENT) {
    gumbo_free((void*) token->v.text);
  }
}
2933