1 // Copyright 2010 Google Inc. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //     http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 //
15 // Author: jdtang@google.com (Jonathan Tang)
16 //
17 // Coding conventions specific to this file:
18 //
19 // 1. Functions that fill in a token should be named emit_*, and should be
20 // followed immediately by a return from the tokenizer (true if no error
21 // occurred, false if an error occurred).  Sometimes the emit functions
22 // themselves return a boolean so that they can be combined with the return
23 // statement; in this case, they should match this convention.
24 // 2. Functions that shuffle data from temporaries to final API structures
25 // should be named finish_*, and be called just before the tokenizer exits the
26 // state that accumulates the temporary.
27 // 3. All internal data structures should be kept in an initialized state from
28 // tokenizer creation onwards, ready to accept input.  When a buffer's flushed
29 // and reset, it should be deallocated and immediately reinitialized.
30 // 4. Make sure there are appropriate break statements following each state.
31 // 5. Assertions on the state of the temporary and tag buffers are usually a
32 // good idea, and should go at the entry point of each state when added.
// 6. Statement order within states goes:
//    1. Add parse errors, if appropriate.
//    2. Call finish_* functions to build up tag state.
//    3. Switch to new state.  Set _reconsume flag if appropriate.
//    4. Perform any other temporary buffer manipulation.
//    5. Emit tokens.
//    6. Return/break.
40 // This order ensures that we can verify that every emit is followed by a
41 // return, ensures that the correct state is recorded with any parse errors, and
42 // prevents parse error position from being messed up by possible mark/resets in
43 // temporary buffer manipulation.
44 
45 #include "tokenizer.h"
46 
47 #include <assert.h>
48 #include <stdbool.h>
49 #include <string.h>
50 
51 #include "attribute.h"
52 #include "char_ref.h"
53 #include "error.h"
54 #include "gumbo.h"
55 #include "parser.h"
56 #include "string_buffer.h"
57 #include "string_piece.h"
58 #include "token_type.h"
59 #include "tokenizer_states.h"
60 #include "utf8.h"
61 #include "util.h"
62 #include "vector.h"
63 
// Evaluates its argument as a discarded void expression so that a variable
// which is otherwise unused in some build configuration does not trigger a
// compiler warning.  The whole expansion is parenthesized so that the macro
// always behaves as a single expression, whatever surrounds it at the use
// site.
#define AVOID_UNUSED_VARIABLE_WARNING(i) ((void) (i))
65 
66 // Compared against _script_data_buffer to determine if we're in double-escaped
67 // script mode.
68 const GumboStringPiece kScriptTag = {"script", 6};
69 
// The outcome reported by each individual lexer state handler.
typedef enum {
  // Return false (error) from the tokenizer.
  RETURN_ERROR = 0,
  // Return true (success) from the tokenizer.
  RETURN_SUCCESS = 1,
  // Proceed to the next character and continue lexing.
  NEXT_CHAR = 2
} StateResult;
76 
77 // This is a struct containing state necessary to build up a tag token,
78 // character by character.
79 typedef struct GumboInternalTagState {
80   // A buffer to accumulate characters for various GumboStringPiece fields.
81   GumboStringBuffer _buffer;
82 
83   // A pointer to the start of the original text corresponding to the contents
84   // of the buffer.
85   const char* _original_text;
86 
87   // The current tag enum, computed once the tag name state has finished so that
88   // the buffer can be re-used for building up attributes.
89   GumboTag _tag;
90 
91   // The starting location of the text in the buffer.
92   GumboSourcePosition _start_pos;
93 
94   // The current list of attributes.  This is copied (and ownership of its data
95   // transferred) to the GumboStartTag token upon completion of the tag.  New
96   // attributes are added as soon as their attribute name state is complete, and
97   // values are filled in by operating on _attributes.data[attributes.length-1].
98   GumboVector /* GumboAttribute */ _attributes;
99 
100   // If true, the next attribute value to be finished should be dropped.  This
101   // happens if a duplicate attribute name is encountered - we want to consume
102   // the attribute value, but shouldn't overwrite the existing value.
103   bool _drop_next_attr_value;
104 
105   // The state that caused the tokenizer to switch into a character reference in
106   // attribute value state.  This is used to set the additional allowed
107   // character, and is switched back to on completion.  Initialized as the
108   // tokenizer enters the character reference state.
109   GumboTokenizerEnum _attr_value_state;
110 
111   // The last start tag to have been emitted by the tokenizer.  This is
112   // necessary to check for appropriate end tags.
113   GumboTag _last_start_tag;
114 
115   // If true, then this is a start tag.  If false, it's an end tag.  This is
116   // necessary to generate the appropriate token type at tag-closing time.
117   bool _is_start_tag;
118 
119   // If true, then this tag is "self-closing" and doesn't have an end tag.
120   bool _is_self_closing;
121 } GumboTagState;
122 
123 // This is the main tokenizer state struct, containing all state used by in
124 // tokenizing the input stream.
125 typedef struct GumboInternalTokenizerState {
126   // The current lexer state.  Starts in GUMBO_LEX_DATA.
127   GumboTokenizerEnum _state;
128 
129   // A flag indicating whether the current input character needs to reconsumed
130   // in another state, or whether the next input character should be read for
131   // the next iteration of the state loop.  This is set when the spec reads
132   // "Reconsume the current input character in..."
133   bool _reconsume_current_input;
134 
135   // A flag indicating whether the current node is a foreign element.  This is
136   // set by gumbo_tokenizer_set_is_current_node_foreign and checked in the
137   // markup declaration state.
138   bool _is_current_node_foreign;
139 
140   // A flag indicating whether the tokenizer is in a CDATA section.  If so, then
141   // text tokens emitted will be GUMBO_TOKEN_CDATA.
142   bool _is_in_cdata;
143 
144   // Certain states (notably character references) may emit two character tokens
145   // at once, but the contract for lex() fills in only one token at a time.  The
146   // extra character is buffered here, and then this is checked on entry to
147   // lex().  If a character is stored here, it's immediately emitted and control
148   // returns from the lexer.  kGumboNoChar is used to represent 'no character
149   // stored.'
150   //
151   // Note that characters emitted through this mechanism will have their source
152   // position marked as the character under the mark, i.e. multiple characters
153   // may be emitted with the same position.  This is desirable for character
154   // references, but unsuitable for many other cases.  Use the _temporary_buffer
155   // mechanism if the buffered characters must have their original positions in
156   // the document.
157   int _buffered_emit_char;
158 
159   // A temporary buffer to accumulate characters, as described by the "temporary
160   // buffer" phrase in the tokenizer spec.  We use this in a somewhat unorthodox
161   // way: we record the specific character to go into the buffer, which may
162   // sometimes be a lowercased version of the actual input character.  However,
163   // we *also* use utf8iterator_mark() to record the position at tag start.
164   // When we start flushing the temporary buffer, we set _temporary_buffer_emit
165   // to the start of it, and then increment it for each call to the tokenizer.
166   // We also call utf8iterator_reset(), and utf8iterator_next() through the
167   // input stream, so that tokens emitted by emit_char have the correct position
168   // and original text.
169   GumboStringBuffer _temporary_buffer;
170 
171   // The current cursor position we're emitting from within
172   // _temporary_buffer.data.  NULL whenever we're not flushing the buffer.
173   const char* _temporary_buffer_emit;
174 
175   // The temporary buffer is also used by the spec to check whether we should
176   // enter the script data double escaped state, but we can't use the same
177   // buffer for both because we have to flush out "<s" as emits while still
178   // maintaining the context that will eventually become "script".  This is a
179   // separate buffer that's used in place of the temporary buffer for states
180   // that may enter the script data double escape start state.
181   GumboStringBuffer _script_data_buffer;
182 
183   // Pointer to the beginning of the current token in the original buffer; used
184   // to record the original text.
185   const char* _token_start;
186 
187   // GumboSourcePosition recording the source location of the start of the
188   // current token.
189   GumboSourcePosition _token_start_pos;
190 
191   // Current tag state.
192   GumboTagState _tag_state;
193 
194   // Doctype state.  We use the temporary buffer to accumulate characters (it's
195   // not used for anything else in the doctype states), and then freshly
196   // allocate the strings in the doctype token, then copy it over on emit.
197   GumboTokenDocType _doc_type_state;
198 
199   // The UTF8Iterator over the tokenizer input.
200   Utf8Iterator _input;
201 } GumboTokenizerState;
202 
203 // Adds an ERR_UNEXPECTED_CODE_POINT parse error to the parser's error struct.
tokenizer_add_parse_error(GumboParser * parser,GumboErrorType type)204 static void tokenizer_add_parse_error(
205     GumboParser* parser, GumboErrorType type) {
206   GumboError* error = gumbo_add_error(parser);
207   if (!error) {
208     return;
209   }
210   GumboTokenizerState* tokenizer = parser->_tokenizer_state;
211   utf8iterator_get_position(&tokenizer->_input, &error->position);
212   error->original_text = utf8iterator_get_char_pointer(&tokenizer->_input);
213   error->type = type;
214   error->v.tokenizer.codepoint = utf8iterator_current(&tokenizer->_input);
215   switch (tokenizer->_state) {
216     case GUMBO_LEX_DATA:
217       error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_DATA;
218       break;
219     case GUMBO_LEX_CHAR_REF_IN_DATA:
220     case GUMBO_LEX_CHAR_REF_IN_RCDATA:
221     case GUMBO_LEX_CHAR_REF_IN_ATTR_VALUE:
222       error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_CHAR_REF;
223       break;
224     case GUMBO_LEX_RCDATA:
225     case GUMBO_LEX_RCDATA_LT:
226     case GUMBO_LEX_RCDATA_END_TAG_OPEN:
227     case GUMBO_LEX_RCDATA_END_TAG_NAME:
228       error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_RCDATA;
229       break;
230     case GUMBO_LEX_RAWTEXT:
231     case GUMBO_LEX_RAWTEXT_LT:
232     case GUMBO_LEX_RAWTEXT_END_TAG_OPEN:
233     case GUMBO_LEX_RAWTEXT_END_TAG_NAME:
234       error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_RAWTEXT;
235       break;
236     case GUMBO_LEX_PLAINTEXT:
237       error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_PLAINTEXT;
238       break;
239     case GUMBO_LEX_SCRIPT:
240     case GUMBO_LEX_SCRIPT_LT:
241     case GUMBO_LEX_SCRIPT_END_TAG_OPEN:
242     case GUMBO_LEX_SCRIPT_END_TAG_NAME:
243     case GUMBO_LEX_SCRIPT_ESCAPED_START:
244     case GUMBO_LEX_SCRIPT_ESCAPED_START_DASH:
245     case GUMBO_LEX_SCRIPT_ESCAPED:
246     case GUMBO_LEX_SCRIPT_ESCAPED_DASH:
247     case GUMBO_LEX_SCRIPT_ESCAPED_DASH_DASH:
248     case GUMBO_LEX_SCRIPT_ESCAPED_LT:
249     case GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_OPEN:
250     case GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_NAME:
251     case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_START:
252     case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED:
253     case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH:
254     case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH_DASH:
255     case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_LT:
256     case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_END:
257       error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_SCRIPT;
258       break;
259     case GUMBO_LEX_TAG_OPEN:
260     case GUMBO_LEX_END_TAG_OPEN:
261     case GUMBO_LEX_TAG_NAME:
262     case GUMBO_LEX_BEFORE_ATTR_NAME:
263       error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_TAG;
264       break;
265     case GUMBO_LEX_SELF_CLOSING_START_TAG:
266       error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_SELF_CLOSING_TAG;
267       break;
268     case GUMBO_LEX_ATTR_NAME:
269     case GUMBO_LEX_AFTER_ATTR_NAME:
270     case GUMBO_LEX_BEFORE_ATTR_VALUE:
271       error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_ATTR_NAME;
272       break;
273     case GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED:
274     case GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED:
275     case GUMBO_LEX_ATTR_VALUE_UNQUOTED:
276     case GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED:
277       error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_ATTR_VALUE;
278       break;
279     case GUMBO_LEX_BOGUS_COMMENT:
280     case GUMBO_LEX_COMMENT_START:
281     case GUMBO_LEX_COMMENT_START_DASH:
282     case GUMBO_LEX_COMMENT:
283     case GUMBO_LEX_COMMENT_END_DASH:
284     case GUMBO_LEX_COMMENT_END:
285     case GUMBO_LEX_COMMENT_END_BANG:
286       error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_COMMENT;
287       break;
288     case GUMBO_LEX_MARKUP_DECLARATION:
289     case GUMBO_LEX_DOCTYPE:
290     case GUMBO_LEX_BEFORE_DOCTYPE_NAME:
291     case GUMBO_LEX_DOCTYPE_NAME:
292     case GUMBO_LEX_AFTER_DOCTYPE_NAME:
293     case GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_KEYWORD:
294     case GUMBO_LEX_BEFORE_DOCTYPE_PUBLIC_ID:
295     case GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED:
296     case GUMBO_LEX_DOCTYPE_PUBLIC_ID_SINGLE_QUOTED:
297     case GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID:
298     case GUMBO_LEX_BETWEEN_DOCTYPE_PUBLIC_SYSTEM_ID:
299     case GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_KEYWORD:
300     case GUMBO_LEX_BEFORE_DOCTYPE_SYSTEM_ID:
301     case GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED:
302     case GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED:
303     case GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID:
304     case GUMBO_LEX_BOGUS_DOCTYPE:
305       error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_DOCTYPE;
306       break;
307     case GUMBO_LEX_CDATA:
308       error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_CDATA;
309       break;
310   }
311 }
312 
get_char_token_type(bool is_in_cdata,int c)313 static GumboTokenType get_char_token_type(bool is_in_cdata, int c) {
314   if (is_in_cdata && c > 0) {
315     return GUMBO_TOKEN_CDATA;
316   }
317 
318   switch (c) {
319     case '\t':
320     case '\n':
321     case '\r':
322     case '\f':
323     case ' ':
324       return GUMBO_TOKEN_WHITESPACE;
325     case 0:
326       gumbo_debug("Emitted null byte.\n");
327       return GUMBO_TOKEN_NULL;
328     case -1:
329       return GUMBO_TOKEN_EOF;
330     default:
331       return GUMBO_TOKEN_CHARACTER;
332   }
333 }
334 
335 // Starts recording characters in the temporary buffer.
336 // Because this needs to reset the utf8iterator_mark to the beginning of the
337 // text that will eventually be emitted, it needs to be called a couple of
338 // states before the spec says "Set the temporary buffer to the empty string".
339 // In general, this should be called whenever there's a transition to a
340 // "less-than sign state".  The initial < and possibly / then need to be
341 // appended to the temporary buffer, their presence needs to be accounted for in
342 // states that compare the temporary buffer against a literal value, and
343 // spec stanzas that say "emit a < and / character token along with a character
344 // token for each character in the temporary buffer" need to be adjusted to
345 // account for the presence of the < and / inside the temporary buffer.
clear_temporary_buffer(GumboParser * parser)346 static void clear_temporary_buffer(GumboParser* parser) {
347   GumboTokenizerState* tokenizer = parser->_tokenizer_state;
348   assert(!tokenizer->_temporary_buffer_emit);
349   utf8iterator_mark(&tokenizer->_input);
350   gumbo_string_buffer_clear(&tokenizer->_temporary_buffer);
351   // The temporary buffer and script data buffer are the same object in the
352   // spec, so the script data buffer should be cleared as well.
353   gumbo_string_buffer_clear(&tokenizer->_script_data_buffer);
354 }
355 
356 // Appends a codepoint to the temporary buffer.
append_char_to_temporary_buffer(GumboParser * parser,int codepoint)357 static void append_char_to_temporary_buffer(
358     GumboParser* parser, int codepoint) {
359   gumbo_string_buffer_append_codepoint(
360       codepoint, &parser->_tokenizer_state->_temporary_buffer);
361 }
362 
363 // Checks to see if the temporary buffer equals a certain string.
364 // Make sure this remains side-effect free; it's used in assertions.
365 #ifndef NDEBUG
temporary_buffer_equals(GumboParser * parser,const char * text)366 static bool temporary_buffer_equals(GumboParser* parser, const char* text) {
367   GumboStringBuffer* buffer = &parser->_tokenizer_state->_temporary_buffer;
368   // TODO(jdtang): See if the extra strlen is a performance problem, and replace
369   // it with an explicit sizeof(literal) if necessary.  I don't think it will
370   // be, as this is only used in a couple of rare states.
371   size_t text_len = strlen(text);
372   return text_len == buffer->length &&
373          memcmp(buffer->data, text, text_len) == 0;
374 }
375 #endif
376 
// Puts the tokenizer's doctype state into its initial, empty condition: all
// flags false and all strings NULL.
static void doc_type_state_init(GumboParser* parser) {
  GumboTokenDocType* doctype = &parser->_tokenizer_state->_doc_type_state;

  doctype->force_quirks = false;
  doctype->has_public_identifier = false;
  doctype->has_system_identifier = false;

  // The strings start out NULL so that no memory is leaked if a doctype token
  // is never seen.  Once a doctype token is seen they are reset to
  // freshly-allocated empty strings, which gives client code a uniform
  // interface without null checks.  Ownership passes to the doctype token
  // when it is emitted.
  doctype->name = NULL;
  doctype->public_identifier = NULL;
  doctype->system_identifier = NULL;
}
392 
393 // Sets the token original_text and position to the current iterator position.
394 // This is necessary because [CDATA[ sections may include text that is ignored
395 // by the tokenizer.
reset_token_start_point(GumboTokenizerState * tokenizer)396 static void reset_token_start_point(GumboTokenizerState* tokenizer) {
397   tokenizer->_token_start = utf8iterator_get_char_pointer(&tokenizer->_input);
398   utf8iterator_get_position(&tokenizer->_input, &tokenizer->_token_start_pos);
399 }
400 
401 // Sets the tag buffer original text and start point to the current iterator
402 // position.  This is necessary because attribute names & values may have
403 // whitespace preceeding them, and so we can't assume that the actual token
404 // starting point was the end of the last tag buffer usage.
reset_tag_buffer_start_point(GumboParser * parser)405 static void reset_tag_buffer_start_point(GumboParser* parser) {
406   GumboTokenizerState* tokenizer = parser->_tokenizer_state;
407   GumboTagState* tag_state = &tokenizer->_tag_state;
408 
409   utf8iterator_get_position(&tokenizer->_input, &tag_state->_start_pos);
410   tag_state->_original_text = utf8iterator_get_char_pointer(&tokenizer->_input);
411 }
412 
413 // Moves the temporary buffer contents over to the specified output string,
414 // and clears the temporary buffer.
finish_temporary_buffer(GumboParser * parser,const char ** output)415 static void finish_temporary_buffer(GumboParser* parser, const char** output) {
416   GumboTokenizerState* tokenizer = parser->_tokenizer_state;
417   *output = gumbo_string_buffer_to_string(&tokenizer->_temporary_buffer);
418   clear_temporary_buffer(parser);
419 }
420 
421 // Advances the iterator past the end of the token, and then fills in the
422 // relevant position fields.  It's assumed that after every emit, the tokenizer
423 // will immediately return (letting the tree-construction stage read the filled
424 // in Token).  Thus, it's safe to advance the input stream here, since it will
425 // bypass the advance at the bottom of the state machine loop.
426 //
427 // Since this advances the iterator and resets the current input, make sure to
428 // call it after you've recorded any other data you need for the token.
finish_token(GumboParser * parser,GumboToken * token)429 static void finish_token(GumboParser* parser, GumboToken* token) {
430   GumboTokenizerState* tokenizer = parser->_tokenizer_state;
431   if (!tokenizer->_reconsume_current_input) {
432     utf8iterator_next(&tokenizer->_input);
433   }
434 
435   token->position = tokenizer->_token_start_pos;
436   token->original_text.data = tokenizer->_token_start;
437   reset_token_start_point(tokenizer);
438   token->original_text.length =
439       tokenizer->_token_start - token->original_text.data;
440   if (token->original_text.length > 0 &&
441       token->original_text.data[token->original_text.length - 1] == '\r') {
442     // The UTF8 iterator will ignore carriage returns in the input stream, which
443     // means that the next token may start one past a \r character.  The pointer
444     // arithmetic above results in that \r being appended to the original text
445     // of the preceding token, so we have to adjust its length here to chop the
446     // \r off.
447     --token->original_text.length;
448   }
449 }
450 
451 // Records the doctype public ID, assumed to be in the temporary buffer.
452 // Convenience method that also sets has_public_identifier to true.
finish_doctype_public_id(GumboParser * parser)453 static void finish_doctype_public_id(GumboParser* parser) {
454   GumboTokenDocType* doc_type_state =
455       &parser->_tokenizer_state->_doc_type_state;
456   gumbo_free((void*) doc_type_state->public_identifier);
457   finish_temporary_buffer(parser, &doc_type_state->public_identifier);
458   doc_type_state->has_public_identifier = true;
459 }
460 
461 // Records the doctype system ID, assumed to be in the temporary buffer.
462 // Convenience method that also sets has_system_identifier to true.
finish_doctype_system_id(GumboParser * parser)463 static void finish_doctype_system_id(GumboParser* parser) {
464   GumboTokenDocType* doc_type_state =
465       &parser->_tokenizer_state->_doc_type_state;
466   gumbo_free((void*) doc_type_state->system_identifier);
467   finish_temporary_buffer(parser, &doc_type_state->system_identifier);
468   doc_type_state->has_system_identifier = true;
469 }
470 
471 // Writes a single specified character to the output token.
emit_char(GumboParser * parser,int c,GumboToken * output)472 static void emit_char(GumboParser* parser, int c, GumboToken* output) {
473   output->type = get_char_token_type(parser->_tokenizer_state->_is_in_cdata, c);
474   output->v.character = c;
475   finish_token(parser, output);
476 }
477 
478 // Writes a replacement character token and records a parse error.
479 // Always returns RETURN_ERROR, per gumbo_lex return value.
emit_replacement_char(GumboParser * parser,GumboToken * output)480 static StateResult emit_replacement_char(
481     GumboParser* parser, GumboToken* output) {
482   // In all cases, this is because of a null byte in the input stream.
483   tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
484   emit_char(parser, kUtf8ReplacementChar, output);
485   return RETURN_ERROR;
486 }
487 
488 // Writes an EOF character token.  Always returns RETURN_SUCCESS.
emit_eof(GumboParser * parser,GumboToken * output)489 static StateResult emit_eof(GumboParser* parser, GumboToken* output) {
490   emit_char(parser, -1, output);
491   return RETURN_SUCCESS;
492 }
493 
494 // Writes the current input character out as a character token.
495 // Always returns RETURN_SUCCESS.
emit_current_char(GumboParser * parser,GumboToken * output)496 static bool emit_current_char(GumboParser* parser, GumboToken* output) {
497   emit_char(
498       parser, utf8iterator_current(&parser->_tokenizer_state->_input), output);
499   return RETURN_SUCCESS;
500 }
501 
502 // Writes out a doctype token, copying it from the tokenizer state.
emit_doctype(GumboParser * parser,GumboToken * output)503 static void emit_doctype(GumboParser* parser, GumboToken* output) {
504   output->type = GUMBO_TOKEN_DOCTYPE;
505   output->v.doc_type = parser->_tokenizer_state->_doc_type_state;
506   finish_token(parser, output);
507   doc_type_state_init(parser);
508 }
509 
510 // Debug-only function that explicitly sets the attribute vector data to NULL so
511 // it can be asserted on tag creation, verifying that there are no memory leaks.
mark_tag_state_as_empty(GumboTagState * tag_state)512 static void mark_tag_state_as_empty(GumboTagState* tag_state) {
513 #ifndef NDEBUG
514   tag_state->_attributes = kGumboEmptyVector;
515 #else
516   AVOID_UNUSED_VARIABLE_WARNING(tag_state);
517 #endif
518 }
519 
520 // Writes out the current tag as a start or end tag token.
521 // Always returns RETURN_SUCCESS.
emit_current_tag(GumboParser * parser,GumboToken * output)522 static StateResult emit_current_tag(GumboParser* parser, GumboToken* output) {
523   GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
524   if (tag_state->_is_start_tag) {
525     output->type = GUMBO_TOKEN_START_TAG;
526     output->v.start_tag.tag = tag_state->_tag;
527     output->v.start_tag.attributes = tag_state->_attributes;
528     output->v.start_tag.is_self_closing = tag_state->_is_self_closing;
529     tag_state->_last_start_tag = tag_state->_tag;
530     mark_tag_state_as_empty(tag_state);
531     gumbo_debug(
532         "Emitted start tag %s.\n", gumbo_normalized_tagname(tag_state->_tag));
533   } else {
534     output->type = GUMBO_TOKEN_END_TAG;
535     output->v.end_tag = tag_state->_tag;
536     // In end tags, ownership of the attributes vector is not transferred to the
537     // token, but it's still initialized as normal, so it must be manually
538     // deallocated.  There may also be attributes to destroy, in certain broken
539     // cases like </div</th> (the "th" is an attribute there).
540     for (unsigned int i = 0; i < tag_state->_attributes.length; ++i) {
541       gumbo_destroy_attribute(tag_state->_attributes.data[i]);
542     }
543     gumbo_free(tag_state->_attributes.data);
544     mark_tag_state_as_empty(tag_state);
545     gumbo_debug(
546         "Emitted end tag %s.\n", gumbo_normalized_tagname(tag_state->_tag));
547   }
548   gumbo_string_buffer_destroy(&tag_state->_buffer);
549   finish_token(parser, output);
550   gumbo_debug("Original text = %.*s.\n", output->original_text.length,
551       output->original_text.data);
552   assert(output->original_text.length >= 2);
553   assert(output->original_text.data[0] == '<');
554   assert(output->original_text.data[output->original_text.length - 1] == '>');
555   return RETURN_SUCCESS;
556 }
557 
558 // In some states, we speculatively start a tag, but don't know whether it'll be
559 // emitted as tag token or as a series of character tokens until we finish it.
560 // We need to abandon the tag we'd started & free its memory in that case to
561 // avoid a memory leak.
abandon_current_tag(GumboParser * parser)562 static void abandon_current_tag(GumboParser* parser) {
563   GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
564   for (unsigned int i = 0; i < tag_state->_attributes.length; ++i) {
565     gumbo_destroy_attribute(tag_state->_attributes.data[i]);
566   }
567   gumbo_free(tag_state->_attributes.data);
568   mark_tag_state_as_empty(tag_state);
569   gumbo_string_buffer_destroy(&tag_state->_buffer);
570   gumbo_debug("Abandoning current tag.\n");
571 }
572 
573 // Wraps the consume_char_ref function to handle its output and make the
574 // appropriate TokenizerState modifications.  Returns RETURN_ERROR if a parse
575 // error occurred, RETURN_SUCCESS otherwise.
emit_char_ref(GumboParser * parser,int additional_allowed_char,GumboToken * output)576 static StateResult emit_char_ref(
577     GumboParser* parser, int additional_allowed_char, GumboToken* output) {
578   GumboTokenizerState* tokenizer = parser->_tokenizer_state;
579   OneOrTwoCodepoints char_ref;
580   bool status = consume_char_ref(
581       parser, &tokenizer->_input, additional_allowed_char, false, &char_ref);
582   if (char_ref.first != kGumboNoChar) {
583     // consume_char_ref ends with the iterator pointing at the next character,
584     // so we need to be sure not advance it again before reading the next token.
585     tokenizer->_reconsume_current_input = true;
586     emit_char(parser, char_ref.first, output);
587     tokenizer->_buffered_emit_char = char_ref.second;
588   } else {
589     emit_char(parser, '&', output);
590   }
591   return status ? RETURN_SUCCESS : RETURN_ERROR;
592 }
593 
594 // Emits a comment token.  Comments use the temporary buffer to accumulate their
595 // data, and then it's copied over and released to the 'text' field of the
596 // GumboToken union.  Always returns RETURN_SUCCESS.
emit_comment(GumboParser * parser,GumboToken * output)597 static StateResult emit_comment(GumboParser* parser, GumboToken* output) {
598   output->type = GUMBO_TOKEN_COMMENT;
599   finish_temporary_buffer(parser, &output->v.text);
600   finish_token(parser, output);
601   return RETURN_SUCCESS;
602 }
603 
604 // Checks to see we should be flushing accumulated characters in the temporary
605 // buffer, and fills the output token with the next output character if so.
606 // Returns true if a character has been emitted and the tokenizer should
607 // immediately return, false if we're at the end of the temporary buffer and
608 // should resume normal operation.
maybe_emit_from_temporary_buffer(GumboParser * parser,GumboToken * output)609 static bool maybe_emit_from_temporary_buffer(
610     GumboParser* parser, GumboToken* output) {
611   GumboTokenizerState* tokenizer = parser->_tokenizer_state;
612   const char* c = tokenizer->_temporary_buffer_emit;
613   GumboStringBuffer* buffer = &tokenizer->_temporary_buffer;
614 
615   if (!c || c >= buffer->data + buffer->length) {
616     tokenizer->_temporary_buffer_emit = NULL;
617     return false;
618   }
619 
620   assert(*c == utf8iterator_current(&tokenizer->_input));
621   // emit_char also advances the input stream.  We need to do some juggling of
622   // the _reconsume_current_input flag to get the proper behavior when emitting
623   // previous tokens.  Basically, _reconsume_current_input should *never* be set
624   // when emitting anything from the temporary buffer, since those characters
625   // have already been advanced past.  However, it should be preserved so that
626   // when the *next* character is encountered again, the tokenizer knows not to
627   // advance past it.
628   bool saved_reconsume_state = tokenizer->_reconsume_current_input;
629   tokenizer->_reconsume_current_input = false;
630   emit_char(parser, *c, output);
631   ++tokenizer->_temporary_buffer_emit;
632   tokenizer->_reconsume_current_input = saved_reconsume_state;
633   return true;
634 }
635 
636 // Sets up the tokenizer to begin flushing the temporary buffer.
637 // This resets the input iterator stream to the start of the last tag, sets up
638 // _temporary_buffer_emit, and then (if the temporary buffer is non-empty) emits
639 // the first character in it.  It returns true if a character was emitted, false
640 // otherwise.
emit_temporary_buffer(GumboParser * parser,GumboToken * output)641 static bool emit_temporary_buffer(GumboParser* parser, GumboToken* output) {
642   GumboTokenizerState* tokenizer = parser->_tokenizer_state;
643   assert(tokenizer->_temporary_buffer.data);
644   utf8iterator_reset(&tokenizer->_input);
645   tokenizer->_temporary_buffer_emit = tokenizer->_temporary_buffer.data;
646   return maybe_emit_from_temporary_buffer(parser, output);
647 }
648 
649 // Appends a codepoint to the current tag buffer.  If
650 // reinitilize_position_on_first is set, this also initializes the tag buffer
651 // start point; the only time you would *not* want to pass true for this
652 // parameter is if you want the original_text to include character (like an
653 // opening quote) that doesn't appear in the value.
append_char_to_tag_buffer(GumboParser * parser,int codepoint,bool reinitilize_position_on_first)654 static void append_char_to_tag_buffer(
655     GumboParser* parser, int codepoint, bool reinitilize_position_on_first) {
656   GumboStringBuffer* buffer = &parser->_tokenizer_state->_tag_state._buffer;
657   if (buffer->length == 0 && reinitilize_position_on_first) {
658     reset_tag_buffer_start_point(parser);
659   }
660   gumbo_string_buffer_append_codepoint(codepoint, buffer);
661 }
662 
663 // (Re-)initialize the tag buffer.  This also resets the original_text pointer
664 // and _start_pos field to point to the current position.
initialize_tag_buffer(GumboParser * parser)665 static void initialize_tag_buffer(GumboParser* parser) {
666   GumboTokenizerState* tokenizer = parser->_tokenizer_state;
667   GumboTagState* tag_state = &tokenizer->_tag_state;
668 
669   gumbo_string_buffer_init(&tag_state->_buffer);
670   reset_tag_buffer_start_point(parser);
671 }
672 
673 // Initializes the tag_state to start a new tag, keeping track of the opening
674 // positions and original text.  Takes a boolean indicating whether this is a
675 // start or end tag.
start_new_tag(GumboParser * parser,bool is_start_tag)676 static void start_new_tag(GumboParser* parser, bool is_start_tag) {
677   GumboTokenizerState* tokenizer = parser->_tokenizer_state;
678   GumboTagState* tag_state = &tokenizer->_tag_state;
679   int c = utf8iterator_current(&tokenizer->_input);
680   assert(gumbo_isalpha(c));
681   c = gumbo_tolower(c);
682   assert(gumbo_isalpha(c));
683 
684   initialize_tag_buffer(parser);
685   gumbo_string_buffer_append_codepoint(c, &tag_state->_buffer);
686 
687   assert(tag_state->_attributes.data == NULL);
688   // Initial size chosen by statistical analysis of a corpus of 60k webpages.
689   // 99.5% of elements have 0 attributes, 93% of the remainder have 1.  These
690   // numbers are a bit higher for more modern websites (eg. ~45% = 0, ~40% = 1
691   // for the HTML5 Spec), but still have basically 99% of nodes with <= 2 attrs.
692   gumbo_vector_init(2, &tag_state->_attributes);
693   tag_state->_drop_next_attr_value = false;
694   tag_state->_is_start_tag = is_start_tag;
695   tag_state->_is_self_closing = false;
696   gumbo_debug("Starting new tag.\n");
697 }
698 
699 // Fills in the specified char* with the contents of the tag buffer.
copy_over_tag_buffer(GumboParser * parser,const char ** output)700 static void copy_over_tag_buffer(GumboParser* parser, const char** output) {
701   GumboTokenizerState* tokenizer = parser->_tokenizer_state;
702   GumboTagState* tag_state = &tokenizer->_tag_state;
703   *output = gumbo_string_buffer_to_string(&tag_state->_buffer);
704 }
705 
706 // Fills in:
707 // * The original_text GumboStringPiece with the portion of the original
708 // buffer that corresponds to the tag buffer.
709 // * The start_pos GumboSourcePosition with the start position of the tag
710 // buffer.
711 // * The end_pos GumboSourcePosition with the current source position.
copy_over_original_tag_text(GumboParser * parser,GumboStringPiece * original_text,GumboSourcePosition * start_pos,GumboSourcePosition * end_pos)712 static void copy_over_original_tag_text(GumboParser* parser,
713     GumboStringPiece* original_text, GumboSourcePosition* start_pos,
714     GumboSourcePosition* end_pos) {
715   GumboTokenizerState* tokenizer = parser->_tokenizer_state;
716   GumboTagState* tag_state = &tokenizer->_tag_state;
717 
718   original_text->data = tag_state->_original_text;
719   original_text->length = utf8iterator_get_char_pointer(&tokenizer->_input) -
720                           tag_state->_original_text;
721   if (original_text->data[original_text->length - 1] == '\r') {
722     // Since \r is skipped by the UTF-8 iterator, it can sometimes end up
723     // appended to the end of original text even when it's really the first part
724     // of the next character.  If we detect this situation, shrink the length of
725     // the original text by 1 to remove the carriage return.
726     --original_text->length;
727   }
728   *start_pos = tag_state->_start_pos;
729   utf8iterator_get_position(&tokenizer->_input, end_pos);
730 }
731 
732 // Releases and then re-initializes the tag buffer.
reinitialize_tag_buffer(GumboParser * parser)733 static void reinitialize_tag_buffer(GumboParser* parser) {
734   gumbo_free(parser->_tokenizer_state->_tag_state._buffer.data);
735   initialize_tag_buffer(parser);
736 }
737 
738 // Moves some data from the temporary buffer over the the tag-based fields in
739 // TagState.
finish_tag_name(GumboParser * parser)740 static void finish_tag_name(GumboParser* parser) {
741   GumboTokenizerState* tokenizer = parser->_tokenizer_state;
742   GumboTagState* tag_state = &tokenizer->_tag_state;
743 
744   tag_state->_tag =
745       gumbo_tagn_enum(tag_state->_buffer.data, tag_state->_buffer.length);
746   reinitialize_tag_buffer(parser);
747 }
748 
749 // Adds an ERR_DUPLICATE_ATTR parse error to the parser's error struct.
add_duplicate_attr_error(GumboParser * parser,int original_index,int new_index)750 static void add_duplicate_attr_error(
751     GumboParser* parser, int original_index, int new_index) {
752   GumboError* error = gumbo_add_error(parser);
753   if (!error) {
754     return;
755   }
756   GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
757   error->type = GUMBO_ERR_DUPLICATE_ATTR;
758   error->position = tag_state->_start_pos;
759   error->original_text = tag_state->_original_text;
760   error->v.duplicate_attr.original_index = original_index;
761   error->v.duplicate_attr.new_index = new_index;
762   copy_over_tag_buffer(parser, &error->v.duplicate_attr.name);
763   reinitialize_tag_buffer(parser);
764 }
765 
766 // Creates a new attribute in the current tag, copying the current tag buffer to
767 // the attribute's name.  The attribute's value starts out as the empty string
768 // (following the "Boolean attributes" section of the spec) and is only
769 // overwritten on finish_attribute_value().  If the attribute has already been
770 // specified, the new attribute is dropped, a parse error is added, and the
771 // function returns false.  Otherwise, this returns true.
finish_attribute_name(GumboParser * parser)772 static bool finish_attribute_name(GumboParser* parser) {
773   GumboTokenizerState* tokenizer = parser->_tokenizer_state;
774   GumboTagState* tag_state = &tokenizer->_tag_state;
775   // May've been set by a previous attribute without a value; reset it here.
776   tag_state->_drop_next_attr_value = false;
777   assert(tag_state->_attributes.data);
778   assert(tag_state->_attributes.capacity);
779 
780   GumboVector* /* GumboAttribute* */ attributes = &tag_state->_attributes;
781   for (unsigned int i = 0; i < attributes->length; ++i) {
782     GumboAttribute* attr = attributes->data[i];
783     if (strlen(attr->name) == tag_state->_buffer.length &&
784         memcmp(attr->name, tag_state->_buffer.data,
785             tag_state->_buffer.length) == 0) {
786       // Identical attribute; bail.
787       add_duplicate_attr_error(parser, i, attributes->length);
788       tag_state->_drop_next_attr_value = true;
789       return false;
790     }
791   }
792 
793   GumboAttribute* attr = gumbo_malloc(sizeof(GumboAttribute));
794   attr->attr_namespace = GUMBO_ATTR_NAMESPACE_NONE;
795   copy_over_tag_buffer(parser, &attr->name);
796   copy_over_original_tag_text(
797       parser, &attr->original_name, &attr->name_start, &attr->name_end);
798   attr->value = gumbo_strdup("");
799   copy_over_original_tag_text(
800       parser, &attr->original_value, &attr->name_start, &attr->name_end);
801   gumbo_vector_add(attr, attributes);
802   reinitialize_tag_buffer(parser);
803   return true;
804 }
805 
806 // Finishes an attribute value.  This sets the value of the most recently added
807 // attribute to the current contents of the tag buffer.
finish_attribute_value(GumboParser * parser)808 static void finish_attribute_value(GumboParser* parser) {
809   GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
810   if (tag_state->_drop_next_attr_value) {
811     // Duplicate attribute name detected in an earlier state, so we have to
812     // ignore the value.
813     tag_state->_drop_next_attr_value = false;
814     reinitialize_tag_buffer(parser);
815     return;
816   }
817 
818   GumboAttribute* attr =
819       tag_state->_attributes.data[tag_state->_attributes.length - 1];
820   gumbo_free((void*) attr->value);
821   copy_over_tag_buffer(parser, &attr->value);
822   copy_over_original_tag_text(
823       parser, &attr->original_value, &attr->value_start, &attr->value_end);
824   reinitialize_tag_buffer(parser);
825 }
826 
827 // Returns true if the current end tag matches the last start tag emitted.
is_appropriate_end_tag(GumboParser * parser)828 static bool is_appropriate_end_tag(GumboParser* parser) {
829   GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
830   assert(!tag_state->_is_start_tag);
831   return tag_state->_last_start_tag != GUMBO_TAG_LAST &&
832          tag_state->_last_start_tag == gumbo_tagn_enum(tag_state->_buffer.data,
833                                            tag_state->_buffer.length);
834 }
835 
// Allocates the tokenizer state for |parser| and initializes it to tokenize
// the |text_length| bytes at |text|, starting in the data state.
void gumbo_tokenizer_state_init(
    GumboParser* parser, const char* text, size_t text_length) {
  GumboTokenizerState* state = gumbo_malloc(sizeof(GumboTokenizerState));
  parser->_tokenizer_state = state;
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);

  // Simple flags and the single-character emit slot.
  state->_is_in_cdata = false;
  state->_is_current_node_foreign = false;
  state->_reconsume_current_input = false;
  state->_buffered_emit_char = kGumboNoChar;

  // Tag state: no start tag has been seen yet.
  state->_tag_state._last_start_tag = GUMBO_TAG_LAST;

  // Temporary buffer, with no flush in progress.
  gumbo_string_buffer_init(&state->_temporary_buffer);
  state->_temporary_buffer_emit = NULL;

  mark_tag_state_as_empty(&state->_tag_state);

  gumbo_string_buffer_init(&state->_script_data_buffer);

  // Input iterator and the start-of-token bookkeeping derived from it.
  state->_token_start = text;
  utf8iterator_init(parser, text, text_length, &state->_input);
  utf8iterator_get_position(&state->_input, &state->_token_start_pos);

  doc_type_state_init(parser);
}
858 
// Releases the tokenizer state owned by |parser|: the temporary buffer, the
// script data buffer and the state struct itself.  The doctype strings are
// expected (asserted) to have been cleared already.
void gumbo_tokenizer_state_destroy(GumboParser* parser) {
  GumboTokenizerState* state = parser->_tokenizer_state;

  assert(state->_doc_type_state.name == NULL);
  assert(state->_doc_type_state.public_identifier == NULL);
  assert(state->_doc_type_state.system_identifier == NULL);

  gumbo_string_buffer_destroy(&state->_temporary_buffer);
  gumbo_string_buffer_destroy(&state->_script_data_buffer);
  gumbo_free(state);
}
868 
// Switches the tokenizer of |parser| to the lexer state |state|.
void gumbo_tokenizer_set_state(GumboParser* parser, GumboTokenizerEnum state) {
  GumboTokenizerState* tokenizer = parser->_tokenizer_state;
  tokenizer->_state = state;
}
872 
// Records whether the current node is foreign.  A debug message is logged only
// when the value actually changes.
void gumbo_tokenizer_set_is_current_node_foreign(
    GumboParser* parser, bool is_foreign) {
  GumboTokenizerState* tokenizer = parser->_tokenizer_state;
  const bool changed = (tokenizer->_is_current_node_foreign != is_foreign);

  if (changed) {
    gumbo_debug("Toggling is_current_node_foreign to %s.\n",
        is_foreign ? "true" : "false");
  }
  tokenizer->_is_current_node_foreign = is_foreign;
}
881 
882 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#data-state
handle_data_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)883 static StateResult handle_data_state(GumboParser* parser,
884     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
885   switch (c) {
886     case '&':
887       gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_DATA);
888       // The char_ref machinery expects to be on the & so it can mark that
889       // and return to it if the text isn't a char ref, so we need to
890       // reconsume it.
891       tokenizer->_reconsume_current_input = true;
892       return NEXT_CHAR;
893     case '<':
894       gumbo_tokenizer_set_state(parser, GUMBO_LEX_TAG_OPEN);
895       clear_temporary_buffer(parser);
896       append_char_to_temporary_buffer(parser, '<');
897       return NEXT_CHAR;
898     case '\0':
899       tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
900       emit_char(parser, c, output);
901       return RETURN_ERROR;
902     default:
903       return emit_current_char(parser, output);
904   }
905 }
906 
907 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#character-reference-in-data-state
handle_char_ref_in_data_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)908 static StateResult handle_char_ref_in_data_state(GumboParser* parser,
909     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
910   AVOID_UNUSED_VARIABLE_WARNING(tokenizer);
911   AVOID_UNUSED_VARIABLE_WARNING(c);
912   gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
913   return emit_char_ref(parser, ' ', output);
914 }
915 
916 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rcdata-state
handle_rcdata_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)917 static StateResult handle_rcdata_state(GumboParser* parser,
918     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
919   switch (c) {
920     case '&':
921       gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_RCDATA);
922       tokenizer->_reconsume_current_input = true;
923       return NEXT_CHAR;
924     case '<':
925       gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA_LT);
926       clear_temporary_buffer(parser);
927       append_char_to_temporary_buffer(parser, '<');
928       return NEXT_CHAR;
929     case '\0':
930       return emit_replacement_char(parser, output);
931     case -1:
932       return emit_eof(parser, output);
933     default:
934       return emit_current_char(parser, output);
935   }
936 }
937 
938 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#character-reference-in-rcdata-state
handle_char_ref_in_rcdata_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)939 static StateResult handle_char_ref_in_rcdata_state(GumboParser* parser,
940     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
941   AVOID_UNUSED_VARIABLE_WARNING(tokenizer);
942   AVOID_UNUSED_VARIABLE_WARNING(c);
943   gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA);
944   return emit_char_ref(parser, ' ', output);
945 }
946 
947 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rawtext-state
handle_rawtext_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)948 static StateResult handle_rawtext_state(GumboParser* parser,
949     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
950   AVOID_UNUSED_VARIABLE_WARNING(tokenizer);
951   switch (c) {
952     case '<':
953       gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT_LT);
954       clear_temporary_buffer(parser);
955       append_char_to_temporary_buffer(parser, '<');
956       return NEXT_CHAR;
957     case '\0':
958       return emit_replacement_char(parser, output);
959     case -1:
960       return emit_eof(parser, output);
961     default:
962       return emit_current_char(parser, output);
963   }
964 }
965 
966 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-state
handle_script_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)967 static StateResult handle_script_state(GumboParser* parser,
968     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
969   AVOID_UNUSED_VARIABLE_WARNING(tokenizer);
970   switch (c) {
971     case '<':
972       gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_LT);
973       clear_temporary_buffer(parser);
974       append_char_to_temporary_buffer(parser, '<');
975       return NEXT_CHAR;
976     case '\0':
977       return emit_replacement_char(parser, output);
978     case -1:
979       return emit_eof(parser, output);
980     default:
981       return emit_current_char(parser, output);
982   }
983 }
984 
985 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#plaintext-state
handle_plaintext_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)986 static StateResult handle_plaintext_state(GumboParser* parser,
987     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
988   AVOID_UNUSED_VARIABLE_WARNING(tokenizer);
989   switch (c) {
990     case '\0':
991       return emit_replacement_char(parser, output);
992     case -1:
993       return emit_eof(parser, output);
994     default:
995       return emit_current_char(parser, output);
996   }
997 }
998 
999 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#tag-open-state
handle_tag_open_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1000 static StateResult handle_tag_open_state(GumboParser* parser,
1001     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1002   AVOID_UNUSED_VARIABLE_WARNING(tokenizer);
1003   assert(temporary_buffer_equals(parser, "<"));
1004   switch (c) {
1005     case '!':
1006       gumbo_tokenizer_set_state(parser, GUMBO_LEX_MARKUP_DECLARATION);
1007       clear_temporary_buffer(parser);
1008       return NEXT_CHAR;
1009     case '/':
1010       gumbo_tokenizer_set_state(parser, GUMBO_LEX_END_TAG_OPEN);
1011       append_char_to_temporary_buffer(parser, '/');
1012       return NEXT_CHAR;
1013     case '?':
1014       gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_COMMENT);
1015       clear_temporary_buffer(parser);
1016       append_char_to_temporary_buffer(parser, '?');
1017       tokenizer_add_parse_error(parser, GUMBO_ERR_TAG_STARTS_WITH_QUESTION);
1018       return NEXT_CHAR;
1019     default:
1020       if (gumbo_isalpha(c)) {
1021         gumbo_tokenizer_set_state(parser, GUMBO_LEX_TAG_NAME);
1022         start_new_tag(parser, true);
1023         return NEXT_CHAR;
1024       } else {
1025         tokenizer_add_parse_error(parser, GUMBO_ERR_TAG_INVALID);
1026         gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1027         emit_temporary_buffer(parser, output);
1028         return RETURN_ERROR;
1029       }
1030   }
1031 }
1032 
1033 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#end-tag-open-state
handle_end_tag_open_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1034 static StateResult handle_end_tag_open_state(GumboParser* parser,
1035     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1036   AVOID_UNUSED_VARIABLE_WARNING(tokenizer);
1037   assert(temporary_buffer_equals(parser, "</"));
1038   switch (c) {
1039     case '>':
1040       tokenizer_add_parse_error(parser, GUMBO_ERR_CLOSE_TAG_EMPTY);
1041       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1042       return NEXT_CHAR;
1043     case -1:
1044       tokenizer_add_parse_error(parser, GUMBO_ERR_CLOSE_TAG_EOF);
1045       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1046       return emit_temporary_buffer(parser, output);
1047     default:
1048       if (gumbo_isalpha(c)) {
1049         gumbo_tokenizer_set_state(parser, GUMBO_LEX_TAG_NAME);
1050         start_new_tag(parser, false);
1051       } else {
1052         tokenizer_add_parse_error(parser, GUMBO_ERR_CLOSE_TAG_INVALID);
1053         gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_COMMENT);
1054         clear_temporary_buffer(parser);
1055         append_char_to_temporary_buffer(parser, c);
1056       }
1057       return NEXT_CHAR;
1058   }
1059 }
1060 
1061 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#tag-name-state
handle_tag_name_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1062 static StateResult handle_tag_name_state(GumboParser* parser,
1063     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1064   AVOID_UNUSED_VARIABLE_WARNING(tokenizer);
1065   switch (c) {
1066     case '\t':
1067     case '\n':
1068     case '\f':
1069     case ' ':
1070       finish_tag_name(parser);
1071       gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1072       return NEXT_CHAR;
1073     case '/':
1074       finish_tag_name(parser);
1075       gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1076       return NEXT_CHAR;
1077     case '>':
1078       finish_tag_name(parser);
1079       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1080       return emit_current_tag(parser, output);
1081     case '\0':
1082       tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1083       append_char_to_tag_buffer(parser, kUtf8ReplacementChar, true);
1084       return NEXT_CHAR;
1085     case -1:
1086       tokenizer_add_parse_error(parser, GUMBO_ERR_TAG_EOF);
1087       abandon_current_tag(parser);
1088       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1089       return NEXT_CHAR;
1090     default:
1091       append_char_to_tag_buffer(parser, gumbo_tolower(c), true);
1092       return NEXT_CHAR;
1093   }
1094 }
1095 
1096 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#rcdata-less-than-sign-state
handle_rcdata_lt_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1097 static StateResult handle_rcdata_lt_state(GumboParser* parser,
1098     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1099   assert(temporary_buffer_equals(parser, "<"));
1100   if (c == '/') {
1101     gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA_END_TAG_OPEN);
1102     append_char_to_temporary_buffer(parser, '/');
1103     return NEXT_CHAR;
1104   } else {
1105     gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA);
1106     tokenizer->_reconsume_current_input = true;
1107     return emit_temporary_buffer(parser, output);
1108   }
1109 }
1110 
1111 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#rcdata-end-tag-open-state
handle_rcdata_end_tag_open_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1112 static StateResult handle_rcdata_end_tag_open_state(GumboParser* parser,
1113     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1114   AVOID_UNUSED_VARIABLE_WARNING(tokenizer);
1115   assert(temporary_buffer_equals(parser, "</"));
1116   if (gumbo_isalpha(c)) {
1117     gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA_END_TAG_NAME);
1118     start_new_tag(parser, false);
1119     append_char_to_temporary_buffer(parser, c);
1120     return NEXT_CHAR;
1121   } else {
1122     gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA);
1123     return emit_temporary_buffer(parser, output);
1124   }
1125   return true;
1126 }
1127 
1128 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#rcdata-end-tag-name-state
handle_rcdata_end_tag_name_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1129 static StateResult handle_rcdata_end_tag_name_state(GumboParser* parser,
1130     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1131 #ifndef NDEBUG
1132   assert(tokenizer->_temporary_buffer.length >= 2);
1133 #else
1134   AVOID_UNUSED_VARIABLE_WARNING(tokenizer);
1135 #endif
1136   if (gumbo_isalpha(c)) {
1137     append_char_to_tag_buffer(parser, gumbo_tolower(c), true);
1138     append_char_to_temporary_buffer(parser, c);
1139     return NEXT_CHAR;
1140   } else if (is_appropriate_end_tag(parser)) {
1141     switch (c) {
1142       case '\t':
1143       case '\n':
1144       case '\f':
1145       case ' ':
1146         finish_tag_name(parser);
1147         gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1148         return NEXT_CHAR;
1149       case '/':
1150         finish_tag_name(parser);
1151         gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1152         return NEXT_CHAR;
1153       case '>':
1154         finish_tag_name(parser);
1155         gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1156         return emit_current_tag(parser, output);
1157     }
1158   }
1159   gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA);
1160   abandon_current_tag(parser);
1161   return emit_temporary_buffer(parser, output);
1162 }
1163 
1164 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rawtext-less-than-sign-state
handle_rawtext_lt_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1165 static StateResult handle_rawtext_lt_state(GumboParser* parser,
1166     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1167   assert(temporary_buffer_equals(parser, "<"));
1168   if (c == '/') {
1169     gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT_END_TAG_OPEN);
1170     append_char_to_temporary_buffer(parser, '/');
1171     return NEXT_CHAR;
1172   } else {
1173     gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT);
1174     tokenizer->_reconsume_current_input = true;
1175     return emit_temporary_buffer(parser, output);
1176   }
1177 }
1178 
1179 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rawtext-end-tag-open-state
handle_rawtext_end_tag_open_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1180 static StateResult handle_rawtext_end_tag_open_state(GumboParser* parser,
1181     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1182   AVOID_UNUSED_VARIABLE_WARNING(tokenizer);
1183   assert(temporary_buffer_equals(parser, "</"));
1184   if (gumbo_isalpha(c)) {
1185     gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT_END_TAG_NAME);
1186     start_new_tag(parser, false);
1187     append_char_to_temporary_buffer(parser, c);
1188     return NEXT_CHAR;
1189   } else {
1190     gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT);
1191     return emit_temporary_buffer(parser, output);
1192   }
1193 }
1194 
1195 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rawtext-end-tag-name-state
handle_rawtext_end_tag_name_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1196 static StateResult handle_rawtext_end_tag_name_state(GumboParser* parser,
1197     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1198   assert(tokenizer->_temporary_buffer.length >= 2);
1199   gumbo_debug("Last end tag: %*s\n", (int) tokenizer->_tag_state._buffer.length,
1200       tokenizer->_tag_state._buffer.data);
1201   if (gumbo_isalpha(c)) {
1202     append_char_to_tag_buffer(parser, gumbo_tolower(c), true);
1203     append_char_to_temporary_buffer(parser, c);
1204     return NEXT_CHAR;
1205   } else if (is_appropriate_end_tag(parser)) {
1206     gumbo_debug("Is an appropriate end tag.\n");
1207     switch (c) {
1208       case '\t':
1209       case '\n':
1210       case '\f':
1211       case ' ':
1212         finish_tag_name(parser);
1213         gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1214         return NEXT_CHAR;
1215       case '/':
1216         finish_tag_name(parser);
1217         gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1218         return NEXT_CHAR;
1219       case '>':
1220         finish_tag_name(parser);
1221         gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1222         return emit_current_tag(parser, output);
1223     }
1224   }
1225   gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT);
1226   abandon_current_tag(parser);
1227   return emit_temporary_buffer(parser, output);
1228 }
1229 
1230 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-less-than-sign-state
handle_script_lt_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1231 static StateResult handle_script_lt_state(GumboParser* parser,
1232     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1233   assert(temporary_buffer_equals(parser, "<"));
1234   if (c == '/') {
1235     gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_END_TAG_OPEN);
1236     append_char_to_temporary_buffer(parser, '/');
1237     return NEXT_CHAR;
1238   } else if (c == '!') {
1239     gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_START);
1240     append_char_to_temporary_buffer(parser, '!');
1241     return emit_temporary_buffer(parser, output);
1242   } else {
1243     gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1244     tokenizer->_reconsume_current_input = true;
1245     return emit_temporary_buffer(parser, output);
1246   }
1247 }
1248 
1249 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-end-tag-open-state
handle_script_end_tag_open_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1250 static StateResult handle_script_end_tag_open_state(GumboParser* parser,
1251     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1252   AVOID_UNUSED_VARIABLE_WARNING(tokenizer);
1253   assert(temporary_buffer_equals(parser, "</"));
1254   if (gumbo_isalpha(c)) {
1255     gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_END_TAG_NAME);
1256     start_new_tag(parser, false);
1257     append_char_to_temporary_buffer(parser, c);
1258     return NEXT_CHAR;
1259   } else {
1260     gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1261     return emit_temporary_buffer(parser, output);
1262   }
1263 }
1264 
1265 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-end-tag-name-state
handle_script_end_tag_name_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1266 static StateResult handle_script_end_tag_name_state(GumboParser* parser,
1267     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1268 #ifndef NDEBUG
1269   assert(tokenizer->_temporary_buffer.length >= 2);
1270 #else
1271   AVOID_UNUSED_VARIABLE_WARNING(tokenizer);
1272 #endif
1273   if (gumbo_isalpha(c)) {
1274     append_char_to_tag_buffer(parser, gumbo_tolower(c), true);
1275     append_char_to_temporary_buffer(parser, c);
1276     return NEXT_CHAR;
1277   } else if (is_appropriate_end_tag(parser)) {
1278     switch (c) {
1279       case '\t':
1280       case '\n':
1281       case '\f':
1282       case ' ':
1283         finish_tag_name(parser);
1284         gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1285         return NEXT_CHAR;
1286       case '/':
1287         finish_tag_name(parser);
1288         gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1289         return NEXT_CHAR;
1290       case '>':
1291         finish_tag_name(parser);
1292         gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1293         return emit_current_tag(parser, output);
1294     }
1295   }
1296   gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1297   abandon_current_tag(parser);
1298   return emit_temporary_buffer(parser, output);
1299 }
1300 
1301 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escape-start-state
handle_script_escaped_start_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1302 static StateResult handle_script_escaped_start_state(GumboParser* parser,
1303     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1304   if (c == '-') {
1305     gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_START_DASH);
1306     return emit_current_char(parser, output);
1307   } else {
1308     gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1309     tokenizer->_reconsume_current_input = true;
1310     return NEXT_CHAR;
1311   }
1312 }
1313 
1314 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escape-start-dash-state
handle_script_escaped_start_dash_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1315 static StateResult handle_script_escaped_start_dash_state(GumboParser* parser,
1316     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1317   if (c == '-') {
1318     gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_DASH_DASH);
1319     return emit_current_char(parser, output);
1320   } else {
1321     gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1322     tokenizer->_reconsume_current_input = true;
1323     return NEXT_CHAR;
1324   }
1325 }
1326 
1327 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-state
handle_script_escaped_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1328 static StateResult handle_script_escaped_state(GumboParser* parser,
1329     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1330   AVOID_UNUSED_VARIABLE_WARNING(tokenizer);
1331   switch (c) {
1332     case '-':
1333       gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_DASH);
1334       return emit_current_char(parser, output);
1335     case '<':
1336       gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_LT);
1337       clear_temporary_buffer(parser);
1338       append_char_to_temporary_buffer(parser, c);
1339       return NEXT_CHAR;
1340     case '\0':
1341       return emit_replacement_char(parser, output);
1342     case -1:
1343       tokenizer_add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
1344       return emit_eof(parser, output);
1345     default:
1346       return emit_current_char(parser, output);
1347   }
1348 }
1349 
1350 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-dash-state
handle_script_escaped_dash_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1351 static StateResult handle_script_escaped_dash_state(GumboParser* parser,
1352     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1353   AVOID_UNUSED_VARIABLE_WARNING(tokenizer);
1354   switch (c) {
1355     case '-':
1356       gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_DASH_DASH);
1357       return emit_current_char(parser, output);
1358     case '<':
1359       gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_LT);
1360       clear_temporary_buffer(parser);
1361       append_char_to_temporary_buffer(parser, c);
1362       return NEXT_CHAR;
1363     case '\0':
1364       gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1365       return emit_replacement_char(parser, output);
1366     case -1:
1367       tokenizer_add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
1368       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1369       return NEXT_CHAR;
1370     default:
1371       gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1372       return emit_current_char(parser, output);
1373   }
1374 }
1375 
1376 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-dash-dash-state
handle_script_escaped_dash_dash_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1377 static StateResult handle_script_escaped_dash_dash_state(GumboParser* parser,
1378     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1379   AVOID_UNUSED_VARIABLE_WARNING(tokenizer);
1380   switch (c) {
1381     case '-':
1382       return emit_current_char(parser, output);
1383     case '<':
1384       gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_LT);
1385       clear_temporary_buffer(parser);
1386       append_char_to_temporary_buffer(parser, c);
1387       return NEXT_CHAR;
1388     case '>':
1389       gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1390       return emit_current_char(parser, output);
1391     case '\0':
1392       gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1393       return emit_replacement_char(parser, output);
1394     case -1:
1395       tokenizer_add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
1396       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1397       return NEXT_CHAR;
1398     default:
1399       gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1400       return emit_current_char(parser, output);
1401   }
1402 }
1403 
1404 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-less-than-sign-state
handle_script_escaped_lt_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1405 static StateResult handle_script_escaped_lt_state(GumboParser* parser,
1406     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1407   assert(temporary_buffer_equals(parser, "<"));
1408   assert(!tokenizer->_script_data_buffer.length);
1409   if (c == '/') {
1410     gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_OPEN);
1411     append_char_to_temporary_buffer(parser, c);
1412     return NEXT_CHAR;
1413   } else if (gumbo_isalpha(c)) {
1414     gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_START);
1415     append_char_to_temporary_buffer(parser, c);
1416     gumbo_string_buffer_append_codepoint(
1417         gumbo_tolower(c), &tokenizer->_script_data_buffer);
1418     return emit_temporary_buffer(parser, output);
1419   } else {
1420     gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1421     return emit_temporary_buffer(parser, output);
1422   }
1423 }
1424 
1425 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-end-tag-open-state
handle_script_escaped_end_tag_open_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1426 static StateResult handle_script_escaped_end_tag_open_state(GumboParser* parser,
1427     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1428   AVOID_UNUSED_VARIABLE_WARNING(tokenizer);
1429   assert(temporary_buffer_equals(parser, "</"));
1430   if (gumbo_isalpha(c)) {
1431     gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_NAME);
1432     start_new_tag(parser, false);
1433     append_char_to_temporary_buffer(parser, c);
1434     return NEXT_CHAR;
1435   } else {
1436     gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1437     return emit_temporary_buffer(parser, output);
1438   }
1439 }
1440 
1441 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-end-tag-name-state
handle_script_escaped_end_tag_name_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1442 static StateResult handle_script_escaped_end_tag_name_state(GumboParser* parser,
1443     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1444 #ifndef NDEBUG
1445   assert(tokenizer->_temporary_buffer.length >= 2);
1446 #else
1447   AVOID_UNUSED_VARIABLE_WARNING(tokenizer);
1448 #endif
1449   if (gumbo_isalpha(c)) {
1450     append_char_to_tag_buffer(parser, gumbo_tolower(c), true);
1451     append_char_to_temporary_buffer(parser, c);
1452     return NEXT_CHAR;
1453   } else if (is_appropriate_end_tag(parser)) {
1454     switch (c) {
1455       case '\t':
1456       case '\n':
1457       case '\f':
1458       case ' ':
1459         finish_tag_name(parser);
1460         gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1461         return NEXT_CHAR;
1462       case '/':
1463         finish_tag_name(parser);
1464         gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1465         return NEXT_CHAR;
1466       case '>':
1467         finish_tag_name(parser);
1468         gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1469         return emit_current_tag(parser, output);
1470     }
1471   }
1472   gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1473   abandon_current_tag(parser);
1474   return emit_temporary_buffer(parser, output);
1475 }
1476 
1477 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escape-start-state
handle_script_double_escaped_start_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1478 static StateResult handle_script_double_escaped_start_state(GumboParser* parser,
1479     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1480   switch (c) {
1481     case '\t':
1482     case '\n':
1483     case '\f':
1484     case ' ':
1485     case '/':
1486     case '>':
1487       gumbo_tokenizer_set_state(parser,
1488           gumbo_string_equals(
1489               &kScriptTag, (GumboStringPiece*) &tokenizer->_script_data_buffer)
1490               ? GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED
1491               : GUMBO_LEX_SCRIPT_ESCAPED);
1492       return emit_current_char(parser, output);
1493     default:
1494       if (gumbo_isalpha(c)) {
1495         gumbo_string_buffer_append_codepoint(
1496             gumbo_tolower(c), &tokenizer->_script_data_buffer);
1497         return emit_current_char(parser, output);
1498       } else {
1499         gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1500         tokenizer->_reconsume_current_input = true;
1501         return NEXT_CHAR;
1502       }
1503   }
1504 }
1505 
1506 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escaped-state
handle_script_double_escaped_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1507 static StateResult handle_script_double_escaped_state(GumboParser* parser,
1508     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1509   AVOID_UNUSED_VARIABLE_WARNING(tokenizer);
1510   switch (c) {
1511     case '-':
1512       gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH);
1513       return emit_current_char(parser, output);
1514     case '<':
1515       gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_LT);
1516       return emit_current_char(parser, output);
1517     case '\0':
1518       return emit_replacement_char(parser, output);
1519     case -1:
1520       tokenizer_add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
1521       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1522       return NEXT_CHAR;
1523     default:
1524       return emit_current_char(parser, output);
1525   }
1526 }
1527 
1528 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escaped-dash-state
handle_script_double_escaped_dash_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1529 static StateResult handle_script_double_escaped_dash_state(GumboParser* parser,
1530     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1531   AVOID_UNUSED_VARIABLE_WARNING(tokenizer);
1532   switch (c) {
1533     case '-':
1534       gumbo_tokenizer_set_state(
1535           parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH_DASH);
1536       return emit_current_char(parser, output);
1537     case '<':
1538       gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_LT);
1539       return emit_current_char(parser, output);
1540     case '\0':
1541       gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1542       return emit_replacement_char(parser, output);
1543     case -1:
1544       tokenizer_add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
1545       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1546       return NEXT_CHAR;
1547     default:
1548       gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1549       return emit_current_char(parser, output);
1550   }
1551 }
1552 
1553 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escaped-dash-dash-state
handle_script_double_escaped_dash_dash_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1554 static StateResult handle_script_double_escaped_dash_dash_state(
1555     GumboParser* parser, GumboTokenizerState* tokenizer, int c,
1556     GumboToken* output) {
1557   AVOID_UNUSED_VARIABLE_WARNING(tokenizer);
1558   switch (c) {
1559     case '-':
1560       return emit_current_char(parser, output);
1561     case '<':
1562       gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_LT);
1563       return emit_current_char(parser, output);
1564     case '>':
1565       gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1566       return emit_current_char(parser, output);
1567     case '\0':
1568       gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1569       return emit_replacement_char(parser, output);
1570     case -1:
1571       tokenizer_add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
1572       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1573       return NEXT_CHAR;
1574     default:
1575       gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1576       return emit_current_char(parser, output);
1577   }
1578 }
1579 
1580 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escaped-less-than-sign-state
handle_script_double_escaped_lt_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1581 static StateResult handle_script_double_escaped_lt_state(GumboParser* parser,
1582     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1583   if (c == '/') {
1584     gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_END);
1585     gumbo_string_buffer_destroy(&tokenizer->_script_data_buffer);
1586     gumbo_string_buffer_init(&tokenizer->_script_data_buffer);
1587     return emit_current_char(parser, output);
1588   } else {
1589     gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1590     tokenizer->_reconsume_current_input = true;
1591     return NEXT_CHAR;
1592   }
1593 }
1594 
1595 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escape-end-state
handle_script_double_escaped_end_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1596 static StateResult handle_script_double_escaped_end_state(GumboParser* parser,
1597     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1598   switch (c) {
1599     case '\t':
1600     case '\n':
1601     case '\f':
1602     case ' ':
1603     case '/':
1604     case '>':
1605       gumbo_tokenizer_set_state(parser,
1606           gumbo_string_equals(
1607               &kScriptTag, (GumboStringPiece*) &tokenizer->_script_data_buffer)
1608               ? GUMBO_LEX_SCRIPT_ESCAPED
1609               : GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1610       return emit_current_char(parser, output);
1611     default:
1612       if (gumbo_isalpha(c)) {
1613         gumbo_string_buffer_append_codepoint(
1614             gumbo_tolower(c), &tokenizer->_script_data_buffer);
1615         return emit_current_char(parser, output);
1616       } else {
1617         gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1618         tokenizer->_reconsume_current_input = true;
1619         return NEXT_CHAR;
1620       }
1621   }
1622 }
1623 
1624 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#before-attribute-name-state
handle_before_attr_name_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1625 static StateResult handle_before_attr_name_state(GumboParser* parser,
1626     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1627   AVOID_UNUSED_VARIABLE_WARNING(tokenizer);
1628   switch (c) {
1629     case '\t':
1630     case '\n':
1631     case '\f':
1632     case ' ':
1633       return NEXT_CHAR;
1634     case '/':
1635       gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1636       return NEXT_CHAR;
1637     case '>':
1638       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1639       return emit_current_tag(parser, output);
1640     case '\0':
1641       tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1642       gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_NAME);
1643       append_char_to_temporary_buffer(parser, 0xfffd);
1644       return NEXT_CHAR;
1645     case -1:
1646       tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_NAME_EOF);
1647       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1648       abandon_current_tag(parser);
1649       return NEXT_CHAR;
1650     case '"':
1651     case '\'':
1652     case '<':
1653     case '=':
1654       tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_NAME_INVALID);
1655     // Fall through.
1656     default:
1657       gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_NAME);
1658       append_char_to_tag_buffer(parser, gumbo_tolower(c), true);
1659       return NEXT_CHAR;
1660   }
1661 }
1662 
1663 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#attribute-name-state
handle_attr_name_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1664 static StateResult handle_attr_name_state(GumboParser* parser,
1665     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1666   AVOID_UNUSED_VARIABLE_WARNING(tokenizer);
1667   switch (c) {
1668     case '\t':
1669     case '\n':
1670     case '\f':
1671     case ' ':
1672       finish_attribute_name(parser);
1673       gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_ATTR_NAME);
1674       return NEXT_CHAR;
1675     case '/':
1676       finish_attribute_name(parser);
1677       gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1678       return NEXT_CHAR;
1679     case '=':
1680       finish_attribute_name(parser);
1681       gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_VALUE);
1682       return NEXT_CHAR;
1683     case '>':
1684       finish_attribute_name(parser);
1685       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1686       return emit_current_tag(parser, output);
1687     case '\0':
1688       tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1689       append_char_to_tag_buffer(parser, kUtf8ReplacementChar, true);
1690       return NEXT_CHAR;
1691     case -1:
1692       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1693       abandon_current_tag(parser);
1694       tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_NAME_EOF);
1695       return NEXT_CHAR;
1696     case '"':
1697     case '\'':
1698     case '<':
1699       tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_NAME_INVALID);
1700     // Fall through.
1701     default:
1702       append_char_to_tag_buffer(parser, gumbo_tolower(c), true);
1703       return NEXT_CHAR;
1704   }
1705 }
1706 
1707 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#after-attribute-name-state
handle_after_attr_name_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1708 static StateResult handle_after_attr_name_state(GumboParser* parser,
1709     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1710   AVOID_UNUSED_VARIABLE_WARNING(tokenizer);
1711   switch (c) {
1712     case '\t':
1713     case '\n':
1714     case '\f':
1715     case ' ':
1716       return NEXT_CHAR;
1717     case '/':
1718       gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1719       return NEXT_CHAR;
1720     case '=':
1721       gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_VALUE);
1722       return NEXT_CHAR;
1723     case '>':
1724       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1725       return emit_current_tag(parser, output);
1726     case '\0':
1727       tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1728       gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_NAME);
1729       append_char_to_temporary_buffer(parser, 0xfffd);
1730       return NEXT_CHAR;
1731     case -1:
1732       tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_NAME_EOF);
1733       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1734       abandon_current_tag(parser);
1735       return NEXT_CHAR;
1736     case '"':
1737     case '\'':
1738     case '<':
1739       tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_NAME_INVALID);
1740     // Fall through.
1741     default:
1742       gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_NAME);
1743       append_char_to_tag_buffer(parser, gumbo_tolower(c), true);
1744       return NEXT_CHAR;
1745   }
1746 }
1747 
1748 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#before-attribute-value-state
handle_before_attr_value_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1749 static StateResult handle_before_attr_value_state(GumboParser* parser,
1750     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1751   switch (c) {
1752     case '\t':
1753     case '\n':
1754     case '\f':
1755     case ' ':
1756       return NEXT_CHAR;
1757     case '"':
1758       gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED);
1759       reset_tag_buffer_start_point(parser);
1760       return NEXT_CHAR;
1761     case '&':
1762       gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_UNQUOTED);
1763       tokenizer->_reconsume_current_input = true;
1764       return NEXT_CHAR;
1765     case '\'':
1766       gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED);
1767       reset_tag_buffer_start_point(parser);
1768       return NEXT_CHAR;
1769     case '\0':
1770       tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1771       gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_UNQUOTED);
1772       append_char_to_tag_buffer(parser, kUtf8ReplacementChar, true);
1773       return NEXT_CHAR;
1774     case -1:
1775       tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_EOF);
1776       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1777       abandon_current_tag(parser);
1778       tokenizer->_reconsume_current_input = true;
1779       return NEXT_CHAR;
1780     case '>':
1781       tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_RIGHT_BRACKET);
1782       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1783       emit_current_tag(parser, output);
1784       return RETURN_ERROR;
1785     case '<':
1786     case '=':
1787     case '`':
1788       tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_EQUALS);
1789     // Fall through.
1790     default:
1791       gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_UNQUOTED);
1792       append_char_to_tag_buffer(parser, c, true);
1793       return NEXT_CHAR;
1794   }
1795 }
1796 
1797 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#attribute-value-double-quoted-state
handle_attr_value_double_quoted_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1798 static StateResult handle_attr_value_double_quoted_state(GumboParser* parser,
1799     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1800   AVOID_UNUSED_VARIABLE_WARNING(output);
1801   switch (c) {
1802     case '"':
1803       gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED);
1804       return NEXT_CHAR;
1805     case '&':
1806       tokenizer->_tag_state._attr_value_state = tokenizer->_state;
1807       gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_ATTR_VALUE);
1808       tokenizer->_reconsume_current_input = true;
1809       return NEXT_CHAR;
1810     case '\0':
1811       tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1812       append_char_to_tag_buffer(parser, kUtf8ReplacementChar, false);
1813       return NEXT_CHAR;
1814     case -1:
1815       tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_DOUBLE_QUOTE_EOF);
1816       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1817       abandon_current_tag(parser);
1818       tokenizer->_reconsume_current_input = true;
1819       return NEXT_CHAR;
1820     default:
1821       append_char_to_tag_buffer(parser, c, false);
1822       return NEXT_CHAR;
1823   }
1824 }
1825 
1826 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#attribute-value-single-quoted-state
handle_attr_value_single_quoted_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1827 static StateResult handle_attr_value_single_quoted_state(GumboParser* parser,
1828     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1829   AVOID_UNUSED_VARIABLE_WARNING(output);
1830   switch (c) {
1831     case '\'':
1832       gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED);
1833       return NEXT_CHAR;
1834     case '&':
1835       tokenizer->_tag_state._attr_value_state = tokenizer->_state;
1836       gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_ATTR_VALUE);
1837       tokenizer->_reconsume_current_input = true;
1838       return NEXT_CHAR;
1839     case '\0':
1840       tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1841       append_char_to_tag_buffer(parser, kUtf8ReplacementChar, false);
1842       return NEXT_CHAR;
1843     case -1:
1844       tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_SINGLE_QUOTE_EOF);
1845       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1846       abandon_current_tag(parser);
1847       tokenizer->_reconsume_current_input = true;
1848       return NEXT_CHAR;
1849     default:
1850       append_char_to_tag_buffer(parser, c, false);
1851       return NEXT_CHAR;
1852   }
1853 }
1854 
1855 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#attribute-value-unquoted-state
handle_attr_value_unquoted_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1856 static StateResult handle_attr_value_unquoted_state(GumboParser* parser,
1857     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1858   switch (c) {
1859     case '\t':
1860     case '\n':
1861     case '\f':
1862     case ' ':
1863       gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1864       finish_attribute_value(parser);
1865       return NEXT_CHAR;
1866     case '&':
1867       tokenizer->_tag_state._attr_value_state = tokenizer->_state;
1868       gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_ATTR_VALUE);
1869       tokenizer->_reconsume_current_input = true;
1870       return NEXT_CHAR;
1871     case '>':
1872       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1873       finish_attribute_value(parser);
1874       return emit_current_tag(parser, output);
1875     case '\0':
1876       tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1877       append_char_to_tag_buffer(parser, kUtf8ReplacementChar, true);
1878       return NEXT_CHAR;
1879     case -1:
1880       tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_EOF);
1881       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1882       tokenizer->_reconsume_current_input = true;
1883       abandon_current_tag(parser);
1884       return NEXT_CHAR;
1885     case '<':
1886     case '=':
1887     case '"':
1888     case '\'':
1889     case '`':
1890       tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_EQUALS);
1891     // Fall through.
1892     default:
1893       append_char_to_tag_buffer(parser, c, true);
1894       return NEXT_CHAR;
1895   }
1896 }
1897 
1898 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#character-reference-in-attribute-value-state
handle_char_ref_in_attr_value_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1899 static StateResult handle_char_ref_in_attr_value_state(GumboParser* parser,
1900     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1901   AVOID_UNUSED_VARIABLE_WARNING(output);
1902   AVOID_UNUSED_VARIABLE_WARNING(c);
1903   OneOrTwoCodepoints char_ref;
1904   int allowed_char;
1905   bool is_unquoted = false;
1906   switch (tokenizer->_tag_state._attr_value_state) {
1907     case GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED:
1908       allowed_char = '"';
1909       break;
1910     case GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED:
1911       allowed_char = '\'';
1912       break;
1913     case GUMBO_LEX_ATTR_VALUE_UNQUOTED:
1914       allowed_char = '>';
1915       is_unquoted = true;
1916       break;
1917     default:
1918       // -Wmaybe-uninitialized is a little overzealous here, and doesn't
1919       // get that the assert(0) means this codepath will never happen.
1920       allowed_char = ' ';
1921       assert(0);
1922   }
1923 
1924   // Ignore the status, since we don't have a convenient way of signalling that
1925   // a parser error has occurred when the error occurs in the middle of a
1926   // multi-state token.  We'd need a flag inside the TokenizerState to do this,
1927   // but that's a low priority fix.
1928   consume_char_ref(parser, &tokenizer->_input, allowed_char, true, &char_ref);
1929   if (char_ref.first != kGumboNoChar) {
1930     tokenizer->_reconsume_current_input = true;
1931     append_char_to_tag_buffer(parser, char_ref.first, is_unquoted);
1932     if (char_ref.second != kGumboNoChar) {
1933       append_char_to_tag_buffer(parser, char_ref.second, is_unquoted);
1934     }
1935   } else {
1936     append_char_to_tag_buffer(parser, '&', is_unquoted);
1937   }
1938   gumbo_tokenizer_set_state(parser, tokenizer->_tag_state._attr_value_state);
1939   return NEXT_CHAR;
1940 }
1941 
1942 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#after-attribute-value-quoted-state
handle_after_attr_value_quoted_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1943 static StateResult handle_after_attr_value_quoted_state(GumboParser* parser,
1944     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1945   finish_attribute_value(parser);
1946   switch (c) {
1947     case '\t':
1948     case '\n':
1949     case '\f':
1950     case ' ':
1951       gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1952       return NEXT_CHAR;
1953     case '/':
1954       gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1955       return NEXT_CHAR;
1956     case '>':
1957       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1958       return emit_current_tag(parser, output);
1959     case -1:
1960       tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_AFTER_EOF);
1961       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1962       abandon_current_tag(parser);
1963       tokenizer->_reconsume_current_input = true;
1964       return NEXT_CHAR;
1965     default:
1966       tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_AFTER_INVALID);
1967       gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1968       tokenizer->_reconsume_current_input = true;
1969       return NEXT_CHAR;
1970   }
1971 }
1972 
1973 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#self-closing-start-tag-state
handle_self_closing_start_tag_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1974 static StateResult handle_self_closing_start_tag_state(GumboParser* parser,
1975     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1976   switch (c) {
1977     case '>':
1978       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1979       tokenizer->_tag_state._is_self_closing = true;
1980       return emit_current_tag(parser, output);
1981     case -1:
1982       tokenizer_add_parse_error(parser, GUMBO_ERR_SOLIDUS_EOF);
1983       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1984       abandon_current_tag(parser);
1985       return NEXT_CHAR;
1986     default:
1987       tokenizer_add_parse_error(parser, GUMBO_ERR_SOLIDUS_INVALID);
1988       gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1989       tokenizer->_reconsume_current_input = true;
1990       return NEXT_CHAR;
1991   }
1992 }
1993 
1994 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#bogus-comment-state
handle_bogus_comment_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1995 static StateResult handle_bogus_comment_state(GumboParser* parser,
1996     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1997   while (c != '>' && c != -1) {
1998     if (c == '\0') {
1999       c = 0xFFFD;
2000     }
2001     append_char_to_temporary_buffer(parser, c);
2002     utf8iterator_next(&tokenizer->_input);
2003     c = utf8iterator_current(&tokenizer->_input);
2004   }
2005   gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2006   return emit_comment(parser, output);
2007 }
2008 
2009 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#markup-declaration-open-state
handle_markup_declaration_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2010 static StateResult handle_markup_declaration_state(GumboParser* parser,
2011     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2012   AVOID_UNUSED_VARIABLE_WARNING(output);
2013   AVOID_UNUSED_VARIABLE_WARNING(c);
2014   if (utf8iterator_maybe_consume_match(
2015           &tokenizer->_input, "--", sizeof("--") - 1, true)) {
2016     gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_START);
2017     tokenizer->_reconsume_current_input = true;
2018   } else if (utf8iterator_maybe_consume_match(
2019                  &tokenizer->_input, "DOCTYPE", sizeof("DOCTYPE") - 1, false)) {
2020     gumbo_tokenizer_set_state(parser, GUMBO_LEX_DOCTYPE);
2021     tokenizer->_reconsume_current_input = true;
2022     // If we get here, we know we'll eventually emit a doctype token, so now is
2023     // the time to initialize the doctype strings.  (Not in doctype_state_init,
2024     // since then they'll leak if ownership never gets transferred to the
2025     // doctype token.
2026     tokenizer->_doc_type_state.name = gumbo_strdup("");
2027     tokenizer->_doc_type_state.public_identifier = gumbo_strdup("");
2028     tokenizer->_doc_type_state.system_identifier = gumbo_strdup("");
2029   } else if (tokenizer->_is_current_node_foreign &&
2030              utf8iterator_maybe_consume_match(
2031                  &tokenizer->_input, "[CDATA[", sizeof("[CDATA[") - 1, true)) {
2032     gumbo_tokenizer_set_state(parser, GUMBO_LEX_CDATA);
2033     tokenizer->_is_in_cdata = true;
2034     tokenizer->_reconsume_current_input = true;
2035   } else {
2036     tokenizer_add_parse_error(parser, GUMBO_ERR_DASHES_OR_DOCTYPE);
2037     gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_COMMENT);
2038     tokenizer->_reconsume_current_input = true;
2039     clear_temporary_buffer(parser);
2040   }
2041   return NEXT_CHAR;
2042 }
2043 
2044 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-start-state
handle_comment_start_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2045 static StateResult handle_comment_start_state(GumboParser* parser,
2046     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2047   AVOID_UNUSED_VARIABLE_WARNING(tokenizer);
2048   switch (c) {
2049     case '-':
2050       gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_START_DASH);
2051       return NEXT_CHAR;
2052     case '\0':
2053       tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2054       gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2055       append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2056       return NEXT_CHAR;
2057     case '>':
2058       tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_INVALID);
2059       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2060       emit_comment(parser, output);
2061       return RETURN_ERROR;
2062     case -1:
2063       tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_EOF);
2064       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2065       emit_comment(parser, output);
2066       return RETURN_ERROR;
2067     default:
2068       gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2069       append_char_to_temporary_buffer(parser, c);
2070       return NEXT_CHAR;
2071   }
2072 }
2073 
2074 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-start-dash-state
handle_comment_start_dash_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2075 static StateResult handle_comment_start_dash_state(GumboParser* parser,
2076     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2077   AVOID_UNUSED_VARIABLE_WARNING(tokenizer);
2078   switch (c) {
2079     case '-':
2080       gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END);
2081       return NEXT_CHAR;
2082     case '\0':
2083       tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2084       gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2085       append_char_to_temporary_buffer(parser, '-');
2086       append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2087       return NEXT_CHAR;
2088     case '>':
2089       tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_INVALID);
2090       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2091       emit_comment(parser, output);
2092       return RETURN_ERROR;
2093     case -1:
2094       tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_EOF);
2095       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2096       emit_comment(parser, output);
2097       return RETURN_ERROR;
2098     default:
2099       gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2100       append_char_to_temporary_buffer(parser, '-');
2101       append_char_to_temporary_buffer(parser, c);
2102       return NEXT_CHAR;
2103   }
2104 }
2105 
2106 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-state
handle_comment_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2107 static StateResult handle_comment_state(GumboParser* parser,
2108     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2109   AVOID_UNUSED_VARIABLE_WARNING(tokenizer);
2110   switch (c) {
2111     case '-':
2112       gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END_DASH);
2113       return NEXT_CHAR;
2114     case '\0':
2115       tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2116       append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2117       return NEXT_CHAR;
2118     case -1:
2119       tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_EOF);
2120       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2121       emit_comment(parser, output);
2122       return RETURN_ERROR;
2123     default:
2124       append_char_to_temporary_buffer(parser, c);
2125       return NEXT_CHAR;
2126   }
2127 }
2128 
2129 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-end-dash-state
handle_comment_end_dash_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2130 static StateResult handle_comment_end_dash_state(GumboParser* parser,
2131     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2132   AVOID_UNUSED_VARIABLE_WARNING(tokenizer);
2133   switch (c) {
2134     case '-':
2135       gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END);
2136       return NEXT_CHAR;
2137     case '\0':
2138       tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2139       gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2140       append_char_to_temporary_buffer(parser, '-');
2141       append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2142       return NEXT_CHAR;
2143     case -1:
2144       tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_EOF);
2145       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2146       emit_comment(parser, output);
2147       return RETURN_ERROR;
2148     default:
2149       gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2150       append_char_to_temporary_buffer(parser, '-');
2151       append_char_to_temporary_buffer(parser, c);
2152       return NEXT_CHAR;
2153   }
2154 }
2155 
2156 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-end-state
handle_comment_end_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2157 static StateResult handle_comment_end_state(GumboParser* parser,
2158     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2159   AVOID_UNUSED_VARIABLE_WARNING(tokenizer);
2160   switch (c) {
2161     case '>':
2162       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2163       return emit_comment(parser, output);
2164     case '\0':
2165       tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2166       gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2167       append_char_to_temporary_buffer(parser, '-');
2168       append_char_to_temporary_buffer(parser, '-');
2169       append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2170       return NEXT_CHAR;
2171     case '!':
2172       tokenizer_add_parse_error(
2173           parser, GUMBO_ERR_COMMENT_BANG_AFTER_DOUBLE_DASH);
2174       gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END_BANG);
2175       return NEXT_CHAR;
2176     case '-':
2177       tokenizer_add_parse_error(
2178           parser, GUMBO_ERR_COMMENT_DASH_AFTER_DOUBLE_DASH);
2179       append_char_to_temporary_buffer(parser, '-');
2180       return NEXT_CHAR;
2181     case -1:
2182       tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2183       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2184       emit_comment(parser, output);
2185       return RETURN_ERROR;
2186     default:
2187       tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_INVALID);
2188       gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2189       append_char_to_temporary_buffer(parser, '-');
2190       append_char_to_temporary_buffer(parser, '-');
2191       append_char_to_temporary_buffer(parser, c);
2192       return NEXT_CHAR;
2193   }
2194 }
2195 
2196 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-end-bang-state
handle_comment_end_bang_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2197 static StateResult handle_comment_end_bang_state(GumboParser* parser,
2198     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2199   AVOID_UNUSED_VARIABLE_WARNING(tokenizer);
2200 
2201   switch (c) {
2202     case '-':
2203       gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END_DASH);
2204       append_char_to_temporary_buffer(parser, '-');
2205       append_char_to_temporary_buffer(parser, '-');
2206       append_char_to_temporary_buffer(parser, '!');
2207       return NEXT_CHAR;
2208     case '>':
2209       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2210       return emit_comment(parser, output);
2211     case '\0':
2212       tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2213       gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2214       append_char_to_temporary_buffer(parser, '-');
2215       append_char_to_temporary_buffer(parser, '-');
2216       append_char_to_temporary_buffer(parser, '!');
2217       append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2218       return NEXT_CHAR;
2219     case -1:
2220       tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_END_BANG_EOF);
2221       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2222       emit_comment(parser, output);
2223       return RETURN_ERROR;
2224     default:
2225       gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2226       append_char_to_temporary_buffer(parser, '-');
2227       append_char_to_temporary_buffer(parser, '-');
2228       append_char_to_temporary_buffer(parser, '!');
2229       append_char_to_temporary_buffer(parser, c);
2230       return NEXT_CHAR;
2231   }
2232 }
2233 
2234 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#doctype-state
handle_doctype_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2235 static StateResult handle_doctype_state(GumboParser* parser,
2236     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2237   assert(!tokenizer->_temporary_buffer.length);
2238   switch (c) {
2239     case '\t':
2240     case '\n':
2241     case '\f':
2242     case ' ':
2243       gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_NAME);
2244       return NEXT_CHAR;
2245     case -1:
2246       tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2247       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2248       tokenizer->_doc_type_state.force_quirks = true;
2249       emit_doctype(parser, output);
2250       return RETURN_ERROR;
2251     default:
2252       tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_SPACE);
2253       gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_NAME);
2254       tokenizer->_reconsume_current_input = true;
2255       tokenizer->_doc_type_state.force_quirks = true;
2256       return NEXT_CHAR;
2257   }
2258 }
2259 
2260 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#before-doctype-name-state
handle_before_doctype_name_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2261 static StateResult handle_before_doctype_name_state(GumboParser* parser,
2262     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2263   switch (c) {
2264     case '\t':
2265     case '\n':
2266     case '\f':
2267     case ' ':
2268       return NEXT_CHAR;
2269     case '\0':
2270       tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2271       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DOCTYPE_NAME);
2272       tokenizer->_doc_type_state.force_quirks = true;
2273       append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2274       return NEXT_CHAR;
2275     case '>':
2276       tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_RIGHT_BRACKET);
2277       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2278       tokenizer->_doc_type_state.force_quirks = true;
2279       emit_doctype(parser, output);
2280       return RETURN_ERROR;
2281     case -1:
2282       tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2283       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2284       tokenizer->_doc_type_state.force_quirks = true;
2285       emit_doctype(parser, output);
2286       return RETURN_ERROR;
2287     default:
2288       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DOCTYPE_NAME);
2289       tokenizer->_doc_type_state.force_quirks = false;
2290       append_char_to_temporary_buffer(parser, gumbo_tolower(c));
2291       return NEXT_CHAR;
2292   }
2293 }
2294 
2295 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#doctype-name-state
handle_doctype_name_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2296 static StateResult handle_doctype_name_state(GumboParser* parser,
2297     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2298   switch (c) {
2299     case '\t':
2300     case '\n':
2301     case '\f':
2302     case ' ':
2303       gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_NAME);
2304       gumbo_free((void*) tokenizer->_doc_type_state.name);
2305       finish_temporary_buffer(parser, &tokenizer->_doc_type_state.name);
2306       return NEXT_CHAR;
2307     case '>':
2308       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2309       gumbo_free((void*) tokenizer->_doc_type_state.name);
2310       finish_temporary_buffer(parser, &tokenizer->_doc_type_state.name);
2311       emit_doctype(parser, output);
2312       return RETURN_SUCCESS;
2313     case '\0':
2314       tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2315       append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2316       return NEXT_CHAR;
2317     case -1:
2318       tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2319       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2320       tokenizer->_doc_type_state.force_quirks = true;
2321       gumbo_free((void*) tokenizer->_doc_type_state.name);
2322       finish_temporary_buffer(parser, &tokenizer->_doc_type_state.name);
2323       emit_doctype(parser, output);
2324       return RETURN_ERROR;
2325     default:
2326       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DOCTYPE_NAME);
2327       tokenizer->_doc_type_state.force_quirks = false;
2328       append_char_to_temporary_buffer(parser, gumbo_tolower(c));
2329       return NEXT_CHAR;
2330   }
2331 }
2332 
2333 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-name-state
handle_after_doctype_name_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2334 static StateResult handle_after_doctype_name_state(GumboParser* parser,
2335     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2336   switch (c) {
2337     case '\t':
2338     case '\n':
2339     case '\f':
2340     case ' ':
2341       return NEXT_CHAR;
2342     case '>':
2343       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2344       emit_doctype(parser, output);
2345       return RETURN_SUCCESS;
2346     case -1:
2347       tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2348       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2349       tokenizer->_doc_type_state.force_quirks = true;
2350       emit_doctype(parser, output);
2351       return RETURN_ERROR;
2352     default:
2353       if (utf8iterator_maybe_consume_match(
2354               &tokenizer->_input, "PUBLIC", sizeof("PUBLIC") - 1, false)) {
2355         gumbo_tokenizer_set_state(
2356             parser, GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_KEYWORD);
2357         tokenizer->_reconsume_current_input = true;
2358       } else if (utf8iterator_maybe_consume_match(&tokenizer->_input, "SYSTEM",
2359                      sizeof("SYSTEM") - 1, false)) {
2360         gumbo_tokenizer_set_state(
2361             parser, GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_KEYWORD);
2362         tokenizer->_reconsume_current_input = true;
2363       } else {
2364         tokenizer_add_parse_error(
2365             parser, GUMBO_ERR_DOCTYPE_SPACE_OR_RIGHT_BRACKET);
2366         gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2367         tokenizer->_doc_type_state.force_quirks = true;
2368       }
2369       return NEXT_CHAR;
2370   }
2371 }
2372 
2373 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-public-keyword-state
handle_after_doctype_public_keyword_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2374 static StateResult handle_after_doctype_public_keyword_state(
2375     GumboParser* parser, GumboTokenizerState* tokenizer, int c,
2376     GumboToken* output) {
2377   switch (c) {
2378     case '\t':
2379     case '\n':
2380     case '\f':
2381     case ' ':
2382       gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_PUBLIC_ID);
2383       return NEXT_CHAR;
2384     case '"':
2385       tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2386       assert(temporary_buffer_equals(parser, ""));
2387       gumbo_tokenizer_set_state(
2388           parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED);
2389       return NEXT_CHAR;
2390     case '\'':
2391       tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2392       assert(temporary_buffer_equals(parser, ""));
2393       gumbo_tokenizer_set_state(
2394           parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_SINGLE_QUOTED);
2395       return NEXT_CHAR;
2396     case '>':
2397       tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_RIGHT_BRACKET);
2398       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2399       tokenizer->_doc_type_state.force_quirks = true;
2400       emit_doctype(parser, output);
2401       return RETURN_ERROR;
2402     case -1:
2403       tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2404       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2405       tokenizer->_doc_type_state.force_quirks = true;
2406       emit_doctype(parser, output);
2407       return RETURN_ERROR;
2408     default:
2409       tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2410       gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2411       tokenizer->_doc_type_state.force_quirks = true;
2412       emit_doctype(parser, output);
2413       return RETURN_ERROR;
2414   }
2415 }
2416 
2417 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#before-doctype-public-identifier-state
handle_before_doctype_public_id_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2418 static StateResult handle_before_doctype_public_id_state(GumboParser* parser,
2419     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2420   switch (c) {
2421     case '\t':
2422     case '\n':
2423     case '\f':
2424     case ' ':
2425       return NEXT_CHAR;
2426     case '"':
2427       assert(temporary_buffer_equals(parser, ""));
2428       gumbo_tokenizer_set_state(
2429           parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED);
2430       return NEXT_CHAR;
2431     case '\'':
2432       assert(temporary_buffer_equals(parser, ""));
2433       gumbo_tokenizer_set_state(
2434           parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_SINGLE_QUOTED);
2435       return NEXT_CHAR;
2436     case '>':
2437       tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2438       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2439       tokenizer->_doc_type_state.force_quirks = true;
2440       emit_doctype(parser, output);
2441       return RETURN_ERROR;
2442     case -1:
2443       tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2444       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2445       tokenizer->_doc_type_state.force_quirks = true;
2446       emit_doctype(parser, output);
2447       return RETURN_ERROR;
2448     default:
2449       tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2450       gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2451       tokenizer->_doc_type_state.force_quirks = true;
2452       emit_doctype(parser, output);
2453       return RETURN_ERROR;
2454   }
2455 }
2456 
2457 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#doctype-public-identifier-(double-quoted)-state
handle_doctype_public_id_double_quoted_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2458 static StateResult handle_doctype_public_id_double_quoted_state(
2459     GumboParser* parser, GumboTokenizerState* tokenizer, int c,
2460     GumboToken* output) {
2461   switch (c) {
2462     case '"':
2463       gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID);
2464       finish_doctype_public_id(parser);
2465       return NEXT_CHAR;
2466     case '\0':
2467       tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2468       append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2469       return NEXT_CHAR;
2470     case '>':
2471       tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2472       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2473       tokenizer->_doc_type_state.force_quirks = true;
2474       finish_doctype_public_id(parser);
2475       emit_doctype(parser, output);
2476       return RETURN_ERROR;
2477     case -1:
2478       tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2479       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2480       tokenizer->_doc_type_state.force_quirks = true;
2481       finish_doctype_public_id(parser);
2482       emit_doctype(parser, output);
2483       return RETURN_ERROR;
2484     default:
2485       append_char_to_temporary_buffer(parser, c);
2486       return NEXT_CHAR;
2487   }
2488 }
2489 
2490 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#doctype-public-identifier-(single-quoted)-state
handle_doctype_public_id_single_quoted_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2491 static StateResult handle_doctype_public_id_single_quoted_state(
2492     GumboParser* parser, GumboTokenizerState* tokenizer, int c,
2493     GumboToken* output) {
2494   switch (c) {
2495     case '\'':
2496       gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID);
2497       finish_doctype_public_id(parser);
2498       return NEXT_CHAR;
2499     case '\0':
2500       tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2501       append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2502       return NEXT_CHAR;
2503     case '>':
2504       tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2505       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2506       tokenizer->_doc_type_state.force_quirks = true;
2507       finish_doctype_public_id(parser);
2508       emit_doctype(parser, output);
2509       return RETURN_ERROR;
2510     case -1:
2511       tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2512       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2513       tokenizer->_doc_type_state.force_quirks = true;
2514       finish_doctype_public_id(parser);
2515       emit_doctype(parser, output);
2516       return RETURN_ERROR;
2517     default:
2518       append_char_to_temporary_buffer(parser, c);
2519       return NEXT_CHAR;
2520   }
2521 }
2522 
2523 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-public-identifier-state
handle_after_doctype_public_id_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2524 static StateResult handle_after_doctype_public_id_state(GumboParser* parser,
2525     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2526   switch (c) {
2527     case '\t':
2528     case '\n':
2529     case '\f':
2530     case ' ':
2531       gumbo_tokenizer_set_state(
2532           parser, GUMBO_LEX_BETWEEN_DOCTYPE_PUBLIC_SYSTEM_ID);
2533       return NEXT_CHAR;
2534     case '>':
2535       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2536       emit_doctype(parser, output);
2537       return RETURN_SUCCESS;
2538     case '"':
2539       tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2540       assert(temporary_buffer_equals(parser, ""));
2541       gumbo_tokenizer_set_state(
2542           parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED);
2543       return NEXT_CHAR;
2544     case '\'':
2545       tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2546       assert(temporary_buffer_equals(parser, ""));
2547       gumbo_tokenizer_set_state(
2548           parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
2549       return NEXT_CHAR;
2550     case -1:
2551       tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2552       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2553       tokenizer->_reconsume_current_input = true;
2554       tokenizer->_doc_type_state.force_quirks = true;
2555       emit_doctype(parser, output);
2556       return RETURN_ERROR;
2557     default:
2558       tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2559       gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2560       tokenizer->_doc_type_state.force_quirks = true;
2561       return NEXT_CHAR;
2562   }
2563 }
2564 
2565 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#between-doctype-public-and-system-identifiers-state
handle_between_doctype_public_system_id_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2566 static StateResult handle_between_doctype_public_system_id_state(
2567     GumboParser* parser, GumboTokenizerState* tokenizer, int c,
2568     GumboToken* output) {
2569   switch (c) {
2570     case '\t':
2571     case '\n':
2572     case '\f':
2573     case ' ':
2574       return NEXT_CHAR;
2575     case '>':
2576       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2577       emit_doctype(parser, output);
2578       return RETURN_SUCCESS;
2579     case '"':
2580       assert(temporary_buffer_equals(parser, ""));
2581       gumbo_tokenizer_set_state(
2582           parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED);
2583       return NEXT_CHAR;
2584     case '\'':
2585       assert(temporary_buffer_equals(parser, ""));
2586       gumbo_tokenizer_set_state(
2587           parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
2588       return NEXT_CHAR;
2589     case -1:
2590       tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2591       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2592       tokenizer->_doc_type_state.force_quirks = true;
2593       emit_doctype(parser, output);
2594       return RETURN_ERROR;
2595     default:
2596       tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2597       gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2598       tokenizer->_doc_type_state.force_quirks = true;
2599       emit_doctype(parser, output);
2600       return RETURN_ERROR;
2601   }
2602 }
2603 
2604 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-system-keyword-state
handle_after_doctype_system_keyword_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2605 static StateResult handle_after_doctype_system_keyword_state(
2606     GumboParser* parser, GumboTokenizerState* tokenizer, int c,
2607     GumboToken* output) {
2608   switch (c) {
2609     case '\t':
2610     case '\n':
2611     case '\f':
2612     case ' ':
2613       gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_SYSTEM_ID);
2614       return NEXT_CHAR;
2615     case '"':
2616       tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2617       assert(temporary_buffer_equals(parser, ""));
2618       gumbo_tokenizer_set_state(
2619           parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED);
2620       return NEXT_CHAR;
2621     case '\'':
2622       tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2623       assert(temporary_buffer_equals(parser, ""));
2624       gumbo_tokenizer_set_state(
2625           parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
2626       return NEXT_CHAR;
2627     case '>':
2628       tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2629       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2630       tokenizer->_doc_type_state.force_quirks = true;
2631       emit_doctype(parser, output);
2632       return RETURN_ERROR;
2633     case -1:
2634       tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2635       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2636       tokenizer->_doc_type_state.force_quirks = true;
2637       emit_doctype(parser, output);
2638       return RETURN_ERROR;
2639     default:
2640       tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2641       gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2642       tokenizer->_doc_type_state.force_quirks = true;
2643       return NEXT_CHAR;
2644   }
2645 }
2646 
2647 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#before-doctype-system-identifier-state
handle_before_doctype_system_id_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2648 static StateResult handle_before_doctype_system_id_state(GumboParser* parser,
2649     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2650   switch (c) {
2651     case '\t':
2652     case '\n':
2653     case '\f':
2654     case ' ':
2655       return NEXT_CHAR;
2656     case '"':
2657       assert(temporary_buffer_equals(parser, ""));
2658       gumbo_tokenizer_set_state(
2659           parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED);
2660       return NEXT_CHAR;
2661     case '\'':
2662       assert(temporary_buffer_equals(parser, ""));
2663       gumbo_tokenizer_set_state(
2664           parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
2665       return NEXT_CHAR;
2666     case '>':
2667       tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2668       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2669       tokenizer->_doc_type_state.force_quirks = true;
2670       emit_doctype(parser, output);
2671       return RETURN_ERROR;
2672     case -1:
2673       tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2674       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2675       tokenizer->_doc_type_state.force_quirks = true;
2676       emit_doctype(parser, output);
2677       return RETURN_ERROR;
2678     default:
2679       tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2680       gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2681       tokenizer->_doc_type_state.force_quirks = true;
2682       return NEXT_CHAR;
2683   }
2684 }
2685 
2686 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#doctype-system-identifier-(double-quoted)-state
handle_doctype_system_id_double_quoted_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2687 static StateResult handle_doctype_system_id_double_quoted_state(
2688     GumboParser* parser, GumboTokenizerState* tokenizer, int c,
2689     GumboToken* output) {
2690   switch (c) {
2691     case '"':
2692       gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID);
2693       finish_doctype_system_id(parser);
2694       return NEXT_CHAR;
2695     case '\0':
2696       tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2697       append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2698       return NEXT_CHAR;
2699     case '>':
2700       tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2701       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2702       tokenizer->_doc_type_state.force_quirks = true;
2703       finish_doctype_system_id(parser);
2704       emit_doctype(parser, output);
2705       return RETURN_ERROR;
2706     case -1:
2707       tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2708       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2709       tokenizer->_doc_type_state.force_quirks = true;
2710       finish_doctype_system_id(parser);
2711       emit_doctype(parser, output);
2712       return RETURN_ERROR;
2713     default:
2714       append_char_to_temporary_buffer(parser, c);
2715       return NEXT_CHAR;
2716   }
2717 }
2718 
2719 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#doctype-system-identifier-(single-quoted)-state
handle_doctype_system_id_single_quoted_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2720 static StateResult handle_doctype_system_id_single_quoted_state(
2721     GumboParser* parser, GumboTokenizerState* tokenizer, int c,
2722     GumboToken* output) {
2723   switch (c) {
2724     case '\'':
2725       gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID);
2726       finish_doctype_system_id(parser);
2727       return NEXT_CHAR;
2728     case '\0':
2729       tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2730       append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2731       return NEXT_CHAR;
2732     case '>':
2733       tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2734       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2735       tokenizer->_doc_type_state.force_quirks = true;
2736       finish_doctype_system_id(parser);
2737       emit_doctype(parser, output);
2738       return RETURN_ERROR;
2739     case -1:
2740       tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2741       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2742       tokenizer->_doc_type_state.force_quirks = true;
2743       finish_doctype_system_id(parser);
2744       emit_doctype(parser, output);
2745       return RETURN_ERROR;
2746     default:
2747       append_char_to_temporary_buffer(parser, c);
2748       return NEXT_CHAR;
2749   }
2750 }
2751 
2752 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-system-identifier-state
handle_after_doctype_system_id_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2753 static StateResult handle_after_doctype_system_id_state(GumboParser* parser,
2754     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2755   switch (c) {
2756     case '\t':
2757     case '\n':
2758     case '\f':
2759     case ' ':
2760       return NEXT_CHAR;
2761     case '>':
2762       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2763       emit_doctype(parser, output);
2764       return RETURN_SUCCESS;
2765     case -1:
2766       tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2767       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2768       tokenizer->_doc_type_state.force_quirks = true;
2769       emit_doctype(parser, output);
2770       return RETURN_ERROR;
2771     default:
2772       tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2773       gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2774       return NEXT_CHAR;
2775   }
2776 }
2777 
2778 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#bogus-doctype-state
handle_bogus_doctype_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2779 static StateResult handle_bogus_doctype_state(GumboParser* parser,
2780     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2781   AVOID_UNUSED_VARIABLE_WARNING(tokenizer);
2782   if (c == '>' || c == -1) {
2783     gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2784     emit_doctype(parser, output);
2785     return RETURN_ERROR;
2786   }
2787   return NEXT_CHAR;
2788 }
2789 
2790 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#cdata-section-state
handle_cdata_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2791 static StateResult handle_cdata_state(GumboParser* parser,
2792     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2793   if (c == -1 || utf8iterator_maybe_consume_match(
2794                      &tokenizer->_input, "]]>", sizeof("]]>") - 1, true)) {
2795     tokenizer->_reconsume_current_input = true;
2796     reset_token_start_point(tokenizer);
2797     gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2798     tokenizer->_is_in_cdata = false;
2799     return NEXT_CHAR;
2800   } else {
2801     return emit_current_char(parser, output);
2802   }
2803 }
2804 
2805 typedef StateResult (*GumboLexerStateFunction)(
2806     GumboParser*, GumboTokenizerState*, int, GumboToken*);
2807 
2808 static GumboLexerStateFunction dispatch_table[] = {handle_data_state,
2809     handle_char_ref_in_data_state, handle_rcdata_state,
2810     handle_char_ref_in_rcdata_state, handle_rawtext_state, handle_script_state,
2811     handle_plaintext_state, handle_tag_open_state, handle_end_tag_open_state,
2812     handle_tag_name_state, handle_rcdata_lt_state,
2813     handle_rcdata_end_tag_open_state, handle_rcdata_end_tag_name_state,
2814     handle_rawtext_lt_state, handle_rawtext_end_tag_open_state,
2815     handle_rawtext_end_tag_name_state, handle_script_lt_state,
2816     handle_script_end_tag_open_state, handle_script_end_tag_name_state,
2817     handle_script_escaped_start_state, handle_script_escaped_start_dash_state,
2818     handle_script_escaped_state, handle_script_escaped_dash_state,
2819     handle_script_escaped_dash_dash_state, handle_script_escaped_lt_state,
2820     handle_script_escaped_end_tag_open_state,
2821     handle_script_escaped_end_tag_name_state,
2822     handle_script_double_escaped_start_state,
2823     handle_script_double_escaped_state, handle_script_double_escaped_dash_state,
2824     handle_script_double_escaped_dash_dash_state,
2825     handle_script_double_escaped_lt_state,
2826     handle_script_double_escaped_end_state, handle_before_attr_name_state,
2827     handle_attr_name_state, handle_after_attr_name_state,
2828     handle_before_attr_value_state, handle_attr_value_double_quoted_state,
2829     handle_attr_value_single_quoted_state, handle_attr_value_unquoted_state,
2830     handle_char_ref_in_attr_value_state, handle_after_attr_value_quoted_state,
2831     handle_self_closing_start_tag_state, handle_bogus_comment_state,
2832     handle_markup_declaration_state, handle_comment_start_state,
2833     handle_comment_start_dash_state, handle_comment_state,
2834     handle_comment_end_dash_state, handle_comment_end_state,
2835     handle_comment_end_bang_state, handle_doctype_state,
2836     handle_before_doctype_name_state, handle_doctype_name_state,
2837     handle_after_doctype_name_state, handle_after_doctype_public_keyword_state,
2838     handle_before_doctype_public_id_state,
2839     handle_doctype_public_id_double_quoted_state,
2840     handle_doctype_public_id_single_quoted_state,
2841     handle_after_doctype_public_id_state,
2842     handle_between_doctype_public_system_id_state,
2843     handle_after_doctype_system_keyword_state,
2844     handle_before_doctype_system_id_state,
2845     handle_doctype_system_id_double_quoted_state,
2846     handle_doctype_system_id_single_quoted_state,
2847     handle_after_doctype_system_id_state, handle_bogus_doctype_state,
2848     handle_cdata_state};
2849 
// Runs the tokenizer until one token has been written to `output`.
//
// Returns true when the token was produced without a handler reporting an
// error (RETURN_SUCCESS, or a token replayed from a buffer), and false when
// the handler that produced it returned RETURN_ERROR.
bool gumbo_lex(GumboParser* parser, GumboToken* output) {
  // The spec imposes three constraints on this function:
  //
  // 1. The parser must handle each token as soon as it is emitted.
  // 2. Some states (eg. CDATA, or various error conditions) emit several
  // tokens from the same state.
  // 3. The tokenizer frequently has to reconsume one character in a different
  // state.
  //
  // Consequently nothing may live in locals across calls: everything is kept
  // in the GumboTokenizer struct, so we can return with a token and later
  // resume in the same state on the same input if another token is due.  The
  // emit_* functions are responsible for moving things forward (flushing the
  // chardata buffer, reading the next input character) so that this cannot
  // loop forever.
  GumboTokenizerState* tokenizer = parser->_tokenizer_state;

  if (tokenizer->_buffered_emit_char != kGumboNoChar) {
    // Replay the buffered character without advancing the input.
    tokenizer->_reconsume_current_input = true;
    emit_char(parser, tokenizer->_buffered_emit_char, output);
    // Input advancement has been avoided; drop the flag again so that the
    // *next* character is not consumed twice.
    tokenizer->_reconsume_current_input = false;
    tokenizer->_buffered_emit_char = kGumboNoChar;
    return true;
  }

  if (maybe_emit_from_temporary_buffer(parser, output)) {
    return true;
  }

  for (;;) {
    assert(!tokenizer->_temporary_buffer_emit);
    assert(tokenizer->_buffered_emit_char == kGumboNoChar);

    int c = utf8iterator_current(&tokenizer->_input);
    gumbo_debug(
        "Lexing character '%c' (%d) in state %d.\n", c, c, tokenizer->_state);

    GumboLexerStateFunction handler = dispatch_table[tokenizer->_state];
    StateResult result = handler(parser, tokenizer, c, output);

    // The reconsume flag has to be cleared before any return below, otherwise
    // certain states could loop forever.
    bool reconsume = tokenizer->_reconsume_current_input;
    tokenizer->_reconsume_current_input = false;

    if (result == RETURN_SUCCESS) {
      return true;
    }
    if (result == RETURN_ERROR) {
      return false;
    }

    // No token yet: move to the next character unless the handler asked for
    // the current one to be looked at again.
    if (!reconsume) {
      utf8iterator_next(&tokenizer->_input);
    }
  }
}
2906 
// Frees the heap data attached to `token` according to its type: the three
// doctype strings, the attributes of a start tag plus the attribute array, or
// the text of a comment.  Other token types, and a null `token`, are left
// alone.  The GumboToken struct itself is not freed here.
void gumbo_token_destroy(GumboToken* token) {
  if (!token) {
    return;
  }

  switch (token->type) {
    case GUMBO_TOKEN_DOCTYPE:
      gumbo_free((void*) token->v.doc_type.name);
      gumbo_free((void*) token->v.doc_type.public_identifier);
      gumbo_free((void*) token->v.doc_type.system_identifier);
      break;

    case GUMBO_TOKEN_START_TAG:
      for (unsigned int idx = 0; idx < token->v.start_tag.attributes.length;
           ++idx) {
        GumboAttribute* attribute = token->v.start_tag.attributes.data[idx];
        if (!attribute) {
          // May have been nulled out if this token was merged with another.
          continue;
        }
        gumbo_destroy_attribute(attribute);
      }
      gumbo_free((void*) token->v.start_tag.attributes.data);
      break;

    case GUMBO_TOKEN_COMMENT:
      gumbo_free((void*) token->v.text);
      break;

    default:
      break;
  }
}
2933