1 // Copyright 2010 Google Inc. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //     http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 //
15 // Author: jdtang@google.com (Jonathan Tang)
16 //
17 // Coding conventions specific to this file:
18 //
19 // 1. Functions that fill in a token should be named emit_*, and should be
20 // followed immediately by a return from the tokenizer (true if no error
21 // occurred, false if an error occurred).  Sometimes the emit functions
22 // themselves return a boolean so that they can be combined with the return
23 // statement; in this case, they should match this convention.
24 // 2. Functions that shuffle data from temporaries to final API structures
25 // should be named finish_*, and be called just before the tokenizer exits the
26 // state that accumulates the temporary.
27 // 3. All internal data structures should be kept in an initialized state from
28 // tokenizer creation onwards, ready to accept input.  When a buffer's flushed
29 // and reset, it should be deallocated and immediately reinitialized.
30 // 4. Make sure there are appropriate break statements following each state.
31 // 5. Assertions on the state of the temporary and tag buffers are usually a
32 // good idea, and should go at the entry point of each state when added.
// 6. Statement order within states goes:
//    1. Add parse errors, if appropriate.
//    2. Call finish_* functions to build up tag state.
//    3. Switch to new state.  Set _reconsume flag if appropriate.
//    4. Perform any other temporary buffer manipulation.
//    5. Emit tokens.
//    6. Return/break.
40 // This order ensures that we can verify that every emit is followed by a
41 // return, ensures that the correct state is recorded with any parse errors, and
42 // prevents parse error position from being messed up by possible mark/resets in
43 // temporary buffer manipulation.
44 
45 #include "tokenizer.h"
46 
47 #include <assert.h>
48 #include <stdbool.h>
49 #include <string.h>
50 
51 #include "attribute.h"
52 #include "char_ref.h"
53 #include "error.h"
54 #include "gumbo.h"
55 #include "parser.h"
56 #include "string_buffer.h"
57 #include "string_piece.h"
58 #include "token_type.h"
59 #include "tokenizer_states.h"
60 #include "utf8.h"
61 #include "util.h"
62 #include "vector.h"
63 
// Compared against _script_data_buffer to determine if we're in double-escaped
// script mode.  The second initializer (6) matches the number of characters in
// the literal "script".
const GumboStringPiece kScriptTag = {"script", 6};
67 
// Outcome reported by each individual state handler.
typedef enum {
  // Return false (error) from the tokenizer.
  RETURN_ERROR = 0,
  // Return true (success) from the tokenizer.
  RETURN_SUCCESS = 1,
  // Proceed to the next character and continue lexing.
  NEXT_CHAR = 2
} StateResult;
74 
// This is a struct containing state necessary to build up a tag token,
// character by character.
typedef struct GumboInternalTagState {
  // A buffer to accumulate characters for various GumboStringPiece fields.
  GumboStringBuffer _buffer;

  // A pointer to the start of the original text corresponding to the contents
  // of the buffer.
  const char* _original_text;

  // The current tag enum, computed once the tag name state has finished so that
  // the buffer can be re-used for building up attributes.
  GumboTag _tag;

  // The starting location of the text in the buffer.
  GumboSourcePosition _start_pos;

  // The current list of attributes.  This is copied (and ownership of its data
  // transferred) to the GumboStartTag token upon completion of the tag.  New
  // attributes are added as soon as their attribute name state is complete, and
  // values are filled in by operating on
  // _attributes.data[_attributes.length - 1].
  GumboVector /* GumboAttribute */ _attributes;

  // If true, the next attribute value to be finished should be dropped.  This
  // happens if a duplicate attribute name is encountered - we want to consume
  // the attribute value, but shouldn't overwrite the existing value.
  bool _drop_next_attr_value;

  // The state that caused the tokenizer to switch into a character reference in
  // attribute value state.  This is used to set the additional allowed
  // character, and is switched back to on completion.  Initialized as the
  // tokenizer enters the character reference state.
  GumboTokenizerEnum _attr_value_state;

  // The last start tag to have been emitted by the tokenizer.  This is
  // necessary to check for appropriate end tags.
  GumboTag _last_start_tag;

  // If true, then this is a start tag.  If false, it's an end tag.  This is
  // necessary to generate the appropriate token type at tag-closing time.
  bool _is_start_tag;

  // If true, then this tag is "self-closing" and doesn't have an end tag.
  bool _is_self_closing;
} GumboTagState;
120 
// This is the main tokenizer state struct, containing all state used in
// tokenizing the input stream.
typedef struct GumboInternalTokenizerState {
  // The current lexer state.  Starts in GUMBO_LEX_DATA.
  GumboTokenizerEnum _state;

  // A flag indicating whether the current input character needs to be
  // reconsumed in another state, or whether the next input character should be
  // read for the next iteration of the state loop.  This is set when the spec
  // reads "Reconsume the current input character in..."
  bool _reconsume_current_input;

  // A flag indicating whether the current node is a foreign element.  This is
  // set by gumbo_tokenizer_set_is_current_node_foreign and checked in the
  // markup declaration state.
  bool _is_current_node_foreign;

  // A flag indicating whether the tokenizer is in a CDATA section.  If so, then
  // text tokens emitted will be GUMBO_TOKEN_CDATA.
  bool _is_in_cdata;

  // Certain states (notably character references) may emit two character tokens
  // at once, but the contract for lex() fills in only one token at a time.  The
  // extra character is buffered here, and then this is checked on entry to
  // lex().  If a character is stored here, it's immediately emitted and control
  // returns from the lexer.  kGumboNoChar is used to represent 'no character
  // stored.'
  //
  // Note that characters emitted through this mechanism will have their source
  // position marked as the character under the mark, i.e. multiple characters
  // may be emitted with the same position.  This is desirable for character
  // references, but unsuitable for many other cases.  Use the _temporary_buffer
  // mechanism if the buffered characters must have their original positions in
  // the document.
  int _buffered_emit_char;

  // A temporary buffer to accumulate characters, as described by the "temporary
  // buffer" phrase in the tokenizer spec.  We use this in a somewhat unorthodox
  // way: we record the specific character to go into the buffer, which may
  // sometimes be a lowercased version of the actual input character.  However,
  // we *also* use utf8iterator_mark() to record the position at tag start.
  // When we start flushing the temporary buffer, we set _temporary_buffer_emit
  // to the start of it, and then increment it for each call to the tokenizer.
  // We also call utf8iterator_reset(), and utf8iterator_next() through the
  // input stream, so that tokens emitted by emit_char have the correct position
  // and original text.
  GumboStringBuffer _temporary_buffer;

  // The current cursor position we're emitting from within
  // _temporary_buffer.data.  NULL whenever we're not flushing the buffer.
  const char* _temporary_buffer_emit;

  // The temporary buffer is also used by the spec to check whether we should
  // enter the script data double escaped state, but we can't use the same
  // buffer for both because we have to flush out "<s" as emits while still
  // maintaining the context that will eventually become "script".  This is a
  // separate buffer that's used in place of the temporary buffer for states
  // that may enter the script data double escape start state.
  GumboStringBuffer _script_data_buffer;

  // Pointer to the beginning of the current token in the original buffer; used
  // to record the original text.
  const char* _token_start;

  // GumboSourcePosition recording the source location of the start of the
  // current token.
  GumboSourcePosition _token_start_pos;

  // Current tag state.
  GumboTagState _tag_state;

  // Doctype state.  We use the temporary buffer to accumulate characters (it's
  // not used for anything else in the doctype states), and then freshly
  // allocate the strings in the doctype token, then copy it over on emit.
  GumboTokenDocType _doc_type_state;

  // The UTF8Iterator over the tokenizer input.
  Utf8Iterator _input;
} GumboTokenizerState;
200 
201 // Adds an ERR_UNEXPECTED_CODE_POINT parse error to the parser's error struct.
tokenizer_add_parse_error(GumboParser * parser,GumboErrorType type)202 static void tokenizer_add_parse_error(
203     GumboParser* parser, GumboErrorType type) {
204   GumboError* error = gumbo_add_error(parser);
205   if (!error) {
206     return;
207   }
208   GumboTokenizerState* tokenizer = parser->_tokenizer_state;
209   utf8iterator_get_position(&tokenizer->_input, &error->position);
210   error->original_text = utf8iterator_get_char_pointer(&tokenizer->_input);
211   error->type = type;
212   error->v.tokenizer.codepoint = utf8iterator_current(&tokenizer->_input);
213   switch (tokenizer->_state) {
214     case GUMBO_LEX_DATA:
215       error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_DATA;
216       break;
217     case GUMBO_LEX_CHAR_REF_IN_DATA:
218     case GUMBO_LEX_CHAR_REF_IN_RCDATA:
219     case GUMBO_LEX_CHAR_REF_IN_ATTR_VALUE:
220       error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_CHAR_REF;
221       break;
222     case GUMBO_LEX_RCDATA:
223     case GUMBO_LEX_RCDATA_LT:
224     case GUMBO_LEX_RCDATA_END_TAG_OPEN:
225     case GUMBO_LEX_RCDATA_END_TAG_NAME:
226       error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_RCDATA;
227       break;
228     case GUMBO_LEX_RAWTEXT:
229     case GUMBO_LEX_RAWTEXT_LT:
230     case GUMBO_LEX_RAWTEXT_END_TAG_OPEN:
231     case GUMBO_LEX_RAWTEXT_END_TAG_NAME:
232       error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_RAWTEXT;
233       break;
234     case GUMBO_LEX_PLAINTEXT:
235       error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_PLAINTEXT;
236       break;
237     case GUMBO_LEX_SCRIPT:
238     case GUMBO_LEX_SCRIPT_LT:
239     case GUMBO_LEX_SCRIPT_END_TAG_OPEN:
240     case GUMBO_LEX_SCRIPT_END_TAG_NAME:
241     case GUMBO_LEX_SCRIPT_ESCAPED_START:
242     case GUMBO_LEX_SCRIPT_ESCAPED_START_DASH:
243     case GUMBO_LEX_SCRIPT_ESCAPED:
244     case GUMBO_LEX_SCRIPT_ESCAPED_DASH:
245     case GUMBO_LEX_SCRIPT_ESCAPED_DASH_DASH:
246     case GUMBO_LEX_SCRIPT_ESCAPED_LT:
247     case GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_OPEN:
248     case GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_NAME:
249     case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_START:
250     case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED:
251     case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH:
252     case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH_DASH:
253     case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_LT:
254     case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_END:
255       error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_SCRIPT;
256       break;
257     case GUMBO_LEX_TAG_OPEN:
258     case GUMBO_LEX_END_TAG_OPEN:
259     case GUMBO_LEX_TAG_NAME:
260     case GUMBO_LEX_BEFORE_ATTR_NAME:
261       error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_TAG;
262       break;
263     case GUMBO_LEX_SELF_CLOSING_START_TAG:
264       error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_SELF_CLOSING_TAG;
265       break;
266     case GUMBO_LEX_ATTR_NAME:
267     case GUMBO_LEX_AFTER_ATTR_NAME:
268     case GUMBO_LEX_BEFORE_ATTR_VALUE:
269       error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_ATTR_NAME;
270       break;
271     case GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED:
272     case GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED:
273     case GUMBO_LEX_ATTR_VALUE_UNQUOTED:
274     case GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED:
275       error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_ATTR_VALUE;
276       break;
277     case GUMBO_LEX_BOGUS_COMMENT:
278     case GUMBO_LEX_COMMENT_START:
279     case GUMBO_LEX_COMMENT_START_DASH:
280     case GUMBO_LEX_COMMENT:
281     case GUMBO_LEX_COMMENT_END_DASH:
282     case GUMBO_LEX_COMMENT_END:
283     case GUMBO_LEX_COMMENT_END_BANG:
284       error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_COMMENT;
285       break;
286     case GUMBO_LEX_MARKUP_DECLARATION:
287     case GUMBO_LEX_DOCTYPE:
288     case GUMBO_LEX_BEFORE_DOCTYPE_NAME:
289     case GUMBO_LEX_DOCTYPE_NAME:
290     case GUMBO_LEX_AFTER_DOCTYPE_NAME:
291     case GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_KEYWORD:
292     case GUMBO_LEX_BEFORE_DOCTYPE_PUBLIC_ID:
293     case GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED:
294     case GUMBO_LEX_DOCTYPE_PUBLIC_ID_SINGLE_QUOTED:
295     case GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID:
296     case GUMBO_LEX_BETWEEN_DOCTYPE_PUBLIC_SYSTEM_ID:
297     case GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_KEYWORD:
298     case GUMBO_LEX_BEFORE_DOCTYPE_SYSTEM_ID:
299     case GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED:
300     case GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED:
301     case GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID:
302     case GUMBO_LEX_BOGUS_DOCTYPE:
303       error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_DOCTYPE;
304       break;
305     case GUMBO_LEX_CDATA:
306       error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_CDATA;
307       break;
308   }
309 }
310 
// Returns true when c is an ASCII letter (A-Z or a-z).
static bool is_alpha(int c) {
  // The ISO C isupper/islower functions are deliberately avoided: they depend
  // on the program's locale, whereas the HTML5 spec's behavior does not.
  const bool is_upper = 'A' <= c && c <= 'Z';
  const bool is_lower = 'a' <= c && c <= 'z';
  return is_upper || is_lower;
}
317 
// Maps ASCII 'A'-'Z' to 'a'-'z'; every other value is returned unchanged.
static int ensure_lowercase(int c) {
  if (c < 'A' || c > 'Z') {
    return c;
  }
  return c + 0x20;
}
321 
get_char_token_type(bool is_in_cdata,int c)322 static GumboTokenType get_char_token_type(bool is_in_cdata, int c) {
323   if (is_in_cdata && c > 0) {
324     return GUMBO_TOKEN_CDATA;
325   }
326 
327   switch (c) {
328     case '\t':
329     case '\n':
330     case '\r':
331     case '\f':
332     case ' ':
333       return GUMBO_TOKEN_WHITESPACE;
334     case 0:
335       gumbo_debug("Emitted null byte.\n");
336       return GUMBO_TOKEN_NULL;
337     case -1:
338       return GUMBO_TOKEN_EOF;
339     default:
340       return GUMBO_TOKEN_CHARACTER;
341   }
342 }
343 
344 // Starts recording characters in the temporary buffer.
345 // Because this needs to reset the utf8iterator_mark to the beginning of the
346 // text that will eventually be emitted, it needs to be called a couple of
347 // states before the spec says "Set the temporary buffer to the empty string".
348 // In general, this should be called whenever there's a transition to a
349 // "less-than sign state".  The initial < and possibly / then need to be
350 // appended to the temporary buffer, their presence needs to be accounted for in
351 // states that compare the temporary buffer against a literal value, and
352 // spec stanzas that say "emit a < and / character token along with a character
353 // token for each character in the temporary buffer" need to be adjusted to
354 // account for the presence of the < and / inside the temporary buffer.
clear_temporary_buffer(GumboParser * parser)355 static void clear_temporary_buffer(GumboParser* parser) {
356   GumboTokenizerState* tokenizer = parser->_tokenizer_state;
357   assert(!tokenizer->_temporary_buffer_emit);
358   utf8iterator_mark(&tokenizer->_input);
359   gumbo_string_buffer_clear(parser, &tokenizer->_temporary_buffer);
360   // The temporary buffer and script data buffer are the same object in the
361   // spec, so the script data buffer should be cleared as well.
362   gumbo_string_buffer_clear(parser, &tokenizer->_script_data_buffer);
363 }
364 
365 // Appends a codepoint to the temporary buffer.
append_char_to_temporary_buffer(GumboParser * parser,int codepoint)366 static void append_char_to_temporary_buffer(
367     GumboParser* parser, int codepoint) {
368   gumbo_string_buffer_append_codepoint(
369       parser, codepoint, &parser->_tokenizer_state->_temporary_buffer);
370 }
371 
372 // Checks to see if the temporary buffer equals a certain string.
373 // Make sure this remains side-effect free; it's used in assertions.
374 #ifndef NDEBUG
temporary_buffer_equals(GumboParser * parser,const char * text)375 static bool temporary_buffer_equals(GumboParser* parser, const char* text) {
376   GumboStringBuffer* buffer = &parser->_tokenizer_state->_temporary_buffer;
377   // TODO(jdtang): See if the extra strlen is a performance problem, and replace
378   // it with an explicit sizeof(literal) if necessary.  I don't think it will
379   // be, as this is only used in a couple of rare states.
380   int text_len = strlen(text);
381   return text_len == buffer->length &&
382          memcmp(buffer->data, text, text_len) == 0;
383 }
384 #endif
385 
// Puts the tokenizer's doctype scratch state back to "nothing seen yet": all
// three string pointers NULL and all three flags false.
static void doc_type_state_init(GumboParser* parser) {
  GumboTokenDocType* doctype = &parser->_tokenizer_state->_doc_type_state;

  // The strings start out NULL so that nothing is leaked when no doctype token
  // is ever seen.  Once a doctype token does appear they are replaced with
  // freshly-allocated empty strings, giving client code a uniform interface
  // with no null checks.  Ownership passes to the doctype token on emit.
  doctype->name = NULL;
  doctype->public_identifier = NULL;
  doctype->system_identifier = NULL;

  doctype->force_quirks = false;
  doctype->has_public_identifier = false;
  doctype->has_system_identifier = false;
}
401 
402 // Sets the token original_text and position to the current iterator position.
403 // This is necessary because [CDATA[ sections may include text that is ignored
404 // by the tokenizer.
reset_token_start_point(GumboTokenizerState * tokenizer)405 static void reset_token_start_point(GumboTokenizerState* tokenizer) {
406   tokenizer->_token_start = utf8iterator_get_char_pointer(&tokenizer->_input);
407   utf8iterator_get_position(&tokenizer->_input, &tokenizer->_token_start_pos);
408 }
409 
410 // Sets the tag buffer original text and start point to the current iterator
411 // position.  This is necessary because attribute names & values may have
412 // whitespace preceeding them, and so we can't assume that the actual token
413 // starting point was the end of the last tag buffer usage.
reset_tag_buffer_start_point(GumboParser * parser)414 static void reset_tag_buffer_start_point(GumboParser* parser) {
415   GumboTokenizerState* tokenizer = parser->_tokenizer_state;
416   GumboTagState* tag_state = &tokenizer->_tag_state;
417 
418   utf8iterator_get_position(&tokenizer->_input, &tag_state->_start_pos);
419   tag_state->_original_text = utf8iterator_get_char_pointer(&tokenizer->_input);
420 }
421 
422 // Moves the temporary buffer contents over to the specified output string,
423 // and clears the temporary buffer.
finish_temporary_buffer(GumboParser * parser,const char ** output)424 static void finish_temporary_buffer(GumboParser* parser, const char** output) {
425   GumboTokenizerState* tokenizer = parser->_tokenizer_state;
426   *output =
427       gumbo_string_buffer_to_string(parser, &tokenizer->_temporary_buffer);
428   clear_temporary_buffer(parser);
429 }
430 
431 // Advances the iterator past the end of the token, and then fills in the
432 // relevant position fields.  It's assumed that after every emit, the tokenizer
433 // will immediately return (letting the tree-construction stage read the filled
434 // in Token).  Thus, it's safe to advance the input stream here, since it will
435 // bypass the advance at the bottom of the state machine loop.
436 //
437 // Since this advances the iterator and resets the current input, make sure to
438 // call it after you've recorded any other data you need for the token.
finish_token(GumboParser * parser,GumboToken * token)439 static void finish_token(GumboParser* parser, GumboToken* token) {
440   GumboTokenizerState* tokenizer = parser->_tokenizer_state;
441   if (!tokenizer->_reconsume_current_input) {
442     utf8iterator_next(&tokenizer->_input);
443   }
444 
445   token->position = tokenizer->_token_start_pos;
446   token->original_text.data = tokenizer->_token_start;
447   reset_token_start_point(tokenizer);
448   token->original_text.length =
449       tokenizer->_token_start - token->original_text.data;
450   if (token->original_text.length > 0 &&
451       token->original_text.data[token->original_text.length - 1] == '\r') {
452     // The UTF8 iterator will ignore carriage returns in the input stream, which
453     // means that the next token may start one past a \r character.  The pointer
454     // arithmetic above results in that \r being appended to the original text
455     // of the preceding token, so we have to adjust its length here to chop the
456     // \r off.
457     --token->original_text.length;
458   }
459 }
460 
461 // Records the doctype public ID, assumed to be in the temporary buffer.
462 // Convenience method that also sets has_public_identifier to true.
finish_doctype_public_id(GumboParser * parser)463 static void finish_doctype_public_id(GumboParser* parser) {
464   GumboTokenDocType* doc_type_state =
465       &parser->_tokenizer_state->_doc_type_state;
466   gumbo_parser_deallocate(parser, (void*) doc_type_state->public_identifier);
467   finish_temporary_buffer(parser, &doc_type_state->public_identifier);
468   doc_type_state->has_public_identifier = true;
469 }
470 
471 // Records the doctype system ID, assumed to be in the temporary buffer.
472 // Convenience method that also sets has_system_identifier to true.
finish_doctype_system_id(GumboParser * parser)473 static void finish_doctype_system_id(GumboParser* parser) {
474   GumboTokenDocType* doc_type_state =
475       &parser->_tokenizer_state->_doc_type_state;
476   gumbo_parser_deallocate(parser, (void*) doc_type_state->system_identifier);
477   finish_temporary_buffer(parser, &doc_type_state->system_identifier);
478   doc_type_state->has_system_identifier = true;
479 }
480 
481 // Writes a single specified character to the output token.
emit_char(GumboParser * parser,int c,GumboToken * output)482 static void emit_char(GumboParser* parser, int c, GumboToken* output) {
483   output->type = get_char_token_type(parser->_tokenizer_state->_is_in_cdata, c);
484   output->v.character = c;
485   finish_token(parser, output);
486 }
487 
488 // Writes a replacement character token and records a parse error.
489 // Always returns RETURN_ERROR, per gumbo_lex return value.
emit_replacement_char(GumboParser * parser,GumboToken * output)490 static StateResult emit_replacement_char(
491     GumboParser* parser, GumboToken* output) {
492   // In all cases, this is because of a null byte in the input stream.
493   tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
494   emit_char(parser, kUtf8ReplacementChar, output);
495   return RETURN_ERROR;
496 }
497 
498 // Writes an EOF character token.  Always returns RETURN_SUCCESS.
emit_eof(GumboParser * parser,GumboToken * output)499 static StateResult emit_eof(GumboParser* parser, GumboToken* output) {
500   emit_char(parser, -1, output);
501   return RETURN_SUCCESS;
502 }
503 
504 // Writes the current input character out as a character token.
505 // Always returns RETURN_SUCCESS.
emit_current_char(GumboParser * parser,GumboToken * output)506 static bool emit_current_char(GumboParser* parser, GumboToken* output) {
507   emit_char(
508       parser, utf8iterator_current(&parser->_tokenizer_state->_input), output);
509   return RETURN_SUCCESS;
510 }
511 
512 // Writes out a doctype token, copying it from the tokenizer state.
emit_doctype(GumboParser * parser,GumboToken * output)513 static void emit_doctype(GumboParser* parser, GumboToken* output) {
514   output->type = GUMBO_TOKEN_DOCTYPE;
515   output->v.doc_type = parser->_tokenizer_state->_doc_type_state;
516   finish_token(parser, output);
517   doc_type_state_init(parser);
518 }
519 
520 // Debug-only function that explicitly sets the attribute vector data to NULL so
521 // it can be asserted on tag creation, verifying that there are no memory leaks.
mark_tag_state_as_empty(GumboTagState * tag_state)522 static void mark_tag_state_as_empty(GumboTagState* tag_state) {
523 #ifndef NDEBUG
524   tag_state->_attributes = kGumboEmptyVector;
525 #endif
526 }
527 
528 // Writes out the current tag as a start or end tag token.
529 // Always returns RETURN_SUCCESS.
emit_current_tag(GumboParser * parser,GumboToken * output)530 static StateResult emit_current_tag(GumboParser* parser, GumboToken* output) {
531   GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
532   if (tag_state->_is_start_tag) {
533     output->type = GUMBO_TOKEN_START_TAG;
534     output->v.start_tag.tag = tag_state->_tag;
535     output->v.start_tag.attributes = tag_state->_attributes;
536     output->v.start_tag.is_self_closing = tag_state->_is_self_closing;
537     tag_state->_last_start_tag = tag_state->_tag;
538     mark_tag_state_as_empty(tag_state);
539     gumbo_debug(
540         "Emitted start tag %s.\n", gumbo_normalized_tagname(tag_state->_tag));
541   } else {
542     output->type = GUMBO_TOKEN_END_TAG;
543     output->v.end_tag = tag_state->_tag;
544     // In end tags, ownership of the attributes vector is not transferred to the
545     // token, but it's still initialized as normal, so it must be manually
546     // deallocated.  There may also be attributes to destroy, in certain broken
547     // cases like </div</th> (the "th" is an attribute there).
548     for (unsigned int i = 0; i < tag_state->_attributes.length; ++i) {
549       gumbo_destroy_attribute(parser, tag_state->_attributes.data[i]);
550     }
551     gumbo_parser_deallocate(parser, tag_state->_attributes.data);
552     mark_tag_state_as_empty(tag_state);
553     gumbo_debug(
554         "Emitted end tag %s.\n", gumbo_normalized_tagname(tag_state->_tag));
555   }
556   gumbo_string_buffer_destroy(parser, &tag_state->_buffer);
557   finish_token(parser, output);
558   gumbo_debug("Original text = %.*s.\n", output->original_text.length,
559       output->original_text.data);
560   assert(output->original_text.length >= 2);
561   assert(output->original_text.data[0] == '<');
562   assert(output->original_text.data[output->original_text.length - 1] == '>');
563   return RETURN_SUCCESS;
564 }
565 
566 // In some states, we speculatively start a tag, but don't know whether it'll be
567 // emitted as tag token or as a series of character tokens until we finish it.
568 // We need to abandon the tag we'd started & free its memory in that case to
569 // avoid a memory leak.
abandon_current_tag(GumboParser * parser)570 static void abandon_current_tag(GumboParser* parser) {
571   GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
572   for (unsigned int i = 0; i < tag_state->_attributes.length; ++i) {
573     gumbo_destroy_attribute(parser, tag_state->_attributes.data[i]);
574   }
575   gumbo_parser_deallocate(parser, tag_state->_attributes.data);
576   mark_tag_state_as_empty(tag_state);
577   gumbo_string_buffer_destroy(parser, &tag_state->_buffer);
578   gumbo_debug("Abandoning current tag.\n");
579 }
580 
581 // Wraps the consume_char_ref function to handle its output and make the
582 // appropriate TokenizerState modifications.  Returns RETURN_ERROR if a parse
583 // error occurred, RETURN_SUCCESS otherwise.
emit_char_ref(GumboParser * parser,int additional_allowed_char,bool is_in_attribute,GumboToken * output)584 static StateResult emit_char_ref(GumboParser* parser,
585     int additional_allowed_char, bool is_in_attribute, GumboToken* output) {
586   GumboTokenizerState* tokenizer = parser->_tokenizer_state;
587   OneOrTwoCodepoints char_ref;
588   bool status = consume_char_ref(
589       parser, &tokenizer->_input, additional_allowed_char, false, &char_ref);
590   if (char_ref.first != kGumboNoChar) {
591     // consume_char_ref ends with the iterator pointing at the next character,
592     // so we need to be sure not advance it again before reading the next token.
593     tokenizer->_reconsume_current_input = true;
594     emit_char(parser, char_ref.first, output);
595     tokenizer->_buffered_emit_char = char_ref.second;
596   } else {
597     emit_char(parser, '&', output);
598   }
599   return status ? RETURN_SUCCESS : RETURN_ERROR;
600 }
601 
602 // Emits a comment token.  Comments use the temporary buffer to accumulate their
603 // data, and then it's copied over and released to the 'text' field of the
604 // GumboToken union.  Always returns RETURN_SUCCESS.
emit_comment(GumboParser * parser,GumboToken * output)605 static StateResult emit_comment(GumboParser* parser, GumboToken* output) {
606   output->type = GUMBO_TOKEN_COMMENT;
607   finish_temporary_buffer(parser, &output->v.text);
608   finish_token(parser, output);
609   return RETURN_SUCCESS;
610 }
611 
612 // Checks to see we should be flushing accumulated characters in the temporary
613 // buffer, and fills the output token with the next output character if so.
614 // Returns true if a character has been emitted and the tokenizer should
615 // immediately return, false if we're at the end of the temporary buffer and
616 // should resume normal operation.
maybe_emit_from_temporary_buffer(GumboParser * parser,GumboToken * output)617 static bool maybe_emit_from_temporary_buffer(
618     GumboParser* parser, GumboToken* output) {
619   GumboTokenizerState* tokenizer = parser->_tokenizer_state;
620   const char* c = tokenizer->_temporary_buffer_emit;
621   GumboStringBuffer* buffer = &tokenizer->_temporary_buffer;
622 
623   if (!c || c >= buffer->data + buffer->length) {
624     tokenizer->_temporary_buffer_emit = NULL;
625     return false;
626   }
627 
628   assert(*c == utf8iterator_current(&tokenizer->_input));
629   // emit_char also advances the input stream.  We need to do some juggling of
630   // the _reconsume_current_input flag to get the proper behavior when emitting
631   // previous tokens.  Basically, _reconsume_current_input should *never* be set
632   // when emitting anything from the temporary buffer, since those characters
633   // have already been advanced past.  However, it should be preserved so that
634   // when the *next* character is encountered again, the tokenizer knows not to
635   // advance past it.
636   bool saved_reconsume_state = tokenizer->_reconsume_current_input;
637   tokenizer->_reconsume_current_input = false;
638   emit_char(parser, *c, output);
639   ++tokenizer->_temporary_buffer_emit;
640   tokenizer->_reconsume_current_input = saved_reconsume_state;
641   return true;
642 }
643 
644 // Sets up the tokenizer to begin flushing the temporary buffer.
645 // This resets the input iterator stream to the start of the last tag, sets up
646 // _temporary_buffer_emit, and then (if the temporary buffer is non-empty) emits
647 // the first character in it.  It returns true if a character was emitted, false
648 // otherwise.
emit_temporary_buffer(GumboParser * parser,GumboToken * output)649 static bool emit_temporary_buffer(GumboParser* parser, GumboToken* output) {
650   GumboTokenizerState* tokenizer = parser->_tokenizer_state;
651   assert(tokenizer->_temporary_buffer.data);
652   utf8iterator_reset(&tokenizer->_input);
653   tokenizer->_temporary_buffer_emit = tokenizer->_temporary_buffer.data;
654   return maybe_emit_from_temporary_buffer(parser, output);
655 }
656 
657 // Appends a codepoint to the current tag buffer.  If
658 // reinitilize_position_on_first is set, this also initializes the tag buffer
659 // start point; the only time you would *not* want to pass true for this
660 // parameter is if you want the original_text to include character (like an
661 // opening quote) that doesn't appear in the value.
append_char_to_tag_buffer(GumboParser * parser,int codepoint,bool reinitilize_position_on_first)662 static void append_char_to_tag_buffer(
663     GumboParser* parser, int codepoint, bool reinitilize_position_on_first) {
664   GumboStringBuffer* buffer = &parser->_tokenizer_state->_tag_state._buffer;
665   if (buffer->length == 0 && reinitilize_position_on_first) {
666     reset_tag_buffer_start_point(parser);
667   }
668   gumbo_string_buffer_append_codepoint(parser, codepoint, buffer);
669 }
670 
671 // (Re-)initialize the tag buffer.  This also resets the original_text pointer
672 // and _start_pos field to point to the current position.
initialize_tag_buffer(GumboParser * parser)673 static void initialize_tag_buffer(GumboParser* parser) {
674   GumboTokenizerState* tokenizer = parser->_tokenizer_state;
675   GumboTagState* tag_state = &tokenizer->_tag_state;
676 
677   gumbo_string_buffer_init(parser, &tag_state->_buffer);
678   reset_tag_buffer_start_point(parser);
679 }
680 
681 // Initializes the tag_state to start a new tag, keeping track of the opening
682 // positions and original text.  Takes a boolean indicating whether this is a
683 // start or end tag.
start_new_tag(GumboParser * parser,bool is_start_tag)684 static void start_new_tag(GumboParser* parser, bool is_start_tag) {
685   GumboTokenizerState* tokenizer = parser->_tokenizer_state;
686   GumboTagState* tag_state = &tokenizer->_tag_state;
687   int c = utf8iterator_current(&tokenizer->_input);
688   assert(is_alpha(c));
689   c = ensure_lowercase(c);
690   assert(is_alpha(c));
691 
692   initialize_tag_buffer(parser);
693   gumbo_string_buffer_append_codepoint(parser, c, &tag_state->_buffer);
694 
695   assert(tag_state->_attributes.data == NULL);
696   // Initial size chosen by statistical analysis of a corpus of 60k webpages.
697   // 99.5% of elements have 0 attributes, 93% of the remainder have 1.  These
698   // numbers are a bit higher for more modern websites (eg. ~45% = 0, ~40% = 1
699   // for the HTML5 Spec), but still have basically 99% of nodes with <= 2 attrs.
700   gumbo_vector_init(parser, 1, &tag_state->_attributes);
701   tag_state->_drop_next_attr_value = false;
702   tag_state->_is_start_tag = is_start_tag;
703   tag_state->_is_self_closing = false;
704   gumbo_debug("Starting new tag.\n");
705 }
706 
707 // Fills in the specified char* with the contents of the tag buffer.
copy_over_tag_buffer(GumboParser * parser,const char ** output)708 static void copy_over_tag_buffer(GumboParser* parser, const char** output) {
709   GumboTokenizerState* tokenizer = parser->_tokenizer_state;
710   GumboTagState* tag_state = &tokenizer->_tag_state;
711   *output = gumbo_string_buffer_to_string(parser, &tag_state->_buffer);
712 }
713 
714 // Fills in:
715 // * The original_text GumboStringPiece with the portion of the original
716 // buffer that corresponds to the tag buffer.
717 // * The start_pos GumboSourcePosition with the start position of the tag
718 // buffer.
719 // * The end_pos GumboSourcePosition with the current source position.
copy_over_original_tag_text(GumboParser * parser,GumboStringPiece * original_text,GumboSourcePosition * start_pos,GumboSourcePosition * end_pos)720 static void copy_over_original_tag_text(GumboParser* parser,
721     GumboStringPiece* original_text, GumboSourcePosition* start_pos,
722     GumboSourcePosition* end_pos) {
723   GumboTokenizerState* tokenizer = parser->_tokenizer_state;
724   GumboTagState* tag_state = &tokenizer->_tag_state;
725 
726   original_text->data = tag_state->_original_text;
727   original_text->length = utf8iterator_get_char_pointer(&tokenizer->_input) -
728                           tag_state->_original_text;
729   if (original_text->data[original_text->length - 1] == '\r') {
730     // Since \r is skipped by the UTF-8 iterator, it can sometimes end up
731     // appended to the end of original text even when it's really the first part
732     // of the next character.  If we detect this situation, shrink the length of
733     // the original text by 1 to remove the carriage return.
734     --original_text->length;
735   }
736   *start_pos = tag_state->_start_pos;
737   utf8iterator_get_position(&tokenizer->_input, end_pos);
738 }
739 
740 // Releases and then re-initializes the tag buffer.
reinitialize_tag_buffer(GumboParser * parser)741 static void reinitialize_tag_buffer(GumboParser* parser) {
742   gumbo_parser_deallocate(
743       parser, parser->_tokenizer_state->_tag_state._buffer.data);
744   initialize_tag_buffer(parser);
745 }
746 
747 // Moves some data from the temporary buffer over the the tag-based fields in
748 // TagState.
finish_tag_name(GumboParser * parser)749 static void finish_tag_name(GumboParser* parser) {
750   GumboTokenizerState* tokenizer = parser->_tokenizer_state;
751   GumboTagState* tag_state = &tokenizer->_tag_state;
752 
753   tag_state->_tag =
754       gumbo_tagn_enum(tag_state->_buffer.data, tag_state->_buffer.length);
755   reinitialize_tag_buffer(parser);
756 }
757 
758 // Adds an ERR_DUPLICATE_ATTR parse error to the parser's error struct.
add_duplicate_attr_error(GumboParser * parser,const char * attr_name,int original_index,int new_index)759 static void add_duplicate_attr_error(GumboParser* parser, const char* attr_name,
760     int original_index, int new_index) {
761   GumboError* error = gumbo_add_error(parser);
762   if (!error) {
763     return;
764   }
765   GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
766   error->type = GUMBO_ERR_DUPLICATE_ATTR;
767   error->position = tag_state->_start_pos;
768   error->original_text = tag_state->_original_text;
769   error->v.duplicate_attr.original_index = original_index;
770   error->v.duplicate_attr.new_index = new_index;
771   copy_over_tag_buffer(parser, &error->v.duplicate_attr.name);
772   reinitialize_tag_buffer(parser);
773 }
774 
775 // Creates a new attribute in the current tag, copying the current tag buffer to
776 // the attribute's name.  The attribute's value starts out as the empty string
777 // (following the "Boolean attributes" section of the spec) and is only
778 // overwritten on finish_attribute_value().  If the attribute has already been
779 // specified, the new attribute is dropped, a parse error is added, and the
780 // function returns false.  Otherwise, this returns true.
finish_attribute_name(GumboParser * parser)781 static bool finish_attribute_name(GumboParser* parser) {
782   GumboTokenizerState* tokenizer = parser->_tokenizer_state;
783   GumboTagState* tag_state = &tokenizer->_tag_state;
784   // May've been set by a previous attribute without a value; reset it here.
785   tag_state->_drop_next_attr_value = false;
786   assert(tag_state->_attributes.data);
787   assert(tag_state->_attributes.capacity);
788 
789   GumboVector* /* GumboAttribute* */ attributes = &tag_state->_attributes;
790   for (unsigned int i = 0; i < attributes->length; ++i) {
791     GumboAttribute* attr = attributes->data[i];
792     if (strlen(attr->name) == tag_state->_buffer.length &&
793         memcmp(attr->name, tag_state->_buffer.data,
794             tag_state->_buffer.length) == 0) {
795       // Identical attribute; bail.
796       add_duplicate_attr_error(parser, attr->name, i, attributes->length);
797       tag_state->_drop_next_attr_value = true;
798       return false;
799     }
800   }
801 
802   GumboAttribute* attr = gumbo_parser_allocate(parser, sizeof(GumboAttribute));
803   attr->attr_namespace = GUMBO_ATTR_NAMESPACE_NONE;
804   copy_over_tag_buffer(parser, &attr->name);
805   copy_over_original_tag_text(
806       parser, &attr->original_name, &attr->name_start, &attr->name_end);
807   attr->value = gumbo_copy_stringz(parser, "");
808   copy_over_original_tag_text(
809       parser, &attr->original_value, &attr->name_start, &attr->name_end);
810   gumbo_vector_add(parser, attr, attributes);
811   reinitialize_tag_buffer(parser);
812   return true;
813 }
814 
815 // Finishes an attribute value.  This sets the value of the most recently added
816 // attribute to the current contents of the tag buffer.
finish_attribute_value(GumboParser * parser)817 static void finish_attribute_value(GumboParser* parser) {
818   GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
819   if (tag_state->_drop_next_attr_value) {
820     // Duplicate attribute name detected in an earlier state, so we have to
821     // ignore the value.
822     tag_state->_drop_next_attr_value = false;
823     reinitialize_tag_buffer(parser);
824     return;
825   }
826 
827   GumboAttribute* attr =
828       tag_state->_attributes.data[tag_state->_attributes.length - 1];
829   gumbo_parser_deallocate(parser, (void*) attr->value);
830   copy_over_tag_buffer(parser, &attr->value);
831   copy_over_original_tag_text(
832       parser, &attr->original_value, &attr->value_start, &attr->value_end);
833   reinitialize_tag_buffer(parser);
834 }
835 
836 // Returns true if the current end tag matches the last start tag emitted.
is_appropriate_end_tag(GumboParser * parser)837 static bool is_appropriate_end_tag(GumboParser* parser) {
838   GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
839   assert(!tag_state->_is_start_tag);
840   return tag_state->_last_start_tag != GUMBO_TAG_LAST &&
841          tag_state->_last_start_tag == gumbo_tagn_enum(tag_state->_buffer.data,
842                                            tag_state->_buffer.length);
843 }
844 
// Allocates the tokenizer state, attaches it to |parser|, and initializes it
// to tokenize the |text_length| bytes at |text|, starting in the data state.
void gumbo_tokenizer_state_init(
    GumboParser* parser, const char* text, size_t text_length) {
  GumboTokenizerState* state = gumbo_parser_allocate(parser, sizeof(*state));
  parser->_tokenizer_state = state;

  // Lexer mode and simple flags.
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
  state->_is_in_cdata = false;
  state->_is_current_node_foreign = false;
  state->_reconsume_current_input = false;

  // Temporary buffer; no flush is in progress initially.
  state->_buffered_emit_char = kGumboNoChar;
  state->_temporary_buffer_emit = NULL;
  gumbo_string_buffer_init(parser, &state->_temporary_buffer);

  // Tag state: no start tag has been seen yet.
  state->_tag_state._last_start_tag = GUMBO_TAG_LAST;
  mark_tag_state_as_empty(&state->_tag_state);

  gumbo_string_buffer_init(parser, &state->_script_data_buffer);

  // Input iterator; the first token starts at the very beginning of the text.
  state->_token_start = text;
  utf8iterator_init(parser, text, text_length, &state->_input);
  utf8iterator_get_position(&state->_input, &state->_token_start_pos);

  doc_type_state_init(parser);
}
868 
// Releases the tokenizer state attached to |parser|: the temporary buffer, the
// script data buffer, and the state structure itself.  The doctype strings
// are expected to have been cleared already (asserted).
void gumbo_tokenizer_state_destroy(GumboParser* parser) {
  GumboTokenizerState* state = parser->_tokenizer_state;

  // No partially built doctype may be outstanding at destruction time.
  assert(state->_doc_type_state.name == NULL);
  assert(state->_doc_type_state.public_identifier == NULL);
  assert(state->_doc_type_state.system_identifier == NULL);

  gumbo_string_buffer_destroy(parser, &state->_temporary_buffer);
  gumbo_string_buffer_destroy(parser, &state->_script_data_buffer);
  gumbo_parser_deallocate(parser, state);
}
878 
// Switches the tokenizer attached to |parser| to lexer state |state|.
void gumbo_tokenizer_set_state(GumboParser* parser, GumboTokenizerEnum state) {
  GumboTokenizerState* tokenizer = parser->_tokenizer_state;
  tokenizer->_state = state;
}
882 
// Records on the tokenizer whether the current node is foreign.  A debug
// message is logged only when the value actually changes.
void gumbo_tokenizer_set_is_current_node_foreign(
    GumboParser* parser, bool is_foreign) {
  GumboTokenizerState* tokenizer = parser->_tokenizer_state;
  const bool changed = (tokenizer->_is_current_node_foreign != is_foreign);
  if (changed) {
    const char* label = is_foreign ? "true" : "false";
    gumbo_debug("Toggling is_current_node_foreign to %s.\n", label);
  }
  tokenizer->_is_current_node_foreign = is_foreign;
}
891 
892 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#data-state
handle_data_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)893 static StateResult handle_data_state(GumboParser* parser,
894     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
895   switch (c) {
896     case '&':
897       gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_DATA);
898       // The char_ref machinery expects to be on the & so it can mark that
899       // and return to it if the text isn't a char ref, so we need to
900       // reconsume it.
901       tokenizer->_reconsume_current_input = true;
902       return NEXT_CHAR;
903     case '<':
904       gumbo_tokenizer_set_state(parser, GUMBO_LEX_TAG_OPEN);
905       clear_temporary_buffer(parser);
906       append_char_to_temporary_buffer(parser, '<');
907       return NEXT_CHAR;
908     case '\0':
909       tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
910       emit_char(parser, c, output);
911       return RETURN_ERROR;
912     default:
913       return emit_current_char(parser, output);
914   }
915 }
916 
917 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#character-reference-in-data-state
handle_char_ref_in_data_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)918 static StateResult handle_char_ref_in_data_state(GumboParser* parser,
919     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
920   gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
921   return emit_char_ref(parser, ' ', false, output);
922 }
923 
924 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rcdata-state
handle_rcdata_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)925 static StateResult handle_rcdata_state(GumboParser* parser,
926     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
927   switch (c) {
928     case '&':
929       gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_RCDATA);
930       tokenizer->_reconsume_current_input = true;
931       return NEXT_CHAR;
932     case '<':
933       gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA_LT);
934       clear_temporary_buffer(parser);
935       append_char_to_temporary_buffer(parser, '<');
936       return NEXT_CHAR;
937     case '\0':
938       return emit_replacement_char(parser, output);
939     case -1:
940       return emit_eof(parser, output);
941     default:
942       return emit_current_char(parser, output);
943   }
944 }
945 
946 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#character-reference-in-rcdata-state
handle_char_ref_in_rcdata_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)947 static StateResult handle_char_ref_in_rcdata_state(GumboParser* parser,
948     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
949   gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA);
950   return emit_char_ref(parser, ' ', false, output);
951 }
952 
953 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rawtext-state
handle_rawtext_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)954 static StateResult handle_rawtext_state(GumboParser* parser,
955     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
956   switch (c) {
957     case '<':
958       gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT_LT);
959       clear_temporary_buffer(parser);
960       append_char_to_temporary_buffer(parser, '<');
961       return NEXT_CHAR;
962     case '\0':
963       return emit_replacement_char(parser, output);
964     case -1:
965       return emit_eof(parser, output);
966     default:
967       return emit_current_char(parser, output);
968   }
969 }
970 
971 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-state
handle_script_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)972 static StateResult handle_script_state(GumboParser* parser,
973     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
974   switch (c) {
975     case '<':
976       gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_LT);
977       clear_temporary_buffer(parser);
978       append_char_to_temporary_buffer(parser, '<');
979       return NEXT_CHAR;
980     case '\0':
981       return emit_replacement_char(parser, output);
982     case -1:
983       return emit_eof(parser, output);
984     default:
985       return emit_current_char(parser, output);
986   }
987 }
988 
989 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#plaintext-state
handle_plaintext_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)990 static StateResult handle_plaintext_state(GumboParser* parser,
991     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
992   switch (c) {
993     case '\0':
994       return emit_replacement_char(parser, output);
995     case -1:
996       return emit_eof(parser, output);
997     default:
998       return emit_current_char(parser, output);
999   }
1000 }
1001 
1002 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#tag-open-state
handle_tag_open_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1003 static StateResult handle_tag_open_state(GumboParser* parser,
1004     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1005   assert(temporary_buffer_equals(parser, "<"));
1006   switch (c) {
1007     case '!':
1008       gumbo_tokenizer_set_state(parser, GUMBO_LEX_MARKUP_DECLARATION);
1009       clear_temporary_buffer(parser);
1010       return NEXT_CHAR;
1011     case '/':
1012       gumbo_tokenizer_set_state(parser, GUMBO_LEX_END_TAG_OPEN);
1013       append_char_to_temporary_buffer(parser, '/');
1014       return NEXT_CHAR;
1015     case '?':
1016       gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_COMMENT);
1017       clear_temporary_buffer(parser);
1018       append_char_to_temporary_buffer(parser, '?');
1019       tokenizer_add_parse_error(parser, GUMBO_ERR_TAG_STARTS_WITH_QUESTION);
1020       return NEXT_CHAR;
1021     default:
1022       if (is_alpha(c)) {
1023         gumbo_tokenizer_set_state(parser, GUMBO_LEX_TAG_NAME);
1024         start_new_tag(parser, true);
1025         return NEXT_CHAR;
1026       } else {
1027         tokenizer_add_parse_error(parser, GUMBO_ERR_TAG_INVALID);
1028         gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1029         emit_temporary_buffer(parser, output);
1030         return RETURN_ERROR;
1031       }
1032   }
1033 }
1034 
1035 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#end-tag-open-state
handle_end_tag_open_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1036 static StateResult handle_end_tag_open_state(GumboParser* parser,
1037     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1038   assert(temporary_buffer_equals(parser, "</"));
1039   switch (c) {
1040     case '>':
1041       tokenizer_add_parse_error(parser, GUMBO_ERR_CLOSE_TAG_EMPTY);
1042       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1043       return NEXT_CHAR;
1044     case -1:
1045       tokenizer_add_parse_error(parser, GUMBO_ERR_CLOSE_TAG_EOF);
1046       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1047       return emit_temporary_buffer(parser, output);
1048     default:
1049       if (is_alpha(c)) {
1050         gumbo_tokenizer_set_state(parser, GUMBO_LEX_TAG_NAME);
1051         start_new_tag(parser, false);
1052       } else {
1053         tokenizer_add_parse_error(parser, GUMBO_ERR_CLOSE_TAG_INVALID);
1054         gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_COMMENT);
1055         clear_temporary_buffer(parser);
1056         append_char_to_temporary_buffer(parser, c);
1057       }
1058       return NEXT_CHAR;
1059   }
1060 }
1061 
1062 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#tag-name-state
handle_tag_name_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1063 static StateResult handle_tag_name_state(GumboParser* parser,
1064     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1065   switch (c) {
1066     case '\t':
1067     case '\n':
1068     case '\f':
1069     case ' ':
1070       finish_tag_name(parser);
1071       gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1072       return NEXT_CHAR;
1073     case '/':
1074       finish_tag_name(parser);
1075       gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1076       return NEXT_CHAR;
1077     case '>':
1078       finish_tag_name(parser);
1079       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1080       return emit_current_tag(parser, output);
1081     case '\0':
1082       tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1083       append_char_to_tag_buffer(parser, kUtf8ReplacementChar, true);
1084       return NEXT_CHAR;
1085     case -1:
1086       tokenizer_add_parse_error(parser, GUMBO_ERR_TAG_EOF);
1087       abandon_current_tag(parser);
1088       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1089       return NEXT_CHAR;
1090     default:
1091       append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1092       return NEXT_CHAR;
1093   }
1094 }
1095 
1096 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#rcdata-less-than-sign-state
handle_rcdata_lt_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1097 static StateResult handle_rcdata_lt_state(GumboParser* parser,
1098     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1099   assert(temporary_buffer_equals(parser, "<"));
1100   if (c == '/') {
1101     gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA_END_TAG_OPEN);
1102     append_char_to_temporary_buffer(parser, '/');
1103     return NEXT_CHAR;
1104   } else {
1105     gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA);
1106     tokenizer->_reconsume_current_input = true;
1107     return emit_temporary_buffer(parser, output);
1108   }
1109 }
1110 
1111 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#rcdata-end-tag-open-state
handle_rcdata_end_tag_open_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1112 static StateResult handle_rcdata_end_tag_open_state(GumboParser* parser,
1113     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1114   assert(temporary_buffer_equals(parser, "</"));
1115   if (is_alpha(c)) {
1116     gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA_END_TAG_NAME);
1117     start_new_tag(parser, false);
1118     append_char_to_temporary_buffer(parser, c);
1119     return NEXT_CHAR;
1120   } else {
1121     gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA);
1122     return emit_temporary_buffer(parser, output);
1123   }
1124   return true;
1125 }
1126 
1127 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#rcdata-end-tag-name-state
handle_rcdata_end_tag_name_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1128 static StateResult handle_rcdata_end_tag_name_state(GumboParser* parser,
1129     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1130   assert(tokenizer->_temporary_buffer.length >= 2);
1131   if (is_alpha(c)) {
1132     append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1133     append_char_to_temporary_buffer(parser, c);
1134     return NEXT_CHAR;
1135   } else if (is_appropriate_end_tag(parser)) {
1136     switch (c) {
1137       case '\t':
1138       case '\n':
1139       case '\f':
1140       case ' ':
1141         finish_tag_name(parser);
1142         gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1143         return NEXT_CHAR;
1144       case '/':
1145         finish_tag_name(parser);
1146         gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1147         return NEXT_CHAR;
1148       case '>':
1149         finish_tag_name(parser);
1150         gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1151         return emit_current_tag(parser, output);
1152     }
1153   }
1154   gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA);
1155   abandon_current_tag(parser);
1156   return emit_temporary_buffer(parser, output);
1157 }
1158 
1159 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rawtext-less-than-sign-state
handle_rawtext_lt_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1160 static StateResult handle_rawtext_lt_state(GumboParser* parser,
1161     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1162   assert(temporary_buffer_equals(parser, "<"));
1163   if (c == '/') {
1164     gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT_END_TAG_OPEN);
1165     append_char_to_temporary_buffer(parser, '/');
1166     return NEXT_CHAR;
1167   } else {
1168     gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT);
1169     tokenizer->_reconsume_current_input = true;
1170     return emit_temporary_buffer(parser, output);
1171   }
1172 }
1173 
1174 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rawtext-end-tag-open-state
handle_rawtext_end_tag_open_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1175 static StateResult handle_rawtext_end_tag_open_state(GumboParser* parser,
1176     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1177   assert(temporary_buffer_equals(parser, "</"));
1178   if (is_alpha(c)) {
1179     gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT_END_TAG_NAME);
1180     start_new_tag(parser, false);
1181     append_char_to_temporary_buffer(parser, c);
1182     return NEXT_CHAR;
1183   } else {
1184     gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT);
1185     return emit_temporary_buffer(parser, output);
1186   }
1187 }
1188 
1189 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rawtext-end-tag-name-state
handle_rawtext_end_tag_name_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1190 static StateResult handle_rawtext_end_tag_name_state(GumboParser* parser,
1191     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1192   assert(tokenizer->_temporary_buffer.length >= 2);
1193   gumbo_debug("Last end tag: %*s\n", (int) tokenizer->_tag_state._buffer.length,
1194       tokenizer->_tag_state._buffer.data);
1195   if (is_alpha(c)) {
1196     append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1197     append_char_to_temporary_buffer(parser, c);
1198     return NEXT_CHAR;
1199   } else if (is_appropriate_end_tag(parser)) {
1200     gumbo_debug("Is an appropriate end tag.\n");
1201     switch (c) {
1202       case '\t':
1203       case '\n':
1204       case '\f':
1205       case ' ':
1206         finish_tag_name(parser);
1207         gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1208         return NEXT_CHAR;
1209       case '/':
1210         finish_tag_name(parser);
1211         gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1212         return NEXT_CHAR;
1213       case '>':
1214         finish_tag_name(parser);
1215         gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1216         return emit_current_tag(parser, output);
1217     }
1218   }
1219   gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT);
1220   abandon_current_tag(parser);
1221   return emit_temporary_buffer(parser, output);
1222 }
1223 
1224 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-less-than-sign-state
handle_script_lt_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1225 static StateResult handle_script_lt_state(GumboParser* parser,
1226     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1227   assert(temporary_buffer_equals(parser, "<"));
1228   if (c == '/') {
1229     gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_END_TAG_OPEN);
1230     append_char_to_temporary_buffer(parser, '/');
1231     return NEXT_CHAR;
1232   } else if (c == '!') {
1233     gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_START);
1234     append_char_to_temporary_buffer(parser, '!');
1235     return emit_temporary_buffer(parser, output);
1236   } else {
1237     gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1238     tokenizer->_reconsume_current_input = true;
1239     return emit_temporary_buffer(parser, output);
1240   }
1241 }
1242 
1243 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-end-tag-open-state
handle_script_end_tag_open_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1244 static StateResult handle_script_end_tag_open_state(GumboParser* parser,
1245     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1246   assert(temporary_buffer_equals(parser, "</"));
1247   if (is_alpha(c)) {
1248     gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_END_TAG_NAME);
1249     start_new_tag(parser, false);
1250     append_char_to_temporary_buffer(parser, c);
1251     return NEXT_CHAR;
1252   } else {
1253     gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1254     return emit_temporary_buffer(parser, output);
1255   }
1256 }
1257 
1258 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-end-tag-name-state
handle_script_end_tag_name_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1259 static StateResult handle_script_end_tag_name_state(GumboParser* parser,
1260     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1261   assert(tokenizer->_temporary_buffer.length >= 2);
1262   if (is_alpha(c)) {
1263     append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1264     append_char_to_temporary_buffer(parser, c);
1265     return NEXT_CHAR;
1266   } else if (is_appropriate_end_tag(parser)) {
1267     switch (c) {
1268       case '\t':
1269       case '\n':
1270       case '\f':
1271       case ' ':
1272         finish_tag_name(parser);
1273         gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1274         return NEXT_CHAR;
1275       case '/':
1276         finish_tag_name(parser);
1277         gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1278         return NEXT_CHAR;
1279       case '>':
1280         finish_tag_name(parser);
1281         gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1282         return emit_current_tag(parser, output);
1283     }
1284   }
1285   gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1286   abandon_current_tag(parser);
1287   return emit_temporary_buffer(parser, output);
1288 }
1289 
1290 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escape-start-state
handle_script_escaped_start_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1291 static StateResult handle_script_escaped_start_state(GumboParser* parser,
1292     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1293   if (c == '-') {
1294     gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_START_DASH);
1295     return emit_current_char(parser, output);
1296   } else {
1297     gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1298     tokenizer->_reconsume_current_input = true;
1299     return NEXT_CHAR;
1300   }
1301 }
1302 
1303 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escape-start-dash-state
handle_script_escaped_start_dash_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1304 static StateResult handle_script_escaped_start_dash_state(GumboParser* parser,
1305     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1306   if (c == '-') {
1307     gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_DASH_DASH);
1308     return emit_current_char(parser, output);
1309   } else {
1310     gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1311     tokenizer->_reconsume_current_input = true;
1312     return NEXT_CHAR;
1313   }
1314 }
1315 
1316 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-state
handle_script_escaped_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1317 static StateResult handle_script_escaped_state(GumboParser* parser,
1318     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1319   switch (c) {
1320     case '-':
1321       gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_DASH);
1322       return emit_current_char(parser, output);
1323     case '<':
1324       gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_LT);
1325       clear_temporary_buffer(parser);
1326       append_char_to_temporary_buffer(parser, c);
1327       return NEXT_CHAR;
1328     case '\0':
1329       return emit_replacement_char(parser, output);
1330     case -1:
1331       tokenizer_add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
1332       return emit_eof(parser, output);
1333     default:
1334       return emit_current_char(parser, output);
1335   }
1336 }
1337 
1338 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-dash-state
handle_script_escaped_dash_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1339 static StateResult handle_script_escaped_dash_state(GumboParser* parser,
1340     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1341   switch (c) {
1342     case '-':
1343       gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_DASH_DASH);
1344       return emit_current_char(parser, output);
1345     case '<':
1346       gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_LT);
1347       clear_temporary_buffer(parser);
1348       append_char_to_temporary_buffer(parser, c);
1349       return NEXT_CHAR;
1350     case '\0':
1351       gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1352       return emit_replacement_char(parser, output);
1353     case -1:
1354       tokenizer_add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
1355       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1356       return NEXT_CHAR;
1357     default:
1358       gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1359       return emit_current_char(parser, output);
1360   }
1361 }
1362 
1363 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-dash-dash-state
handle_script_escaped_dash_dash_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1364 static StateResult handle_script_escaped_dash_dash_state(GumboParser* parser,
1365     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1366   switch (c) {
1367     case '-':
1368       return emit_current_char(parser, output);
1369     case '<':
1370       gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_LT);
1371       clear_temporary_buffer(parser);
1372       append_char_to_temporary_buffer(parser, c);
1373       return NEXT_CHAR;
1374     case '>':
1375       gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1376       return emit_current_char(parser, output);
1377     case '\0':
1378       gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1379       return emit_replacement_char(parser, output);
1380     case -1:
1381       tokenizer_add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
1382       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1383       return NEXT_CHAR;
1384     default:
1385       gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1386       return emit_current_char(parser, output);
1387   }
1388 }
1389 
1390 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-less-than-sign-state
handle_script_escaped_lt_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1391 static StateResult handle_script_escaped_lt_state(GumboParser* parser,
1392     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1393   assert(temporary_buffer_equals(parser, "<"));
1394   assert(!tokenizer->_script_data_buffer.length);
1395   if (c == '/') {
1396     gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_OPEN);
1397     append_char_to_temporary_buffer(parser, c);
1398     return NEXT_CHAR;
1399   } else if (is_alpha(c)) {
1400     gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_START);
1401     append_char_to_temporary_buffer(parser, c);
1402     gumbo_string_buffer_append_codepoint(
1403         parser, ensure_lowercase(c), &tokenizer->_script_data_buffer);
1404     return emit_temporary_buffer(parser, output);
1405   } else {
1406     gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1407     return emit_temporary_buffer(parser, output);
1408   }
1409 }
1410 
1411 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-end-tag-open-state
handle_script_escaped_end_tag_open_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1412 static StateResult handle_script_escaped_end_tag_open_state(GumboParser* parser,
1413     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1414   assert(temporary_buffer_equals(parser, "</"));
1415   if (is_alpha(c)) {
1416     gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_NAME);
1417     start_new_tag(parser, false);
1418     append_char_to_temporary_buffer(parser, c);
1419     return NEXT_CHAR;
1420   } else {
1421     gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1422     return emit_temporary_buffer(parser, output);
1423   }
1424 }
1425 
1426 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-end-tag-name-state
handle_script_escaped_end_tag_name_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1427 static StateResult handle_script_escaped_end_tag_name_state(GumboParser* parser,
1428     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1429   assert(tokenizer->_temporary_buffer.length >= 2);
1430   if (is_alpha(c)) {
1431     append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1432     append_char_to_temporary_buffer(parser, c);
1433     return NEXT_CHAR;
1434   } else if (is_appropriate_end_tag(parser)) {
1435     switch (c) {
1436       case '\t':
1437       case '\n':
1438       case '\f':
1439       case ' ':
1440         finish_tag_name(parser);
1441         gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1442         return NEXT_CHAR;
1443       case '/':
1444         finish_tag_name(parser);
1445         gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1446         return NEXT_CHAR;
1447       case '>':
1448         finish_tag_name(parser);
1449         gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1450         return emit_current_tag(parser, output);
1451     }
1452   }
1453   gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1454   abandon_current_tag(parser);
1455   return emit_temporary_buffer(parser, output);
1456 }
1457 
1458 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escape-start-state
handle_script_double_escaped_start_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1459 static StateResult handle_script_double_escaped_start_state(GumboParser* parser,
1460     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1461   switch (c) {
1462     case '\t':
1463     case '\n':
1464     case '\f':
1465     case ' ':
1466     case '/':
1467     case '>':
1468       gumbo_tokenizer_set_state(
1469           parser, gumbo_string_equals(&kScriptTag,
1470                       (GumboStringPiece*) &tokenizer->_script_data_buffer)
1471                       ? GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED
1472                       : GUMBO_LEX_SCRIPT_ESCAPED);
1473       return emit_current_char(parser, output);
1474     default:
1475       if (is_alpha(c)) {
1476         gumbo_string_buffer_append_codepoint(
1477             parser, ensure_lowercase(c), &tokenizer->_script_data_buffer);
1478         return emit_current_char(parser, output);
1479       } else {
1480         gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1481         tokenizer->_reconsume_current_input = true;
1482         return NEXT_CHAR;
1483       }
1484   }
1485 }
1486 
1487 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escaped-state
handle_script_double_escaped_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1488 static StateResult handle_script_double_escaped_state(GumboParser* parser,
1489     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1490   switch (c) {
1491     case '-':
1492       gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH);
1493       return emit_current_char(parser, output);
1494     case '<':
1495       gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_LT);
1496       return emit_current_char(parser, output);
1497     case '\0':
1498       return emit_replacement_char(parser, output);
1499     case -1:
1500       tokenizer_add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
1501       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1502       return NEXT_CHAR;
1503     default:
1504       return emit_current_char(parser, output);
1505   }
1506 }
1507 
1508 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escaped-dash-state
handle_script_double_escaped_dash_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1509 static StateResult handle_script_double_escaped_dash_state(GumboParser* parser,
1510     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1511   switch (c) {
1512     case '-':
1513       gumbo_tokenizer_set_state(
1514           parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH_DASH);
1515       return emit_current_char(parser, output);
1516     case '<':
1517       gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_LT);
1518       return emit_current_char(parser, output);
1519     case '\0':
1520       gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1521       return emit_replacement_char(parser, output);
1522     case -1:
1523       tokenizer_add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
1524       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1525       return NEXT_CHAR;
1526     default:
1527       gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1528       return emit_current_char(parser, output);
1529   }
1530 }
1531 
1532 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escaped-dash-dash-state
handle_script_double_escaped_dash_dash_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1533 static StateResult handle_script_double_escaped_dash_dash_state(
1534     GumboParser* parser, GumboTokenizerState* tokenizer, int c,
1535     GumboToken* output) {
1536   switch (c) {
1537     case '-':
1538       return emit_current_char(parser, output);
1539     case '<':
1540       gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_LT);
1541       return emit_current_char(parser, output);
1542     case '>':
1543       gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1544       return emit_current_char(parser, output);
1545     case '\0':
1546       gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1547       return emit_replacement_char(parser, output);
1548     case -1:
1549       tokenizer_add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
1550       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1551       return NEXT_CHAR;
1552     default:
1553       gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1554       return emit_current_char(parser, output);
1555   }
1556 }
1557 
1558 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escaped-less-than-sign-state
handle_script_double_escaped_lt_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1559 static StateResult handle_script_double_escaped_lt_state(GumboParser* parser,
1560     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1561   if (c == '/') {
1562     gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_END);
1563     gumbo_string_buffer_clear(parser, &tokenizer->_script_data_buffer);
1564     return emit_current_char(parser, output);
1565   } else {
1566     gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1567     tokenizer->_reconsume_current_input = true;
1568     return NEXT_CHAR;
1569   }
1570 }
1571 
1572 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escape-end-state
handle_script_double_escaped_end_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1573 static StateResult handle_script_double_escaped_end_state(GumboParser* parser,
1574     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1575   switch (c) {
1576     case '\t':
1577     case '\n':
1578     case '\f':
1579     case ' ':
1580     case '/':
1581     case '>':
1582       gumbo_tokenizer_set_state(
1583           parser, gumbo_string_equals(&kScriptTag,
1584                       (GumboStringPiece*) &tokenizer->_script_data_buffer)
1585                       ? GUMBO_LEX_SCRIPT_ESCAPED
1586                       : GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1587       return emit_current_char(parser, output);
1588     default:
1589       if (is_alpha(c)) {
1590         gumbo_string_buffer_append_codepoint(
1591             parser, ensure_lowercase(c), &tokenizer->_script_data_buffer);
1592         return emit_current_char(parser, output);
1593       } else {
1594         gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1595         tokenizer->_reconsume_current_input = true;
1596         return NEXT_CHAR;
1597       }
1598   }
1599 }
1600 
1601 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#before-attribute-name-state
handle_before_attr_name_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1602 static StateResult handle_before_attr_name_state(GumboParser* parser,
1603     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1604   switch (c) {
1605     case '\t':
1606     case '\n':
1607     case '\f':
1608     case ' ':
1609       return NEXT_CHAR;
1610     case '/':
1611       gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1612       return NEXT_CHAR;
1613     case '>':
1614       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1615       return emit_current_tag(parser, output);
1616     case '\0':
1617       tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1618       gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_NAME);
1619       append_char_to_temporary_buffer(parser, 0xfffd);
1620       return NEXT_CHAR;
1621     case -1:
1622       tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_NAME_EOF);
1623       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1624       abandon_current_tag(parser);
1625       return NEXT_CHAR;
1626     case '"':
1627     case '\'':
1628     case '<':
1629     case '=':
1630       tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_NAME_INVALID);
1631     // Fall through.
1632     default:
1633       gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_NAME);
1634       append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1635       return NEXT_CHAR;
1636   }
1637 }
1638 
1639 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#attribute-name-state
handle_attr_name_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1640 static StateResult handle_attr_name_state(GumboParser* parser,
1641     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1642   switch (c) {
1643     case '\t':
1644     case '\n':
1645     case '\f':
1646     case ' ':
1647       finish_attribute_name(parser);
1648       gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_ATTR_NAME);
1649       return NEXT_CHAR;
1650     case '/':
1651       finish_attribute_name(parser);
1652       gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1653       return NEXT_CHAR;
1654     case '=':
1655       finish_attribute_name(parser);
1656       gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_VALUE);
1657       return NEXT_CHAR;
1658     case '>':
1659       finish_attribute_name(parser);
1660       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1661       return emit_current_tag(parser, output);
1662     case '\0':
1663       tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1664       append_char_to_tag_buffer(parser, kUtf8ReplacementChar, true);
1665       return NEXT_CHAR;
1666     case -1:
1667       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1668       abandon_current_tag(parser);
1669       tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_NAME_EOF);
1670       return NEXT_CHAR;
1671     case '"':
1672     case '\'':
1673     case '<':
1674       tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_NAME_INVALID);
1675     // Fall through.
1676     default:
1677       append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1678       return NEXT_CHAR;
1679   }
1680 }
1681 
1682 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#after-attribute-name-state
handle_after_attr_name_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1683 static StateResult handle_after_attr_name_state(GumboParser* parser,
1684     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1685   switch (c) {
1686     case '\t':
1687     case '\n':
1688     case '\f':
1689     case ' ':
1690       return NEXT_CHAR;
1691     case '/':
1692       gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1693       return NEXT_CHAR;
1694     case '=':
1695       gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_VALUE);
1696       return NEXT_CHAR;
1697     case '>':
1698       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1699       return emit_current_tag(parser, output);
1700     case '\0':
1701       tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1702       gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_NAME);
1703       append_char_to_temporary_buffer(parser, 0xfffd);
1704       return NEXT_CHAR;
1705     case -1:
1706       tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_NAME_EOF);
1707       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1708       abandon_current_tag(parser);
1709       return NEXT_CHAR;
1710     case '"':
1711     case '\'':
1712     case '<':
1713       tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_NAME_INVALID);
1714     // Fall through.
1715     default:
1716       gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_NAME);
1717       append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1718       return NEXT_CHAR;
1719   }
1720 }
1721 
1722 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#before-attribute-value-state
handle_before_attr_value_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1723 static StateResult handle_before_attr_value_state(GumboParser* parser,
1724     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1725   switch (c) {
1726     case '\t':
1727     case '\n':
1728     case '\f':
1729     case ' ':
1730       return NEXT_CHAR;
1731     case '"':
1732       gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED);
1733       reset_tag_buffer_start_point(parser);
1734       return NEXT_CHAR;
1735     case '&':
1736       gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_UNQUOTED);
1737       tokenizer->_reconsume_current_input = true;
1738       return NEXT_CHAR;
1739     case '\'':
1740       gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED);
1741       reset_tag_buffer_start_point(parser);
1742       return NEXT_CHAR;
1743     case '\0':
1744       tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1745       gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_UNQUOTED);
1746       append_char_to_tag_buffer(parser, kUtf8ReplacementChar, true);
1747       return NEXT_CHAR;
1748     case -1:
1749       tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_EOF);
1750       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1751       abandon_current_tag(parser);
1752       tokenizer->_reconsume_current_input = true;
1753       return NEXT_CHAR;
1754     case '>':
1755       tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_RIGHT_BRACKET);
1756       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1757       emit_current_tag(parser, output);
1758       return RETURN_ERROR;
1759     case '<':
1760     case '=':
1761     case '`':
1762       tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_EQUALS);
1763     // Fall through.
1764     default:
1765       gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_UNQUOTED);
1766       append_char_to_tag_buffer(parser, c, true);
1767       return NEXT_CHAR;
1768   }
1769 }
1770 
1771 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#attribute-value-double-quoted-state
handle_attr_value_double_quoted_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1772 static StateResult handle_attr_value_double_quoted_state(GumboParser* parser,
1773     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1774   switch (c) {
1775     case '"':
1776       gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED);
1777       return NEXT_CHAR;
1778     case '&':
1779       tokenizer->_tag_state._attr_value_state = tokenizer->_state;
1780       gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_ATTR_VALUE);
1781       tokenizer->_reconsume_current_input = true;
1782       return NEXT_CHAR;
1783     case '\0':
1784       tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1785       append_char_to_tag_buffer(parser, kUtf8ReplacementChar, false);
1786       return NEXT_CHAR;
1787     case -1:
1788       tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_DOUBLE_QUOTE_EOF);
1789       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1790       abandon_current_tag(parser);
1791       tokenizer->_reconsume_current_input = true;
1792       return NEXT_CHAR;
1793     default:
1794       append_char_to_tag_buffer(parser, c, false);
1795       return NEXT_CHAR;
1796   }
1797 }
1798 
1799 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#attribute-value-single-quoted-state
handle_attr_value_single_quoted_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1800 static StateResult handle_attr_value_single_quoted_state(GumboParser* parser,
1801     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1802   switch (c) {
1803     case '\'':
1804       gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED);
1805       return NEXT_CHAR;
1806     case '&':
1807       tokenizer->_tag_state._attr_value_state = tokenizer->_state;
1808       gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_ATTR_VALUE);
1809       tokenizer->_reconsume_current_input = true;
1810       return NEXT_CHAR;
1811     case '\0':
1812       tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1813       append_char_to_tag_buffer(parser, kUtf8ReplacementChar, false);
1814       return NEXT_CHAR;
1815     case -1:
1816       tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_SINGLE_QUOTE_EOF);
1817       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1818       abandon_current_tag(parser);
1819       tokenizer->_reconsume_current_input = true;
1820       return NEXT_CHAR;
1821     default:
1822       append_char_to_tag_buffer(parser, c, false);
1823       return NEXT_CHAR;
1824   }
1825 }
1826 
1827 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#attribute-value-unquoted-state
handle_attr_value_unquoted_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1828 static StateResult handle_attr_value_unquoted_state(GumboParser* parser,
1829     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1830   switch (c) {
1831     case '\t':
1832     case '\n':
1833     case '\f':
1834     case ' ':
1835       gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1836       finish_attribute_value(parser);
1837       return NEXT_CHAR;
1838     case '&':
1839       tokenizer->_tag_state._attr_value_state = tokenizer->_state;
1840       gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_ATTR_VALUE);
1841       tokenizer->_reconsume_current_input = true;
1842       return NEXT_CHAR;
1843     case '>':
1844       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1845       finish_attribute_value(parser);
1846       return emit_current_tag(parser, output);
1847     case '\0':
1848       tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1849       append_char_to_tag_buffer(parser, kUtf8ReplacementChar, true);
1850       return NEXT_CHAR;
1851     case -1:
1852       tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_EOF);
1853       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1854       tokenizer->_reconsume_current_input = true;
1855       abandon_current_tag(parser);
1856       return NEXT_CHAR;
1857     case '<':
1858     case '=':
1859     case '"':
1860     case '\'':
1861     case '`':
1862       tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_EQUALS);
1863     // Fall through.
1864     default:
1865       append_char_to_tag_buffer(parser, c, true);
1866       return NEXT_CHAR;
1867   }
1868 }
1869 
1870 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#character-reference-in-attribute-value-state
handle_char_ref_in_attr_value_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1871 static StateResult handle_char_ref_in_attr_value_state(GumboParser* parser,
1872     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1873   OneOrTwoCodepoints char_ref;
1874   int allowed_char;
1875   bool is_unquoted = false;
1876   switch (tokenizer->_tag_state._attr_value_state) {
1877     case GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED:
1878       allowed_char = '"';
1879       break;
1880     case GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED:
1881       allowed_char = '\'';
1882       break;
1883     case GUMBO_LEX_ATTR_VALUE_UNQUOTED:
1884       allowed_char = '>';
1885       is_unquoted = true;
1886       break;
1887     default:
1888       // -Wmaybe-uninitialized is a little overzealous here, and doesn't
1889       // get that the assert(0) means this codepath will never happen.
1890       allowed_char = ' ';
1891       assert(0);
1892   }
1893 
1894   // Ignore the status, since we don't have a convenient way of signalling that
1895   // a parser error has occurred when the error occurs in the middle of a
1896   // multi-state token.  We'd need a flag inside the TokenizerState to do this,
1897   // but that's a low priority fix.
1898   consume_char_ref(parser, &tokenizer->_input, allowed_char, true, &char_ref);
1899   if (char_ref.first != kGumboNoChar) {
1900     tokenizer->_reconsume_current_input = true;
1901     append_char_to_tag_buffer(parser, char_ref.first, is_unquoted);
1902     if (char_ref.second != kGumboNoChar) {
1903       append_char_to_tag_buffer(parser, char_ref.second, is_unquoted);
1904     }
1905   } else {
1906     append_char_to_tag_buffer(parser, '&', is_unquoted);
1907   }
1908   gumbo_tokenizer_set_state(parser, tokenizer->_tag_state._attr_value_state);
1909   return NEXT_CHAR;
1910 }
1911 
1912 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#after-attribute-value-quoted-state
handle_after_attr_value_quoted_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1913 static StateResult handle_after_attr_value_quoted_state(GumboParser* parser,
1914     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1915   finish_attribute_value(parser);
1916   switch (c) {
1917     case '\t':
1918     case '\n':
1919     case '\f':
1920     case ' ':
1921       gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1922       return NEXT_CHAR;
1923     case '/':
1924       gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1925       return NEXT_CHAR;
1926     case '>':
1927       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1928       return emit_current_tag(parser, output);
1929     case -1:
1930       tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_AFTER_EOF);
1931       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1932       abandon_current_tag(parser);
1933       tokenizer->_reconsume_current_input = true;
1934       return NEXT_CHAR;
1935     default:
1936       tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_AFTER_INVALID);
1937       gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1938       tokenizer->_reconsume_current_input = true;
1939       return NEXT_CHAR;
1940   }
1941 }
1942 
1943 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#self-closing-start-tag-state
handle_self_closing_start_tag_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1944 static StateResult handle_self_closing_start_tag_state(GumboParser* parser,
1945     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1946   switch (c) {
1947     case '>':
1948       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1949       tokenizer->_tag_state._is_self_closing = true;
1950       return emit_current_tag(parser, output);
1951     case -1:
1952       tokenizer_add_parse_error(parser, GUMBO_ERR_SOLIDUS_EOF);
1953       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1954       abandon_current_tag(parser);
1955       return NEXT_CHAR;
1956     default:
1957       tokenizer_add_parse_error(parser, GUMBO_ERR_SOLIDUS_INVALID);
1958       gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1959       tokenizer->_reconsume_current_input = true;
1960       return NEXT_CHAR;
1961   }
1962 }
1963 
1964 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#bogus-comment-state
handle_bogus_comment_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1965 static StateResult handle_bogus_comment_state(GumboParser* parser,
1966     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1967   while (c != '>' && c != -1) {
1968     if (c == '\0') {
1969       c = 0xFFFD;
1970     }
1971     append_char_to_temporary_buffer(parser, c);
1972     utf8iterator_next(&tokenizer->_input);
1973     c = utf8iterator_current(&tokenizer->_input);
1974   }
1975   gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1976   return emit_comment(parser, output);
1977 }
1978 
1979 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#markup-declaration-open-state
handle_markup_declaration_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)1980 static StateResult handle_markup_declaration_state(GumboParser* parser,
1981     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1982   if (utf8iterator_maybe_consume_match(
1983           &tokenizer->_input, "--", sizeof("--") - 1, true)) {
1984     gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_START);
1985     tokenizer->_reconsume_current_input = true;
1986   } else if (utf8iterator_maybe_consume_match(
1987                  &tokenizer->_input, "DOCTYPE", sizeof("DOCTYPE") - 1, false)) {
1988     gumbo_tokenizer_set_state(parser, GUMBO_LEX_DOCTYPE);
1989     tokenizer->_reconsume_current_input = true;
1990     // If we get here, we know we'll eventually emit a doctype token, so now is
1991     // the time to initialize the doctype strings.  (Not in doctype_state_init,
1992     // since then they'll leak if ownership never gets transferred to the
1993     // doctype token.
1994     tokenizer->_doc_type_state.name = gumbo_copy_stringz(parser, "");
1995     tokenizer->_doc_type_state.public_identifier =
1996         gumbo_copy_stringz(parser, "");
1997     tokenizer->_doc_type_state.system_identifier =
1998         gumbo_copy_stringz(parser, "");
1999   } else if (tokenizer->_is_current_node_foreign &&
2000              utf8iterator_maybe_consume_match(
2001                  &tokenizer->_input, "[CDATA[", sizeof("[CDATA[") - 1, true)) {
2002     gumbo_tokenizer_set_state(parser, GUMBO_LEX_CDATA);
2003     tokenizer->_is_in_cdata = true;
2004     tokenizer->_reconsume_current_input = true;
2005   } else {
2006     tokenizer_add_parse_error(parser, GUMBO_ERR_DASHES_OR_DOCTYPE);
2007     gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_COMMENT);
2008     tokenizer->_reconsume_current_input = true;
2009     clear_temporary_buffer(parser);
2010   }
2011   return NEXT_CHAR;
2012 }
2013 
2014 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-start-state
handle_comment_start_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2015 static StateResult handle_comment_start_state(GumboParser* parser,
2016     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2017   switch (c) {
2018     case '-':
2019       gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_START_DASH);
2020       return NEXT_CHAR;
2021     case '\0':
2022       tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2023       gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2024       append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2025       return NEXT_CHAR;
2026     case '>':
2027       tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_INVALID);
2028       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2029       emit_comment(parser, output);
2030       return RETURN_ERROR;
2031     case -1:
2032       tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_EOF);
2033       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2034       emit_comment(parser, output);
2035       return RETURN_ERROR;
2036     default:
2037       gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2038       append_char_to_temporary_buffer(parser, c);
2039       return NEXT_CHAR;
2040   }
2041 }
2042 
2043 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-start-dash-state
handle_comment_start_dash_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2044 static StateResult handle_comment_start_dash_state(GumboParser* parser,
2045     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2046   switch (c) {
2047     case '-':
2048       gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END);
2049       return NEXT_CHAR;
2050     case '\0':
2051       tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2052       gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2053       append_char_to_temporary_buffer(parser, '-');
2054       append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2055       return NEXT_CHAR;
2056     case '>':
2057       tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_INVALID);
2058       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2059       emit_comment(parser, output);
2060       return RETURN_ERROR;
2061     case -1:
2062       tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_EOF);
2063       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2064       emit_comment(parser, output);
2065       return RETURN_ERROR;
2066     default:
2067       gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2068       append_char_to_temporary_buffer(parser, '-');
2069       append_char_to_temporary_buffer(parser, c);
2070       return NEXT_CHAR;
2071   }
2072 }
2073 
2074 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-state
handle_comment_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2075 static StateResult handle_comment_state(GumboParser* parser,
2076     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2077   switch (c) {
2078     case '-':
2079       gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END_DASH);
2080       return NEXT_CHAR;
2081     case '\0':
2082       tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2083       append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2084       return NEXT_CHAR;
2085     case -1:
2086       tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_EOF);
2087       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2088       emit_comment(parser, output);
2089       return RETURN_ERROR;
2090     default:
2091       append_char_to_temporary_buffer(parser, c);
2092       return NEXT_CHAR;
2093   }
2094 }
2095 
2096 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-end-dash-state
handle_comment_end_dash_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2097 static StateResult handle_comment_end_dash_state(GumboParser* parser,
2098     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2099   switch (c) {
2100     case '-':
2101       gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END);
2102       return NEXT_CHAR;
2103     case '\0':
2104       tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2105       gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2106       append_char_to_temporary_buffer(parser, '-');
2107       append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2108       return NEXT_CHAR;
2109     case -1:
2110       tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_EOF);
2111       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2112       emit_comment(parser, output);
2113       return RETURN_ERROR;
2114     default:
2115       gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2116       append_char_to_temporary_buffer(parser, '-');
2117       append_char_to_temporary_buffer(parser, c);
2118       return NEXT_CHAR;
2119   }
2120 }
2121 
2122 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-end-state
handle_comment_end_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2123 static StateResult handle_comment_end_state(GumboParser* parser,
2124     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2125   switch (c) {
2126     case '>':
2127       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2128       return emit_comment(parser, output);
2129     case '\0':
2130       tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2131       gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2132       append_char_to_temporary_buffer(parser, '-');
2133       append_char_to_temporary_buffer(parser, '-');
2134       append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2135       return NEXT_CHAR;
2136     case '!':
2137       tokenizer_add_parse_error(
2138           parser, GUMBO_ERR_COMMENT_BANG_AFTER_DOUBLE_DASH);
2139       gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END_BANG);
2140       return NEXT_CHAR;
2141     case '-':
2142       tokenizer_add_parse_error(
2143           parser, GUMBO_ERR_COMMENT_DASH_AFTER_DOUBLE_DASH);
2144       append_char_to_temporary_buffer(parser, '-');
2145       return NEXT_CHAR;
2146     case -1:
2147       tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2148       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2149       emit_comment(parser, output);
2150       return RETURN_ERROR;
2151     default:
2152       tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_INVALID);
2153       gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2154       append_char_to_temporary_buffer(parser, '-');
2155       append_char_to_temporary_buffer(parser, '-');
2156       append_char_to_temporary_buffer(parser, c);
2157       return NEXT_CHAR;
2158   }
2159 }
2160 
2161 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-end-bang-state
handle_comment_end_bang_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2162 static StateResult handle_comment_end_bang_state(GumboParser* parser,
2163     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2164   switch (c) {
2165     case '-':
2166       gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END_DASH);
2167       append_char_to_temporary_buffer(parser, '-');
2168       append_char_to_temporary_buffer(parser, '-');
2169       append_char_to_temporary_buffer(parser, '!');
2170       return NEXT_CHAR;
2171     case '>':
2172       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2173       return emit_comment(parser, output);
2174     case '\0':
2175       tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2176       gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2177       append_char_to_temporary_buffer(parser, '-');
2178       append_char_to_temporary_buffer(parser, '-');
2179       append_char_to_temporary_buffer(parser, '!');
2180       append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2181       return NEXT_CHAR;
2182     case -1:
2183       tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_END_BANG_EOF);
2184       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2185       emit_comment(parser, output);
2186       return RETURN_ERROR;
2187     default:
2188       gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2189       append_char_to_temporary_buffer(parser, '-');
2190       append_char_to_temporary_buffer(parser, '-');
2191       append_char_to_temporary_buffer(parser, '!');
2192       append_char_to_temporary_buffer(parser, c);
2193       return NEXT_CHAR;
2194   }
2195 }
2196 
2197 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#doctype-state
handle_doctype_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2198 static StateResult handle_doctype_state(GumboParser* parser,
2199     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2200   assert(!tokenizer->_temporary_buffer.length);
2201   switch (c) {
2202     case '\t':
2203     case '\n':
2204     case '\f':
2205     case ' ':
2206       gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_NAME);
2207       return NEXT_CHAR;
2208     case -1:
2209       tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2210       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2211       tokenizer->_doc_type_state.force_quirks = true;
2212       emit_doctype(parser, output);
2213       return RETURN_ERROR;
2214     default:
2215       tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_SPACE);
2216       gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_NAME);
2217       tokenizer->_reconsume_current_input = true;
2218       tokenizer->_doc_type_state.force_quirks = true;
2219       return NEXT_CHAR;
2220   }
2221 }
2222 
2223 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#before-doctype-name-state
handle_before_doctype_name_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2224 static StateResult handle_before_doctype_name_state(GumboParser* parser,
2225     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2226   switch (c) {
2227     case '\t':
2228     case '\n':
2229     case '\f':
2230     case ' ':
2231       return NEXT_CHAR;
2232     case '\0':
2233       tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2234       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DOCTYPE_NAME);
2235       tokenizer->_doc_type_state.force_quirks = true;
2236       append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2237       return NEXT_CHAR;
2238     case '>':
2239       tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_RIGHT_BRACKET);
2240       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2241       tokenizer->_doc_type_state.force_quirks = true;
2242       emit_doctype(parser, output);
2243       return RETURN_ERROR;
2244     case -1:
2245       tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2246       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2247       tokenizer->_doc_type_state.force_quirks = true;
2248       emit_doctype(parser, output);
2249       return RETURN_ERROR;
2250     default:
2251       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DOCTYPE_NAME);
2252       tokenizer->_doc_type_state.force_quirks = false;
2253       append_char_to_temporary_buffer(parser, ensure_lowercase(c));
2254       return NEXT_CHAR;
2255   }
2256 }
2257 
2258 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#doctype-name-state
handle_doctype_name_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2259 static StateResult handle_doctype_name_state(GumboParser* parser,
2260     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2261   switch (c) {
2262     case '\t':
2263     case '\n':
2264     case '\f':
2265     case ' ':
2266       gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_NAME);
2267       gumbo_parser_deallocate(parser, (void*) tokenizer->_doc_type_state.name);
2268       finish_temporary_buffer(parser, &tokenizer->_doc_type_state.name);
2269       return NEXT_CHAR;
2270     case '>':
2271       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2272       gumbo_parser_deallocate(parser, (void*) tokenizer->_doc_type_state.name);
2273       finish_temporary_buffer(parser, &tokenizer->_doc_type_state.name);
2274       emit_doctype(parser, output);
2275       return RETURN_SUCCESS;
2276     case '\0':
2277       tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2278       append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2279       return NEXT_CHAR;
2280     case -1:
2281       tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2282       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2283       tokenizer->_doc_type_state.force_quirks = true;
2284       gumbo_parser_deallocate(parser, (void*) tokenizer->_doc_type_state.name);
2285       finish_temporary_buffer(parser, &tokenizer->_doc_type_state.name);
2286       emit_doctype(parser, output);
2287       return RETURN_ERROR;
2288     default:
2289       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DOCTYPE_NAME);
2290       tokenizer->_doc_type_state.force_quirks = false;
2291       append_char_to_temporary_buffer(parser, ensure_lowercase(c));
2292       return NEXT_CHAR;
2293   }
2294 }
2295 
2296 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-name-state
handle_after_doctype_name_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2297 static StateResult handle_after_doctype_name_state(GumboParser* parser,
2298     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2299   switch (c) {
2300     case '\t':
2301     case '\n':
2302     case '\f':
2303     case ' ':
2304       return NEXT_CHAR;
2305     case '>':
2306       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2307       emit_doctype(parser, output);
2308       return RETURN_SUCCESS;
2309     case -1:
2310       tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2311       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2312       tokenizer->_doc_type_state.force_quirks = true;
2313       emit_doctype(parser, output);
2314       return RETURN_ERROR;
2315     default:
2316       if (utf8iterator_maybe_consume_match(
2317               &tokenizer->_input, "PUBLIC", sizeof("PUBLIC") - 1, false)) {
2318         gumbo_tokenizer_set_state(
2319             parser, GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_KEYWORD);
2320         tokenizer->_reconsume_current_input = true;
2321       } else if (utf8iterator_maybe_consume_match(&tokenizer->_input, "SYSTEM",
2322                      sizeof("SYSTEM") - 1, false)) {
2323         gumbo_tokenizer_set_state(
2324             parser, GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_KEYWORD);
2325         tokenizer->_reconsume_current_input = true;
2326       } else {
2327         tokenizer_add_parse_error(
2328             parser, GUMBO_ERR_DOCTYPE_SPACE_OR_RIGHT_BRACKET);
2329         gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2330         tokenizer->_doc_type_state.force_quirks = true;
2331       }
2332       return NEXT_CHAR;
2333   }
2334 }
2335 
2336 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-public-keyword-state
handle_after_doctype_public_keyword_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2337 static StateResult handle_after_doctype_public_keyword_state(
2338     GumboParser* parser, GumboTokenizerState* tokenizer, int c,
2339     GumboToken* output) {
2340   switch (c) {
2341     case '\t':
2342     case '\n':
2343     case '\f':
2344     case ' ':
2345       gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_PUBLIC_ID);
2346       return NEXT_CHAR;
2347     case '"':
2348       tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2349       assert(temporary_buffer_equals(parser, ""));
2350       gumbo_tokenizer_set_state(
2351           parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED);
2352       return NEXT_CHAR;
2353     case '\'':
2354       tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2355       assert(temporary_buffer_equals(parser, ""));
2356       gumbo_tokenizer_set_state(
2357           parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_SINGLE_QUOTED);
2358       return NEXT_CHAR;
2359     case '>':
2360       tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_RIGHT_BRACKET);
2361       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2362       tokenizer->_doc_type_state.force_quirks = true;
2363       emit_doctype(parser, output);
2364       return RETURN_ERROR;
2365     case -1:
2366       tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2367       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2368       tokenizer->_doc_type_state.force_quirks = true;
2369       emit_doctype(parser, output);
2370       return RETURN_ERROR;
2371     default:
2372       tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2373       gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2374       tokenizer->_doc_type_state.force_quirks = true;
2375       emit_doctype(parser, output);
2376       return RETURN_ERROR;
2377   }
2378 }
2379 
2380 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#before-doctype-public-identifier-state
handle_before_doctype_public_id_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2381 static StateResult handle_before_doctype_public_id_state(GumboParser* parser,
2382     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2383   switch (c) {
2384     case '\t':
2385     case '\n':
2386     case '\f':
2387     case ' ':
2388       return NEXT_CHAR;
2389     case '"':
2390       assert(temporary_buffer_equals(parser, ""));
2391       gumbo_tokenizer_set_state(
2392           parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED);
2393       return NEXT_CHAR;
2394     case '\'':
2395       assert(temporary_buffer_equals(parser, ""));
2396       gumbo_tokenizer_set_state(
2397           parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_SINGLE_QUOTED);
2398       return NEXT_CHAR;
2399     case '>':
2400       tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2401       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2402       tokenizer->_doc_type_state.force_quirks = true;
2403       emit_doctype(parser, output);
2404       return RETURN_ERROR;
2405     case -1:
2406       tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2407       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2408       tokenizer->_doc_type_state.force_quirks = true;
2409       emit_doctype(parser, output);
2410       return RETURN_ERROR;
2411     default:
2412       tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2413       gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2414       tokenizer->_doc_type_state.force_quirks = true;
2415       emit_doctype(parser, output);
2416       return RETURN_ERROR;
2417   }
2418 }
2419 
2420 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#doctype-public-identifier-(double-quoted)-state
handle_doctype_public_id_double_quoted_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2421 static StateResult handle_doctype_public_id_double_quoted_state(
2422     GumboParser* parser, GumboTokenizerState* tokenizer, int c,
2423     GumboToken* output) {
2424   switch (c) {
2425     case '"':
2426       gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID);
2427       finish_doctype_public_id(parser);
2428       return NEXT_CHAR;
2429     case '\0':
2430       tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2431       append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2432       return NEXT_CHAR;
2433     case '>':
2434       tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2435       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2436       tokenizer->_doc_type_state.force_quirks = true;
2437       finish_doctype_public_id(parser);
2438       emit_doctype(parser, output);
2439       return RETURN_ERROR;
2440     case -1:
2441       tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2442       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2443       tokenizer->_doc_type_state.force_quirks = true;
2444       finish_doctype_public_id(parser);
2445       emit_doctype(parser, output);
2446       return RETURN_ERROR;
2447     default:
2448       append_char_to_temporary_buffer(parser, c);
2449       return NEXT_CHAR;
2450   }
2451 }
2452 
2453 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#doctype-public-identifier-(single-quoted)-state
handle_doctype_public_id_single_quoted_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2454 static StateResult handle_doctype_public_id_single_quoted_state(
2455     GumboParser* parser, GumboTokenizerState* tokenizer, int c,
2456     GumboToken* output) {
2457   switch (c) {
2458     case '\'':
2459       gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID);
2460       finish_doctype_public_id(parser);
2461       return NEXT_CHAR;
2462     case '\0':
2463       tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2464       append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2465       return NEXT_CHAR;
2466     case '>':
2467       tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2468       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2469       tokenizer->_doc_type_state.force_quirks = true;
2470       finish_doctype_public_id(parser);
2471       emit_doctype(parser, output);
2472       return RETURN_ERROR;
2473     case -1:
2474       tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2475       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2476       tokenizer->_doc_type_state.force_quirks = true;
2477       finish_doctype_public_id(parser);
2478       emit_doctype(parser, output);
2479       return RETURN_ERROR;
2480     default:
2481       append_char_to_temporary_buffer(parser, c);
2482       return NEXT_CHAR;
2483   }
2484 }
2485 
2486 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-public-identifier-state
handle_after_doctype_public_id_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2487 static StateResult handle_after_doctype_public_id_state(GumboParser* parser,
2488     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2489   switch (c) {
2490     case '\t':
2491     case '\n':
2492     case '\f':
2493     case ' ':
2494       gumbo_tokenizer_set_state(
2495           parser, GUMBO_LEX_BETWEEN_DOCTYPE_PUBLIC_SYSTEM_ID);
2496       return NEXT_CHAR;
2497     case '>':
2498       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2499       emit_doctype(parser, output);
2500       return RETURN_SUCCESS;
2501     case '"':
2502       tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2503       assert(temporary_buffer_equals(parser, ""));
2504       gumbo_tokenizer_set_state(
2505           parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED);
2506       return NEXT_CHAR;
2507     case '\'':
2508       tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2509       assert(temporary_buffer_equals(parser, ""));
2510       gumbo_tokenizer_set_state(
2511           parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
2512       return NEXT_CHAR;
2513     case -1:
2514       tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2515       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2516       tokenizer->_reconsume_current_input = true;
2517       tokenizer->_doc_type_state.force_quirks = true;
2518       emit_doctype(parser, output);
2519       return RETURN_ERROR;
2520     default:
2521       tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2522       gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2523       tokenizer->_doc_type_state.force_quirks = true;
2524       return NEXT_CHAR;
2525   }
2526 }
2527 
2528 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#between-doctype-public-and-system-identifiers-state
handle_between_doctype_public_system_id_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2529 static StateResult handle_between_doctype_public_system_id_state(
2530     GumboParser* parser, GumboTokenizerState* tokenizer, int c,
2531     GumboToken* output) {
2532   switch (c) {
2533     case '\t':
2534     case '\n':
2535     case '\f':
2536     case ' ':
2537       return NEXT_CHAR;
2538     case '>':
2539       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2540       emit_doctype(parser, output);
2541       return RETURN_SUCCESS;
2542     case '"':
2543       assert(temporary_buffer_equals(parser, ""));
2544       gumbo_tokenizer_set_state(
2545           parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED);
2546       return NEXT_CHAR;
2547     case '\'':
2548       assert(temporary_buffer_equals(parser, ""));
2549       gumbo_tokenizer_set_state(
2550           parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
2551       return NEXT_CHAR;
2552     case -1:
2553       tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2554       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2555       tokenizer->_doc_type_state.force_quirks = true;
2556       emit_doctype(parser, output);
2557       return RETURN_ERROR;
2558     default:
2559       tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2560       gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2561       tokenizer->_doc_type_state.force_quirks = true;
2562       emit_doctype(parser, output);
2563       return RETURN_ERROR;
2564   }
2565 }
2566 
2567 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-system-keyword-state
handle_after_doctype_system_keyword_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2568 static StateResult handle_after_doctype_system_keyword_state(
2569     GumboParser* parser, GumboTokenizerState* tokenizer, int c,
2570     GumboToken* output) {
2571   switch (c) {
2572     case '\t':
2573     case '\n':
2574     case '\f':
2575     case ' ':
2576       gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_SYSTEM_ID);
2577       return NEXT_CHAR;
2578     case '"':
2579       tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2580       assert(temporary_buffer_equals(parser, ""));
2581       gumbo_tokenizer_set_state(
2582           parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED);
2583       return NEXT_CHAR;
2584     case '\'':
2585       tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2586       assert(temporary_buffer_equals(parser, ""));
2587       gumbo_tokenizer_set_state(
2588           parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
2589       return NEXT_CHAR;
2590     case '>':
2591       tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2592       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2593       tokenizer->_doc_type_state.force_quirks = true;
2594       emit_doctype(parser, output);
2595       return RETURN_ERROR;
2596     case -1:
2597       tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2598       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2599       tokenizer->_doc_type_state.force_quirks = true;
2600       emit_doctype(parser, output);
2601       return RETURN_ERROR;
2602     default:
2603       tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2604       gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2605       tokenizer->_doc_type_state.force_quirks = true;
2606       return NEXT_CHAR;
2607   }
2608 }
2609 
2610 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#before-doctype-system-identifier-state
handle_before_doctype_system_id_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2611 static StateResult handle_before_doctype_system_id_state(GumboParser* parser,
2612     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2613   switch (c) {
2614     case '\t':
2615     case '\n':
2616     case '\f':
2617     case ' ':
2618       return NEXT_CHAR;
2619     case '"':
2620       assert(temporary_buffer_equals(parser, ""));
2621       gumbo_tokenizer_set_state(
2622           parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED);
2623       return NEXT_CHAR;
2624     case '\'':
2625       assert(temporary_buffer_equals(parser, ""));
2626       gumbo_tokenizer_set_state(
2627           parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
2628       return NEXT_CHAR;
2629     case '>':
2630       tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2631       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2632       tokenizer->_doc_type_state.force_quirks = true;
2633       emit_doctype(parser, output);
2634       return RETURN_ERROR;
2635     case -1:
2636       tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2637       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2638       tokenizer->_doc_type_state.force_quirks = true;
2639       emit_doctype(parser, output);
2640       return RETURN_ERROR;
2641     default:
2642       tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2643       gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2644       tokenizer->_doc_type_state.force_quirks = true;
2645       return NEXT_CHAR;
2646   }
2647 }
2648 
2649 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#doctype-system-identifier-(double-quoted)-state
handle_doctype_system_id_double_quoted_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2650 static StateResult handle_doctype_system_id_double_quoted_state(
2651     GumboParser* parser, GumboTokenizerState* tokenizer, int c,
2652     GumboToken* output) {
2653   switch (c) {
2654     case '"':
2655       gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID);
2656       finish_doctype_system_id(parser);
2657       return NEXT_CHAR;
2658     case '\0':
2659       tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2660       append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2661       return NEXT_CHAR;
2662     case '>':
2663       tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2664       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2665       tokenizer->_doc_type_state.force_quirks = true;
2666       finish_doctype_system_id(parser);
2667       emit_doctype(parser, output);
2668       return RETURN_ERROR;
2669     case -1:
2670       tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2671       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2672       tokenizer->_doc_type_state.force_quirks = true;
2673       finish_doctype_system_id(parser);
2674       emit_doctype(parser, output);
2675       return RETURN_ERROR;
2676     default:
2677       append_char_to_temporary_buffer(parser, c);
2678       return NEXT_CHAR;
2679   }
2680 }
2681 
2682 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#doctype-system-identifier-(single-quoted)-state
handle_doctype_system_id_single_quoted_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2683 static StateResult handle_doctype_system_id_single_quoted_state(
2684     GumboParser* parser, GumboTokenizerState* tokenizer, int c,
2685     GumboToken* output) {
2686   switch (c) {
2687     case '\'':
2688       gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID);
2689       finish_doctype_system_id(parser);
2690       return NEXT_CHAR;
2691     case '\0':
2692       tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2693       append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2694       return NEXT_CHAR;
2695     case '>':
2696       tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2697       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2698       tokenizer->_doc_type_state.force_quirks = true;
2699       finish_doctype_system_id(parser);
2700       emit_doctype(parser, output);
2701       return RETURN_ERROR;
2702     case -1:
2703       tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2704       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2705       tokenizer->_doc_type_state.force_quirks = true;
2706       finish_doctype_system_id(parser);
2707       emit_doctype(parser, output);
2708       return RETURN_ERROR;
2709     default:
2710       append_char_to_temporary_buffer(parser, c);
2711       return NEXT_CHAR;
2712   }
2713 }
2714 
2715 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-system-identifier-state
handle_after_doctype_system_id_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2716 static StateResult handle_after_doctype_system_id_state(GumboParser* parser,
2717     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2718   switch (c) {
2719     case '\t':
2720     case '\n':
2721     case '\f':
2722     case ' ':
2723       return NEXT_CHAR;
2724     case '>':
2725       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2726       emit_doctype(parser, output);
2727       return RETURN_SUCCESS;
2728     case -1:
2729       tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2730       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2731       tokenizer->_doc_type_state.force_quirks = true;
2732       emit_doctype(parser, output);
2733       return RETURN_ERROR;
2734     default:
2735       tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2736       gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2737       return NEXT_CHAR;
2738   }
2739 }
2740 
2741 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#bogus-doctype-state
handle_bogus_doctype_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2742 static StateResult handle_bogus_doctype_state(GumboParser* parser,
2743     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2744   if (c == '>' || c == -1) {
2745     gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2746     emit_doctype(parser, output);
2747     return RETURN_ERROR;
2748   }
2749   return NEXT_CHAR;
2750 }
2751 
2752 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#cdata-section-state
handle_cdata_state(GumboParser * parser,GumboTokenizerState * tokenizer,int c,GumboToken * output)2753 static StateResult handle_cdata_state(GumboParser* parser,
2754     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2755   if (c == -1 || utf8iterator_maybe_consume_match(
2756                      &tokenizer->_input, "]]>", sizeof("]]>") - 1, true)) {
2757     tokenizer->_reconsume_current_input = true;
2758     reset_token_start_point(tokenizer);
2759     gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2760     tokenizer->_is_in_cdata = false;
2761     return NEXT_CHAR;
2762   } else {
2763     return emit_current_char(parser, output);
2764   }
2765 }
2766 
2767 typedef StateResult (*GumboLexerStateFunction)(
2768     GumboParser*, GumboTokenizerState*, int, GumboToken*);
2769 
2770 static GumboLexerStateFunction dispatch_table[] = {handle_data_state,
2771     handle_char_ref_in_data_state, handle_rcdata_state,
2772     handle_char_ref_in_rcdata_state, handle_rawtext_state, handle_script_state,
2773     handle_plaintext_state, handle_tag_open_state, handle_end_tag_open_state,
2774     handle_tag_name_state, handle_rcdata_lt_state,
2775     handle_rcdata_end_tag_open_state, handle_rcdata_end_tag_name_state,
2776     handle_rawtext_lt_state, handle_rawtext_end_tag_open_state,
2777     handle_rawtext_end_tag_name_state, handle_script_lt_state,
2778     handle_script_end_tag_open_state, handle_script_end_tag_name_state,
2779     handle_script_escaped_start_state, handle_script_escaped_start_dash_state,
2780     handle_script_escaped_state, handle_script_escaped_dash_state,
2781     handle_script_escaped_dash_dash_state, handle_script_escaped_lt_state,
2782     handle_script_escaped_end_tag_open_state,
2783     handle_script_escaped_end_tag_name_state,
2784     handle_script_double_escaped_start_state,
2785     handle_script_double_escaped_state, handle_script_double_escaped_dash_state,
2786     handle_script_double_escaped_dash_dash_state,
2787     handle_script_double_escaped_lt_state,
2788     handle_script_double_escaped_end_state, handle_before_attr_name_state,
2789     handle_attr_name_state, handle_after_attr_name_state,
2790     handle_before_attr_value_state, handle_attr_value_double_quoted_state,
2791     handle_attr_value_single_quoted_state, handle_attr_value_unquoted_state,
2792     handle_char_ref_in_attr_value_state, handle_after_attr_value_quoted_state,
2793     handle_self_closing_start_tag_state, handle_bogus_comment_state,
2794     handle_markup_declaration_state, handle_comment_start_state,
2795     handle_comment_start_dash_state, handle_comment_state,
2796     handle_comment_end_dash_state, handle_comment_end_state,
2797     handle_comment_end_bang_state, handle_doctype_state,
2798     handle_before_doctype_name_state, handle_doctype_name_state,
2799     handle_after_doctype_name_state, handle_after_doctype_public_keyword_state,
2800     handle_before_doctype_public_id_state,
2801     handle_doctype_public_id_double_quoted_state,
2802     handle_doctype_public_id_single_quoted_state,
2803     handle_after_doctype_public_id_state,
2804     handle_between_doctype_public_system_id_state,
2805     handle_after_doctype_system_keyword_state,
2806     handle_before_doctype_system_id_state,
2807     handle_doctype_system_id_double_quoted_state,
2808     handle_doctype_system_id_single_quoted_state,
2809     handle_after_doctype_system_id_state, handle_bogus_doctype_state,
2810     handle_cdata_state};
2811 
// Produces the next token into `output`.  Returns true when the token was
// produced without a handler reporting an error, false when a state handler
// returned RETURN_ERROR.
//
// Design note: the spec requires that
//
// 1. tokens are handled by the parser as soon as they are emitted,
// 2. some states (e.g. CDATA, or various error conditions) emit several
//    tokens without leaving the state, and
// 3. the tokenizer frequently reconsumes the same character in another state.
//
// Consequently all lexer state lives in the GumboTokenizer struct rather than
// in locals of this function.  That lets us return with one token and then
// re-enter in the same state with the same input when another token is due.
// The emit_* functions are responsible for changing state (e.g. flushing the
// chardata buffer, reading the next input character) so that this cannot
// loop forever.
bool gumbo_lex(GumboParser* parser, GumboToken* output) {
  GumboTokenizerState* tokenizer = parser->_tokenizer_state;

  // A character left buffered by an earlier call is emitted first.
  if (tokenizer->_buffered_emit_char != kGumboNoChar) {
    // Set the reconsume flag around emit_char() so the input is not advanced
    // for this buffered character...
    tokenizer->_reconsume_current_input = true;
    emit_char(parser, tokenizer->_buffered_emit_char, output);
    // ...then clear it again so that the *next* character is not consumed
    // twice.
    tokenizer->_reconsume_current_input = false;
    tokenizer->_buffered_emit_char = kGumboNoChar;
    return true;
  }

  // Pending temporary-buffer output comes next.
  if (maybe_emit_from_temporary_buffer(parser, output)) {
    return true;
  }

  for (;;) {
    assert(!tokenizer->_temporary_buffer_emit);
    assert(tokenizer->_buffered_emit_char == kGumboNoChar);

    const int c = utf8iterator_current(&tokenizer->_input);
    gumbo_debug(
        "Lexing character '%c' (%d) in state %d.\n", c, c, tokenizer->_state);

    GumboLexerStateFunction handler = dispatch_table[tokenizer->_state];
    const StateResult result = handler(parser, tokenizer, c, output);

    // The reconsume flag has to be cleared before any return below, to
    // prevent certain infinite loop states; remember its value first.
    const bool reconsume = tokenizer->_reconsume_current_input;
    tokenizer->_reconsume_current_input = false;

    switch (result) {
      case RETURN_SUCCESS:
        return true;
      case RETURN_ERROR:
        return false;
      default:
        // No token yet: keep lexing.
        break;
    }

    if (!reconsume) {
      utf8iterator_next(&tokenizer->_input);
    }
  }
}
2868 
// Releases the memory referenced by `token` through
// gumbo_parser_deallocate().  Only DOCTYPE, start-tag and comment tokens
// have anything freed here; other token types are left alone.  The token
// struct itself is not freed.  A null `token` is accepted and ignored.
void gumbo_token_destroy(GumboParser* parser, GumboToken* token) {
  if (!token) {
    return;
  }

  switch (token->type) {
    case GUMBO_TOKEN_DOCTYPE:
      // Name plus both identifiers.
      gumbo_parser_deallocate(parser, (void*) token->v.doc_type.name);
      gumbo_parser_deallocate(
          parser, (void*) token->v.doc_type.public_identifier);
      gumbo_parser_deallocate(
          parser, (void*) token->v.doc_type.system_identifier);
      break;

    case GUMBO_TOKEN_START_TAG: {
      unsigned int index = 0;
      while (index < token->v.start_tag.attributes.length) {
        GumboAttribute* attribute = token->v.start_tag.attributes.data[index];
        ++index;
        if (!attribute) {
          // May have been nulled out if this token was merged with another.
          continue;
        }
        gumbo_destroy_attribute(parser, attribute);
      }
      // Finally release the attribute array itself.
      gumbo_parser_deallocate(
          parser, (void*) token->v.start_tag.attributes.data);
      break;
    }

    case GUMBO_TOKEN_COMMENT:
      gumbo_parser_deallocate(parser, (void*) token->v.text);
      break;

    default:
      // Nothing to release for the remaining token types.
      break;
  }
}
2898