1 // Copyright 2010 Google Inc. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //     http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 //
15 // Author: jdtang@google.com (Jonathan Tang)
16 
17 #include <assert.h>
18 #include <ctype.h>
19 #include <stdarg.h>
20 #include <stdlib.h>
21 #include <string.h>
22 #include <strings.h>
23 
24 #include "attribute.h"
25 #include "error.h"
26 #include "gumbo.h"
27 #include "insertion_mode.h"
28 #include "parser.h"
29 #include "tokenizer.h"
30 #include "tokenizer_states.h"
31 #include "utf8.h"
32 #include "util.h"
33 #include "vector.h"
34 
// Evaluates `i` as a discarded expression so that compilers do not warn about
// an otherwise-unused variable or parameter.
#define AVOID_UNUSED_VARIABLE_WARNING(i) (void)(i)

// Brace initializer for a GumboStringPiece built from a string literal: the
// literal itself followed by its length, excluding the trailing NUL.
#define GUMBO_STRING(literal) \
  { literal, sizeof(literal) - 1 }
// Empty, zero-length entry used to end the static string-piece lists;
// is_in_static_list() stops at the first zero-length element.
#define TERMINATOR \
  { "", 0 }

// A tag set is a table with one byte per GumboTag.  The entry for a tag that
// belongs to the set holds a namespace bit (1 << namespace); entries that are
// not named in an initializer are zero.
typedef char gumbo_tagset[GUMBO_TAG_LAST];
// Designated initializers for gumbo_tagset literals, one macro per namespace.
#define TAG(tag) [GUMBO_TAG_##tag] = (1 << GUMBO_NAMESPACE_HTML)
#define TAG_SVG(tag) [GUMBO_TAG_##tag] = (1 << GUMBO_NAMESPACE_SVG)
#define TAG_MATHML(tag) [GUMBO_TAG_##tag] = (1 << GUMBO_NAMESPACE_MATHML)
46 
// Evaluates to true when `tag` is a valid index into `tagset` and the entry
// stored for it is exactly the bit for `namespace`.  Every argument is
// parenthesized so that compound expressions (for example a conditional) can
// be passed safely.  Note that `tag` is still evaluated twice, so it must not
// have side effects.
#define TAGSET_INCLUDES(tagset, namespace, tag) \
  ((tag) < GUMBO_TAG_LAST &&                    \
      (tagset)[(int) (tag)] == (1 << (int) (namespace)))
49 
// Selected forward declarations, as it is getting hard to find an appropriate
// definition order.  These functions are called by code that appears earlier
// in the file than their definitions.
static bool node_html_tag_is(const GumboNode*, GumboTag);
static GumboInsertionMode get_current_template_insertion_mode(
    const GumboParser*);
static bool handle_in_template(GumboParser*, GumboToken*);
static void destroy_node(GumboParser*, GumboNode*);
57 
// Default allocation callback: forwards the request to malloc().  The first
// argument is ignored.
static void* malloc_wrapper(void* unused, size_t size) {
  (void) unused;
  void* block = malloc(size);
  return block;
}
59 
// Default deallocation callback: forwards the pointer to free().  The first
// argument is ignored.
static void free_wrapper(void* unused, void* ptr) {
  (void) unused;
  free(ptr);
}
61 
// Default parser options.  The first two members are the default allocation
// and deallocation callbacks, which are thin wrappers around malloc() and
// free().
// NOTE(review): the initializer is positional, so the meaning of the remaining
// values depends on the member order of GumboOptions, which is declared
// outside this file -- confirm it there before changing any of them.
const GumboOptions kGumboDefaultOptions = {&malloc_wrapper, &free_wrapper, NULL,
    8, false, -1, GUMBO_TAG_LAST, GUMBO_NAMESPACE_HTML};
64 
// Literal doctype name, public identifiers and system identifiers, each stored
// together with its length.
// NOTE(review): presumably these are compared against the fields of a DOCTYPE
// token when deciding whether the doctype is an accepted one; the code that
// uses them is not in this part of the file -- confirm there.
static const GumboStringPiece kDoctypeHtml = GUMBO_STRING("html");
static const GumboStringPiece kPublicIdHtml4_0 =
    GUMBO_STRING("-//W3C//DTD HTML 4.0//EN");
static const GumboStringPiece kPublicIdHtml4_01 =
    GUMBO_STRING("-//W3C//DTD HTML 4.01//EN");
static const GumboStringPiece kPublicIdXhtml1_0 =
    GUMBO_STRING("-//W3C//DTD XHTML 1.0 Strict//EN");
static const GumboStringPiece kPublicIdXhtml1_1 =
    GUMBO_STRING("-//W3C//DTD XHTML 1.1//EN");
static const GumboStringPiece kSystemIdRecHtml4_0 =
    GUMBO_STRING("http://www.w3.org/TR/REC-html40/strict.dtd");
static const GumboStringPiece kSystemIdHtml4 =
    GUMBO_STRING("http://www.w3.org/TR/html4/strict.dtd");
static const GumboStringPiece kSystemIdXhtmlStrict1_1 =
    GUMBO_STRING("http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd");
static const GumboStringPiece kSystemIdXhtml1_1 =
    GUMBO_STRING("http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd");
static const GumboStringPiece kSystemIdLegacyCompat =
    GUMBO_STRING("about:legacy-compat");
84 
85 // The doctype arrays have an explicit terminator because we want to pass them
86 // to a helper function, and passing them as a pointer discards sizeof
87 // information.  The SVG arrays are used only by one-off functions, and so loops
88 // over them use sizeof directly instead of a terminator.
89 
90 static const GumboStringPiece kQuirksModePublicIdPrefixes[] = {
91     GUMBO_STRING("+//Silmaril//dtd html Pro v0r11 19970101//"),
92     GUMBO_STRING("-//AdvaSoft Ltd//DTD HTML 3.0 asWedit + extensions//"),
93     GUMBO_STRING("-//AS//DTD HTML 3.0 asWedit + extensions//"),
94     GUMBO_STRING("-//IETF//DTD HTML 2.0 Level 1//"),
95     GUMBO_STRING("-//IETF//DTD HTML 2.0 Level 2//"),
96     GUMBO_STRING("-//IETF//DTD HTML 2.0 Strict Level 1//"),
97     GUMBO_STRING("-//IETF//DTD HTML 2.0 Strict Level 2//"),
98     GUMBO_STRING("-//IETF//DTD HTML 2.0 Strict//"),
99     GUMBO_STRING("-//IETF//DTD HTML 2.0//"),
100     GUMBO_STRING("-//IETF//DTD HTML 2.1E//"),
101     GUMBO_STRING("-//IETF//DTD HTML 3.0//"),
102     GUMBO_STRING("-//IETF//DTD HTML 3.2 Final//"),
103     GUMBO_STRING("-//IETF//DTD HTML 3.2//"),
104     GUMBO_STRING("-//IETF//DTD HTML 3//"),
105     GUMBO_STRING("-//IETF//DTD HTML Level 0//"),
106     GUMBO_STRING("-//IETF//DTD HTML Level 1//"),
107     GUMBO_STRING("-//IETF//DTD HTML Level 2//"),
108     GUMBO_STRING("-//IETF//DTD HTML Level 3//"),
109     GUMBO_STRING("-//IETF//DTD HTML Strict Level 0//"),
110     GUMBO_STRING("-//IETF//DTD HTML Strict Level 1//"),
111     GUMBO_STRING("-//IETF//DTD HTML Strict Level 2//"),
112     GUMBO_STRING("-//IETF//DTD HTML Strict Level 3//"),
113     GUMBO_STRING("-//IETF//DTD HTML Strict//"),
114     GUMBO_STRING("-//IETF//DTD HTML//"),
115     GUMBO_STRING("-//Metrius//DTD Metrius Presentational//"),
116     GUMBO_STRING("-//Microsoft//DTD Internet Explorer 2.0 HTML Strict//"),
117     GUMBO_STRING("-//Microsoft//DTD Internet Explorer 2.0 HTML//"),
118     GUMBO_STRING("-//Microsoft//DTD Internet Explorer 2.0 Tables//"),
119     GUMBO_STRING("-//Microsoft//DTD Internet Explorer 3.0 HTML Strict//"),
120     GUMBO_STRING("-//Microsoft//DTD Internet Explorer 3.0 HTML//"),
121     GUMBO_STRING("-//Microsoft//DTD Internet Explorer 3.0 Tables//"),
122     GUMBO_STRING("-//Netscape Comm. Corp.//DTD HTML//"),
123     GUMBO_STRING("-//Netscape Comm. Corp.//DTD Strict HTML//"),
124     GUMBO_STRING("-//O'Reilly and Associates//DTD HTML 2.0//"),
125     GUMBO_STRING("-//O'Reilly and Associates//DTD HTML Extended 1.0//"),
126     GUMBO_STRING("-//O'Reilly and Associates//DTD HTML Extended Relaxed 1.0//"),
127     GUMBO_STRING(
128         "-//SoftQuad Software//DTD HoTMetaL PRO 6.0::19990601::)"
129         "extensions to HTML 4.0//"),
130     GUMBO_STRING(
131         "-//SoftQuad//DTD HoTMetaL PRO 4.0::19971010::"
132         "extensions to HTML 4.0//"),
133     GUMBO_STRING("-//Spyglass//DTD HTML 2.0 Extended//"),
134     GUMBO_STRING("-//SQ//DTD HTML 2.0 HoTMetaL + extensions//"),
135     GUMBO_STRING("-//Sun Microsystems Corp.//DTD HotJava HTML//"),
136     GUMBO_STRING("-//Sun Microsystems Corp.//DTD HotJava Strict HTML//"),
137     GUMBO_STRING("-//W3C//DTD HTML 3 1995-03-24//"),
138     GUMBO_STRING("-//W3C//DTD HTML 3.2 Draft//"),
139     GUMBO_STRING("-//W3C//DTD HTML 3.2 Final//"),
140     GUMBO_STRING("-//W3C//DTD HTML 3.2//"),
141     GUMBO_STRING("-//W3C//DTD HTML 3.2S Draft//"),
142     GUMBO_STRING("-//W3C//DTD HTML 4.0 Frameset//"),
143     GUMBO_STRING("-//W3C//DTD HTML 4.0 Transitional//"),
144     GUMBO_STRING("-//W3C//DTD HTML Experimental 19960712//"),
145     GUMBO_STRING("-//W3C//DTD HTML Experimental 970421//"),
146     GUMBO_STRING("-//W3C//DTD W3 HTML//"),
147     GUMBO_STRING("-//W3O//DTD W3 HTML 3.0//"),
148     GUMBO_STRING("-//WebTechs//DTD Mozilla HTML 2.0//"),
149     GUMBO_STRING("-//WebTechs//DTD Mozilla HTML//"), TERMINATOR};
150 
// Public identifiers that are matched as whole strings rather than prefixes.
static const GumboStringPiece kQuirksModePublicIdExactMatches[] = {
    GUMBO_STRING("-//W3O//DTD W3 HTML Strict 3.0//EN//"),
    GUMBO_STRING("-/W3C/DTD HTML 4.0 Transitional/EN"), GUMBO_STRING("HTML"),
    TERMINATOR};

// System identifiers that are matched as whole strings.
static const GumboStringPiece kQuirksModeSystemIdExactMatches[] = {
    GUMBO_STRING("http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"),
    TERMINATOR};

// Public identifier prefixes associated with limited-quirks mode.
static const GumboStringPiece kLimitedQuirksPublicIdPrefixes[] = {
    GUMBO_STRING("-//W3C//DTD XHTML 1.0 Frameset//"),
    GUMBO_STRING("-//W3C//DTD XHTML 1.0 Transitional//"), TERMINATOR};

// Public identifier prefixes whose treatment also depends on the system
// identifier.
// NOTE(review): per the name, limited-quirks presumably applies only when a
// system identifier is present -- confirm against the code that reads this
// list, which is not in this part of the file.
static const GumboStringPiece kLimitedQuirksRequiresSystemIdPublicIdPrefixes[] =
    {GUMBO_STRING("-//W3C//DTD HTML 4.01 Frameset//"),
        GUMBO_STRING("-//W3C//DTD HTML 4.01 Transitional//"), TERMINATOR};
167 
// Namespace URIs, in the order XHTML, SVG, MathML.
// Indexed by GumboNamespaceEnum; keep in sync with that.
static const char* kLegalXmlns[] = {"http://www.w3.org/1999/xhtml",
    "http://www.w3.org/2000/svg", "http://www.w3.org/1998/Math/MathML"};
171 
// One entry of a name-replacement table: `from` is the spelling to look for
// and `to` is the spelling that replaces it.
typedef struct _ReplacementEntry {
  const GumboStringPiece from;
  const GumboStringPiece to;
} ReplacementEntry;

// Builds a ReplacementEntry initializer from two string literals.
#define REPLACEMENT_ENTRY(from, to) \
  { GUMBO_STRING(from), GUMBO_STRING(to) }
179 
// Static data for SVG attribute replacements.  Each entry maps the
// all-lowercase spelling of an SVG attribute name to its mixed-case spelling.
// https://html.spec.whatwg.org/multipage/syntax.html#creating-and-inserting-nodes
// This array has no terminator entry, so its length must be taken with sizeof.
// NOTE(review): the commented-out entries were presumably dropped from the
// specification's table -- confirm before restoring or deleting them.
static const ReplacementEntry kSvgAttributeReplacements[] = {
    REPLACEMENT_ENTRY("attributename", "attributeName"),
    REPLACEMENT_ENTRY("attributetype", "attributeType"),
    REPLACEMENT_ENTRY("basefrequency", "baseFrequency"),
    REPLACEMENT_ENTRY("baseprofile", "baseProfile"),
    REPLACEMENT_ENTRY("calcmode", "calcMode"),
    REPLACEMENT_ENTRY("clippathunits", "clipPathUnits"),
    // REPLACEMENT_ENTRY("contentscripttype", "contentScriptType"),
    // REPLACEMENT_ENTRY("contentstyletype", "contentStyleType"),
    REPLACEMENT_ENTRY("diffuseconstant", "diffuseConstant"),
    REPLACEMENT_ENTRY("edgemode", "edgeMode"),
    // REPLACEMENT_ENTRY("externalresourcesrequired",
    // "externalResourcesRequired"),
    // REPLACEMENT_ENTRY("filterres", "filterRes"),
    REPLACEMENT_ENTRY("filterunits", "filterUnits"),
    REPLACEMENT_ENTRY("glyphref", "glyphRef"),
    REPLACEMENT_ENTRY("gradienttransform", "gradientTransform"),
    REPLACEMENT_ENTRY("gradientunits", "gradientUnits"),
    REPLACEMENT_ENTRY("kernelmatrix", "kernelMatrix"),
    REPLACEMENT_ENTRY("kernelunitlength", "kernelUnitLength"),
    REPLACEMENT_ENTRY("keypoints", "keyPoints"),
    REPLACEMENT_ENTRY("keysplines", "keySplines"),
    REPLACEMENT_ENTRY("keytimes", "keyTimes"),
    REPLACEMENT_ENTRY("lengthadjust", "lengthAdjust"),
    REPLACEMENT_ENTRY("limitingconeangle", "limitingConeAngle"),
    REPLACEMENT_ENTRY("markerheight", "markerHeight"),
    REPLACEMENT_ENTRY("markerunits", "markerUnits"),
    REPLACEMENT_ENTRY("markerwidth", "markerWidth"),
    REPLACEMENT_ENTRY("maskcontentunits", "maskContentUnits"),
    REPLACEMENT_ENTRY("maskunits", "maskUnits"),
    REPLACEMENT_ENTRY("numoctaves", "numOctaves"),
    REPLACEMENT_ENTRY("pathlength", "pathLength"),
    REPLACEMENT_ENTRY("patterncontentunits", "patternContentUnits"),
    REPLACEMENT_ENTRY("patterntransform", "patternTransform"),
    REPLACEMENT_ENTRY("patternunits", "patternUnits"),
    REPLACEMENT_ENTRY("pointsatx", "pointsAtX"),
    REPLACEMENT_ENTRY("pointsaty", "pointsAtY"),
    REPLACEMENT_ENTRY("pointsatz", "pointsAtZ"),
    REPLACEMENT_ENTRY("preservealpha", "preserveAlpha"),
    REPLACEMENT_ENTRY("preserveaspectratio", "preserveAspectRatio"),
    REPLACEMENT_ENTRY("primitiveunits", "primitiveUnits"),
    REPLACEMENT_ENTRY("refx", "refX"), REPLACEMENT_ENTRY("refy", "refY"),
    REPLACEMENT_ENTRY("repeatcount", "repeatCount"),
    REPLACEMENT_ENTRY("repeatdur", "repeatDur"),
    REPLACEMENT_ENTRY("requiredextensions", "requiredExtensions"),
    REPLACEMENT_ENTRY("requiredfeatures", "requiredFeatures"),
    REPLACEMENT_ENTRY("specularconstant", "specularConstant"),
    REPLACEMENT_ENTRY("specularexponent", "specularExponent"),
    REPLACEMENT_ENTRY("spreadmethod", "spreadMethod"),
    REPLACEMENT_ENTRY("startoffset", "startOffset"),
    REPLACEMENT_ENTRY("stddeviation", "stdDeviation"),
    REPLACEMENT_ENTRY("stitchtiles", "stitchTiles"),
    REPLACEMENT_ENTRY("surfacescale", "surfaceScale"),
    REPLACEMENT_ENTRY("systemlanguage", "systemLanguage"),
    REPLACEMENT_ENTRY("tablevalues", "tableValues"),
    REPLACEMENT_ENTRY("targetx", "targetX"),
    REPLACEMENT_ENTRY("targety", "targetY"),
    REPLACEMENT_ENTRY("textlength", "textLength"),
    REPLACEMENT_ENTRY("viewbox", "viewBox"),
    REPLACEMENT_ENTRY("viewtarget", "viewTarget"),
    REPLACEMENT_ENTRY("xchannelselector", "xChannelSelector"),
    REPLACEMENT_ENTRY("ychannelselector", "yChannelSelector"),
    REPLACEMENT_ENTRY("zoomandpan", "zoomAndPan"),
};
246 
// Static data for SVG tag-name replacements.  Each entry maps the
// all-lowercase spelling of an SVG element name to its mixed-case spelling.
// This array has no terminator entry, so its length must be taken with sizeof.
static const ReplacementEntry kSvgTagReplacements[] = {
    REPLACEMENT_ENTRY("altglyph", "altGlyph"),
    REPLACEMENT_ENTRY("altglyphdef", "altGlyphDef"),
    REPLACEMENT_ENTRY("altglyphitem", "altGlyphItem"),
    REPLACEMENT_ENTRY("animatecolor", "animateColor"),
    REPLACEMENT_ENTRY("animatemotion", "animateMotion"),
    REPLACEMENT_ENTRY("animatetransform", "animateTransform"),
    REPLACEMENT_ENTRY("clippath", "clipPath"),
    REPLACEMENT_ENTRY("feblend", "feBlend"),
    REPLACEMENT_ENTRY("fecolormatrix", "feColorMatrix"),
    REPLACEMENT_ENTRY("fecomponenttransfer", "feComponentTransfer"),
    REPLACEMENT_ENTRY("fecomposite", "feComposite"),
    REPLACEMENT_ENTRY("feconvolvematrix", "feConvolveMatrix"),
    REPLACEMENT_ENTRY("fediffuselighting", "feDiffuseLighting"),
    REPLACEMENT_ENTRY("fedisplacementmap", "feDisplacementMap"),
    REPLACEMENT_ENTRY("fedistantlight", "feDistantLight"),
    REPLACEMENT_ENTRY("feflood", "feFlood"),
    REPLACEMENT_ENTRY("fefunca", "feFuncA"),
    REPLACEMENT_ENTRY("fefuncb", "feFuncB"),
    REPLACEMENT_ENTRY("fefuncg", "feFuncG"),
    REPLACEMENT_ENTRY("fefuncr", "feFuncR"),
    REPLACEMENT_ENTRY("fegaussianblur", "feGaussianBlur"),
    REPLACEMENT_ENTRY("feimage", "feImage"),
    REPLACEMENT_ENTRY("femerge", "feMerge"),
    REPLACEMENT_ENTRY("femergenode", "feMergeNode"),
    REPLACEMENT_ENTRY("femorphology", "feMorphology"),
    REPLACEMENT_ENTRY("feoffset", "feOffset"),
    REPLACEMENT_ENTRY("fepointlight", "fePointLight"),
    REPLACEMENT_ENTRY("fespecularlighting", "feSpecularLighting"),
    REPLACEMENT_ENTRY("fespotlight", "feSpotLight"),
    REPLACEMENT_ENTRY("fetile", "feTile"),
    REPLACEMENT_ENTRY("feturbulence", "feTurbulence"),
    REPLACEMENT_ENTRY("foreignobject", "foreignObject"),
    REPLACEMENT_ENTRY("glyphref", "glyphRef"),
    REPLACEMENT_ENTRY("lineargradient", "linearGradient"),
    REPLACEMENT_ENTRY("radialgradient", "radialGradient"),
    REPLACEMENT_ENTRY("textpath", "textPath"),
};
285 
// One entry of the foreign-attribute replacement table: `from` is the
// attribute name as written (possibly with a prefix), `local_name` is the name
// that goes with it once the prefix is split off, and `attr_namespace` is the
// attribute namespace paired with it.
typedef struct _NamespacedAttributeReplacement {
  const char* from;
  const char* local_name;
  const GumboAttributeNamespaceEnum attr_namespace;
} NamespacedAttributeReplacement;
291 
292 static const NamespacedAttributeReplacement kForeignAttributeReplacements[] = {
293     {"xlink:actuate", "actuate", GUMBO_ATTR_NAMESPACE_XLINK},
294     {"xlink:actuate", "actuate", GUMBO_ATTR_NAMESPACE_XLINK},
295     {"xlink:href", "href", GUMBO_ATTR_NAMESPACE_XLINK},
296     {"xlink:role", "role", GUMBO_ATTR_NAMESPACE_XLINK},
297     {"xlink:show", "show", GUMBO_ATTR_NAMESPACE_XLINK},
298     {"xlink:title", "title", GUMBO_ATTR_NAMESPACE_XLINK},
299     {"xlink:type", "type", GUMBO_ATTR_NAMESPACE_XLINK},
300     {"xml:base", "base", GUMBO_ATTR_NAMESPACE_XML},
301     {"xml:lang", "lang", GUMBO_ATTR_NAMESPACE_XML},
302     {"xml:space", "space", GUMBO_ATTR_NAMESPACE_XML},
303     {"xmlns", "xmlns", GUMBO_ATTR_NAMESPACE_XMLNS},
304     {"xmlns:xlink", "xlink", GUMBO_ATTR_NAMESPACE_XMLNS},
305 };
306 
// The "scope marker" for the list of active formatting elements.  We use a
// pointer to this as a generic marker element, since the particular element
// scope doesn't matter.  Only the address of this object is meaningful; its
// contents are never set explicitly.
static const GumboNode kActiveFormattingScopeMarker;

// The tag_is and tag_in functions use true & false to denote start & end tags,
// but for readability, we define constants for them here.
static const bool kStartTag = true;
static const bool kEndTag = false;
316 
// Because GumboStringPieces are immutable, we can't insert a character directly
// into a text node.  Instead, we accumulate all pending characters here and
// flush them out to a text node whenever a new element is inserted.
//
// parser_state_init() initializes _buffer and sets _type to
// GUMBO_NODE_WHITESPACE; it does not assign the two "start" fields.
//
// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#insert-a-character
typedef struct _TextNodeBufferState {
  // The accumulated text to be inserted into the current text node.
  GumboStringBuffer _buffer;

  // A pointer to the original text represented by this text node.  Note that
  // because of foster parenting and other strange DOM manipulations, this may
  // include other non-text HTML tags in it; it is defined as the span of
  // original text from the first character in this text node to the last
  // character in this text node.
  const char* _start_original_text;

  // The source position of the start of this text node.
  GumboSourcePosition _start_position;

  // The type of node that will be inserted (TEXT, CDATA, or WHITESPACE).
  GumboNodeType _type;
} TextNodeBufferState;
339 
// Internal state of the tree-construction stage.  One instance is allocated by
// parser_state_init() and stored in parser->_parser_state; it is released by
// parser_state_destroy().
typedef struct GumboInternalParserState {
  // http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#insertion-mode
  GumboInsertionMode _insertion_mode;

  // Used for run_generic_parsing_algorithm, which needs to switch back to the
  // original insertion mode at its conclusion.
  GumboInsertionMode _original_insertion_mode;

  // http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#the-stack-of-open-elements
  // The most recently opened element is the last entry (see get_current_node).
  GumboVector /*GumboNode*/ _open_elements;

  // http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#the-list-of-active-formatting-elements
  GumboVector /*GumboNode*/ _active_formatting_elements;

  // The stack of template insertion modes.
  // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#the-insertion-mode
  GumboVector /*InsertionMode*/ _template_insertion_modes;

  // http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#the-element-pointers
  // Both start out as NULL.
  GumboNode* _head_element;
  GumboNode* _form_element;

  // The element used as fragment context when parsing in fragment mode; NULL
  // otherwise (see is_fragment_parser).  If set, parser_state_destroy()
  // destroys this node.
  GumboNode* _fragment_ctx;

  // The flag for when the spec says "Reprocess the current token in..."
  bool _reprocess_current_token;

  // The flag for "acknowledge the token's self-closing flag".
  bool _self_closing_flag_acknowledged;

  // The "frameset-ok" flag from the spec.
  bool _frameset_ok;

  // The flag for "If the next token is a LINE FEED, ignore that token...".
  bool _ignore_next_linefeed;

  // The flag for "whenever a node would be inserted into the current node, it
  // must instead be foster parented".  This is used for misnested table
  // content, which needs to be handled according to "in body" rules yet foster
  // parented outside of the table.
  // It would perhaps be more explicit to have this as a parameter to
  // handle_in_body and insert_element, but given how special-purpose this is
  // and the number of call-sites that would need to take the extra parameter,
  // it's easier just to have a state flag.
  bool _foster_parent_insertions;

  // The accumulated text node buffer state.
  TextNodeBufferState _text_node;

  // The current token.
  GumboToken* _current_token;

  // The way that the spec is written, the </body> and </html> tags are *always*
  // implicit, because encountering one of those tokens merely switches the
  // insertion mode out of "in body".  So we have individual state flags for
  // those end tags that are then inspected by pop_current_node when the <body>
  // and <html> nodes are popped to set the GUMBO_INSERTION_IMPLICIT_END_TAG
  // flag appropriately.
  bool _closed_body_tag;
  bool _closed_html_tag;
} GumboParserState;
402 
token_has_attribute(const GumboToken * token,const char * name)403 static bool token_has_attribute(const GumboToken* token, const char* name) {
404   assert(token->type == GUMBO_TOKEN_START_TAG);
405   return gumbo_get_attribute(&token->v.start_tag.attributes, name) != NULL;
406 }
407 
408 // Checks if the value of the specified attribute is a case-insensitive match
409 // for the specified string.
attribute_matches(const GumboVector * attributes,const char * name,const char * value)410 static bool attribute_matches(
411     const GumboVector* attributes, const char* name, const char* value) {
412   const GumboAttribute* attr = gumbo_get_attribute(attributes, name);
413   return attr ? strcasecmp(value, attr->value) == 0 : false;
414 }
415 
416 // Checks if the value of the specified attribute is a case-sensitive match
417 // for the specified string.
attribute_matches_case_sensitive(const GumboVector * attributes,const char * name,const char * value)418 static bool attribute_matches_case_sensitive(
419     const GumboVector* attributes, const char* name, const char* value) {
420   const GumboAttribute* attr = gumbo_get_attribute(attributes, name);
421   return attr ? strcmp(value, attr->value) == 0 : false;
422 }
423 
424 // Checks if the specified attribute vectors are identical.
all_attributes_match(const GumboVector * attr1,const GumboVector * attr2)425 static bool all_attributes_match(
426     const GumboVector* attr1, const GumboVector* attr2) {
427   unsigned int num_unmatched_attr2_elements = attr2->length;
428   for (unsigned int i = 0; i < attr1->length; ++i) {
429     const GumboAttribute* attr = attr1->data[i];
430     if (attribute_matches_case_sensitive(attr2, attr->name, attr->value)) {
431       --num_unmatched_attr2_elements;
432     } else {
433       return false;
434     }
435   }
436   return num_unmatched_attr2_elements == 0;
437 }
438 
// Clears the parser's "frameset-ok" flag, logging the change first.
static void set_frameset_not_ok(GumboParser* parser) {
  gumbo_debug("Setting frameset_ok to false.\n");
  bool* frameset_ok = &parser->_parser_state->_frameset_ok;
  *frameset_ok = false;
}
443 
create_node(GumboParser * parser,GumboNodeType type)444 static GumboNode* create_node(GumboParser* parser, GumboNodeType type) {
445   GumboNode* node = gumbo_parser_allocate(parser, sizeof(GumboNode));
446   node->parent = NULL;
447   node->index_within_parent = -1;
448   node->type = type;
449   node->parse_flags = GUMBO_INSERTION_NORMAL;
450   return node;
451 }
452 
new_document_node(GumboParser * parser)453 static GumboNode* new_document_node(GumboParser* parser) {
454   GumboNode* document_node = create_node(parser, GUMBO_NODE_DOCUMENT);
455   document_node->parse_flags = GUMBO_INSERTION_BY_PARSER;
456   gumbo_vector_init(parser, 1, &document_node->v.document.children);
457 
458   // Must be initialized explicitly, as there's no guarantee that we'll see a
459   // doc type token.
460   GumboDocument* document = &document_node->v.document;
461   document->has_doctype = false;
462   document->name = NULL;
463   document->public_identifier = NULL;
464   document->system_identifier = NULL;
465   return document_node;
466 }
467 
// Allocates the parser's GumboOutput with a fresh document node and no root
// element, attaches it to the parser, and then initializes the error list.
static void output_init(GumboParser* parser) {
  GumboOutput* result = gumbo_parser_allocate(parser, sizeof(*result));
  result->root = NULL;
  result->document = new_document_node(parser);

  // The output is attached before gumbo_init_errors() runs, as in the
  // original ordering.
  parser->_output = result;
  gumbo_init_errors(parser);
}
475 
// Allocates the parser state, puts every member into a defined starting value
// and attaches the state to the parser.
static void parser_state_init(GumboParser* parser) {
  GumboParserState* parser_state =
      gumbo_parser_allocate(parser, sizeof(GumboParserState));
  // Nothing visible here guarantees that the allocator returns zeroed memory,
  // so clear the whole structure first.  This gives members that are not
  // assigned below (such as the text node's start position and original-text
  // pointer) a defined value instead of an indeterminate one.
  memset(parser_state, 0, sizeof(*parser_state));
  parser_state->_insertion_mode = GUMBO_INSERTION_MODE_INITIAL;
  parser_state->_original_insertion_mode = GUMBO_INSERTION_MODE_INITIAL;
  parser_state->_reprocess_current_token = false;
  // No token has been seen yet, so there is no self-closing flag waiting to
  // be acknowledged.
  parser_state->_self_closing_flag_acknowledged = true;
  parser_state->_frameset_ok = true;
  parser_state->_ignore_next_linefeed = false;
  parser_state->_foster_parent_insertions = false;
  parser_state->_text_node._type = GUMBO_NODE_WHITESPACE;
  parser_state->_text_node._start_original_text = NULL;
  gumbo_string_buffer_init(parser, &parser_state->_text_node._buffer);
  gumbo_vector_init(parser, 10, &parser_state->_open_elements);
  gumbo_vector_init(parser, 5, &parser_state->_active_formatting_elements);
  gumbo_vector_init(parser, 5, &parser_state->_template_insertion_modes);
  parser_state->_head_element = NULL;
  parser_state->_form_element = NULL;
  parser_state->_fragment_ctx = NULL;
  parser_state->_current_token = NULL;
  parser_state->_closed_body_tag = false;
  parser_state->_closed_html_tag = false;
  parser->_parser_state = parser_state;
}
497 
// Releases everything held by the parser state: the fragment context node (if
// there is one), the three vectors, the text buffer, and finally the state
// structure itself.  parser->_parser_state is not reset.
static void parser_state_destroy(GumboParser* parser) {
  GumboParserState* doomed = parser->_parser_state;
  if (doomed->_fragment_ctx != NULL) {
    destroy_node(parser, doomed->_fragment_ctx);
  }
  gumbo_vector_destroy(parser, &doomed->_active_formatting_elements);
  gumbo_vector_destroy(parser, &doomed->_open_elements);
  gumbo_vector_destroy(parser, &doomed->_template_insertion_modes);
  gumbo_string_buffer_destroy(parser, &doomed->_text_node._buffer);
  gumbo_parser_deallocate(parser, doomed);
}
509 
get_document_node(GumboParser * parser)510 static GumboNode* get_document_node(GumboParser* parser) {
511   return parser->_output->document;
512 }
513 
is_fragment_parser(const GumboParser * parser)514 static bool is_fragment_parser(const GumboParser* parser) {
515   return !!parser->_parser_state->_fragment_ctx;
516 }
517 
518 // Returns the node at the bottom of the stack of open elements, or NULL if no
519 // elements have been added yet.
get_current_node(GumboParser * parser)520 static GumboNode* get_current_node(GumboParser* parser) {
521   GumboVector* open_elements = &parser->_parser_state->_open_elements;
522   if (open_elements->length == 0) {
523     assert(!parser->_output->root);
524     return NULL;
525   }
526   assert(open_elements->length > 0);
527   assert(open_elements->data != NULL);
528   return open_elements->data[open_elements->length - 1];
529 }
530 
get_adjusted_current_node(GumboParser * parser)531 static GumboNode* get_adjusted_current_node(GumboParser* parser) {
532   GumboParserState* state = parser->_parser_state;
533   if (state->_open_elements.length == 1 && state->_fragment_ctx) {
534     return state->_fragment_ctx;
535   }
536   return get_current_node(parser);
537 }
538 
539 // Returns true if the given needle is in the given array of literal
540 // GumboStringPieces.  If exact_match is true, this requires that they match
541 // exactly; otherwise, this performs a prefix match to check if any of the
542 // elements in haystack start with needle.  This always performs a
543 // case-insensitive match.
is_in_static_list(const char * needle,const GumboStringPiece * haystack,bool exact_match)544 static bool is_in_static_list(
545     const char* needle, const GumboStringPiece* haystack, bool exact_match) {
546   for (unsigned int i = 0; haystack[i].length > 0; ++i) {
547     if ((exact_match && !strcmp(needle, haystack[i].data)) ||
548         (!exact_match && !strcasecmp(needle, haystack[i].data))) {
549       return true;
550     }
551   }
552   return false;
553 }
554 
// Stores `mode` as the parser's current insertion mode.
static void set_insertion_mode(GumboParser* parser, GumboInsertionMode mode) {
  GumboInsertionMode* current = &parser->_parser_state->_insertion_mode;
  *current = mode;
}
558 
// http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#reset-the-insertion-mode-appropriately
// This is a helper function that returns the appropriate insertion mode instead
// of setting it.  Returns GUMBO_INSERTION_MODE_INITIAL as a sentinel value to
// indicate that there is no appropriate insertion mode, and the loop should
// continue.
//
// `index` is the position, within the stack of open elements, of the node to
// examine; it must be a valid index.  Index 0 is treated as the "last" node of
// the algorithm; for it the fragment context element (when there is one) is
// examined instead of the stack entry, and IN_BODY is returned when no other
// case applies.
static GumboInsertionMode get_appropriate_insertion_mode(
    const GumboParser* parser, int index) {
  const GumboVector* open_elements = &parser->_parser_state->_open_elements;
  const GumboNode* node = open_elements->data[index];
  const bool is_last = index == 0;

  if (is_last && is_fragment_parser(parser)) {
    node = parser->_parser_state->_fragment_ctx;
  }

  assert(node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE);
  switch (node->v.element.tag) {
    case GUMBO_TAG_SELECT: {
      if (is_last) {
        return GUMBO_INSERTION_MODE_IN_SELECT;
      }
      // Walk from this node towards the start of the stack (index 0 itself is
      // not examined): a template found first keeps plain "in select", a table
      // found first selects "in select in table".
      for (int i = index; i > 0; --i) {
        const GumboNode* ancestor = open_elements->data[i];
        if (node_html_tag_is(ancestor, GUMBO_TAG_TEMPLATE)) {
          return GUMBO_INSERTION_MODE_IN_SELECT;
        }
        if (node_html_tag_is(ancestor, GUMBO_TAG_TABLE)) {
          return GUMBO_INSERTION_MODE_IN_SELECT_IN_TABLE;
        }
      }
      return GUMBO_INSERTION_MODE_IN_SELECT;
    }
    case GUMBO_TAG_TD:
    case GUMBO_TAG_TH:
      // For the last node this falls out of the switch to the final return.
      if (!is_last) return GUMBO_INSERTION_MODE_IN_CELL;
      break;
    case GUMBO_TAG_TR:
      return GUMBO_INSERTION_MODE_IN_ROW;
    case GUMBO_TAG_TBODY:
    case GUMBO_TAG_THEAD:
    case GUMBO_TAG_TFOOT:
      return GUMBO_INSERTION_MODE_IN_TABLE_BODY;
    case GUMBO_TAG_CAPTION:
      return GUMBO_INSERTION_MODE_IN_CAPTION;
    case GUMBO_TAG_COLGROUP:
      return GUMBO_INSERTION_MODE_IN_COLUMN_GROUP;
    case GUMBO_TAG_TABLE:
      return GUMBO_INSERTION_MODE_IN_TABLE;
    case GUMBO_TAG_TEMPLATE:
      return get_current_template_insertion_mode(parser);
    case GUMBO_TAG_HEAD:
      // For the last node this falls out of the switch to the final return.
      if (!is_last) return GUMBO_INSERTION_MODE_IN_HEAD;
      break;
    case GUMBO_TAG_BODY:
      return GUMBO_INSERTION_MODE_IN_BODY;
    case GUMBO_TAG_FRAMESET:
      return GUMBO_INSERTION_MODE_IN_FRAMESET;
    case GUMBO_TAG_HTML:
      // Whether the head element pointer has been set decides between the two
      // modes.
      return parser->_parser_state->_head_element
                 ? GUMBO_INSERTION_MODE_AFTER_HEAD
                 : GUMBO_INSERTION_MODE_BEFORE_HEAD;
    default:
      break;
  }
  // No case matched: the last node forces "in body"; any other node reports
  // the sentinel so that the caller moves on to the next stack entry.
  return is_last ? GUMBO_INSERTION_MODE_IN_BODY : GUMBO_INSERTION_MODE_INITIAL;
}
625 
626 // This performs the actual "reset the insertion mode" loop.
reset_insertion_mode_appropriately(GumboParser * parser)627 static void reset_insertion_mode_appropriately(GumboParser* parser) {
628   const GumboVector* open_elements = &parser->_parser_state->_open_elements;
629   for (int i = open_elements->length; --i >= 0;) {
630     GumboInsertionMode mode = get_appropriate_insertion_mode(parser, i);
631     if (mode != GUMBO_INSERTION_MODE_INITIAL) {
632       set_insertion_mode(parser, mode);
633       return;
634     }
635   }
636   // Should never get here, because is_last will be set on the last iteration
637   // and will force GUMBO_INSERTION_MODE_IN_BODY.
638   assert(0);
639 }
640 
parser_add_parse_error(GumboParser * parser,const GumboToken * token)641 static GumboError* parser_add_parse_error(
642     GumboParser* parser, const GumboToken* token) {
643   gumbo_debug("Adding parse error.\n");
644   GumboError* error = gumbo_add_error(parser);
645   if (!error) {
646     return NULL;
647   }
648   error->type = GUMBO_ERR_PARSER;
649   error->position = token->position;
650   error->original_text = token->original_text.data;
651   GumboParserError* extra_data = &error->v.parser;
652   extra_data->input_type = token->type;
653   extra_data->input_tag = GUMBO_TAG_UNKNOWN;
654   if (token->type == GUMBO_TOKEN_START_TAG) {
655     extra_data->input_tag = token->v.start_tag.tag;
656   } else if (token->type == GUMBO_TOKEN_END_TAG) {
657     extra_data->input_tag = token->v.end_tag;
658   }
659   GumboParserState* state = parser->_parser_state;
660   extra_data->parser_state = state->_insertion_mode;
661   gumbo_vector_init(
662       parser, state->_open_elements.length, &extra_data->tag_stack);
663   for (unsigned int i = 0; i < state->_open_elements.length; ++i) {
664     const GumboNode* node = state->_open_elements.data[i];
665     assert(
666         node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE);
667     gumbo_vector_add(
668         parser, (void*) node->v.element.tag, &extra_data->tag_stack);
669   }
670   return error;
671 }
672 
673 // Returns true if the specified token is either a start or end tag (specified
674 // by is_start) with one of the tag types in the varargs list.  Terminate the
675 // list with GUMBO_TAG_LAST; this functions as a sentinel since no portion of
676 // the spec references tags that are not in the spec.
tag_in(const GumboToken * token,bool is_start,const gumbo_tagset tags)677 static bool tag_in(
678     const GumboToken* token, bool is_start, const gumbo_tagset tags) {
679   GumboTag token_tag;
680   if (is_start && token->type == GUMBO_TOKEN_START_TAG) {
681     token_tag = token->v.start_tag.tag;
682   } else if (!is_start && token->type == GUMBO_TOKEN_END_TAG) {
683     token_tag = token->v.end_tag;
684   } else {
685     return false;
686   }
687   return (token_tag < GUMBO_TAG_LAST && tags[(int) token_tag] != 0);
688 }
689 
690 // Like tag_in, but for the single-tag case.
tag_is(const GumboToken * token,bool is_start,GumboTag tag)691 static bool tag_is(const GumboToken* token, bool is_start, GumboTag tag) {
692   if (is_start && token->type == GUMBO_TOKEN_START_TAG) {
693     return token->v.start_tag.tag == tag;
694   } else if (!is_start && token->type == GUMBO_TOKEN_END_TAG) {
695     return token->v.end_tag == tag;
696   } else {
697     return false;
698   }
699 }
700 
701 // Like tag_in, but checks for the tag of a node, rather than a token.
node_tag_in_set(const GumboNode * node,const gumbo_tagset tags)702 static bool node_tag_in_set(const GumboNode* node, const gumbo_tagset tags) {
703   assert(node != NULL);
704   if (node->type != GUMBO_NODE_ELEMENT && node->type != GUMBO_NODE_TEMPLATE) {
705     return false;
706   }
707   return TAGSET_INCLUDES(
708       tags, node->v.element.tag_namespace, node->v.element.tag);
709 }
710 
711 // Like node_tag_in, but for the single-tag case.
node_qualified_tag_is(const GumboNode * node,GumboNamespaceEnum ns,GumboTag tag)712 static bool node_qualified_tag_is(
713     const GumboNode* node, GumboNamespaceEnum ns, GumboTag tag) {
714   assert(node);
715   return (node->type == GUMBO_NODE_ELEMENT ||
716              node->type == GUMBO_NODE_TEMPLATE) &&
717          node->v.element.tag == tag && node->v.element.tag_namespace == ns;
718 }
719 
720 // Like node_tag_in, but for the single-tag case in the HTML namespace
node_html_tag_is(const GumboNode * node,GumboTag tag)721 static bool node_html_tag_is(const GumboNode* node, GumboTag tag) {
722   return node_qualified_tag_is(node, GUMBO_NAMESPACE_HTML, tag);
723 }
724 
// Pushes an insertion mode onto the stack of template insertion modes.  The
// enum value is stored directly in the vector's pointer-sized slot.
static void push_template_insertion_mode(
    GumboParser* parser, GumboInsertionMode mode) {
  void* encoded_mode = (void*) mode;
  gumbo_vector_add(
      parser, encoded_mode, &parser->_parser_state->_template_insertion_modes);
}
730 
// Removes the top entry from the stack of template insertion modes.  The
// popped value is discarded.
static void pop_template_insertion_mode(GumboParser* parser) {
  (void) gumbo_vector_pop(
      parser, &parser->_parser_state->_template_insertion_modes);
}
734 
735 // Returns the current template insertion mode.  If the stack of template
736 // insertion modes is empty, this returns GUMBO_INSERTION_MODE_INITIAL.
get_current_template_insertion_mode(const GumboParser * parser)737 static GumboInsertionMode get_current_template_insertion_mode(
738     const GumboParser* parser) {
739   GumboVector* template_insertion_modes =
740       &parser->_parser_state->_template_insertion_modes;
741   if (template_insertion_modes->length == 0) {
742     return GUMBO_INSERTION_MODE_INITIAL;
743   }
744   return (GumboInsertionMode)
745       template_insertion_modes->data[(template_insertion_modes->length - 1)];
746 }
747 
748 // http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#mathml-text-integration-point
is_mathml_integration_point(const GumboNode * node)749 static bool is_mathml_integration_point(const GumboNode* node) {
750   return node_tag_in_set(
751       node, (gumbo_tagset){TAG_MATHML(MI), TAG_MATHML(MO), TAG_MATHML(MN),
752                 TAG_MATHML(MS), TAG_MATHML(MTEXT)});
753 }
754 
755 // http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#html-integration-point
is_html_integration_point(const GumboNode * node)756 static bool is_html_integration_point(const GumboNode* node) {
757   return node_tag_in_set(node, (gumbo_tagset){TAG_SVG(FOREIGNOBJECT),
758                                    TAG_SVG(DESC), TAG_SVG(TITLE)}) ||
759          (node_qualified_tag_is(
760               node, GUMBO_NAMESPACE_MATHML, GUMBO_TAG_ANNOTATION_XML) &&
761              (attribute_matches(
762                   &node->v.element.attributes, "encoding", "text/html") ||
763                  attribute_matches(&node->v.element.attributes, "encoding",
764                      "application/xhtml+xml")));
765 }
766 
// Describes where a node should be inserted: a target parent plus a child
// index within that parent.  An index of -1 means the node should be appended
// to the end of the parent's children.
typedef struct {
  // Parent node that will receive the new child.
  GumboNode* target;
  // Position among the parent's children, or -1 to append.
  int index;
} InsertionLocation;
774 
get_appropriate_insertion_location(GumboParser * parser,GumboNode * override_target)775 InsertionLocation get_appropriate_insertion_location(
776     GumboParser* parser, GumboNode* override_target) {
777   InsertionLocation retval = {override_target, -1};
778   if (retval.target == NULL) {
779     // No override target; default to the current node, but special-case the
780     // root node since get_current_node() assumes the stack of open elements is
781     // non-empty.
782     retval.target = parser->_output->root != NULL ? get_current_node(parser)
783                                                   : get_document_node(parser);
784   }
785   if (!parser->_parser_state->_foster_parent_insertions ||
786       !node_tag_in_set(retval.target, (gumbo_tagset){TAG(TABLE), TAG(TBODY),
787                                           TAG(TFOOT), TAG(THEAD), TAG(TR)})) {
788     return retval;
789   }
790 
791   // Foster-parenting case.
792   int last_template_index = -1;
793   int last_table_index = -1;
794   GumboVector* open_elements = &parser->_parser_state->_open_elements;
795   for (unsigned int i = 0; i < open_elements->length; ++i) {
796     if (node_html_tag_is(open_elements->data[i], GUMBO_TAG_TEMPLATE)) {
797       last_template_index = i;
798     }
799     if (node_html_tag_is(open_elements->data[i], GUMBO_TAG_TABLE)) {
800       last_table_index = i;
801     }
802   }
803   if (last_template_index != -1 &&
804       (last_table_index == -1 || last_template_index > last_table_index)) {
805     retval.target = open_elements->data[last_template_index];
806     return retval;
807   }
808   if (last_table_index == -1) {
809     retval.target = open_elements->data[0];
810     return retval;
811   }
812   GumboNode* last_table = open_elements->data[last_table_index];
813   if (last_table->parent != NULL) {
814     retval.target = last_table->parent;
815     retval.index = last_table->index_within_parent;
816     return retval;
817   }
818 
819   retval.target = open_elements->data[last_table_index - 1];
820   return retval;
821 }
822 
823 // Appends a node to the end of its parent, setting the "parent" and
824 // "index_within_parent" fields appropriately.
append_node(GumboParser * parser,GumboNode * parent,GumboNode * node)825 static void append_node(
826     GumboParser* parser, GumboNode* parent, GumboNode* node) {
827   assert(node->parent == NULL);
828   assert(node->index_within_parent == -1);
829   GumboVector* children;
830   if (parent->type == GUMBO_NODE_ELEMENT ||
831       parent->type == GUMBO_NODE_TEMPLATE) {
832     children = &parent->v.element.children;
833   } else {
834     assert(parent->type == GUMBO_NODE_DOCUMENT);
835     children = &parent->v.document.children;
836   }
837   node->parent = parent;
838   node->index_within_parent = children->length;
839   gumbo_vector_add(parser, (void*) node, children);
840   assert(node->index_within_parent < children->length);
841 }
842 
843 // Inserts a node at the specified InsertionLocation, updating the
844 // "parent" and "index_within_parent" fields of it and all its siblings.
845 // If the index of the location is -1, this calls append_node.
insert_node(GumboParser * parser,GumboNode * node,InsertionLocation location)846 static void insert_node(
847     GumboParser* parser, GumboNode* node, InsertionLocation location) {
848   assert(node->parent == NULL);
849   assert(node->index_within_parent == -1);
850   GumboNode* parent = location.target;
851   int index = location.index;
852   if (index != -1) {
853     GumboVector* children = NULL;
854     if (parent->type == GUMBO_NODE_ELEMENT ||
855         parent->type == GUMBO_NODE_TEMPLATE) {
856       children = &parent->v.element.children;
857     } else if (parent->type == GUMBO_NODE_DOCUMENT) {
858       children = &parent->v.document.children;
859       assert(children->length == 0);
860     } else {
861       assert(0);
862     }
863 
864     assert(index >= 0);
865     assert((unsigned int) index < children->length);
866     node->parent = parent;
867     node->index_within_parent = index;
868     gumbo_vector_insert_at(parser, (void*) node, index, children);
869     assert(node->index_within_parent < children->length);
870     for (unsigned int i = index + 1; i < children->length; ++i) {
871       GumboNode* sibling = children->data[i];
872       sibling->index_within_parent = i;
873       assert(sibling->index_within_parent < children->length);
874     }
875   } else {
876     append_node(parser, parent, node);
877   }
878 }
879 
// Flushes the pending text-node buffer, if it holds any characters, into a new
// node of the buffered type (whitespace, text or CDATA) and inserts that node
// at the appropriate insertion location.  Afterwards the buffer is cleared and
// its type is reset to GUMBO_NODE_WHITESPACE.  Does nothing when the buffer is
// empty.
static void maybe_flush_text_node_buffer(GumboParser* parser) {
  GumboParserState* state = parser->_parser_state;
  TextNodeBufferState* buffer_state = &state->_text_node;
  if (buffer_state->_buffer.length == 0) {
    return;
  }

  assert(buffer_state->_type == GUMBO_NODE_WHITESPACE ||
         buffer_state->_type == GUMBO_NODE_TEXT ||
         buffer_state->_type == GUMBO_NODE_CDATA);
  GumboNode* text_node = create_node(parser, buffer_state->_type);
  GumboText* text_node_data = &text_node->v.text;
  text_node_data->text =
      gumbo_string_buffer_to_string(parser, &buffer_state->_buffer);
  // The original text runs from where the buffer started accumulating up to
  // the start of the token currently being processed.
  text_node_data->original_text.data = buffer_state->_start_original_text;
  text_node_data->original_text.length =
      state->_current_token->original_text.data -
      buffer_state->_start_original_text;
  text_node_data->start_pos = buffer_state->_start_position;

  gumbo_debug("Flushing text node buffer of %.*s.\n",
      (int) buffer_state->_buffer.length, buffer_state->_buffer.data);

  InsertionLocation location = get_appropriate_insertion_location(parser, NULL);
  if (location.target->type == GUMBO_NODE_DOCUMENT) {
    // The DOM does not allow Document nodes to have Text children, so per the
    // spec, they are dropped on the floor.
    destroy_node(parser, text_node);
  } else {
    insert_node(parser, text_node, location);
  }

  // Reset the buffer so the next run of characters starts from a clean state.
  gumbo_string_buffer_clear(parser, &buffer_state->_buffer);
  buffer_state->_type = GUMBO_NODE_WHITESPACE;
  assert(buffer_state->_buffer.length == 0);
}
916 
// Stamps the element with the position of the token that ended it.  The
// element's original end tag text is taken from the token when that token is
// an end tag, and is the empty string otherwise.
static void record_end_of_element(
    GumboToken* current_token, GumboElement* element) {
  element->end_pos = current_token->position;
  if (current_token->type == GUMBO_TOKEN_END_TAG) {
    element->original_end_tag = current_token->original_text;
  } else {
    element->original_end_tag = kGumboEmptyString;
  }
}
924 
// Pops the current node off the stack of open elements and returns it, or
// returns NULL if the stack was already empty.  Any pending text is flushed
// first so that it is attached before the node is closed.
//
// Unless the node is a <body> or <html> element whose end tag has already been
// seen (per the _closed_body_tag / _closed_html_tag state flags), the node's
// end position is recorded from the current token, and the node is flagged
// GUMBO_INSERTION_IMPLICIT_END_TAG when the current token is not an end tag
// matching the node's own tag.
static GumboNode* pop_current_node(GumboParser* parser) {
  GumboParserState* state = parser->_parser_state;
  maybe_flush_text_node_buffer(parser);
  if (state->_open_elements.length > 0) {
    // The bottom of a non-empty stack is expected to be the <html> element.
    assert(node_html_tag_is(state->_open_elements.data[0], GUMBO_TAG_HTML));
    gumbo_debug("Popping %s node.\n",
        gumbo_normalized_tagname(get_current_node(parser)->v.element.tag));
  }
  GumboNode* current_node = gumbo_vector_pop(parser, &state->_open_elements);
  if (!current_node) {
    assert(state->_open_elements.length == 0);
    return NULL;
  }
  assert(current_node->type == GUMBO_NODE_ELEMENT ||
         current_node->type == GUMBO_NODE_TEMPLATE);
  // True when this is a <body>/<html> node whose closing tag was already
  // handled; in that case its end information must not be overwritten here.
  bool is_closed_body_or_html_tag =
      (node_html_tag_is(current_node, GUMBO_TAG_BODY) &&
          state->_closed_body_tag) ||
      (node_html_tag_is(current_node, GUMBO_TAG_HTML) &&
          state->_closed_html_tag);
  // The node is being closed by something other than its own end tag.
  if ((state->_current_token->type != GUMBO_TOKEN_END_TAG ||
          !node_html_tag_is(current_node, state->_current_token->v.end_tag)) &&
      !is_closed_body_or_html_tag) {
    current_node->parse_flags |= GUMBO_INSERTION_IMPLICIT_END_TAG;
  }
  if (!is_closed_body_or_html_tag) {
    record_end_of_element(state->_current_token, &current_node->v.element);
  }
  return current_node;
}
955 
// Creates a comment node from the given token and appends it as the last child
// of `node`.  Pending text is flushed first so it precedes the comment.  The
// comment takes its text, original text and start position from the token.
static void append_comment_node(
    GumboParser* parser, GumboNode* node, const GumboToken* token) {
  maybe_flush_text_node_buffer(parser);

  GumboNode* comment_node = create_node(parser, GUMBO_NODE_COMMENT);
  comment_node->type = GUMBO_NODE_COMMENT;
  comment_node->parse_flags = GUMBO_INSERTION_NORMAL;

  comment_node->v.text.start_pos = token->position;
  comment_node->v.text.original_text = token->original_text;
  comment_node->v.text.text = token->v.text;

  append_node(parser, node, comment_node);
}
967 
968 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#clear-the-stack-back-to-a-table-row-context
clear_stack_to_table_row_context(GumboParser * parser)969 static void clear_stack_to_table_row_context(GumboParser* parser) {
970   while (!node_tag_in_set(get_current_node(parser),
971              (gumbo_tagset){TAG(HTML), TAG(TR), TAG(TEMPLATE)})) {
972     pop_current_node(parser);
973   }
974 }
975 
976 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#clear-the-stack-back-to-a-table-context
clear_stack_to_table_context(GumboParser * parser)977 static void clear_stack_to_table_context(GumboParser* parser) {
978   while (!node_tag_in_set(get_current_node(parser),
979              (gumbo_tagset){TAG(HTML), TAG(TABLE), TAG(TEMPLATE)})) {
980     pop_current_node(parser);
981   }
982 }
983 
984 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#clear-the-stack-back-to-a-table-body-context
clear_stack_to_table_body_context(GumboParser * parser)985 void clear_stack_to_table_body_context(GumboParser* parser) {
986   while (!node_tag_in_set(get_current_node(parser),
987              (gumbo_tagset){TAG(HTML), TAG(TBODY), TAG(TFOOT), TAG(THEAD),
988                  TAG(TEMPLATE)})) {
989     pop_current_node(parser);
990   }
991 }
992 
993 // Creates a parser-inserted element in the HTML namespace and returns it.
create_element(GumboParser * parser,GumboTag tag)994 static GumboNode* create_element(GumboParser* parser, GumboTag tag) {
995   GumboNode* node = create_node(parser, GUMBO_NODE_ELEMENT);
996   GumboElement* element = &node->v.element;
997   gumbo_vector_init(parser, 1, &element->children);
998   gumbo_vector_init(parser, 0, &element->attributes);
999   element->tag = tag;
1000   element->tag_namespace = GUMBO_NAMESPACE_HTML;
1001   element->original_tag = kGumboEmptyString;
1002   element->original_end_tag = kGumboEmptyString;
1003   element->start_pos = (parser->_parser_state->_current_token)
1004                            ? parser->_parser_state->_current_token->position
1005                            : kGumboEmptySourcePosition;
1006   element->end_pos = kGumboEmptySourcePosition;
1007   return node;
1008 }
1009 
1010 // Constructs an element from the given start tag token.
create_element_from_token(GumboParser * parser,GumboToken * token,GumboNamespaceEnum tag_namespace)1011 static GumboNode* create_element_from_token(
1012     GumboParser* parser, GumboToken* token, GumboNamespaceEnum tag_namespace) {
1013   assert(token->type == GUMBO_TOKEN_START_TAG);
1014   GumboTokenStartTag* start_tag = &token->v.start_tag;
1015 
1016   GumboNodeType type = (tag_namespace == GUMBO_NAMESPACE_HTML &&
1017                            start_tag->tag == GUMBO_TAG_TEMPLATE)
1018                            ? GUMBO_NODE_TEMPLATE
1019                            : GUMBO_NODE_ELEMENT;
1020 
1021   GumboNode* node = create_node(parser, type);
1022   GumboElement* element = &node->v.element;
1023   gumbo_vector_init(parser, 1, &element->children);
1024   element->attributes = start_tag->attributes;
1025   element->tag = start_tag->tag;
1026   element->tag_namespace = tag_namespace;
1027 
1028   assert(token->original_text.length >= 2);
1029   assert(token->original_text.data[0] == '<');
1030   assert(token->original_text.data[token->original_text.length - 1] == '>');
1031   element->original_tag = token->original_text;
1032   element->start_pos = token->position;
1033   element->original_end_tag = kGumboEmptyString;
1034   element->end_pos = kGumboEmptySourcePosition;
1035 
1036   // The element takes ownership of the attributes from the token, so any
1037   // allocated-memory fields should be nulled out.
1038   start_tag->attributes = kGumboEmptyVector;
1039   return node;
1040 }
1041 
1042 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#insert-an-html-element
insert_element(GumboParser * parser,GumboNode * node,bool is_reconstructing_formatting_elements)1043 static void insert_element(GumboParser* parser, GumboNode* node,
1044     bool is_reconstructing_formatting_elements) {
1045   GumboParserState* state = parser->_parser_state;
1046   // NOTE(jdtang): The text node buffer must always be flushed before inserting
1047   // a node, otherwise we're handling nodes in a different order than the spec
1048   // mandated.  However, one clause of the spec (character tokens in the body)
1049   // requires that we reconstruct the active formatting elements *before* adding
1050   // the character, and reconstructing the active formatting elements may itself
1051   // result in the insertion of new elements (which should be pushed onto the
1052   // stack of open elements before the buffer is flushed).  We solve this (for
1053   // the time being, the spec has been rewritten for <template> and the new
1054   // version may be simpler here) with a boolean flag to this method.
1055   if (!is_reconstructing_formatting_elements) {
1056     maybe_flush_text_node_buffer(parser);
1057   }
1058   InsertionLocation location = get_appropriate_insertion_location(parser, NULL);
1059   insert_node(parser, node, location);
1060   gumbo_vector_add(parser, (void*) node, &state->_open_elements);
1061 }
1062 
1063 // Convenience method that combines create_element_from_token and
1064 // insert_element, inserting the generated element directly into the current
1065 // node.  Returns the node inserted.
insert_element_from_token(GumboParser * parser,GumboToken * token)1066 static GumboNode* insert_element_from_token(
1067     GumboParser* parser, GumboToken* token) {
1068   GumboNode* element =
1069       create_element_from_token(parser, token, GUMBO_NAMESPACE_HTML);
1070   insert_element(parser, element, false);
1071   gumbo_debug("Inserting <%s> element (@%x) from token.\n",
1072       gumbo_normalized_tagname(element->v.element.tag), element);
1073   return element;
1074 }
1075 
1076 // Convenience method that combines create_element and insert_element, inserting
1077 // a parser-generated element of a specific tag type.  Returns the node
1078 // inserted.
insert_element_of_tag_type(GumboParser * parser,GumboTag tag,GumboParseFlags reason)1079 static GumboNode* insert_element_of_tag_type(
1080     GumboParser* parser, GumboTag tag, GumboParseFlags reason) {
1081   GumboNode* element = create_element(parser, tag);
1082   element->parse_flags |= GUMBO_INSERTION_BY_PARSER | reason;
1083   insert_element(parser, element, false);
1084   gumbo_debug("Inserting %s element (@%x) from tag type.\n",
1085       gumbo_normalized_tagname(tag), element);
1086   return element;
1087 }
1088 
1089 // Convenience method for creating foreign namespaced element.  Returns the node
1090 // inserted.
insert_foreign_element(GumboParser * parser,GumboToken * token,GumboNamespaceEnum tag_namespace)1091 static GumboNode* insert_foreign_element(
1092     GumboParser* parser, GumboToken* token, GumboNamespaceEnum tag_namespace) {
1093   assert(token->type == GUMBO_TOKEN_START_TAG);
1094   GumboNode* element = create_element_from_token(parser, token, tag_namespace);
1095   insert_element(parser, element, false);
1096   if (token_has_attribute(token, "xmlns") &&
1097       !attribute_matches_case_sensitive(&token->v.start_tag.attributes, "xmlns",
1098           kLegalXmlns[tag_namespace])) {
1099     // TODO(jdtang): Since there're multiple possible error codes here, we
1100     // eventually need reason codes to differentiate them.
1101     parser_add_parse_error(parser, token);
1102   }
1103   if (token_has_attribute(token, "xmlns:xlink") &&
1104       !attribute_matches_case_sensitive(&token->v.start_tag.attributes,
1105           "xmlns:xlink", "http://www.w3.org/1999/xlink")) {
1106     parser_add_parse_error(parser, token);
1107   }
1108   return element;
1109 }
1110 
// Appends the character carried by a whitespace, character, null or CDATA
// token to the pending text-node buffer.  When the buffer is empty, the
// token's original text pointer and position are remembered as the start of
// the pending text.  A character token marks the buffer as text and a CDATA
// token marks it as CDATA; other token types leave the buffered type alone.
static void insert_text_token(GumboParser* parser, GumboToken* token) {
  assert(token->type == GUMBO_TOKEN_WHITESPACE ||
         token->type == GUMBO_TOKEN_CHARACTER ||
         token->type == GUMBO_TOKEN_NULL || token->type == GUMBO_TOKEN_CDATA);
  TextNodeBufferState* pending = &parser->_parser_state->_text_node;

  const bool starts_new_run = pending->_buffer.length == 0;
  if (starts_new_run) {
    pending->_start_position = token->position;
    pending->_start_original_text = token->original_text.data;
  }

  gumbo_string_buffer_append_codepoint(
      parser, token->v.character, &pending->_buffer);

  switch (token->type) {
    case GUMBO_TOKEN_CHARACTER:
      pending->_type = GUMBO_NODE_TEXT;
      break;
    case GUMBO_TOKEN_CDATA:
      pending->_type = GUMBO_NODE_CDATA;
      break;
    default:
      break;
  }
  gumbo_debug("Inserting text token '%c'.\n", token->v.character);
}
1130 
1131 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#generic-rcdata-element-parsing-algorithm
run_generic_parsing_algorithm(GumboParser * parser,GumboToken * token,GumboTokenizerEnum lexer_state)1132 static void run_generic_parsing_algorithm(
1133     GumboParser* parser, GumboToken* token, GumboTokenizerEnum lexer_state) {
1134   insert_element_from_token(parser, token);
1135   gumbo_tokenizer_set_state(parser, lexer_state);
1136   parser->_parser_state->_original_insertion_mode =
1137       parser->_parser_state->_insertion_mode;
1138   parser->_parser_state->_insertion_mode = GUMBO_INSERTION_MODE_TEXT;
1139 }
1140 
// Marks the current token's self-closing flag as acknowledged in the parser
// state.
static void acknowledge_self_closing_tag(GumboParser* parser) {
  parser->_parser_state->_self_closing_flag_acknowledged = true;
}
1144 
1145 // Returns true if there's an anchor tag in the list of active formatting
1146 // elements, and fills in its index if so.
find_last_anchor_index(GumboParser * parser,int * anchor_index)1147 static bool find_last_anchor_index(GumboParser* parser, int* anchor_index) {
1148   GumboVector* elements = &parser->_parser_state->_active_formatting_elements;
1149   for (int i = elements->length; --i >= 0;) {
1150     GumboNode* node = elements->data[i];
1151     if (node == &kActiveFormattingScopeMarker) {
1152       return false;
1153     }
1154     if (node_html_tag_is(node, GUMBO_TAG_A)) {
1155       *anchor_index = i;
1156       return true;
1157     }
1158   }
1159   return false;
1160 }
1161 
1162 // Counts the number of open formatting elements in the list of active
1163 // formatting elements (after the last active scope marker) that have a specific
1164 // tag.  If this is > 0, then earliest_matching_index will be filled in with the
1165 // index of the first such element.
count_formatting_elements_of_tag(GumboParser * parser,const GumboNode * desired_node,int * earliest_matching_index)1166 static int count_formatting_elements_of_tag(GumboParser* parser,
1167     const GumboNode* desired_node, int* earliest_matching_index) {
1168   const GumboElement* desired_element = &desired_node->v.element;
1169   GumboVector* elements = &parser->_parser_state->_active_formatting_elements;
1170   int num_identical_elements = 0;
1171   for (int i = elements->length; --i >= 0;) {
1172     GumboNode* node = elements->data[i];
1173     if (node == &kActiveFormattingScopeMarker) {
1174       break;
1175     }
1176     assert(node->type == GUMBO_NODE_ELEMENT);
1177     if (node_qualified_tag_is(
1178             node, desired_element->tag_namespace, desired_element->tag) &&
1179         all_attributes_match(
1180             &node->v.element.attributes, &desired_element->attributes)) {
1181       num_identical_elements++;
1182       *earliest_matching_index = i;
1183     }
1184   }
1185   return num_identical_elements;
1186 }
1187 
1188 // http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#reconstruct-the-active-formatting-elements
add_formatting_element(GumboParser * parser,const GumboNode * node)1189 static void add_formatting_element(GumboParser* parser, const GumboNode* node) {
1190   assert(node == &kActiveFormattingScopeMarker ||
1191          node->type == GUMBO_NODE_ELEMENT);
1192   GumboVector* elements = &parser->_parser_state->_active_formatting_elements;
1193   if (node == &kActiveFormattingScopeMarker) {
1194     gumbo_debug("Adding a scope marker.\n");
1195   } else {
1196     gumbo_debug("Adding a formatting element.\n");
1197   }
1198 
1199   // Hunt for identical elements.
1200   int earliest_identical_element = elements->length;
1201   int num_identical_elements = count_formatting_elements_of_tag(
1202       parser, node, &earliest_identical_element);
1203 
1204   // Noah's Ark clause: if there're at least 3, remove the earliest.
1205   if (num_identical_elements >= 3) {
1206     gumbo_debug("Noah's ark clause: removing element at %d.\n",
1207         earliest_identical_element);
1208     gumbo_vector_remove_at(parser, earliest_identical_element, elements);
1209   }
1210 
1211   gumbo_vector_add(parser, (void*) node, elements);
1212 }
1213 
// Returns true if `node` (compared by pointer identity) is currently on the
// stack of open elements.
static bool is_open_element(GumboParser* parser, const GumboNode* node) {
  GumboVector* stack = &parser->_parser_state->_open_elements;
  unsigned int remaining = stack->length;
  while (remaining > 0) {
    --remaining;
    if (stack->data[remaining] == node) {
      return true;
    }
  }
  return false;
}
1223 
1224 // Clones attributes, tags, etc. of a node, but does not copy the content.  The
1225 // clone shares no structure with the original node: all owned strings and
1226 // values are fresh copies.
clone_node(GumboParser * parser,GumboNode * node,GumboParseFlags reason)1227 GumboNode* clone_node(
1228     GumboParser* parser, GumboNode* node, GumboParseFlags reason) {
1229   assert(node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE);
1230   GumboNode* new_node = gumbo_parser_allocate(parser, sizeof(GumboNode));
1231   *new_node = *node;
1232   new_node->parent = NULL;
1233   new_node->index_within_parent = -1;
1234   // Clear the GUMBO_INSERTION_IMPLICIT_END_TAG flag, as the cloned node may
1235   // have a separate end tag.
1236   new_node->parse_flags &= ~GUMBO_INSERTION_IMPLICIT_END_TAG;
1237   new_node->parse_flags |= reason | GUMBO_INSERTION_BY_PARSER;
1238   GumboElement* element = &new_node->v.element;
1239   gumbo_vector_init(parser, 1, &element->children);
1240 
1241   const GumboVector* old_attributes = &node->v.element.attributes;
1242   gumbo_vector_init(parser, old_attributes->length, &element->attributes);
1243   for (unsigned int i = 0; i < old_attributes->length; ++i) {
1244     const GumboAttribute* old_attr = old_attributes->data[i];
1245     GumboAttribute* attr =
1246         gumbo_parser_allocate(parser, sizeof(GumboAttribute));
1247     *attr = *old_attr;
1248     attr->name = gumbo_copy_stringz(parser, old_attr->name);
1249     attr->value = gumbo_copy_stringz(parser, old_attr->value);
1250     gumbo_vector_add(parser, attr, &element->attributes);
1251   }
1252   return new_node;
1253 }
1254 
// "Reconstruct active formatting elements" part of the spec.
// This implementation is based on the html5lib translation from the mess of
// GOTOs in the spec to reasonably structured programming.
// http://code.google.com/p/html5lib/source/browse/python/html5lib/treebuilders/_base.py
//
// Starting from the end of the list of active formatting elements, this
// rewinds to the entry just after the last scope marker or the last entry that
// is still on the stack of open elements (or to the start of the list if there
// is none).  Every entry from there to the end is then cloned, the clone is
// inserted at the appropriate insertion location and pushed onto the stack of
// open elements, and the list entry is replaced by the clone.
static void reconstruct_active_formatting_elements(GumboParser* parser) {
  GumboVector* elements = &parser->_parser_state->_active_formatting_elements;
  // Step 1
  if (elements->length == 0) {
    return;
  }

  // Step 2 & 3
  // Nothing to do if the last entry is a marker or is already open.
  unsigned int i = elements->length - 1;
  GumboNode* element = elements->data[i];
  if (element == &kActiveFormattingScopeMarker ||
      is_open_element(parser, element)) {
    return;
  }

  // Step 6
  // Rewind until a marker or an open element is found, or the list start is
  // reached.
  do {
    if (i == 0) {
      // Step 4
      // i is unsigned, so -1 wraps to UINT_MAX and the ++i below wraps it back
      // to 0.
      i = -1;  // Incremented to 0 below.
      break;
    }
    // Step 5
    element = elements->data[--i];
  } while (element != &kActiveFormattingScopeMarker &&
           !is_open_element(parser, element));

  // Advance to the first entry that needs to be reconstructed.
  ++i;
  gumbo_debug("Reconstructing elements from %d on %s parent.\n", i,
      gumbo_normalized_tagname(get_current_node(parser)->v.element.tag));
  for (; i < elements->length; ++i) {
    // Step 7 & 8.
    assert(elements->length > 0);
    assert(i < elements->length);
    element = elements->data[i];
    assert(element != &kActiveFormattingScopeMarker);
    GumboNode* clone = clone_node(
        parser, element, GUMBO_INSERTION_RECONSTRUCTED_FORMATTING_ELEMENT);
    // Step 9.
    InsertionLocation location =
        get_appropriate_insertion_location(parser, NULL);
    insert_node(parser, clone, location);
    gumbo_vector_add(
        parser, (void*) clone, &parser->_parser_state->_open_elements);

    // Step 10.
    // The list now refers to the clone instead of the original element.
    elements->data[i] = clone;
    gumbo_debug("Reconstructed %s element at %d.\n",
        gumbo_normalized_tagname(clone->v.element.tag), i);
  }
}
1310 
// Pops entries off the list of active formatting elements up to and including
// the last scope marker, or until the list is exhausted.  The count reported
// in the debug message includes the final pop (the marker, or the failed pop
// on an empty list).
static void clear_active_formatting_elements(GumboParser* parser) {
  GumboVector* formatting = &parser->_parser_state->_active_formatting_elements;
  int pops = 0;
  for (;;) {
    const GumboNode* popped = gumbo_vector_pop(parser, formatting);
    ++pops;
    if (!popped || popped == &kActiveFormattingScopeMarker) {
      break;
    }
  }
  gumbo_debug("Cleared %d elements from active formatting list.\n",
      pops);
}
1322 
1323 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#the-initial-insertion-mode
compute_quirks_mode(const GumboTokenDocType * doctype)1324 static GumboQuirksModeEnum compute_quirks_mode(
1325     const GumboTokenDocType* doctype) {
1326   if (doctype->force_quirks || strcmp(doctype->name, kDoctypeHtml.data) ||
1327       is_in_static_list(
1328           doctype->public_identifier, kQuirksModePublicIdPrefixes, false) ||
1329       is_in_static_list(
1330           doctype->public_identifier, kQuirksModePublicIdExactMatches, true) ||
1331       is_in_static_list(
1332           doctype->system_identifier, kQuirksModeSystemIdExactMatches, true) ||
1333       (is_in_static_list(doctype->public_identifier,
1334            kLimitedQuirksRequiresSystemIdPublicIdPrefixes, false) &&
1335           !doctype->has_system_identifier)) {
1336     return GUMBO_DOCTYPE_QUIRKS;
1337   } else if (is_in_static_list(doctype->public_identifier,
1338                  kLimitedQuirksPublicIdPrefixes, false) ||
1339              (is_in_static_list(doctype->public_identifier,
1340                   kLimitedQuirksRequiresSystemIdPublicIdPrefixes, false) &&
1341                  doctype->has_system_identifier)) {
1342     return GUMBO_DOCTYPE_LIMITED_QUIRKS;
1343   }
1344   return GUMBO_DOCTYPE_NO_QUIRKS;
1345 }
1346 
1347 // The following functions are all defined by the "has an element in __ scope"
1348 // sections of the HTML5 spec:
1349 // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-the-specific-scope
1350 // The basic idea behind them is that they check for an element of the given
1351 // qualified name, contained within a scope formed by a set of other qualified
1352 // names.  For example, "has an element in list scope" looks for an element of
1353 // the given qualified name within the nearest enclosing <ol> or <ul>, along
1354 // with a bunch of generic element types that serve to "firewall" their content
1355 // from the rest of the document. Note that because of the way the spec is
1356 // written,
1357 // all elements are expected to be in the HTML namespace
has_an_element_in_specific_scope(GumboParser * parser,int expected_size,const GumboTag * expected,bool negate,const gumbo_tagset tags)1358 static bool has_an_element_in_specific_scope(GumboParser* parser,
1359     int expected_size, const GumboTag* expected, bool negate,
1360     const gumbo_tagset tags) {
1361   GumboVector* open_elements = &parser->_parser_state->_open_elements;
1362   for (int i = open_elements->length; --i >= 0;) {
1363     const GumboNode* node = open_elements->data[i];
1364     if (node->type != GUMBO_NODE_ELEMENT && node->type != GUMBO_NODE_TEMPLATE)
1365       continue;
1366 
1367     GumboTag node_tag = node->v.element.tag;
1368     GumboNamespaceEnum node_ns = node->v.element.tag_namespace;
1369     for (int j = 0; j < expected_size; ++j) {
1370       if (node_tag == expected[j] && node_ns == GUMBO_NAMESPACE_HTML)
1371         return true;
1372     }
1373 
1374     bool found = TAGSET_INCLUDES(tags, node_ns, node_tag);
1375     if (negate != found) return false;
1376   }
1377   return false;
1378 }
1379 
1380 // Checks for the presence of an open element of the specified tag type.
has_open_element(GumboParser * parser,GumboTag tag)1381 static bool has_open_element(GumboParser* parser, GumboTag tag) {
1382   return has_an_element_in_specific_scope(
1383       parser, 1, &tag, false, (gumbo_tagset){TAG(HTML)});
1384 }
1385 
1386 // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-scope
has_an_element_in_scope(GumboParser * parser,GumboTag tag)1387 static bool has_an_element_in_scope(GumboParser* parser, GumboTag tag) {
1388   return has_an_element_in_specific_scope(parser, 1, &tag, false,
1389       (gumbo_tagset){TAG(APPLET), TAG(CAPTION), TAG(HTML), TAG(TABLE), TAG(TD),
1390           TAG(TH), TAG(MARQUEE), TAG(OBJECT), TAG(TEMPLATE), TAG_MATHML(MI),
1391           TAG_MATHML(MO), TAG_MATHML(MN), TAG_MATHML(MS), TAG_MATHML(MTEXT),
1392           TAG_MATHML(ANNOTATION_XML), TAG_SVG(FOREIGNOBJECT), TAG_SVG(DESC),
1393           TAG_SVG(TITLE)});
1394 }
1395 
1396 // Like "has an element in scope", but for the specific case of looking for a
1397 // unique target node, not for any node with a given tag name.  This duplicates
1398 // much of the algorithm from has_an_element_in_specific_scope because the
1399 // predicate is different when checking for an exact node, and it's easier &
1400 // faster just to duplicate the code for this one case than to try and
1401 // parameterize it.
has_node_in_scope(GumboParser * parser,const GumboNode * node)1402 static bool has_node_in_scope(GumboParser* parser, const GumboNode* node) {
1403   GumboVector* open_elements = &parser->_parser_state->_open_elements;
1404   for (int i = open_elements->length; --i >= 0;) {
1405     const GumboNode* current = open_elements->data[i];
1406     if (current == node) {
1407       return true;
1408     }
1409     if (current->type != GUMBO_NODE_ELEMENT &&
1410         current->type != GUMBO_NODE_TEMPLATE) {
1411       continue;
1412     }
1413     if (node_tag_in_set(current,
1414             (gumbo_tagset){TAG(APPLET), TAG(CAPTION), TAG(HTML), TAG(TABLE),
1415                 TAG(TD), TAG(TH), TAG(MARQUEE), TAG(OBJECT), TAG(TEMPLATE),
1416                 TAG_MATHML(MI), TAG_MATHML(MO), TAG_MATHML(MN), TAG_MATHML(MS),
1417                 TAG_MATHML(MTEXT), TAG_MATHML(ANNOTATION_XML),
1418                 TAG_SVG(FOREIGNOBJECT), TAG_SVG(DESC), TAG_SVG(TITLE)})) {
1419       return false;
1420     }
1421   }
1422   assert(false);
1423   return false;
1424 }
1425 
1426 // Like has_an_element_in_scope, but restricts the expected qualified name to a
1427 // range of possible qualified names instead of just a single one.
has_an_element_in_scope_with_tagname(GumboParser * parser,int expected_len,const GumboTag expected[])1428 static bool has_an_element_in_scope_with_tagname(
1429     GumboParser* parser, int expected_len, const GumboTag expected[]) {
1430   return has_an_element_in_specific_scope(parser, expected_len, expected, false,
1431       (gumbo_tagset){TAG(APPLET), TAG(CAPTION), TAG(HTML), TAG(TABLE), TAG(TD),
1432           TAG(TH), TAG(MARQUEE), TAG(OBJECT), TAG(TEMPLATE), TAG_MATHML(MI),
1433           TAG_MATHML(MO), TAG_MATHML(MN), TAG_MATHML(MS), TAG_MATHML(MTEXT),
1434           TAG_MATHML(ANNOTATION_XML), TAG_SVG(FOREIGNOBJECT), TAG_SVG(DESC),
1435           TAG_SVG(TITLE)});
1436 }
1437 
1438 // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-list-item-scope
has_an_element_in_list_scope(GumboParser * parser,GumboTag tag)1439 static bool has_an_element_in_list_scope(GumboParser* parser, GumboTag tag) {
1440   return has_an_element_in_specific_scope(parser, 1, &tag, false,
1441       (gumbo_tagset){TAG(APPLET), TAG(CAPTION), TAG(HTML), TAG(TABLE), TAG(TD),
1442           TAG(TH), TAG(MARQUEE), TAG(OBJECT), TAG(TEMPLATE), TAG_MATHML(MI),
1443           TAG_MATHML(MO), TAG_MATHML(MN), TAG_MATHML(MS), TAG_MATHML(MTEXT),
1444           TAG_MATHML(ANNOTATION_XML), TAG_SVG(FOREIGNOBJECT), TAG_SVG(DESC),
1445           TAG_SVG(TITLE), TAG(OL), TAG(UL)});
1446 }
1447 
1448 // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-button-scope
has_an_element_in_button_scope(GumboParser * parser,GumboTag tag)1449 static bool has_an_element_in_button_scope(GumboParser* parser, GumboTag tag) {
1450   return has_an_element_in_specific_scope(parser, 1, &tag, false,
1451       (gumbo_tagset){TAG(APPLET), TAG(CAPTION), TAG(HTML), TAG(TABLE), TAG(TD),
1452           TAG(TH), TAG(MARQUEE), TAG(OBJECT), TAG(TEMPLATE), TAG_MATHML(MI),
1453           TAG_MATHML(MO), TAG_MATHML(MN), TAG_MATHML(MS), TAG_MATHML(MTEXT),
1454           TAG_MATHML(ANNOTATION_XML), TAG_SVG(FOREIGNOBJECT), TAG_SVG(DESC),
1455           TAG_SVG(TITLE), TAG(BUTTON)});
1456 }
1457 
1458 // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-table-scope
has_an_element_in_table_scope(GumboParser * parser,GumboTag tag)1459 static bool has_an_element_in_table_scope(GumboParser* parser, GumboTag tag) {
1460   return has_an_element_in_specific_scope(parser, 1, &tag, false,
1461       (gumbo_tagset){TAG(HTML), TAG(TABLE), TAG(TEMPLATE)});
1462 }
1463 
1464 // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-select-scope
has_an_element_in_select_scope(GumboParser * parser,GumboTag tag)1465 static bool has_an_element_in_select_scope(GumboParser* parser, GumboTag tag) {
1466   return has_an_element_in_specific_scope(
1467       parser, 1, &tag, true, (gumbo_tagset){TAG(OPTGROUP), TAG(OPTION)});
1468 }
1469 
1470 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#generate-implied-end-tags
1471 // "exception" is the "element to exclude from the process" listed in the spec.
1472 // Pass GUMBO_TAG_LAST to not exclude any of them.
generate_implied_end_tags(GumboParser * parser,GumboTag exception)1473 static void generate_implied_end_tags(GumboParser* parser, GumboTag exception) {
1474   for (; node_tag_in_set(get_current_node(parser),
1475              (gumbo_tagset){TAG(DD), TAG(DT), TAG(LI), TAG(OPTION),
1476                  TAG(OPTGROUP), TAG(P), TAG(RP), TAG(RB), TAG(RT), TAG(RTC)}) &&
1477          !node_html_tag_is(get_current_node(parser), exception);
1478        pop_current_node(parser))
1479     ;
1480 }
1481 
1482 // This is the "generate all implied end tags thoroughly" clause of the spec.
1483 // https://html.spec.whatwg.org/multipage/syntax.html#closing-elements-that-have-implied-end-tags
generate_all_implied_end_tags_thoroughly(GumboParser * parser)1484 static void generate_all_implied_end_tags_thoroughly(GumboParser* parser) {
1485   for (
1486       ; node_tag_in_set(get_current_node(parser),
1487           (gumbo_tagset){TAG(CAPTION), TAG(COLGROUP), TAG(DD), TAG(DT), TAG(LI),
1488               TAG(OPTION), TAG(OPTGROUP), TAG(P), TAG(RP), TAG(RT), TAG(RTC),
1489               TAG(TBODY), TAG(TD), TAG(TFOOT), TAG(TH), TAG(HEAD), TAG(TR)});
1490       pop_current_node(parser))
1491     ;
1492 }
1493 
1494 // This factors out the clauses relating to "act as if an end tag token with tag
1495 // name "table" had been seen.  Returns true if there's a table element in table
1496 // scope which was successfully closed, false if not and the token should be
1497 // ignored.  Does not add parse errors; callers should handle that.
close_table(GumboParser * parser)1498 static bool close_table(GumboParser* parser) {
1499   if (!has_an_element_in_table_scope(parser, GUMBO_TAG_TABLE)) {
1500     return false;
1501   }
1502 
1503   GumboNode* node = pop_current_node(parser);
1504   while (!node_html_tag_is(node, GUMBO_TAG_TABLE)) {
1505     node = pop_current_node(parser);
1506   }
1507   reset_insertion_mode_appropriately(parser);
1508   return true;
1509 }
1510 
1511 // This factors out the clauses relating to "act as if an end tag token with tag
1512 // name `cell_tag` had been seen".
close_table_cell(GumboParser * parser,const GumboToken * token,GumboTag cell_tag)1513 static bool close_table_cell(
1514     GumboParser* parser, const GumboToken* token, GumboTag cell_tag) {
1515   bool result = true;
1516   generate_implied_end_tags(parser, GUMBO_TAG_LAST);
1517   const GumboNode* node = get_current_node(parser);
1518   if (!node_html_tag_is(node, cell_tag)) {
1519     parser_add_parse_error(parser, token);
1520     result = false;
1521   }
1522   do {
1523     node = pop_current_node(parser);
1524   } while (!node_html_tag_is(node, cell_tag));
1525 
1526   clear_active_formatting_elements(parser);
1527   set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW);
1528   return result;
1529 }
1530 
1531 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#close-the-cell
1532 // This holds the logic to determine whether we should close a <td> or a <th>.
close_current_cell(GumboParser * parser,const GumboToken * token)1533 static bool close_current_cell(GumboParser* parser, const GumboToken* token) {
1534   if (has_an_element_in_table_scope(parser, GUMBO_TAG_TD)) {
1535     assert(!has_an_element_in_table_scope(parser, GUMBO_TAG_TH));
1536     return close_table_cell(parser, token, GUMBO_TAG_TD);
1537   } else {
1538     assert(has_an_element_in_table_scope(parser, GUMBO_TAG_TH));
1539     return close_table_cell(parser, token, GUMBO_TAG_TH);
1540   }
1541 }
1542 
1543 // This factors out the "act as if an end tag of tag name 'select' had been
1544 // seen" clause of the spec, since it's referenced in several places.  It pops
1545 // all nodes from the stack until the current <select> has been closed, then
1546 // resets the insertion mode appropriately.
close_current_select(GumboParser * parser)1547 static void close_current_select(GumboParser* parser) {
1548   GumboNode* node = pop_current_node(parser);
1549   while (!node_html_tag_is(node, GUMBO_TAG_SELECT)) {
1550     node = pop_current_node(parser);
1551   }
1552   reset_insertion_mode_appropriately(parser);
1553 }
1554 
1555 // The list of nodes in the "special" category:
1556 // http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#special
is_special_node(const GumboNode * node)1557 static bool is_special_node(const GumboNode* node) {
1558   assert(node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE);
1559   return node_tag_in_set(node,
1560       (gumbo_tagset){TAG(ADDRESS), TAG(APPLET), TAG(AREA), TAG(ARTICLE),
1561           TAG(ASIDE), TAG(BASE), TAG(BASEFONT), TAG(BGSOUND), TAG(BLOCKQUOTE),
1562           TAG(BODY), TAG(BR), TAG(BUTTON), TAG(CAPTION), TAG(CENTER), TAG(COL),
1563           TAG(COLGROUP), TAG(MENUITEM), TAG(DD), TAG(DETAILS), TAG(DIR),
1564           TAG(DIV), TAG(DL), TAG(DT), TAG(EMBED), TAG(FIELDSET),
1565           TAG(FIGCAPTION), TAG(FIGURE), TAG(FOOTER), TAG(FORM), TAG(FRAME),
1566           TAG(FRAMESET), TAG(H1), TAG(H2), TAG(H3), TAG(H4), TAG(H5), TAG(H6),
1567           TAG(HEAD), TAG(HEADER), TAG(HGROUP), TAG(HR), TAG(HTML), TAG(IFRAME),
1568           TAG(IMG), TAG(INPUT), TAG(ISINDEX), TAG(LI), TAG(LINK), TAG(LISTING),
1569           TAG(MARQUEE), TAG(MENU), TAG(META), TAG(NAV), TAG(NOEMBED),
1570           TAG(NOFRAMES), TAG(NOSCRIPT), TAG(OBJECT), TAG(OL), TAG(P),
1571           TAG(PARAM), TAG(PLAINTEXT), TAG(PRE), TAG(SCRIPT), TAG(SECTION),
1572           TAG(SELECT), TAG(STYLE), TAG(SUMMARY), TAG(TABLE), TAG(TBODY),
1573           TAG(TD), TAG(TEMPLATE), TAG(TEXTAREA), TAG(TFOOT), TAG(TH),
1574           TAG(THEAD), TAG(TITLE), TAG(TR), TAG(UL), TAG(WBR), TAG(XMP),
1575 
1576           TAG_MATHML(MI), TAG_MATHML(MO), TAG_MATHML(MN), TAG_MATHML(MS),
1577           TAG_MATHML(MTEXT), TAG_MATHML(ANNOTATION_XML),
1578 
1579           TAG_SVG(FOREIGNOBJECT), TAG_SVG(DESC)});
1580 }
1581 
1582 // Implicitly closes currently open elements until it reaches an element with
1583 // the
1584 // specified qualified name.  If the elements closed are in the set handled by
1585 // generate_implied_end_tags, this is normal operation and this function returns
1586 // true.  Otherwise, a parse error is recorded and this function returns false.
implicitly_close_tags(GumboParser * parser,GumboToken * token,GumboNamespaceEnum target_ns,GumboTag target)1587 static bool implicitly_close_tags(GumboParser* parser, GumboToken* token,
1588     GumboNamespaceEnum target_ns, GumboTag target) {
1589   bool result = true;
1590   generate_implied_end_tags(parser, target);
1591   if (!node_qualified_tag_is(get_current_node(parser), target_ns, target)) {
1592     parser_add_parse_error(parser, token);
1593     while (
1594         !node_qualified_tag_is(get_current_node(parser), target_ns, target)) {
1595       pop_current_node(parser);
1596     }
1597     result = false;
1598   }
1599   assert(node_qualified_tag_is(get_current_node(parser), target_ns, target));
1600   pop_current_node(parser);
1601   return result;
1602 }
1603 
1604 // If the stack of open elements has a <p> tag in button scope, this acts as if
1605 // a </p> tag was encountered, implicitly closing tags.  Returns false if a
1606 // parse error occurs.  This is a convenience function because this particular
1607 // clause appears several times in the spec.
maybe_implicitly_close_p_tag(GumboParser * parser,GumboToken * token)1608 static bool maybe_implicitly_close_p_tag(
1609     GumboParser* parser, GumboToken* token) {
1610   if (has_an_element_in_button_scope(parser, GUMBO_TAG_P)) {
1611     return implicitly_close_tags(
1612         parser, token, GUMBO_NAMESPACE_HTML, GUMBO_TAG_P);
1613   }
1614   return true;
1615 }
1616 
1617 // Convenience function to encapsulate the logic for closing <li> or <dd>/<dt>
1618 // tags.  Pass true to is_li for handling <li> tags, false for <dd> and <dt>.
maybe_implicitly_close_list_tag(GumboParser * parser,GumboToken * token,bool is_li)1619 static void maybe_implicitly_close_list_tag(
1620     GumboParser* parser, GumboToken* token, bool is_li) {
1621   GumboParserState* state = parser->_parser_state;
1622   state->_frameset_ok = false;
1623   for (int i = state->_open_elements.length; --i >= 0;) {
1624     const GumboNode* node = state->_open_elements.data[i];
1625     bool is_list_tag =
1626         is_li ? node_html_tag_is(node, GUMBO_TAG_LI)
1627               : node_tag_in_set(node, (gumbo_tagset){TAG(DD), TAG(DT)});
1628     if (is_list_tag) {
1629       implicitly_close_tags(
1630           parser, token, node->v.element.tag_namespace, node->v.element.tag);
1631       return;
1632     }
1633     if (is_special_node(node) &&
1634         !node_tag_in_set(
1635             node, (gumbo_tagset){TAG(ADDRESS), TAG(DIV), TAG(P)})) {
1636       return;
1637     }
1638   }
1639 }
1640 
// Copies onto `node` every attribute of the start-tag `token` whose name the
// node does not already have, then destroys the token.  `token` must be a
// start tag and `node` an element (both checked by assertion).
static void merge_attributes(
    GumboParser* parser, GumboToken* token, GumboNode* node) {
  assert(token->type == GUMBO_TOKEN_START_TAG);
  assert(node->type == GUMBO_NODE_ELEMENT);
  const GumboVector* incoming = &token->v.start_tag.attributes;
  GumboVector* existing = &node->v.element.attributes;

  for (unsigned int i = 0; i < incoming->length; ++i) {
    GumboAttribute* candidate = incoming->data[i];
    if (gumbo_get_attribute(existing, candidate->name)) {
      // The node's own attribute wins; the token keeps (and later frees) its
      // copy.
      continue;
    }
    // Ownership of the attribute moves to the node with this
    // gumbo_vector_add, so the token's slot is nulled out to keep the
    // attribute from being deleted twice.
    gumbo_vector_add(parser, candidate, existing);
    incoming->data[i] = NULL;
  }

  // A merge means the token itself is being ignored, so its memory is released
  // here.  Transferred attributes were nulled out above and are unaffected.
  gumbo_token_destroy(parser, token);

#ifndef NDEBUG
  // Mark this sentinel so the assertion in the main loop knows it's been
  // destroyed.
  token->v.start_tag.attributes = kGumboEmptyVector;
#endif
}
1670 
gumbo_normalize_svg_tagname(const GumboStringPiece * tag)1671 const char* gumbo_normalize_svg_tagname(const GumboStringPiece* tag) {
1672   for (size_t i = 0; i < sizeof(kSvgTagReplacements) / sizeof(ReplacementEntry);
1673        ++i) {
1674     const ReplacementEntry* entry = &kSvgTagReplacements[i];
1675     if (gumbo_string_equals_ignore_case(tag, &entry->from)) {
1676       return entry->to.data;
1677     }
1678   }
1679   return NULL;
1680 }
1681 
1682 // http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#adjust-foreign-attributes
1683 // This destructively modifies any matching attributes on the token and sets the
1684 // namespace appropriately.
adjust_foreign_attributes(GumboParser * parser,GumboToken * token)1685 static void adjust_foreign_attributes(GumboParser* parser, GumboToken* token) {
1686   assert(token->type == GUMBO_TOKEN_START_TAG);
1687   const GumboVector* attributes = &token->v.start_tag.attributes;
1688   for (size_t i = 0; i < sizeof(kForeignAttributeReplacements) /
1689                              sizeof(NamespacedAttributeReplacement);
1690        ++i) {
1691     const NamespacedAttributeReplacement* entry =
1692         &kForeignAttributeReplacements[i];
1693     GumboAttribute* attr = gumbo_get_attribute(attributes, entry->from);
1694     if (!attr) {
1695       continue;
1696     }
1697     gumbo_parser_deallocate(parser, (void*) attr->name);
1698     attr->attr_namespace = entry->attr_namespace;
1699     attr->name = gumbo_copy_stringz(parser, entry->local_name);
1700   }
1701 }
1702 
1703 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#adjust-svg-attributes
1704 // This destructively modifies any matching attributes on the token.
adjust_svg_attributes(GumboParser * parser,GumboToken * token)1705 static void adjust_svg_attributes(GumboParser* parser, GumboToken* token) {
1706   assert(token->type == GUMBO_TOKEN_START_TAG);
1707   const GumboVector* attributes = &token->v.start_tag.attributes;
1708   for (size_t i = 0;
1709        i < sizeof(kSvgAttributeReplacements) / sizeof(ReplacementEntry); ++i) {
1710     const ReplacementEntry* entry = &kSvgAttributeReplacements[i];
1711     GumboAttribute* attr = gumbo_get_attribute(attributes, entry->from.data);
1712     if (!attr) {
1713       continue;
1714     }
1715     gumbo_parser_deallocate(parser, (void*) attr->name);
1716     attr->name = gumbo_copy_stringz(parser, entry->to.data);
1717   }
1718 }
1719 
1720 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#adjust-mathml-attributes
1721 // Note that this may destructively modify the token with the new attribute
1722 // value.
adjust_mathml_attributes(GumboParser * parser,GumboToken * token)1723 static void adjust_mathml_attributes(GumboParser* parser, GumboToken* token) {
1724   assert(token->type == GUMBO_TOKEN_START_TAG);
1725   GumboAttribute* attr =
1726       gumbo_get_attribute(&token->v.start_tag.attributes, "definitionurl");
1727   if (!attr) {
1728     return;
1729   }
1730   gumbo_parser_deallocate(parser, (void*) attr->name);
1731   attr->name = gumbo_copy_stringz(parser, "definitionURL");
1732 }
1733 
doctype_matches(const GumboTokenDocType * doctype,const GumboStringPiece * public_id,const GumboStringPiece * system_id,bool allow_missing_system_id)1734 static bool doctype_matches(const GumboTokenDocType* doctype,
1735     const GumboStringPiece* public_id, const GumboStringPiece* system_id,
1736     bool allow_missing_system_id) {
1737   return !strcmp(doctype->public_identifier, public_id->data) &&
1738          (allow_missing_system_id || doctype->has_system_identifier) &&
1739          !strcmp(doctype->system_identifier, system_id->data);
1740 }
1741 
// Records a parse error for a doctype token that is neither the plain HTML5
// doctype nor one of the permitted legacy doctypes.  Returns false if a parse
// error was added, true if the doctype is acceptable.
//
// A doctype is suspicious when its name is not "html", when it has a public
// identifier, or when it has a system identifier that is NOT
// "about:legacy-compat".  A suspicious doctype is still accepted if it matches
// one of the HTML 4.0 / HTML 4.01 / XHTML 1.0 / XHTML 1.1 combinations below.
static bool maybe_add_doctype_error(
    GumboParser* parser, const GumboToken* token) {
  const GumboTokenDocType* doctype = &token->v.doc_type;
  const bool html_doctype = !strcmp(doctype->name, kDoctypeHtml.data);

  // "about:legacy-compat" is the one system identifier the plain HTML doctype
  // may carry, so only a *different* system identifier is suspicious.
  const bool suspicious =
      !html_doctype || doctype->has_public_identifier ||
      (doctype->has_system_identifier &&
          strcmp(doctype->system_identifier, kSystemIdLegacyCompat.data) != 0);
  if (!suspicious) {
    return true;
  }

  const bool permitted_legacy =
      html_doctype &&
      (doctype_matches(doctype, &kPublicIdHtml4_0, &kSystemIdRecHtml4_0, true) ||
          doctype_matches(doctype, &kPublicIdHtml4_01, &kSystemIdHtml4, true) ||
          doctype_matches(
              doctype, &kPublicIdXhtml1_0, &kSystemIdXhtmlStrict1_1, false) ||
          doctype_matches(
              doctype, &kPublicIdXhtml1_1, &kSystemIdXhtml1_1, false));
  if (permitted_legacy) {
    return true;
  }

  parser_add_parse_error(parser, token);
  return false;
}
1763 
// Detaches `node` from its parent's child list, clears its parent pointer and
// index, and renumbers index_within_parent for the siblings that followed it.
// Does nothing if the node has no parent.
static void remove_from_parent(GumboParser* parser, GumboNode* node) {
  GumboNode* parent = node->parent;
  if (parent == NULL) {
    // The node may not have a parent if, for example, it is a newly-cloned copy
    // of an active formatting element.  DOM manipulations continue with the
    // orphaned fragment of the DOM tree until it's appended/foster-parented to
    // the common ancestor at the end of the adoption agency algorithm.
    return;
  }
  assert(parent->type == GUMBO_NODE_ELEMENT);

  GumboVector* siblings = &parent->v.element.children;
  const int position = gumbo_vector_index_of(siblings, node);
  assert(position != -1);
  gumbo_vector_remove_at(parser, position, siblings);

  node->parent = NULL;
  node->index_within_parent = -1;

  // Every sibling after the removed slot has moved down by one position.
  unsigned int slot = position;
  while (slot < siblings->length) {
    GumboNode* sibling = siblings->data[slot];
    sibling->index_within_parent = slot;
    ++slot;
  }
}
1785 
1786 // http://www.whatwg.org/specs/web-apps/current-work/multipage/the-end.html#an-introduction-to-error-handling-and-strange-cases-in-the-parser
1787 // Also described in the "in body" handling for end formatting tags.
adoption_agency_algorithm(GumboParser * parser,GumboToken * token,GumboTag subject)1788 static bool adoption_agency_algorithm(
1789     GumboParser* parser, GumboToken* token, GumboTag subject) {
1790   GumboParserState* state = parser->_parser_state;
1791   gumbo_debug("Entering adoption agency algorithm.\n");
1792   // Step 1.
1793   GumboNode* current_node = get_current_node(parser);
1794   if (current_node->v.element.tag_namespace == GUMBO_NAMESPACE_HTML &&
1795       current_node->v.element.tag == subject &&
1796       gumbo_vector_index_of(
1797           &state->_active_formatting_elements, current_node) == -1) {
1798     pop_current_node(parser);
1799     return false;
1800   }
1801   // Steps 2-4 & 20:
1802   for (unsigned int i = 0; i < 8; ++i) {
1803     // Step 5.
1804     GumboNode* formatting_node = NULL;
1805     int formatting_node_in_open_elements = -1;
1806     for (int j = state->_active_formatting_elements.length; --j >= 0;) {
1807       GumboNode* current_node = state->_active_formatting_elements.data[j];
1808       if (current_node == &kActiveFormattingScopeMarker) {
1809         gumbo_debug("Broke on scope marker; aborting.\n");
1810         // Last scope marker; abort the algorithm.
1811         return false;
1812       }
1813       if (node_html_tag_is(current_node, subject)) {
1814         // Found it.
1815         formatting_node = current_node;
1816         formatting_node_in_open_elements =
1817             gumbo_vector_index_of(&state->_open_elements, formatting_node);
1818         gumbo_debug("Formatting element of tag %s at %d.\n",
1819             gumbo_normalized_tagname(subject),
1820             formatting_node_in_open_elements);
1821         break;
1822       }
1823     }
1824     if (!formatting_node) {
1825       // No matching tag; not a parse error outright, but fall through to the
1826       // "any other end tag" clause (which may potentially add a parse error,
1827       // but not always).
1828       gumbo_debug("No active formatting elements; aborting.\n");
1829       return false;
1830     }
1831 
1832     // Step 6
1833     if (formatting_node_in_open_elements == -1) {
1834       gumbo_debug("Formatting node not on stack of open elements.\n");
1835       parser_add_parse_error(parser, token);
1836       gumbo_vector_remove(
1837           parser, formatting_node, &state->_active_formatting_elements);
1838       return false;
1839     }
1840 
1841     // Step 7
1842     if (!has_an_element_in_scope(parser, formatting_node->v.element.tag)) {
1843       parser_add_parse_error(parser, token);
1844       gumbo_debug("Element not in scope.\n");
1845       return false;
1846     }
1847 
1848     // Step 8
1849     if (formatting_node != get_current_node(parser)) {
1850       parser_add_parse_error(parser, token);  // But continue onwards.
1851     }
1852     assert(formatting_node);
1853     assert(!node_html_tag_is(formatting_node, GUMBO_TAG_HTML));
1854     assert(!node_html_tag_is(formatting_node, GUMBO_TAG_BODY));
1855 
1856     // Step 9 & 10
1857     GumboNode* furthest_block = NULL;
1858     for (unsigned int j = formatting_node_in_open_elements;
1859          j < state->_open_elements.length; ++j) {
1860       assert(j > 0);
1861       GumboNode* current = state->_open_elements.data[j];
1862       if (is_special_node(current)) {
1863         // Step 9.
1864         furthest_block = current;
1865         break;
1866       }
1867     }
1868     if (!furthest_block) {
1869       // Step 10.
1870       while (get_current_node(parser) != formatting_node) {
1871         pop_current_node(parser);
1872       }
1873       // And the formatting element itself.
1874       pop_current_node(parser);
1875       gumbo_vector_remove(
1876           parser, formatting_node, &state->_active_formatting_elements);
1877       return false;
1878     }
1879     assert(!node_html_tag_is(furthest_block, GUMBO_TAG_HTML));
1880     assert(furthest_block);
1881 
1882     // Step 11.
1883     // Elements may be moved and reparented by this algorithm, so
1884     // common_ancestor is not necessarily the same as formatting_node->parent.
1885     GumboNode* common_ancestor =
1886         state->_open_elements.data[gumbo_vector_index_of(&state->_open_elements,
1887                                        formatting_node) -
1888                                    1];
1889     gumbo_debug("Common ancestor tag = %s, furthest block tag = %s.\n",
1890         gumbo_normalized_tagname(common_ancestor->v.element.tag),
1891         gumbo_normalized_tagname(furthest_block->v.element.tag));
1892 
1893     // Step 12.
1894     int bookmark = gumbo_vector_index_of(
1895                        &state->_active_formatting_elements, formatting_node) +
1896                    1;
1897     gumbo_debug("Bookmark at %d.\n", bookmark);
1898     // Step 13.
1899     GumboNode* node = furthest_block;
1900     GumboNode* last_node = furthest_block;
1901     // Must be stored explicitly, in case node is removed from the stack of open
1902     // elements, to handle step 9.4.
1903     int saved_node_index = gumbo_vector_index_of(&state->_open_elements, node);
1904     assert(saved_node_index > 0);
1905     // Step 13.1.
1906     for (int j = 0;;) {
1907       // Step 13.2.
1908       ++j;
1909       // Step 13.3.
1910       int node_index = gumbo_vector_index_of(&state->_open_elements, node);
1911       gumbo_debug(
1912           "Current index: %d, last index: %d.\n", node_index, saved_node_index);
1913       if (node_index == -1) {
1914         node_index = saved_node_index;
1915       }
1916       saved_node_index = --node_index;
1917       assert(node_index > 0);
1918       assert((unsigned int) node_index < state->_open_elements.capacity);
1919       node = state->_open_elements.data[node_index];
1920       assert(node->parent);
1921       if (node == formatting_node) {
1922         // Step 13.4.
1923         break;
1924       }
1925       int formatting_index =
1926           gumbo_vector_index_of(&state->_active_formatting_elements, node);
1927       if (j > 3 && formatting_index != -1) {
1928         // Step 13.5.
1929         gumbo_debug("Removing formatting element at %d.\n", formatting_index);
1930         gumbo_vector_remove_at(
1931             parser, formatting_index, &state->_active_formatting_elements);
1932         // Removing the element shifts all indices over by one, so we may need
1933         // to move the bookmark.
1934         if (formatting_index < bookmark) {
1935           --bookmark;
1936           gumbo_debug("Moving bookmark to %d.\n", bookmark);
1937         }
1938         continue;
1939       }
1940       if (formatting_index == -1) {
1941         // Step 13.6.
1942         gumbo_vector_remove_at(parser, node_index, &state->_open_elements);
1943         continue;
1944       }
1945       // Step 13.7.
1946       // "common ancestor as the intended parent" doesn't actually mean insert
1947       // it into the common ancestor; that happens below.
1948       node = clone_node(parser, node, GUMBO_INSERTION_ADOPTION_AGENCY_CLONED);
1949       assert(formatting_index >= 0);
1950       state->_active_formatting_elements.data[formatting_index] = node;
1951       assert(node_index >= 0);
1952       state->_open_elements.data[node_index] = node;
1953       // Step 13.8.
1954       if (last_node == furthest_block) {
1955         bookmark = formatting_index + 1;
1956         gumbo_debug("Bookmark moved to %d.\n", bookmark);
1957         assert((unsigned int) bookmark <= state->_active_formatting_elements.length);
1958       }
1959       // Step 13.9.
1960       last_node->parse_flags |= GUMBO_INSERTION_ADOPTION_AGENCY_MOVED;
1961       remove_from_parent(parser, last_node);
1962       append_node(parser, node, last_node);
1963       // Step 13.10.
1964       last_node = node;
1965     }  // Step 13.11.
1966 
1967     // Step 14.
1968     gumbo_debug("Removing %s node from parent ",
1969         gumbo_normalized_tagname(last_node->v.element.tag));
1970     remove_from_parent(parser, last_node);
1971     last_node->parse_flags |= GUMBO_INSERTION_ADOPTION_AGENCY_MOVED;
1972     InsertionLocation location =
1973         get_appropriate_insertion_location(parser, common_ancestor);
1974     gumbo_debug("and inserting it into %s.\n",
1975         gumbo_normalized_tagname(location.target->v.element.tag));
1976     insert_node(parser, last_node, location);
1977 
1978     // Step 15.
1979     GumboNode* new_formatting_node = clone_node(
1980         parser, formatting_node, GUMBO_INSERTION_ADOPTION_AGENCY_CLONED);
1981     formatting_node->parse_flags |= GUMBO_INSERTION_IMPLICIT_END_TAG;
1982 
1983     // Step 16.  Instead of appending nodes one-by-one, we swap the children
1984     // vector of furthest_block with the empty children of new_formatting_node,
1985     // reducing memory traffic and allocations.  We still have to reset their
1986     // parent pointers, though.
1987     GumboVector temp = new_formatting_node->v.element.children;
1988     new_formatting_node->v.element.children =
1989         furthest_block->v.element.children;
1990     furthest_block->v.element.children = temp;
1991 
1992     temp = new_formatting_node->v.element.children;
1993     for (unsigned int i = 0; i < temp.length; ++i) {
1994       GumboNode* child = temp.data[i];
1995       child->parent = new_formatting_node;
1996     }
1997 
1998     // Step 17.
1999     append_node(parser, furthest_block, new_formatting_node);
2000 
2001     // Step 18.
2002     // If the formatting node was before the bookmark, it may shift over all
2003     // indices after it, so we need to explicitly find the index and possibly
2004     // adjust the bookmark.
2005     int formatting_node_index = gumbo_vector_index_of(
2006         &state->_active_formatting_elements, formatting_node);
2007     assert(formatting_node_index != -1);
2008     if (formatting_node_index < bookmark) {
2009       gumbo_debug(
2010           "Formatting node at %d is before bookmark at %d; decrementing.\n",
2011           formatting_node_index, bookmark);
2012       --bookmark;
2013     }
2014     gumbo_vector_remove_at(
2015         parser, formatting_node_index, &state->_active_formatting_elements);
2016     assert(bookmark >= 0);
2017     assert((unsigned int) bookmark <= state->_active_formatting_elements.length);
2018     gumbo_vector_insert_at(parser, new_formatting_node, bookmark,
2019         &state->_active_formatting_elements);
2020 
2021     // Step 19.
2022     gumbo_vector_remove(parser, formatting_node, &state->_open_elements);
2023     int insert_at =
2024         gumbo_vector_index_of(&state->_open_elements, furthest_block) + 1;
2025     assert(insert_at >= 0);
2026     assert((unsigned int) insert_at <= state->_open_elements.length);
2027     gumbo_vector_insert_at(
2028         parser, new_formatting_node, insert_at, &state->_open_elements);
2029   }  // Step 20.
2030   return true;
2031 }
2032 
2033 // This is here to clean up memory when the spec says "Ignore current token."
ignore_token(GumboParser * parser)2034 static void ignore_token(GumboParser* parser) {
2035   GumboToken* token = parser->_parser_state->_current_token;
2036   // Ownership of the token's internal buffers are normally transferred to the
2037   // element, but if no element is emitted (as happens in non-verbatim-mode
2038   // when a token is ignored), we need to free it here to prevent a memory
2039   // leak.
2040   gumbo_token_destroy(parser, token);
2041 #ifndef NDEBUG
2042   if (token->type == GUMBO_TOKEN_START_TAG) {
2043     // Mark this sentinel so the assertion in the main loop knows it's been
2044     // destroyed.
2045     token->v.start_tag.attributes = kGumboEmptyVector;
2046   }
2047 #endif
2048 }
2049 
2050 // http://www.whatwg.org/specs/web-apps/current-work/complete/the-end.html
finish_parsing(GumboParser * parser)2051 static void finish_parsing(GumboParser* parser) {
2052   gumbo_debug("Finishing parsing");
2053   maybe_flush_text_node_buffer(parser);
2054   GumboParserState* state = parser->_parser_state;
2055   for (GumboNode* node = pop_current_node(parser); node;
2056        node = pop_current_node(parser)) {
2057     if ((node_html_tag_is(node, GUMBO_TAG_BODY) && state->_closed_body_tag) ||
2058         (node_html_tag_is(node, GUMBO_TAG_HTML) && state->_closed_html_tag)) {
2059       continue;
2060     }
2061     node->parse_flags |= GUMBO_INSERTION_IMPLICIT_END_TAG;
2062   }
2063   while (pop_current_node(parser))
2064     ;  // Pop them all.
2065 }
2066 
// Handles a token while the parser is in the "initial" insertion mode.
//
//  - Whitespace is ignored.
//  - A comment is appended to the document node.
//  - A DOCTYPE is recorded on the document (name, identifiers, computed
//    quirks mode) and the parser moves to "before html"; the result is
//    whatever maybe_add_doctype_error reports.
//  - Any other token is a parse error: the document is put into quirks
//    mode, the parser moves to "before html", and the token is reprocessed.
static bool handle_initial(GumboParser* parser, GumboToken* token) {
  GumboDocument* document = &get_document_node(parser)->v.document;

  switch (token->type) {
    case GUMBO_TOKEN_WHITESPACE:
      ignore_token(parser);
      return true;

    case GUMBO_TOKEN_COMMENT:
      append_comment_node(parser, get_document_node(parser), token);
      return true;

    case GUMBO_TOKEN_DOCTYPE:
      // The document now refers to the strings carried by the token.
      document->has_doctype = true;
      document->name = token->v.doc_type.name;
      document->public_identifier = token->v.doc_type.public_identifier;
      document->system_identifier = token->v.doc_type.system_identifier;
      document->doc_type_quirks_mode = compute_quirks_mode(&token->v.doc_type);
      set_insertion_mode(parser, GUMBO_INSERTION_MODE_BEFORE_HTML);
      return maybe_add_doctype_error(parser, token);

    default:
      break;
  }

  // No DOCTYPE came first.
  parser_add_parse_error(parser, token);
  document->doc_type_quirks_mode = GUMBO_DOCTYPE_QUIRKS;
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_BEFORE_HTML);
  parser->_parser_state->_reprocess_current_token = true;
  return true;
}
2090 
2091 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#the-before-html-insertion-mode
handle_before_html(GumboParser * parser,GumboToken * token)2092 static bool handle_before_html(GumboParser* parser, GumboToken* token) {
2093   if (token->type == GUMBO_TOKEN_DOCTYPE) {
2094     parser_add_parse_error(parser, token);
2095     ignore_token(parser);
2096     return false;
2097   } else if (token->type == GUMBO_TOKEN_COMMENT) {
2098     append_comment_node(parser, get_document_node(parser), token);
2099     return true;
2100   } else if (token->type == GUMBO_TOKEN_WHITESPACE) {
2101     ignore_token(parser);
2102     return true;
2103   } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
2104     GumboNode* html_node = insert_element_from_token(parser, token);
2105     parser->_output->root = html_node;
2106     set_insertion_mode(parser, GUMBO_INSERTION_MODE_BEFORE_HEAD);
2107     return true;
2108   } else if (token->type == GUMBO_TOKEN_END_TAG &&
2109              !tag_in(token, false,
2110                  (gumbo_tagset){TAG(HEAD), TAG(BODY), TAG(HTML), TAG(BR)})) {
2111     parser_add_parse_error(parser, token);
2112     ignore_token(parser);
2113     return false;
2114   } else {
2115     GumboNode* html_node = insert_element_of_tag_type(
2116         parser, GUMBO_TAG_HTML, GUMBO_INSERTION_IMPLIED);
2117     assert(html_node);
2118     parser->_output->root = html_node;
2119     set_insertion_mode(parser, GUMBO_INSERTION_MODE_BEFORE_HEAD);
2120     parser->_parser_state->_reprocess_current_token = true;
2121     return true;
2122   }
2123 }
2124 
2125 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#the-before-head-insertion-mode
handle_before_head(GumboParser * parser,GumboToken * token)2126 static bool handle_before_head(GumboParser* parser, GumboToken* token) {
2127   if (token->type == GUMBO_TOKEN_DOCTYPE) {
2128     parser_add_parse_error(parser, token);
2129     ignore_token(parser);
2130     return false;
2131   } else if (token->type == GUMBO_TOKEN_COMMENT) {
2132     append_comment_node(parser, get_current_node(parser), token);
2133     return true;
2134   } else if (token->type == GUMBO_TOKEN_WHITESPACE) {
2135     ignore_token(parser);
2136     return true;
2137   } else if (tag_is(token, kStartTag, GUMBO_TAG_HEAD)) {
2138     GumboNode* node = insert_element_from_token(parser, token);
2139     set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD);
2140     parser->_parser_state->_head_element = node;
2141     return true;
2142   } else if (token->type == GUMBO_TOKEN_END_TAG &&
2143              !tag_in(token, false,
2144                  (gumbo_tagset){TAG(HEAD), TAG(BODY), TAG(HTML), TAG(BR)})) {
2145     parser_add_parse_error(parser, token);
2146     ignore_token(parser);
2147     return false;
2148   } else {
2149     GumboNode* node = insert_element_of_tag_type(
2150         parser, GUMBO_TAG_HEAD, GUMBO_INSERTION_IMPLIED);
2151     set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD);
2152     parser->_parser_state->_head_element = node;
2153     parser->_parser_state->_reprocess_current_token = true;
2154     return true;
2155   }
2156 }
2157 
// Forward declarations because of mutual dependencies: the insertion-mode
// handlers that follow (e.g. handle_in_head, handle_after_head) call
// handle_in_body, which is only defined further down in this file.
// NOTE(review): handle_token is declared for the same stated reason; its
// callers are not visible in this part of the file.
static bool handle_token(GumboParser* parser, GumboToken* token);
static bool handle_in_body(GumboParser* parser, GumboToken* token);
2161 
2162 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inhead
handle_in_head(GumboParser * parser,GumboToken * token)2163 static bool handle_in_head(GumboParser* parser, GumboToken* token) {
2164   if (token->type == GUMBO_TOKEN_WHITESPACE) {
2165     insert_text_token(parser, token);
2166     return true;
2167   } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
2168     parser_add_parse_error(parser, token);
2169     ignore_token(parser);
2170     return false;
2171   } else if (token->type == GUMBO_TOKEN_COMMENT) {
2172     append_comment_node(parser, get_current_node(parser), token);
2173     return true;
2174   } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
2175     return handle_in_body(parser, token);
2176   } else if (tag_in(token, kStartTag,
2177                  (gumbo_tagset){TAG(BASE), TAG(BASEFONT), TAG(BGSOUND),
2178                      TAG(MENUITEM), TAG(LINK)})) {
2179     insert_element_from_token(parser, token);
2180     pop_current_node(parser);
2181     acknowledge_self_closing_tag(parser);
2182     return true;
2183   } else if (tag_is(token, kStartTag, GUMBO_TAG_META)) {
2184     insert_element_from_token(parser, token);
2185     pop_current_node(parser);
2186     acknowledge_self_closing_tag(parser);
2187     // NOTE(jdtang): Gumbo handles only UTF-8, so the encoding clause of the
2188     // spec doesn't apply.  If clients want to handle meta-tag re-encoding, they
2189     // should specifically look for that string in the document and re-encode it
2190     // before passing to Gumbo.
2191     return true;
2192   } else if (tag_is(token, kStartTag, GUMBO_TAG_TITLE)) {
2193     run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RCDATA);
2194     return true;
2195   } else if (tag_in(
2196                  token, kStartTag, (gumbo_tagset){TAG(NOFRAMES), TAG(STYLE)})) {
2197     run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
2198     return true;
2199   } else if (tag_is(token, kStartTag, GUMBO_TAG_NOSCRIPT)) {
2200     insert_element_from_token(parser, token);
2201     set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD_NOSCRIPT);
2202     return true;
2203   } else if (tag_is(token, kStartTag, GUMBO_TAG_SCRIPT)) {
2204     run_generic_parsing_algorithm(parser, token, GUMBO_LEX_SCRIPT);
2205     return true;
2206   } else if (tag_is(token, kEndTag, GUMBO_TAG_HEAD)) {
2207     GumboNode* head = pop_current_node(parser);
2208     AVOID_UNUSED_VARIABLE_WARNING(head);
2209     assert(node_html_tag_is(head, GUMBO_TAG_HEAD));
2210     set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_HEAD);
2211     return true;
2212   } else if (tag_in(token, kEndTag,
2213                  (gumbo_tagset){TAG(BODY), TAG(HTML), TAG(BR)})) {
2214     pop_current_node(parser);
2215     set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_HEAD);
2216     parser->_parser_state->_reprocess_current_token = true;
2217     return true;
2218   } else if (tag_is(token, kStartTag, GUMBO_TAG_TEMPLATE)) {
2219     insert_element_from_token(parser, token);
2220     add_formatting_element(parser, &kActiveFormattingScopeMarker);
2221     parser->_parser_state->_frameset_ok = false;
2222     set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TEMPLATE);
2223     push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TEMPLATE);
2224     return true;
2225   } else if (tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) {
2226     if (!has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
2227       parser_add_parse_error(parser, token);
2228       ignore_token(parser);
2229       return false;
2230     }
2231     generate_all_implied_end_tags_thoroughly(parser);
2232     bool success = true;
2233     if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_TEMPLATE)) {
2234       parser_add_parse_error(parser, token);
2235       success = false;
2236     }
2237     while (!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_TEMPLATE))
2238       ;
2239     clear_active_formatting_elements(parser);
2240     pop_template_insertion_mode(parser);
2241     reset_insertion_mode_appropriately(parser);
2242     return success;
2243   } else if (tag_is(token, kStartTag, GUMBO_TAG_HEAD) ||
2244              (token->type == GUMBO_TOKEN_END_TAG)) {
2245     parser_add_parse_error(parser, token);
2246     ignore_token(parser);
2247     return false;
2248   } else {
2249     pop_current_node(parser);
2250     set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_HEAD);
2251     parser->_parser_state->_reprocess_current_token = true;
2252     return true;
2253   }
2254   return true;
2255 }
2256 
2257 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inheadnoscript
handle_in_head_noscript(GumboParser * parser,GumboToken * token)2258 static bool handle_in_head_noscript(GumboParser* parser, GumboToken* token) {
2259   if (token->type == GUMBO_TOKEN_DOCTYPE) {
2260     parser_add_parse_error(parser, token);
2261     return false;
2262   } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
2263     return handle_in_body(parser, token);
2264   } else if (tag_is(token, kEndTag, GUMBO_TAG_NOSCRIPT)) {
2265     const GumboNode* node = pop_current_node(parser);
2266     assert(node_html_tag_is(node, GUMBO_TAG_NOSCRIPT));
2267     AVOID_UNUSED_VARIABLE_WARNING(node);
2268     set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD);
2269     return true;
2270   } else if (token->type == GUMBO_TOKEN_WHITESPACE ||
2271              token->type == GUMBO_TOKEN_COMMENT ||
2272              tag_in(token, kStartTag,
2273                  (gumbo_tagset){TAG(BASEFONT), TAG(BGSOUND), TAG(LINK),
2274                      TAG(META), TAG(NOFRAMES), TAG(STYLE)})) {
2275     return handle_in_head(parser, token);
2276   } else if (tag_in(
2277                  token, kStartTag, (gumbo_tagset){TAG(HEAD), TAG(NOSCRIPT)}) ||
2278              (token->type == GUMBO_TOKEN_END_TAG &&
2279                  !tag_is(token, kEndTag, GUMBO_TAG_BR))) {
2280     parser_add_parse_error(parser, token);
2281     ignore_token(parser);
2282     return false;
2283   } else {
2284     parser_add_parse_error(parser, token);
2285     const GumboNode* node = pop_current_node(parser);
2286     assert(node_html_tag_is(node, GUMBO_TAG_NOSCRIPT));
2287     AVOID_UNUSED_VARIABLE_WARNING(node);
2288     set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD);
2289     parser->_parser_state->_reprocess_current_token = true;
2290     return false;
2291   }
2292 }
2293 
2294 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#the-after-head-insertion-mode
handle_after_head(GumboParser * parser,GumboToken * token)2295 static bool handle_after_head(GumboParser* parser, GumboToken* token) {
2296   GumboParserState* state = parser->_parser_state;
2297   if (token->type == GUMBO_TOKEN_WHITESPACE) {
2298     insert_text_token(parser, token);
2299     return true;
2300   } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
2301     parser_add_parse_error(parser, token);
2302     ignore_token(parser);
2303     return false;
2304   } else if (token->type == GUMBO_TOKEN_COMMENT) {
2305     append_comment_node(parser, get_current_node(parser), token);
2306     return true;
2307   } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
2308     return handle_in_body(parser, token);
2309   } else if (tag_is(token, kStartTag, GUMBO_TAG_BODY)) {
2310     insert_element_from_token(parser, token);
2311     state->_frameset_ok = false;
2312     set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
2313     return true;
2314   } else if (tag_is(token, kStartTag, GUMBO_TAG_FRAMESET)) {
2315     insert_element_from_token(parser, token);
2316     set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_FRAMESET);
2317     return true;
2318   } else if (tag_in(token, kStartTag,
2319                  (gumbo_tagset){TAG(BASE), TAG(BASEFONT), TAG(BGSOUND),
2320                      TAG(LINK), TAG(META), TAG(NOFRAMES), TAG(SCRIPT),
2321                      TAG(STYLE), TAG(TEMPLATE), TAG(TITLE)})) {
2322     parser_add_parse_error(parser, token);
2323     assert(state->_head_element != NULL);
2324     // This must be flushed before we push the head element on, as there may be
2325     // pending character tokens that should be attached to the root.
2326     maybe_flush_text_node_buffer(parser);
2327     gumbo_vector_add(parser, state->_head_element, &state->_open_elements);
2328     bool result = handle_in_head(parser, token);
2329     gumbo_vector_remove(parser, state->_head_element, &state->_open_elements);
2330     return result;
2331   } else if (tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) {
2332     return handle_in_head(parser, token);
2333   } else if (tag_is(token, kStartTag, GUMBO_TAG_HEAD) ||
2334              (token->type == GUMBO_TOKEN_END_TAG &&
2335                  !tag_in(token, kEndTag,
2336                      (gumbo_tagset){TAG(BODY), TAG(HTML), TAG(BR)}))) {
2337     parser_add_parse_error(parser, token);
2338     ignore_token(parser);
2339     return false;
2340   } else {
2341     insert_element_of_tag_type(parser, GUMBO_TAG_BODY, GUMBO_INSERTION_IMPLIED);
2342     set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
2343     state->_reprocess_current_token = true;
2344     return true;
2345   }
2346 }
2347 
// Recursively frees `node` and everything reachable from it, using
// gumbo_parser_deallocate for every buffer:
//  - document: each child, the children array, and the doctype name and
//    public/system identifier strings;
//  - element / template: each attribute, the attribute array, each child,
//    and the children array;
//  - text, CDATA, comment, whitespace: the text buffer.
// The node structure itself is released last.
static void destroy_node(GumboParser* parser, GumboNode* node) {
  switch (node->type) {
    case GUMBO_NODE_DOCUMENT: {
      GumboDocument* document = &node->v.document;
      GumboVector* kids = &document->children;
      for (unsigned int k = 0; k < kids->length; ++k) {
        destroy_node(parser, kids->data[k]);
      }
      gumbo_parser_deallocate(parser, (void*) kids->data);
      gumbo_parser_deallocate(parser, (void*) document->name);
      gumbo_parser_deallocate(parser, (void*) document->public_identifier);
      gumbo_parser_deallocate(parser, (void*) document->system_identifier);
      break;
    }

    case GUMBO_NODE_TEMPLATE:
    case GUMBO_NODE_ELEMENT: {
      GumboVector* attrs = &node->v.element.attributes;
      for (unsigned int a = 0; a < attrs->length; ++a) {
        gumbo_destroy_attribute(parser, attrs->data[a]);
      }
      gumbo_parser_deallocate(parser, attrs->data);

      GumboVector* kids = &node->v.element.children;
      for (unsigned int k = 0; k < kids->length; ++k) {
        destroy_node(parser, kids->data[k]);
      }
      gumbo_parser_deallocate(parser, kids->data);
      break;
    }

    case GUMBO_NODE_TEXT:
    case GUMBO_NODE_CDATA:
    case GUMBO_NODE_COMMENT:
    case GUMBO_NODE_WHITESPACE:
      gumbo_parser_deallocate(parser, (void*) node->v.text.text);
      break;
  }
  gumbo_parser_deallocate(parser, node);
}
2380 
2381 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inbody
handle_in_body(GumboParser * parser,GumboToken * token)2382 static bool handle_in_body(GumboParser* parser, GumboToken* token) {
2383   GumboParserState* state = parser->_parser_state;
2384   assert(state->_open_elements.length > 0);
2385   if (token->type == GUMBO_TOKEN_NULL) {
2386     parser_add_parse_error(parser, token);
2387     ignore_token(parser);
2388     return false;
2389   } else if (token->type == GUMBO_TOKEN_WHITESPACE) {
2390     reconstruct_active_formatting_elements(parser);
2391     insert_text_token(parser, token);
2392     return true;
2393   } else if (token->type == GUMBO_TOKEN_CHARACTER ||
2394              token->type == GUMBO_TOKEN_CDATA) {
2395     reconstruct_active_formatting_elements(parser);
2396     insert_text_token(parser, token);
2397     set_frameset_not_ok(parser);
2398     return true;
2399   } else if (token->type == GUMBO_TOKEN_COMMENT) {
2400     append_comment_node(parser, get_current_node(parser), token);
2401     return true;
2402   } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
2403     parser_add_parse_error(parser, token);
2404     ignore_token(parser);
2405     return false;
2406   } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
2407     parser_add_parse_error(parser, token);
2408     if (has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
2409       ignore_token(parser);
2410       return false;
2411     }
2412     assert(parser->_output->root != NULL);
2413     assert(parser->_output->root->type == GUMBO_NODE_ELEMENT);
2414     merge_attributes(parser, token, parser->_output->root);
2415     return false;
2416   } else if (tag_in(token, kStartTag,
2417                  (gumbo_tagset){TAG(BASE), TAG(BASEFONT), TAG(BGSOUND),
2418                      TAG(MENUITEM), TAG(LINK), TAG(META), TAG(NOFRAMES),
2419                      TAG(SCRIPT), TAG(STYLE), TAG(TEMPLATE), TAG(TITLE)}) ||
2420              tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) {
2421     return handle_in_head(parser, token);
2422   } else if (tag_is(token, kStartTag, GUMBO_TAG_BODY)) {
2423     parser_add_parse_error(parser, token);
2424     if (state->_open_elements.length < 2 ||
2425         !node_html_tag_is(state->_open_elements.data[1], GUMBO_TAG_BODY) ||
2426         has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
2427       ignore_token(parser);
2428       return false;
2429     }
2430     state->_frameset_ok = false;
2431     merge_attributes(parser, token, state->_open_elements.data[1]);
2432     return false;
2433   } else if (tag_is(token, kStartTag, GUMBO_TAG_FRAMESET)) {
2434     parser_add_parse_error(parser, token);
2435     if (state->_open_elements.length < 2 ||
2436         !node_html_tag_is(state->_open_elements.data[1], GUMBO_TAG_BODY) ||
2437         !state->_frameset_ok) {
2438       ignore_token(parser);
2439       return false;
2440     }
2441     // Save the body node for later removal.
2442     GumboNode* body_node = state->_open_elements.data[1];
2443 
2444     // Pop all nodes except root HTML element.
2445     GumboNode* node;
2446     do {
2447       node = pop_current_node(parser);
2448     } while (node != state->_open_elements.data[1]);
2449 
2450     // Removing & destroying the body node is going to kill any nodes that have
2451     // been added to the list of active formatting elements, and so we should
2452     // clear it to prevent a use-after-free if the list of active formatting
2453     // elements is reconstructed afterwards.  This may happen if whitespace
2454     // follows the </frameset>.
2455     clear_active_formatting_elements(parser);
2456 
2457     // Remove the body node.  We may want to factor this out into a generic
2458     // helper, but right now this is the only code that needs to do this.
2459     GumboVector* children = &parser->_output->root->v.element.children;
2460     for (unsigned int i = 0; i < children->length; ++i) {
2461       if (children->data[i] == body_node) {
2462         gumbo_vector_remove_at(parser, i, children);
2463         break;
2464       }
2465     }
2466     destroy_node(parser, body_node);
2467 
2468     // Insert the <frameset>, and switch the insertion mode.
2469     insert_element_from_token(parser, token);
2470     set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_FRAMESET);
2471     return true;
2472   } else if (token->type == GUMBO_TOKEN_EOF) {
2473     for (unsigned int i = 0; i < state->_open_elements.length; ++i) {
2474       if (!node_tag_in_set(state->_open_elements.data[i],
2475               (gumbo_tagset){TAG(DD), TAG(DT), TAG(LI), TAG(P), TAG(TBODY),
2476                   TAG(TD), TAG(TFOOT), TAG(TH), TAG(THEAD), TAG(TR), TAG(BODY),
2477                   TAG(HTML)})) {
2478         parser_add_parse_error(parser, token);
2479       }
2480     }
2481     if (get_current_template_insertion_mode(parser) !=
2482         GUMBO_INSERTION_MODE_INITIAL) {
2483       return handle_in_template(parser, token);
2484     }
2485     return true;
2486   } else if (tag_in(token, kEndTag, (gumbo_tagset){TAG(BODY), TAG(HTML)})) {
2487     if (!has_an_element_in_scope(parser, GUMBO_TAG_BODY)) {
2488       parser_add_parse_error(parser, token);
2489       ignore_token(parser);
2490       return false;
2491     }
2492     bool success = true;
2493     for (unsigned int i = 0; i < state->_open_elements.length; ++i) {
2494       if (!node_tag_in_set(state->_open_elements.data[i],
2495               (gumbo_tagset){TAG(DD), TAG(DT), TAG(LI), TAG(OPTGROUP),
2496                   TAG(OPTION), TAG(P), TAG(RB), TAG(RP), TAG(RT), TAG(RTC),
2497                   TAG(TBODY), TAG(TD), TAG(TFOOT), TAG(TH), TAG(THEAD), TAG(TR),
2498                   TAG(BODY), TAG(HTML)})) {
2499         parser_add_parse_error(parser, token);
2500         success = false;
2501         break;
2502       }
2503     }
2504     set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_BODY);
2505     if (tag_is(token, kEndTag, GUMBO_TAG_HTML)) {
2506       parser->_parser_state->_reprocess_current_token = true;
2507     } else {
2508       GumboNode* body = state->_open_elements.data[1];
2509       assert(node_html_tag_is(body, GUMBO_TAG_BODY));
2510       record_end_of_element(state->_current_token, &body->v.element);
2511     }
2512     return success;
2513   } else if (tag_in(token, kStartTag,
2514                  (gumbo_tagset){TAG(ADDRESS), TAG(ARTICLE), TAG(ASIDE),
2515                      TAG(BLOCKQUOTE), TAG(CENTER), TAG(DETAILS), TAG(DIR),
2516                      TAG(DIV), TAG(DL), TAG(FIELDSET), TAG(FIGCAPTION),
2517                      TAG(FIGURE), TAG(FOOTER), TAG(HEADER), TAG(HGROUP),
2518                      TAG(MENU), TAG(MAIN), TAG(NAV), TAG(OL), TAG(P),
2519                      TAG(SECTION), TAG(SUMMARY), TAG(UL)})) {
2520     bool result = maybe_implicitly_close_p_tag(parser, token);
2521     insert_element_from_token(parser, token);
2522     return result;
2523   } else if (tag_in(token, kStartTag, (gumbo_tagset){TAG(H1), TAG(H2), TAG(H3),
2524                                           TAG(H4), TAG(H5), TAG(H6)})) {
2525     bool result = maybe_implicitly_close_p_tag(parser, token);
2526     if (node_tag_in_set(
2527             get_current_node(parser), (gumbo_tagset){TAG(H1), TAG(H2), TAG(H3),
2528                                           TAG(H4), TAG(H5), TAG(H6)})) {
2529       parser_add_parse_error(parser, token);
2530       pop_current_node(parser);
2531       result = false;
2532     }
2533     insert_element_from_token(parser, token);
2534     return result;
2535   } else if (tag_in(token, kStartTag, (gumbo_tagset){TAG(PRE), TAG(LISTING)})) {
2536     bool result = maybe_implicitly_close_p_tag(parser, token);
2537     insert_element_from_token(parser, token);
2538     state->_ignore_next_linefeed = true;
2539     state->_frameset_ok = false;
2540     return result;
2541   } else if (tag_is(token, kStartTag, GUMBO_TAG_FORM)) {
2542     if (state->_form_element != NULL &&
2543         !has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
2544       gumbo_debug("Ignoring nested form.\n");
2545       parser_add_parse_error(parser, token);
2546       ignore_token(parser);
2547       return false;
2548     }
2549     bool result = maybe_implicitly_close_p_tag(parser, token);
2550     GumboNode* form_element = insert_element_from_token(parser, token);
2551     if (!has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
2552       state->_form_element = form_element;
2553     }
2554     return result;
2555   } else if (tag_is(token, kStartTag, GUMBO_TAG_LI)) {
2556     maybe_implicitly_close_list_tag(parser, token, true);
2557     bool result = maybe_implicitly_close_p_tag(parser, token);
2558     insert_element_from_token(parser, token);
2559     return result;
2560   } else if (tag_in(token, kStartTag, (gumbo_tagset){TAG(DD), TAG(DT)})) {
2561     maybe_implicitly_close_list_tag(parser, token, false);
2562     bool result = maybe_implicitly_close_p_tag(parser, token);
2563     insert_element_from_token(parser, token);
2564     return result;
2565   } else if (tag_is(token, kStartTag, GUMBO_TAG_PLAINTEXT)) {
2566     bool result = maybe_implicitly_close_p_tag(parser, token);
2567     insert_element_from_token(parser, token);
2568     gumbo_tokenizer_set_state(parser, GUMBO_LEX_PLAINTEXT);
2569     return result;
2570   } else if (tag_is(token, kStartTag, GUMBO_TAG_BUTTON)) {
2571     if (has_an_element_in_scope(parser, GUMBO_TAG_BUTTON)) {
2572       parser_add_parse_error(parser, token);
2573       implicitly_close_tags(
2574           parser, token, GUMBO_NAMESPACE_HTML, GUMBO_TAG_BUTTON);
2575       state->_reprocess_current_token = true;
2576       return false;
2577     }
2578     reconstruct_active_formatting_elements(parser);
2579     insert_element_from_token(parser, token);
2580     state->_frameset_ok = false;
2581     return true;
2582   } else if (tag_in(token, kEndTag,
2583                  (gumbo_tagset){TAG(ADDRESS), TAG(ARTICLE), TAG(ASIDE),
2584                      TAG(BLOCKQUOTE), TAG(BUTTON), TAG(CENTER), TAG(DETAILS),
2585                      TAG(DIR), TAG(DIV), TAG(DL), TAG(FIELDSET),
2586                      TAG(FIGCAPTION), TAG(FIGURE), TAG(FOOTER), TAG(HEADER),
2587                      TAG(HGROUP), TAG(LISTING), TAG(MAIN), TAG(MENU), TAG(NAV),
2588                      TAG(OL), TAG(PRE), TAG(SECTION), TAG(SUMMARY), TAG(UL)})) {
2589     GumboTag tag = token->v.end_tag;
2590     if (!has_an_element_in_scope(parser, tag)) {
2591       parser_add_parse_error(parser, token);
2592       ignore_token(parser);
2593       return false;
2594     }
2595     implicitly_close_tags(
2596         parser, token, GUMBO_NAMESPACE_HTML, token->v.end_tag);
2597     return true;
2598   } else if (tag_is(token, kEndTag, GUMBO_TAG_FORM)) {
2599     if (has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
2600       if (!has_an_element_in_scope(parser, GUMBO_TAG_FORM)) {
2601         parser_add_parse_error(parser, token);
2602         ignore_token(parser);
2603         return false;
2604       }
2605       bool success = true;
2606       generate_implied_end_tags(parser, GUMBO_TAG_LAST);
2607       if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_FORM)) {
2608         parser_add_parse_error(parser, token);
2609         return false;
2610       }
2611       while (!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_FORM))
2612         ;
2613       return success;
2614     } else {
2615       bool result = true;
2616       const GumboNode* node = state->_form_element;
2617       assert(!node || node->type == GUMBO_NODE_ELEMENT);
2618       state->_form_element = NULL;
2619       if (!node || !has_node_in_scope(parser, node)) {
2620         gumbo_debug("Closing an unopened form.\n");
2621         parser_add_parse_error(parser, token);
2622         ignore_token(parser);
2623         return false;
2624       }
2625       // This differs from implicitly_close_tags because we remove *only* the
2626       // <form> element; other nodes are left in scope.
2627       generate_implied_end_tags(parser, GUMBO_TAG_LAST);
2628       if (get_current_node(parser) != node) {
2629         parser_add_parse_error(parser, token);
2630         result = false;
2631       }
2632 
2633       GumboVector* open_elements = &state->_open_elements;
2634       int index = gumbo_vector_index_of(open_elements, node);
2635       assert(index >= 0);
2636       gumbo_vector_remove_at(parser, index, open_elements);
2637       return result;
2638     }
2639   } else if (tag_is(token, kEndTag, GUMBO_TAG_P)) {
2640     if (!has_an_element_in_button_scope(parser, GUMBO_TAG_P)) {
2641       parser_add_parse_error(parser, token);
2642       // reconstruct_active_formatting_elements(parser);
2643       insert_element_of_tag_type(
2644           parser, GUMBO_TAG_P, GUMBO_INSERTION_CONVERTED_FROM_END_TAG);
2645       state->_reprocess_current_token = true;
2646       return false;
2647     }
2648     return implicitly_close_tags(
2649         parser, token, GUMBO_NAMESPACE_HTML, GUMBO_TAG_P);
2650   } else if (tag_is(token, kEndTag, GUMBO_TAG_LI)) {
2651     if (!has_an_element_in_list_scope(parser, GUMBO_TAG_LI)) {
2652       parser_add_parse_error(parser, token);
2653       ignore_token(parser);
2654       return false;
2655     }
2656     return implicitly_close_tags(
2657         parser, token, GUMBO_NAMESPACE_HTML, GUMBO_TAG_LI);
2658   } else if (tag_in(token, kEndTag, (gumbo_tagset){TAG(DD), TAG(DT)})) {
2659     assert(token->type == GUMBO_TOKEN_END_TAG);
2660     GumboTag token_tag = token->v.end_tag;
2661     if (!has_an_element_in_scope(parser, token_tag)) {
2662       parser_add_parse_error(parser, token);
2663       ignore_token(parser);
2664       return false;
2665     }
2666     return implicitly_close_tags(
2667         parser, token, GUMBO_NAMESPACE_HTML, token_tag);
2668   } else if (tag_in(token, kEndTag, (gumbo_tagset){TAG(H1), TAG(H2), TAG(H3),
2669                                         TAG(H4), TAG(H5), TAG(H6)})) {
2670     if (!has_an_element_in_scope_with_tagname(
2671             parser, 6, (GumboTag[]){GUMBO_TAG_H1, GUMBO_TAG_H2, GUMBO_TAG_H3,
2672                            GUMBO_TAG_H4, GUMBO_TAG_H5, GUMBO_TAG_H6})) {
2673       // No heading open; ignore the token entirely.
2674       parser_add_parse_error(parser, token);
2675       ignore_token(parser);
2676       return false;
2677     } else {
2678       generate_implied_end_tags(parser, GUMBO_TAG_LAST);
2679       const GumboNode* current_node = get_current_node(parser);
2680       bool success = node_html_tag_is(current_node, token->v.end_tag);
2681       if (!success) {
2682         // There're children of the heading currently open; close them below and
2683         // record a parse error.
2684         // TODO(jdtang): Add a way to distinguish this error case from the one
2685         // above.
2686         parser_add_parse_error(parser, token);
2687       }
2688       do {
2689         current_node = pop_current_node(parser);
2690       } while (!node_tag_in_set(
2691                    current_node, (gumbo_tagset){TAG(H1), TAG(H2), TAG(H3),
2692                                      TAG(H4), TAG(H5), TAG(H6)}));
2693       return success;
2694     }
2695   } else if (tag_is(token, kStartTag, GUMBO_TAG_A)) {
2696     bool success = true;
2697     int last_a;
2698     int has_matching_a = find_last_anchor_index(parser, &last_a);
2699     if (has_matching_a) {
2700       assert(has_matching_a == 1);
2701       parser_add_parse_error(parser, token);
2702       adoption_agency_algorithm(parser, token, GUMBO_TAG_A);
2703       // The adoption agency algorithm usually removes all instances of <a>
2704       // from the list of active formatting elements, but in case it doesn't,
2705       // we're supposed to do this.  (The conditions where it might not are
2706       // listed in the spec.)
2707       if (find_last_anchor_index(parser, &last_a)) {
2708         void* last_element = gumbo_vector_remove_at(
2709             parser, last_a, &state->_active_formatting_elements);
2710         gumbo_vector_remove(parser, last_element, &state->_open_elements);
2711       }
2712       success = false;
2713     }
2714     reconstruct_active_formatting_elements(parser);
2715     add_formatting_element(parser, insert_element_from_token(parser, token));
2716     return success;
2717   } else if (tag_in(token, kStartTag,
2718                  (gumbo_tagset){TAG(B), TAG(BIG), TAG(CODE), TAG(EM), TAG(FONT),
2719                      TAG(I), TAG(S), TAG(SMALL), TAG(STRIKE), TAG(STRONG),
2720                      TAG(TT), TAG(U)})) {
2721     reconstruct_active_formatting_elements(parser);
2722     add_formatting_element(parser, insert_element_from_token(parser, token));
2723     return true;
2724   } else if (tag_is(token, kStartTag, GUMBO_TAG_NOBR)) {
2725     bool result = true;
2726     reconstruct_active_formatting_elements(parser);
2727     if (has_an_element_in_scope(parser, GUMBO_TAG_NOBR)) {
2728       result = false;
2729       parser_add_parse_error(parser, token);
2730       adoption_agency_algorithm(parser, token, GUMBO_TAG_NOBR);
2731       reconstruct_active_formatting_elements(parser);
2732     }
2733     insert_element_from_token(parser, token);
2734     add_formatting_element(parser, get_current_node(parser));
2735     return result;
2736   } else if (tag_in(token, kEndTag,
2737                  (gumbo_tagset){TAG(A), TAG(B), TAG(BIG), TAG(CODE), TAG(EM),
2738                      TAG(FONT), TAG(I), TAG(NOBR), TAG(S), TAG(SMALL),
2739                      TAG(STRIKE), TAG(STRONG), TAG(TT), TAG(U)})) {
2740     return adoption_agency_algorithm(parser, token, token->v.end_tag);
2741   } else if (tag_in(token, kStartTag,
2742                  (gumbo_tagset){TAG(APPLET), TAG(MARQUEE), TAG(OBJECT)})) {
2743     reconstruct_active_formatting_elements(parser);
2744     insert_element_from_token(parser, token);
2745     add_formatting_element(parser, &kActiveFormattingScopeMarker);
2746     set_frameset_not_ok(parser);
2747     return true;
2748   } else if (tag_in(token, kEndTag,
2749                  (gumbo_tagset){TAG(APPLET), TAG(MARQUEE), TAG(OBJECT)})) {
2750     GumboTag token_tag = token->v.end_tag;
2751     if (!has_an_element_in_table_scope(parser, token_tag)) {
2752       parser_add_parse_error(parser, token);
2753       ignore_token(parser);
2754       return false;
2755     }
2756     implicitly_close_tags(parser, token, GUMBO_NAMESPACE_HTML, token_tag);
2757     clear_active_formatting_elements(parser);
2758     return true;
2759   } else if (tag_is(token, kStartTag, GUMBO_TAG_TABLE)) {
2760     if (get_document_node(parser)->v.document.doc_type_quirks_mode !=
2761         GUMBO_DOCTYPE_QUIRKS) {
2762       maybe_implicitly_close_p_tag(parser, token);
2763     }
2764     insert_element_from_token(parser, token);
2765     set_frameset_not_ok(parser);
2766     set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
2767     return true;
2768   } else if (tag_in(token, kStartTag,
2769                  (gumbo_tagset){TAG(AREA), TAG(BR), TAG(EMBED), TAG(IMG),
2770                      TAG(IMAGE), TAG(KEYGEN), TAG(WBR)})) {
2771     bool success = true;
2772     if (tag_is(token, kStartTag, GUMBO_TAG_IMAGE)) {
2773       success = false;
2774       parser_add_parse_error(parser, token);
2775       token->v.start_tag.tag = GUMBO_TAG_IMG;
2776     }
2777     reconstruct_active_formatting_elements(parser);
2778     GumboNode* node = insert_element_from_token(parser, token);
2779     if (tag_is(token, kStartTag, GUMBO_TAG_IMAGE)) {
2780       success = false;
2781       parser_add_parse_error(parser, token);
2782       node->v.element.tag = GUMBO_TAG_IMG;
2783       node->parse_flags |= GUMBO_INSERTION_FROM_IMAGE;
2784     }
2785     pop_current_node(parser);
2786     acknowledge_self_closing_tag(parser);
2787     set_frameset_not_ok(parser);
2788     return success;
2789   } else if (tag_is(token, kStartTag, GUMBO_TAG_INPUT)) {
2790     if (!attribute_matches(&token->v.start_tag.attributes, "type", "hidden")) {
2791       // Must be before the element is inserted, as that takes ownership of the
2792       // token's attribute vector.
2793       set_frameset_not_ok(parser);
2794     }
2795     reconstruct_active_formatting_elements(parser);
2796     insert_element_from_token(parser, token);
2797     pop_current_node(parser);
2798     acknowledge_self_closing_tag(parser);
2799     return true;
2800   } else if (tag_in(token, kStartTag,
2801                  (gumbo_tagset){TAG(PARAM), TAG(SOURCE), TAG(TRACK)})) {
2802     insert_element_from_token(parser, token);
2803     pop_current_node(parser);
2804     acknowledge_self_closing_tag(parser);
2805     return true;
2806   } else if (tag_is(token, kStartTag, GUMBO_TAG_HR)) {
2807     bool result = maybe_implicitly_close_p_tag(parser, token);
2808     insert_element_from_token(parser, token);
2809     pop_current_node(parser);
2810     acknowledge_self_closing_tag(parser);
2811     set_frameset_not_ok(parser);
2812     return result;
2813   } else if (tag_is(token, kStartTag, GUMBO_TAG_ISINDEX)) {
2814     parser_add_parse_error(parser, token);
2815     if (parser->_parser_state->_form_element != NULL &&
2816         !has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
2817       ignore_token(parser);
2818       return false;
2819     }
2820     acknowledge_self_closing_tag(parser);
2821     maybe_implicitly_close_p_tag(parser, token);
2822     set_frameset_not_ok(parser);
2823 
2824     GumboVector* token_attrs = &token->v.start_tag.attributes;
2825     GumboAttribute* prompt_attr = gumbo_get_attribute(token_attrs, "prompt");
2826     GumboAttribute* action_attr = gumbo_get_attribute(token_attrs, "action");
2827     GumboAttribute* name_attr = gumbo_get_attribute(token_attrs, "name");
2828 
2829     GumboNode* form = insert_element_of_tag_type(
2830         parser, GUMBO_TAG_FORM, GUMBO_INSERTION_FROM_ISINDEX);
2831     if (!has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
2832       parser->_parser_state->_form_element = form;
2833     }
2834     if (action_attr) {
2835       gumbo_vector_add(parser, action_attr, &form->v.element.attributes);
2836     }
2837     insert_element_of_tag_type(
2838         parser, GUMBO_TAG_HR, GUMBO_INSERTION_FROM_ISINDEX);
2839     pop_current_node(parser);  // <hr>
2840 
2841     insert_element_of_tag_type(
2842         parser, GUMBO_TAG_LABEL, GUMBO_INSERTION_FROM_ISINDEX);
2843     TextNodeBufferState* text_state = &parser->_parser_state->_text_node;
2844     text_state->_start_original_text = token->original_text.data;
2845     text_state->_start_position = token->position;
2846     text_state->_type = GUMBO_NODE_TEXT;
2847     if (prompt_attr) {
2848       int prompt_attr_length = strlen(prompt_attr->value);
2849       gumbo_string_buffer_destroy(parser, &text_state->_buffer);
2850       text_state->_buffer.data = gumbo_copy_stringz(parser, prompt_attr->value);
2851       text_state->_buffer.length = prompt_attr_length;
2852       text_state->_buffer.capacity = prompt_attr_length + 1;
2853       gumbo_destroy_attribute(parser, prompt_attr);
2854     } else {
2855       GumboStringPiece prompt_text =
2856           GUMBO_STRING("This is a searchable index. Enter search keywords: ");
2857       gumbo_string_buffer_append_string(
2858           parser, &prompt_text, &text_state->_buffer);
2859     }
2860 
2861     GumboNode* input = insert_element_of_tag_type(
2862         parser, GUMBO_TAG_INPUT, GUMBO_INSERTION_FROM_ISINDEX);
2863     for (unsigned int i = 0; i < token_attrs->length; ++i) {
2864       GumboAttribute* attr = token_attrs->data[i];
2865       if (attr != prompt_attr && attr != action_attr && attr != name_attr) {
2866         gumbo_vector_add(parser, attr, &input->v.element.attributes);
2867       }
2868       token_attrs->data[i] = NULL;
2869     }
2870 
2871     // All attributes have been successfully transferred and nulled out at this
2872     // point, so the call to ignore_token will free the memory for it without
2873     // touching the attributes.
2874     ignore_token(parser);
2875 
2876     // The name attribute, if present, should be destroyed since it's ignored
2877     // when copying over.  The action attribute should be kept since it's moved
2878     // to the form.
2879     if (name_attr) {
2880       gumbo_destroy_attribute(parser, name_attr);
2881     }
2882 
2883     GumboAttribute* name =
2884         gumbo_parser_allocate(parser, sizeof(GumboAttribute));
2885     GumboStringPiece name_str = GUMBO_STRING("name");
2886     GumboStringPiece isindex_str = GUMBO_STRING("isindex");
2887     name->attr_namespace = GUMBO_ATTR_NAMESPACE_NONE;
2888     name->name = gumbo_copy_stringz(parser, "name");
2889     name->value = gumbo_copy_stringz(parser, "isindex");
2890     name->original_name = name_str;
2891     name->original_value = isindex_str;
2892     name->name_start = kGumboEmptySourcePosition;
2893     name->name_end = kGumboEmptySourcePosition;
2894     name->value_start = kGumboEmptySourcePosition;
2895     name->value_end = kGumboEmptySourcePosition;
2896     gumbo_vector_add(parser, name, &input->v.element.attributes);
2897 
2898     pop_current_node(parser);  // <input>
2899     pop_current_node(parser);  // <label>
2900     insert_element_of_tag_type(
2901         parser, GUMBO_TAG_HR, GUMBO_INSERTION_FROM_ISINDEX);
2902     pop_current_node(parser);  // <hr>
2903     pop_current_node(parser);  // <form>
2904     if (!has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
2905       parser->_parser_state->_form_element = NULL;
2906     }
2907     return false;
2908   } else if (tag_is(token, kStartTag, GUMBO_TAG_TEXTAREA)) {
2909     run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RCDATA);
2910     parser->_parser_state->_ignore_next_linefeed = true;
2911     set_frameset_not_ok(parser);
2912     return true;
2913   } else if (tag_is(token, kStartTag, GUMBO_TAG_XMP)) {
2914     bool result = maybe_implicitly_close_p_tag(parser, token);
2915     reconstruct_active_formatting_elements(parser);
2916     set_frameset_not_ok(parser);
2917     run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
2918     return result;
2919   } else if (tag_is(token, kStartTag, GUMBO_TAG_IFRAME)) {
2920     set_frameset_not_ok(parser);
2921     run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
2922     return true;
2923   } else if (tag_is(token, kStartTag, GUMBO_TAG_NOEMBED)) {
2924     run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
2925     return true;
2926   } else if (tag_is(token, kStartTag, GUMBO_TAG_SELECT)) {
2927     reconstruct_active_formatting_elements(parser);
2928     insert_element_from_token(parser, token);
2929     set_frameset_not_ok(parser);
2930     GumboInsertionMode state = parser->_parser_state->_insertion_mode;
2931     if (state == GUMBO_INSERTION_MODE_IN_TABLE ||
2932         state == GUMBO_INSERTION_MODE_IN_CAPTION ||
2933         state == GUMBO_INSERTION_MODE_IN_TABLE_BODY ||
2934         state == GUMBO_INSERTION_MODE_IN_ROW ||
2935         state == GUMBO_INSERTION_MODE_IN_CELL) {
2936       set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_SELECT_IN_TABLE);
2937     } else {
2938       set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_SELECT);
2939     }
2940     return true;
2941   } else if (tag_in(token, kStartTag,
2942                  (gumbo_tagset){TAG(OPTION), TAG(OPTGROUP)})) {
2943     if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) {
2944       pop_current_node(parser);
2945     }
2946     reconstruct_active_formatting_elements(parser);
2947     insert_element_from_token(parser, token);
2948     return true;
2949   } else if (tag_in(token, kStartTag,
2950                  (gumbo_tagset){TAG(RB), TAG(RP), TAG(RT), TAG(RTC)})) {
2951     bool success = true;
2952     GumboTag exception =
2953         tag_in(token, kStartTag, (gumbo_tagset){TAG(RT), TAG(RP)})
2954             ? GUMBO_TAG_RTC
2955             : GUMBO_TAG_LAST;
2956     if (has_an_element_in_scope(parser, GUMBO_TAG_RUBY)) {
2957       generate_implied_end_tags(parser, exception);
2958     }
2959     if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_RUBY) &&
2960         !(exception == GUMBO_TAG_LAST ||
2961             node_html_tag_is(get_current_node(parser), GUMBO_TAG_RTC))) {
2962       parser_add_parse_error(parser, token);
2963       success = false;
2964     }
2965     insert_element_from_token(parser, token);
2966     return success;
2967   } else if (tag_is(token, kEndTag, GUMBO_TAG_BR)) {
2968     parser_add_parse_error(parser, token);
2969     reconstruct_active_formatting_elements(parser);
2970     insert_element_of_tag_type(
2971         parser, GUMBO_TAG_BR, GUMBO_INSERTION_CONVERTED_FROM_END_TAG);
2972     pop_current_node(parser);
2973     return false;
2974   } else if (tag_is(token, kStartTag, GUMBO_TAG_MATH)) {
2975     reconstruct_active_formatting_elements(parser);
2976     adjust_mathml_attributes(parser, token);
2977     adjust_foreign_attributes(parser, token);
2978     insert_foreign_element(parser, token, GUMBO_NAMESPACE_MATHML);
2979     if (token->v.start_tag.is_self_closing) {
2980       pop_current_node(parser);
2981       acknowledge_self_closing_tag(parser);
2982     }
2983     return true;
2984   } else if (tag_is(token, kStartTag, GUMBO_TAG_SVG)) {
2985     reconstruct_active_formatting_elements(parser);
2986     adjust_svg_attributes(parser, token);
2987     adjust_foreign_attributes(parser, token);
2988     insert_foreign_element(parser, token, GUMBO_NAMESPACE_SVG);
2989     if (token->v.start_tag.is_self_closing) {
2990       pop_current_node(parser);
2991       acknowledge_self_closing_tag(parser);
2992     }
2993     return true;
2994   } else if (tag_in(token, kStartTag,
2995                  (gumbo_tagset){TAG(CAPTION), TAG(COL), TAG(COLGROUP),
2996                      TAG(FRAME), TAG(HEAD), TAG(TBODY), TAG(TD), TAG(TFOOT),
2997                      TAG(TH), TAG(THEAD), TAG(TR)})) {
2998     parser_add_parse_error(parser, token);
2999     ignore_token(parser);
3000     return false;
3001   } else if (token->type == GUMBO_TOKEN_START_TAG) {
3002     reconstruct_active_formatting_elements(parser);
3003     insert_element_from_token(parser, token);
3004     return true;
3005   } else {
3006     assert(token->type == GUMBO_TOKEN_END_TAG);
3007     GumboTag end_tag = token->v.end_tag;
3008     assert(state->_open_elements.length > 0);
3009     assert(node_html_tag_is(state->_open_elements.data[0], GUMBO_TAG_HTML));
3010     // Walk up the stack of open elements until we find one that either:
3011     // a) Matches the tag name we saw
3012     // b) Is in the "special" category.
3013     // If we see a), implicitly close everything up to and including it.  If we
3014     // see b), then record a parse error, don't close anything (except the
3015     // implied end tags) and ignore the end tag token.
3016     for (int i = state->_open_elements.length; --i >= 0;) {
3017       const GumboNode* node = state->_open_elements.data[i];
3018       if (node_html_tag_is(node, end_tag)) {
3019         generate_implied_end_tags(parser, end_tag);
3020         // TODO(jdtang): Do I need to add a parse error here?  The condition in
3021         // the spec seems like it's the inverse of the loop condition above, and
3022         // so would never fire.
3023         while (node != pop_current_node(parser))
3024           ;  // Pop everything.
3025         return true;
3026       } else if (is_special_node(node)) {
3027         parser_add_parse_error(parser, token);
3028         ignore_token(parser);
3029         return false;
3030       }
3031     }
3032     // <html> is in the special category, so we should never get here.
3033     assert(0);
3034     return false;
3035   }
3036 }
3037 
3038 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-incdata
// Handles a token while the parser is in the "text" insertion mode.
//
// Character and whitespace tokens are passed to insert_text_token.  Any other
// token pops the current node and switches back to the insertion mode saved
// in _original_insertion_mode; an EOF token is additionally recorded as a
// parse error and flagged for reprocessing.
//
// Always returns true.
static bool handle_text(GumboParser* parser, GumboToken* token) {
  const bool is_text_token = token->type == GUMBO_TOKEN_CHARACTER ||
                             token->type == GUMBO_TOKEN_WHITESPACE;
  if (is_text_token) {
    insert_text_token(parser, token);
    return true;
  }

  // Script handling here is deliberately bare-bones: there are no
  // parser-pause/already-started/script-nesting flags and no re-entrant
  // invocations of the tokenizer.  The library is mostly intended for
  // templating, refactoring, and static-analysis tools, so the script body is
  // provided as a text-node child of the <script> element.  That does not
  // support document.write of partial HTML elements, but should be adequate
  // for almost all other scripting support.
  GumboParserState* parser_state = parser->_parser_state;
  if (token->type == GUMBO_TOKEN_EOF) {
    parser_add_parse_error(parser, token);
    parser_state->_reprocess_current_token = true;
  }
  pop_current_node(parser);
  set_insertion_mode(parser, parser_state->_original_insertion_mode);
  return true;
}
3060 
3061 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-intable
// Handles a token while the parser is in the "in table" insertion mode.
//
// Each branch below returns false after it records a parse error itself and
// true otherwise, with two exceptions: tokens delegated to handle_in_head or
// handle_in_body return whatever that handler returns, and the final fallback
// branch records a parse error but still returns handle_in_body's result.
static bool handle_in_table(GumboParser* parser, GumboToken* token) {
  GumboParserState* state = parser->_parser_state;
  if (token->type == GUMBO_TOKEN_CHARACTER ||
      token->type == GUMBO_TOKEN_WHITESPACE) {
    // The "pending table character tokens" list described in the spec is
    // nothing more than the TextNodeBufferState.  We accumulate text tokens as
    // normal, except that when we go to flush them in the handle_in_table_text,
    // we set _foster_parent_insertions if there're non-whitespace characters in
    // the buffer.
    assert(state->_text_node._buffer.length == 0);
    // Remember the current mode so handle_in_table_text can return to it, and
    // have the same token processed again under the table-text rules.
    state->_original_insertion_mode = state->_insertion_mode;
    state->_reprocess_current_token = true;
    set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_TEXT);
    return true;
  } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
    parser_add_parse_error(parser, token);
    ignore_token(parser);
    return false;
  } else if (token->type == GUMBO_TOKEN_COMMENT) {
    append_comment_node(parser, get_current_node(parser), token);
    return true;
  } else if (tag_is(token, kStartTag, GUMBO_TAG_CAPTION)) {
    clear_stack_to_table_context(parser);
    add_formatting_element(parser, &kActiveFormattingScopeMarker);
    insert_element_from_token(parser, token);
    set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_CAPTION);
    return true;
  } else if (tag_is(token, kStartTag, GUMBO_TAG_COLGROUP)) {
    clear_stack_to_table_context(parser);
    insert_element_from_token(parser, token);
    set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_COLUMN_GROUP);
    return true;
  } else if (tag_is(token, kStartTag, GUMBO_TAG_COL)) {
    // A bare <col> gets an implied <colgroup> parent; the <col> token itself
    // is then reprocessed in the column-group mode.
    clear_stack_to_table_context(parser);
    insert_element_of_tag_type(
        parser, GUMBO_TAG_COLGROUP, GUMBO_INSERTION_IMPLIED);
    state->_reprocess_current_token = true;
    set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_COLUMN_GROUP);
    return true;
  } else if (tag_in(token, kStartTag,
                 (gumbo_tagset){TAG(TBODY), TAG(TFOOT), TAG(THEAD), TAG(TD),
                     TAG(TH), TAG(TR)})) {
    clear_stack_to_table_context(parser);
    set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
    if (tag_in(token, kStartTag, (gumbo_tagset){TAG(TD), TAG(TH), TAG(TR)})) {
      // Rows and cells get an implied <tbody>; the token is reprocessed in the
      // table-body mode.
      insert_element_of_tag_type(
          parser, GUMBO_TAG_TBODY, GUMBO_INSERTION_IMPLIED);
      state->_reprocess_current_token = true;
    } else {
      insert_element_from_token(parser, token);
    }
    return true;
  } else if (tag_is(token, kStartTag, GUMBO_TAG_TABLE)) {
    // A nested <table> start tag is always a parse error.  If close_table
    // succeeds the token is reprocessed; otherwise it is dropped.
    parser_add_parse_error(parser, token);
    if (close_table(parser)) {
      state->_reprocess_current_token = true;
    } else {
      ignore_token(parser);
    }
    return false;
  } else if (tag_is(token, kEndTag, GUMBO_TAG_TABLE)) {
    if (!close_table(parser)) {
      parser_add_parse_error(parser, token);
      return false;
    }
    return true;
  } else if (tag_in(token, kEndTag,
                 (gumbo_tagset){TAG(BODY), TAG(CAPTION), TAG(COL),
                     TAG(COLGROUP), TAG(HTML), TAG(TBODY), TAG(TD), TAG(TFOOT),
                     TAG(TH), TAG(THEAD), TAG(TR)})) {
    parser_add_parse_error(parser, token);
    ignore_token(parser);
    return false;
  } else if (tag_in(token, kStartTag,
                 (gumbo_tagset){TAG(STYLE), TAG(SCRIPT), TAG(TEMPLATE)}) ||
             (tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE))) {
    return handle_in_head(parser, token);
  } else if (tag_is(token, kStartTag, GUMBO_TAG_INPUT) &&
             attribute_matches(
                 &token->v.start_tag.attributes, "type", "hidden")) {
    // The attribute check above must happen before the element is inserted.
    parser_add_parse_error(parser, token);
    insert_element_from_token(parser, token);
    pop_current_node(parser);
    // Acknowledge the self-closing flag, as the in-body <input> handling does,
    // so that "<input type=hidden />" is not treated as having an
    // unacknowledged self-closing flag.
    acknowledge_self_closing_tag(parser);
    return false;
  } else if (tag_is(token, kStartTag, GUMBO_TAG_FORM)) {
    parser_add_parse_error(parser, token);
    if (state->_form_element || has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
      ignore_token(parser);
      return false;
    }
    // The form is inserted and immediately popped, but stays recorded as the
    // parser's form element.
    state->_form_element = insert_element_from_token(parser, token);
    pop_current_node(parser);
    return false;
  } else if (token->type == GUMBO_TOKEN_EOF) {
    return handle_in_body(parser, token);
  } else {
    // Anything else is a parse error: process it with the in-body rules while
    // _foster_parent_insertions is switched on, then switch it off again.
    parser_add_parse_error(parser, token);
    state->_foster_parent_insertions = true;
    bool result = handle_in_body(parser, token);
    state->_foster_parent_insertions = false;
    return result;
  }
}
3165 
3166 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-intabletext
// Handles a token while the parser is in the "in table text" insertion mode.
//
// NULL tokens are a parse error and are ignored (returns false).  Character
// and whitespace tokens are passed to insert_text_token.  Any other token
// ends the run of table text: the text node buffer is flushed, the insertion
// mode saved in _original_insertion_mode is restored, and the token is
// flagged for reprocessing.
static bool handle_in_table_text(GumboParser* parser, GumboToken* token) {
  switch (token->type) {
    case GUMBO_TOKEN_NULL:
      parser_add_parse_error(parser, token);
      ignore_token(parser);
      return false;
    case GUMBO_TOKEN_CHARACTER:
    case GUMBO_TOKEN_WHITESPACE:
      insert_text_token(parser, token);
      return true;
    default:
      break;
  }

  GumboParserState* parser_state = parser->_parser_state;
  const GumboStringBuffer* pending = &parser_state->_text_node._buffer;

  // strspn is not usable here because GumboStringBuffers are not
  // null-terminated.  The buffer may hold UTF-8 text, but a single
  // non-whitespace byte is enough to flip the flag, so a byte-wise scan is
  // still valid.  '\v' is accepted by isspace but is counted as
  // non-whitespace here.
  bool has_non_whitespace = false;
  for (unsigned int pos = 0; pos < pending->length && !has_non_whitespace;
       ++pos) {
    const char byte = pending->data[pos];
    has_non_whitespace = byte == '\v' || !isspace((unsigned char) byte);
  }

  if (has_non_whitespace) {
    parser_state->_foster_parent_insertions = true;
    reconstruct_active_formatting_elements(parser);
  }
  maybe_flush_text_node_buffer(parser);
  parser_state->_foster_parent_insertions = false;
  parser_state->_reprocess_current_token = true;
  parser_state->_insertion_mode = parser_state->_original_insertion_mode;
  return true;
}
3199 
3200 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-incaption
// Handles a token while the parser is in the "in caption" insertion mode.
//
// Returns false when this handler records a parse error and true otherwise;
// tokens not handled here are delegated to handle_in_body and return its
// result.
static bool handle_in_caption(GumboParser* parser, GumboToken* token) {
  if (tag_is(token, kEndTag, GUMBO_TAG_CAPTION)) {
    if (!has_an_element_in_table_scope(parser, GUMBO_TAG_CAPTION)) {
      parser_add_parse_error(parser, token);
      ignore_token(parser);
      return false;
    }
    generate_implied_end_tags(parser, GUMBO_TAG_LAST);
    bool result = true;
    if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_CAPTION)) {
      // Other elements are still open inside the caption.  They are closed by
      // the loop below, but the mismatch is a parse error and must be
      // reflected in the return value.
      parser_add_parse_error(parser, token);
      result = false;
    }
    // Pop everything up to and including the <caption>.
    while (!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_CAPTION))
      ;
    clear_active_formatting_elements(parser);
    set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
    return result;
  } else if (tag_in(token, kStartTag,
                 (gumbo_tagset){TAG(CAPTION), TAG(COL), TAG(COLGROUP),
                     TAG(TBODY), TAG(TD), TAG(TFOOT), TAG(TH), TAG(THEAD),
                     TAG(TR)}) ||
             (tag_is(token, kEndTag, GUMBO_TAG_TABLE))) {
    // These tokens close the caption and are then reprocessed in the
    // "in table" mode.
    if (!has_an_element_in_table_scope(parser, GUMBO_TAG_CAPTION)) {
      parser_add_parse_error(parser, token);
      ignore_token(parser);
      return false;
    }
    while (!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_CAPTION))
      ;
    clear_active_formatting_elements(parser);
    set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
    parser->_parser_state->_reprocess_current_token = true;
    return true;
  } else if (tag_in(token, kEndTag,
                 (gumbo_tagset){TAG(BODY), TAG(COL), TAG(COLGROUP), TAG(HTML),
                     TAG(TBODY), TAG(TD), TAG(TFOOT), TAG(TH), TAG(THEAD),
                     TAG(TR)})) {
    parser_add_parse_error(parser, token);
    ignore_token(parser);
    return false;
  } else {
    return handle_in_body(parser, token);
  }
}
3246 
3247 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-incolgroup
// Token handler for the "in column group" insertion mode.
//
// Returns false when a parse error was recorded for the token and true
// otherwise.  Some tokens are delegated to handle_in_body or handle_in_head,
// in which case that handler's result is returned.
static bool handle_in_column_group(GumboParser* parser, GumboToken* token) {
  switch (token->type) {
    case GUMBO_TOKEN_WHITESPACE:
      insert_text_token(parser, token);
      return true;
    case GUMBO_TOKEN_DOCTYPE:
      parser_add_parse_error(parser, token);
      ignore_token(parser);
      return false;
    case GUMBO_TOKEN_COMMENT:
      append_comment_node(parser, get_current_node(parser), token);
      return true;
    default:
      // Tag and EOF tokens are handled by the checks below.
      break;
  }

  if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
    return handle_in_body(parser, token);
  }

  if (tag_is(token, kStartTag, GUMBO_TAG_COL)) {
    // <col> is inserted and popped straight away, so it never stays open.
    insert_element_from_token(parser, token);
    pop_current_node(parser);
    acknowledge_self_closing_tag(parser);
    return true;
  }

  if (tag_is(token, kEndTag, GUMBO_TAG_COLGROUP)) {
    if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_COLGROUP)) {
      parser_add_parse_error(parser, token);
      ignore_token(parser);
      return false;
    }
    pop_current_node(parser);
    set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
    // The end tag matched the current node and no parse error was recorded,
    // so this path reports success.
    return true;
  }

  if (tag_is(token, kEndTag, GUMBO_TAG_COL)) {
    parser_add_parse_error(parser, token);
    ignore_token(parser);
    return false;
  }

  if (tag_is(token, kStartTag, GUMBO_TAG_TEMPLATE) ||
      tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) {
    return handle_in_head(parser, token);
  }

  if (token->type == GUMBO_TOKEN_EOF) {
    return handle_in_body(parser, token);
  }

  // Anything else: leave the column group (if it is the current node) and
  // flag the token for reprocessing in the "in table" mode.
  if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_COLGROUP)) {
    parser_add_parse_error(parser, token);
    ignore_token(parser);
    return false;
  }
  pop_current_node(parser);
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
  parser->_parser_state->_reprocess_current_token = true;
  return true;
}
3296 
3297 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-intbody
// Token handler for the "in table body" insertion mode.  Tokens that match
// none of the cases below are passed on to handle_in_table.
static bool handle_in_table_body(GumboParser* parser, GumboToken* token) {
  GumboParserState* state = parser->_parser_state;

  // <tr>: open the row directly.
  if (tag_is(token, kStartTag, GUMBO_TAG_TR)) {
    clear_stack_to_table_body_context(parser);
    insert_element_from_token(parser, token);
    set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW);
    return true;
  }

  // A cell without a row: record the error, insert an implied <tr> and flag
  // the token for reprocessing.
  if (tag_in(token, kStartTag, (gumbo_tagset){TAG(TH), TAG(TD)})) {
    parser_add_parse_error(parser, token);
    clear_stack_to_table_body_context(parser);
    insert_element_of_tag_type(parser, GUMBO_TAG_TR, GUMBO_INSERTION_IMPLIED);
    state->_reprocess_current_token = true;
    set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW);
    return false;
  }

  // </tbody>, </tfoot>, </thead>: close the section named by the end tag.
  if (tag_in(token, kEndTag,
          (gumbo_tagset){TAG(THEAD), TAG(TFOOT), TAG(TBODY)})) {
    if (has_an_element_in_table_scope(parser, token->v.end_tag)) {
      clear_stack_to_table_body_context(parser);
      pop_current_node(parser);
      set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
      return true;
    }
    parser_add_parse_error(parser, token);
    ignore_token(parser);
    return false;
  }

  // Tokens that implicitly end the current section.
  if (tag_in(token, kStartTag,
          (gumbo_tagset){TAG(CAPTION), TAG(COL), TAG(COLGROUP), TAG(TBODY),
              TAG(TFOOT), TAG(THEAD)}) ||
      tag_is(token, kEndTag, GUMBO_TAG_TABLE)) {
    if (!has_an_element_in_table_scope(parser, GUMBO_TAG_TBODY) &&
        !has_an_element_in_table_scope(parser, GUMBO_TAG_THEAD) &&
        !has_an_element_in_table_scope(parser, GUMBO_TAG_TFOOT)) {
      parser_add_parse_error(parser, token);
      ignore_token(parser);
      return false;
    }
    clear_stack_to_table_body_context(parser);
    pop_current_node(parser);
    set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
    state->_reprocess_current_token = true;
    return true;
  }

  // End tags that are always an error here.
  if (tag_in(token, kEndTag,
          (gumbo_tagset){TAG(BODY), TAG(CAPTION), TAG(COL), TAG(COLGROUP),
              TAG(HTML), TAG(TD), TAG(TH), TAG(TR)})) {
    parser_add_parse_error(parser, token);
    ignore_token(parser);
    return false;
  }

  return handle_in_table(parser, token);
}
3348 
3349 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-intr
// Token handler for the "in row" insertion mode.  Tokens that match none of
// the cases below are passed on to handle_in_table.
static bool handle_in_row(GumboParser* parser, GumboToken* token) {
  GumboParserState* state = parser->_parser_state;

  // <td>/<th>: open the cell and push a scope marker onto the list of active
  // formatting elements.
  if (tag_in(token, kStartTag, (gumbo_tagset){TAG(TD), TAG(TH)})) {
    clear_stack_to_table_row_context(parser);
    insert_element_from_token(parser, token);
    set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_CELL);
    add_formatting_element(parser, &kActiveFormattingScopeMarker);
    return true;
  }

  // </tr>: close the row if one is in table scope.
  if (tag_is(token, kEndTag, GUMBO_TAG_TR)) {
    if (has_an_element_in_table_scope(parser, GUMBO_TAG_TR)) {
      clear_stack_to_table_row_context(parser);
      pop_current_node(parser);
      set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
      return true;
    }
    parser_add_parse_error(parser, token);
    ignore_token(parser);
    return false;
  }

  // Tokens that implicitly end the row; the token is flagged for
  // reprocessing after the row has been closed.
  if (tag_in(token, kStartTag,
          (gumbo_tagset){TAG(CAPTION), TAG(COL), TAG(COLGROUP), TAG(TBODY),
              TAG(TFOOT), TAG(THEAD), TAG(TR)}) ||
      tag_is(token, kEndTag, GUMBO_TAG_TABLE)) {
    if (has_an_element_in_table_scope(parser, GUMBO_TAG_TR)) {
      clear_stack_to_table_row_context(parser);
      pop_current_node(parser);
      set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
      state->_reprocess_current_token = true;
      return true;
    }
    parser_add_parse_error(parser, token);
    ignore_token(parser);
    return false;
  }

  // </tbody>, </tfoot>, </thead>: both the named section and a row must be
  // in table scope for the row to be closed.
  if (tag_in(token, kEndTag,
          (gumbo_tagset){TAG(THEAD), TAG(TFOOT), TAG(TBODY)})) {
    if (has_an_element_in_table_scope(parser, token->v.end_tag) &&
        has_an_element_in_table_scope(parser, GUMBO_TAG_TR)) {
      clear_stack_to_table_row_context(parser);
      pop_current_node(parser);
      set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
      state->_reprocess_current_token = true;
      return true;
    }
    parser_add_parse_error(parser, token);
    ignore_token(parser);
    return false;
  }

  // End tags that are always an error here.
  if (tag_in(token, kEndTag,
          (gumbo_tagset){TAG(BODY), TAG(CAPTION), TAG(COL), TAG(COLGROUP),
              TAG(HTML), TAG(TD), TAG(TH)})) {
    parser_add_parse_error(parser, token);
    ignore_token(parser);
    return false;
  }

  return handle_in_table(parser, token);
}
3407 
3408 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-intd
// Token handler for the "in cell" insertion mode.  Tokens that match none of
// the cases below are passed on to handle_in_body.
static bool handle_in_cell(GumboParser* parser, GumboToken* token) {
  // </td> or </th>: close the matching cell if it is in table scope.
  if (tag_in(token, kEndTag, (gumbo_tagset){TAG(TH), TAG(TD)})) {
    GumboTag cell_tag = token->v.end_tag;
    if (has_an_element_in_table_scope(parser, cell_tag)) {
      return close_table_cell(parser, token, cell_tag);
    }
    parser_add_parse_error(parser, token);
    ignore_token(parser);
    return false;
  }

  // Start tags for table structure: close the current cell first and flag
  // the token for reprocessing.
  if (tag_in(token, kStartTag,
          (gumbo_tagset){TAG(CAPTION), TAG(COL), TAG(COLGROUP), TAG(TBODY),
              TAG(TD), TAG(TFOOT), TAG(TH), TAG(THEAD), TAG(TR)})) {
    gumbo_debug("Handling <td> in cell.\n");
    if (!(has_an_element_in_table_scope(parser, GUMBO_TAG_TH) ||
            has_an_element_in_table_scope(parser, GUMBO_TAG_TD))) {
      gumbo_debug("Bailing out because there's no <td> or <th> in scope.\n");
      parser_add_parse_error(parser, token);
      ignore_token(parser);
      return false;
    }
    parser->_parser_state->_reprocess_current_token = true;
    return close_current_cell(parser, token);
  }

  // End tags that are always an error inside a cell.
  if (tag_in(token, kEndTag,
          (gumbo_tagset){TAG(BODY), TAG(CAPTION), TAG(COL), TAG(COLGROUP),
              TAG(HTML)})) {
    parser_add_parse_error(parser, token);
    ignore_token(parser);
    return false;
  }

  // End tags for enclosing table structure: only honoured when the named
  // element is in table scope.
  if (tag_in(token, kEndTag,
          (gumbo_tagset){TAG(TABLE), TAG(TBODY), TAG(TFOOT), TAG(THEAD),
              TAG(TR)})) {
    if (has_an_element_in_table_scope(parser, token->v.end_tag)) {
      parser->_parser_state->_reprocess_current_token = true;
      return close_current_cell(parser, token);
    }
    parser_add_parse_error(parser, token);
    ignore_token(parser);
    return false;
  }

  return handle_in_body(parser, token);
}
3450 
3451 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inselect
// Token handler for the "in select" insertion mode.  Any token not matched
// by a case below is a parse error and is ignored.
static bool handle_in_select(GumboParser* parser, GumboToken* token) {
  // Non-tag tokens are decided purely by their type.
  switch (token->type) {
    case GUMBO_TOKEN_NULL:
    case GUMBO_TOKEN_DOCTYPE:
      parser_add_parse_error(parser, token);
      ignore_token(parser);
      return false;
    case GUMBO_TOKEN_CHARACTER:
    case GUMBO_TOKEN_WHITESPACE:
      insert_text_token(parser, token);
      return true;
    case GUMBO_TOKEN_COMMENT:
      append_comment_node(parser, get_current_node(parser), token);
      return true;
    default:
      break;
  }

  if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
    return handle_in_body(parser, token);
  }

  // <option>: a still-open <option> is popped first.
  if (tag_is(token, kStartTag, GUMBO_TAG_OPTION)) {
    if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) {
      pop_current_node(parser);
    }
    insert_element_from_token(parser, token);
    return true;
  }

  // <optgroup>: pop an open <option>, then an open <optgroup>.
  if (tag_is(token, kStartTag, GUMBO_TAG_OPTGROUP)) {
    if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) {
      pop_current_node(parser);
    }
    if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTGROUP)) {
      pop_current_node(parser);
    }
    insert_element_from_token(parser, token);
    return true;
  }

  // </optgroup>: an <option> sitting directly on top of an <optgroup> is
  // popped first; after that the current node must be the <optgroup>.
  if (tag_is(token, kEndTag, GUMBO_TAG_OPTGROUP)) {
    GumboVector* stack = &parser->_parser_state->_open_elements;
    if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION) &&
        node_html_tag_is(stack->data[stack->length - 2], GUMBO_TAG_OPTGROUP)) {
      pop_current_node(parser);
    }
    if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTGROUP)) {
      parser_add_parse_error(parser, token);
      ignore_token(parser);
      return false;
    }
    pop_current_node(parser);
    return true;
  }

  // </option>: only valid when an <option> is the current node.
  if (tag_is(token, kEndTag, GUMBO_TAG_OPTION)) {
    if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) {
      parser_add_parse_error(parser, token);
      ignore_token(parser);
      return false;
    }
    pop_current_node(parser);
    return true;
  }

  // </select>
  if (tag_is(token, kEndTag, GUMBO_TAG_SELECT)) {
    if (has_an_element_in_select_scope(parser, GUMBO_TAG_SELECT)) {
      close_current_select(parser);
      return true;
    }
    parser_add_parse_error(parser, token);
    ignore_token(parser);
    return false;
  }

  // A nested <select> start tag is always an error; if a select is in
  // select scope it is closed.
  if (tag_is(token, kStartTag, GUMBO_TAG_SELECT)) {
    parser_add_parse_error(parser, token);
    ignore_token(parser);
    if (has_an_element_in_select_scope(parser, GUMBO_TAG_SELECT)) {
      close_current_select(parser);
    }
    return false;
  }

  // <input>, <keygen>, <textarea>: always an error; when a select is in
  // select scope it is closed and the token is flagged for reprocessing.
  if (tag_in(token, kStartTag,
          (gumbo_tagset){TAG(TEXTAREA), TAG(KEYGEN), TAG(INPUT)})) {
    parser_add_parse_error(parser, token);
    if (has_an_element_in_select_scope(parser, GUMBO_TAG_SELECT)) {
      close_current_select(parser);
      parser->_parser_state->_reprocess_current_token = true;
    } else {
      ignore_token(parser);
    }
    return false;
  }

  if (tag_in(token, kStartTag, (gumbo_tagset){TAG(TEMPLATE), TAG(SCRIPT)}) ||
      tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) {
    return handle_in_head(parser, token);
  }

  if (token->type == GUMBO_TOKEN_EOF) {
    return handle_in_body(parser, token);
  }

  parser_add_parse_error(parser, token);
  ignore_token(parser);
  return false;
}
3546 
3547 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inselectintable
// Token handler for the "in select in table" insertion mode.  Table-related
// start and end tags are handled here; every other token goes to
// handle_in_select.
static bool handle_in_select_in_table(GumboParser* parser, GumboToken* token) {
  // Table-related start tag: always an error; the select is closed and the
  // token is flagged for reprocessing.
  if (tag_in(token, kStartTag,
          (gumbo_tagset){TAG(CAPTION), TAG(TABLE), TAG(TBODY), TAG(TFOOT),
              TAG(THEAD), TAG(TR), TAG(TD), TAG(TH)})) {
    parser_add_parse_error(parser, token);
    close_current_select(parser);
    parser->_parser_state->_reprocess_current_token = true;
    return false;
  }

  // Table-related end tag: always an error; the select is closed only when
  // the named element is in table scope.
  if (tag_in(token, kEndTag,
          (gumbo_tagset){TAG(CAPTION), TAG(TABLE), TAG(TBODY), TAG(TFOOT),
              TAG(THEAD), TAG(TR), TAG(TD), TAG(TH)})) {
    parser_add_parse_error(parser, token);
    if (has_an_element_in_table_scope(parser, token->v.end_tag)) {
      // No explicit reset_insertion_mode_appropriately call is made here;
      // close_current_select already performs it.
      close_current_select(parser);
      parser->_parser_state->_reprocess_current_token = true;
    } else {
      ignore_token(parser);
    }
    return false;
  }

  return handle_in_select(parser, token);
}
3575 
3576 // http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#parsing-main-intemplate
// Token handler for the "in template" insertion mode.
//
// Text-like tokens go to handle_in_body and head-level tags to
// handle_in_head.  Any other start tag replaces the current template
// insertion mode with one chosen from the tag and flags the token for
// reprocessing.
static bool handle_in_template(GumboParser* parser, GumboToken* token) {
  GumboParserState* state = parser->_parser_state;

  switch (token->type) {
    case GUMBO_TOKEN_WHITESPACE:
    case GUMBO_TOKEN_CHARACTER:
    case GUMBO_TOKEN_COMMENT:
    case GUMBO_TOKEN_NULL:
    case GUMBO_TOKEN_DOCTYPE:
      return handle_in_body(parser, token);
    default:
      break;
  }

  if (tag_in(token, kStartTag,
          (gumbo_tagset){TAG(BASE), TAG(BASEFONT), TAG(BGSOUND), TAG(LINK),
              TAG(META), TAG(NOFRAMES), TAG(SCRIPT), TAG(STYLE),
              TAG(TEMPLATE), TAG(TITLE)}) ||
      tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) {
    return handle_in_head(parser, token);
  }

  if (token->type == GUMBO_TOKEN_START_TAG) {
    // Pick the insertion mode that matches the kind of content the start
    // tag introduces; "in body" is the fallback for everything else.
    GumboInsertionMode next_mode = GUMBO_INSERTION_MODE_IN_BODY;
    if (tag_in(token, kStartTag,
            (gumbo_tagset){TAG(CAPTION), TAG(COLGROUP), TAG(TBODY),
                TAG(TFOOT), TAG(THEAD)})) {
      next_mode = GUMBO_INSERTION_MODE_IN_TABLE;
    } else if (tag_is(token, kStartTag, GUMBO_TAG_COL)) {
      next_mode = GUMBO_INSERTION_MODE_IN_COLUMN_GROUP;
    } else if (tag_is(token, kStartTag, GUMBO_TAG_TR)) {
      next_mode = GUMBO_INSERTION_MODE_IN_TABLE_BODY;
    } else if (tag_in(token, kStartTag, (gumbo_tagset){TAG(TD), TAG(TH)})) {
      next_mode = GUMBO_INSERTION_MODE_IN_ROW;
    }
    pop_template_insertion_mode(parser);
    push_template_insertion_mode(parser, next_mode);
    set_insertion_mode(parser, next_mode);
    state->_reprocess_current_token = true;
    return true;
  }

  if (token->type == GUMBO_TOKEN_END_TAG) {
    parser_add_parse_error(parser, token);
    ignore_token(parser);
    return false;
  }

  if (token->type == GUMBO_TOKEN_EOF) {
    if (!has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
      // No template is open: stop parsing.
      return true;
    }
    // A template is still open at EOF: record the error, pop up to and
    // including the <template>, and flag the EOF token for reprocessing.
    parser_add_parse_error(parser, token);
    const GumboNode* popped;
    do {
      popped = pop_current_node(parser);
    } while (!node_html_tag_is(popped, GUMBO_TAG_TEMPLATE));
    clear_active_formatting_elements(parser);
    pop_template_insertion_mode(parser);
    reset_insertion_mode_appropriately(parser);
    state->_reprocess_current_token = true;
    return false;
  }

  // No other token type is expected here.
  assert(0);
  return false;
}
3644 
3645 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-afterbody
// Token handler for the "after body" insertion mode.  Unexpected tokens are
// a parse error: the mode is switched back to "in body" and the token is
// flagged for reprocessing.
static bool handle_after_body(GumboParser* parser, GumboToken* token) {
  GumboParserState* state = parser->_parser_state;

  if (token->type == GUMBO_TOKEN_WHITESPACE ||
      tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
    return handle_in_body(parser, token);
  }

  if (token->type == GUMBO_TOKEN_COMMENT) {
    // Comments are appended to the output's root node rather than to the
    // current node.
    GumboNode* root = parser->_output->root;
    assert(root != NULL);
    append_comment_node(parser, root, token);
    return true;
  }

  if (token->type == GUMBO_TOKEN_DOCTYPE) {
    parser_add_parse_error(parser, token);
    ignore_token(parser);
    return false;
  }

  if (tag_is(token, kEndTag, GUMBO_TAG_HTML)) {
    // Fragment case: the closing </html> is ignored.
    if (is_fragment_parser(parser)) {
      parser_add_parse_error(parser, token);
      ignore_token(parser);
      return false;
    }
    set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_AFTER_BODY);
    // The bottom entry of the open-elements stack is asserted to be <html>;
    // its end is recorded from the current token.
    GumboNode* html_element = state->_open_elements.data[0];
    assert(node_html_tag_is(html_element, GUMBO_TAG_HTML));
    record_end_of_element(state->_current_token, &html_element->v.element);
    return true;
  }

  if (token->type == GUMBO_TOKEN_EOF) {
    return true;
  }

  parser_add_parse_error(parser, token);
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
  state->_reprocess_current_token = true;
  return false;
}
3681 
3682 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inframeset
// Token handler for the "in frameset" insertion mode.  Any token not matched
// by a case below is a parse error and is ignored.
static bool handle_in_frameset(GumboParser* parser, GumboToken* token) {
  switch (token->type) {
    case GUMBO_TOKEN_WHITESPACE:
      insert_text_token(parser, token);
      return true;
    case GUMBO_TOKEN_COMMENT:
      append_comment_node(parser, get_current_node(parser), token);
      return true;
    case GUMBO_TOKEN_DOCTYPE:
      parser_add_parse_error(parser, token);
      ignore_token(parser);
      return false;
    default:
      break;
  }

  if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
    return handle_in_body(parser, token);
  }

  if (tag_is(token, kStartTag, GUMBO_TAG_FRAMESET)) {
    insert_element_from_token(parser, token);
    return true;
  }

  if (tag_is(token, kEndTag, GUMBO_TAG_FRAMESET)) {
    // </frameset> while <html> is the current node is an error.
    if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_HTML)) {
      parser_add_parse_error(parser, token);
      ignore_token(parser);
      return false;
    }
    pop_current_node(parser);
    // Switch to "after frameset" once the outermost frameset has been
    // closed, except when parsing a fragment.
    if (!(is_fragment_parser(parser) ||
            node_html_tag_is(get_current_node(parser), GUMBO_TAG_FRAMESET))) {
      set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_FRAMESET);
    }
    return true;
  }

  if (tag_is(token, kStartTag, GUMBO_TAG_FRAME)) {
    // <frame> is inserted and popped straight away.
    insert_element_from_token(parser, token);
    pop_current_node(parser);
    acknowledge_self_closing_tag(parser);
    return true;
  }

  if (tag_is(token, kStartTag, GUMBO_TAG_NOFRAMES)) {
    return handle_in_head(parser, token);
  }

  if (token->type == GUMBO_TOKEN_EOF) {
    // EOF is only clean when <html> is the current node.
    if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_HTML)) {
      return true;
    }
    parser_add_parse_error(parser, token);
    return false;
  }

  parser_add_parse_error(parser, token);
  ignore_token(parser);
  return false;
}
3730 
3731 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-afterframeset
// Token handler for the "after frameset" insertion mode.  Any token not
// matched by a case below is a parse error and is ignored.
static bool handle_after_frameset(GumboParser* parser, GumboToken* token) {
  switch (token->type) {
    case GUMBO_TOKEN_WHITESPACE:
      insert_text_token(parser, token);
      return true;
    case GUMBO_TOKEN_COMMENT:
      append_comment_node(parser, get_current_node(parser), token);
      return true;
    case GUMBO_TOKEN_DOCTYPE:
      parser_add_parse_error(parser, token);
      ignore_token(parser);
      return false;
    default:
      break;
  }

  if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
    return handle_in_body(parser, token);
  }

  if (tag_is(token, kEndTag, GUMBO_TAG_HTML)) {
    // The bottom entry of the open-elements stack is asserted to be <html>;
    // its end is recorded from the current token before the mode changes.
    GumboParserState* state = parser->_parser_state;
    GumboNode* html_element = state->_open_elements.data[0];
    assert(node_html_tag_is(html_element, GUMBO_TAG_HTML));
    record_end_of_element(state->_current_token, &html_element->v.element);
    set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_AFTER_FRAMESET);
    return true;
  }

  if (tag_is(token, kStartTag, GUMBO_TAG_NOFRAMES)) {
    return handle_in_head(parser, token);
  }

  if (token->type == GUMBO_TOKEN_EOF) {
    return true;
  }

  parser_add_parse_error(parser, token);
  ignore_token(parser);
  return false;
}
3762 
3763 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#the-after-after-body-insertion-mode
// Token handler for the "after after body" insertion mode.  Unexpected
// tokens are a parse error: the mode is switched back to "in body" and the
// token is flagged for reprocessing.
static bool handle_after_after_body(GumboParser* parser, GumboToken* token) {
  switch (token->type) {
    case GUMBO_TOKEN_COMMENT:
      // Comments here are attached to the document node.
      append_comment_node(parser, get_document_node(parser), token);
      return true;
    case GUMBO_TOKEN_DOCTYPE:
    case GUMBO_TOKEN_WHITESPACE:
      return handle_in_body(parser, token);
    default:
      break;
  }

  if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
    return handle_in_body(parser, token);
  }

  if (token->type == GUMBO_TOKEN_EOF) {
    return true;
  }

  parser_add_parse_error(parser, token);
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
  parser->_parser_state->_reprocess_current_token = true;
  return false;
}
3781 
3782 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#the-after-after-frameset-insertion-mode
// Token handler for the "after after frameset" insertion mode.  Any token
// not matched by a case below is a parse error and is ignored.
static bool handle_after_after_frameset(
    GumboParser* parser, GumboToken* token) {
  switch (token->type) {
    case GUMBO_TOKEN_COMMENT:
      // Comments here are attached to the document node.
      append_comment_node(parser, get_document_node(parser), token);
      return true;
    case GUMBO_TOKEN_DOCTYPE:
    case GUMBO_TOKEN_WHITESPACE:
      return handle_in_body(parser, token);
    default:
      break;
  }

  if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
    return handle_in_body(parser, token);
  }

  if (token->type == GUMBO_TOKEN_EOF) {
    return true;
  }

  if (tag_is(token, kStartTag, GUMBO_TAG_NOFRAMES)) {
    return handle_in_head(parser, token);
  }

  parser_add_parse_error(parser, token);
  ignore_token(parser);
  return false;
}
3802 
3803 // Function pointers for each insertion mode.  Keep in sync with
3804 // insertion_mode.h.
3805 typedef bool (*TokenHandler)(GumboParser* parser, GumboToken* token);
3806 static const TokenHandler kTokenHandlers[] = {handle_initial,
3807     handle_before_html, handle_before_head, handle_in_head,
3808     handle_in_head_noscript, handle_after_head, handle_in_body, handle_text,
3809     handle_in_table, handle_in_table_text, handle_in_caption,
3810     handle_in_column_group, handle_in_table_body, handle_in_row, handle_in_cell,
3811     handle_in_select, handle_in_select_in_table, handle_in_template,
3812     handle_after_body, handle_in_frameset, handle_after_frameset,
3813     handle_after_after_body, handle_after_after_frameset};
3814 
// Dispatches the token to the handler registered in kTokenHandlers for the
// parser's current insertion mode and returns that handler's result.
static bool handle_html_content(GumboParser* parser, GumboToken* token) {
  const unsigned int mode_index =
      (unsigned int) parser->_parser_state->_insertion_mode;
  const TokenHandler handler = kTokenHandlers[mode_index];
  return handler(parser, token);
}
3819 
3820 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inforeign
// Token handler implementing the rules for parsing tokens in foreign
// content.
//
// Returns false when a parse error was recorded for the token and true
// otherwise.  An end tag that matches no foreign element on the stack is
// handed to handle_html_content, and that result is combined with any error
// recorded here.
static bool handle_in_foreign_content(GumboParser* parser, GumboToken* token) {
  gumbo_debug("Handling foreign content\n");
  switch (token->type) {
    case GUMBO_TOKEN_NULL:
      // NULL characters are replaced with kUtf8ReplacementChar before being
      // inserted as text.
      parser_add_parse_error(parser, token);
      token->v.character = kUtf8ReplacementChar;
      insert_text_token(parser, token);
      return false;
    case GUMBO_TOKEN_WHITESPACE:
      insert_text_token(parser, token);
      return true;
    case GUMBO_TOKEN_CDATA:
    case GUMBO_TOKEN_CHARACTER:
      insert_text_token(parser, token);
      set_frameset_not_ok(parser);
      return true;
    case GUMBO_TOKEN_COMMENT:
      append_comment_node(parser, get_current_node(parser), token);
      return true;
    case GUMBO_TOKEN_DOCTYPE:
      parser_add_parse_error(parser, token);
      ignore_token(parser);
      return false;
    default:
      // Fall through to the if-statements below.
      break;
  }
  // Order matters for these clauses.
  if (tag_in(token, kStartTag,
          (gumbo_tagset){TAG(B), TAG(BIG), TAG(BLOCKQUOTE), TAG(BODY), TAG(BR),
              TAG(CENTER), TAG(CODE), TAG(DD), TAG(DIV), TAG(DL), TAG(DT),
              TAG(EM), TAG(EMBED), TAG(H1), TAG(H2), TAG(H3), TAG(H4), TAG(H5),
              TAG(H6), TAG(HEAD), TAG(HR), TAG(I), TAG(IMG), TAG(LI),
              TAG(LISTING), TAG(MENU), TAG(META), TAG(NOBR), TAG(OL), TAG(P),
              TAG(PRE), TAG(RUBY), TAG(S), TAG(SMALL), TAG(SPAN), TAG(STRONG),
              TAG(STRIKE), TAG(SUB), TAG(SUP), TAG(TABLE), TAG(TT), TAG(U),
              TAG(UL), TAG(VAR)}) ||
      (tag_is(token, kStartTag, GUMBO_TAG_FONT) &&
          (token_has_attribute(token, "color") ||
              token_has_attribute(token, "face") ||
              token_has_attribute(token, "size")))) {
    /* Parse error */
    parser_add_parse_error(parser, token);

    /*
     * Fragment case: If the parser was originally created for the HTML
     * fragment parsing algorithm, then act as described in the "any other
     * start tag" entry below.
     */
    if (!is_fragment_parser(parser)) {
      // Pop until the current node is an integration point or an HTML
      // element, then flag the token for reprocessing.
      do {
        pop_current_node(parser);
      } while (!(is_mathml_integration_point(get_current_node(parser)) ||
                   is_html_integration_point(get_current_node(parser)) ||
                   get_current_node(parser)->v.element.tag_namespace ==
                       GUMBO_NAMESPACE_HTML));
      parser->_parser_state->_reprocess_current_token = true;
      return false;
    }

    assert(token->type == GUMBO_TOKEN_START_TAG);
  }

  if (token->type == GUMBO_TOKEN_START_TAG) {
    // The new element is inserted in the namespace of the adjusted current
    // node, after the namespace-specific attribute adjustments.
    const GumboNamespaceEnum current_namespace =
        get_adjusted_current_node(parser)->v.element.tag_namespace;
    if (current_namespace == GUMBO_NAMESPACE_MATHML) {
      adjust_mathml_attributes(parser, token);
    }
    if (current_namespace == GUMBO_NAMESPACE_SVG) {
      // Tag adjustment is left to the gumbo_normalize_svg_tagname helper
      // function.
      adjust_svg_attributes(parser, token);
    }
    adjust_foreign_attributes(parser, token);
    insert_foreign_element(parser, token, current_namespace);
    if (token->v.start_tag.is_self_closing) {
      pop_current_node(parser);
      acknowledge_self_closing_tag(parser);
    }
    return true;
    // </script> tags are handled like any other end tag, putting the script's
    // text into a text node child and closing the current node.
  } else {
    assert(token->type == GUMBO_TOKEN_END_TAG);
    GumboNode* node = get_current_node(parser);
    assert(node != NULL);
    // Tag names are compared using the original source text of the token
    // and of each node, case-insensitively.
    GumboStringPiece token_tagname = token->original_text;
    GumboStringPiece node_tagname = node->v.element.original_tag;
    gumbo_tag_from_original_text(&token_tagname);
    gumbo_tag_from_original_text(&node_tagname);

    bool is_success = true;
    if (!gumbo_string_equals_ignore_case(&node_tagname, &token_tagname)) {
      parser_add_parse_error(parser, token);
      is_success = false;
    }
    int i = parser->_parser_state->_open_elements.length;
    for (--i; i > 0;) {
      // Here we move up the stack until we find an HTML element (in which
      // case we do nothing) or we find the element that we're about to
      // close (in which case we pop everything we've seen until that
      // point.)
      // The "%.*s" precision argument must be an int, so the length is cast
      // explicitly before being passed through the variadic call.
      gumbo_debug("Foreign %.*s node at %d.\n", (int) node_tagname.length,
          node_tagname.data, i);
      if (gumbo_string_equals_ignore_case(&node_tagname, &token_tagname)) {
        gumbo_debug("Matches.\n");
        while (pop_current_node(parser) != node) {
          // Pop all the nodes below the current one.  Node is guaranteed to
          // be an element on the stack of open elements (set below), so
          // this loop is guaranteed to terminate.
        }
        return is_success;
      }
      --i;
      node = parser->_parser_state->_open_elements.data[i];
      if (node->v.element.tag_namespace == GUMBO_NAMESPACE_HTML) {
        // Must break before gumbo_tag_from_original_text to avoid passing
        // parser-inserted nodes through.
        break;
      }
      node_tagname = node->v.element.original_tag;
      gumbo_tag_from_original_text(&node_tagname);
    }
    assert(node->v.element.tag_namespace == GUMBO_NAMESPACE_HTML);
    // We can't call handle_token directly because the current node is still in
    // the SVG namespace, so it would re-enter this and result in infinite
    // recursion.
    return handle_html_content(parser, token) && is_success;
  }
}
3952 
3953 // http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#tree-construction
handle_token(GumboParser * parser,GumboToken * token)3954 static bool handle_token(GumboParser* parser, GumboToken* token) {
3955   if (parser->_parser_state->_ignore_next_linefeed &&
3956       token->type == GUMBO_TOKEN_WHITESPACE && token->v.character == '\n') {
3957     parser->_parser_state->_ignore_next_linefeed = false;
3958     ignore_token(parser);
3959     return true;
3960   }
3961   // This needs to be reset both here and in the conditional above to catch both
3962   // the case where the next token is not whitespace (so we don't ignore
3963   // whitespace in the middle of <pre> tags) and where there are multiple
3964   // whitespace tokens (so we don't ignore the second one).
3965   parser->_parser_state->_ignore_next_linefeed = false;
3966 
3967   if (tag_is(token, kEndTag, GUMBO_TAG_BODY)) {
3968     parser->_parser_state->_closed_body_tag = true;
3969   }
3970   if (tag_is(token, kEndTag, GUMBO_TAG_HTML)) {
3971     parser->_parser_state->_closed_html_tag = true;
3972   }
3973 
3974   const GumboNode* current_node = get_adjusted_current_node(parser);
3975   assert(!current_node || current_node->type == GUMBO_NODE_ELEMENT ||
3976          current_node->type == GUMBO_NODE_TEMPLATE);
3977   if (current_node) {
3978     gumbo_debug("Current node: <%s>.\n",
3979         gumbo_normalized_tagname(current_node->v.element.tag));
3980   }
3981   if (!current_node ||
3982       current_node->v.element.tag_namespace == GUMBO_NAMESPACE_HTML ||
3983       (is_mathml_integration_point(current_node) &&
3984           (token->type == GUMBO_TOKEN_CHARACTER ||
3985               token->type == GUMBO_TOKEN_WHITESPACE ||
3986               token->type == GUMBO_TOKEN_NULL ||
3987               (token->type == GUMBO_TOKEN_START_TAG &&
3988                   !tag_in(token, kStartTag,
3989                       (gumbo_tagset){TAG(MGLYPH), TAG(MALIGNMARK)})))) ||
3990       (current_node->v.element.tag_namespace == GUMBO_NAMESPACE_MATHML &&
3991           node_qualified_tag_is(
3992               current_node, GUMBO_NAMESPACE_MATHML, GUMBO_TAG_ANNOTATION_XML) &&
3993           tag_is(token, kStartTag, GUMBO_TAG_SVG)) ||
3994       (is_html_integration_point(current_node) &&
3995           (token->type == GUMBO_TOKEN_START_TAG ||
3996               token->type == GUMBO_TOKEN_CHARACTER ||
3997               token->type == GUMBO_TOKEN_NULL ||
3998               token->type == GUMBO_TOKEN_WHITESPACE)) ||
3999       token->type == GUMBO_TOKEN_EOF) {
4000     return handle_html_content(parser, token);
4001   } else {
4002     return handle_in_foreign_content(parser, token);
4003   }
4004 }
4005 
// Prepares `parser` for fragment parsing: records a context element of tag
// `fragment_ctx` in namespace `fragment_namespace`, selects the tokenizer
// state that matches that context, inserts the implied <html> root and resets
// the insertion mode.  `fragment_ctx` must not be GUMBO_TAG_LAST.  The step
// numbers in the comments are the ones the original code used.
static void fragment_parser_init(GumboParser* parser, GumboTag fragment_ctx,
    GumboNamespaceEnum fragment_namespace) {
  assert(fragment_ctx != GUMBO_TAG_LAST);

  // Step 3: build the context element and remember it on the parser state.
  GumboNode* context = create_element(parser, fragment_ctx);
  parser->_parser_state->_fragment_ctx = context;
  context->v.element.tag_namespace = fragment_namespace;

  // Step 4: pick the initial tokenizer state.  Only HTML-namespace contexts
  // can change it; every other namespace starts in the DATA state.
  if (fragment_namespace == GUMBO_NAMESPACE_HTML) {
    if (fragment_ctx == GUMBO_TAG_TITLE || fragment_ctx == GUMBO_TAG_TEXTAREA) {
      gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA);
    } else if (fragment_ctx == GUMBO_TAG_STYLE ||
               fragment_ctx == GUMBO_TAG_XMP ||
               fragment_ctx == GUMBO_TAG_IFRAME ||
               fragment_ctx == GUMBO_TAG_NOEMBED ||
               fragment_ctx == GUMBO_TAG_NOFRAMES) {
      gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT);
    } else if (fragment_ctx == GUMBO_TAG_SCRIPT) {
      gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
    } else if (fragment_ctx == GUMBO_TAG_PLAINTEXT) {
      gumbo_tokenizer_set_state(parser, GUMBO_LEX_PLAINTEXT);
    }
    // <noscript> (scripting is disabled in Gumbo) and every other tag leave
    // the tokenizer in the default data state.
  }

  // Steps 5-7: the implied <html> element becomes the output root.
  GumboNode* html_root = insert_element_of_tag_type(
      parser, GUMBO_TAG_HTML, GUMBO_INSERTION_IMPLIED);
  parser->_output->root = html_root;

  // Step 8: a <template> context starts with "in template" on the stack of
  // template insertion modes.
  if (fragment_ctx == GUMBO_TAG_TEMPLATE) {
    push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TEMPLATE);
  }

  // Step 10.
  reset_insertion_mode_appropriately(parser);
}
4065 
gumbo_parse(const char * buffer)4066 GumboOutput* gumbo_parse(const char* buffer) {
4067   return gumbo_parse_with_options(
4068       &kGumboDefaultOptions, buffer, strlen(buffer));
4069 }
4070 
gumbo_parse_with_options(const GumboOptions * options,const char * buffer,size_t length)4071 GumboOutput* gumbo_parse_with_options(
4072     const GumboOptions* options, const char* buffer, size_t length) {
4073   GumboParser parser;
4074   parser._options = options;
4075   output_init(&parser);
4076   gumbo_tokenizer_state_init(&parser, buffer, length);
4077   parser_state_init(&parser);
4078 
4079   if (options->fragment_context != GUMBO_TAG_LAST) {
4080     fragment_parser_init(
4081         &parser, options->fragment_context, options->fragment_namespace);
4082   }
4083 
4084   GumboParserState* state = parser._parser_state;
4085   gumbo_debug("Parsing %.*s.\n", length, buffer);
4086 
4087   // Sanity check so that infinite loops die with an assertion failure instead
4088   // of hanging the process before we ever get an error.
4089   int loop_count = 0;
4090 
4091   GumboToken token;
4092   bool has_error = false;
4093 
4094   do {
4095     if (state->_reprocess_current_token) {
4096       state->_reprocess_current_token = false;
4097     } else {
4098       GumboNode* current_node = get_current_node(&parser);
4099       gumbo_tokenizer_set_is_current_node_foreign(&parser,
4100           current_node &&
4101               current_node->v.element.tag_namespace != GUMBO_NAMESPACE_HTML);
4102       has_error = !gumbo_lex(&parser, &token) || has_error;
4103     }
4104     const char* token_type = "text";
4105     switch (token.type) {
4106       case GUMBO_TOKEN_DOCTYPE:
4107         token_type = "doctype";
4108         break;
4109       case GUMBO_TOKEN_START_TAG:
4110         token_type = gumbo_normalized_tagname(token.v.start_tag.tag);
4111         break;
4112       case GUMBO_TOKEN_END_TAG:
4113         token_type = gumbo_normalized_tagname(token.v.end_tag);
4114         break;
4115       case GUMBO_TOKEN_COMMENT:
4116         token_type = "comment";
4117         break;
4118       default:
4119         break;
4120     }
4121     gumbo_debug("Handling %s token @%d:%d in state %d.\n", (char*) token_type,
4122         token.position.line, token.position.column, state->_insertion_mode);
4123 
4124     state->_current_token = &token;
4125     state->_self_closing_flag_acknowledged =
4126         !(token.type == GUMBO_TOKEN_START_TAG &&
4127             token.v.start_tag.is_self_closing);
4128 
4129     has_error = !handle_token(&parser, &token) || has_error;
4130 
4131     // Check for memory leaks when ownership is transferred from start tag
4132     // tokens to nodes.
4133     assert(state->_reprocess_current_token ||
4134            token.type != GUMBO_TOKEN_START_TAG ||
4135            token.v.start_tag.attributes.data == NULL);
4136 
4137     if (!state->_self_closing_flag_acknowledged) {
4138       GumboError* error = parser_add_parse_error(&parser, &token);
4139       if (error) {
4140         error->type = GUMBO_ERR_UNACKNOWLEDGED_SELF_CLOSING_TAG;
4141       }
4142     }
4143 
4144     ++loop_count;
4145     assert(loop_count < 1000000000);
4146 
4147   } while ((token.type != GUMBO_TOKEN_EOF || state->_reprocess_current_token) &&
4148            !(options->stop_on_first_error && has_error));
4149 
4150   finish_parsing(&parser);
4151   // For API uniformity reasons, if the doctype still has nulls, convert them to
4152   // empty strings.
4153   GumboDocument* doc_type = &parser._output->document->v.document;
4154   if (doc_type->name == NULL) {
4155     doc_type->name = gumbo_copy_stringz(&parser, "");
4156   }
4157   if (doc_type->public_identifier == NULL) {
4158     doc_type->public_identifier = gumbo_copy_stringz(&parser, "");
4159   }
4160   if (doc_type->system_identifier == NULL) {
4161     doc_type->system_identifier = gumbo_copy_stringz(&parser, "");
4162   }
4163 
4164   parser_state_destroy(&parser);
4165   gumbo_tokenizer_state_destroy(&parser);
4166   return parser._output;
4167 }
4168 
// Destroys `node` via destroy_node.  That helper takes a GumboParser, and the
// allocator comes along with the options object, so a throwaway parser that
// carries nothing but `options` is handed to it.
void gumbo_destroy_node(GumboOptions* options, GumboNode* node) {
  GumboParser options_carrier = {._options = options};
  destroy_node(&options_carrier, node);
}
4176 
// Releases everything owned by `output`: the document tree, each recorded
// error, the error vector's storage and finally the output object itself.
// The allocator comes along with the options object, so a throwaway parser
// that carries nothing but `options` is passed to the destroy routines.
void gumbo_destroy_output(const GumboOptions* options, GumboOutput* output) {
  GumboParser options_carrier = {._options = options};

  destroy_node(&options_carrier, output->document);

  unsigned int error_index = 0;
  while (error_index < output->errors.length) {
    gumbo_error_destroy(&options_carrier, output->errors.data[error_index]);
    ++error_index;
  }
  gumbo_vector_destroy(&options_carrier, &output->errors);

  gumbo_parser_deallocate(&options_carrier, output);
}
4189