1 // Copyright 2010 Google Inc. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 //
15 // Author: jdtang@google.com (Jonathan Tang)
16
17 #include <assert.h>
18 #include <ctype.h>
19 #include <stdarg.h>
20 #include <stdlib.h>
21 #include <string.h>
22 #include <strings.h>
23
24 #include "attribute.h"
25 #include "error.h"
26 #include "gumbo.h"
27 #include "insertion_mode.h"
28 #include "parser.h"
29 #include "tokenizer.h"
30 #include "tokenizer_states.h"
31 #include "utf8.h"
32 #include "util.h"
33 #include "vector.h"
34
// Evaluates its argument as a void expression so that the compiler does not
// report it as unused.
#define AVOID_UNUSED_VARIABLE_WARNING(i) (void)(i)

// Brace initializer built from a string literal: the literal itself followed
// by its length, not counting the terminating NUL. Used below to initialize
// GumboStringPiece constants.
#define GUMBO_STRING(literal) \
  { literal, sizeof(literal) - 1 }
// Zero-length entry placed at the end of the GumboStringPiece arrays that are
// scanned until a zero length is found.
#define TERMINATOR \
  { "", 0 }
41
42 typedef char gumbo_tagset[GUMBO_TAG_LAST];
43 #define TAG(tag) [GUMBO_TAG_##tag] = (1 << GUMBO_NAMESPACE_HTML)
44 #define TAG_SVG(tag) [GUMBO_TAG_##tag] = (1 << GUMBO_NAMESPACE_SVG)
45 #define TAG_MATHML(tag) [GUMBO_TAG_##tag] = (1 << GUMBO_NAMESPACE_MATHML)
46
47 #define TAGSET_INCLUDES(tagset, namespace, tag) \
48 (tag < GUMBO_TAG_LAST && tagset[(int) tag] == (1 << (int) namespace))
49
// Forward declarations for a few helpers that are used before the point where
// they are defined, because no definition order satisfies every caller.
static bool node_html_tag_is(const GumboNode*, GumboTag);
static GumboInsertionMode get_current_template_insertion_mode(
    const GumboParser*);
static bool handle_in_template(GumboParser*, GumboToken*);
static void destroy_node(GumboParser*, GumboNode*);
57
// Allocation callback that forwards to malloc(). The first argument is not
// needed by this implementation and is ignored.
static void* malloc_wrapper(void* unused, size_t size) {
  (void) unused;
  void* block = malloc(size);
  return block;
}
59
// Deallocation callback that forwards to free(). The first argument is not
// needed by this implementation and is ignored.
static void free_wrapper(void* unused, void* ptr) {
  (void) unused;
  free(ptr);
}
61
// Default parse options. The first two values are the malloc()/free() wrappers
// defined above; the rest are given positionally.
// NOTE(review): the meaning of each positional value (NULL, 8, false, -1,
// GUMBO_TAG_LAST, GUMBO_NAMESPACE_HTML) is fixed by the field order of
// GumboOptions, which is declared outside this file -- check that declaration
// before changing or reordering anything here.
const GumboOptions kGumboDefaultOptions = {&malloc_wrapper, &free_wrapper, NULL,
    8, false, -1, GUMBO_TAG_LAST, GUMBO_NAMESPACE_HTML};
64
// Well-known doctype name, public identifiers and system identifiers, each
// stored as a string literal plus its length.
// NOTE(review): the code that compares a document's doctype against these is
// outside this section.
static const GumboStringPiece kDoctypeHtml = GUMBO_STRING("html");
static const GumboStringPiece kPublicIdHtml4_0 =
    GUMBO_STRING("-//W3C//DTD HTML 4.0//EN");
static const GumboStringPiece kPublicIdHtml4_01 =
    GUMBO_STRING("-//W3C//DTD HTML 4.01//EN");
static const GumboStringPiece kPublicIdXhtml1_0 =
    GUMBO_STRING("-//W3C//DTD XHTML 1.0 Strict//EN");
static const GumboStringPiece kPublicIdXhtml1_1 =
    GUMBO_STRING("-//W3C//DTD XHTML 1.1//EN");
static const GumboStringPiece kSystemIdRecHtml4_0 =
    GUMBO_STRING("http://www.w3.org/TR/REC-html40/strict.dtd");
static const GumboStringPiece kSystemIdHtml4 =
    GUMBO_STRING("http://www.w3.org/TR/html4/strict.dtd");
static const GumboStringPiece kSystemIdXhtmlStrict1_1 =
    GUMBO_STRING("http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd");
static const GumboStringPiece kSystemIdXhtml1_1 =
    GUMBO_STRING("http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd");
static const GumboStringPiece kSystemIdLegacyCompat =
    GUMBO_STRING("about:legacy-compat");
84
85 // The doctype arrays have an explicit terminator because we want to pass them
86 // to a helper function, and passing them as a pointer discards sizeof
87 // information. The SVG arrays are used only by one-off functions, and so loops
88 // over them use sizeof directly instead of a terminator.
89
90 static const GumboStringPiece kQuirksModePublicIdPrefixes[] = {
91 GUMBO_STRING("+//Silmaril//dtd html Pro v0r11 19970101//"),
92 GUMBO_STRING("-//AdvaSoft Ltd//DTD HTML 3.0 asWedit + extensions//"),
93 GUMBO_STRING("-//AS//DTD HTML 3.0 asWedit + extensions//"),
94 GUMBO_STRING("-//IETF//DTD HTML 2.0 Level 1//"),
95 GUMBO_STRING("-//IETF//DTD HTML 2.0 Level 2//"),
96 GUMBO_STRING("-//IETF//DTD HTML 2.0 Strict Level 1//"),
97 GUMBO_STRING("-//IETF//DTD HTML 2.0 Strict Level 2//"),
98 GUMBO_STRING("-//IETF//DTD HTML 2.0 Strict//"),
99 GUMBO_STRING("-//IETF//DTD HTML 2.0//"),
100 GUMBO_STRING("-//IETF//DTD HTML 2.1E//"),
101 GUMBO_STRING("-//IETF//DTD HTML 3.0//"),
102 GUMBO_STRING("-//IETF//DTD HTML 3.2 Final//"),
103 GUMBO_STRING("-//IETF//DTD HTML 3.2//"),
104 GUMBO_STRING("-//IETF//DTD HTML 3//"),
105 GUMBO_STRING("-//IETF//DTD HTML Level 0//"),
106 GUMBO_STRING("-//IETF//DTD HTML Level 1//"),
107 GUMBO_STRING("-//IETF//DTD HTML Level 2//"),
108 GUMBO_STRING("-//IETF//DTD HTML Level 3//"),
109 GUMBO_STRING("-//IETF//DTD HTML Strict Level 0//"),
110 GUMBO_STRING("-//IETF//DTD HTML Strict Level 1//"),
111 GUMBO_STRING("-//IETF//DTD HTML Strict Level 2//"),
112 GUMBO_STRING("-//IETF//DTD HTML Strict Level 3//"),
113 GUMBO_STRING("-//IETF//DTD HTML Strict//"),
114 GUMBO_STRING("-//IETF//DTD HTML//"),
115 GUMBO_STRING("-//Metrius//DTD Metrius Presentational//"),
116 GUMBO_STRING("-//Microsoft//DTD Internet Explorer 2.0 HTML Strict//"),
117 GUMBO_STRING("-//Microsoft//DTD Internet Explorer 2.0 HTML//"),
118 GUMBO_STRING("-//Microsoft//DTD Internet Explorer 2.0 Tables//"),
119 GUMBO_STRING("-//Microsoft//DTD Internet Explorer 3.0 HTML Strict//"),
120 GUMBO_STRING("-//Microsoft//DTD Internet Explorer 3.0 HTML//"),
121 GUMBO_STRING("-//Microsoft//DTD Internet Explorer 3.0 Tables//"),
122 GUMBO_STRING("-//Netscape Comm. Corp.//DTD HTML//"),
123 GUMBO_STRING("-//Netscape Comm. Corp.//DTD Strict HTML//"),
124 GUMBO_STRING("-//O'Reilly and Associates//DTD HTML 2.0//"),
125 GUMBO_STRING("-//O'Reilly and Associates//DTD HTML Extended 1.0//"),
126 GUMBO_STRING("-//O'Reilly and Associates//DTD HTML Extended Relaxed 1.0//"),
127 GUMBO_STRING(
128 "-//SoftQuad Software//DTD HoTMetaL PRO 6.0::19990601::)"
129 "extensions to HTML 4.0//"),
130 GUMBO_STRING(
131 "-//SoftQuad//DTD HoTMetaL PRO 4.0::19971010::"
132 "extensions to HTML 4.0//"),
133 GUMBO_STRING("-//Spyglass//DTD HTML 2.0 Extended//"),
134 GUMBO_STRING("-//SQ//DTD HTML 2.0 HoTMetaL + extensions//"),
135 GUMBO_STRING("-//Sun Microsystems Corp.//DTD HotJava HTML//"),
136 GUMBO_STRING("-//Sun Microsystems Corp.//DTD HotJava Strict HTML//"),
137 GUMBO_STRING("-//W3C//DTD HTML 3 1995-03-24//"),
138 GUMBO_STRING("-//W3C//DTD HTML 3.2 Draft//"),
139 GUMBO_STRING("-//W3C//DTD HTML 3.2 Final//"),
140 GUMBO_STRING("-//W3C//DTD HTML 3.2//"),
141 GUMBO_STRING("-//W3C//DTD HTML 3.2S Draft//"),
142 GUMBO_STRING("-//W3C//DTD HTML 4.0 Frameset//"),
143 GUMBO_STRING("-//W3C//DTD HTML 4.0 Transitional//"),
144 GUMBO_STRING("-//W3C//DTD HTML Experimental 19960712//"),
145 GUMBO_STRING("-//W3C//DTD HTML Experimental 970421//"),
146 GUMBO_STRING("-//W3C//DTD W3 HTML//"),
147 GUMBO_STRING("-//W3O//DTD W3 HTML 3.0//"),
148 GUMBO_STRING("-//WebTechs//DTD Mozilla HTML 2.0//"),
149 GUMBO_STRING("-//WebTechs//DTD Mozilla HTML//"), TERMINATOR};
150
// Remaining doctype lookup tables. Each one ends with TERMINATOR. Going by
// their names, the "ExactMatches" tables hold whole identifiers and the
// "Prefixes" tables hold leading substrings.
// NOTE(review): the code that consults these tables is outside this section.

// Public identifiers listed as whole strings for quirks mode.
static const GumboStringPiece kQuirksModePublicIdExactMatches[] = {
    GUMBO_STRING("-//W3O//DTD W3 HTML Strict 3.0//EN//"),
    GUMBO_STRING("-/W3C/DTD HTML 4.0 Transitional/EN"), GUMBO_STRING("HTML"),
    TERMINATOR};

// System identifiers listed as whole strings for quirks mode.
static const GumboStringPiece kQuirksModeSystemIdExactMatches[] = {
    GUMBO_STRING("http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"),
    TERMINATOR};

// Public-identifier prefixes listed for limited-quirks mode.
static const GumboStringPiece kLimitedQuirksPublicIdPrefixes[] = {
    GUMBO_STRING("-//W3C//DTD XHTML 1.0 Frameset//"),
    GUMBO_STRING("-//W3C//DTD XHTML 1.0 Transitional//"), TERMINATOR};

// Public-identifier prefixes whose treatment, per the table name, also
// depends on the system identifier.
static const GumboStringPiece kLimitedQuirksRequiresSystemIdPublicIdPrefixes[] =
    {GUMBO_STRING("-//W3C//DTD HTML 4.01 Frameset//"),
        GUMBO_STRING("-//W3C//DTD HTML 4.01 Transitional//"), TERMINATOR};
167
// Namespace URIs for XHTML, SVG and MathML, in that order.
// Indexed by GumboNamespaceEnum; keep in sync with that.
static const char* kLegalXmlns[] = {"http://www.w3.org/1999/xhtml",
    "http://www.w3.org/2000/svg", "http://www.w3.org/1998/Math/MathML"};
171
// One row of a name-replacement table: the spelling to look for (`from`) and
// the spelling to substitute (`to`).
typedef struct _ReplacementEntry {
  const GumboStringPiece from;
  const GumboStringPiece to;
} ReplacementEntry;

// Builds a ReplacementEntry initializer from two string literals.
#define REPLACEMENT_ENTRY(from, to) \
  { GUMBO_STRING(from), GUMBO_STRING(to) }
179
// Static data for SVG attribute replacements: each row maps the all-lowercase
// spelling of an attribute name to its mixed-case SVG spelling.
// https://html.spec.whatwg.org/multipage/syntax.html#creating-and-inserting-nodes
// This array has no terminator; loops over it use sizeof.
// NOTE(review): the commented-out rows are disabled on purpose in this table;
// the reason is not recorded here -- confirm against the spec before
// re-enabling or deleting them.
static const ReplacementEntry kSvgAttributeReplacements[] = {
    REPLACEMENT_ENTRY("attributename", "attributeName"),
    REPLACEMENT_ENTRY("attributetype", "attributeType"),
    REPLACEMENT_ENTRY("basefrequency", "baseFrequency"),
    REPLACEMENT_ENTRY("baseprofile", "baseProfile"),
    REPLACEMENT_ENTRY("calcmode", "calcMode"),
    REPLACEMENT_ENTRY("clippathunits", "clipPathUnits"),
    // REPLACEMENT_ENTRY("contentscripttype", "contentScriptType"),
    // REPLACEMENT_ENTRY("contentstyletype", "contentStyleType"),
    REPLACEMENT_ENTRY("diffuseconstant", "diffuseConstant"),
    REPLACEMENT_ENTRY("edgemode", "edgeMode"),
    // REPLACEMENT_ENTRY("externalresourcesrequired",
    // "externalResourcesRequired"),
    // REPLACEMENT_ENTRY("filterres", "filterRes"),
    REPLACEMENT_ENTRY("filterunits", "filterUnits"),
    REPLACEMENT_ENTRY("glyphref", "glyphRef"),
    REPLACEMENT_ENTRY("gradienttransform", "gradientTransform"),
    REPLACEMENT_ENTRY("gradientunits", "gradientUnits"),
    REPLACEMENT_ENTRY("kernelmatrix", "kernelMatrix"),
    REPLACEMENT_ENTRY("kernelunitlength", "kernelUnitLength"),
    REPLACEMENT_ENTRY("keypoints", "keyPoints"),
    REPLACEMENT_ENTRY("keysplines", "keySplines"),
    REPLACEMENT_ENTRY("keytimes", "keyTimes"),
    REPLACEMENT_ENTRY("lengthadjust", "lengthAdjust"),
    REPLACEMENT_ENTRY("limitingconeangle", "limitingConeAngle"),
    REPLACEMENT_ENTRY("markerheight", "markerHeight"),
    REPLACEMENT_ENTRY("markerunits", "markerUnits"),
    REPLACEMENT_ENTRY("markerwidth", "markerWidth"),
    REPLACEMENT_ENTRY("maskcontentunits", "maskContentUnits"),
    REPLACEMENT_ENTRY("maskunits", "maskUnits"),
    REPLACEMENT_ENTRY("numoctaves", "numOctaves"),
    REPLACEMENT_ENTRY("pathlength", "pathLength"),
    REPLACEMENT_ENTRY("patterncontentunits", "patternContentUnits"),
    REPLACEMENT_ENTRY("patterntransform", "patternTransform"),
    REPLACEMENT_ENTRY("patternunits", "patternUnits"),
    REPLACEMENT_ENTRY("pointsatx", "pointsAtX"),
    REPLACEMENT_ENTRY("pointsaty", "pointsAtY"),
    REPLACEMENT_ENTRY("pointsatz", "pointsAtZ"),
    REPLACEMENT_ENTRY("preservealpha", "preserveAlpha"),
    REPLACEMENT_ENTRY("preserveaspectratio", "preserveAspectRatio"),
    REPLACEMENT_ENTRY("primitiveunits", "primitiveUnits"),
    REPLACEMENT_ENTRY("refx", "refX"), REPLACEMENT_ENTRY("refy", "refY"),
    REPLACEMENT_ENTRY("repeatcount", "repeatCount"),
    REPLACEMENT_ENTRY("repeatdur", "repeatDur"),
    REPLACEMENT_ENTRY("requiredextensions", "requiredExtensions"),
    REPLACEMENT_ENTRY("requiredfeatures", "requiredFeatures"),
    REPLACEMENT_ENTRY("specularconstant", "specularConstant"),
    REPLACEMENT_ENTRY("specularexponent", "specularExponent"),
    REPLACEMENT_ENTRY("spreadmethod", "spreadMethod"),
    REPLACEMENT_ENTRY("startoffset", "startOffset"),
    REPLACEMENT_ENTRY("stddeviation", "stdDeviation"),
    REPLACEMENT_ENTRY("stitchtiles", "stitchTiles"),
    REPLACEMENT_ENTRY("surfacescale", "surfaceScale"),
    REPLACEMENT_ENTRY("systemlanguage", "systemLanguage"),
    REPLACEMENT_ENTRY("tablevalues", "tableValues"),
    REPLACEMENT_ENTRY("targetx", "targetX"),
    REPLACEMENT_ENTRY("targety", "targetY"),
    REPLACEMENT_ENTRY("textlength", "textLength"),
    REPLACEMENT_ENTRY("viewbox", "viewBox"),
    REPLACEMENT_ENTRY("viewtarget", "viewTarget"),
    REPLACEMENT_ENTRY("xchannelselector", "xChannelSelector"),
    REPLACEMENT_ENTRY("ychannelselector", "yChannelSelector"),
    REPLACEMENT_ENTRY("zoomandpan", "zoomAndPan"),
};
246
// Static data for SVG tag-name replacements: each row maps the all-lowercase
// spelling of an element name to its mixed-case SVG spelling. This array has
// no terminator; loops over it use sizeof.
static const ReplacementEntry kSvgTagReplacements[] = {
    REPLACEMENT_ENTRY("altglyph", "altGlyph"),
    REPLACEMENT_ENTRY("altglyphdef", "altGlyphDef"),
    REPLACEMENT_ENTRY("altglyphitem", "altGlyphItem"),
    REPLACEMENT_ENTRY("animatecolor", "animateColor"),
    REPLACEMENT_ENTRY("animatemotion", "animateMotion"),
    REPLACEMENT_ENTRY("animatetransform", "animateTransform"),
    REPLACEMENT_ENTRY("clippath", "clipPath"),
    REPLACEMENT_ENTRY("feblend", "feBlend"),
    REPLACEMENT_ENTRY("fecolormatrix", "feColorMatrix"),
    REPLACEMENT_ENTRY("fecomponenttransfer", "feComponentTransfer"),
    REPLACEMENT_ENTRY("fecomposite", "feComposite"),
    REPLACEMENT_ENTRY("feconvolvematrix", "feConvolveMatrix"),
    REPLACEMENT_ENTRY("fediffuselighting", "feDiffuseLighting"),
    REPLACEMENT_ENTRY("fedisplacementmap", "feDisplacementMap"),
    REPLACEMENT_ENTRY("fedistantlight", "feDistantLight"),
    REPLACEMENT_ENTRY("feflood", "feFlood"),
    REPLACEMENT_ENTRY("fefunca", "feFuncA"),
    REPLACEMENT_ENTRY("fefuncb", "feFuncB"),
    REPLACEMENT_ENTRY("fefuncg", "feFuncG"),
    REPLACEMENT_ENTRY("fefuncr", "feFuncR"),
    REPLACEMENT_ENTRY("fegaussianblur", "feGaussianBlur"),
    REPLACEMENT_ENTRY("feimage", "feImage"),
    REPLACEMENT_ENTRY("femerge", "feMerge"),
    REPLACEMENT_ENTRY("femergenode", "feMergeNode"),
    REPLACEMENT_ENTRY("femorphology", "feMorphology"),
    REPLACEMENT_ENTRY("feoffset", "feOffset"),
    REPLACEMENT_ENTRY("fepointlight", "fePointLight"),
    REPLACEMENT_ENTRY("fespecularlighting", "feSpecularLighting"),
    REPLACEMENT_ENTRY("fespotlight", "feSpotLight"),
    REPLACEMENT_ENTRY("fetile", "feTile"),
    REPLACEMENT_ENTRY("feturbulence", "feTurbulence"),
    REPLACEMENT_ENTRY("foreignobject", "foreignObject"),
    REPLACEMENT_ENTRY("glyphref", "glyphRef"),
    REPLACEMENT_ENTRY("lineargradient", "linearGradient"),
    REPLACEMENT_ENTRY("radialgradient", "radialGradient"),
    REPLACEMENT_ENTRY("textpath", "textPath"),
};
285
// One row of the foreign-attribute table: the attribute name as written
// (`from`), the local name to use instead, and the attribute namespace that
// goes with it.
typedef struct _NamespacedAttributeReplacement {
  const char* from;
  const char* local_name;
  const GumboAttributeNamespaceEnum attr_namespace;
} NamespacedAttributeReplacement;
291
292 static const NamespacedAttributeReplacement kForeignAttributeReplacements[] = {
293 {"xlink:actuate", "actuate", GUMBO_ATTR_NAMESPACE_XLINK},
294 {"xlink:actuate", "actuate", GUMBO_ATTR_NAMESPACE_XLINK},
295 {"xlink:href", "href", GUMBO_ATTR_NAMESPACE_XLINK},
296 {"xlink:role", "role", GUMBO_ATTR_NAMESPACE_XLINK},
297 {"xlink:show", "show", GUMBO_ATTR_NAMESPACE_XLINK},
298 {"xlink:title", "title", GUMBO_ATTR_NAMESPACE_XLINK},
299 {"xlink:type", "type", GUMBO_ATTR_NAMESPACE_XLINK},
300 {"xml:base", "base", GUMBO_ATTR_NAMESPACE_XML},
301 {"xml:lang", "lang", GUMBO_ATTR_NAMESPACE_XML},
302 {"xml:space", "space", GUMBO_ATTR_NAMESPACE_XML},
303 {"xmlns", "xmlns", GUMBO_ATTR_NAMESPACE_XMLNS},
304 {"xmlns:xlink", "xlink", GUMBO_ATTR_NAMESPACE_XMLNS},
305 };
306
// The "scope marker" for the list of active formatting elements. We use a
// pointer to this as a generic marker element, since the particular element
// scope doesn't matter. Only its address is meaningful; the node itself is
// never filled in here.
static const GumboNode kActiveFormattingScopeMarker;

// The tag_is and tag_in functions use true and false to denote start and end
// tags, but for readability, we define constants for them here.
static const bool kStartTag = true;
static const bool kEndTag = false;
316
// Because GumboStringPieces are immutable, we can't insert a character directly
// into a text node. Instead, we accumulate all pending characters here and
// flush them out to a text node whenever a new element is inserted.
//
// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#insert-a-character
typedef struct _TextNodeBufferState {
  // The accumulated text to be inserted into the current text node.
  GumboStringBuffer _buffer;

  // A pointer to the original text represented by this text node. Note that
  // because of foster parenting and other strange DOM manipulations, this may
  // include other non-text HTML tags in it; it is defined as the span of
  // original text from the first character in this text node to the last
  // character in this text node.
  const char* _start_original_text;

  // The source position of the start of this text node.
  GumboSourcePosition _start_position;

  // The type of node that will be inserted (TEXT, CDATA, or WHITESPACE).
  // parser_state_init starts this out as GUMBO_NODE_WHITESPACE.
  GumboNodeType _type;
} TextNodeBufferState;
339
// All mutable tree-construction state for one parse. An instance is allocated
// and given its starting values by parser_state_init and released by
// parser_state_destroy.
typedef struct GumboInternalParserState {
  // http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#insertion-mode
  GumboInsertionMode _insertion_mode;

  // Used for run_generic_parsing_algorithm, which needs to switch back to the
  // original insertion mode at its conclusion.
  GumboInsertionMode _original_insertion_mode;

  // http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#the-stack-of-open-elements
  // The current node is the last entry of this vector.
  GumboVector /*GumboNode*/ _open_elements;

  // http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#the-list-of-active-formatting-elements
  GumboVector /*GumboNode*/ _active_formatting_elements;

  // The stack of template insertion modes. The modes are stored directly in
  // the vector's pointer slots, cast to void*.
  // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#the-insertion-mode
  GumboVector /*InsertionMode*/ _template_insertion_modes;

  // http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#the-element-pointers
  GumboNode* _head_element;
  GumboNode* _form_element;

  // The element used as fragment context when parsing in fragment mode.
  // NULL when a whole document is being parsed.
  GumboNode* _fragment_ctx;

  // The flag for when the spec says "Reprocess the current token in..."
  bool _reprocess_current_token;

  // The flag for "acknowledge the token's self-closing flag".
  bool _self_closing_flag_acknowledged;

  // The "frameset-ok" flag from the spec.
  bool _frameset_ok;

  // The flag for "If the next token is a LINE FEED, ignore that token...".
  bool _ignore_next_linefeed;

  // The flag for "whenever a node would be inserted into the current node, it
  // must instead be foster parented". This is used for misnested table
  // content, which needs to be handled according to "in body" rules yet foster
  // parented outside of the table.
  // It would perhaps be more explicit to have this as a parameter to
  // handle_in_body and insert_element, but given how special-purpose this is
  // and the number of call-sites that would need to take the extra parameter,
  // it's easier just to have a state flag.
  bool _foster_parent_insertions;

  // The accumulated text node buffer state.
  TextNodeBufferState _text_node;

  // The current token.
  GumboToken* _current_token;

  // The way that the spec is written, the </body> and </html> tags are *always*
  // implicit, because encountering one of those tokens merely switches the
  // insertion mode out of "in body". So we have individual state flags for
  // those end tags that are then inspected by pop_current_node when the <body>
  // and <html> nodes are popped to set the GUMBO_INSERTION_IMPLICIT_END_TAG
  // flag appropriately.
  bool _closed_body_tag;
  bool _closed_html_tag;
} GumboParserState;
402
token_has_attribute(const GumboToken * token,const char * name)403 static bool token_has_attribute(const GumboToken* token, const char* name) {
404 assert(token->type == GUMBO_TOKEN_START_TAG);
405 return gumbo_get_attribute(&token->v.start_tag.attributes, name) != NULL;
406 }
407
408 // Checks if the value of the specified attribute is a case-insensitive match
409 // for the specified string.
attribute_matches(const GumboVector * attributes,const char * name,const char * value)410 static bool attribute_matches(
411 const GumboVector* attributes, const char* name, const char* value) {
412 const GumboAttribute* attr = gumbo_get_attribute(attributes, name);
413 return attr ? strcasecmp(value, attr->value) == 0 : false;
414 }
415
416 // Checks if the value of the specified attribute is a case-sensitive match
417 // for the specified string.
attribute_matches_case_sensitive(const GumboVector * attributes,const char * name,const char * value)418 static bool attribute_matches_case_sensitive(
419 const GumboVector* attributes, const char* name, const char* value) {
420 const GumboAttribute* attr = gumbo_get_attribute(attributes, name);
421 return attr ? strcmp(value, attr->value) == 0 : false;
422 }
423
424 // Checks if the specified attribute vectors are identical.
all_attributes_match(const GumboVector * attr1,const GumboVector * attr2)425 static bool all_attributes_match(
426 const GumboVector* attr1, const GumboVector* attr2) {
427 unsigned int num_unmatched_attr2_elements = attr2->length;
428 for (unsigned int i = 0; i < attr1->length; ++i) {
429 const GumboAttribute* attr = attr1->data[i];
430 if (attribute_matches_case_sensitive(attr2, attr->name, attr->value)) {
431 --num_unmatched_attr2_elements;
432 } else {
433 return false;
434 }
435 }
436 return num_unmatched_attr2_elements == 0;
437 }
438
// Clears the parser's frameset-ok flag and logs that it did so.
static void set_frameset_not_ok(GumboParser* parser) {
  GumboParserState* state = parser->_parser_state;
  gumbo_debug("Setting frameset_ok to false.\n");
  state->_frameset_ok = false;
}
443
create_node(GumboParser * parser,GumboNodeType type)444 static GumboNode* create_node(GumboParser* parser, GumboNodeType type) {
445 GumboNode* node = gumbo_parser_allocate(parser, sizeof(GumboNode));
446 node->parent = NULL;
447 node->index_within_parent = -1;
448 node->type = type;
449 node->parse_flags = GUMBO_INSERTION_NORMAL;
450 return node;
451 }
452
new_document_node(GumboParser * parser)453 static GumboNode* new_document_node(GumboParser* parser) {
454 GumboNode* document_node = create_node(parser, GUMBO_NODE_DOCUMENT);
455 document_node->parse_flags = GUMBO_INSERTION_BY_PARSER;
456 gumbo_vector_init(parser, 1, &document_node->v.document.children);
457
458 // Must be initialized explicitly, as there's no guarantee that we'll see a
459 // doc type token.
460 GumboDocument* document = &document_node->v.document;
461 document->has_doctype = false;
462 document->name = NULL;
463 document->public_identifier = NULL;
464 document->system_identifier = NULL;
465 return document_node;
466 }
467
// Allocates the parser's output record with a fresh document node and no root
// element yet, attaches it to the parser, then initializes error tracking.
static void output_init(GumboParser* parser) {
  GumboOutput* result = gumbo_parser_allocate(parser, sizeof(*result));
  result->document = new_document_node(parser);
  result->root = NULL;
  parser->_output = result;
  // Called only after the output record is attached to the parser.
  gumbo_init_errors(parser);
}
475
// Allocates the parser state and gives every field a defined starting value:
// "initial" insertion mode, frameset-ok set, empty element stacks, an empty
// whitespace text buffer, and no head/form/fragment elements.
//
// The memory returned by the allocator is not assumed to be zeroed, so the
// fields that were previously left untouched (_original_insertion_mode,
// _self_closing_flag_acknowledged and the text node's start pointer and
// position) are now set explicitly as well.
static void parser_state_init(GumboParser* parser) {
  GumboParserState* parser_state =
      gumbo_parser_allocate(parser, sizeof(GumboParserState));
  parser_state->_insertion_mode = GUMBO_INSERTION_MODE_INITIAL;
  parser_state->_original_insertion_mode = GUMBO_INSERTION_MODE_INITIAL;
  parser_state->_reprocess_current_token = false;
  // NOTE(review): this flag is expected to be rewritten for every token
  // before it is read; `true` is only a defined placeholder -- confirm.
  parser_state->_self_closing_flag_acknowledged = true;
  parser_state->_frameset_ok = true;
  parser_state->_ignore_next_linefeed = false;
  parser_state->_foster_parent_insertions = false;
  parser_state->_text_node._type = GUMBO_NODE_WHITESPACE;
  parser_state->_text_node._start_original_text = NULL;
  parser_state->_text_node._start_position = (GumboSourcePosition){0};
  gumbo_string_buffer_init(parser, &parser_state->_text_node._buffer);
  gumbo_vector_init(parser, 10, &parser_state->_open_elements);
  gumbo_vector_init(parser, 5, &parser_state->_active_formatting_elements);
  gumbo_vector_init(parser, 5, &parser_state->_template_insertion_modes);
  parser_state->_head_element = NULL;
  parser_state->_form_element = NULL;
  parser_state->_fragment_ctx = NULL;
  parser_state->_current_token = NULL;
  parser_state->_closed_body_tag = false;
  parser_state->_closed_html_tag = false;
  parser->_parser_state = parser_state;
}
497
// Releases everything owned by the parser state: the fragment context node if
// there is one, the three vectors, the text buffer, and finally the state
// record itself.
static void parser_state_destroy(GumboParser* parser) {
  GumboParserState* doomed = parser->_parser_state;
  if (doomed->_fragment_ctx != NULL) {
    destroy_node(parser, doomed->_fragment_ctx);
  }
  gumbo_vector_destroy(parser, &doomed->_active_formatting_elements);
  gumbo_vector_destroy(parser, &doomed->_open_elements);
  gumbo_vector_destroy(parser, &doomed->_template_insertion_modes);
  gumbo_string_buffer_destroy(parser, &doomed->_text_node._buffer);
  gumbo_parser_deallocate(parser, doomed);
}
509
get_document_node(GumboParser * parser)510 static GumboNode* get_document_node(GumboParser* parser) {
511 return parser->_output->document;
512 }
513
is_fragment_parser(const GumboParser * parser)514 static bool is_fragment_parser(const GumboParser* parser) {
515 return !!parser->_parser_state->_fragment_ctx;
516 }
517
518 // Returns the node at the bottom of the stack of open elements, or NULL if no
519 // elements have been added yet.
get_current_node(GumboParser * parser)520 static GumboNode* get_current_node(GumboParser* parser) {
521 GumboVector* open_elements = &parser->_parser_state->_open_elements;
522 if (open_elements->length == 0) {
523 assert(!parser->_output->root);
524 return NULL;
525 }
526 assert(open_elements->length > 0);
527 assert(open_elements->data != NULL);
528 return open_elements->data[open_elements->length - 1];
529 }
530
get_adjusted_current_node(GumboParser * parser)531 static GumboNode* get_adjusted_current_node(GumboParser* parser) {
532 GumboParserState* state = parser->_parser_state;
533 if (state->_open_elements.length == 1 && state->_fragment_ctx) {
534 return state->_fragment_ctx;
535 }
536 return get_current_node(parser);
537 }
538
539 // Returns true if the given needle is in the given array of literal
540 // GumboStringPieces. If exact_match is true, this requires that they match
541 // exactly; otherwise, this performs a prefix match to check if any of the
542 // elements in haystack start with needle. This always performs a
543 // case-insensitive match.
is_in_static_list(const char * needle,const GumboStringPiece * haystack,bool exact_match)544 static bool is_in_static_list(
545 const char* needle, const GumboStringPiece* haystack, bool exact_match) {
546 for (unsigned int i = 0; haystack[i].length > 0; ++i) {
547 if ((exact_match && !strcmp(needle, haystack[i].data)) ||
548 (!exact_match && !strcasecmp(needle, haystack[i].data))) {
549 return true;
550 }
551 }
552 return false;
553 }
554
// Stores `mode` as the parser's current insertion mode.
static void set_insertion_mode(GumboParser* parser, GumboInsertionMode mode) {
  GumboParserState* state = parser->_parser_state;
  state->_insertion_mode = mode;
}
558
559 // http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#reset-the-insertion-mode-appropriately
560 // This is a helper function that returns the appropriate insertion mode instead
561 // of setting it. Returns GUMBO_INSERTION_MODE_INITIAL as a sentinel value to
562 // indicate that there is no appropriate insertion mode, and the loop should
563 // continue.
get_appropriate_insertion_mode(const GumboParser * parser,int index)564 static GumboInsertionMode get_appropriate_insertion_mode(
565 const GumboParser* parser, int index) {
566 const GumboVector* open_elements = &parser->_parser_state->_open_elements;
567 const GumboNode* node = open_elements->data[index];
568 const bool is_last = index == 0;
569
570 if (is_last && is_fragment_parser(parser)) {
571 node = parser->_parser_state->_fragment_ctx;
572 }
573
574 assert(node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE);
575 switch (node->v.element.tag) {
576 case GUMBO_TAG_SELECT: {
577 if (is_last) {
578 return GUMBO_INSERTION_MODE_IN_SELECT;
579 }
580 for (int i = index; i > 0; --i) {
581 const GumboNode* ancestor = open_elements->data[i];
582 if (node_html_tag_is(ancestor, GUMBO_TAG_TEMPLATE)) {
583 return GUMBO_INSERTION_MODE_IN_SELECT;
584 }
585 if (node_html_tag_is(ancestor, GUMBO_TAG_TABLE)) {
586 return GUMBO_INSERTION_MODE_IN_SELECT_IN_TABLE;
587 }
588 }
589 return GUMBO_INSERTION_MODE_IN_SELECT;
590 }
591 case GUMBO_TAG_TD:
592 case GUMBO_TAG_TH:
593 if (!is_last) return GUMBO_INSERTION_MODE_IN_CELL;
594 break;
595 case GUMBO_TAG_TR:
596 return GUMBO_INSERTION_MODE_IN_ROW;
597 case GUMBO_TAG_TBODY:
598 case GUMBO_TAG_THEAD:
599 case GUMBO_TAG_TFOOT:
600 return GUMBO_INSERTION_MODE_IN_TABLE_BODY;
601 case GUMBO_TAG_CAPTION:
602 return GUMBO_INSERTION_MODE_IN_CAPTION;
603 case GUMBO_TAG_COLGROUP:
604 return GUMBO_INSERTION_MODE_IN_COLUMN_GROUP;
605 case GUMBO_TAG_TABLE:
606 return GUMBO_INSERTION_MODE_IN_TABLE;
607 case GUMBO_TAG_TEMPLATE:
608 return get_current_template_insertion_mode(parser);
609 case GUMBO_TAG_HEAD:
610 if (!is_last) return GUMBO_INSERTION_MODE_IN_HEAD;
611 break;
612 case GUMBO_TAG_BODY:
613 return GUMBO_INSERTION_MODE_IN_BODY;
614 case GUMBO_TAG_FRAMESET:
615 return GUMBO_INSERTION_MODE_IN_FRAMESET;
616 case GUMBO_TAG_HTML:
617 return parser->_parser_state->_head_element
618 ? GUMBO_INSERTION_MODE_AFTER_HEAD
619 : GUMBO_INSERTION_MODE_BEFORE_HEAD;
620 default:
621 break;
622 }
623 return is_last ? GUMBO_INSERTION_MODE_IN_BODY : GUMBO_INSERTION_MODE_INITIAL;
624 }
625
626 // This performs the actual "reset the insertion mode" loop.
reset_insertion_mode_appropriately(GumboParser * parser)627 static void reset_insertion_mode_appropriately(GumboParser* parser) {
628 const GumboVector* open_elements = &parser->_parser_state->_open_elements;
629 for (int i = open_elements->length; --i >= 0;) {
630 GumboInsertionMode mode = get_appropriate_insertion_mode(parser, i);
631 if (mode != GUMBO_INSERTION_MODE_INITIAL) {
632 set_insertion_mode(parser, mode);
633 return;
634 }
635 }
636 // Should never get here, because is_last will be set on the last iteration
637 // and will force GUMBO_INSERTION_MODE_IN_BODY.
638 assert(0);
639 }
640
parser_add_parse_error(GumboParser * parser,const GumboToken * token)641 static GumboError* parser_add_parse_error(
642 GumboParser* parser, const GumboToken* token) {
643 gumbo_debug("Adding parse error.\n");
644 GumboError* error = gumbo_add_error(parser);
645 if (!error) {
646 return NULL;
647 }
648 error->type = GUMBO_ERR_PARSER;
649 error->position = token->position;
650 error->original_text = token->original_text.data;
651 GumboParserError* extra_data = &error->v.parser;
652 extra_data->input_type = token->type;
653 extra_data->input_tag = GUMBO_TAG_UNKNOWN;
654 if (token->type == GUMBO_TOKEN_START_TAG) {
655 extra_data->input_tag = token->v.start_tag.tag;
656 } else if (token->type == GUMBO_TOKEN_END_TAG) {
657 extra_data->input_tag = token->v.end_tag;
658 }
659 GumboParserState* state = parser->_parser_state;
660 extra_data->parser_state = state->_insertion_mode;
661 gumbo_vector_init(
662 parser, state->_open_elements.length, &extra_data->tag_stack);
663 for (unsigned int i = 0; i < state->_open_elements.length; ++i) {
664 const GumboNode* node = state->_open_elements.data[i];
665 assert(
666 node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE);
667 gumbo_vector_add(
668 parser, (void*) node->v.element.tag, &extra_data->tag_stack);
669 }
670 return error;
671 }
672
673 // Returns true if the specified token is either a start or end tag (specified
674 // by is_start) with one of the tag types in the varargs list. Terminate the
675 // list with GUMBO_TAG_LAST; this functions as a sentinel since no portion of
676 // the spec references tags that are not in the spec.
tag_in(const GumboToken * token,bool is_start,const gumbo_tagset tags)677 static bool tag_in(
678 const GumboToken* token, bool is_start, const gumbo_tagset tags) {
679 GumboTag token_tag;
680 if (is_start && token->type == GUMBO_TOKEN_START_TAG) {
681 token_tag = token->v.start_tag.tag;
682 } else if (!is_start && token->type == GUMBO_TOKEN_END_TAG) {
683 token_tag = token->v.end_tag;
684 } else {
685 return false;
686 }
687 return (token_tag < GUMBO_TAG_LAST && tags[(int) token_tag] != 0);
688 }
689
690 // Like tag_in, but for the single-tag case.
tag_is(const GumboToken * token,bool is_start,GumboTag tag)691 static bool tag_is(const GumboToken* token, bool is_start, GumboTag tag) {
692 if (is_start && token->type == GUMBO_TOKEN_START_TAG) {
693 return token->v.start_tag.tag == tag;
694 } else if (!is_start && token->type == GUMBO_TOKEN_END_TAG) {
695 return token->v.end_tag == tag;
696 } else {
697 return false;
698 }
699 }
700
701 // Like tag_in, but checks for the tag of a node, rather than a token.
node_tag_in_set(const GumboNode * node,const gumbo_tagset tags)702 static bool node_tag_in_set(const GumboNode* node, const gumbo_tagset tags) {
703 assert(node != NULL);
704 if (node->type != GUMBO_NODE_ELEMENT && node->type != GUMBO_NODE_TEMPLATE) {
705 return false;
706 }
707 return TAGSET_INCLUDES(
708 tags, node->v.element.tag_namespace, node->v.element.tag);
709 }
710
711 // Like node_tag_in, but for the single-tag case.
node_qualified_tag_is(const GumboNode * node,GumboNamespaceEnum ns,GumboTag tag)712 static bool node_qualified_tag_is(
713 const GumboNode* node, GumboNamespaceEnum ns, GumboTag tag) {
714 assert(node);
715 return (node->type == GUMBO_NODE_ELEMENT ||
716 node->type == GUMBO_NODE_TEMPLATE) &&
717 node->v.element.tag == tag && node->v.element.tag_namespace == ns;
718 }
719
720 // Like node_tag_in, but for the single-tag case in the HTML namespace
node_html_tag_is(const GumboNode * node,GumboTag tag)721 static bool node_html_tag_is(const GumboNode* node, GumboTag tag) {
722 return node_qualified_tag_is(node, GUMBO_NAMESPACE_HTML, tag);
723 }
724
// Pushes an insertion mode onto the parser's stack of template insertion
// modes. The enum value is stored directly in the vector's pointer-sized slot
// rather than being heap-allocated.
static void push_template_insertion_mode(
    GumboParser* parser, GumboInsertionMode mode) {
  void* encoded_mode = (void*) mode;
  gumbo_vector_add(
      parser, encoded_mode, &parser->_parser_state->_template_insertion_modes);
}
730
// Removes the most recently pushed entry from the stack of template insertion
// modes. The popped value is intentionally discarded.
static void pop_template_insertion_mode(GumboParser* parser) {
  (void) gumbo_vector_pop(
      parser, &parser->_parser_state->_template_insertion_modes);
}
734
735 // Returns the current template insertion mode. If the stack of template
736 // insertion modes is empty, this returns GUMBO_INSERTION_MODE_INITIAL.
get_current_template_insertion_mode(const GumboParser * parser)737 static GumboInsertionMode get_current_template_insertion_mode(
738 const GumboParser* parser) {
739 GumboVector* template_insertion_modes =
740 &parser->_parser_state->_template_insertion_modes;
741 if (template_insertion_modes->length == 0) {
742 return GUMBO_INSERTION_MODE_INITIAL;
743 }
744 return (GumboInsertionMode)
745 template_insertion_modes->data[(template_insertion_modes->length - 1)];
746 }
747
748 // http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#mathml-text-integration-point
is_mathml_integration_point(const GumboNode * node)749 static bool is_mathml_integration_point(const GumboNode* node) {
750 return node_tag_in_set(
751 node, (gumbo_tagset){TAG_MATHML(MI), TAG_MATHML(MO), TAG_MATHML(MN),
752 TAG_MATHML(MS), TAG_MATHML(MTEXT)});
753 }
754
755 // http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#html-integration-point
is_html_integration_point(const GumboNode * node)756 static bool is_html_integration_point(const GumboNode* node) {
757 return node_tag_in_set(node, (gumbo_tagset){TAG_SVG(FOREIGNOBJECT),
758 TAG_SVG(DESC), TAG_SVG(TITLE)}) ||
759 (node_qualified_tag_is(
760 node, GUMBO_NAMESPACE_MATHML, GUMBO_TAG_ANNOTATION_XML) &&
761 (attribute_matches(
762 &node->v.element.attributes, "encoding", "text/html") ||
763 attribute_matches(&node->v.element.attributes, "encoding",
764 "application/xhtml+xml")));
765 }
766
// Describes where a node should be inserted: a target parent plus a child
// index within that parent. An index of -1 means the node should be appended
// after the parent's existing children.
typedef struct {
  // Parent node that will receive the inserted node.
  GumboNode* target;
  // Position among target's children, or -1 to append at the end.
  int index;
} InsertionLocation;
774
get_appropriate_insertion_location(GumboParser * parser,GumboNode * override_target)775 InsertionLocation get_appropriate_insertion_location(
776 GumboParser* parser, GumboNode* override_target) {
777 InsertionLocation retval = {override_target, -1};
778 if (retval.target == NULL) {
779 // No override target; default to the current node, but special-case the
780 // root node since get_current_node() assumes the stack of open elements is
781 // non-empty.
782 retval.target = parser->_output->root != NULL ? get_current_node(parser)
783 : get_document_node(parser);
784 }
785 if (!parser->_parser_state->_foster_parent_insertions ||
786 !node_tag_in_set(retval.target, (gumbo_tagset){TAG(TABLE), TAG(TBODY),
787 TAG(TFOOT), TAG(THEAD), TAG(TR)})) {
788 return retval;
789 }
790
791 // Foster-parenting case.
792 int last_template_index = -1;
793 int last_table_index = -1;
794 GumboVector* open_elements = &parser->_parser_state->_open_elements;
795 for (unsigned int i = 0; i < open_elements->length; ++i) {
796 if (node_html_tag_is(open_elements->data[i], GUMBO_TAG_TEMPLATE)) {
797 last_template_index = i;
798 }
799 if (node_html_tag_is(open_elements->data[i], GUMBO_TAG_TABLE)) {
800 last_table_index = i;
801 }
802 }
803 if (last_template_index != -1 &&
804 (last_table_index == -1 || last_template_index > last_table_index)) {
805 retval.target = open_elements->data[last_template_index];
806 return retval;
807 }
808 if (last_table_index == -1) {
809 retval.target = open_elements->data[0];
810 return retval;
811 }
812 GumboNode* last_table = open_elements->data[last_table_index];
813 if (last_table->parent != NULL) {
814 retval.target = last_table->parent;
815 retval.index = last_table->index_within_parent;
816 return retval;
817 }
818
819 retval.target = open_elements->data[last_table_index - 1];
820 return retval;
821 }
822
823 // Appends a node to the end of its parent, setting the "parent" and
824 // "index_within_parent" fields appropriately.
append_node(GumboParser * parser,GumboNode * parent,GumboNode * node)825 static void append_node(
826 GumboParser* parser, GumboNode* parent, GumboNode* node) {
827 assert(node->parent == NULL);
828 assert(node->index_within_parent == -1);
829 GumboVector* children;
830 if (parent->type == GUMBO_NODE_ELEMENT ||
831 parent->type == GUMBO_NODE_TEMPLATE) {
832 children = &parent->v.element.children;
833 } else {
834 assert(parent->type == GUMBO_NODE_DOCUMENT);
835 children = &parent->v.document.children;
836 }
837 node->parent = parent;
838 node->index_within_parent = children->length;
839 gumbo_vector_add(parser, (void*) node, children);
840 assert(node->index_within_parent < children->length);
841 }
842
843 // Inserts a node at the specified InsertionLocation, updating the
844 // "parent" and "index_within_parent" fields of it and all its siblings.
845 // If the index of the location is -1, this calls append_node.
insert_node(GumboParser * parser,GumboNode * node,InsertionLocation location)846 static void insert_node(
847 GumboParser* parser, GumboNode* node, InsertionLocation location) {
848 assert(node->parent == NULL);
849 assert(node->index_within_parent == -1);
850 GumboNode* parent = location.target;
851 int index = location.index;
852 if (index != -1) {
853 GumboVector* children = NULL;
854 if (parent->type == GUMBO_NODE_ELEMENT ||
855 parent->type == GUMBO_NODE_TEMPLATE) {
856 children = &parent->v.element.children;
857 } else if (parent->type == GUMBO_NODE_DOCUMENT) {
858 children = &parent->v.document.children;
859 assert(children->length == 0);
860 } else {
861 assert(0);
862 }
863
864 assert(index >= 0);
865 assert((unsigned int) index < children->length);
866 node->parent = parent;
867 node->index_within_parent = index;
868 gumbo_vector_insert_at(parser, (void*) node, index, children);
869 assert(node->index_within_parent < children->length);
870 for (unsigned int i = index + 1; i < children->length; ++i) {
871 GumboNode* sibling = children->data[i];
872 sibling->index_within_parent = i;
873 assert(sibling->index_within_parent < children->length);
874 }
875 } else {
876 append_node(parser, parent, node);
877 }
878 }
879
// If the pending text buffer is non-empty, turns it into a text, whitespace
// or CDATA node (whichever type the buffer currently has) and inserts it at
// the appropriate insertion location. If that location is the document node,
// the new node is destroyed instead of inserted. Afterwards the buffer is
// cleared and its type reset to whitespace. Does nothing when the buffer is
// empty.
static void maybe_flush_text_node_buffer(GumboParser* parser) {
  GumboParserState* state = parser->_parser_state;
  TextNodeBufferState* pending = &state->_text_node;
  if (pending->_buffer.length == 0) {
    return;
  }

  assert(pending->_type == GUMBO_NODE_WHITESPACE ||
         pending->_type == GUMBO_NODE_TEXT ||
         pending->_type == GUMBO_NODE_CDATA);
  GumboNode* text_node = create_node(parser, pending->_type);
  GumboText* text = &text_node->v.text;
  text->text = gumbo_string_buffer_to_string(parser, &pending->_buffer);
  text->start_pos = pending->_start_position;
  // The original text runs from where the buffer started up to the start of
  // the token currently being processed.
  text->original_text.data = pending->_start_original_text;
  text->original_text.length = state->_current_token->original_text.data -
                               pending->_start_original_text;

  gumbo_debug("Flushing text node buffer of %.*s.\n",
      (int) pending->_buffer.length, pending->_buffer.data);

  InsertionLocation location = get_appropriate_insertion_location(parser, NULL);
  if (location.target->type != GUMBO_NODE_DOCUMENT) {
    insert_node(parser, text_node, location);
  } else {
    // The DOM does not allow Document nodes to have Text children, so per the
    // spec, they are dropped on the floor.
    destroy_node(parser, text_node);
  }

  gumbo_string_buffer_clear(parser, &pending->_buffer);
  pending->_type = GUMBO_NODE_WHITESPACE;
  assert(pending->_buffer.length == 0);
}
916
// Stamps an element with the position of the token that ended it. When that
// token is an end tag its original text is stored as the element's end tag;
// for any other token type the end tag text is set to the empty string.
static void record_end_of_element(
    GumboToken* current_token, GumboElement* element) {
  element->end_pos = current_token->position;
  if (current_token->type == GUMBO_TOKEN_END_TAG) {
    element->original_end_tag = current_token->original_text;
  } else {
    element->original_end_tag = kGumboEmptyString;
  }
}
924
pop_current_node(GumboParser * parser)925 static GumboNode* pop_current_node(GumboParser* parser) {
926 GumboParserState* state = parser->_parser_state;
927 maybe_flush_text_node_buffer(parser);
928 if (state->_open_elements.length > 0) {
929 assert(node_html_tag_is(state->_open_elements.data[0], GUMBO_TAG_HTML));
930 gumbo_debug("Popping %s node.\n",
931 gumbo_normalized_tagname(get_current_node(parser)->v.element.tag));
932 }
933 GumboNode* current_node = gumbo_vector_pop(parser, &state->_open_elements);
934 if (!current_node) {
935 assert(state->_open_elements.length == 0);
936 return NULL;
937 }
938 assert(current_node->type == GUMBO_NODE_ELEMENT ||
939 current_node->type == GUMBO_NODE_TEMPLATE);
940 bool is_closed_body_or_html_tag =
941 (node_html_tag_is(current_node, GUMBO_TAG_BODY) &&
942 state->_closed_body_tag) ||
943 (node_html_tag_is(current_node, GUMBO_TAG_HTML) &&
944 state->_closed_html_tag);
945 if ((state->_current_token->type != GUMBO_TOKEN_END_TAG ||
946 !node_html_tag_is(current_node, state->_current_token->v.end_tag)) &&
947 !is_closed_body_or_html_tag) {
948 current_node->parse_flags |= GUMBO_INSERTION_IMPLICIT_END_TAG;
949 }
950 if (!is_closed_body_or_html_tag) {
951 record_end_of_element(state->_current_token, ¤t_node->v.element);
952 }
953 return current_node;
954 }
955
// Flushes any pending text, then builds a comment node from the token (text,
// original text and position) and appends it as the last child of the given
// node.
static void append_comment_node(
    GumboParser* parser, GumboNode* node, const GumboToken* token) {
  maybe_flush_text_node_buffer(parser);
  GumboNode* comment = create_node(parser, GUMBO_NODE_COMMENT);
  comment->type = GUMBO_NODE_COMMENT;
  comment->parse_flags = GUMBO_INSERTION_NORMAL;
  comment->v.text.start_pos = token->position;
  comment->v.text.original_text = token->original_text;
  comment->v.text.text = token->v.text;
  append_node(parser, node, comment);
}
967
968 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#clear-the-stack-back-to-a-table-row-context
clear_stack_to_table_row_context(GumboParser * parser)969 static void clear_stack_to_table_row_context(GumboParser* parser) {
970 while (!node_tag_in_set(get_current_node(parser),
971 (gumbo_tagset){TAG(HTML), TAG(TR), TAG(TEMPLATE)})) {
972 pop_current_node(parser);
973 }
974 }
975
976 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#clear-the-stack-back-to-a-table-context
clear_stack_to_table_context(GumboParser * parser)977 static void clear_stack_to_table_context(GumboParser* parser) {
978 while (!node_tag_in_set(get_current_node(parser),
979 (gumbo_tagset){TAG(HTML), TAG(TABLE), TAG(TEMPLATE)})) {
980 pop_current_node(parser);
981 }
982 }
983
984 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#clear-the-stack-back-to-a-table-body-context
clear_stack_to_table_body_context(GumboParser * parser)985 void clear_stack_to_table_body_context(GumboParser* parser) {
986 while (!node_tag_in_set(get_current_node(parser),
987 (gumbo_tagset){TAG(HTML), TAG(TBODY), TAG(TFOOT), TAG(THEAD),
988 TAG(TEMPLATE)})) {
989 pop_current_node(parser);
990 }
991 }
992
993 // Creates a parser-inserted element in the HTML namespace and returns it.
create_element(GumboParser * parser,GumboTag tag)994 static GumboNode* create_element(GumboParser* parser, GumboTag tag) {
995 GumboNode* node = create_node(parser, GUMBO_NODE_ELEMENT);
996 GumboElement* element = &node->v.element;
997 gumbo_vector_init(parser, 1, &element->children);
998 gumbo_vector_init(parser, 0, &element->attributes);
999 element->tag = tag;
1000 element->tag_namespace = GUMBO_NAMESPACE_HTML;
1001 element->original_tag = kGumboEmptyString;
1002 element->original_end_tag = kGumboEmptyString;
1003 element->start_pos = (parser->_parser_state->_current_token)
1004 ? parser->_parser_state->_current_token->position
1005 : kGumboEmptySourcePosition;
1006 element->end_pos = kGumboEmptySourcePosition;
1007 return node;
1008 }
1009
1010 // Constructs an element from the given start tag token.
create_element_from_token(GumboParser * parser,GumboToken * token,GumboNamespaceEnum tag_namespace)1011 static GumboNode* create_element_from_token(
1012 GumboParser* parser, GumboToken* token, GumboNamespaceEnum tag_namespace) {
1013 assert(token->type == GUMBO_TOKEN_START_TAG);
1014 GumboTokenStartTag* start_tag = &token->v.start_tag;
1015
1016 GumboNodeType type = (tag_namespace == GUMBO_NAMESPACE_HTML &&
1017 start_tag->tag == GUMBO_TAG_TEMPLATE)
1018 ? GUMBO_NODE_TEMPLATE
1019 : GUMBO_NODE_ELEMENT;
1020
1021 GumboNode* node = create_node(parser, type);
1022 GumboElement* element = &node->v.element;
1023 gumbo_vector_init(parser, 1, &element->children);
1024 element->attributes = start_tag->attributes;
1025 element->tag = start_tag->tag;
1026 element->tag_namespace = tag_namespace;
1027
1028 assert(token->original_text.length >= 2);
1029 assert(token->original_text.data[0] == '<');
1030 assert(token->original_text.data[token->original_text.length - 1] == '>');
1031 element->original_tag = token->original_text;
1032 element->start_pos = token->position;
1033 element->original_end_tag = kGumboEmptyString;
1034 element->end_pos = kGumboEmptySourcePosition;
1035
1036 // The element takes ownership of the attributes from the token, so any
1037 // allocated-memory fields should be nulled out.
1038 start_tag->attributes = kGumboEmptyVector;
1039 return node;
1040 }
1041
1042 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#insert-an-html-element
insert_element(GumboParser * parser,GumboNode * node,bool is_reconstructing_formatting_elements)1043 static void insert_element(GumboParser* parser, GumboNode* node,
1044 bool is_reconstructing_formatting_elements) {
1045 GumboParserState* state = parser->_parser_state;
1046 // NOTE(jdtang): The text node buffer must always be flushed before inserting
1047 // a node, otherwise we're handling nodes in a different order than the spec
1048 // mandated. However, one clause of the spec (character tokens in the body)
1049 // requires that we reconstruct the active formatting elements *before* adding
1050 // the character, and reconstructing the active formatting elements may itself
1051 // result in the insertion of new elements (which should be pushed onto the
1052 // stack of open elements before the buffer is flushed). We solve this (for
1053 // the time being, the spec has been rewritten for <template> and the new
1054 // version may be simpler here) with a boolean flag to this method.
1055 if (!is_reconstructing_formatting_elements) {
1056 maybe_flush_text_node_buffer(parser);
1057 }
1058 InsertionLocation location = get_appropriate_insertion_location(parser, NULL);
1059 insert_node(parser, node, location);
1060 gumbo_vector_add(parser, (void*) node, &state->_open_elements);
1061 }
1062
1063 // Convenience method that combines create_element_from_token and
1064 // insert_element, inserting the generated element directly into the current
1065 // node. Returns the node inserted.
insert_element_from_token(GumboParser * parser,GumboToken * token)1066 static GumboNode* insert_element_from_token(
1067 GumboParser* parser, GumboToken* token) {
1068 GumboNode* element =
1069 create_element_from_token(parser, token, GUMBO_NAMESPACE_HTML);
1070 insert_element(parser, element, false);
1071 gumbo_debug("Inserting <%s> element (@%x) from token.\n",
1072 gumbo_normalized_tagname(element->v.element.tag), element);
1073 return element;
1074 }
1075
1076 // Convenience method that combines create_element and insert_element, inserting
1077 // a parser-generated element of a specific tag type. Returns the node
1078 // inserted.
insert_element_of_tag_type(GumboParser * parser,GumboTag tag,GumboParseFlags reason)1079 static GumboNode* insert_element_of_tag_type(
1080 GumboParser* parser, GumboTag tag, GumboParseFlags reason) {
1081 GumboNode* element = create_element(parser, tag);
1082 element->parse_flags |= GUMBO_INSERTION_BY_PARSER | reason;
1083 insert_element(parser, element, false);
1084 gumbo_debug("Inserting %s element (@%x) from tag type.\n",
1085 gumbo_normalized_tagname(tag), element);
1086 return element;
1087 }
1088
1089 // Convenience method for creating foreign namespaced element. Returns the node
1090 // inserted.
insert_foreign_element(GumboParser * parser,GumboToken * token,GumboNamespaceEnum tag_namespace)1091 static GumboNode* insert_foreign_element(
1092 GumboParser* parser, GumboToken* token, GumboNamespaceEnum tag_namespace) {
1093 assert(token->type == GUMBO_TOKEN_START_TAG);
1094 GumboNode* element = create_element_from_token(parser, token, tag_namespace);
1095 insert_element(parser, element, false);
1096 if (token_has_attribute(token, "xmlns") &&
1097 !attribute_matches_case_sensitive(&token->v.start_tag.attributes, "xmlns",
1098 kLegalXmlns[tag_namespace])) {
1099 // TODO(jdtang): Since there're multiple possible error codes here, we
1100 // eventually need reason codes to differentiate them.
1101 parser_add_parse_error(parser, token);
1102 }
1103 if (token_has_attribute(token, "xmlns:xlink") &&
1104 !attribute_matches_case_sensitive(&token->v.start_tag.attributes,
1105 "xmlns:xlink", "http://www.w3.org/1999/xlink")) {
1106 parser_add_parse_error(parser, token);
1107 }
1108 return element;
1109 }
1110
// Appends the character carried by a whitespace, character, null or CDATA
// token to the pending text buffer. If the buffer was empty, the token's
// original text pointer and position are remembered as the start of the run.
// A character token switches the buffer type to text and a CDATA token
// switches it to CDATA; whitespace and null tokens leave the type unchanged.
static void insert_text_token(GumboParser* parser, GumboToken* token) {
  assert(token->type == GUMBO_TOKEN_WHITESPACE ||
         token->type == GUMBO_TOKEN_CHARACTER ||
         token->type == GUMBO_TOKEN_NULL || token->type == GUMBO_TOKEN_CDATA);
  TextNodeBufferState* buffer_state = &parser->_parser_state->_text_node;
  const bool starting_new_run = buffer_state->_buffer.length == 0;
  if (starting_new_run) {
    buffer_state->_start_position = token->position;
    buffer_state->_start_original_text = token->original_text.data;
  }
  gumbo_string_buffer_append_codepoint(
      parser, token->v.character, &buffer_state->_buffer);
  switch (token->type) {
    case GUMBO_TOKEN_CHARACTER:
      buffer_state->_type = GUMBO_NODE_TEXT;
      break;
    case GUMBO_TOKEN_CDATA:
      buffer_state->_type = GUMBO_NODE_CDATA;
      break;
    default:
      break;
  }
  gumbo_debug("Inserting text token '%c'.\n", token->v.character);
}
1130
1131 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#generic-rcdata-element-parsing-algorithm
run_generic_parsing_algorithm(GumboParser * parser,GumboToken * token,GumboTokenizerEnum lexer_state)1132 static void run_generic_parsing_algorithm(
1133 GumboParser* parser, GumboToken* token, GumboTokenizerEnum lexer_state) {
1134 insert_element_from_token(parser, token);
1135 gumbo_tokenizer_set_state(parser, lexer_state);
1136 parser->_parser_state->_original_insertion_mode =
1137 parser->_parser_state->_insertion_mode;
1138 parser->_parser_state->_insertion_mode = GUMBO_INSERTION_MODE_TEXT;
1139 }
1140
// Marks the current token's self-closing flag as acknowledged in the parser
// state.
static void acknowledge_self_closing_tag(GumboParser* parser) {
  parser->_parser_state->_self_closing_flag_acknowledged = true;
}
1144
1145 // Returns true if there's an anchor tag in the list of active formatting
1146 // elements, and fills in its index if so.
find_last_anchor_index(GumboParser * parser,int * anchor_index)1147 static bool find_last_anchor_index(GumboParser* parser, int* anchor_index) {
1148 GumboVector* elements = &parser->_parser_state->_active_formatting_elements;
1149 for (int i = elements->length; --i >= 0;) {
1150 GumboNode* node = elements->data[i];
1151 if (node == &kActiveFormattingScopeMarker) {
1152 return false;
1153 }
1154 if (node_html_tag_is(node, GUMBO_TAG_A)) {
1155 *anchor_index = i;
1156 return true;
1157 }
1158 }
1159 return false;
1160 }
1161
1162 // Counts the number of open formatting elements in the list of active
1163 // formatting elements (after the last active scope marker) that have a specific
1164 // tag. If this is > 0, then earliest_matching_index will be filled in with the
1165 // index of the first such element.
count_formatting_elements_of_tag(GumboParser * parser,const GumboNode * desired_node,int * earliest_matching_index)1166 static int count_formatting_elements_of_tag(GumboParser* parser,
1167 const GumboNode* desired_node, int* earliest_matching_index) {
1168 const GumboElement* desired_element = &desired_node->v.element;
1169 GumboVector* elements = &parser->_parser_state->_active_formatting_elements;
1170 int num_identical_elements = 0;
1171 for (int i = elements->length; --i >= 0;) {
1172 GumboNode* node = elements->data[i];
1173 if (node == &kActiveFormattingScopeMarker) {
1174 break;
1175 }
1176 assert(node->type == GUMBO_NODE_ELEMENT);
1177 if (node_qualified_tag_is(
1178 node, desired_element->tag_namespace, desired_element->tag) &&
1179 all_attributes_match(
1180 &node->v.element.attributes, &desired_element->attributes)) {
1181 num_identical_elements++;
1182 *earliest_matching_index = i;
1183 }
1184 }
1185 return num_identical_elements;
1186 }
1187
1188 // http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#reconstruct-the-active-formatting-elements
add_formatting_element(GumboParser * parser,const GumboNode * node)1189 static void add_formatting_element(GumboParser* parser, const GumboNode* node) {
1190 assert(node == &kActiveFormattingScopeMarker ||
1191 node->type == GUMBO_NODE_ELEMENT);
1192 GumboVector* elements = &parser->_parser_state->_active_formatting_elements;
1193 if (node == &kActiveFormattingScopeMarker) {
1194 gumbo_debug("Adding a scope marker.\n");
1195 } else {
1196 gumbo_debug("Adding a formatting element.\n");
1197 }
1198
1199 // Hunt for identical elements.
1200 int earliest_identical_element = elements->length;
1201 int num_identical_elements = count_formatting_elements_of_tag(
1202 parser, node, &earliest_identical_element);
1203
1204 // Noah's Ark clause: if there're at least 3, remove the earliest.
1205 if (num_identical_elements >= 3) {
1206 gumbo_debug("Noah's ark clause: removing element at %d.\n",
1207 earliest_identical_element);
1208 gumbo_vector_remove_at(parser, earliest_identical_element, elements);
1209 }
1210
1211 gumbo_vector_add(parser, (void*) node, elements);
1212 }
1213
// Returns true if this exact node (compared by pointer) is on the stack of
// open elements.
static bool is_open_element(GumboParser* parser, const GumboNode* node) {
  GumboVector* open_elements = &parser->_parser_state->_open_elements;
  unsigned int i = 0;
  while (i < open_elements->length && open_elements->data[i] != node) {
    ++i;
  }
  // Stopping short of the end means a match was found.
  return i < open_elements->length;
}
1223
1224 // Clones attributes, tags, etc. of a node, but does not copy the content. The
1225 // clone shares no structure with the original node: all owned strings and
1226 // values are fresh copies.
clone_node(GumboParser * parser,GumboNode * node,GumboParseFlags reason)1227 GumboNode* clone_node(
1228 GumboParser* parser, GumboNode* node, GumboParseFlags reason) {
1229 assert(node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE);
1230 GumboNode* new_node = gumbo_parser_allocate(parser, sizeof(GumboNode));
1231 *new_node = *node;
1232 new_node->parent = NULL;
1233 new_node->index_within_parent = -1;
1234 // Clear the GUMBO_INSERTION_IMPLICIT_END_TAG flag, as the cloned node may
1235 // have a separate end tag.
1236 new_node->parse_flags &= ~GUMBO_INSERTION_IMPLICIT_END_TAG;
1237 new_node->parse_flags |= reason | GUMBO_INSERTION_BY_PARSER;
1238 GumboElement* element = &new_node->v.element;
1239 gumbo_vector_init(parser, 1, &element->children);
1240
1241 const GumboVector* old_attributes = &node->v.element.attributes;
1242 gumbo_vector_init(parser, old_attributes->length, &element->attributes);
1243 for (unsigned int i = 0; i < old_attributes->length; ++i) {
1244 const GumboAttribute* old_attr = old_attributes->data[i];
1245 GumboAttribute* attr =
1246 gumbo_parser_allocate(parser, sizeof(GumboAttribute));
1247 *attr = *old_attr;
1248 attr->name = gumbo_copy_stringz(parser, old_attr->name);
1249 attr->value = gumbo_copy_stringz(parser, old_attr->value);
1250 gumbo_vector_add(parser, attr, &element->attributes);
1251 }
1252 return new_node;
1253 }
1254
1255 // "Reconstruct active formatting elements" part of the spec.
1256 // This implementation is based on the html5lib translation from the mess of
1257 // GOTOs in the spec to reasonably structured programming.
1258 // http://code.google.com/p/html5lib/source/browse/python/html5lib/treebuilders/_base.py
reconstruct_active_formatting_elements(GumboParser * parser)1259 static void reconstruct_active_formatting_elements(GumboParser* parser) {
1260 GumboVector* elements = &parser->_parser_state->_active_formatting_elements;
1261 // Step 1
1262 if (elements->length == 0) {
1263 return;
1264 }
1265
1266 // Step 2 & 3
1267 unsigned int i = elements->length - 1;
1268 GumboNode* element = elements->data[i];
1269 if (element == &kActiveFormattingScopeMarker ||
1270 is_open_element(parser, element)) {
1271 return;
1272 }
1273
1274 // Step 6
1275 do {
1276 if (i == 0) {
1277 // Step 4
1278 i = -1; // Incremented to 0 below.
1279 break;
1280 }
1281 // Step 5
1282 element = elements->data[--i];
1283 } while (element != &kActiveFormattingScopeMarker &&
1284 !is_open_element(parser, element));
1285
1286 ++i;
1287 gumbo_debug("Reconstructing elements from %d on %s parent.\n", i,
1288 gumbo_normalized_tagname(get_current_node(parser)->v.element.tag));
1289 for (; i < elements->length; ++i) {
1290 // Step 7 & 8.
1291 assert(elements->length > 0);
1292 assert(i < elements->length);
1293 element = elements->data[i];
1294 assert(element != &kActiveFormattingScopeMarker);
1295 GumboNode* clone = clone_node(
1296 parser, element, GUMBO_INSERTION_RECONSTRUCTED_FORMATTING_ELEMENT);
1297 // Step 9.
1298 InsertionLocation location =
1299 get_appropriate_insertion_location(parser, NULL);
1300 insert_node(parser, clone, location);
1301 gumbo_vector_add(
1302 parser, (void*) clone, &parser->_parser_state->_open_elements);
1303
1304 // Step 10.
1305 elements->data[i] = clone;
1306 gumbo_debug("Reconstructed %s element at %d.\n",
1307 gumbo_normalized_tagname(clone->v.element.tag), i);
1308 }
1309 }
1310
// Pops entries off the list of active formatting elements until a scope
// marker has been popped or a pop yields nothing. The number of pop attempts
// is reported through gumbo_debug.
static void clear_active_formatting_elements(GumboParser* parser) {
  GumboVector* elements = &parser->_parser_state->_active_formatting_elements;
  int num_elements_cleared = 0;
  for (;;) {
    const GumboNode* node = gumbo_vector_pop(parser, elements);
    ++num_elements_cleared;
    if (!node || node == &kActiveFormattingScopeMarker) {
      break;
    }
  }
  gumbo_debug("Cleared %d elements from active formatting list.\n",
      num_elements_cleared);
}
1322
1323 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#the-initial-insertion-mode
compute_quirks_mode(const GumboTokenDocType * doctype)1324 static GumboQuirksModeEnum compute_quirks_mode(
1325 const GumboTokenDocType* doctype) {
1326 if (doctype->force_quirks || strcmp(doctype->name, kDoctypeHtml.data) ||
1327 is_in_static_list(
1328 doctype->public_identifier, kQuirksModePublicIdPrefixes, false) ||
1329 is_in_static_list(
1330 doctype->public_identifier, kQuirksModePublicIdExactMatches, true) ||
1331 is_in_static_list(
1332 doctype->system_identifier, kQuirksModeSystemIdExactMatches, true) ||
1333 (is_in_static_list(doctype->public_identifier,
1334 kLimitedQuirksRequiresSystemIdPublicIdPrefixes, false) &&
1335 !doctype->has_system_identifier)) {
1336 return GUMBO_DOCTYPE_QUIRKS;
1337 } else if (is_in_static_list(doctype->public_identifier,
1338 kLimitedQuirksPublicIdPrefixes, false) ||
1339 (is_in_static_list(doctype->public_identifier,
1340 kLimitedQuirksRequiresSystemIdPublicIdPrefixes, false) &&
1341 doctype->has_system_identifier)) {
1342 return GUMBO_DOCTYPE_LIMITED_QUIRKS;
1343 }
1344 return GUMBO_DOCTYPE_NO_QUIRKS;
1345 }
1346
1347 // The following functions are all defined by the "has an element in __ scope"
1348 // sections of the HTML5 spec:
1349 // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-the-specific-scope
1350 // The basic idea behind them is that they check for an element of the given
1351 // qualified name, contained within a scope formed by a set of other qualified
1352 // names. For example, "has an element in list scope" looks for an element of
1353 // the given qualified name within the nearest enclosing <ol> or <ul>, along
1354 // with a bunch of generic element types that serve to "firewall" their content
1355 // from the rest of the document. Note that because of the way the spec is
1356 // written,
1357 // all elements are expected to be in the HTML namespace
has_an_element_in_specific_scope(GumboParser * parser,int expected_size,const GumboTag * expected,bool negate,const gumbo_tagset tags)1358 static bool has_an_element_in_specific_scope(GumboParser* parser,
1359 int expected_size, const GumboTag* expected, bool negate,
1360 const gumbo_tagset tags) {
1361 GumboVector* open_elements = &parser->_parser_state->_open_elements;
1362 for (int i = open_elements->length; --i >= 0;) {
1363 const GumboNode* node = open_elements->data[i];
1364 if (node->type != GUMBO_NODE_ELEMENT && node->type != GUMBO_NODE_TEMPLATE)
1365 continue;
1366
1367 GumboTag node_tag = node->v.element.tag;
1368 GumboNamespaceEnum node_ns = node->v.element.tag_namespace;
1369 for (int j = 0; j < expected_size; ++j) {
1370 if (node_tag == expected[j] && node_ns == GUMBO_NAMESPACE_HTML)
1371 return true;
1372 }
1373
1374 bool found = TAGSET_INCLUDES(tags, node_ns, node_tag);
1375 if (negate != found) return false;
1376 }
1377 return false;
1378 }
1379
1380 // Checks for the presence of an open element of the specified tag type.
has_open_element(GumboParser * parser,GumboTag tag)1381 static bool has_open_element(GumboParser* parser, GumboTag tag) {
1382 return has_an_element_in_specific_scope(
1383 parser, 1, &tag, false, (gumbo_tagset){TAG(HTML)});
1384 }
1385
1386 // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-scope
has_an_element_in_scope(GumboParser * parser,GumboTag tag)1387 static bool has_an_element_in_scope(GumboParser* parser, GumboTag tag) {
1388 return has_an_element_in_specific_scope(parser, 1, &tag, false,
1389 (gumbo_tagset){TAG(APPLET), TAG(CAPTION), TAG(HTML), TAG(TABLE), TAG(TD),
1390 TAG(TH), TAG(MARQUEE), TAG(OBJECT), TAG(TEMPLATE), TAG_MATHML(MI),
1391 TAG_MATHML(MO), TAG_MATHML(MN), TAG_MATHML(MS), TAG_MATHML(MTEXT),
1392 TAG_MATHML(ANNOTATION_XML), TAG_SVG(FOREIGNOBJECT), TAG_SVG(DESC),
1393 TAG_SVG(TITLE)});
1394 }
1395
1396 // Like "has an element in scope", but for the specific case of looking for a
1397 // unique target node, not for any node with a given tag name. This duplicates
1398 // much of the algorithm from has_an_element_in_specific_scope because the
1399 // predicate is different when checking for an exact node, and it's easier &
1400 // faster just to duplicate the code for this one case than to try and
1401 // parameterize it.
has_node_in_scope(GumboParser * parser,const GumboNode * node)1402 static bool has_node_in_scope(GumboParser* parser, const GumboNode* node) {
1403 GumboVector* open_elements = &parser->_parser_state->_open_elements;
1404 for (int i = open_elements->length; --i >= 0;) {
1405 const GumboNode* current = open_elements->data[i];
1406 if (current == node) {
1407 return true;
1408 }
1409 if (current->type != GUMBO_NODE_ELEMENT &&
1410 current->type != GUMBO_NODE_TEMPLATE) {
1411 continue;
1412 }
1413 if (node_tag_in_set(current,
1414 (gumbo_tagset){TAG(APPLET), TAG(CAPTION), TAG(HTML), TAG(TABLE),
1415 TAG(TD), TAG(TH), TAG(MARQUEE), TAG(OBJECT), TAG(TEMPLATE),
1416 TAG_MATHML(MI), TAG_MATHML(MO), TAG_MATHML(MN), TAG_MATHML(MS),
1417 TAG_MATHML(MTEXT), TAG_MATHML(ANNOTATION_XML),
1418 TAG_SVG(FOREIGNOBJECT), TAG_SVG(DESC), TAG_SVG(TITLE)})) {
1419 return false;
1420 }
1421 }
1422 assert(false);
1423 return false;
1424 }
1425
1426 // Like has_an_element_in_scope, but restricts the expected qualified name to a
1427 // range of possible qualified names instead of just a single one.
has_an_element_in_scope_with_tagname(GumboParser * parser,int expected_len,const GumboTag expected[])1428 static bool has_an_element_in_scope_with_tagname(
1429 GumboParser* parser, int expected_len, const GumboTag expected[]) {
1430 return has_an_element_in_specific_scope(parser, expected_len, expected, false,
1431 (gumbo_tagset){TAG(APPLET), TAG(CAPTION), TAG(HTML), TAG(TABLE), TAG(TD),
1432 TAG(TH), TAG(MARQUEE), TAG(OBJECT), TAG(TEMPLATE), TAG_MATHML(MI),
1433 TAG_MATHML(MO), TAG_MATHML(MN), TAG_MATHML(MS), TAG_MATHML(MTEXT),
1434 TAG_MATHML(ANNOTATION_XML), TAG_SVG(FOREIGNOBJECT), TAG_SVG(DESC),
1435 TAG_SVG(TITLE)});
1436 }
1437
1438 // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-list-item-scope
has_an_element_in_list_scope(GumboParser * parser,GumboTag tag)1439 static bool has_an_element_in_list_scope(GumboParser* parser, GumboTag tag) {
1440 return has_an_element_in_specific_scope(parser, 1, &tag, false,
1441 (gumbo_tagset){TAG(APPLET), TAG(CAPTION), TAG(HTML), TAG(TABLE), TAG(TD),
1442 TAG(TH), TAG(MARQUEE), TAG(OBJECT), TAG(TEMPLATE), TAG_MATHML(MI),
1443 TAG_MATHML(MO), TAG_MATHML(MN), TAG_MATHML(MS), TAG_MATHML(MTEXT),
1444 TAG_MATHML(ANNOTATION_XML), TAG_SVG(FOREIGNOBJECT), TAG_SVG(DESC),
1445 TAG_SVG(TITLE), TAG(OL), TAG(UL)});
1446 }
1447
1448 // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-button-scope
has_an_element_in_button_scope(GumboParser * parser,GumboTag tag)1449 static bool has_an_element_in_button_scope(GumboParser* parser, GumboTag tag) {
1450 return has_an_element_in_specific_scope(parser, 1, &tag, false,
1451 (gumbo_tagset){TAG(APPLET), TAG(CAPTION), TAG(HTML), TAG(TABLE), TAG(TD),
1452 TAG(TH), TAG(MARQUEE), TAG(OBJECT), TAG(TEMPLATE), TAG_MATHML(MI),
1453 TAG_MATHML(MO), TAG_MATHML(MN), TAG_MATHML(MS), TAG_MATHML(MTEXT),
1454 TAG_MATHML(ANNOTATION_XML), TAG_SVG(FOREIGNOBJECT), TAG_SVG(DESC),
1455 TAG_SVG(TITLE), TAG(BUTTON)});
1456 }
1457
1458 // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-table-scope
has_an_element_in_table_scope(GumboParser * parser,GumboTag tag)1459 static bool has_an_element_in_table_scope(GumboParser* parser, GumboTag tag) {
1460 return has_an_element_in_specific_scope(parser, 1, &tag, false,
1461 (gumbo_tagset){TAG(HTML), TAG(TABLE), TAG(TEMPLATE)});
1462 }
1463
1464 // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-select-scope
has_an_element_in_select_scope(GumboParser * parser,GumboTag tag)1465 static bool has_an_element_in_select_scope(GumboParser* parser, GumboTag tag) {
1466 return has_an_element_in_specific_scope(
1467 parser, 1, &tag, true, (gumbo_tagset){TAG(OPTGROUP), TAG(OPTION)});
1468 }
1469
1470 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#generate-implied-end-tags
1471 // "exception" is the "element to exclude from the process" listed in the spec.
1472 // Pass GUMBO_TAG_LAST to not exclude any of them.
generate_implied_end_tags(GumboParser * parser,GumboTag exception)1473 static void generate_implied_end_tags(GumboParser* parser, GumboTag exception) {
1474 for (; node_tag_in_set(get_current_node(parser),
1475 (gumbo_tagset){TAG(DD), TAG(DT), TAG(LI), TAG(OPTION),
1476 TAG(OPTGROUP), TAG(P), TAG(RP), TAG(RB), TAG(RT), TAG(RTC)}) &&
1477 !node_html_tag_is(get_current_node(parser), exception);
1478 pop_current_node(parser))
1479 ;
1480 }
1481
1482 // This is the "generate all implied end tags thoroughly" clause of the spec.
1483 // https://html.spec.whatwg.org/multipage/syntax.html#closing-elements-that-have-implied-end-tags
generate_all_implied_end_tags_thoroughly(GumboParser * parser)1484 static void generate_all_implied_end_tags_thoroughly(GumboParser* parser) {
1485 for (
1486 ; node_tag_in_set(get_current_node(parser),
1487 (gumbo_tagset){TAG(CAPTION), TAG(COLGROUP), TAG(DD), TAG(DT), TAG(LI),
1488 TAG(OPTION), TAG(OPTGROUP), TAG(P), TAG(RP), TAG(RT), TAG(RTC),
1489 TAG(TBODY), TAG(TD), TAG(TFOOT), TAG(TH), TAG(HEAD), TAG(TR)});
1490 pop_current_node(parser))
1491 ;
1492 }
1493
1494 // This factors out the clauses relating to "act as if an end tag token with tag
1495 // name "table" had been seen. Returns true if there's a table element in table
1496 // scope which was successfully closed, false if not and the token should be
1497 // ignored. Does not add parse errors; callers should handle that.
close_table(GumboParser * parser)1498 static bool close_table(GumboParser* parser) {
1499 if (!has_an_element_in_table_scope(parser, GUMBO_TAG_TABLE)) {
1500 return false;
1501 }
1502
1503 GumboNode* node = pop_current_node(parser);
1504 while (!node_html_tag_is(node, GUMBO_TAG_TABLE)) {
1505 node = pop_current_node(parser);
1506 }
1507 reset_insertion_mode_appropriately(parser);
1508 return true;
1509 }
1510
1511 // This factors out the clauses relating to "act as if an end tag token with tag
1512 // name `cell_tag` had been seen".
close_table_cell(GumboParser * parser,const GumboToken * token,GumboTag cell_tag)1513 static bool close_table_cell(
1514 GumboParser* parser, const GumboToken* token, GumboTag cell_tag) {
1515 bool result = true;
1516 generate_implied_end_tags(parser, GUMBO_TAG_LAST);
1517 const GumboNode* node = get_current_node(parser);
1518 if (!node_html_tag_is(node, cell_tag)) {
1519 parser_add_parse_error(parser, token);
1520 result = false;
1521 }
1522 do {
1523 node = pop_current_node(parser);
1524 } while (!node_html_tag_is(node, cell_tag));
1525
1526 clear_active_formatting_elements(parser);
1527 set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW);
1528 return result;
1529 }
1530
1531 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#close-the-cell
1532 // This holds the logic to determine whether we should close a <td> or a <th>.
close_current_cell(GumboParser * parser,const GumboToken * token)1533 static bool close_current_cell(GumboParser* parser, const GumboToken* token) {
1534 if (has_an_element_in_table_scope(parser, GUMBO_TAG_TD)) {
1535 assert(!has_an_element_in_table_scope(parser, GUMBO_TAG_TH));
1536 return close_table_cell(parser, token, GUMBO_TAG_TD);
1537 } else {
1538 assert(has_an_element_in_table_scope(parser, GUMBO_TAG_TH));
1539 return close_table_cell(parser, token, GUMBO_TAG_TH);
1540 }
1541 }
1542
1543 // This factors out the "act as if an end tag of tag name 'select' had been
1544 // seen" clause of the spec, since it's referenced in several places. It pops
1545 // all nodes from the stack until the current <select> has been closed, then
1546 // resets the insertion mode appropriately.
close_current_select(GumboParser * parser)1547 static void close_current_select(GumboParser* parser) {
1548 GumboNode* node = pop_current_node(parser);
1549 while (!node_html_tag_is(node, GUMBO_TAG_SELECT)) {
1550 node = pop_current_node(parser);
1551 }
1552 reset_insertion_mode_appropriately(parser);
1553 }
1554
1555 // The list of nodes in the "special" category:
1556 // http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#special
is_special_node(const GumboNode * node)1557 static bool is_special_node(const GumboNode* node) {
1558 assert(node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE);
1559 return node_tag_in_set(node,
1560 (gumbo_tagset){TAG(ADDRESS), TAG(APPLET), TAG(AREA), TAG(ARTICLE),
1561 TAG(ASIDE), TAG(BASE), TAG(BASEFONT), TAG(BGSOUND), TAG(BLOCKQUOTE),
1562 TAG(BODY), TAG(BR), TAG(BUTTON), TAG(CAPTION), TAG(CENTER), TAG(COL),
1563 TAG(COLGROUP), TAG(MENUITEM), TAG(DD), TAG(DETAILS), TAG(DIR),
1564 TAG(DIV), TAG(DL), TAG(DT), TAG(EMBED), TAG(FIELDSET),
1565 TAG(FIGCAPTION), TAG(FIGURE), TAG(FOOTER), TAG(FORM), TAG(FRAME),
1566 TAG(FRAMESET), TAG(H1), TAG(H2), TAG(H3), TAG(H4), TAG(H5), TAG(H6),
1567 TAG(HEAD), TAG(HEADER), TAG(HGROUP), TAG(HR), TAG(HTML), TAG(IFRAME),
1568 TAG(IMG), TAG(INPUT), TAG(ISINDEX), TAG(LI), TAG(LINK), TAG(LISTING),
1569 TAG(MARQUEE), TAG(MENU), TAG(META), TAG(NAV), TAG(NOEMBED),
1570 TAG(NOFRAMES), TAG(NOSCRIPT), TAG(OBJECT), TAG(OL), TAG(P),
1571 TAG(PARAM), TAG(PLAINTEXT), TAG(PRE), TAG(SCRIPT), TAG(SECTION),
1572 TAG(SELECT), TAG(STYLE), TAG(SUMMARY), TAG(TABLE), TAG(TBODY),
1573 TAG(TD), TAG(TEMPLATE), TAG(TEXTAREA), TAG(TFOOT), TAG(TH),
1574 TAG(THEAD), TAG(TITLE), TAG(TR), TAG(UL), TAG(WBR), TAG(XMP),
1575
1576 TAG_MATHML(MI), TAG_MATHML(MO), TAG_MATHML(MN), TAG_MATHML(MS),
1577 TAG_MATHML(MTEXT), TAG_MATHML(ANNOTATION_XML),
1578
1579 TAG_SVG(FOREIGNOBJECT), TAG_SVG(DESC)});
1580 }
1581
1582 // Implicitly closes currently open elements until it reaches an element with
1583 // the
1584 // specified qualified name. If the elements closed are in the set handled by
1585 // generate_implied_end_tags, this is normal operation and this function returns
1586 // true. Otherwise, a parse error is recorded and this function returns false.
implicitly_close_tags(GumboParser * parser,GumboToken * token,GumboNamespaceEnum target_ns,GumboTag target)1587 static bool implicitly_close_tags(GumboParser* parser, GumboToken* token,
1588 GumboNamespaceEnum target_ns, GumboTag target) {
1589 bool result = true;
1590 generate_implied_end_tags(parser, target);
1591 if (!node_qualified_tag_is(get_current_node(parser), target_ns, target)) {
1592 parser_add_parse_error(parser, token);
1593 while (
1594 !node_qualified_tag_is(get_current_node(parser), target_ns, target)) {
1595 pop_current_node(parser);
1596 }
1597 result = false;
1598 }
1599 assert(node_qualified_tag_is(get_current_node(parser), target_ns, target));
1600 pop_current_node(parser);
1601 return result;
1602 }
1603
1604 // If the stack of open elements has a <p> tag in button scope, this acts as if
1605 // a </p> tag was encountered, implicitly closing tags. Returns false if a
1606 // parse error occurs. This is a convenience function because this particular
1607 // clause appears several times in the spec.
maybe_implicitly_close_p_tag(GumboParser * parser,GumboToken * token)1608 static bool maybe_implicitly_close_p_tag(
1609 GumboParser* parser, GumboToken* token) {
1610 if (has_an_element_in_button_scope(parser, GUMBO_TAG_P)) {
1611 return implicitly_close_tags(
1612 parser, token, GUMBO_NAMESPACE_HTML, GUMBO_TAG_P);
1613 }
1614 return true;
1615 }
1616
1617 // Convenience function to encapsulate the logic for closing <li> or <dd>/<dt>
1618 // tags. Pass true to is_li for handling <li> tags, false for <dd> and <dt>.
maybe_implicitly_close_list_tag(GumboParser * parser,GumboToken * token,bool is_li)1619 static void maybe_implicitly_close_list_tag(
1620 GumboParser* parser, GumboToken* token, bool is_li) {
1621 GumboParserState* state = parser->_parser_state;
1622 state->_frameset_ok = false;
1623 for (int i = state->_open_elements.length; --i >= 0;) {
1624 const GumboNode* node = state->_open_elements.data[i];
1625 bool is_list_tag =
1626 is_li ? node_html_tag_is(node, GUMBO_TAG_LI)
1627 : node_tag_in_set(node, (gumbo_tagset){TAG(DD), TAG(DT)});
1628 if (is_list_tag) {
1629 implicitly_close_tags(
1630 parser, token, node->v.element.tag_namespace, node->v.element.tag);
1631 return;
1632 }
1633 if (is_special_node(node) &&
1634 !node_tag_in_set(
1635 node, (gumbo_tagset){TAG(ADDRESS), TAG(DIV), TAG(P)})) {
1636 return;
1637 }
1638 }
1639 }
1640
// Copies onto `node` every attribute of the start-tag `token` whose name is
// not already present on the node, then destroys the token.
static void merge_attributes(
    GumboParser* parser, GumboToken* token, GumboNode* node) {
  assert(token->type == GUMBO_TOKEN_START_TAG);
  assert(node->type == GUMBO_NODE_ELEMENT);

  const GumboVector* incoming = &token->v.start_tag.attributes;
  GumboVector* existing = &node->v.element.attributes;

  for (unsigned int idx = 0; idx < incoming->length; ++idx) {
    GumboAttribute* candidate = incoming->data[idx];
    if (gumbo_get_attribute(existing, candidate->name)) {
      // The node already has an attribute of this name; the token's copy
      // stays in the token and is freed with it below.
      continue;
    }
    // gumbo_vector_add hands ownership of the attribute to the node, so the
    // token's slot is cleared to keep the attribute from being freed twice.
    gumbo_vector_add(parser, candidate, existing);
    incoming->data[idx] = NULL;
  }

  // Merging means the token itself is being discarded, so its memory must be
  // released here.
  gumbo_token_destroy(parser, token);

#ifndef NDEBUG
  // Sentinel that lets the assertion in the main loop recognise a token that
  // has already been destroyed.
  token->v.start_tag.attributes = kGumboEmptyVector;
#endif
}
1670
gumbo_normalize_svg_tagname(const GumboStringPiece * tag)1671 const char* gumbo_normalize_svg_tagname(const GumboStringPiece* tag) {
1672 for (size_t i = 0; i < sizeof(kSvgTagReplacements) / sizeof(ReplacementEntry);
1673 ++i) {
1674 const ReplacementEntry* entry = &kSvgTagReplacements[i];
1675 if (gumbo_string_equals_ignore_case(tag, &entry->from)) {
1676 return entry->to.data;
1677 }
1678 }
1679 return NULL;
1680 }
1681
1682 // http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#adjust-foreign-attributes
1683 // This destructively modifies any matching attributes on the token and sets the
1684 // namespace appropriately.
adjust_foreign_attributes(GumboParser * parser,GumboToken * token)1685 static void adjust_foreign_attributes(GumboParser* parser, GumboToken* token) {
1686 assert(token->type == GUMBO_TOKEN_START_TAG);
1687 const GumboVector* attributes = &token->v.start_tag.attributes;
1688 for (size_t i = 0; i < sizeof(kForeignAttributeReplacements) /
1689 sizeof(NamespacedAttributeReplacement);
1690 ++i) {
1691 const NamespacedAttributeReplacement* entry =
1692 &kForeignAttributeReplacements[i];
1693 GumboAttribute* attr = gumbo_get_attribute(attributes, entry->from);
1694 if (!attr) {
1695 continue;
1696 }
1697 gumbo_parser_deallocate(parser, (void*) attr->name);
1698 attr->attr_namespace = entry->attr_namespace;
1699 attr->name = gumbo_copy_stringz(parser, entry->local_name);
1700 }
1701 }
1702
1703 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#adjust-svg-attributes
1704 // This destructively modifies any matching attributes on the token.
adjust_svg_attributes(GumboParser * parser,GumboToken * token)1705 static void adjust_svg_attributes(GumboParser* parser, GumboToken* token) {
1706 assert(token->type == GUMBO_TOKEN_START_TAG);
1707 const GumboVector* attributes = &token->v.start_tag.attributes;
1708 for (size_t i = 0;
1709 i < sizeof(kSvgAttributeReplacements) / sizeof(ReplacementEntry); ++i) {
1710 const ReplacementEntry* entry = &kSvgAttributeReplacements[i];
1711 GumboAttribute* attr = gumbo_get_attribute(attributes, entry->from.data);
1712 if (!attr) {
1713 continue;
1714 }
1715 gumbo_parser_deallocate(parser, (void*) attr->name);
1716 attr->name = gumbo_copy_stringz(parser, entry->to.data);
1717 }
1718 }
1719
1720 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#adjust-mathml-attributes
1721 // Note that this may destructively modify the token with the new attribute
1722 // value.
adjust_mathml_attributes(GumboParser * parser,GumboToken * token)1723 static void adjust_mathml_attributes(GumboParser* parser, GumboToken* token) {
1724 assert(token->type == GUMBO_TOKEN_START_TAG);
1725 GumboAttribute* attr =
1726 gumbo_get_attribute(&token->v.start_tag.attributes, "definitionurl");
1727 if (!attr) {
1728 return;
1729 }
1730 gumbo_parser_deallocate(parser, (void*) attr->name);
1731 attr->name = gumbo_copy_stringz(parser, "definitionURL");
1732 }
1733
doctype_matches(const GumboTokenDocType * doctype,const GumboStringPiece * public_id,const GumboStringPiece * system_id,bool allow_missing_system_id)1734 static bool doctype_matches(const GumboTokenDocType* doctype,
1735 const GumboStringPiece* public_id, const GumboStringPiece* system_id,
1736 bool allow_missing_system_id) {
1737 return !strcmp(doctype->public_identifier, public_id->data) &&
1738 (allow_missing_system_id || doctype->has_system_identifier) &&
1739 !strcmp(doctype->system_identifier, system_id->data);
1740 }
1741
// Records a parse error for a DOCTYPE token that is neither the plain HTML5
// doctype nor one of the permitted legacy doctypes. Returns true if the
// doctype is acceptable, false if a parse error was added.
//
// A doctype is acceptable when either:
//  * its name is "html", it has no public identifier, and its system
//    identifier is missing or exactly "about:legacy-compat"; or
//  * its name is "html" and its identifiers match one of the HTML 4.0,
//    HTML 4.01, XHTML 1.0 Strict or XHTML 1.1 pairs checked below.
static bool maybe_add_doctype_error(
    GumboParser* parser, const GumboToken* token) {
  const GumboTokenDocType* doctype = &token->v.doc_type;
  const bool html_doctype = !strcmp(doctype->name, kDoctypeHtml.data);

  // A system identifier is only a problem when it is present AND differs from
  // "about:legacy-compat".
  const bool unexpected_system_id =
      doctype->has_system_identifier &&
      strcmp(doctype->system_identifier, kSystemIdLegacyCompat.data) != 0;

  if (html_doctype && !doctype->has_public_identifier &&
      !unexpected_system_id) {
    return true;
  }

  const bool permitted_legacy_doctype =
      html_doctype &&
      (doctype_matches(
           doctype, &kPublicIdHtml4_0, &kSystemIdRecHtml4_0, true) ||
          doctype_matches(doctype, &kPublicIdHtml4_01, &kSystemIdHtml4, true) ||
          doctype_matches(
              doctype, &kPublicIdXhtml1_0, &kSystemIdXhtmlStrict1_1, false) ||
          doctype_matches(
              doctype, &kPublicIdXhtml1_1, &kSystemIdXhtml1_1, false));
  if (permitted_legacy_doctype) {
    return true;
  }

  parser_add_parse_error(parser, token);
  return false;
}
1763
// Detaches `node` from its parent's child list and renumbers the siblings
// that followed it. Does nothing if the node has no parent.
static void remove_from_parent(GumboParser* parser, GumboNode* node) {
  GumboNode* parent = node->parent;
  if (parent == NULL) {
    // A node can be parentless if, for example, it is a freshly cloned copy of
    // an active formatting element. DOM manipulation carries on with the
    // orphaned fragment until it is appended or foster-parented to the common
    // ancestor at the end of the adoption agency algorithm.
    return;
  }
  assert(parent->type == GUMBO_NODE_ELEMENT);

  GumboVector* siblings = &parent->v.element.children;
  const int position = gumbo_vector_index_of(siblings, node);
  assert(position != -1);

  gumbo_vector_remove_at(parser, position, siblings);
  node->parent = NULL;
  node->index_within_parent = -1;

  // Everything after the removed slot has shifted down by one.
  for (unsigned int idx = position; idx < siblings->length; ++idx) {
    GumboNode* sibling = siblings->data[idx];
    sibling->index_within_parent = idx;
  }
}
1785
// The "adoption agency algorithm", used to repair misnested formatting
// elements:
// http://www.whatwg.org/specs/web-apps/current-work/multipage/the-end.html#an-introduction-to-error-handling-and-strange-cases-in-the-parser
// Also described in the "in body" handling for end formatting tags.
//
// `token` is the token being processed (used only for parse-error reporting)
// and `subject` is the tag name of the formatting element being closed.
//
// Return value, as implemented here: every early exit from the algorithm
// returns false (whether or not a parse error was recorded); true is returned
// only when the outer loop has run its full 8 iterations.
// NOTE(review): what callers do with the result is not visible in this block.
static bool adoption_agency_algorithm(
    GumboParser* parser, GumboToken* token, GumboTag subject) {
  GumboParserState* state = parser->_parser_state;
  gumbo_debug("Entering adoption agency algorithm.\n");
  // Step 1. If the current node is an HTML element with the subject tag and is
  // not in the list of active formatting elements, just pop it and stop.
  GumboNode* current_node = get_current_node(parser);
  if (current_node->v.element.tag_namespace == GUMBO_NAMESPACE_HTML &&
      current_node->v.element.tag == subject &&
      gumbo_vector_index_of(
          &state->_active_formatting_elements, current_node) == -1) {
    pop_current_node(parser);
    return false;
  }
  // Steps 2-4 & 20: the outer loop runs at most 8 times.
  for (unsigned int i = 0; i < 8; ++i) {
    // Step 5. Find the formatting element: the last entry in the list of
    // active formatting elements, after the last scope marker, that has the
    // subject tag.
    GumboNode* formatting_node = NULL;
    int formatting_node_in_open_elements = -1;
    for (int j = state->_active_formatting_elements.length; --j >= 0;) {
      // Note: this declaration shadows the outer current_node.
      GumboNode* current_node = state->_active_formatting_elements.data[j];
      if (current_node == &kActiveFormattingScopeMarker) {
        gumbo_debug("Broke on scope marker; aborting.\n");
        // Last scope marker; abort the algorithm.
        return false;
      }
      if (node_html_tag_is(current_node, subject)) {
        // Found it.
        formatting_node = current_node;
        formatting_node_in_open_elements =
            gumbo_vector_index_of(&state->_open_elements, formatting_node);
        gumbo_debug("Formatting element of tag %s at %d.\n",
            gumbo_normalized_tagname(subject),
            formatting_node_in_open_elements);
        break;
      }
    }
    if (!formatting_node) {
      // No matching tag; not a parse error outright, but fall through to the
      // "any other end tag" clause (which may potentially add a parse error,
      // but not always).
      gumbo_debug("No active formatting elements; aborting.\n");
      return false;
    }

    // Step 6. The formatting element is in the active formatting list but not
    // on the stack of open elements: parse error, drop it from the list.
    if (formatting_node_in_open_elements == -1) {
      gumbo_debug("Formatting node not on stack of open elements.\n");
      parser_add_parse_error(parser, token);
      gumbo_vector_remove(
          parser, formatting_node, &state->_active_formatting_elements);
      return false;
    }

    // Step 7. The formatting element is open but not in scope: parse error.
    if (!has_an_element_in_scope(parser, formatting_node->v.element.tag)) {
      parser_add_parse_error(parser, token);
      gumbo_debug("Element not in scope.\n");
      return false;
    }

    // Step 8. The formatting element is not the current node: parse error.
    if (formatting_node != get_current_node(parser)) {
      parser_add_parse_error(parser, token);  // But continue onwards.
    }
    assert(formatting_node);
    assert(!node_html_tag_is(formatting_node, GUMBO_TAG_HTML));
    assert(!node_html_tag_is(formatting_node, GUMBO_TAG_BODY));

    // Step 9 & 10. The furthest block is the first "special" element found
    // scanning the stack of open elements upward from the formatting element's
    // position.
    GumboNode* furthest_block = NULL;
    for (unsigned int j = formatting_node_in_open_elements;
         j < state->_open_elements.length; ++j) {
      assert(j > 0);
      GumboNode* current = state->_open_elements.data[j];
      if (is_special_node(current)) {
        // Step 9.
        furthest_block = current;
        break;
      }
    }
    if (!furthest_block) {
      // Step 10. No furthest block: pop everything down to the formatting
      // element.
      while (get_current_node(parser) != formatting_node) {
        pop_current_node(parser);
      }
      // And the formatting element itself.
      pop_current_node(parser);
      gumbo_vector_remove(
          parser, formatting_node, &state->_active_formatting_elements);
      return false;
    }
    assert(!node_html_tag_is(furthest_block, GUMBO_TAG_HTML));
    assert(furthest_block);

    // Step 11.
    // Elements may be moved and reparented by this algorithm, so
    // common_ancestor is not necessarily the same as formatting_node->parent.
    // It is the entry immediately below the formatting element on the stack
    // of open elements.
    GumboNode* common_ancestor =
        state->_open_elements.data[gumbo_vector_index_of(&state->_open_elements,
                                       formatting_node) -
                                   1];
    gumbo_debug("Common ancestor tag = %s, furthest block tag = %s.\n",
        gumbo_normalized_tagname(common_ancestor->v.element.tag),
        gumbo_normalized_tagname(furthest_block->v.element.tag));

    // Step 12. The bookmark is an index into the list of active formatting
    // elements, initially just after the formatting element.
    int bookmark = gumbo_vector_index_of(
                       &state->_active_formatting_elements, formatting_node) +
                   1;
    gumbo_debug("Bookmark at %d.\n", bookmark);
    // Step 13.
    GumboNode* node = furthest_block;
    GumboNode* last_node = furthest_block;
    // Must be stored explicitly, in case node is removed from the stack of open
    // elements, to handle step 13.3.
    int saved_node_index = gumbo_vector_index_of(&state->_open_elements, node);
    assert(saved_node_index > 0);
    // Step 13.1. j is the inner loop counter.
    for (int j = 0;;) {
      // Step 13.2.
      ++j;
      // Step 13.3. Move node to the element immediately below it on the stack
      // of open elements. If node has been removed from the stack, fall back
      // to the index remembered from the previous iteration.
      int node_index = gumbo_vector_index_of(&state->_open_elements, node);
      gumbo_debug(
          "Current index: %d, last index: %d.\n", node_index, saved_node_index);
      if (node_index == -1) {
        node_index = saved_node_index;
      }
      saved_node_index = --node_index;
      assert(node_index > 0);
      assert((unsigned int) node_index < state->_open_elements.capacity);
      node = state->_open_elements.data[node_index];
      assert(node->parent);
      if (node == formatting_node) {
        // Step 13.4.
        break;
      }
      int formatting_index =
          gumbo_vector_index_of(&state->_active_formatting_elements, node);
      if (j > 3 && formatting_index != -1) {
        // Step 13.5.
        gumbo_debug("Removing formatting element at %d.\n", formatting_index);
        gumbo_vector_remove_at(
            parser, formatting_index, &state->_active_formatting_elements);
        // Removing the element shifts all indices over by one, so we may need
        // to move the bookmark.
        if (formatting_index < bookmark) {
          --bookmark;
          gumbo_debug("Moving bookmark to %d.\n", bookmark);
        }
        continue;
      }
      if (formatting_index == -1) {
        // Step 13.6. node is not an active formatting element: drop it from
        // the stack of open elements and move on.
        gumbo_vector_remove_at(parser, node_index, &state->_open_elements);
        continue;
      }
      // Step 13.7.
      // "common ancestor as the intended parent" doesn't actually mean insert
      // it into the common ancestor; that happens below.
      // The clone replaces node in both the active formatting list and the
      // stack of open elements.
      node = clone_node(parser, node, GUMBO_INSERTION_ADOPTION_AGENCY_CLONED);
      assert(formatting_index >= 0);
      state->_active_formatting_elements.data[formatting_index] = node;
      assert(node_index >= 0);
      state->_open_elements.data[node_index] = node;
      // Step 13.8.
      if (last_node == furthest_block) {
        bookmark = formatting_index + 1;
        gumbo_debug("Bookmark moved to %d.\n", bookmark);
        assert((unsigned int) bookmark <= state->_active_formatting_elements.length);
      }
      // Step 13.9. Reparent last_node under the clone.
      last_node->parse_flags |= GUMBO_INSERTION_ADOPTION_AGENCY_MOVED;
      remove_from_parent(parser, last_node);
      append_node(parser, node, last_node);
      // Step 13.10.
      last_node = node;
    }  // Step 13.11.

    // Step 14. Insert last_node at the appropriate place relative to the
    // common ancestor.
    gumbo_debug("Removing %s node from parent ",
        gumbo_normalized_tagname(last_node->v.element.tag));
    remove_from_parent(parser, last_node);
    last_node->parse_flags |= GUMBO_INSERTION_ADOPTION_AGENCY_MOVED;
    InsertionLocation location =
        get_appropriate_insertion_location(parser, common_ancestor);
    gumbo_debug("and inserting it into %s.\n",
        gumbo_normalized_tagname(location.target->v.element.tag));
    insert_node(parser, last_node, location);

    // Step 15. Clone the formatting element; the original is flagged as
    // having an implicit end tag.
    GumboNode* new_formatting_node = clone_node(
        parser, formatting_node, GUMBO_INSERTION_ADOPTION_AGENCY_CLONED);
    formatting_node->parse_flags |= GUMBO_INSERTION_IMPLICIT_END_TAG;

    // Step 16. Instead of appending nodes one-by-one, we swap the children
    // vector of furthest_block with the empty children of new_formatting_node,
    // reducing memory traffic and allocations. We still have to reset their
    // parent pointers, though.
    GumboVector temp = new_formatting_node->v.element.children;
    new_formatting_node->v.element.children =
        furthest_block->v.element.children;
    furthest_block->v.element.children = temp;

    temp = new_formatting_node->v.element.children;
    for (unsigned int i = 0; i < temp.length; ++i) {
      GumboNode* child = temp.data[i];
      child->parent = new_formatting_node;
    }

    // Step 17. The new formatting element becomes a child of the furthest
    // block.
    append_node(parser, furthest_block, new_formatting_node);

    // Step 18.
    // If the formatting node was before the bookmark, it may shift over all
    // indices after it, so we need to explicitly find the index and possibly
    // adjust the bookmark.
    int formatting_node_index = gumbo_vector_index_of(
        &state->_active_formatting_elements, formatting_node);
    assert(formatting_node_index != -1);
    if (formatting_node_index < bookmark) {
      gumbo_debug(
          "Formatting node at %d is before bookmark at %d; decrementing.\n",
          formatting_node_index, bookmark);
      --bookmark;
    }
    gumbo_vector_remove_at(
        parser, formatting_node_index, &state->_active_formatting_elements);
    assert(bookmark >= 0);
    assert((unsigned int) bookmark <= state->_active_formatting_elements.length);
    gumbo_vector_insert_at(parser, new_formatting_node, bookmark,
        &state->_active_formatting_elements);

    // Step 19. On the stack of open elements, remove the old formatting
    // element and insert the new one immediately after the furthest block.
    gumbo_vector_remove(parser, formatting_node, &state->_open_elements);
    int insert_at =
        gumbo_vector_index_of(&state->_open_elements, furthest_block) + 1;
    assert(insert_at >= 0);
    assert((unsigned int) insert_at <= state->_open_elements.length);
    gumbo_vector_insert_at(
        parser, new_formatting_node, insert_at, &state->_open_elements);
  }  // Step 20.
  return true;
}
2032
2033 // This is here to clean up memory when the spec says "Ignore current token."
ignore_token(GumboParser * parser)2034 static void ignore_token(GumboParser* parser) {
2035 GumboToken* token = parser->_parser_state->_current_token;
2036 // Ownership of the token's internal buffers are normally transferred to the
2037 // element, but if no element is emitted (as happens in non-verbatim-mode
2038 // when a token is ignored), we need to free it here to prevent a memory
2039 // leak.
2040 gumbo_token_destroy(parser, token);
2041 #ifndef NDEBUG
2042 if (token->type == GUMBO_TOKEN_START_TAG) {
2043 // Mark this sentinel so the assertion in the main loop knows it's been
2044 // destroyed.
2045 token->v.start_tag.attributes = kGumboEmptyVector;
2046 }
2047 #endif
2048 }
2049
2050 // http://www.whatwg.org/specs/web-apps/current-work/complete/the-end.html
finish_parsing(GumboParser * parser)2051 static void finish_parsing(GumboParser* parser) {
2052 gumbo_debug("Finishing parsing");
2053 maybe_flush_text_node_buffer(parser);
2054 GumboParserState* state = parser->_parser_state;
2055 for (GumboNode* node = pop_current_node(parser); node;
2056 node = pop_current_node(parser)) {
2057 if ((node_html_tag_is(node, GUMBO_TAG_BODY) && state->_closed_body_tag) ||
2058 (node_html_tag_is(node, GUMBO_TAG_HTML) && state->_closed_html_tag)) {
2059 continue;
2060 }
2061 node->parse_flags |= GUMBO_INSERTION_IMPLICIT_END_TAG;
2062 }
2063 while (pop_current_node(parser))
2064 ; // Pop them all.
2065 }
2066
// Token handler for the "initial" insertion mode.
//  * Whitespace is ignored.
//  * Comments are appended to the document node.
//  * A DOCTYPE is recorded on the document (name, identifiers, quirks mode)
//    and the parser moves to "before html"; the result is whatever
//    maybe_add_doctype_error reports.
//  * Anything else is a parse error: the document is put in quirks mode, the
//    parser moves to "before html" and the token is reprocessed there.
static bool handle_initial(GumboParser* parser, GumboToken* token) {
  GumboDocument* document = &get_document_node(parser)->v.document;

  switch (token->type) {
    case GUMBO_TOKEN_WHITESPACE:
      ignore_token(parser);
      return true;

    case GUMBO_TOKEN_COMMENT:
      append_comment_node(parser, get_document_node(parser), token);
      return true;

    case GUMBO_TOKEN_DOCTYPE:
      document->has_doctype = true;
      document->name = token->v.doc_type.name;
      document->public_identifier = token->v.doc_type.public_identifier;
      document->system_identifier = token->v.doc_type.system_identifier;
      document->doc_type_quirks_mode = compute_quirks_mode(&token->v.doc_type);
      set_insertion_mode(parser, GUMBO_INSERTION_MODE_BEFORE_HTML);
      return maybe_add_doctype_error(parser, token);

    default:
      break;
  }

  parser_add_parse_error(parser, token);
  document->doc_type_quirks_mode = GUMBO_DOCTYPE_QUIRKS;
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_BEFORE_HTML);
  parser->_parser_state->_reprocess_current_token = true;
  return true;
}
2090
2091 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#the-before-html-insertion-mode
handle_before_html(GumboParser * parser,GumboToken * token)2092 static bool handle_before_html(GumboParser* parser, GumboToken* token) {
2093 if (token->type == GUMBO_TOKEN_DOCTYPE) {
2094 parser_add_parse_error(parser, token);
2095 ignore_token(parser);
2096 return false;
2097 } else if (token->type == GUMBO_TOKEN_COMMENT) {
2098 append_comment_node(parser, get_document_node(parser), token);
2099 return true;
2100 } else if (token->type == GUMBO_TOKEN_WHITESPACE) {
2101 ignore_token(parser);
2102 return true;
2103 } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
2104 GumboNode* html_node = insert_element_from_token(parser, token);
2105 parser->_output->root = html_node;
2106 set_insertion_mode(parser, GUMBO_INSERTION_MODE_BEFORE_HEAD);
2107 return true;
2108 } else if (token->type == GUMBO_TOKEN_END_TAG &&
2109 !tag_in(token, false,
2110 (gumbo_tagset){TAG(HEAD), TAG(BODY), TAG(HTML), TAG(BR)})) {
2111 parser_add_parse_error(parser, token);
2112 ignore_token(parser);
2113 return false;
2114 } else {
2115 GumboNode* html_node = insert_element_of_tag_type(
2116 parser, GUMBO_TAG_HTML, GUMBO_INSERTION_IMPLIED);
2117 assert(html_node);
2118 parser->_output->root = html_node;
2119 set_insertion_mode(parser, GUMBO_INSERTION_MODE_BEFORE_HEAD);
2120 parser->_parser_state->_reprocess_current_token = true;
2121 return true;
2122 }
2123 }
2124
2125 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#the-before-head-insertion-mode
handle_before_head(GumboParser * parser,GumboToken * token)2126 static bool handle_before_head(GumboParser* parser, GumboToken* token) {
2127 if (token->type == GUMBO_TOKEN_DOCTYPE) {
2128 parser_add_parse_error(parser, token);
2129 ignore_token(parser);
2130 return false;
2131 } else if (token->type == GUMBO_TOKEN_COMMENT) {
2132 append_comment_node(parser, get_current_node(parser), token);
2133 return true;
2134 } else if (token->type == GUMBO_TOKEN_WHITESPACE) {
2135 ignore_token(parser);
2136 return true;
2137 } else if (tag_is(token, kStartTag, GUMBO_TAG_HEAD)) {
2138 GumboNode* node = insert_element_from_token(parser, token);
2139 set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD);
2140 parser->_parser_state->_head_element = node;
2141 return true;
2142 } else if (token->type == GUMBO_TOKEN_END_TAG &&
2143 !tag_in(token, false,
2144 (gumbo_tagset){TAG(HEAD), TAG(BODY), TAG(HTML), TAG(BR)})) {
2145 parser_add_parse_error(parser, token);
2146 ignore_token(parser);
2147 return false;
2148 } else {
2149 GumboNode* node = insert_element_of_tag_type(
2150 parser, GUMBO_TAG_HEAD, GUMBO_INSERTION_IMPLIED);
2151 set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD);
2152 parser->_parser_state->_head_element = node;
2153 parser->_parser_state->_reprocess_current_token = true;
2154 return true;
2155 }
2156 }
2157
2158 // Forward declarations because of mutual dependencies.
2159 static bool handle_token(GumboParser* parser, GumboToken* token);
2160 static bool handle_in_body(GumboParser* parser, GumboToken* token);
2161
2162 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inhead
handle_in_head(GumboParser * parser,GumboToken * token)2163 static bool handle_in_head(GumboParser* parser, GumboToken* token) {
2164 if (token->type == GUMBO_TOKEN_WHITESPACE) {
2165 insert_text_token(parser, token);
2166 return true;
2167 } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
2168 parser_add_parse_error(parser, token);
2169 ignore_token(parser);
2170 return false;
2171 } else if (token->type == GUMBO_TOKEN_COMMENT) {
2172 append_comment_node(parser, get_current_node(parser), token);
2173 return true;
2174 } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
2175 return handle_in_body(parser, token);
2176 } else if (tag_in(token, kStartTag,
2177 (gumbo_tagset){TAG(BASE), TAG(BASEFONT), TAG(BGSOUND),
2178 TAG(MENUITEM), TAG(LINK)})) {
2179 insert_element_from_token(parser, token);
2180 pop_current_node(parser);
2181 acknowledge_self_closing_tag(parser);
2182 return true;
2183 } else if (tag_is(token, kStartTag, GUMBO_TAG_META)) {
2184 insert_element_from_token(parser, token);
2185 pop_current_node(parser);
2186 acknowledge_self_closing_tag(parser);
2187 // NOTE(jdtang): Gumbo handles only UTF-8, so the encoding clause of the
2188 // spec doesn't apply. If clients want to handle meta-tag re-encoding, they
2189 // should specifically look for that string in the document and re-encode it
2190 // before passing to Gumbo.
2191 return true;
2192 } else if (tag_is(token, kStartTag, GUMBO_TAG_TITLE)) {
2193 run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RCDATA);
2194 return true;
2195 } else if (tag_in(
2196 token, kStartTag, (gumbo_tagset){TAG(NOFRAMES), TAG(STYLE)})) {
2197 run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
2198 return true;
2199 } else if (tag_is(token, kStartTag, GUMBO_TAG_NOSCRIPT)) {
2200 insert_element_from_token(parser, token);
2201 set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD_NOSCRIPT);
2202 return true;
2203 } else if (tag_is(token, kStartTag, GUMBO_TAG_SCRIPT)) {
2204 run_generic_parsing_algorithm(parser, token, GUMBO_LEX_SCRIPT);
2205 return true;
2206 } else if (tag_is(token, kEndTag, GUMBO_TAG_HEAD)) {
2207 GumboNode* head = pop_current_node(parser);
2208 AVOID_UNUSED_VARIABLE_WARNING(head);
2209 assert(node_html_tag_is(head, GUMBO_TAG_HEAD));
2210 set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_HEAD);
2211 return true;
2212 } else if (tag_in(token, kEndTag,
2213 (gumbo_tagset){TAG(BODY), TAG(HTML), TAG(BR)})) {
2214 pop_current_node(parser);
2215 set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_HEAD);
2216 parser->_parser_state->_reprocess_current_token = true;
2217 return true;
2218 } else if (tag_is(token, kStartTag, GUMBO_TAG_TEMPLATE)) {
2219 insert_element_from_token(parser, token);
2220 add_formatting_element(parser, &kActiveFormattingScopeMarker);
2221 parser->_parser_state->_frameset_ok = false;
2222 set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TEMPLATE);
2223 push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TEMPLATE);
2224 return true;
2225 } else if (tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) {
2226 if (!has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
2227 parser_add_parse_error(parser, token);
2228 ignore_token(parser);
2229 return false;
2230 }
2231 generate_all_implied_end_tags_thoroughly(parser);
2232 bool success = true;
2233 if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_TEMPLATE)) {
2234 parser_add_parse_error(parser, token);
2235 success = false;
2236 }
2237 while (!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_TEMPLATE))
2238 ;
2239 clear_active_formatting_elements(parser);
2240 pop_template_insertion_mode(parser);
2241 reset_insertion_mode_appropriately(parser);
2242 return success;
2243 } else if (tag_is(token, kStartTag, GUMBO_TAG_HEAD) ||
2244 (token->type == GUMBO_TOKEN_END_TAG)) {
2245 parser_add_parse_error(parser, token);
2246 ignore_token(parser);
2247 return false;
2248 } else {
2249 pop_current_node(parser);
2250 set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_HEAD);
2251 parser->_parser_state->_reprocess_current_token = true;
2252 return true;
2253 }
2254 return true;
2255 }
2256
2257 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inheadnoscript
handle_in_head_noscript(GumboParser * parser,GumboToken * token)2258 static bool handle_in_head_noscript(GumboParser* parser, GumboToken* token) {
2259 if (token->type == GUMBO_TOKEN_DOCTYPE) {
2260 parser_add_parse_error(parser, token);
2261 return false;
2262 } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
2263 return handle_in_body(parser, token);
2264 } else if (tag_is(token, kEndTag, GUMBO_TAG_NOSCRIPT)) {
2265 const GumboNode* node = pop_current_node(parser);
2266 assert(node_html_tag_is(node, GUMBO_TAG_NOSCRIPT));
2267 AVOID_UNUSED_VARIABLE_WARNING(node);
2268 set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD);
2269 return true;
2270 } else if (token->type == GUMBO_TOKEN_WHITESPACE ||
2271 token->type == GUMBO_TOKEN_COMMENT ||
2272 tag_in(token, kStartTag,
2273 (gumbo_tagset){TAG(BASEFONT), TAG(BGSOUND), TAG(LINK),
2274 TAG(META), TAG(NOFRAMES), TAG(STYLE)})) {
2275 return handle_in_head(parser, token);
2276 } else if (tag_in(
2277 token, kStartTag, (gumbo_tagset){TAG(HEAD), TAG(NOSCRIPT)}) ||
2278 (token->type == GUMBO_TOKEN_END_TAG &&
2279 !tag_is(token, kEndTag, GUMBO_TAG_BR))) {
2280 parser_add_parse_error(parser, token);
2281 ignore_token(parser);
2282 return false;
2283 } else {
2284 parser_add_parse_error(parser, token);
2285 const GumboNode* node = pop_current_node(parser);
2286 assert(node_html_tag_is(node, GUMBO_TAG_NOSCRIPT));
2287 AVOID_UNUSED_VARIABLE_WARNING(node);
2288 set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD);
2289 parser->_parser_state->_reprocess_current_token = true;
2290 return false;
2291 }
2292 }
2293
2294 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#the-after-head-insertion-mode
handle_after_head(GumboParser * parser,GumboToken * token)2295 static bool handle_after_head(GumboParser* parser, GumboToken* token) {
2296 GumboParserState* state = parser->_parser_state;
2297 if (token->type == GUMBO_TOKEN_WHITESPACE) {
2298 insert_text_token(parser, token);
2299 return true;
2300 } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
2301 parser_add_parse_error(parser, token);
2302 ignore_token(parser);
2303 return false;
2304 } else if (token->type == GUMBO_TOKEN_COMMENT) {
2305 append_comment_node(parser, get_current_node(parser), token);
2306 return true;
2307 } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
2308 return handle_in_body(parser, token);
2309 } else if (tag_is(token, kStartTag, GUMBO_TAG_BODY)) {
2310 insert_element_from_token(parser, token);
2311 state->_frameset_ok = false;
2312 set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
2313 return true;
2314 } else if (tag_is(token, kStartTag, GUMBO_TAG_FRAMESET)) {
2315 insert_element_from_token(parser, token);
2316 set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_FRAMESET);
2317 return true;
2318 } else if (tag_in(token, kStartTag,
2319 (gumbo_tagset){TAG(BASE), TAG(BASEFONT), TAG(BGSOUND),
2320 TAG(LINK), TAG(META), TAG(NOFRAMES), TAG(SCRIPT),
2321 TAG(STYLE), TAG(TEMPLATE), TAG(TITLE)})) {
2322 parser_add_parse_error(parser, token);
2323 assert(state->_head_element != NULL);
2324 // This must be flushed before we push the head element on, as there may be
2325 // pending character tokens that should be attached to the root.
2326 maybe_flush_text_node_buffer(parser);
2327 gumbo_vector_add(parser, state->_head_element, &state->_open_elements);
2328 bool result = handle_in_head(parser, token);
2329 gumbo_vector_remove(parser, state->_head_element, &state->_open_elements);
2330 return result;
2331 } else if (tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) {
2332 return handle_in_head(parser, token);
2333 } else if (tag_is(token, kStartTag, GUMBO_TAG_HEAD) ||
2334 (token->type == GUMBO_TOKEN_END_TAG &&
2335 !tag_in(token, kEndTag,
2336 (gumbo_tagset){TAG(BODY), TAG(HTML), TAG(BR)}))) {
2337 parser_add_parse_error(parser, token);
2338 ignore_token(parser);
2339 return false;
2340 } else {
2341 insert_element_of_tag_type(parser, GUMBO_TAG_BODY, GUMBO_INSERTION_IMPLIED);
2342 set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
2343 state->_reprocess_current_token = true;
2344 return true;
2345 }
2346 }
2347
// Tears down `node` and everything reachable from it, handing each buffer
// back through gumbo_parser_deallocate. Child nodes are destroyed
// recursively, so the recursion goes as deep as the subtree under `node`.
// The node itself is deallocated last.
static void destroy_node(GumboParser* parser, GumboNode* node) {
  switch (node->type) {
    case GUMBO_NODE_DOCUMENT: {
      GumboDocument* document = &node->v.document;
      // Children first, then the child array, then the doctype strings.
      for (unsigned int child = 0; child < document->children.length;
           ++child) {
        destroy_node(parser, document->children.data[child]);
      }
      gumbo_parser_deallocate(parser, (void*) document->children.data);
      gumbo_parser_deallocate(parser, (void*) document->name);
      gumbo_parser_deallocate(parser, (void*) document->public_identifier);
      gumbo_parser_deallocate(parser, (void*) document->system_identifier);
    } break;

    case GUMBO_NODE_TEMPLATE:
    case GUMBO_NODE_ELEMENT: {
      // Attributes are released before the child subtree.
      GumboVector* attributes = &node->v.element.attributes;
      for (unsigned int attr = 0; attr < attributes->length; ++attr) {
        gumbo_destroy_attribute(parser, attributes->data[attr]);
      }
      gumbo_parser_deallocate(parser, attributes->data);

      GumboVector* children = &node->v.element.children;
      for (unsigned int child = 0; child < children->length; ++child) {
        destroy_node(parser, children->data[child]);
      }
      gumbo_parser_deallocate(parser, children->data);
    } break;

    case GUMBO_NODE_TEXT:
    case GUMBO_NODE_CDATA:
    case GUMBO_NODE_COMMENT:
    case GUMBO_NODE_WHITESPACE:
      // These node types only carry a text buffer.
      gumbo_parser_deallocate(parser, (void*) node->v.text.text);
      break;
  }
  gumbo_parser_deallocate(parser, node);
}
2380
2381 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inbody
handle_in_body(GumboParser * parser,GumboToken * token)2382 static bool handle_in_body(GumboParser* parser, GumboToken* token) {
2383 GumboParserState* state = parser->_parser_state;
2384 assert(state->_open_elements.length > 0);
2385 if (token->type == GUMBO_TOKEN_NULL) {
2386 parser_add_parse_error(parser, token);
2387 ignore_token(parser);
2388 return false;
2389 } else if (token->type == GUMBO_TOKEN_WHITESPACE) {
2390 reconstruct_active_formatting_elements(parser);
2391 insert_text_token(parser, token);
2392 return true;
2393 } else if (token->type == GUMBO_TOKEN_CHARACTER ||
2394 token->type == GUMBO_TOKEN_CDATA) {
2395 reconstruct_active_formatting_elements(parser);
2396 insert_text_token(parser, token);
2397 set_frameset_not_ok(parser);
2398 return true;
2399 } else if (token->type == GUMBO_TOKEN_COMMENT) {
2400 append_comment_node(parser, get_current_node(parser), token);
2401 return true;
2402 } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
2403 parser_add_parse_error(parser, token);
2404 ignore_token(parser);
2405 return false;
2406 } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
2407 parser_add_parse_error(parser, token);
2408 if (has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
2409 ignore_token(parser);
2410 return false;
2411 }
2412 assert(parser->_output->root != NULL);
2413 assert(parser->_output->root->type == GUMBO_NODE_ELEMENT);
2414 merge_attributes(parser, token, parser->_output->root);
2415 return false;
2416 } else if (tag_in(token, kStartTag,
2417 (gumbo_tagset){TAG(BASE), TAG(BASEFONT), TAG(BGSOUND),
2418 TAG(MENUITEM), TAG(LINK), TAG(META), TAG(NOFRAMES),
2419 TAG(SCRIPT), TAG(STYLE), TAG(TEMPLATE), TAG(TITLE)}) ||
2420 tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) {
2421 return handle_in_head(parser, token);
2422 } else if (tag_is(token, kStartTag, GUMBO_TAG_BODY)) {
2423 parser_add_parse_error(parser, token);
2424 if (state->_open_elements.length < 2 ||
2425 !node_html_tag_is(state->_open_elements.data[1], GUMBO_TAG_BODY) ||
2426 has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
2427 ignore_token(parser);
2428 return false;
2429 }
2430 state->_frameset_ok = false;
2431 merge_attributes(parser, token, state->_open_elements.data[1]);
2432 return false;
2433 } else if (tag_is(token, kStartTag, GUMBO_TAG_FRAMESET)) {
2434 parser_add_parse_error(parser, token);
2435 if (state->_open_elements.length < 2 ||
2436 !node_html_tag_is(state->_open_elements.data[1], GUMBO_TAG_BODY) ||
2437 !state->_frameset_ok) {
2438 ignore_token(parser);
2439 return false;
2440 }
2441 // Save the body node for later removal.
2442 GumboNode* body_node = state->_open_elements.data[1];
2443
2444 // Pop all nodes except root HTML element.
2445 GumboNode* node;
2446 do {
2447 node = pop_current_node(parser);
2448 } while (node != state->_open_elements.data[1]);
2449
2450 // Removing & destroying the body node is going to kill any nodes that have
2451 // been added to the list of active formatting elements, and so we should
2452 // clear it to prevent a use-after-free if the list of active formatting
2453 // elements is reconstructed afterwards. This may happen if whitespace
2454 // follows the </frameset>.
2455 clear_active_formatting_elements(parser);
2456
2457 // Remove the body node. We may want to factor this out into a generic
2458 // helper, but right now this is the only code that needs to do this.
2459 GumboVector* children = &parser->_output->root->v.element.children;
2460 for (unsigned int i = 0; i < children->length; ++i) {
2461 if (children->data[i] == body_node) {
2462 gumbo_vector_remove_at(parser, i, children);
2463 break;
2464 }
2465 }
2466 destroy_node(parser, body_node);
2467
2468 // Insert the <frameset>, and switch the insertion mode.
2469 insert_element_from_token(parser, token);
2470 set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_FRAMESET);
2471 return true;
2472 } else if (token->type == GUMBO_TOKEN_EOF) {
2473 for (unsigned int i = 0; i < state->_open_elements.length; ++i) {
2474 if (!node_tag_in_set(state->_open_elements.data[i],
2475 (gumbo_tagset){TAG(DD), TAG(DT), TAG(LI), TAG(P), TAG(TBODY),
2476 TAG(TD), TAG(TFOOT), TAG(TH), TAG(THEAD), TAG(TR), TAG(BODY),
2477 TAG(HTML)})) {
2478 parser_add_parse_error(parser, token);
2479 }
2480 }
2481 if (get_current_template_insertion_mode(parser) !=
2482 GUMBO_INSERTION_MODE_INITIAL) {
2483 return handle_in_template(parser, token);
2484 }
2485 return true;
2486 } else if (tag_in(token, kEndTag, (gumbo_tagset){TAG(BODY), TAG(HTML)})) {
2487 if (!has_an_element_in_scope(parser, GUMBO_TAG_BODY)) {
2488 parser_add_parse_error(parser, token);
2489 ignore_token(parser);
2490 return false;
2491 }
2492 bool success = true;
2493 for (unsigned int i = 0; i < state->_open_elements.length; ++i) {
2494 if (!node_tag_in_set(state->_open_elements.data[i],
2495 (gumbo_tagset){TAG(DD), TAG(DT), TAG(LI), TAG(OPTGROUP),
2496 TAG(OPTION), TAG(P), TAG(RB), TAG(RP), TAG(RT), TAG(RTC),
2497 TAG(TBODY), TAG(TD), TAG(TFOOT), TAG(TH), TAG(THEAD), TAG(TR),
2498 TAG(BODY), TAG(HTML)})) {
2499 parser_add_parse_error(parser, token);
2500 success = false;
2501 break;
2502 }
2503 }
2504 set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_BODY);
2505 if (tag_is(token, kEndTag, GUMBO_TAG_HTML)) {
2506 parser->_parser_state->_reprocess_current_token = true;
2507 } else {
2508 GumboNode* body = state->_open_elements.data[1];
2509 assert(node_html_tag_is(body, GUMBO_TAG_BODY));
2510 record_end_of_element(state->_current_token, &body->v.element);
2511 }
2512 return success;
2513 } else if (tag_in(token, kStartTag,
2514 (gumbo_tagset){TAG(ADDRESS), TAG(ARTICLE), TAG(ASIDE),
2515 TAG(BLOCKQUOTE), TAG(CENTER), TAG(DETAILS), TAG(DIR),
2516 TAG(DIV), TAG(DL), TAG(FIELDSET), TAG(FIGCAPTION),
2517 TAG(FIGURE), TAG(FOOTER), TAG(HEADER), TAG(HGROUP),
2518 TAG(MENU), TAG(MAIN), TAG(NAV), TAG(OL), TAG(P),
2519 TAG(SECTION), TAG(SUMMARY), TAG(UL)})) {
2520 bool result = maybe_implicitly_close_p_tag(parser, token);
2521 insert_element_from_token(parser, token);
2522 return result;
2523 } else if (tag_in(token, kStartTag, (gumbo_tagset){TAG(H1), TAG(H2), TAG(H3),
2524 TAG(H4), TAG(H5), TAG(H6)})) {
2525 bool result = maybe_implicitly_close_p_tag(parser, token);
2526 if (node_tag_in_set(
2527 get_current_node(parser), (gumbo_tagset){TAG(H1), TAG(H2), TAG(H3),
2528 TAG(H4), TAG(H5), TAG(H6)})) {
2529 parser_add_parse_error(parser, token);
2530 pop_current_node(parser);
2531 result = false;
2532 }
2533 insert_element_from_token(parser, token);
2534 return result;
2535 } else if (tag_in(token, kStartTag, (gumbo_tagset){TAG(PRE), TAG(LISTING)})) {
2536 bool result = maybe_implicitly_close_p_tag(parser, token);
2537 insert_element_from_token(parser, token);
2538 state->_ignore_next_linefeed = true;
2539 state->_frameset_ok = false;
2540 return result;
2541 } else if (tag_is(token, kStartTag, GUMBO_TAG_FORM)) {
2542 if (state->_form_element != NULL &&
2543 !has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
2544 gumbo_debug("Ignoring nested form.\n");
2545 parser_add_parse_error(parser, token);
2546 ignore_token(parser);
2547 return false;
2548 }
2549 bool result = maybe_implicitly_close_p_tag(parser, token);
2550 GumboNode* form_element = insert_element_from_token(parser, token);
2551 if (!has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
2552 state->_form_element = form_element;
2553 }
2554 return result;
2555 } else if (tag_is(token, kStartTag, GUMBO_TAG_LI)) {
2556 maybe_implicitly_close_list_tag(parser, token, true);
2557 bool result = maybe_implicitly_close_p_tag(parser, token);
2558 insert_element_from_token(parser, token);
2559 return result;
2560 } else if (tag_in(token, kStartTag, (gumbo_tagset){TAG(DD), TAG(DT)})) {
2561 maybe_implicitly_close_list_tag(parser, token, false);
2562 bool result = maybe_implicitly_close_p_tag(parser, token);
2563 insert_element_from_token(parser, token);
2564 return result;
2565 } else if (tag_is(token, kStartTag, GUMBO_TAG_PLAINTEXT)) {
2566 bool result = maybe_implicitly_close_p_tag(parser, token);
2567 insert_element_from_token(parser, token);
2568 gumbo_tokenizer_set_state(parser, GUMBO_LEX_PLAINTEXT);
2569 return result;
2570 } else if (tag_is(token, kStartTag, GUMBO_TAG_BUTTON)) {
2571 if (has_an_element_in_scope(parser, GUMBO_TAG_BUTTON)) {
2572 parser_add_parse_error(parser, token);
2573 implicitly_close_tags(
2574 parser, token, GUMBO_NAMESPACE_HTML, GUMBO_TAG_BUTTON);
2575 state->_reprocess_current_token = true;
2576 return false;
2577 }
2578 reconstruct_active_formatting_elements(parser);
2579 insert_element_from_token(parser, token);
2580 state->_frameset_ok = false;
2581 return true;
2582 } else if (tag_in(token, kEndTag,
2583 (gumbo_tagset){TAG(ADDRESS), TAG(ARTICLE), TAG(ASIDE),
2584 TAG(BLOCKQUOTE), TAG(BUTTON), TAG(CENTER), TAG(DETAILS),
2585 TAG(DIR), TAG(DIV), TAG(DL), TAG(FIELDSET),
2586 TAG(FIGCAPTION), TAG(FIGURE), TAG(FOOTER), TAG(HEADER),
2587 TAG(HGROUP), TAG(LISTING), TAG(MAIN), TAG(MENU), TAG(NAV),
2588 TAG(OL), TAG(PRE), TAG(SECTION), TAG(SUMMARY), TAG(UL)})) {
2589 GumboTag tag = token->v.end_tag;
2590 if (!has_an_element_in_scope(parser, tag)) {
2591 parser_add_parse_error(parser, token);
2592 ignore_token(parser);
2593 return false;
2594 }
2595 implicitly_close_tags(
2596 parser, token, GUMBO_NAMESPACE_HTML, token->v.end_tag);
2597 return true;
2598 } else if (tag_is(token, kEndTag, GUMBO_TAG_FORM)) {
2599 if (has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
2600 if (!has_an_element_in_scope(parser, GUMBO_TAG_FORM)) {
2601 parser_add_parse_error(parser, token);
2602 ignore_token(parser);
2603 return false;
2604 }
2605 bool success = true;
2606 generate_implied_end_tags(parser, GUMBO_TAG_LAST);
2607 if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_FORM)) {
2608 parser_add_parse_error(parser, token);
2609 return false;
2610 }
2611 while (!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_FORM))
2612 ;
2613 return success;
2614 } else {
2615 bool result = true;
2616 const GumboNode* node = state->_form_element;
2617 assert(!node || node->type == GUMBO_NODE_ELEMENT);
2618 state->_form_element = NULL;
2619 if (!node || !has_node_in_scope(parser, node)) {
2620 gumbo_debug("Closing an unopened form.\n");
2621 parser_add_parse_error(parser, token);
2622 ignore_token(parser);
2623 return false;
2624 }
2625 // This differs from implicitly_close_tags because we remove *only* the
2626 // <form> element; other nodes are left in scope.
2627 generate_implied_end_tags(parser, GUMBO_TAG_LAST);
2628 if (get_current_node(parser) != node) {
2629 parser_add_parse_error(parser, token);
2630 result = false;
2631 }
2632
2633 GumboVector* open_elements = &state->_open_elements;
2634 int index = gumbo_vector_index_of(open_elements, node);
2635 assert(index >= 0);
2636 gumbo_vector_remove_at(parser, index, open_elements);
2637 return result;
2638 }
2639 } else if (tag_is(token, kEndTag, GUMBO_TAG_P)) {
2640 if (!has_an_element_in_button_scope(parser, GUMBO_TAG_P)) {
2641 parser_add_parse_error(parser, token);
2642 // reconstruct_active_formatting_elements(parser);
2643 insert_element_of_tag_type(
2644 parser, GUMBO_TAG_P, GUMBO_INSERTION_CONVERTED_FROM_END_TAG);
2645 state->_reprocess_current_token = true;
2646 return false;
2647 }
2648 return implicitly_close_tags(
2649 parser, token, GUMBO_NAMESPACE_HTML, GUMBO_TAG_P);
2650 } else if (tag_is(token, kEndTag, GUMBO_TAG_LI)) {
2651 if (!has_an_element_in_list_scope(parser, GUMBO_TAG_LI)) {
2652 parser_add_parse_error(parser, token);
2653 ignore_token(parser);
2654 return false;
2655 }
2656 return implicitly_close_tags(
2657 parser, token, GUMBO_NAMESPACE_HTML, GUMBO_TAG_LI);
2658 } else if (tag_in(token, kEndTag, (gumbo_tagset){TAG(DD), TAG(DT)})) {
2659 assert(token->type == GUMBO_TOKEN_END_TAG);
2660 GumboTag token_tag = token->v.end_tag;
2661 if (!has_an_element_in_scope(parser, token_tag)) {
2662 parser_add_parse_error(parser, token);
2663 ignore_token(parser);
2664 return false;
2665 }
2666 return implicitly_close_tags(
2667 parser, token, GUMBO_NAMESPACE_HTML, token_tag);
2668 } else if (tag_in(token, kEndTag, (gumbo_tagset){TAG(H1), TAG(H2), TAG(H3),
2669 TAG(H4), TAG(H5), TAG(H6)})) {
2670 if (!has_an_element_in_scope_with_tagname(
2671 parser, 6, (GumboTag[]){GUMBO_TAG_H1, GUMBO_TAG_H2, GUMBO_TAG_H3,
2672 GUMBO_TAG_H4, GUMBO_TAG_H5, GUMBO_TAG_H6})) {
2673 // No heading open; ignore the token entirely.
2674 parser_add_parse_error(parser, token);
2675 ignore_token(parser);
2676 return false;
2677 } else {
2678 generate_implied_end_tags(parser, GUMBO_TAG_LAST);
2679 const GumboNode* current_node = get_current_node(parser);
2680 bool success = node_html_tag_is(current_node, token->v.end_tag);
2681 if (!success) {
2682 // There're children of the heading currently open; close them below and
2683 // record a parse error.
2684 // TODO(jdtang): Add a way to distinguish this error case from the one
2685 // above.
2686 parser_add_parse_error(parser, token);
2687 }
2688 do {
2689 current_node = pop_current_node(parser);
2690 } while (!node_tag_in_set(
2691 current_node, (gumbo_tagset){TAG(H1), TAG(H2), TAG(H3),
2692 TAG(H4), TAG(H5), TAG(H6)}));
2693 return success;
2694 }
2695 } else if (tag_is(token, kStartTag, GUMBO_TAG_A)) {
2696 bool success = true;
2697 int last_a;
2698 int has_matching_a = find_last_anchor_index(parser, &last_a);
2699 if (has_matching_a) {
2700 assert(has_matching_a == 1);
2701 parser_add_parse_error(parser, token);
2702 adoption_agency_algorithm(parser, token, GUMBO_TAG_A);
2703 // The adoption agency algorithm usually removes all instances of <a>
2704 // from the list of active formatting elements, but in case it doesn't,
2705 // we're supposed to do this. (The conditions where it might not are
2706 // listed in the spec.)
2707 if (find_last_anchor_index(parser, &last_a)) {
2708 void* last_element = gumbo_vector_remove_at(
2709 parser, last_a, &state->_active_formatting_elements);
2710 gumbo_vector_remove(parser, last_element, &state->_open_elements);
2711 }
2712 success = false;
2713 }
2714 reconstruct_active_formatting_elements(parser);
2715 add_formatting_element(parser, insert_element_from_token(parser, token));
2716 return success;
2717 } else if (tag_in(token, kStartTag,
2718 (gumbo_tagset){TAG(B), TAG(BIG), TAG(CODE), TAG(EM), TAG(FONT),
2719 TAG(I), TAG(S), TAG(SMALL), TAG(STRIKE), TAG(STRONG),
2720 TAG(TT), TAG(U)})) {
2721 reconstruct_active_formatting_elements(parser);
2722 add_formatting_element(parser, insert_element_from_token(parser, token));
2723 return true;
2724 } else if (tag_is(token, kStartTag, GUMBO_TAG_NOBR)) {
2725 bool result = true;
2726 reconstruct_active_formatting_elements(parser);
2727 if (has_an_element_in_scope(parser, GUMBO_TAG_NOBR)) {
2728 result = false;
2729 parser_add_parse_error(parser, token);
2730 adoption_agency_algorithm(parser, token, GUMBO_TAG_NOBR);
2731 reconstruct_active_formatting_elements(parser);
2732 }
2733 insert_element_from_token(parser, token);
2734 add_formatting_element(parser, get_current_node(parser));
2735 return result;
2736 } else if (tag_in(token, kEndTag,
2737 (gumbo_tagset){TAG(A), TAG(B), TAG(BIG), TAG(CODE), TAG(EM),
2738 TAG(FONT), TAG(I), TAG(NOBR), TAG(S), TAG(SMALL),
2739 TAG(STRIKE), TAG(STRONG), TAG(TT), TAG(U)})) {
2740 return adoption_agency_algorithm(parser, token, token->v.end_tag);
2741 } else if (tag_in(token, kStartTag,
2742 (gumbo_tagset){TAG(APPLET), TAG(MARQUEE), TAG(OBJECT)})) {
2743 reconstruct_active_formatting_elements(parser);
2744 insert_element_from_token(parser, token);
2745 add_formatting_element(parser, &kActiveFormattingScopeMarker);
2746 set_frameset_not_ok(parser);
2747 return true;
2748 } else if (tag_in(token, kEndTag,
2749 (gumbo_tagset){TAG(APPLET), TAG(MARQUEE), TAG(OBJECT)})) {
2750 GumboTag token_tag = token->v.end_tag;
2751 if (!has_an_element_in_table_scope(parser, token_tag)) {
2752 parser_add_parse_error(parser, token);
2753 ignore_token(parser);
2754 return false;
2755 }
2756 implicitly_close_tags(parser, token, GUMBO_NAMESPACE_HTML, token_tag);
2757 clear_active_formatting_elements(parser);
2758 return true;
2759 } else if (tag_is(token, kStartTag, GUMBO_TAG_TABLE)) {
2760 if (get_document_node(parser)->v.document.doc_type_quirks_mode !=
2761 GUMBO_DOCTYPE_QUIRKS) {
2762 maybe_implicitly_close_p_tag(parser, token);
2763 }
2764 insert_element_from_token(parser, token);
2765 set_frameset_not_ok(parser);
2766 set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
2767 return true;
2768 } else if (tag_in(token, kStartTag,
2769 (gumbo_tagset){TAG(AREA), TAG(BR), TAG(EMBED), TAG(IMG),
2770 TAG(IMAGE), TAG(KEYGEN), TAG(WBR)})) {
2771 bool success = true;
2772 if (tag_is(token, kStartTag, GUMBO_TAG_IMAGE)) {
2773 success = false;
2774 parser_add_parse_error(parser, token);
2775 token->v.start_tag.tag = GUMBO_TAG_IMG;
2776 }
2777 reconstruct_active_formatting_elements(parser);
2778 GumboNode* node = insert_element_from_token(parser, token);
2779 if (tag_is(token, kStartTag, GUMBO_TAG_IMAGE)) {
2780 success = false;
2781 parser_add_parse_error(parser, token);
2782 node->v.element.tag = GUMBO_TAG_IMG;
2783 node->parse_flags |= GUMBO_INSERTION_FROM_IMAGE;
2784 }
2785 pop_current_node(parser);
2786 acknowledge_self_closing_tag(parser);
2787 set_frameset_not_ok(parser);
2788 return success;
2789 } else if (tag_is(token, kStartTag, GUMBO_TAG_INPUT)) {
2790 if (!attribute_matches(&token->v.start_tag.attributes, "type", "hidden")) {
2791 // Must be before the element is inserted, as that takes ownership of the
2792 // token's attribute vector.
2793 set_frameset_not_ok(parser);
2794 }
2795 reconstruct_active_formatting_elements(parser);
2796 insert_element_from_token(parser, token);
2797 pop_current_node(parser);
2798 acknowledge_self_closing_tag(parser);
2799 return true;
2800 } else if (tag_in(token, kStartTag,
2801 (gumbo_tagset){TAG(PARAM), TAG(SOURCE), TAG(TRACK)})) {
2802 insert_element_from_token(parser, token);
2803 pop_current_node(parser);
2804 acknowledge_self_closing_tag(parser);
2805 return true;
2806 } else if (tag_is(token, kStartTag, GUMBO_TAG_HR)) {
2807 bool result = maybe_implicitly_close_p_tag(parser, token);
2808 insert_element_from_token(parser, token);
2809 pop_current_node(parser);
2810 acknowledge_self_closing_tag(parser);
2811 set_frameset_not_ok(parser);
2812 return result;
2813 } else if (tag_is(token, kStartTag, GUMBO_TAG_ISINDEX)) {
2814 parser_add_parse_error(parser, token);
2815 if (parser->_parser_state->_form_element != NULL &&
2816 !has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
2817 ignore_token(parser);
2818 return false;
2819 }
2820 acknowledge_self_closing_tag(parser);
2821 maybe_implicitly_close_p_tag(parser, token);
2822 set_frameset_not_ok(parser);
2823
2824 GumboVector* token_attrs = &token->v.start_tag.attributes;
2825 GumboAttribute* prompt_attr = gumbo_get_attribute(token_attrs, "prompt");
2826 GumboAttribute* action_attr = gumbo_get_attribute(token_attrs, "action");
2827 GumboAttribute* name_attr = gumbo_get_attribute(token_attrs, "name");
2828
2829 GumboNode* form = insert_element_of_tag_type(
2830 parser, GUMBO_TAG_FORM, GUMBO_INSERTION_FROM_ISINDEX);
2831 if (!has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
2832 parser->_parser_state->_form_element = form;
2833 }
2834 if (action_attr) {
2835 gumbo_vector_add(parser, action_attr, &form->v.element.attributes);
2836 }
2837 insert_element_of_tag_type(
2838 parser, GUMBO_TAG_HR, GUMBO_INSERTION_FROM_ISINDEX);
2839 pop_current_node(parser); // <hr>
2840
2841 insert_element_of_tag_type(
2842 parser, GUMBO_TAG_LABEL, GUMBO_INSERTION_FROM_ISINDEX);
2843 TextNodeBufferState* text_state = &parser->_parser_state->_text_node;
2844 text_state->_start_original_text = token->original_text.data;
2845 text_state->_start_position = token->position;
2846 text_state->_type = GUMBO_NODE_TEXT;
2847 if (prompt_attr) {
2848 int prompt_attr_length = strlen(prompt_attr->value);
2849 gumbo_string_buffer_destroy(parser, &text_state->_buffer);
2850 text_state->_buffer.data = gumbo_copy_stringz(parser, prompt_attr->value);
2851 text_state->_buffer.length = prompt_attr_length;
2852 text_state->_buffer.capacity = prompt_attr_length + 1;
2853 gumbo_destroy_attribute(parser, prompt_attr);
2854 } else {
2855 GumboStringPiece prompt_text =
2856 GUMBO_STRING("This is a searchable index. Enter search keywords: ");
2857 gumbo_string_buffer_append_string(
2858 parser, &prompt_text, &text_state->_buffer);
2859 }
2860
2861 GumboNode* input = insert_element_of_tag_type(
2862 parser, GUMBO_TAG_INPUT, GUMBO_INSERTION_FROM_ISINDEX);
2863 for (unsigned int i = 0; i < token_attrs->length; ++i) {
2864 GumboAttribute* attr = token_attrs->data[i];
2865 if (attr != prompt_attr && attr != action_attr && attr != name_attr) {
2866 gumbo_vector_add(parser, attr, &input->v.element.attributes);
2867 }
2868 token_attrs->data[i] = NULL;
2869 }
2870
2871 // All attributes have been successfully transferred and nulled out at this
2872 // point, so the call to ignore_token will free the memory for it without
2873 // touching the attributes.
2874 ignore_token(parser);
2875
2876 // The name attribute, if present, should be destroyed since it's ignored
2877 // when copying over. The action attribute should be kept since it's moved
2878 // to the form.
2879 if (name_attr) {
2880 gumbo_destroy_attribute(parser, name_attr);
2881 }
2882
2883 GumboAttribute* name =
2884 gumbo_parser_allocate(parser, sizeof(GumboAttribute));
2885 GumboStringPiece name_str = GUMBO_STRING("name");
2886 GumboStringPiece isindex_str = GUMBO_STRING("isindex");
2887 name->attr_namespace = GUMBO_ATTR_NAMESPACE_NONE;
2888 name->name = gumbo_copy_stringz(parser, "name");
2889 name->value = gumbo_copy_stringz(parser, "isindex");
2890 name->original_name = name_str;
2891 name->original_value = isindex_str;
2892 name->name_start = kGumboEmptySourcePosition;
2893 name->name_end = kGumboEmptySourcePosition;
2894 name->value_start = kGumboEmptySourcePosition;
2895 name->value_end = kGumboEmptySourcePosition;
2896 gumbo_vector_add(parser, name, &input->v.element.attributes);
2897
2898 pop_current_node(parser); // <input>
2899 pop_current_node(parser); // <label>
2900 insert_element_of_tag_type(
2901 parser, GUMBO_TAG_HR, GUMBO_INSERTION_FROM_ISINDEX);
2902 pop_current_node(parser); // <hr>
2903 pop_current_node(parser); // <form>
2904 if (!has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
2905 parser->_parser_state->_form_element = NULL;
2906 }
2907 return false;
2908 } else if (tag_is(token, kStartTag, GUMBO_TAG_TEXTAREA)) {
2909 run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RCDATA);
2910 parser->_parser_state->_ignore_next_linefeed = true;
2911 set_frameset_not_ok(parser);
2912 return true;
2913 } else if (tag_is(token, kStartTag, GUMBO_TAG_XMP)) {
2914 bool result = maybe_implicitly_close_p_tag(parser, token);
2915 reconstruct_active_formatting_elements(parser);
2916 set_frameset_not_ok(parser);
2917 run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
2918 return result;
2919 } else if (tag_is(token, kStartTag, GUMBO_TAG_IFRAME)) {
2920 set_frameset_not_ok(parser);
2921 run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
2922 return true;
2923 } else if (tag_is(token, kStartTag, GUMBO_TAG_NOEMBED)) {
2924 run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
2925 return true;
2926 } else if (tag_is(token, kStartTag, GUMBO_TAG_SELECT)) {
2927 reconstruct_active_formatting_elements(parser);
2928 insert_element_from_token(parser, token);
2929 set_frameset_not_ok(parser);
2930 GumboInsertionMode state = parser->_parser_state->_insertion_mode;
2931 if (state == GUMBO_INSERTION_MODE_IN_TABLE ||
2932 state == GUMBO_INSERTION_MODE_IN_CAPTION ||
2933 state == GUMBO_INSERTION_MODE_IN_TABLE_BODY ||
2934 state == GUMBO_INSERTION_MODE_IN_ROW ||
2935 state == GUMBO_INSERTION_MODE_IN_CELL) {
2936 set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_SELECT_IN_TABLE);
2937 } else {
2938 set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_SELECT);
2939 }
2940 return true;
2941 } else if (tag_in(token, kStartTag,
2942 (gumbo_tagset){TAG(OPTION), TAG(OPTGROUP)})) {
2943 if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) {
2944 pop_current_node(parser);
2945 }
2946 reconstruct_active_formatting_elements(parser);
2947 insert_element_from_token(parser, token);
2948 return true;
2949 } else if (tag_in(token, kStartTag,
2950 (gumbo_tagset){TAG(RB), TAG(RP), TAG(RT), TAG(RTC)})) {
2951 bool success = true;
2952 GumboTag exception =
2953 tag_in(token, kStartTag, (gumbo_tagset){TAG(RT), TAG(RP)})
2954 ? GUMBO_TAG_RTC
2955 : GUMBO_TAG_LAST;
2956 if (has_an_element_in_scope(parser, GUMBO_TAG_RUBY)) {
2957 generate_implied_end_tags(parser, exception);
2958 }
2959 if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_RUBY) &&
2960 !(exception == GUMBO_TAG_LAST ||
2961 node_html_tag_is(get_current_node(parser), GUMBO_TAG_RTC))) {
2962 parser_add_parse_error(parser, token);
2963 success = false;
2964 }
2965 insert_element_from_token(parser, token);
2966 return success;
2967 } else if (tag_is(token, kEndTag, GUMBO_TAG_BR)) {
2968 parser_add_parse_error(parser, token);
2969 reconstruct_active_formatting_elements(parser);
2970 insert_element_of_tag_type(
2971 parser, GUMBO_TAG_BR, GUMBO_INSERTION_CONVERTED_FROM_END_TAG);
2972 pop_current_node(parser);
2973 return false;
2974 } else if (tag_is(token, kStartTag, GUMBO_TAG_MATH)) {
2975 reconstruct_active_formatting_elements(parser);
2976 adjust_mathml_attributes(parser, token);
2977 adjust_foreign_attributes(parser, token);
2978 insert_foreign_element(parser, token, GUMBO_NAMESPACE_MATHML);
2979 if (token->v.start_tag.is_self_closing) {
2980 pop_current_node(parser);
2981 acknowledge_self_closing_tag(parser);
2982 }
2983 return true;
2984 } else if (tag_is(token, kStartTag, GUMBO_TAG_SVG)) {
2985 reconstruct_active_formatting_elements(parser);
2986 adjust_svg_attributes(parser, token);
2987 adjust_foreign_attributes(parser, token);
2988 insert_foreign_element(parser, token, GUMBO_NAMESPACE_SVG);
2989 if (token->v.start_tag.is_self_closing) {
2990 pop_current_node(parser);
2991 acknowledge_self_closing_tag(parser);
2992 }
2993 return true;
2994 } else if (tag_in(token, kStartTag,
2995 (gumbo_tagset){TAG(CAPTION), TAG(COL), TAG(COLGROUP),
2996 TAG(FRAME), TAG(HEAD), TAG(TBODY), TAG(TD), TAG(TFOOT),
2997 TAG(TH), TAG(THEAD), TAG(TR)})) {
2998 parser_add_parse_error(parser, token);
2999 ignore_token(parser);
3000 return false;
3001 } else if (token->type == GUMBO_TOKEN_START_TAG) {
3002 reconstruct_active_formatting_elements(parser);
3003 insert_element_from_token(parser, token);
3004 return true;
3005 } else {
3006 assert(token->type == GUMBO_TOKEN_END_TAG);
3007 GumboTag end_tag = token->v.end_tag;
3008 assert(state->_open_elements.length > 0);
3009 assert(node_html_tag_is(state->_open_elements.data[0], GUMBO_TAG_HTML));
3010 // Walk up the stack of open elements until we find one that either:
3011 // a) Matches the tag name we saw
3012 // b) Is in the "special" category.
3013 // If we see a), implicitly close everything up to and including it. If we
3014 // see b), then record a parse error, don't close anything (except the
3015 // implied end tags) and ignore the end tag token.
3016 for (int i = state->_open_elements.length; --i >= 0;) {
3017 const GumboNode* node = state->_open_elements.data[i];
3018 if (node_html_tag_is(node, end_tag)) {
3019 generate_implied_end_tags(parser, end_tag);
3020 // TODO(jdtang): Do I need to add a parse error here? The condition in
3021 // the spec seems like it's the inverse of the loop condition above, and
3022 // so would never fire.
3023 while (node != pop_current_node(parser))
3024 ; // Pop everything.
3025 return true;
3026 } else if (is_special_node(node)) {
3027 parser_add_parse_error(parser, token);
3028 ignore_token(parser);
3029 return false;
3030 }
3031 }
3032 // <html> is in the special category, so we should never get here.
3033 assert(0);
3034 return false;
3035 }
3036 }
3037
3038 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-incdata
handle_text(GumboParser * parser,GumboToken * token)3039 static bool handle_text(GumboParser* parser, GumboToken* token) {
3040 if (token->type == GUMBO_TOKEN_CHARACTER ||
3041 token->type == GUMBO_TOKEN_WHITESPACE) {
3042 insert_text_token(parser, token);
3043 } else {
3044 // We provide only bare-bones script handling that doesn't involve any of
3045 // the parser-pause/already-started/script-nesting flags or re-entrant
3046 // invocations of the tokenizer. Because the intended usage of this library
3047 // is mostly for templating, refactoring, and static-analysis libraries, we
3048 // provide the script body as a text-node child of the <script> element.
3049 // This behavior doesn't support document.write of partial HTML elements,
3050 // but should be adequate for almost all other scripting support.
3051 if (token->type == GUMBO_TOKEN_EOF) {
3052 parser_add_parse_error(parser, token);
3053 parser->_parser_state->_reprocess_current_token = true;
3054 }
3055 pop_current_node(parser);
3056 set_insertion_mode(parser, parser->_parser_state->_original_insertion_mode);
3057 }
3058 return true;
3059 }
3060
3061 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-intable
handle_in_table(GumboParser * parser,GumboToken * token)3062 static bool handle_in_table(GumboParser* parser, GumboToken* token) {
3063 GumboParserState* state = parser->_parser_state;
3064 if (token->type == GUMBO_TOKEN_CHARACTER ||
3065 token->type == GUMBO_TOKEN_WHITESPACE) {
3066 // The "pending table character tokens" list described in the spec is
3067 // nothing more than the TextNodeBufferState. We accumulate text tokens as
3068 // normal, except that when we go to flush them in the handle_in_table_text,
3069 // we set _foster_parent_insertions if there're non-whitespace characters in
3070 // the buffer.
3071 assert(state->_text_node._buffer.length == 0);
3072 state->_original_insertion_mode = state->_insertion_mode;
3073 state->_reprocess_current_token = true;
3074 set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_TEXT);
3075 return true;
3076 } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
3077 parser_add_parse_error(parser, token);
3078 ignore_token(parser);
3079 return false;
3080 } else if (token->type == GUMBO_TOKEN_COMMENT) {
3081 append_comment_node(parser, get_current_node(parser), token);
3082 return true;
3083 } else if (tag_is(token, kStartTag, GUMBO_TAG_CAPTION)) {
3084 clear_stack_to_table_context(parser);
3085 add_formatting_element(parser, &kActiveFormattingScopeMarker);
3086 insert_element_from_token(parser, token);
3087 set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_CAPTION);
3088 return true;
3089 } else if (tag_is(token, kStartTag, GUMBO_TAG_COLGROUP)) {
3090 clear_stack_to_table_context(parser);
3091 insert_element_from_token(parser, token);
3092 set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_COLUMN_GROUP);
3093 return true;
3094 } else if (tag_is(token, kStartTag, GUMBO_TAG_COL)) {
3095 clear_stack_to_table_context(parser);
3096 insert_element_of_tag_type(
3097 parser, GUMBO_TAG_COLGROUP, GUMBO_INSERTION_IMPLIED);
3098 parser->_parser_state->_reprocess_current_token = true;
3099 set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_COLUMN_GROUP);
3100 return true;
3101 } else if (tag_in(token, kStartTag,
3102 (gumbo_tagset){TAG(TBODY), TAG(TFOOT), TAG(THEAD), TAG(TD),
3103 TAG(TH), TAG(TR)})) {
3104 clear_stack_to_table_context(parser);
3105 set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
3106 if (tag_in(token, kStartTag, (gumbo_tagset){TAG(TD), TAG(TH), TAG(TR)})) {
3107 insert_element_of_tag_type(
3108 parser, GUMBO_TAG_TBODY, GUMBO_INSERTION_IMPLIED);
3109 state->_reprocess_current_token = true;
3110 } else {
3111 insert_element_from_token(parser, token);
3112 }
3113 return true;
3114 } else if (tag_is(token, kStartTag, GUMBO_TAG_TABLE)) {
3115 parser_add_parse_error(parser, token);
3116 if (close_table(parser)) {
3117 parser->_parser_state->_reprocess_current_token = true;
3118 } else {
3119 ignore_token(parser);
3120 }
3121 return false;
3122 } else if (tag_is(token, kEndTag, GUMBO_TAG_TABLE)) {
3123 if (!close_table(parser)) {
3124 parser_add_parse_error(parser, token);
3125 return false;
3126 }
3127 return true;
3128 } else if (tag_in(token, kEndTag,
3129 (gumbo_tagset){TAG(BODY), TAG(CAPTION), TAG(COL),
3130 TAG(COLGROUP), TAG(HTML), TAG(TBODY), TAG(TD), TAG(TFOOT),
3131 TAG(TH), TAG(THEAD), TAG(TR)})) {
3132 parser_add_parse_error(parser, token);
3133 ignore_token(parser);
3134 return false;
3135 } else if (tag_in(token, kStartTag,
3136 (gumbo_tagset){TAG(STYLE), TAG(SCRIPT), TAG(TEMPLATE)}) ||
3137 (tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE))) {
3138 return handle_in_head(parser, token);
3139 } else if (tag_is(token, kStartTag, GUMBO_TAG_INPUT) &&
3140 attribute_matches(
3141 &token->v.start_tag.attributes, "type", "hidden")) {
3142 parser_add_parse_error(parser, token);
3143 insert_element_from_token(parser, token);
3144 pop_current_node(parser);
3145 return false;
3146 } else if (tag_is(token, kStartTag, GUMBO_TAG_FORM)) {
3147 parser_add_parse_error(parser, token);
3148 if (state->_form_element || has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
3149 ignore_token(parser);
3150 return false;
3151 }
3152 state->_form_element = insert_element_from_token(parser, token);
3153 pop_current_node(parser);
3154 return false;
3155 } else if (token->type == GUMBO_TOKEN_EOF) {
3156 return handle_in_body(parser, token);
3157 } else {
3158 parser_add_parse_error(parser, token);
3159 state->_foster_parent_insertions = true;
3160 bool result = handle_in_body(parser, token);
3161 state->_foster_parent_insertions = false;
3162 return result;
3163 }
3164 }
3165
3166 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-intabletext
handle_in_table_text(GumboParser * parser,GumboToken * token)3167 static bool handle_in_table_text(GumboParser* parser, GumboToken* token) {
3168 if (token->type == GUMBO_TOKEN_NULL) {
3169 parser_add_parse_error(parser, token);
3170 ignore_token(parser);
3171 return false;
3172 } else if (token->type == GUMBO_TOKEN_CHARACTER ||
3173 token->type == GUMBO_TOKEN_WHITESPACE) {
3174 insert_text_token(parser, token);
3175 return true;
3176 } else {
3177 GumboParserState* state = parser->_parser_state;
3178 GumboStringBuffer* buffer = &state->_text_node._buffer;
3179 // Can't use strspn for this because GumboStringBuffers are not
3180 // null-terminated.
3181 // Note that TextNodeBuffer may contain UTF-8 characters, but the presence
3182 // of any one byte that is not whitespace means we flip the flag, so this
3183 // loop is still valid.
3184 for (unsigned int i = 0; i < buffer->length; ++i) {
3185 if (!isspace((unsigned char) buffer->data[i]) ||
3186 buffer->data[i] == '\v') {
3187 state->_foster_parent_insertions = true;
3188 reconstruct_active_formatting_elements(parser);
3189 break;
3190 }
3191 }
3192 maybe_flush_text_node_buffer(parser);
3193 state->_foster_parent_insertions = false;
3194 state->_reprocess_current_token = true;
3195 state->_insertion_mode = state->_original_insertion_mode;
3196 return true;
3197 }
3198 }
3199
3200 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-incaption
handle_in_caption(GumboParser * parser,GumboToken * token)3201 static bool handle_in_caption(GumboParser* parser, GumboToken* token) {
3202 if (tag_is(token, kEndTag, GUMBO_TAG_CAPTION)) {
3203 if (!has_an_element_in_table_scope(parser, GUMBO_TAG_CAPTION)) {
3204 parser_add_parse_error(parser, token);
3205 ignore_token(parser);
3206 return false;
3207 } else {
3208 generate_implied_end_tags(parser, GUMBO_TAG_LAST);
3209 bool result = true;
3210 if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_CAPTION)) {
3211 parser_add_parse_error(parser, token);
3212 }
3213 while (!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_CAPTION))
3214 ;
3215 clear_active_formatting_elements(parser);
3216 set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
3217 return result;
3218 }
3219 } else if (tag_in(token, kStartTag,
3220 (gumbo_tagset){TAG(CAPTION), TAG(COL), TAG(COLGROUP),
3221 TAG(TBODY), TAG(TD), TAG(TFOOT), TAG(TH), TAG(THEAD),
3222 TAG(TR)}) ||
3223 (tag_is(token, kEndTag, GUMBO_TAG_TABLE))) {
3224 if (!has_an_element_in_table_scope(parser, GUMBO_TAG_CAPTION)) {
3225 parser_add_parse_error(parser, token);
3226 ignore_token(parser);
3227 return false;
3228 }
3229 while (!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_CAPTION))
3230 ;
3231 clear_active_formatting_elements(parser);
3232 set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
3233 parser->_parser_state->_reprocess_current_token = true;
3234 return true;
3235 } else if (tag_in(token, kEndTag,
3236 (gumbo_tagset){TAG(BODY), TAG(COL), TAG(COLGROUP), TAG(HTML),
3237 TAG(TBODY), TAG(TD), TAG(TFOOT), TAG(TH), TAG(THEAD),
3238 TAG(TR)})) {
3239 parser_add_parse_error(parser, token);
3240 ignore_token(parser);
3241 return false;
3242 } else {
3243 return handle_in_body(parser, token);
3244 }
3245 }
3246
3247 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-incolgroup
handle_in_column_group(GumboParser * parser,GumboToken * token)3248 static bool handle_in_column_group(GumboParser* parser, GumboToken* token) {
3249 if (token->type == GUMBO_TOKEN_WHITESPACE) {
3250 insert_text_token(parser, token);
3251 return true;
3252 } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
3253 parser_add_parse_error(parser, token);
3254 ignore_token(parser);
3255 return false;
3256 } else if (token->type == GUMBO_TOKEN_COMMENT) {
3257 append_comment_node(parser, get_current_node(parser), token);
3258 return true;
3259 } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
3260 return handle_in_body(parser, token);
3261 } else if (tag_is(token, kStartTag, GUMBO_TAG_COL)) {
3262 insert_element_from_token(parser, token);
3263 pop_current_node(parser);
3264 acknowledge_self_closing_tag(parser);
3265 return true;
3266 } else if (tag_is(token, kEndTag, GUMBO_TAG_COLGROUP)) {
3267 if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_COLGROUP)) {
3268 parser_add_parse_error(parser, token);
3269 ignore_token(parser);
3270 return false;
3271 }
3272 pop_current_node(parser);
3273 set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
3274 return false;
3275 } else if (tag_is(token, kEndTag, GUMBO_TAG_COL)) {
3276 parser_add_parse_error(parser, token);
3277 ignore_token(parser);
3278 return false;
3279 } else if (tag_is(token, kStartTag, GUMBO_TAG_TEMPLATE) ||
3280 tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) {
3281 return handle_in_head(parser, token);
3282 } else if (token->type == GUMBO_TOKEN_EOF) {
3283 return handle_in_body(parser, token);
3284 } else {
3285 if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_COLGROUP)) {
3286 parser_add_parse_error(parser, token);
3287 ignore_token(parser);
3288 return false;
3289 }
3290 pop_current_node(parser);
3291 set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
3292 parser->_parser_state->_reprocess_current_token = true;
3293 return true;
3294 }
3295 }
3296
3297 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-intbody
handle_in_table_body(GumboParser * parser,GumboToken * token)3298 static bool handle_in_table_body(GumboParser* parser, GumboToken* token) {
3299 if (tag_is(token, kStartTag, GUMBO_TAG_TR)) {
3300 clear_stack_to_table_body_context(parser);
3301 insert_element_from_token(parser, token);
3302 set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW);
3303 return true;
3304 } else if (tag_in(token, kStartTag, (gumbo_tagset){TAG(TD), TAG(TH)})) {
3305 parser_add_parse_error(parser, token);
3306 clear_stack_to_table_body_context(parser);
3307 insert_element_of_tag_type(parser, GUMBO_TAG_TR, GUMBO_INSERTION_IMPLIED);
3308 parser->_parser_state->_reprocess_current_token = true;
3309 set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW);
3310 return false;
3311 } else if (tag_in(token, kEndTag,
3312 (gumbo_tagset){TAG(TBODY), TAG(TFOOT), TAG(THEAD)})) {
3313 if (!has_an_element_in_table_scope(parser, token->v.end_tag)) {
3314 parser_add_parse_error(parser, token);
3315 ignore_token(parser);
3316 return false;
3317 }
3318 clear_stack_to_table_body_context(parser);
3319 pop_current_node(parser);
3320 set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
3321 return true;
3322 } else if (tag_in(token, kStartTag,
3323 (gumbo_tagset){TAG(CAPTION), TAG(COL), TAG(COLGROUP),
3324 TAG(TBODY), TAG(TFOOT), TAG(THEAD)}) ||
3325 tag_is(token, kEndTag, GUMBO_TAG_TABLE)) {
3326 if (!(has_an_element_in_table_scope(parser, GUMBO_TAG_TBODY) ||
3327 has_an_element_in_table_scope(parser, GUMBO_TAG_THEAD) ||
3328 has_an_element_in_table_scope(parser, GUMBO_TAG_TFOOT))) {
3329 parser_add_parse_error(parser, token);
3330 ignore_token(parser);
3331 return false;
3332 }
3333 clear_stack_to_table_body_context(parser);
3334 pop_current_node(parser);
3335 set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
3336 parser->_parser_state->_reprocess_current_token = true;
3337 return true;
3338 } else if (tag_in(token, kEndTag,
3339 (gumbo_tagset){TAG(BODY), TAG(CAPTION), TAG(COL), TAG(TR),
3340 TAG(COLGROUP), TAG(HTML), TAG(TD), TAG(TH)})) {
3341 parser_add_parse_error(parser, token);
3342 ignore_token(parser);
3343 return false;
3344 } else {
3345 return handle_in_table(parser, token);
3346 }
3347 }
3348
3349 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-intr
handle_in_row(GumboParser * parser,GumboToken * token)3350 static bool handle_in_row(GumboParser* parser, GumboToken* token) {
3351 if (tag_in(token, kStartTag, (gumbo_tagset){TAG(TH), TAG(TD)})) {
3352 clear_stack_to_table_row_context(parser);
3353 insert_element_from_token(parser, token);
3354 set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_CELL);
3355 add_formatting_element(parser, &kActiveFormattingScopeMarker);
3356 return true;
3357 } else if (tag_is(token, kEndTag, GUMBO_TAG_TR)) {
3358 if (!has_an_element_in_table_scope(parser, GUMBO_TAG_TR)) {
3359 parser_add_parse_error(parser, token);
3360 ignore_token(parser);
3361 return false;
3362 } else {
3363 clear_stack_to_table_row_context(parser);
3364 pop_current_node(parser);
3365 set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
3366 return true;
3367 }
3368 } else if (tag_in(token, kStartTag,
3369 (gumbo_tagset){TAG(CAPTION), TAG(COL), TAG(COLGROUP),
3370 TAG(TBODY), TAG(TFOOT), TAG(THEAD), TAG(TR)}) ||
3371 tag_is(token, kEndTag, GUMBO_TAG_TABLE)) {
3372 if (!has_an_element_in_table_scope(parser, GUMBO_TAG_TR)) {
3373 parser_add_parse_error(parser, token);
3374 ignore_token(parser);
3375 return false;
3376 } else {
3377 clear_stack_to_table_row_context(parser);
3378 pop_current_node(parser);
3379 set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
3380 parser->_parser_state->_reprocess_current_token = true;
3381 return true;
3382 }
3383 } else if (tag_in(token, kEndTag,
3384 (gumbo_tagset){TAG(TBODY), TAG(TFOOT), TAG(THEAD)})) {
3385 if (!has_an_element_in_table_scope(parser, token->v.end_tag) ||
3386 (!has_an_element_in_table_scope(parser, GUMBO_TAG_TR))) {
3387 parser_add_parse_error(parser, token);
3388 ignore_token(parser);
3389 return false;
3390 } else {
3391 clear_stack_to_table_row_context(parser);
3392 pop_current_node(parser);
3393 set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
3394 parser->_parser_state->_reprocess_current_token = true;
3395 return true;
3396 }
3397 } else if (tag_in(token, kEndTag,
3398 (gumbo_tagset){TAG(BODY), TAG(CAPTION), TAG(COL),
3399 TAG(COLGROUP), TAG(HTML), TAG(TD), TAG(TH)})) {
3400 parser_add_parse_error(parser, token);
3401 ignore_token(parser);
3402 return false;
3403 } else {
3404 return handle_in_table(parser, token);
3405 }
3406 }
3407
3408 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-intd
handle_in_cell(GumboParser * parser,GumboToken * token)3409 static bool handle_in_cell(GumboParser* parser, GumboToken* token) {
3410 if (tag_in(token, kEndTag, (gumbo_tagset){TAG(TD), TAG(TH)})) {
3411 GumboTag token_tag = token->v.end_tag;
3412 if (!has_an_element_in_table_scope(parser, token_tag)) {
3413 parser_add_parse_error(parser, token);
3414 ignore_token(parser);
3415 return false;
3416 }
3417 return close_table_cell(parser, token, token_tag);
3418 } else if (tag_in(token, kStartTag,
3419 (gumbo_tagset){TAG(CAPTION), TAG(COL), TAG(COLGROUP),
3420 TAG(TBODY), TAG(TD), TAG(TFOOT), TAG(TH), TAG(THEAD),
3421 TAG(TR)})) {
3422 gumbo_debug("Handling <td> in cell.\n");
3423 if (!has_an_element_in_table_scope(parser, GUMBO_TAG_TH) &&
3424 !has_an_element_in_table_scope(parser, GUMBO_TAG_TD)) {
3425 gumbo_debug("Bailing out because there's no <td> or <th> in scope.\n");
3426 parser_add_parse_error(parser, token);
3427 ignore_token(parser);
3428 return false;
3429 }
3430 parser->_parser_state->_reprocess_current_token = true;
3431 return close_current_cell(parser, token);
3432 } else if (tag_in(token, kEndTag, (gumbo_tagset){TAG(BODY), TAG(CAPTION),
3433 TAG(COL), TAG(COLGROUP), TAG(HTML)})) {
3434 parser_add_parse_error(parser, token);
3435 ignore_token(parser);
3436 return false;
3437 } else if (tag_in(token, kEndTag, (gumbo_tagset){TAG(TABLE), TAG(TBODY),
3438 TAG(TFOOT), TAG(THEAD), TAG(TR)})) {
3439 if (!has_an_element_in_table_scope(parser, token->v.end_tag)) {
3440 parser_add_parse_error(parser, token);
3441 ignore_token(parser);
3442 return false;
3443 }
3444 parser->_parser_state->_reprocess_current_token = true;
3445 return close_current_cell(parser, token);
3446 } else {
3447 return handle_in_body(parser, token);
3448 }
3449 }
3450
3451 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inselect
handle_in_select(GumboParser * parser,GumboToken * token)3452 static bool handle_in_select(GumboParser* parser, GumboToken* token) {
3453 if (token->type == GUMBO_TOKEN_NULL) {
3454 parser_add_parse_error(parser, token);
3455 ignore_token(parser);
3456 return false;
3457 } else if (token->type == GUMBO_TOKEN_CHARACTER ||
3458 token->type == GUMBO_TOKEN_WHITESPACE) {
3459 insert_text_token(parser, token);
3460 return true;
3461 } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
3462 parser_add_parse_error(parser, token);
3463 ignore_token(parser);
3464 return false;
3465 } else if (token->type == GUMBO_TOKEN_COMMENT) {
3466 append_comment_node(parser, get_current_node(parser), token);
3467 return true;
3468 } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
3469 return handle_in_body(parser, token);
3470 } else if (tag_is(token, kStartTag, GUMBO_TAG_OPTION)) {
3471 if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) {
3472 pop_current_node(parser);
3473 }
3474 insert_element_from_token(parser, token);
3475 return true;
3476 } else if (tag_is(token, kStartTag, GUMBO_TAG_OPTGROUP)) {
3477 if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) {
3478 pop_current_node(parser);
3479 }
3480 if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTGROUP)) {
3481 pop_current_node(parser);
3482 }
3483 insert_element_from_token(parser, token);
3484 return true;
3485 } else if (tag_is(token, kEndTag, GUMBO_TAG_OPTGROUP)) {
3486 GumboVector* open_elements = &parser->_parser_state->_open_elements;
3487 if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION) &&
3488 node_html_tag_is(open_elements->data[open_elements->length - 2],
3489 GUMBO_TAG_OPTGROUP)) {
3490 pop_current_node(parser);
3491 }
3492 if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTGROUP)) {
3493 pop_current_node(parser);
3494 return true;
3495 } else {
3496 parser_add_parse_error(parser, token);
3497 ignore_token(parser);
3498 return false;
3499 }
3500 } else if (tag_is(token, kEndTag, GUMBO_TAG_OPTION)) {
3501 if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) {
3502 pop_current_node(parser);
3503 return true;
3504 } else {
3505 parser_add_parse_error(parser, token);
3506 ignore_token(parser);
3507 return false;
3508 }
3509 } else if (tag_is(token, kEndTag, GUMBO_TAG_SELECT)) {
3510 if (!has_an_element_in_select_scope(parser, GUMBO_TAG_SELECT)) {
3511 parser_add_parse_error(parser, token);
3512 ignore_token(parser);
3513 return false;
3514 }
3515 close_current_select(parser);
3516 return true;
3517 } else if (tag_is(token, kStartTag, GUMBO_TAG_SELECT)) {
3518 parser_add_parse_error(parser, token);
3519 ignore_token(parser);
3520 if (has_an_element_in_select_scope(parser, GUMBO_TAG_SELECT)) {
3521 close_current_select(parser);
3522 }
3523 return false;
3524 } else if (tag_in(token, kStartTag,
3525 (gumbo_tagset){TAG(INPUT), TAG(KEYGEN), TAG(TEXTAREA)})) {
3526 parser_add_parse_error(parser, token);
3527 if (!has_an_element_in_select_scope(parser, GUMBO_TAG_SELECT)) {
3528 ignore_token(parser);
3529 } else {
3530 close_current_select(parser);
3531 parser->_parser_state->_reprocess_current_token = true;
3532 }
3533 return false;
3534 } else if (tag_in(token, kStartTag,
3535 (gumbo_tagset){TAG(SCRIPT), TAG(TEMPLATE)}) ||
3536 tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) {
3537 return handle_in_head(parser, token);
3538 } else if (token->type == GUMBO_TOKEN_EOF) {
3539 return handle_in_body(parser, token);
3540 } else {
3541 parser_add_parse_error(parser, token);
3542 ignore_token(parser);
3543 return false;
3544 }
3545 }
3546
3547 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inselectintable
handle_in_select_in_table(GumboParser * parser,GumboToken * token)3548 static bool handle_in_select_in_table(GumboParser* parser, GumboToken* token) {
3549 if (tag_in(token, kStartTag,
3550 (gumbo_tagset){TAG(CAPTION), TAG(TABLE), TAG(TBODY), TAG(TFOOT),
3551 TAG(THEAD), TAG(TR), TAG(TD), TAG(TH)})) {
3552 parser_add_parse_error(parser, token);
3553 close_current_select(parser);
3554 parser->_parser_state->_reprocess_current_token = true;
3555 return false;
3556 } else if (tag_in(token, kEndTag,
3557 (gumbo_tagset){TAG(CAPTION), TAG(TABLE), TAG(TBODY),
3558 TAG(TFOOT), TAG(THEAD), TAG(TR), TAG(TD), TAG(TH)})) {
3559 parser_add_parse_error(parser, token);
3560 if (!has_an_element_in_table_scope(parser, token->v.end_tag)) {
3561 ignore_token(parser);
3562 return false;
3563 } else {
3564 close_current_select(parser);
3565 // close_current_select already does the
3566 // reset_insertion_mode_appropriately
3567 // reset_insertion_mode_appropriately(parser);
3568 parser->_parser_state->_reprocess_current_token = true;
3569 return false;
3570 }
3571 } else {
3572 return handle_in_select(parser, token);
3573 }
3574 }
3575
3576 // http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#parsing-main-intemplate
handle_in_template(GumboParser * parser,GumboToken * token)3577 static bool handle_in_template(GumboParser* parser, GumboToken* token) {
3578 GumboParserState* state = parser->_parser_state;
3579 if (token->type == GUMBO_TOKEN_WHITESPACE ||
3580 token->type == GUMBO_TOKEN_CHARACTER ||
3581 token->type == GUMBO_TOKEN_COMMENT || token->type == GUMBO_TOKEN_NULL ||
3582 token->type == GUMBO_TOKEN_DOCTYPE) {
3583 return handle_in_body(parser, token);
3584 } else if (tag_in(token, kStartTag,
3585 (gumbo_tagset){TAG(BASE), TAG(BASEFONT), TAG(BGSOUND),
3586 TAG(LINK), TAG(META), TAG(NOFRAMES), TAG(SCRIPT),
3587 TAG(STYLE), TAG(TEMPLATE), TAG(TITLE)}) ||
3588 tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) {
3589 return handle_in_head(parser, token);
3590 } else if (tag_in(
3591 token, kStartTag, (gumbo_tagset){TAG(CAPTION), TAG(COLGROUP),
3592 TAG(TBODY), TAG(TFOOT), TAG(THEAD)})) {
3593 pop_template_insertion_mode(parser);
3594 push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
3595 set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
3596 state->_reprocess_current_token = true;
3597 return true;
3598 } else if (tag_is(token, kStartTag, GUMBO_TAG_COL)) {
3599 pop_template_insertion_mode(parser);
3600 push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_COLUMN_GROUP);
3601 set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_COLUMN_GROUP);
3602 state->_reprocess_current_token = true;
3603 return true;
3604 } else if (tag_is(token, kStartTag, GUMBO_TAG_TR)) {
3605 pop_template_insertion_mode(parser);
3606 push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
3607 set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
3608 state->_reprocess_current_token = true;
3609 return true;
3610 } else if (tag_in(token, kStartTag, (gumbo_tagset){TAG(TD), TAG(TH)})) {
3611 pop_template_insertion_mode(parser);
3612 push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW);
3613 set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW);
3614 state->_reprocess_current_token = true;
3615 return true;
3616 } else if (token->type == GUMBO_TOKEN_START_TAG) {
3617 pop_template_insertion_mode(parser);
3618 push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
3619 set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
3620 state->_reprocess_current_token = true;
3621 return true;
3622 } else if (token->type == GUMBO_TOKEN_END_TAG) {
3623 parser_add_parse_error(parser, token);
3624 ignore_token(parser);
3625 return false;
3626 } else if (token->type == GUMBO_TOKEN_EOF) {
3627 if (!has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
3628 // Stop parsing.
3629 return true;
3630 }
3631 parser_add_parse_error(parser, token);
3632 while (!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_TEMPLATE))
3633 ;
3634 clear_active_formatting_elements(parser);
3635 pop_template_insertion_mode(parser);
3636 reset_insertion_mode_appropriately(parser);
3637 state->_reprocess_current_token = true;
3638 return false;
3639 } else {
3640 assert(0);
3641 return false;
3642 }
3643 }
3644
3645 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-afterbody
handle_after_body(GumboParser * parser,GumboToken * token)3646 static bool handle_after_body(GumboParser* parser, GumboToken* token) {
3647 if (token->type == GUMBO_TOKEN_WHITESPACE ||
3648 tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
3649 return handle_in_body(parser, token);
3650 } else if (token->type == GUMBO_TOKEN_COMMENT) {
3651 GumboNode* html_node = parser->_output->root;
3652 assert(html_node != NULL);
3653 append_comment_node(parser, html_node, token);
3654 return true;
3655 } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
3656 parser_add_parse_error(parser, token);
3657 ignore_token(parser);
3658 return false;
3659 } else if (tag_is(token, kEndTag, GUMBO_TAG_HTML)) {
3660 /* fragment case: ignore the closing HTML token */
3661 if (is_fragment_parser(parser)) {
3662 parser_add_parse_error(parser, token);
3663 ignore_token(parser);
3664 return false;
3665 }
3666 set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_AFTER_BODY);
3667 GumboNode* html = parser->_parser_state->_open_elements.data[0];
3668 assert(node_html_tag_is(html, GUMBO_TAG_HTML));
3669 record_end_of_element(
3670 parser->_parser_state->_current_token, &html->v.element);
3671 return true;
3672 } else if (token->type == GUMBO_TOKEN_EOF) {
3673 return true;
3674 } else {
3675 parser_add_parse_error(parser, token);
3676 set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
3677 parser->_parser_state->_reprocess_current_token = true;
3678 return false;
3679 }
3680 }
3681
3682 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inframeset
handle_in_frameset(GumboParser * parser,GumboToken * token)3683 static bool handle_in_frameset(GumboParser* parser, GumboToken* token) {
3684 if (token->type == GUMBO_TOKEN_WHITESPACE) {
3685 insert_text_token(parser, token);
3686 return true;
3687 } else if (token->type == GUMBO_TOKEN_COMMENT) {
3688 append_comment_node(parser, get_current_node(parser), token);
3689 return true;
3690 } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
3691 parser_add_parse_error(parser, token);
3692 ignore_token(parser);
3693 return false;
3694 } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
3695 return handle_in_body(parser, token);
3696 } else if (tag_is(token, kStartTag, GUMBO_TAG_FRAMESET)) {
3697 insert_element_from_token(parser, token);
3698 return true;
3699 } else if (tag_is(token, kEndTag, GUMBO_TAG_FRAMESET)) {
3700 if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_HTML)) {
3701 parser_add_parse_error(parser, token);
3702 ignore_token(parser);
3703 return false;
3704 }
3705 pop_current_node(parser);
3706 if (!is_fragment_parser(parser) &&
3707 !node_html_tag_is(get_current_node(parser), GUMBO_TAG_FRAMESET)) {
3708 set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_FRAMESET);
3709 }
3710 return true;
3711 } else if (tag_is(token, kStartTag, GUMBO_TAG_FRAME)) {
3712 insert_element_from_token(parser, token);
3713 pop_current_node(parser);
3714 acknowledge_self_closing_tag(parser);
3715 return true;
3716 } else if (tag_is(token, kStartTag, GUMBO_TAG_NOFRAMES)) {
3717 return handle_in_head(parser, token);
3718 } else if (token->type == GUMBO_TOKEN_EOF) {
3719 if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_HTML)) {
3720 parser_add_parse_error(parser, token);
3721 return false;
3722 }
3723 return true;
3724 } else {
3725 parser_add_parse_error(parser, token);
3726 ignore_token(parser);
3727 return false;
3728 }
3729 }
3730
3731 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-afterframeset
handle_after_frameset(GumboParser * parser,GumboToken * token)3732 static bool handle_after_frameset(GumboParser* parser, GumboToken* token) {
3733 if (token->type == GUMBO_TOKEN_WHITESPACE) {
3734 insert_text_token(parser, token);
3735 return true;
3736 } else if (token->type == GUMBO_TOKEN_COMMENT) {
3737 append_comment_node(parser, get_current_node(parser), token);
3738 return true;
3739 } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
3740 parser_add_parse_error(parser, token);
3741 ignore_token(parser);
3742 return false;
3743 } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
3744 return handle_in_body(parser, token);
3745 } else if (tag_is(token, kEndTag, GUMBO_TAG_HTML)) {
3746 GumboNode* html = parser->_parser_state->_open_elements.data[0];
3747 assert(node_html_tag_is(html, GUMBO_TAG_HTML));
3748 record_end_of_element(
3749 parser->_parser_state->_current_token, &html->v.element);
3750 set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_AFTER_FRAMESET);
3751 return true;
3752 } else if (tag_is(token, kStartTag, GUMBO_TAG_NOFRAMES)) {
3753 return handle_in_head(parser, token);
3754 } else if (token->type == GUMBO_TOKEN_EOF) {
3755 return true;
3756 } else {
3757 parser_add_parse_error(parser, token);
3758 ignore_token(parser);
3759 return false;
3760 }
3761 }
3762
3763 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#the-after-after-body-insertion-mode
handle_after_after_body(GumboParser * parser,GumboToken * token)3764 static bool handle_after_after_body(GumboParser* parser, GumboToken* token) {
3765 if (token->type == GUMBO_TOKEN_COMMENT) {
3766 append_comment_node(parser, get_document_node(parser), token);
3767 return true;
3768 } else if (token->type == GUMBO_TOKEN_DOCTYPE ||
3769 token->type == GUMBO_TOKEN_WHITESPACE ||
3770 tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
3771 return handle_in_body(parser, token);
3772 } else if (token->type == GUMBO_TOKEN_EOF) {
3773 return true;
3774 } else {
3775 parser_add_parse_error(parser, token);
3776 set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
3777 parser->_parser_state->_reprocess_current_token = true;
3778 return false;
3779 }
3780 }
3781
3782 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#the-after-after-frameset-insertion-mode
handle_after_after_frameset(GumboParser * parser,GumboToken * token)3783 static bool handle_after_after_frameset(
3784 GumboParser* parser, GumboToken* token) {
3785 if (token->type == GUMBO_TOKEN_COMMENT) {
3786 append_comment_node(parser, get_document_node(parser), token);
3787 return true;
3788 } else if (token->type == GUMBO_TOKEN_DOCTYPE ||
3789 token->type == GUMBO_TOKEN_WHITESPACE ||
3790 tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
3791 return handle_in_body(parser, token);
3792 } else if (token->type == GUMBO_TOKEN_EOF) {
3793 return true;
3794 } else if (tag_is(token, kStartTag, GUMBO_TAG_NOFRAMES)) {
3795 return handle_in_head(parser, token);
3796 } else {
3797 parser_add_parse_error(parser, token);
3798 ignore_token(parser);
3799 return false;
3800 }
3801 }
3802
// Function pointers for each insertion mode. Keep in sync with
// insertion_mode.h.
//
// handle_html_content indexes this table directly with the parser's current
// insertion mode value, so the order of the entries below must match the
// order of the insertion mode enumerators exactly; adding, removing or
// reordering a mode requires the matching change here.

// Signature shared by every per-mode handler: receives the parser and the
// token to process and returns a bool (the main parse loop treats a false
// return as "an error occurred").
typedef bool (*TokenHandler)(GumboParser* parser, GumboToken* token);
static const TokenHandler kTokenHandlers[] = {handle_initial,
    handle_before_html, handle_before_head, handle_in_head,
    handle_in_head_noscript, handle_after_head, handle_in_body, handle_text,
    handle_in_table, handle_in_table_text, handle_in_caption,
    handle_in_column_group, handle_in_table_body, handle_in_row, handle_in_cell,
    handle_in_select, handle_in_select_in_table, handle_in_template,
    handle_after_body, handle_in_frameset, handle_after_frameset,
    handle_after_after_body, handle_after_after_frameset};
3814
// Dispatches the token to the handler registered in kTokenHandlers for the
// parser's current insertion mode and returns that handler's result.
static bool handle_html_content(GumboParser* parser, GumboToken* token) {
  const unsigned int mode_index =
      (unsigned int) parser->_parser_state->_insertion_mode;
  return kTokenHandlers[mode_index](parser, token);
}
3819
3820 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inforeign
handle_in_foreign_content(GumboParser * parser,GumboToken * token)3821 static bool handle_in_foreign_content(GumboParser* parser, GumboToken* token) {
3822 gumbo_debug("Handling foreign content");
3823 switch (token->type) {
3824 case GUMBO_TOKEN_NULL:
3825 parser_add_parse_error(parser, token);
3826 token->v.character = kUtf8ReplacementChar;
3827 insert_text_token(parser, token);
3828 return false;
3829 case GUMBO_TOKEN_WHITESPACE:
3830 insert_text_token(parser, token);
3831 return true;
3832 case GUMBO_TOKEN_CDATA:
3833 case GUMBO_TOKEN_CHARACTER:
3834 insert_text_token(parser, token);
3835 set_frameset_not_ok(parser);
3836 return true;
3837 case GUMBO_TOKEN_COMMENT:
3838 append_comment_node(parser, get_current_node(parser), token);
3839 return true;
3840 case GUMBO_TOKEN_DOCTYPE:
3841 parser_add_parse_error(parser, token);
3842 ignore_token(parser);
3843 return false;
3844 default:
3845 // Fall through to the if-statements below.
3846 break;
3847 }
3848 // Order matters for these clauses.
3849 if (tag_in(token, kStartTag,
3850 (gumbo_tagset){TAG(B), TAG(BIG), TAG(BLOCKQUOTE), TAG(BODY), TAG(BR),
3851 TAG(CENTER), TAG(CODE), TAG(DD), TAG(DIV), TAG(DL), TAG(DT),
3852 TAG(EM), TAG(EMBED), TAG(H1), TAG(H2), TAG(H3), TAG(H4), TAG(H5),
3853 TAG(H6), TAG(HEAD), TAG(HR), TAG(I), TAG(IMG), TAG(LI),
3854 TAG(LISTING), TAG(MENU), TAG(META), TAG(NOBR), TAG(OL), TAG(P),
3855 TAG(PRE), TAG(RUBY), TAG(S), TAG(SMALL), TAG(SPAN), TAG(STRONG),
3856 TAG(STRIKE), TAG(SUB), TAG(SUP), TAG(TABLE), TAG(TT), TAG(U),
3857 TAG(UL), TAG(VAR)}) ||
3858 (tag_is(token, kStartTag, GUMBO_TAG_FONT) &&
3859 (token_has_attribute(token, "color") ||
3860 token_has_attribute(token, "face") ||
3861 token_has_attribute(token, "size")))) {
3862 /* Parse error */
3863 parser_add_parse_error(parser, token);
3864
3865 /*
3866 * Fragment case: If the parser was originally created for the HTML
3867 * fragment parsing algorithm, then act as described in the "any other
3868 * start tag" entry below.
3869 */
3870 if (!is_fragment_parser(parser)) {
3871 do {
3872 pop_current_node(parser);
3873 } while (!(is_mathml_integration_point(get_current_node(parser)) ||
3874 is_html_integration_point(get_current_node(parser)) ||
3875 get_current_node(parser)->v.element.tag_namespace ==
3876 GUMBO_NAMESPACE_HTML));
3877 parser->_parser_state->_reprocess_current_token = true;
3878 return false;
3879 }
3880
3881 assert(token->type == GUMBO_TOKEN_START_TAG);
3882 }
3883
3884 if (token->type == GUMBO_TOKEN_START_TAG) {
3885 const GumboNamespaceEnum current_namespace =
3886 get_adjusted_current_node(parser)->v.element.tag_namespace;
3887 if (current_namespace == GUMBO_NAMESPACE_MATHML) {
3888 adjust_mathml_attributes(parser, token);
3889 }
3890 if (current_namespace == GUMBO_NAMESPACE_SVG) {
3891 // Tag adjustment is left to the gumbo_normalize_svg_tagname helper
3892 // function.
3893 adjust_svg_attributes(parser, token);
3894 }
3895 adjust_foreign_attributes(parser, token);
3896 insert_foreign_element(parser, token, current_namespace);
3897 if (token->v.start_tag.is_self_closing) {
3898 pop_current_node(parser);
3899 acknowledge_self_closing_tag(parser);
3900 }
3901 return true;
3902 // </script> tags are handled like any other end tag, putting the script's
3903 // text into a text node child and closing the current node.
3904 } else {
3905 assert(token->type == GUMBO_TOKEN_END_TAG);
3906 GumboNode* node = get_current_node(parser);
3907 assert(node != NULL);
3908 GumboStringPiece token_tagname = token->original_text;
3909 GumboStringPiece node_tagname = node->v.element.original_tag;
3910 gumbo_tag_from_original_text(&token_tagname);
3911 gumbo_tag_from_original_text(&node_tagname);
3912
3913 bool is_success = true;
3914 if (!gumbo_string_equals_ignore_case(&node_tagname, &token_tagname)) {
3915 parser_add_parse_error(parser, token);
3916 is_success = false;
3917 }
3918 int i = parser->_parser_state->_open_elements.length;
3919 for (--i; i > 0;) {
3920 // Here we move up the stack until we find an HTML element (in which
3921 // case we do nothing) or we find the element that we're about to
3922 // close (in which case we pop everything we've seen until that
3923 // point.)
3924 gumbo_debug("Foreign %.*s node at %d.\n", node_tagname.length,
3925 node_tagname.data, i);
3926 if (gumbo_string_equals_ignore_case(&node_tagname, &token_tagname)) {
3927 gumbo_debug("Matches.\n");
3928 while (pop_current_node(parser) != node) {
3929 // Pop all the nodes below the current one. Node is guaranteed to
3930 // be an element on the stack of open elements (set below), so
3931 // this loop is guaranteed to terminate.
3932 }
3933 return is_success;
3934 }
3935 --i;
3936 node = parser->_parser_state->_open_elements.data[i];
3937 if (node->v.element.tag_namespace == GUMBO_NAMESPACE_HTML) {
3938 // Must break before gumbo_tag_from_original_text to avoid passing
3939 // parser-inserted nodes through.
3940 break;
3941 }
3942 node_tagname = node->v.element.original_tag;
3943 gumbo_tag_from_original_text(&node_tagname);
3944 }
3945 assert(node->v.element.tag_namespace == GUMBO_NAMESPACE_HTML);
3946 // We can't call handle_token directly because the current node is still in
3947 // the SVG namespace, so it would re-enter this and result in infinite
3948 // recursion.
3949 return handle_html_content(parser, token) && is_success;
3950 }
3951 }
3952
3953 // http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#tree-construction
handle_token(GumboParser * parser,GumboToken * token)3954 static bool handle_token(GumboParser* parser, GumboToken* token) {
3955 if (parser->_parser_state->_ignore_next_linefeed &&
3956 token->type == GUMBO_TOKEN_WHITESPACE && token->v.character == '\n') {
3957 parser->_parser_state->_ignore_next_linefeed = false;
3958 ignore_token(parser);
3959 return true;
3960 }
3961 // This needs to be reset both here and in the conditional above to catch both
3962 // the case where the next token is not whitespace (so we don't ignore
3963 // whitespace in the middle of <pre> tags) and where there are multiple
3964 // whitespace tokens (so we don't ignore the second one).
3965 parser->_parser_state->_ignore_next_linefeed = false;
3966
3967 if (tag_is(token, kEndTag, GUMBO_TAG_BODY)) {
3968 parser->_parser_state->_closed_body_tag = true;
3969 }
3970 if (tag_is(token, kEndTag, GUMBO_TAG_HTML)) {
3971 parser->_parser_state->_closed_html_tag = true;
3972 }
3973
3974 const GumboNode* current_node = get_adjusted_current_node(parser);
3975 assert(!current_node || current_node->type == GUMBO_NODE_ELEMENT ||
3976 current_node->type == GUMBO_NODE_TEMPLATE);
3977 if (current_node) {
3978 gumbo_debug("Current node: <%s>.\n",
3979 gumbo_normalized_tagname(current_node->v.element.tag));
3980 }
3981 if (!current_node ||
3982 current_node->v.element.tag_namespace == GUMBO_NAMESPACE_HTML ||
3983 (is_mathml_integration_point(current_node) &&
3984 (token->type == GUMBO_TOKEN_CHARACTER ||
3985 token->type == GUMBO_TOKEN_WHITESPACE ||
3986 token->type == GUMBO_TOKEN_NULL ||
3987 (token->type == GUMBO_TOKEN_START_TAG &&
3988 !tag_in(token, kStartTag,
3989 (gumbo_tagset){TAG(MGLYPH), TAG(MALIGNMARK)})))) ||
3990 (current_node->v.element.tag_namespace == GUMBO_NAMESPACE_MATHML &&
3991 node_qualified_tag_is(
3992 current_node, GUMBO_NAMESPACE_MATHML, GUMBO_TAG_ANNOTATION_XML) &&
3993 tag_is(token, kStartTag, GUMBO_TAG_SVG)) ||
3994 (is_html_integration_point(current_node) &&
3995 (token->type == GUMBO_TOKEN_START_TAG ||
3996 token->type == GUMBO_TOKEN_CHARACTER ||
3997 token->type == GUMBO_TOKEN_NULL ||
3998 token->type == GUMBO_TOKEN_WHITESPACE)) ||
3999 token->type == GUMBO_TOKEN_EOF) {
4000 return handle_html_content(parser, token);
4001 } else {
4002 return handle_in_foreign_content(parser, token);
4003 }
4004 }
4005
// Prepares the parser for fragment parsing with the given context element
// tag and namespace: creates the context element, puts the tokenizer into
// the state implied by an HTML context element, inserts an implied <html>
// root and records it in the output, and finally computes the initial
// insertion mode. The numbered comments mirror the steps of the HTML
// fragment parsing algorithm that the original code refers to.
//
// fragment_ctx must not be GUMBO_TAG_LAST (asserted).
static void fragment_parser_init(GumboParser* parser, GumboTag fragment_ctx,
    GumboNamespaceEnum fragment_namespace) {
  assert(fragment_ctx != GUMBO_TAG_LAST);

  // 3
  parser->_parser_state->_fragment_ctx = create_element(parser, fragment_ctx);
  parser->_parser_state->_fragment_ctx->v.element.tag_namespace =
      fragment_namespace;

  // 4
  // Non-HTML namespaces always start in the DATA state, so only an HTML
  // context element can select a different tokenizer state.
  if (fragment_namespace == GUMBO_NAMESPACE_HTML) {
    switch (fragment_ctx) {
      case GUMBO_TAG_TITLE:
      case GUMBO_TAG_TEXTAREA:
        gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA);
        break;

      case GUMBO_TAG_STYLE:
      case GUMBO_TAG_XMP:
      case GUMBO_TAG_IFRAME:
      case GUMBO_TAG_NOEMBED:
      case GUMBO_TAG_NOFRAMES:
        gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT);
        break;

      case GUMBO_TAG_SCRIPT:
        gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
        break;

      case GUMBO_TAG_PLAINTEXT:
        gumbo_tokenizer_set_state(parser, GUMBO_LEX_PLAINTEXT);
        break;

      case GUMBO_TAG_NOSCRIPT:
        /* scripting is disabled in Gumbo, so leave the tokenizer
         * in the default data state */
      default:
        /* default data state */
        break;
    }
  }

  // 5. 6. 7.
  GumboNode* html_root = insert_element_of_tag_type(
      parser, GUMBO_TAG_HTML, GUMBO_INSERTION_IMPLIED);
  parser->_output->root = html_root;

  // 8.
  if (fragment_ctx == GUMBO_TAG_TEMPLATE) {
    push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TEMPLATE);
  }

  // 10.
  reset_insertion_mode_appropriately(parser);
}
4065
gumbo_parse(const char * buffer)4066 GumboOutput* gumbo_parse(const char* buffer) {
4067 return gumbo_parse_with_options(
4068 &kGumboDefaultOptions, buffer, strlen(buffer));
4069 }
4070
gumbo_parse_with_options(const GumboOptions * options,const char * buffer,size_t length)4071 GumboOutput* gumbo_parse_with_options(
4072 const GumboOptions* options, const char* buffer, size_t length) {
4073 GumboParser parser;
4074 parser._options = options;
4075 output_init(&parser);
4076 gumbo_tokenizer_state_init(&parser, buffer, length);
4077 parser_state_init(&parser);
4078
4079 if (options->fragment_context != GUMBO_TAG_LAST) {
4080 fragment_parser_init(
4081 &parser, options->fragment_context, options->fragment_namespace);
4082 }
4083
4084 GumboParserState* state = parser._parser_state;
4085 gumbo_debug("Parsing %.*s.\n", length, buffer);
4086
4087 // Sanity check so that infinite loops die with an assertion failure instead
4088 // of hanging the process before we ever get an error.
4089 int loop_count = 0;
4090
4091 GumboToken token;
4092 bool has_error = false;
4093
4094 do {
4095 if (state->_reprocess_current_token) {
4096 state->_reprocess_current_token = false;
4097 } else {
4098 GumboNode* current_node = get_current_node(&parser);
4099 gumbo_tokenizer_set_is_current_node_foreign(&parser,
4100 current_node &&
4101 current_node->v.element.tag_namespace != GUMBO_NAMESPACE_HTML);
4102 has_error = !gumbo_lex(&parser, &token) || has_error;
4103 }
4104 const char* token_type = "text";
4105 switch (token.type) {
4106 case GUMBO_TOKEN_DOCTYPE:
4107 token_type = "doctype";
4108 break;
4109 case GUMBO_TOKEN_START_TAG:
4110 token_type = gumbo_normalized_tagname(token.v.start_tag.tag);
4111 break;
4112 case GUMBO_TOKEN_END_TAG:
4113 token_type = gumbo_normalized_tagname(token.v.end_tag);
4114 break;
4115 case GUMBO_TOKEN_COMMENT:
4116 token_type = "comment";
4117 break;
4118 default:
4119 break;
4120 }
4121 gumbo_debug("Handling %s token @%d:%d in state %d.\n", (char*) token_type,
4122 token.position.line, token.position.column, state->_insertion_mode);
4123
4124 state->_current_token = &token;
4125 state->_self_closing_flag_acknowledged =
4126 !(token.type == GUMBO_TOKEN_START_TAG &&
4127 token.v.start_tag.is_self_closing);
4128
4129 has_error = !handle_token(&parser, &token) || has_error;
4130
4131 // Check for memory leaks when ownership is transferred from start tag
4132 // tokens to nodes.
4133 assert(state->_reprocess_current_token ||
4134 token.type != GUMBO_TOKEN_START_TAG ||
4135 token.v.start_tag.attributes.data == NULL);
4136
4137 if (!state->_self_closing_flag_acknowledged) {
4138 GumboError* error = parser_add_parse_error(&parser, &token);
4139 if (error) {
4140 error->type = GUMBO_ERR_UNACKNOWLEDGED_SELF_CLOSING_TAG;
4141 }
4142 }
4143
4144 ++loop_count;
4145 assert(loop_count < 1000000000);
4146
4147 } while ((token.type != GUMBO_TOKEN_EOF || state->_reprocess_current_token) &&
4148 !(options->stop_on_first_error && has_error));
4149
4150 finish_parsing(&parser);
4151 // For API uniformity reasons, if the doctype still has nulls, convert them to
4152 // empty strings.
4153 GumboDocument* doc_type = &parser._output->document->v.document;
4154 if (doc_type->name == NULL) {
4155 doc_type->name = gumbo_copy_stringz(&parser, "");
4156 }
4157 if (doc_type->public_identifier == NULL) {
4158 doc_type->public_identifier = gumbo_copy_stringz(&parser, "");
4159 }
4160 if (doc_type->system_identifier == NULL) {
4161 doc_type->system_identifier = gumbo_copy_stringz(&parser, "");
4162 }
4163
4164 parser_state_destroy(&parser);
4165 gumbo_tokenizer_state_destroy(&parser);
4166 return parser._output;
4167 }
4168
// Destroys `node` via destroy_node. A throwaway GumboParser is built purely
// to carry `options` (and with it the allocator) into destroy_node; no other
// parser field is set up.
void gumbo_destroy_node(GumboOptions* options, GumboNode* node) {
  GumboParser options_carrier = {._options = options};
  destroy_node(&options_carrier, node);
}
4176
// Releases everything owned by `output`: the document tree, every recorded
// error, the error vector itself, and finally the output object. A throwaway
// GumboParser is built purely to carry `options` (and with it the allocator)
// into the destruction helpers.
void gumbo_destroy_output(const GumboOptions* options, GumboOutput* output) {
  GumboParser options_carrier = {._options = options};

  destroy_node(&options_carrier, output->document);

  unsigned int error_index = 0;
  while (error_index < output->errors.length) {
    gumbo_error_destroy(&options_carrier, output->errors.data[error_index]);
    ++error_index;
  }
  gumbo_vector_destroy(&options_carrier, &output->errors);

  gumbo_parser_deallocate(&options_carrier, output);
}
4189