1 // Copyright 2011 Google Inc. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 //
15 // Author: jdtang@google.com (Jonathan Tang)
16
#include "tokenizer.h"

#include <stdio.h>
#include <string.h>

#include "gtest/gtest.h"
#include "test_utils.h"
23
24 extern const char* kGumboTagNames[];
25
26 namespace {
27
28 // Tests for tokenizer.c
29 class GumboTokenizerTest : public GumboTest {
30 protected:
GumboTokenizerTest()31 GumboTokenizerTest() { gumbo_tokenizer_state_init(&parser_, "", 0); }
32
~GumboTokenizerTest()33 virtual ~GumboTokenizerTest() {
34 gumbo_tokenizer_state_destroy(&parser_);
35 gumbo_token_destroy(&parser_, &token_);
36 }
37
SetInput(const char * input)38 void SetInput(const char* input) {
39 text_ = input;
40 gumbo_tokenizer_state_destroy(&parser_);
41 gumbo_tokenizer_state_init(&parser_, input, strlen(input));
42 }
43
Advance(int num_tokens)44 void Advance(int num_tokens) {
45 for (int i = 0; i < num_tokens; ++i) {
46 EXPECT_TRUE(gumbo_lex(&parser_, &token_));
47 gumbo_token_destroy(&parser_, &token_);
48 }
49 }
50
51 GumboToken token_;
52 };
53
// Guards against the tag enum and the tag-name table drifting apart:
// GUMBO_TAG_UNKNOWN must keep its expected numeric value and must index an
// empty name.
TEST(GumboTagEnumTest, TagEnumIncludesAllTags) {
  EXPECT_STREQ("", kGumboTagNames[GUMBO_TAG_UNKNOWN]);
  EXPECT_EQ(150, GUMBO_TAG_UNKNOWN);
}
58
// A start tag cut off by end-of-input lexes successfully as an EOF token.
TEST_F(GumboTokenizerTest, PartialTag) {
  SetInput("<a");

  const bool lexed = gumbo_lex(&parser_, &token_);
  EXPECT_TRUE(lexed);
  EXPECT_EQ(GUMBO_TOKEN_EOF, token_.type);
}
64
// Same as PartialTag, but the input ends after an attribute and a dangling
// '/': the first token is still EOF.
TEST_F(GumboTokenizerTest, PartialTagWithAttributes) {
  SetInput("<a href=foo /");

  const bool lexed = gumbo_lex(&parser_, &token_);
  EXPECT_TRUE(lexed);
  EXPECT_EQ(GUMBO_TOKEN_EOF, token_.type);
}
70
// A single plain character yields one character token carrying its source
// position and original text, followed by EOF.
TEST_F(GumboTokenizerTest, LexCharToken) {
  SetInput("a");

  // First token: the character itself.
  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  EXPECT_EQ('a', token_.v.character);

  // Position: line and column are 1-based here, the offset is 0-based.
  EXPECT_EQ(1, token_.position.line);
  EXPECT_EQ(1, token_.position.column);
  EXPECT_EQ(0, token_.position.offset);

  // Original text covers exactly that one byte.
  EXPECT_EQ(1, token_.original_text.length);
  EXPECT_EQ('a', *token_.original_text.data);

  // Second token: end of input, positioned just past the character.
  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_EOF, token_.type);
  EXPECT_EQ(1, token_.position.offset);
}
86
// A named character reference is lexed as a single character token whose
// value is the decoded code point and whose original text is the whole
// reference as written in the source.
TEST_F(GumboTokenizerTest, LexCharRef) {
  // The entity has to be spelled out in the input: the expectations below
  // require the original text to begin with '&', span six bytes ("&nbsp;"),
  // and decode to U+00A0.
  SetInput("&nbsp; Text");
  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  EXPECT_EQ(1, token_.position.column);
  EXPECT_EQ(1, token_.position.line);
  EXPECT_EQ(0, token_.position.offset);
  EXPECT_EQ('&', *token_.original_text.data);
  EXPECT_EQ(6, token_.original_text.length);
  EXPECT_EQ(0xA0, token_.v.character);

  // The space following the reference is an ordinary whitespace token.
  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_WHITESPACE, token_.type);
  EXPECT_EQ(' ', *token_.original_text.data);
  EXPECT_EQ(' ', token_.v.character);
}
103
// An ampersand that does not begin a valid character reference is emitted
// literally, and lexing resumes with the character right after it.
TEST_F(GumboTokenizerTest, LexCharRef_NotCharRef) {
  SetInput("&xyz");

  // The '&' itself, at offset 0.
  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  EXPECT_EQ('&', token_.v.character);
  EXPECT_EQ(0, token_.position.offset);

  // Then the 'x', at offset 1.
  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  EXPECT_EQ('x', token_.v.character);
  EXPECT_EQ(1, token_.position.offset);
}
116
// Checks line/column bookkeeping for a tag (and its attribute) that sits on
// the second line of the input behind leading whitespace.
TEST_F(GumboTokenizerTest, LeadingWhitespace) {
  // The second line must be indented by exactly two spaces: Advance(4) below
  // skips <div>, the newline, and two whitespace tokens, and every column
  // expectation (3 for the tag, 9 for the attribute name, ...) is computed
  // from that indentation.
  SetInput(
      "<div>\n"
      "  <span class=foo>");
  Advance(4);  // <div>, '\n', ' ', ' '
  EXPECT_TRUE(gumbo_lex(&parser_, &token_));  // <span>

  GumboTokenStartTag* start_tag = &token_.v.start_tag;
  EXPECT_EQ(GUMBO_TAG_SPAN, start_tag->tag);
  EXPECT_EQ(2, token_.position.line);
  EXPECT_EQ(3, token_.position.column);
  // ASSERT: the attribute vector is indexed immediately below.
  ASSERT_EQ(1, start_tag->attributes.length);

  GumboAttribute* clas =
      static_cast<GumboAttribute*>(start_tag->attributes.data[0]);
  EXPECT_STREQ("class", clas->name);
  EXPECT_EQ("class", ToString(clas->original_name));
  EXPECT_EQ(2, clas->name_start.line);
  EXPECT_EQ(9, clas->name_start.column);
  EXPECT_EQ(14, clas->name_end.column);
  EXPECT_STREQ("foo", clas->value);
  EXPECT_EQ("foo", ToString(clas->original_value));
  EXPECT_EQ(15, clas->value_start.column);
  EXPECT_EQ(18, clas->value_end.column);
}
142
// The minimal HTML5 doctype: a name, no identifiers, no quirks.
TEST_F(GumboTokenizerTest, Doctype) {
  SetInput("<!doctype html>");
  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  // ASSERT: the doc_type union member is only meaningful for this type.
  ASSERT_EQ(GUMBO_TOKEN_DOCTYPE, token_.type);
  EXPECT_EQ(0, token_.position.offset);

  const GumboTokenDocType& doctype = token_.v.doc_type;
  EXPECT_STREQ("html", doctype.name);
  EXPECT_FALSE(doctype.force_quirks);

  // Neither identifier is present; both are reported as empty strings.
  EXPECT_FALSE(doctype.has_public_identifier);
  EXPECT_STREQ("", doctype.public_identifier);
  EXPECT_FALSE(doctype.has_system_identifier);
  EXPECT_STREQ("", doctype.system_identifier);
}
157
// A doctype with both a PUBLIC identifier (double-quoted) and a system
// identifier (single-quoted); the quotes are stripped from both.
TEST_F(GumboTokenizerTest, DoctypePublic) {
  SetInput(
      "<!DOCTYPE html PUBLIC "
      "\"-//W3C//DTD XHTML 1.0 Transitional//EN\" "
      "'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd'>");
  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  ASSERT_EQ(GUMBO_TOKEN_DOCTYPE, token_.type);
  EXPECT_EQ(0, token_.position.offset);

  const GumboTokenDocType& doctype = token_.v.doc_type;
  EXPECT_STREQ("html", doctype.name);
  EXPECT_FALSE(doctype.force_quirks);

  EXPECT_TRUE(doctype.has_public_identifier);
  EXPECT_STREQ(
      "-//W3C//DTD XHTML 1.0 Transitional//EN", doctype.public_identifier);

  EXPECT_TRUE(doctype.has_system_identifier);
  EXPECT_STREQ("http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd",
      doctype.system_identifier);
}
177
// A doctype with only a SYSTEM identifier; the mixed-case "DOCtype" keyword
// is still recognized.
TEST_F(GumboTokenizerTest, DoctypeSystem) {
  SetInput("<!DOCtype root_element SYSTEM \"DTD_location\">");
  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  ASSERT_EQ(GUMBO_TOKEN_DOCTYPE, token_.type);
  EXPECT_EQ(0, token_.position.offset);

  const GumboTokenDocType& doctype = token_.v.doc_type;
  EXPECT_STREQ("root_element", doctype.name);
  EXPECT_FALSE(doctype.force_quirks);
  EXPECT_FALSE(doctype.has_public_identifier);
  EXPECT_TRUE(doctype.has_system_identifier);
  EXPECT_STREQ("DTD_location", doctype.system_identifier);
}
191
// Input ends inside the doctype, right after an empty public identifier.
// gumbo_lex() is expected to return false here, yet it must still hand back
// a doctype token flagged force-quirks.
TEST_F(GumboTokenizerTest, DoctypeUnterminated) {
  SetInput("<!DOCTYPE a PUBLIC''");
  EXPECT_FALSE(gumbo_lex(&parser_, &token_));
  ASSERT_EQ(GUMBO_TOKEN_DOCTYPE, token_.type);
  EXPECT_EQ(0, token_.position.offset);

  const GumboTokenDocType& doctype = token_.v.doc_type;
  EXPECT_STREQ("a", doctype.name);
  EXPECT_TRUE(doctype.force_quirks);
  EXPECT_TRUE(doctype.has_public_identifier);
  EXPECT_FALSE(doctype.has_system_identifier);
  EXPECT_STREQ("", doctype.system_identifier);
}
205
// In the RAWTEXT state, markup that is not the matching end tag ("<tag>")
// comes out as plain characters; only "</title>" ends the raw text.
TEST_F(GumboTokenizerTest, RawtextEnd) {
  SetInput("<title>x ignores <tag></title>");

  // Opening tag is lexed in the default state.
  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_START_TAG, token_.type);
  EXPECT_EQ(GUMBO_TAG_TITLE, token_.v.start_tag.tag);

  // Switch states the way the tree builder would after seeing the tag.
  gumbo_tokenizer_set_state(&parser_, GUMBO_LEX_RAWTEXT);
  gumbo_token_destroy(&parser_, &token_);

  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  EXPECT_EQ('x', token_.v.character);
  gumbo_token_destroy(&parser_, &token_);

  Advance(9);  // " ignores "

  // The '<' of "<tag>" is just a character ...
  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  EXPECT_EQ('<', token_.v.character);
  gumbo_token_destroy(&parser_, &token_);

  // ... and so is the 't' that follows it.
  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  EXPECT_EQ('t', token_.v.character);
  gumbo_token_destroy(&parser_, &token_);

  Advance(3);  // "ag>"

  // The matching end tag is recognized as a real end tag.
  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_END_TAG, token_.type);
  EXPECT_EQ(GUMBO_TAG_TITLE, token_.v.end_tag);
}
235
// In the RCDATA state the matching end tag terminates the element's text.
TEST_F(GumboTokenizerTest, RCDataEnd) {
  SetInput("<title>x</title>");

  // <title>
  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_START_TAG, token_.type);
  EXPECT_EQ(GUMBO_TAG_TITLE, token_.v.start_tag.tag);

  gumbo_tokenizer_set_state(&parser_, GUMBO_LEX_RCDATA);
  gumbo_token_destroy(&parser_, &token_);

  // x
  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  EXPECT_EQ('x', token_.v.character);
  gumbo_token_destroy(&parser_, &token_);

  // </title>
  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_END_TAG, token_.type);
  EXPECT_EQ(GUMBO_TAG_TITLE, token_.v.end_tag);
}
253
// Inside script data, a "</" that is not followed by the script tag name is
// emitted as plain characters; only "</script>" closes the script.
TEST_F(GumboTokenizerTest, ScriptEnd) {
  SetInput("<script>x = '\"></';</script>");

  // <script>
  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_START_TAG, token_.type);
  EXPECT_EQ(GUMBO_TAG_SCRIPT, token_.v.start_tag.tag);

  gumbo_tokenizer_set_state(&parser_, GUMBO_LEX_SCRIPT);
  gumbo_token_destroy(&parser_, &token_);

  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  EXPECT_EQ('x', token_.v.character);
  gumbo_token_destroy(&parser_, &token_);

  Advance(6);  // " = '\">"

  // The stray "</" and the quote after it, one character token each.
  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  EXPECT_EQ('<', token_.v.character);
  gumbo_token_destroy(&parser_, &token_);

  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  EXPECT_EQ('/', token_.v.character);
  gumbo_token_destroy(&parser_, &token_);

  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  EXPECT_EQ('\'', token_.v.character);
  gumbo_token_destroy(&parser_, &token_);

  Advance(1);  // ';'

  // </script>
  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_END_TAG, token_.type);
  EXPECT_EQ(GUMBO_TAG_SCRIPT, token_.v.end_tag);
}
288
// Starting from the script-escaped state, the end tag matching the last start
// tag (here <title>) is still recognized as an end tag.
TEST_F(GumboTokenizerTest, ScriptEscapedEnd) {
  SetInput("<title>x</title>");

  // <title>
  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_START_TAG, token_.type);
  EXPECT_EQ(GUMBO_TAG_TITLE, token_.v.start_tag.tag);

  gumbo_tokenizer_set_state(&parser_, GUMBO_LEX_SCRIPT_ESCAPED);
  gumbo_token_destroy(&parser_, &token_);

  // x
  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  EXPECT_EQ('x', token_.v.character);
  gumbo_token_destroy(&parser_, &token_);

  // </title>
  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_END_TAG, token_.type);
  EXPECT_EQ(GUMBO_TAG_TITLE, token_.v.end_tag);
}
306
// Script text wrapped in "<!-- ... -->": a bare '<' and an embedded "</div>"
// must both be emitted as ordinary character/whitespace tokens.
TEST_F(GumboTokenizerTest, ScriptCommentEscaped) {
  SetInput(
      "<script><!-- var foo = x < 7 + '</div>-- <A href=\"foo\"></a>';\n"
      "-->\n"
      "</script>");
  Advance(1);  // <script>
  gumbo_tokenizer_set_state(&parser_, GUMBO_LEX_SCRIPT);
  Advance(15);  // "<!-- var foo = "

  // "x < 7": character, whitespace, character, whitespace, character.
  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  EXPECT_EQ('x', token_.v.character);
  gumbo_token_destroy(&parser_, &token_);

  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_WHITESPACE, token_.type);
  EXPECT_EQ(' ', token_.v.character);
  gumbo_token_destroy(&parser_, &token_);

  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  EXPECT_EQ('<', token_.v.character);
  gumbo_token_destroy(&parser_, &token_);

  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_WHITESPACE, token_.type);
  EXPECT_EQ(' ', token_.v.character);
  gumbo_token_destroy(&parser_, &token_);

  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  EXPECT_EQ('7', token_.v.character);
  gumbo_token_destroy(&parser_, &token_);

  Advance(4);  // " + '"

  // "</d" of the embedded "</div>": three plain characters.
  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  EXPECT_EQ('<', token_.v.character);
  gumbo_token_destroy(&parser_, &token_);

  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  EXPECT_EQ('/', token_.v.character);
  gumbo_token_destroy(&parser_, &token_);

  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  EXPECT_EQ('d', token_.v.character);
  gumbo_token_destroy(&parser_, &token_);

  // Keep lexing further into the escaped text; every call must still succeed.
  Advance(25);
}
358
// A '<' used as a comparison operator inside a script (wrapped in a
// CDATA-style comment) is a plain character and does not hide the closing
// </script>.
TEST_F(GumboTokenizerTest, ScriptEscapedEmbeddedLessThan) {
  SetInput("<script>/*<![CDATA[*/ x<7 /*]]>*/</script>");

  // <script>
  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_START_TAG, token_.type);
  EXPECT_EQ(GUMBO_TAG_SCRIPT, token_.v.start_tag.tag);

  gumbo_tokenizer_set_state(&parser_, GUMBO_LEX_SCRIPT);
  gumbo_token_destroy(&parser_, &token_);
  Advance(14);  // "/*<![CDATA[*/ "

  // "x<7", one character token each.
  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  EXPECT_EQ('x', token_.v.character);
  gumbo_token_destroy(&parser_, &token_);

  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  EXPECT_EQ('<', token_.v.character);
  gumbo_token_destroy(&parser_, &token_);

  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  EXPECT_EQ('7', token_.v.character);
  gumbo_token_destroy(&parser_, &token_);

  Advance(8);  // " /*]]>*/"

  // </script>
  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_END_TAG, token_.type);
  EXPECT_EQ(GUMBO_TAG_SCRIPT, token_.v.end_tag);
}
388
// An end tag for a different element inside a script string literal is not
// an end tag at all: "</di..." arrives one character token at a time.
TEST_F(GumboTokenizerTest, ScriptHasTagEmbedded) {
  SetInput("<script>var foo = '</div>';</script>");
  Advance(1);  // <script>
  gumbo_tokenizer_set_state(&parser_, GUMBO_LEX_SCRIPT);
  Advance(11);  // "var foo = '"

  static const char kExpectedChars[] = {'<', '/', 'd', 'i'};
  const size_t kNumExpected = sizeof(kExpectedChars) / sizeof(kExpectedChars[0]);
  for (size_t i = 0; i < kNumExpected; ++i) {
    // Release the previous iteration's token before lexing the next one; the
    // last token is left for the fixture to clean up.
    if (i != 0) {
      gumbo_token_destroy(&parser_, &token_);
    }
    EXPECT_TRUE(gumbo_lex(&parser_, &token_));
    EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
    EXPECT_EQ(kExpectedChars[i], token_.v.character);
  }
}
414
// A nested "<sCrIpt>" inside an escaped ("<!--") script section: its opening
// characters and the later "-->" are all emitted as plain character tokens.
TEST_F(GumboTokenizerTest, ScriptDoubleEscaped) {
  SetInput(
      "<script><!--var foo = '<a href=\"foo\"></a>\n"
      "<sCrIpt>i--<f</script>'-->;</script>");
  Advance(1);  // <script>
  gumbo_tokenizer_set_state(&parser_, GUMBO_LEX_SCRIPT);
  Advance(34);  // everything up to and including the first newline

  // Start of the nested "<sCrIpt>".
  static const char kNestedOpen[] = {'<', 's', 'C'};
  const size_t kNestedOpenLen = sizeof(kNestedOpen) / sizeof(kNestedOpen[0]);
  for (size_t i = 0; i < kNestedOpenLen; ++i) {
    if (i != 0) {
      gumbo_token_destroy(&parser_, &token_);
    }
    EXPECT_TRUE(gumbo_lex(&parser_, &token_));
    EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
    EXPECT_EQ(kNestedOpen[i], token_.v.character);
  }

  gumbo_token_destroy(&parser_, &token_);
  Advance(20);  // "rIpt>i--<f</script>'"

  // The closing "-->".
  static const char kCommentClose[] = {'-', '-', '>'};
  const size_t kCommentCloseLen =
      sizeof(kCommentClose) / sizeof(kCommentClose[0]);
  for (size_t i = 0; i < kCommentCloseLen; ++i) {
    if (i != 0) {
      gumbo_token_destroy(&parser_, &token_);
    }
    EXPECT_TRUE(gumbo_lex(&parser_, &token_));
    EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
    EXPECT_EQ(kCommentClose[i], token_.v.character);
  }
}
453
// A CDATA section containing embedded NULs, lexed while the current node is
// marked as foreign: the NUL comes out as a NULL token and the following
// text as CDATA tokens.
TEST_F(GumboTokenizerTest, CData) {
  // SetInput() measures its argument with strlen(), which would stop at the
  // first NUL, so the tokenizer is re-initialized here with an explicit
  // length (the array size minus the implicit terminator).
  static const char kInput[] = "<![CDATA[\0filler\0text\0]]>";
  text_ = kInput;
  gumbo_tokenizer_state_destroy(&parser_);
  gumbo_tokenizer_state_init(&parser_, text_, sizeof(kInput) - 1);
  gumbo_tokenizer_set_is_current_node_foreign(&parser_, true);

  // The first NUL.
  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_NULL, token_.type);
  EXPECT_EQ(0, token_.v.character);
  gumbo_token_destroy(&parser_, &token_);

  // The 'f' of "filler".
  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_CDATA, token_.type);
  EXPECT_EQ('f', token_.v.character);
}
471
// A "<head>" mentioned inside a stylesheet comment is plain text: its first
// characters come back as individual character tokens.
TEST_F(GumboTokenizerTest, StyleHasTagEmbedded) {
  SetInput("<style>/* For <head> */</style>");
  Advance(1);  // <style>
  gumbo_tokenizer_set_state(&parser_, GUMBO_LEX_RCDATA);
  Advance(7);  // "/* For "

  static const char kExpectedChars[] = {'<', 'h', 'e'};
  const size_t kNumExpected = sizeof(kExpectedChars) / sizeof(kExpectedChars[0]);
  for (size_t i = 0; i < kNumExpected; ++i) {
    // Release the previous iteration's token before lexing the next one.
    if (i != 0) {
      gumbo_token_destroy(&parser_, &token_);
    }
    EXPECT_TRUE(gumbo_lex(&parser_, &token_));
    EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
    EXPECT_EQ(kExpectedChars[i], token_.v.character);
  }
}
492
// The <pre> start tag's original text and offset must cover just the tag
// itself, even though it is immediately followed by "\r\n".
TEST_F(GumboTokenizerTest, PreWithNewlines) {
  SetInput("<!DOCTYPE html><pre>\r\na</pre>");

  // <!DOCTYPE html>
  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  ASSERT_EQ(GUMBO_TOKEN_DOCTYPE, token_.type);
  EXPECT_EQ(0, token_.position.offset);
  gumbo_token_destroy(&parser_, &token_);

  // <pre>, which begins right after the 15-byte doctype.
  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  ASSERT_EQ(GUMBO_TOKEN_START_TAG, token_.type);
  EXPECT_EQ(15, token_.position.offset);
  EXPECT_EQ("<pre>", ToString(token_.original_text));
}
505
// "<br />" is a self-closing start tag without attributes; its original text
// is the whole tag, including the space and slash.
TEST_F(GumboTokenizerTest, SelfClosingStartTag) {
  SetInput("<br />");
  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  ASSERT_EQ(GUMBO_TOKEN_START_TAG, token_.type);
  EXPECT_EQ("<br />", ToString(token_.original_text));
  EXPECT_EQ(0, token_.position.offset);

  const GumboTokenStartTag& tag = token_.v.start_tag;
  EXPECT_EQ(GUMBO_TAG_BR, tag.tag);
  EXPECT_TRUE(tag.is_self_closing);
  EXPECT_EQ(0, tag.attributes.length);
}
518
// A start tag with one quoted and one unquoted attribute. For each attribute
// the parsed value drops the quotes, while the original value keeps the text
// exactly as it appears in the source.
TEST_F(GumboTokenizerTest, OpenTagWithAttributes) {
  SetInput("<a href ='/search?q=foo&hl=en' id=link>");
  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  ASSERT_EQ(GUMBO_TOKEN_START_TAG, token_.type);

  const GumboTokenStartTag& tag = token_.v.start_tag;
  EXPECT_EQ(GUMBO_TAG_A, tag.tag);
  EXPECT_FALSE(tag.is_self_closing);
  // ASSERT: both attribute slots are dereferenced below.
  ASSERT_EQ(2, tag.attributes.length);

  // href: single-quoted, with whitespace before the '='.
  GumboAttribute* href_attr =
      static_cast<GumboAttribute*>(tag.attributes.data[0]);
  EXPECT_STREQ("href", href_attr->name);
  EXPECT_EQ("href", ToString(href_attr->original_name));
  EXPECT_STREQ("/search?q=foo&hl=en", href_attr->value);
  EXPECT_EQ("'/search?q=foo&hl=en'", ToString(href_attr->original_value));

  // id: unquoted.
  GumboAttribute* id_attr =
      static_cast<GumboAttribute*>(tag.attributes.data[1]);
  EXPECT_STREQ("id", id_attr->name);
  EXPECT_EQ("id", ToString(id_attr->original_name));
  EXPECT_STREQ("link", id_attr->value);
  EXPECT_EQ("link", ToString(id_attr->original_value));
}
543
// "<?...>" is turned into a comment token holding everything between the
// angle brackets; normal lexing resumes right after the '>'.
TEST_F(GumboTokenizerTest, BogusComment1) {
  SetInput("<?xml is bogus-comment>Text");

  // The bogus comment.
  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  ASSERT_EQ(GUMBO_TOKEN_COMMENT, token_.type);
  EXPECT_STREQ("?xml is bogus-comment", token_.v.text);
  gumbo_token_destroy(&parser_, &token_);

  // First character of the trailing "Text".
  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  EXPECT_EQ('T', token_.v.character);

  // This input is malformed on purpose.
  errors_are_expected_ = true;
}
557
// "</" followed by a non-letter starts a comment that, lacking a closing '>',
// runs to the end of the input.
TEST_F(GumboTokenizerTest, BogusComment2) {
  SetInput("</#bogus-comment");

  // The bogus comment.
  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  ASSERT_EQ(GUMBO_TOKEN_COMMENT, token_.type);
  EXPECT_STREQ("#bogus-comment", token_.v.text);
  gumbo_token_destroy(&parser_, &token_);

  // Nothing is left after it.
  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_EOF, token_.type);

  // This input is malformed on purpose.
  errors_are_expected_ = true;
}
569
// A quoted attribute value may span several lines; the newlines and the
// whitespace after them are kept verbatim in the parsed value.
TEST_F(GumboTokenizerTest, MultilineAttribute) {
  SetInput(
      "<foo long_attr=\"SomeCode;\n"
      " calls_a_big_long_function();\n"
      " return true;\" />");
  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  ASSERT_EQ(GUMBO_TOKEN_START_TAG, token_.type);

  // <foo> is not a known tag name, so it maps to GUMBO_TAG_UNKNOWN.
  const GumboTokenStartTag& tag = token_.v.start_tag;
  EXPECT_EQ(GUMBO_TAG_UNKNOWN, tag.tag);
  EXPECT_TRUE(tag.is_self_closing);
  ASSERT_EQ(1, tag.attributes.length);

  GumboAttribute* attr = static_cast<GumboAttribute*>(tag.attributes.data[0]);
  EXPECT_STREQ("long_attr", attr->name);
  EXPECT_EQ("long_attr", ToString(attr->original_name));
  EXPECT_STREQ(
      "SomeCode;\n"
      " calls_a_big_long_function();\n"
      " return true;",
      attr->value);
}
593
// "&&" inside an attribute value is not a character reference and must
// survive unchanged in both the parsed and the original value.
TEST_F(GumboTokenizerTest, DoubleAmpersand) {
  SetInput("<span jsif=\"foo && bar\">");
  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  ASSERT_EQ(GUMBO_TOKEN_START_TAG, token_.type);

  const GumboTokenStartTag& tag = token_.v.start_tag;
  EXPECT_EQ(GUMBO_TAG_SPAN, tag.tag);
  EXPECT_FALSE(tag.is_self_closing);
  ASSERT_EQ(1, tag.attributes.length);

  GumboAttribute* attr = static_cast<GumboAttribute*>(tag.attributes.data[0]);
  EXPECT_STREQ("jsif", attr->name);
  EXPECT_EQ("jsif", ToString(attr->original_name));
  // Parsed value has the quotes removed; the original value keeps them.
  EXPECT_STREQ("foo && bar", attr->value);
  EXPECT_EQ("\"foo && bar\"", ToString(attr->original_value));
}
611
// End-to-end walk over "<div ...>a</div>": start tag with two attributes
// (one unquoted value containing '<'), a character, and the end tag, checking
// source positions along the way.
TEST_F(GumboTokenizerTest, MatchedTagPair) {
  SetInput("<div id=dash<-Dash data-test=\"bar\">a</div>");

  // --- <div id=dash<-Dash data-test="bar"> ---
  ASSERT_TRUE(gumbo_lex(&parser_, &token_));
  ASSERT_EQ(GUMBO_TOKEN_START_TAG, token_.type);
  EXPECT_EQ(0, token_.position.offset);

  const GumboTokenStartTag& tag = token_.v.start_tag;
  EXPECT_EQ(GUMBO_TAG_DIV, tag.tag);
  EXPECT_FALSE(tag.is_self_closing);
  ASSERT_EQ(2, tag.attributes.length);

  // First attribute: id=dash<-Dash (unquoted, so the original value has no
  // quotes either).
  GumboAttribute* id_attr =
      static_cast<GumboAttribute*>(tag.attributes.data[0]);
  EXPECT_STREQ("id", id_attr->name);
  EXPECT_EQ("id", ToString(id_attr->original_name));
  EXPECT_EQ(1, id_attr->name_start.line);
  EXPECT_EQ(5, id_attr->name_start.offset);
  EXPECT_EQ(6, id_attr->name_start.column);
  EXPECT_EQ(8, id_attr->name_end.column);
  EXPECT_STREQ("dash<-Dash", id_attr->value);
  EXPECT_EQ("dash<-Dash", ToString(id_attr->original_value));
  EXPECT_EQ(9, id_attr->value_start.column);
  EXPECT_EQ(19, id_attr->value_end.column);

  // Second attribute: data-test="bar" (quoted; the original value and its
  // column span include the quotes).
  GumboAttribute* test_attr =
      static_cast<GumboAttribute*>(tag.attributes.data[1]);
  EXPECT_STREQ("data-test", test_attr->name);
  EXPECT_EQ("data-test", ToString(test_attr->original_name));
  EXPECT_EQ(20, test_attr->name_start.column);
  EXPECT_EQ(29, test_attr->name_end.column);
  EXPECT_STREQ("bar", test_attr->value);
  EXPECT_EQ("\"bar\"", ToString(test_attr->original_value));
  EXPECT_EQ(30, test_attr->value_start.column);
  EXPECT_EQ(35, test_attr->value_end.column);
  gumbo_token_destroy(&parser_, &token_);

  // --- a ---
  ASSERT_TRUE(gumbo_lex(&parser_, &token_));
  ASSERT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  EXPECT_EQ(35, token_.position.offset);
  EXPECT_EQ('a', token_.v.character);
  gumbo_token_destroy(&parser_, &token_);

  // --- </div> ---
  ASSERT_TRUE(gumbo_lex(&parser_, &token_));
  ASSERT_EQ(GUMBO_TOKEN_END_TAG, token_.type);
  EXPECT_EQ(GUMBO_TAG_DIV, token_.v.end_tag);

  // Parse errors are tolerated for this input.
  errors_are_expected_ = true;
}
659
TEST_F(GumboTokenizerTest, BogusEndTag) {
  // Per the HTML tokenization rules, "</div</th>" is one single end tag: the
  // '<' after "div" is absorbed into the tag name, the '/' sends the
  // tokenizer through the self-closing start tag state, and "th" is then read
  // as an attribute (dropped, because end tags carry no attributes). The
  // resulting name is not a known tag, hence GUMBO_TAG_UNKNOWN, and the
  // token's original text spans the entire input.
  SetInput("</div</th>");
  ASSERT_TRUE(gumbo_lex(&parser_, &token_));
  ASSERT_EQ(GUMBO_TOKEN_END_TAG, token_.type);
  EXPECT_EQ(GUMBO_TAG_UNKNOWN, token_.v.end_tag);
  EXPECT_EQ("</div</th>", ToString(token_.original_text));
  EXPECT_EQ(0, token_.position.offset);

  // This input is malformed on purpose.
  errors_are_expected_ = true;
}
673 } // namespace
674