1 // Copyright 2011 Google Inc. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //     http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 //
15 // Author: jdtang@google.com (Jonathan Tang)
16 
17 #include "tokenizer.h"
18 
19 #include <stdio.h>
20 
21 #include "gtest/gtest.h"
22 #include "test_utils.h"
23 
24 extern const char* kGumboTagNames[];
25 
26 namespace {
27 
28 // Tests for tokenizer.c
29 class GumboTokenizerTest : public GumboTest {
30  protected:
GumboTokenizerTest()31   GumboTokenizerTest() { gumbo_tokenizer_state_init(&parser_, "", 0); }
32 
~GumboTokenizerTest()33   virtual ~GumboTokenizerTest() {
34     gumbo_tokenizer_state_destroy(&parser_);
35     gumbo_token_destroy(&parser_, &token_);
36   }
37 
SetInput(const char * input)38   void SetInput(const char* input) {
39     text_ = input;
40     gumbo_tokenizer_state_destroy(&parser_);
41     gumbo_tokenizer_state_init(&parser_, input, strlen(input));
42   }
43 
Advance(int num_tokens)44   void Advance(int num_tokens) {
45     for (int i = 0; i < num_tokens; ++i) {
46       EXPECT_TRUE(gumbo_lex(&parser_, &token_));
47       gumbo_token_destroy(&parser_, &token_);
48     }
49   }
50 
51   GumboToken token_;
52 };
53 
// Keeps the tag enum and the tag-name table in step: GUMBO_TAG_UNKNOWN is
// expected to have the value 150, and its slot in kGumboTagNames is expected
// to be the empty string.
TEST(GumboTagEnumTest, TagEnumIncludesAllTags) {
  EXPECT_STREQ("", kGumboTagNames[GUMBO_TAG_UNKNOWN]);
  EXPECT_EQ(150, GUMBO_TAG_UNKNOWN);
}
58 
// Input that stops in the middle of a tag name: the first (and only) token
// the test expects is EOF, and gumbo_lex() is still expected to return true.
TEST_F(GumboTokenizerTest, PartialTag) {
  const char* const truncated_tag = "<a";
  SetInput(truncated_tag);

  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_EOF, token_.type);
}
64 
// Same as PartialTag, but the input stops after an attribute and a trailing
// '/': the expected result is again a lone EOF token.
TEST_F(GumboTokenizerTest, PartialTagWithAttributes) {
  const char* const truncated_tag = "<a href=foo /";
  SetInput(truncated_tag);

  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_EOF, token_.type);
}
70 
// A one-character document lexes as one character token followed by EOF.
TEST_F(GumboTokenizerTest, LexCharToken) {
  SetInput("a");

  // Token 1: the letter itself, located at line 1, column 1, offset 0.
  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  EXPECT_EQ('a', token_.v.character);
  EXPECT_EQ('a', *token_.original_text.data);
  EXPECT_EQ(1, token_.original_text.length);
  EXPECT_EQ(1, token_.position.line);
  EXPECT_EQ(1, token_.position.column);
  EXPECT_EQ(0, token_.position.offset);

  // Token 2: EOF, reported at the offset just past the consumed character.
  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_EOF, token_.type);
  EXPECT_EQ(1, token_.position.offset);
}
86 
// A named character reference collapses into one character token whose
// original text still spans the whole "&nbsp;" sequence.
TEST_F(GumboTokenizerTest, LexCharRef) {
  SetInput("&nbsp; Text");

  // Token 1: the decoded reference (U+00A0) covering all six source bytes.
  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  EXPECT_EQ(0xA0, token_.v.character);
  EXPECT_EQ('&', *token_.original_text.data);
  EXPECT_EQ(6, token_.original_text.length);
  EXPECT_EQ(1, token_.position.line);
  EXPECT_EQ(1, token_.position.column);
  EXPECT_EQ(0, token_.position.offset);

  // Token 2: the space that follows the reference.
  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_WHITESPACE, token_.type);
  EXPECT_EQ(' ', token_.v.character);
  EXPECT_EQ(' ', *token_.original_text.data);
}
103 
// An ampersand that does not start a valid character reference is emitted
// literally, and the characters after it are lexed one by one.
TEST_F(GumboTokenizerTest, LexCharRef_NotCharRef) {
  SetInput("&xyz");

  // Token 1: the bare '&' at offset 0.
  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  EXPECT_EQ('&', token_.v.character);
  EXPECT_EQ(0, token_.position.offset);

  // Token 2: the 'x' that follows, at offset 1.
  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  EXPECT_EQ('x', token_.v.character);
  EXPECT_EQ(1, token_.position.offset);
}
116 
// Checks the source positions reported for a tag, and for its attribute name
// and value, when the tag sits on the second line behind leading whitespace.
TEST_F(GumboTokenizerTest, LeadingWhitespace) {
  SetInput(
      "<div>\n"
      "  <span class=foo>");
  // Skip the four tokens ahead of the <span> tag (presumably the <div> start
  // tag, the newline and the two indentation spaces).
  Advance(4);
  EXPECT_TRUE(gumbo_lex(&parser_, &token_));  // <span>
  // The start_tag union member and its attribute vector are only meaningful
  // for a start tag token, so stop here if the tokenizer produced anything
  // else instead of reading (and dereferencing) an inactive union member.
  ASSERT_EQ(GUMBO_TOKEN_START_TAG, token_.type);

  GumboTokenStartTag* start_tag = &token_.v.start_tag;
  EXPECT_EQ(GUMBO_TAG_SPAN, start_tag->tag);
  EXPECT_EQ(2, token_.position.line);
  EXPECT_EQ(3, token_.position.column);
  // Fatal on mismatch: attributes.data[0] is dereferenced right below.
  ASSERT_EQ(1, start_tag->attributes.length);

  GumboAttribute* clas =
      static_cast<GumboAttribute*>(start_tag->attributes.data[0]);
  // Attribute name: text plus its start/end positions on line 2.
  EXPECT_STREQ("class", clas->name);
  EXPECT_EQ("class", ToString(clas->original_name));
  EXPECT_EQ(2, clas->name_start.line);
  EXPECT_EQ(9, clas->name_start.column);
  EXPECT_EQ(14, clas->name_end.column);
  // Attribute value: text plus its start/end columns.
  EXPECT_STREQ("foo", clas->value);
  EXPECT_EQ("foo", ToString(clas->original_value));
  EXPECT_EQ(15, clas->value_start.column);
  EXPECT_EQ(18, clas->value_end.column);
}
142 
// The minimal HTML5 doctype: a name, no public or system identifier, and no
// forced quirks mode.
TEST_F(GumboTokenizerTest, Doctype) {
  SetInput("<!doctype html>");
  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  // Fatal on mismatch: the doc_type union member is read below.
  ASSERT_EQ(GUMBO_TOKEN_DOCTYPE, token_.type);
  EXPECT_EQ(0, token_.position.offset);

  GumboTokenDocType* doctype = &token_.v.doc_type;
  EXPECT_STREQ("html", doctype->name);
  EXPECT_FALSE(doctype->force_quirks);
  // Absent identifiers are flagged as missing and exposed as empty strings.
  EXPECT_FALSE(doctype->has_public_identifier);
  EXPECT_STREQ("", doctype->public_identifier);
  EXPECT_FALSE(doctype->has_system_identifier);
  EXPECT_STREQ("", doctype->system_identifier);
}
157 
// A doctype with both a double-quoted public identifier and a single-quoted
// system identifier; both are expected back without their quotes.
TEST_F(GumboTokenizerTest, DoctypePublic) {
  SetInput(
      "<!DOCTYPE html PUBLIC "
      "\"-//W3C//DTD XHTML 1.0 Transitional//EN\" "
      "'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd'>");
  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  // Fatal on mismatch: the doc_type union member is read below.
  ASSERT_EQ(GUMBO_TOKEN_DOCTYPE, token_.type);
  EXPECT_EQ(0, token_.position.offset);

  GumboTokenDocType* doctype = &token_.v.doc_type;
  EXPECT_STREQ("html", doctype->name);
  EXPECT_FALSE(doctype->force_quirks);

  EXPECT_TRUE(doctype->has_public_identifier);
  EXPECT_STREQ(
      "-//W3C//DTD XHTML 1.0 Transitional//EN", doctype->public_identifier);

  EXPECT_TRUE(doctype->has_system_identifier);
  EXPECT_STREQ("http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd",
      doctype->system_identifier);
}
177 
// A doctype with only a SYSTEM identifier.  The keyword is written in mixed
// case ("DOCtype") and is still expected to be recognised.
TEST_F(GumboTokenizerTest, DoctypeSystem) {
  SetInput("<!DOCtype root_element SYSTEM \"DTD_location\">");
  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  // Fatal on mismatch: the doc_type union member is read below.
  ASSERT_EQ(GUMBO_TOKEN_DOCTYPE, token_.type);
  EXPECT_EQ(0, token_.position.offset);

  GumboTokenDocType* doctype = &token_.v.doc_type;
  EXPECT_STREQ("root_element", doctype->name);
  EXPECT_FALSE(doctype->force_quirks);
  EXPECT_FALSE(doctype->has_public_identifier);
  EXPECT_TRUE(doctype->has_system_identifier);
  EXPECT_STREQ("DTD_location", doctype->system_identifier);
}
191 
// A doctype that ends right after an empty public identifier.  Here
// gumbo_lex() is expected to return false, while still producing a doctype
// token with force_quirks set.
TEST_F(GumboTokenizerTest, DoctypeUnterminated) {
  SetInput("<!DOCTYPE a PUBLIC''");
  EXPECT_FALSE(gumbo_lex(&parser_, &token_));
  // Fatal on mismatch: the doc_type union member is read below.
  ASSERT_EQ(GUMBO_TOKEN_DOCTYPE, token_.type);
  EXPECT_EQ(0, token_.position.offset);

  GumboTokenDocType* doctype = &token_.v.doc_type;
  EXPECT_STREQ("a", doctype->name);
  EXPECT_TRUE(doctype->force_quirks);
  // The empty '' still counts as a public identifier; no system identifier
  // was seen.
  EXPECT_TRUE(doctype->has_public_identifier);
  EXPECT_FALSE(doctype->has_system_identifier);
  EXPECT_STREQ("", doctype->system_identifier);
}
205 
// With the tokenizer switched to GUMBO_LEX_RAWTEXT, text that looks like
// markup ("<tag>") is expected to come out character by character, and the
// closing "</title>" is expected to come out as an end tag.
TEST_F(GumboTokenizerTest, RawtextEnd) {
  SetInput("<title>x ignores <tag></title>");

  // The opening <title> tag.
  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_START_TAG, token_.type);
  EXPECT_EQ(GUMBO_TAG_TITLE, token_.v.start_tag.tag);

  // Switch states by hand, then release the start tag token.
  gumbo_tokenizer_set_state(&parser_, GUMBO_LEX_RAWTEXT);
  gumbo_token_destroy(&parser_, &token_);

  // First character of the raw text.
  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  EXPECT_EQ('x', token_.v.character);
  gumbo_token_destroy(&parser_, &token_);

  // Skip the nine characters of " ignores ".
  Advance(9);

  // "<tag>" is plain text here: '<' and then 't' arrive as characters.
  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  EXPECT_EQ('<', token_.v.character);
  gumbo_token_destroy(&parser_, &token_);

  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  EXPECT_EQ('t', token_.v.character);
  gumbo_token_destroy(&parser_, &token_);

  // Skip "ag>"; the next token must be the closing </title>.
  Advance(3);
  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_END_TAG, token_.type);
  EXPECT_EQ(GUMBO_TAG_TITLE, token_.v.end_tag);
}
235 
// With the tokenizer switched to GUMBO_LEX_RCDATA, the matching end tag is
// still expected to be recognised after the text content.
TEST_F(GumboTokenizerTest, RCDataEnd) {
  SetInput("<title>x</title>");

  // The opening <title> tag.
  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_START_TAG, token_.type);
  EXPECT_EQ(GUMBO_TAG_TITLE, token_.v.start_tag.tag);

  // Switch states by hand, then release the start tag token.
  gumbo_tokenizer_set_state(&parser_, GUMBO_LEX_RCDATA);
  gumbo_token_destroy(&parser_, &token_);

  // The single character of content.
  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  EXPECT_EQ('x', token_.v.character);
  gumbo_token_destroy(&parser_, &token_);

  // The closing </title>.
  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_END_TAG, token_.type);
  EXPECT_EQ(GUMBO_TAG_TITLE, token_.v.end_tag);
}
253 
// With the tokenizer switched to GUMBO_LEX_SCRIPT, a "</" that is not followed
// by the script tag name is expected to be emitted as plain characters; the
// real "</script>" is expected as an end tag.
TEST_F(GumboTokenizerTest, ScriptEnd) {
  SetInput("<script>x = '\"></';</script>");

  // The opening <script> tag.
  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_START_TAG, token_.type);
  EXPECT_EQ(GUMBO_TAG_SCRIPT, token_.v.start_tag.tag);

  // Switch states by hand, then release the start tag token.
  gumbo_tokenizer_set_state(&parser_, GUMBO_LEX_SCRIPT);
  gumbo_token_destroy(&parser_, &token_);

  // First character of the script body.
  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  EXPECT_EQ('x', token_.v.character);
  gumbo_token_destroy(&parser_, &token_);

  // Skip the six characters of ` = '">`.
  Advance(6);

  // The embedded "</'" arrives as three ordinary characters.
  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  EXPECT_EQ('<', token_.v.character);
  gumbo_token_destroy(&parser_, &token_);

  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  EXPECT_EQ('/', token_.v.character);
  gumbo_token_destroy(&parser_, &token_);

  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  EXPECT_EQ('\'', token_.v.character);
  gumbo_token_destroy(&parser_, &token_);

  // Skip the ';'; the next token must be the closing </script>.
  Advance(1);
  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_END_TAG, token_.type);
  EXPECT_EQ(GUMBO_TAG_SCRIPT, token_.v.end_tag);
}
288 
// With the tokenizer switched to GUMBO_LEX_SCRIPT_ESCAPED, the matching end
// tag is still expected to be recognised.  Note that the input uses <title>,
// not <script>.
TEST_F(GumboTokenizerTest, ScriptEscapedEnd) {
  SetInput("<title>x</title>");

  // The opening <title> tag.
  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_START_TAG, token_.type);
  EXPECT_EQ(GUMBO_TAG_TITLE, token_.v.start_tag.tag);

  // Switch states by hand, then release the start tag token.
  gumbo_tokenizer_set_state(&parser_, GUMBO_LEX_SCRIPT_ESCAPED);
  gumbo_token_destroy(&parser_, &token_);

  // The single character of content.
  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  EXPECT_EQ('x', token_.v.character);
  gumbo_token_destroy(&parser_, &token_);

  // The closing </title>.
  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_END_TAG, token_.type);
  EXPECT_EQ(GUMBO_TAG_TITLE, token_.v.end_tag);
}
306 
// Script content wrapped in "<!-- ... -->": comparison operators and an
// embedded "</div>" inside the comment are expected to be emitted as plain
// character and whitespace tokens.
TEST_F(GumboTokenizerTest, ScriptCommentEscaped) {
  SetInput(
      "<script><!-- var foo = x < 7 + '</div>-- <A href=\"foo\"></a>';\n"
      "-->\n"
      "</script>");
  // Consume the <script> start tag, switch to script lexing by hand, then
  // skip the 15 characters of "<!-- var foo = ".
  Advance(1);
  gumbo_tokenizer_set_state(&parser_, GUMBO_LEX_SCRIPT);
  Advance(15);

  // "x < 7" arrives as: 'x', space, '<', space, '7'.
  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  EXPECT_EQ('x', token_.v.character);
  gumbo_token_destroy(&parser_, &token_);

  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_WHITESPACE, token_.type);
  EXPECT_EQ(' ', token_.v.character);
  gumbo_token_destroy(&parser_, &token_);

  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  EXPECT_EQ('<', token_.v.character);
  gumbo_token_destroy(&parser_, &token_);

  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_WHITESPACE, token_.type);
  EXPECT_EQ(' ', token_.v.character);
  gumbo_token_destroy(&parser_, &token_);

  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  EXPECT_EQ('7', token_.v.character);
  gumbo_token_destroy(&parser_, &token_);

  // Skip the four characters of " + '".
  Advance(4);

  // The embedded "</d..." arrives as ordinary characters.
  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  EXPECT_EQ('<', token_.v.character);
  gumbo_token_destroy(&parser_, &token_);

  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  EXPECT_EQ('/', token_.v.character);
  gumbo_token_destroy(&parser_, &token_);

  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  EXPECT_EQ('d', token_.v.character);
  gumbo_token_destroy(&parser_, &token_);

  // Lex 25 more tokens; Advance() expects each gumbo_lex() call to succeed.
  Advance(25);
}
358 
// A script body wrapped in commented-out CDATA markers: the '<' in "x<7" is
// expected as a plain character, and "</script>" is expected as an end tag.
TEST_F(GumboTokenizerTest, ScriptEscapedEmbeddedLessThan) {
  SetInput("<script>/*<![CDATA[*/ x<7 /*]]>*/</script>");

  // The opening <script> tag.
  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_START_TAG, token_.type);
  EXPECT_EQ(GUMBO_TAG_SCRIPT, token_.v.start_tag.tag);

  // Switch states by hand, then release the start tag token.
  gumbo_tokenizer_set_state(&parser_, GUMBO_LEX_SCRIPT);
  gumbo_token_destroy(&parser_, &token_);

  // Skip the 14 characters of "/*<![CDATA[*/ ".
  Advance(14);

  // "x<7" arrives as three character tokens.
  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  EXPECT_EQ('x', token_.v.character);
  gumbo_token_destroy(&parser_, &token_);

  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  EXPECT_EQ('<', token_.v.character);
  gumbo_token_destroy(&parser_, &token_);

  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  EXPECT_EQ('7', token_.v.character);
  gumbo_token_destroy(&parser_, &token_);

  // Skip the eight characters of " /*]]>*/"; next comes the closing tag.
  Advance(8);
  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_END_TAG, token_.type);
  EXPECT_EQ(GUMBO_TAG_SCRIPT, token_.v.end_tag);
}
388 
// An end tag for a different element ("</div>") inside a script string
// literal is expected to be emitted as plain characters.
TEST_F(GumboTokenizerTest, ScriptHasTagEmbedded) {
  SetInput("<script>var foo = '</div>';</script>");
  // Consume the <script> start tag, switch to script lexing by hand, then
  // skip the 11 characters of "var foo = '".
  Advance(1);
  gumbo_tokenizer_set_state(&parser_, GUMBO_LEX_SCRIPT);
  Advance(11);

  // "</di" arrives one character at a time.
  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  EXPECT_EQ('<', token_.v.character);
  gumbo_token_destroy(&parser_, &token_);

  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  EXPECT_EQ('/', token_.v.character);
  gumbo_token_destroy(&parser_, &token_);

  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  EXPECT_EQ('d', token_.v.character);
  gumbo_token_destroy(&parser_, &token_);

  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  EXPECT_EQ('i', token_.v.character);
}
414 
// A "<script>...</script>" pair nested inside a "<!-- -->" section of a
// script: the inner open tag, the inner "</script>" and the closing "-->" are
// all expected to come out as plain characters.
TEST_F(GumboTokenizerTest, ScriptDoubleEscaped) {
  SetInput(
      "<script><!--var foo = '<a href=\"foo\"></a>\n"
      "<sCrIpt>i--<f</script>'-->;</script>");
  // Consume the outer <script> start tag, switch to script lexing by hand,
  // then skip the 34 characters up to and including the newline.
  Advance(1);
  gumbo_tokenizer_set_state(&parser_, GUMBO_LEX_SCRIPT);
  Advance(34);

  // The nested "<sC..." arrives as characters, original case preserved.
  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  EXPECT_EQ('<', token_.v.character);
  gumbo_token_destroy(&parser_, &token_);

  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  EXPECT_EQ('s', token_.v.character);
  gumbo_token_destroy(&parser_, &token_);

  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  EXPECT_EQ('C', token_.v.character);
  gumbo_token_destroy(&parser_, &token_);

  // Skip the 20 characters of "rIpt>i--<f</script>'".
  Advance(20);

  // The closing "-->" arrives as three characters.
  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  EXPECT_EQ('-', token_.v.character);
  gumbo_token_destroy(&parser_, &token_);

  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  EXPECT_EQ('-', token_.v.character);
  gumbo_token_destroy(&parser_, &token_);

  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  EXPECT_EQ('>', token_.v.character);
}
453 
// A CDATA section containing NUL bytes, lexed while the current node is
// marked as foreign.
TEST_F(GumboTokenizerTest, CData) {
  // SetInput() measures its argument with strlen() and would stop at the
  // first NUL, so the tokenizer is set up by hand.  The length comes from
  // sizeof on the array, minus the implicit terminator.
  static const char kInputWithNuls[] = "<![CDATA[\0filler\0text\0]]>";
  text_ = kInputWithNuls;
  gumbo_tokenizer_state_destroy(&parser_);
  gumbo_tokenizer_state_init(&parser_, text_, sizeof(kInputWithNuls) - 1);
  gumbo_tokenizer_set_is_current_node_foreign(&parser_, true);

  // The NUL right after "<![CDATA[" is reported as a NULL token.
  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_NULL, token_.type);
  EXPECT_EQ(0, token_.v.character);
  gumbo_token_destroy(&parser_, &token_);

  // The following 'f' is reported as a CDATA token.
  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_CDATA, token_.type);
  EXPECT_EQ('f', token_.v.character);
}
471 
// A tag-like sequence ("<head>") inside a style comment is expected to be
// emitted as plain characters.  The test puts the tokenizer into
// GUMBO_LEX_RCDATA for the element content.
TEST_F(GumboTokenizerTest, StyleHasTagEmbedded) {
  SetInput("<style>/* For <head> */</style>");
  // Consume the <style> start tag, switch states by hand, then skip the
  // seven characters of "/* For ".
  Advance(1);
  gumbo_tokenizer_set_state(&parser_, GUMBO_LEX_RCDATA);
  Advance(7);

  // "<he" arrives one character at a time.
  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  EXPECT_EQ('<', token_.v.character);
  gumbo_token_destroy(&parser_, &token_);

  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  EXPECT_EQ('h', token_.v.character);
  gumbo_token_destroy(&parser_, &token_);

  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  EXPECT_EQ('e', token_.v.character);
}
492 
// Checks the offset and original text of a <pre> start tag that follows a
// doctype and is itself followed by a CRLF.
TEST_F(GumboTokenizerTest, PreWithNewlines) {
  SetInput("<!DOCTYPE html><pre>\r\na</pre>");

  // Token 1: the doctype at the start of the input.
  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  ASSERT_EQ(GUMBO_TOKEN_DOCTYPE, token_.type);
  EXPECT_EQ(0, token_.position.offset);
  gumbo_token_destroy(&parser_, &token_);

  // Token 2: <pre>, starting right after the 15-byte doctype; its original
  // text does not include the newline that follows.
  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  ASSERT_EQ(GUMBO_TOKEN_START_TAG, token_.type);
  EXPECT_EQ(15, token_.position.offset);
  EXPECT_EQ("<pre>", ToString(token_.original_text));
}
505 
// "<br />" lexes as one self-closing start tag with no attributes; the
// original text covers the whole tag, including the space and slash.
TEST_F(GumboTokenizerTest, SelfClosingStartTag) {
  SetInput("<br />");
  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  // Fatal on mismatch: the start_tag union member is read below.
  ASSERT_EQ(GUMBO_TOKEN_START_TAG, token_.type);
  EXPECT_EQ("<br />", ToString(token_.original_text));
  EXPECT_EQ(0, token_.position.offset);

  GumboTokenStartTag* br_tag = &token_.v.start_tag;
  EXPECT_EQ(GUMBO_TAG_BR, br_tag->tag);
  EXPECT_TRUE(br_tag->is_self_closing);
  EXPECT_EQ(0, br_tag->attributes.length);
}
518 
// A start tag with a quoted and an unquoted attribute.  The parsed value has
// quotes removed and "&amp;" decoded; original_value keeps the source text
// verbatim.
TEST_F(GumboTokenizerTest, OpenTagWithAttributes) {
  SetInput("<a href ='/search?q=foo&amp;hl=en'  id=link>");
  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  // Fatal on mismatch: the start_tag union member is read below.
  ASSERT_EQ(GUMBO_TOKEN_START_TAG, token_.type);

  GumboTokenStartTag* anchor_tag = &token_.v.start_tag;
  EXPECT_EQ(GUMBO_TAG_A, anchor_tag->tag);
  EXPECT_FALSE(anchor_tag->is_self_closing);
  // Fatal on mismatch: both attribute slots are dereferenced below.
  ASSERT_EQ(2, anchor_tag->attributes.length);

  // First attribute: href, single-quoted in the source.
  GumboAttribute* href_attr =
      static_cast<GumboAttribute*>(anchor_tag->attributes.data[0]);
  EXPECT_STREQ("href", href_attr->name);
  EXPECT_EQ("href", ToString(href_attr->original_name));
  EXPECT_STREQ("/search?q=foo&hl=en", href_attr->value);
  EXPECT_EQ("'/search?q=foo&amp;hl=en'", ToString(href_attr->original_value));

  // Second attribute: id, unquoted in the source.
  GumboAttribute* id_attr =
      static_cast<GumboAttribute*>(anchor_tag->attributes.data[1]);
  EXPECT_STREQ("id", id_attr->name);
  EXPECT_EQ("id", ToString(id_attr->original_name));
  EXPECT_STREQ("link", id_attr->value);
  EXPECT_EQ("link", ToString(id_attr->original_value));
}
543 
// A "<?...>" construct is expected to lex as a comment whose text is
// everything between '<' and '>'; ordinary text lexing resumes afterwards.
TEST_F(GumboTokenizerTest, BogusComment1) {
  SetInput("<?xml is bogus-comment>Text");

  // Token 1: the bogus comment.
  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  ASSERT_EQ(GUMBO_TOKEN_COMMENT, token_.type);
  EXPECT_STREQ("?xml is bogus-comment", token_.v.text);
  gumbo_token_destroy(&parser_, &token_);

  // Token 2: the first character of the trailing text.
  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  EXPECT_EQ('T', token_.v.character);

  // This input is malformed on purpose; flag that errors are expected.
  errors_are_expected_ = true;
}
557 
// "</" followed by a non-letter is expected to lex as a comment that runs to
// the end of the input, followed by EOF.
TEST_F(GumboTokenizerTest, BogusComment2) {
  SetInput("</#bogus-comment");

  // Token 1: the bogus comment, without the leading "</".
  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  ASSERT_EQ(GUMBO_TOKEN_COMMENT, token_.type);
  EXPECT_STREQ("#bogus-comment", token_.v.text);
  gumbo_token_destroy(&parser_, &token_);

  // Token 2: end of input.
  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_EOF, token_.type);

  // This input is malformed on purpose; flag that errors are expected.
  errors_are_expected_ = true;
}
569 
// A quoted attribute value spanning several lines is expected back with its
// newlines and indentation intact.  The tag name "foo" is not a known tag.
TEST_F(GumboTokenizerTest, MultilineAttribute) {
  SetInput(
      "<foo long_attr=\"SomeCode;\n"
      "  calls_a_big_long_function();\n"
      "  return true;\" />");
  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  // Fatal on mismatch: the start_tag union member is read below.
  ASSERT_EQ(GUMBO_TOKEN_START_TAG, token_.type);

  GumboTokenStartTag* foo_tag = &token_.v.start_tag;
  EXPECT_EQ(GUMBO_TAG_UNKNOWN, foo_tag->tag);
  EXPECT_TRUE(foo_tag->is_self_closing);
  // Fatal on mismatch: the attribute slot is dereferenced below.
  ASSERT_EQ(1, foo_tag->attributes.length);

  GumboAttribute* multiline_attr =
      static_cast<GumboAttribute*>(foo_tag->attributes.data[0]);
  EXPECT_STREQ("long_attr", multiline_attr->name);
  EXPECT_EQ("long_attr", ToString(multiline_attr->original_name));
  EXPECT_STREQ(
      "SomeCode;\n"
      "  calls_a_big_long_function();\n"
      "  return true;",
      multiline_attr->value);
}
593 
// "&&" inside an attribute value is not a character reference and is
// expected to pass through unchanged.
TEST_F(GumboTokenizerTest, DoubleAmpersand) {
  SetInput("<span jsif=\"foo && bar\">");
  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  // Fatal on mismatch: the start_tag union member is read below.
  ASSERT_EQ(GUMBO_TOKEN_START_TAG, token_.type);

  GumboTokenStartTag* span_tag = &token_.v.start_tag;
  EXPECT_EQ(GUMBO_TAG_SPAN, span_tag->tag);
  EXPECT_FALSE(span_tag->is_self_closing);
  // Fatal on mismatch: the attribute slot is dereferenced below.
  ASSERT_EQ(1, span_tag->attributes.length);

  GumboAttribute* jsif_attr =
      static_cast<GumboAttribute*>(span_tag->attributes.data[0]);
  EXPECT_STREQ("jsif", jsif_attr->name);
  EXPECT_EQ("jsif", ToString(jsif_attr->original_name));
  // The parsed value drops the quotes; original_value keeps them.
  EXPECT_STREQ("foo && bar", jsif_attr->value);
  EXPECT_EQ("\"foo && bar\"", ToString(jsif_attr->original_value));
}
611 
// Lexes a complete <div ...>a</div> element.  It checks every attribute's
// text and source span, including an unquoted value that contains '<', and
// then the character and end tag tokens that follow.
TEST_F(GumboTokenizerTest, MatchedTagPair) {
  SetInput("<div id=dash<-Dash data-test=\"bar\">a</div>");

  // Token 1: the <div> start tag.
  ASSERT_TRUE(gumbo_lex(&parser_, &token_));
  ASSERT_EQ(GUMBO_TOKEN_START_TAG, token_.type);
  EXPECT_EQ(0, token_.position.offset);

  GumboTokenStartTag* div_tag = &token_.v.start_tag;
  EXPECT_EQ(GUMBO_TAG_DIV, div_tag->tag);
  EXPECT_FALSE(div_tag->is_self_closing);
  // Fatal on mismatch: both attribute slots are dereferenced below.
  ASSERT_EQ(2, div_tag->attributes.length);

  // First attribute: id=dash<-Dash (unquoted, '<' is part of the value).
  GumboAttribute* id_attr =
      static_cast<GumboAttribute*>(div_tag->attributes.data[0]);
  EXPECT_STREQ("id", id_attr->name);
  EXPECT_EQ("id", ToString(id_attr->original_name));
  EXPECT_EQ(1, id_attr->name_start.line);
  EXPECT_EQ(5, id_attr->name_start.offset);
  EXPECT_EQ(6, id_attr->name_start.column);
  EXPECT_EQ(8, id_attr->name_end.column);
  EXPECT_STREQ("dash<-Dash", id_attr->value);
  EXPECT_EQ("dash<-Dash", ToString(id_attr->original_value));
  EXPECT_EQ(9, id_attr->value_start.column);
  EXPECT_EQ(19, id_attr->value_end.column);

  // Second attribute: data-test="bar" (original_value keeps the quotes).
  GumboAttribute* data_test_attr =
      static_cast<GumboAttribute*>(div_tag->attributes.data[1]);
  EXPECT_STREQ("data-test", data_test_attr->name);
  EXPECT_EQ("data-test", ToString(data_test_attr->original_name));
  EXPECT_EQ(20, data_test_attr->name_start.column);
  EXPECT_EQ(29, data_test_attr->name_end.column);
  EXPECT_STREQ("bar", data_test_attr->value);
  EXPECT_EQ("\"bar\"", ToString(data_test_attr->original_value));
  EXPECT_EQ(30, data_test_attr->value_start.column);
  EXPECT_EQ(35, data_test_attr->value_end.column);
  gumbo_token_destroy(&parser_, &token_);

  // Token 2: the text content 'a'.
  ASSERT_TRUE(gumbo_lex(&parser_, &token_));
  ASSERT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  EXPECT_EQ(35, token_.position.offset);
  EXPECT_EQ('a', token_.v.character);
  gumbo_token_destroy(&parser_, &token_);

  // Token 3: the closing </div>.
  ASSERT_TRUE(gumbo_lex(&parser_, &token_));
  ASSERT_EQ(GUMBO_TOKEN_END_TAG, token_.type);
  EXPECT_EQ(GUMBO_TAG_DIV, token_.v.end_tag);

  // Flag that errors are expected for this input.
  errors_are_expected_ = true;
}
659 
// "</div</th>" is expected to lex as one end tag covering the whole input.
TEST_F(GumboTokenizerTest, BogusEndTag) {
  // According to the spec, a '<' inside a tag name does not start a new tag,
  // so it becomes part of the name (giving "div<", which is not a known tag).
  // The '/' then takes the tokenizer through the self-closing start tag state,
  // and the trailing "th" is read as an attribute, which is ignored because
  // end tags don't take attributes.
  SetInput("</div</th>");
  ASSERT_TRUE(gumbo_lex(&parser_, &token_));
  ASSERT_EQ(GUMBO_TOKEN_END_TAG, token_.type);
  EXPECT_EQ(GUMBO_TAG_UNKNOWN, token_.v.end_tag);
  EXPECT_EQ("</div</th>", ToString(token_.original_text));
  EXPECT_EQ(0, token_.position.offset);

  // This input is malformed on purpose; flag that errors are expected.
  errors_are_expected_ = true;
}
673 }  // namespace
674