// Copyright 2010 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package html

import (
	"bytes"
	"io"
	"io/ioutil"
	"reflect"
	"runtime"
	"strings"
	"testing"
)

type tokenTest struct {
	// A short description of the test case.
	desc string
	// The HTML to parse.
	html string
	// The string representations of the expected tokens, joined by '$'.
	golden string
}

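// For example, the input "<a>b<c/>d</e>" tokenizes as five tokens, so its
// golden string is "<a>$b$<c/>$d$</e>".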
var tokenTests = []tokenTest{
	{
		"empty",
		"",
		"",
	},
	// A single text node. The tokenizer should not break text nodes on whitespace,
	// nor should it normalize whitespace within a text node.
	{
		"text",
		"foo  bar",
		"foo  bar",
	},
	// An entity.
	{
		"entity",
		"one &lt; two",
		"one &lt; two",
	},
	// A start, self-closing and end tag. The tokenizer does not care if the start
	// and end tokens don't match; that is the job of the parser.
	{
		"tags",
		"<a>b<c/>d</e>",
		"<a>$b$<c/>$d$</e>",
	},
	// Angle brackets that aren't a tag.
	{
		"not a tag #0",
		"<",
		"&lt;",
	},
	{
		"not a tag #1",
		"</",
		"&lt;/",
	},
	{
		"not a tag #2",
		"</>",
		"<!---->",
	},
	{
		"not a tag #3",
		"a</>b",
		"a$<!---->$b",
	},
	{
		"not a tag #4",
		"</ >",
		"<!-- -->",
	},
	{
		"not a tag #5",
		"</.",
		"<!--.-->",
	},
	{
		"not a tag #6",
		"</.>",
		"<!--.-->",
	},
	{
		"not a tag #7",
		"a < b",
		"a &lt; b",
	},
	{
		"not a tag #8",
		"<.>",
		"&lt;.&gt;",
	},
	{
		"not a tag #9",
		"a<<<b>>>c",
		"a&lt;&lt;$<b>$&gt;&gt;c",
	},
	{
		"not a tag #10",
		"if x<0 and y < 0 then x*y>0",
		"if x&lt;0 and y &lt; 0 then x*y&gt;0",
	},
	{
		"not a tag #11",
		"<<p>",
		"&lt;$<p>",
	},
	// EOF in a tag name.
	{
		"tag name eof #0",
		"<a",
		"",
	},
	{
		"tag name eof #1",
		"<a ",
		"",
	},
	{
		"tag name eof #2",
		"a<b",
		"a",
	},
	{
		"tag name eof #3",
		"<a><b",
		"<a>",
	},
	{
		"tag name eof #4",
		`<a x`,
		``,
	},
	// Some malformed tags that are missing a '>'.
	{
		"malformed tag #0",
		`<p</p>`,
		`<p< p="">`,
	},
	{
		"malformed tag #1",
		`<p </p>`,
		`<p <="" p="">`,
	},
	{
		"malformed tag #2",
		`<p id`,
		``,
	},
	{
		"malformed tag #3",
		`<p id=`,
		``,
	},
	{
		"malformed tag #4",
		`<p id=>`,
		`<p id="">`,
	},
	{
		"malformed tag #5",
		`<p id=0`,
		``,
	},
	{
		"malformed tag #6",
		`<p id=0</p>`,
		`<p id="0&lt;/p">`,
	},
	{
		"malformed tag #7",
		`<p id="0</p>`,
		``,
	},
	{
		"malformed tag #8",
		`<p id="0"</p>`,
		`<p id="0" <="" p="">`,
	},
	{
		"malformed tag #9",
		`<p></p id`,
		`<p>`,
	},
	// Raw text and RCDATA.
	{
		"basic raw text",
		"<script><a></b></script>",
		"<script>$&lt;a&gt;&lt;/b&gt;$</script>",
	},
	{
		"unfinished script end tag",
		"<SCRIPT>a</SCR",
		"<script>$a&lt;/SCR",
	},
	{
		"broken script end tag",
		"<SCRIPT>a</SCR ipt>",
		"<script>$a&lt;/SCR ipt&gt;",
	},
	{
		"EOF in script end tag",
		"<SCRIPT>a</SCRipt",
		"<script>$a&lt;/SCRipt",
	},
	{
		"scriptx end tag",
		"<SCRIPT>a</SCRiptx",
		"<script>$a&lt;/SCRiptx",
	},
	{
		"' ' completes script end tag",
		"<SCRIPT>a</SCRipt ",
		"<script>$a",
	},
	{
		"'>' completes script end tag",
		"<SCRIPT>a</SCRipt>",
		"<script>$a$</script>",
	},
	{
		"self-closing script end tag",
		"<SCRIPT>a</SCRipt/>",
		"<script>$a$</script>",
	},
	{
		"nested script tag",
		"<SCRIPT>a</SCRipt<script>",
		"<script>$a&lt;/SCRipt&lt;script&gt;",
	},
	{
		"script end tag after unfinished",
		"<SCRIPT>a</SCRipt</script>",
		"<script>$a&lt;/SCRipt$</script>",
	},
	{
		"script/style mismatched tags",
		"<script>a</style>",
		"<script>$a&lt;/style&gt;",
	},
	{
		"style element with entity",
		"<style>&apos;",
		"<style>$&amp;apos;",
	},
	{
		"textarea with tag",
		"<textarea><div></textarea>",
		"<textarea>$&lt;div&gt;$</textarea>",
	},
	{
		"title with tag and entity",
		"<title><b>K&amp;R C</b></title>",
		"<title>$&lt;b&gt;K&amp;R C&lt;/b&gt;$</title>",
	},
	{
		"title with trailing '&lt;' entity",
		"<title>foobar&lt;</title>",
		"<title>$foobar&lt;$</title>",
	},
	// DOCTYPE tests.
	{
		"Proper DOCTYPE",
		"<!DOCTYPE html>",
		"<!DOCTYPE html>",
	},
	{
		"DOCTYPE with no space",
		"<!doctypehtml>",
		"<!DOCTYPE html>",
	},
	{
		"DOCTYPE with two spaces",
		"<!doctype  html>",
		"<!DOCTYPE html>",
	},
	{
		"looks like DOCTYPE but isn't",
		"<!DOCUMENT html>",
		"<!--DOCUMENT html-->",
	},
	{
		"DOCTYPE at EOF",
		"<!DOCtype",
		"<!DOCTYPE >",
	},
	// XML processing instructions.
	{
		"XML processing instruction",
		"<?xml?>",
		"<!--?xml?-->",
	},
	// Comments.
	{
		"comment0",
		"abc<b><!-- skipme --></b>def",
		"abc$<b>$<!-- skipme -->$</b>$def",
	},
	{
		"comment1",
		"a<!-->z",
		"a$<!---->$z",
	},
	{
		"comment2",
		"a<!--->z",
		"a$<!---->$z",
	},
	{
		"comment3",
		"a<!--x>-->z",
		"a$<!--x>-->$z",
	},
	{
		"comment4",
		"a<!--x->-->z",
		"a$<!--x->-->$z",
	},
	{
		"comment5",
		"a<!>z",
		"a$<!---->$z",
	},
	{
		"comment6",
		"a<!->z",
		"a$<!----->$z",
	},
	{
		"comment7",
		"a<!---<>z",
		"a$<!---<>z-->",
	},
	{
		"comment8",
		"a<!--z",
		"a$<!--z-->",
	},
	{
		"comment9",
		"a<!--z-",
		"a$<!--z-->",
	},
	{
		"comment10",
		"a<!--z--",
		"a$<!--z-->",
	},
	{
		"comment11",
		"a<!--z---",
		"a$<!--z--->",
	},
	{
		"comment12",
		"a<!--z----",
		"a$<!--z---->",
	},
	{
		"comment13",
		"a<!--x--!>z",
		"a$<!--x-->$z",
	},
	// An attribute with a backslash.
	{
		"backslash",
		`<p id="a\"b">`,
		`<p id="a\" b"="">`,
	},
	// Entities, tag name and attribute key lower-casing, and whitespace
	// normalization within a tag.
	{
		"tricky",
		"<p \t\n iD=\"a&quot;B\"  foo=\"bar\"><EM>te&lt;&amp;;xt</em></p>",
		`<p id="a&#34;B" foo="bar">$<em>$te&lt;&amp;;xt$</em>$</p>`,
	},
	// A nonexistent entity. Tokenizing and converting back to a string should
	// escape the "&" to become "&amp;".
	{
		"noSuchEntity",
		`<a b="c&noSuchEntity;d">&lt;&alsoDoesntExist;&`,
		`<a b="c&amp;noSuchEntity;d">$&lt;&amp;alsoDoesntExist;&amp;`,
	},
	{
		"entity without semicolon",
		`&notit;&notin;<a b="q=z&amp=5&notice=hello&not;=world">`,
		`¬it;∉$<a b="q=z&amp;amp=5&amp;notice=hello¬=world">`,
	},
	{
		"entity with digits",
		"&frac12;",
		"½",
	},
	// Attribute tests:
	// http://dev.w3.org/html5/pf-summary/Overview.html#attributes
	{
		"Empty attribute",
		`<input disabled FOO>`,
		`<input disabled="" foo="">`,
	},
	{
		"Empty attribute, whitespace",
		`<input disabled FOO >`,
		`<input disabled="" foo="">`,
	},
	{
		"Unquoted attribute value",
		`<input value=yes FOO=BAR>`,
		`<input value="yes" foo="BAR">`,
	},
	{
		"Unquoted attribute value, spaces",
		`<input value = yes FOO = BAR>`,
		`<input value="yes" foo="BAR">`,
	},
	{
		"Unquoted attribute value, trailing space",
		`<input value=yes FOO=BAR >`,
		`<input value="yes" foo="BAR">`,
	},
	{
		"Single-quoted attribute value",
		`<input value='yes' FOO='BAR'>`,
		`<input value="yes" foo="BAR">`,
	},
	{
		"Single-quoted attribute value, trailing space",
		`<input value='yes' FOO='BAR' >`,
		`<input value="yes" foo="BAR">`,
	},
	{
		"Double-quoted attribute value",
		`<input value="I'm an attribute" FOO="BAR">`,
		`<input value="I&#39;m an attribute" foo="BAR">`,
	},
	{
		"Attribute name characters",
		`<meta http-equiv="content-type">`,
		`<meta http-equiv="content-type">`,
	},
	{
		"Mixed attributes",
		`a<P V="0 1" w='2' X=3 y>z`,
		`a$<p v="0 1" w="2" x="3" y="">$z`,
	},
	{
		"Attributes with a solitary single quote",
		`<p id=can't><p id=won't>`,
		`<p id="can&#39;t">$<p id="won&#39;t">`,
	},
}

func TestTokenizer(t *testing.T) {
loop:
	for _, tt := range tokenTests {
		z := NewTokenizer(strings.NewReader(tt.html))
		if tt.golden != "" {
			for i, s := range strings.Split(tt.golden, "$") {
				if z.Next() == ErrorToken {
					t.Errorf("%s token %d: want %q got error %v", tt.desc, i, s, z.Err())
					continue loop
				}
				actual := z.Token().String()
				if s != actual {
					t.Errorf("%s token %d: want %q got %q", tt.desc, i, s, actual)
					continue loop
				}
			}
		}
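		// After the last expected token, the tokenizer should run out of input
		// and report io.EOF.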
		z.Next()
		if z.Err() != io.EOF {
			t.Errorf("%s: want EOF got %q", tt.desc, z.Err())
		}
	}
}
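
// tokenStrings is an illustrative helper, not part of the original test suite:
// it runs the same loop as TestTokenizer and returns the string form of each
// token, which is what the '$'-joined golden strings above encode.
func tokenStrings(html string) []string {
	var out []string
	z := NewTokenizer(strings.NewReader(html))
	for z.Next() != ErrorToken {
		out = append(out, z.Token().String())
	}
	return out
}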

func TestMaxBuffer(t *testing.T) {
	// Exceeding the maximum buffer size generates ErrBufferExceeded.
	z := NewTokenizer(strings.NewReader("<" + strings.Repeat("t", 10)))
	z.SetMaxBuf(5)
	tt := z.Next()
	if got, want := tt, ErrorToken; got != want {
		t.Fatalf("token type: got: %v want: %v", got, want)
	}
	if got, want := z.Err(), ErrBufferExceeded; got != want {
		t.Errorf("error type: got: %v want: %v", got, want)
	}
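	// Raw should return the bytes that had been buffered when the limit was
	// exceeded: here, exactly maxBuf (5) bytes.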
	if got, want := string(z.Raw()), "<tttt"; got != want {
		t.Fatalf("buffered before overflow: got: %q want: %q", got, want)
	}
}

func TestMaxBufferReconstruction(t *testing.T) {
	// Exceeding the maximum buffer size at any point while tokenizing permits
	// reconstructing the original input.
tests:
	for _, test := range tokenTests {
		for maxBuf := 1; ; maxBuf++ {
			r := strings.NewReader(test.html)
			z := NewTokenizer(r)
			z.SetMaxBuf(maxBuf)
			var tokenized bytes.Buffer
			for {
				tt := z.Next()
				tokenized.Write(z.Raw())
				if tt == ErrorToken {
					if err := z.Err(); err != io.EOF && err != ErrBufferExceeded {
						t.Errorf("%s: unexpected error: %v", test.desc, err)
					}
					break
				}
			}
			// Reassemble the input from whatever was tokenized, whatever is
			// still buffered in the tokenizer, and whatever is left in the reader.
			assembled, err := ioutil.ReadAll(io.MultiReader(&tokenized, bytes.NewReader(z.Buffered()), r))
			if err != nil {
				t.Errorf("%s: ReadAll: %v", test.desc, err)
				continue tests
			}
			if got, want := string(assembled), test.html; got != want {
				t.Errorf("%s: reassembled html:\n got: %q\nwant: %q", test.desc, got, want)
				continue tests
			}
			// EOF indicates that we completed tokenization and hence found the max
			// maxBuf that generates ErrBufferExceeded, so continue to the next test.
			if z.Err() == io.EOF {
				break
			}
		} // buffer sizes
	} // tests
}

func TestPassthrough(t *testing.T) {
	// Accumulating the raw output for each parse event should reconstruct the
	// original input.
	for _, test := range tokenTests {
		z := NewTokenizer(strings.NewReader(test.html))
		var parsed bytes.Buffer
		for {
			tt := z.Next()
			parsed.Write(z.Raw())
			if tt == ErrorToken {
				break
			}
		}
		if got, want := parsed.String(), test.html; got != want {
			t.Errorf("%s: parsed output:\n got: %q\nwant: %q", test.desc, got, want)
		}
	}
}

func TestBufAPI(t *testing.T) {
	s := "0<a>1</a>2<b>3<a>4<a>5</a>6</b>7</a>8<a/>9"
	z := NewTokenizer(bytes.NewBufferString(s))
	var result bytes.Buffer
	depth := 0
loop:
	for {
		tt := z.Next()
		switch tt {
		case ErrorToken:
			if z.Err() != io.EOF {
				t.Error(z.Err())
			}
			break loop
		case TextToken:
			if depth > 0 {
				result.Write(z.Text())
			}
		case StartTagToken, EndTagToken:
			tn, _ := z.TagName()
			if len(tn) == 1 && tn[0] == 'a' {
				if tt == StartTagToken {
					depth++
				} else {
					depth--
				}
			}
		}
	}
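	// Only text inside <a> elements is collected: "1", "4", "5", "6" and "7".
	// The self-closing <a/> before "9" is neither a start nor an end tag token,
	// so it does not change the depth.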
	u := "14567"
	v := result.String()
	if u != v {
		t.Errorf("TestBufAPI: want %q got %q", u, v)
	}
}

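// convertNewlines is expected to normalize "\r" and "\r\n" to "\n", matching
// the newline normalization that HTML5 applies to the input stream.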
func TestConvertNewlines(t *testing.T) {
	testCases := map[string]string{
		"Mac\rDOS\r\nUnix\n":    "Mac\nDOS\nUnix\n",
		"Unix\nMac\rDOS\r\n":    "Unix\nMac\nDOS\n",
		"DOS\r\nDOS\r\nDOS\r\n": "DOS\nDOS\nDOS\n",
		"":                      "",
		"\n":                    "\n",
		"\n\r":                  "\n\n",
		"\r":                    "\n",
		"\r\n":                  "\n",
		"\r\n\n":                "\n\n",
		"\r\n\r":                "\n\n",
		"\r\n\r\n":              "\n\n",
		"\r\r":                  "\n\n",
		"\r\r\n":                "\n\n",
		"\r\r\n\n":              "\n\n\n",
		"\r\r\r\n":              "\n\n\n",
		"\r \n":                 "\n \n",
		"xyz":                   "xyz",
	}
	for in, want := range testCases {
		if got := string(convertNewlines([]byte(in))); got != want {
			t.Errorf("input %q: got %q, want %q", in, got, want)
		}
	}
}

func TestReaderEdgeCases(t *testing.T) {
	const s = "<p>An io.Reader can return (0, nil) or (n, io.EOF).</p>"
	testCases := []io.Reader{
		&zeroOneByteReader{s: s},
		&eofStringsReader{s: s},
		&stuckReader{},
	}
	for i, tc := range testCases {
		got := []TokenType{}
		z := NewTokenizer(tc)
		for {
			tt := z.Next()
			if tt == ErrorToken {
				break
			}
			got = append(got, tt)
		}
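		// The stuckReader never delivers any data, so for that case the
		// tokenizer is expected to give up with io.ErrNoProgress rather than
		// spin forever; the other readers should reach io.EOF.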
		if err := z.Err(); err != nil && err != io.EOF {
			if err != io.ErrNoProgress {
				t.Errorf("i=%d: %v", i, err)
			}
			continue
		}
		want := []TokenType{
			StartTagToken,
			TextToken,
			EndTagToken,
		}
		if !reflect.DeepEqual(got, want) {
			t.Errorf("i=%d: got %v, want %v", i, got, want)
			continue
		}
	}
}

// zeroOneByteReader is like a strings.Reader that alternates between
// returning 0 bytes and 1 byte at a time.
type zeroOneByteReader struct {
	s string
	n int
}

func (r *zeroOneByteReader) Read(p []byte) (int, error) {
	if len(p) == 0 {
		return 0, nil
	}
	if len(r.s) == 0 {
		return 0, io.EOF
	}
	r.n++
	if r.n%2 != 0 {
		return 0, nil
	}
	p[0], r.s = r.s[0], r.s[1:]
	return 1, nil
}

// eofStringsReader is like a strings.Reader but can return an (n, err) where
// n > 0 && err != nil.
type eofStringsReader struct {
	s string
}

func (r *eofStringsReader) Read(p []byte) (int, error) {
	n := copy(p, r.s)
	r.s = r.s[n:]
	if r.s != "" {
		return n, nil
	}
	return n, io.EOF
}

// stuckReader is an io.Reader that always returns no data and no error.
type stuckReader struct{}

func (*stuckReader) Read(p []byte) (int, error) {
	return 0, nil
}

const (
	rawLevel = iota
	lowLevel
	highLevel
)

func benchmarkTokenizer(b *testing.B, level int) {
	buf, err := ioutil.ReadFile("testdata/go1.html")
	if err != nil {
		b.Fatalf("could not read testdata/go1.html: %v", err)
	}
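	// SetBytes tells the benchmark framework how many bytes each iteration
	// processes, so results are also reported as throughput.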
	b.SetBytes(int64(len(buf)))
	runtime.GC()
	b.ReportAllocs()
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		z := NewTokenizer(bytes.NewBuffer(buf))
		for {
			tt := z.Next()
			if tt == ErrorToken {
				if err := z.Err(); err != nil && err != io.EOF {
					b.Fatalf("tokenizer error: %v", err)
				}
				break
			}
			switch level {
			case rawLevel:
				// Calling z.Raw just returns the raw bytes of the token. It does
				// not unescape &lt; to <, or lower-case tag names and attribute keys.
				z.Raw()
			case lowLevel:
				// Calling z.Text, z.TagName and z.TagAttr returns []byte values
				// whose contents may change on the next call to z.Next.
				switch tt {
				case TextToken, CommentToken, DoctypeToken:
					z.Text()
				case StartTagToken, SelfClosingTagToken:
					_, more := z.TagName()
					for more {
						_, _, more = z.TagAttr()
					}
				case EndTagToken:
					z.TagName()
				}
			case highLevel:
				// Calling z.Token converts []byte values to strings whose validity
				// extends beyond the next call to z.Next.
				z.Token()
			}
		}
	}
}

func BenchmarkRawLevelTokenizer(b *testing.B)  { benchmarkTokenizer(b, rawLevel) }
func BenchmarkLowLevelTokenizer(b *testing.B)  { benchmarkTokenizer(b, lowLevel) }
func BenchmarkHighLevelTokenizer(b *testing.B) { benchmarkTokenizer(b, highLevel) }