1// Copyright 2010 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package html
6
7import (
8	"bufio"
9	"bytes"
10	"errors"
11	"fmt"
12	"io"
13	"io/ioutil"
14	"os"
15	"path/filepath"
16	"runtime"
17	"sort"
18	"strings"
19	"testing"
20
21	"golang.org/x/net/html/atom"
22)
23
// testAttrs holds one test case as produced by readParseTest.
type testAttrs struct {
	// text is the HTML input from the #data section, without its final
	// newline.
	text string
	// want is the expected tree dump from the #document section.
	want string
	// context is the context element from the #document-fragment section,
	// or "" when the test case has no such section.
	context string
	// scripting is true unless the test case contains a #script-off section.
	scripting bool
}
28
29// readParseTest reads a single test case from r.
30func readParseTest(r *bufio.Reader) (*testAttrs, error) {
31	ta := &testAttrs{scripting: true}
32	line, err := r.ReadSlice('\n')
33	if err != nil {
34		return nil, err
35	}
36	var b []byte
37
38	// Read the HTML.
39	if string(line) != "#data\n" {
40		return nil, fmt.Errorf(`got %q want "#data\n"`, line)
41	}
42	for {
43		line, err = r.ReadSlice('\n')
44		if err != nil {
45			return nil, err
46		}
47		if line[0] == '#' {
48			break
49		}
50		b = append(b, line...)
51	}
52	ta.text = strings.TrimSuffix(string(b), "\n")
53	b = b[:0]
54
55	// Skip the error list.
56	if string(line) != "#errors\n" {
57		return nil, fmt.Errorf(`got %q want "#errors\n"`, line)
58	}
59	for {
60		line, err = r.ReadSlice('\n')
61		if err != nil {
62			return nil, err
63		}
64		if line[0] == '#' {
65			break
66		}
67	}
68
69	// Skip the new-errors list.
70	if string(line) == "#new-errors\n" {
71		for {
72			line, err = r.ReadSlice('\n')
73			if err != nil {
74				return nil, err
75			}
76			if line[0] == '#' {
77				break
78			}
79		}
80	}
81
82	if ls := string(line); strings.HasPrefix(ls, "#script-") {
83		switch {
84		case strings.HasSuffix(ls, "-on\n"):
85			ta.scripting = true
86		case strings.HasSuffix(ls, "-off\n"):
87			ta.scripting = false
88		default:
89			return nil, fmt.Errorf(`got %q, want "#script-on" or "#script-off"`, line)
90		}
91		for {
92			line, err = r.ReadSlice('\n')
93			if err != nil {
94				return nil, err
95			}
96			if line[0] == '#' {
97				break
98			}
99		}
100	}
101
102	if string(line) == "#document-fragment\n" {
103		line, err = r.ReadSlice('\n')
104		if err != nil {
105			return nil, err
106		}
107		ta.context = strings.TrimSpace(string(line))
108		line, err = r.ReadSlice('\n')
109		if err != nil {
110			return nil, err
111		}
112	}
113
114	// Read the dump of what the parse tree should be.
115	if string(line) != "#document\n" {
116		return nil, fmt.Errorf(`got %q want "#document\n"`, line)
117	}
118	inQuote := false
119	for {
120		line, err = r.ReadSlice('\n')
121		if err != nil && err != io.EOF {
122			return nil, err
123		}
124		trimmed := bytes.Trim(line, "| \n")
125		if len(trimmed) > 0 {
126			if line[0] == '|' && trimmed[0] == '"' {
127				inQuote = true
128			}
129			if trimmed[len(trimmed)-1] == '"' && !(line[0] == '|' && len(trimmed) == 1) {
130				inQuote = false
131			}
132		}
133		if len(line) == 0 || len(line) == 1 && line[0] == '\n' && !inQuote {
134			break
135		}
136		b = append(b, line...)
137	}
138	ta.want = string(b)
139	return ta, nil
140}
141
// dumpIndent writes the prefix of one dump line to w: "| " followed by two
// spaces per indentation level. A level of zero or less writes only "| ".
// Write errors are ignored.
func dumpIndent(w io.Writer, level int) {
	io.WriteString(w, "| ")
	for remaining := level; remaining > 0; remaining-- {
		io.WriteString(w, "  ")
	}
}
148
149type sortedAttributes []Attribute
150
151func (a sortedAttributes) Len() int {
152	return len(a)
153}
154
155func (a sortedAttributes) Less(i, j int) bool {
156	if a[i].Namespace != a[j].Namespace {
157		return a[i].Namespace < a[j].Namespace
158	}
159	return a[i].Key < a[j].Key
160}
161
162func (a sortedAttributes) Swap(i, j int) {
163	a[i], a[j] = a[j], a[i]
164}
165
166func dumpLevel(w io.Writer, n *Node, level int) error {
167	dumpIndent(w, level)
168	level++
169	switch n.Type {
170	case ErrorNode:
171		return errors.New("unexpected ErrorNode")
172	case DocumentNode:
173		return errors.New("unexpected DocumentNode")
174	case ElementNode:
175		if n.Namespace != "" {
176			fmt.Fprintf(w, "<%s %s>", n.Namespace, n.Data)
177		} else {
178			fmt.Fprintf(w, "<%s>", n.Data)
179		}
180		attr := sortedAttributes(n.Attr)
181		sort.Sort(attr)
182		for _, a := range attr {
183			io.WriteString(w, "\n")
184			dumpIndent(w, level)
185			if a.Namespace != "" {
186				fmt.Fprintf(w, `%s %s="%s"`, a.Namespace, a.Key, a.Val)
187			} else {
188				fmt.Fprintf(w, `%s="%s"`, a.Key, a.Val)
189			}
190		}
191		if n.Namespace == "" && n.DataAtom == atom.Template {
192			io.WriteString(w, "\n")
193			dumpIndent(w, level)
194			level++
195			io.WriteString(w, "content")
196		}
197	case TextNode:
198		fmt.Fprintf(w, `"%s"`, n.Data)
199	case CommentNode:
200		fmt.Fprintf(w, "<!-- %s -->", n.Data)
201	case DoctypeNode:
202		fmt.Fprintf(w, "<!DOCTYPE %s", n.Data)
203		if n.Attr != nil {
204			var p, s string
205			for _, a := range n.Attr {
206				switch a.Key {
207				case "public":
208					p = a.Val
209				case "system":
210					s = a.Val
211				}
212			}
213			if p != "" || s != "" {
214				fmt.Fprintf(w, ` "%s"`, p)
215				fmt.Fprintf(w, ` "%s"`, s)
216			}
217		}
218		io.WriteString(w, ">")
219	case scopeMarkerNode:
220		return errors.New("unexpected scopeMarkerNode")
221	default:
222		return errors.New("unknown node type")
223	}
224	io.WriteString(w, "\n")
225	for c := n.FirstChild; c != nil; c = c.NextSibling {
226		if err := dumpLevel(w, c, level); err != nil {
227			return err
228		}
229	}
230	return nil
231}
232
233func dump(n *Node) (string, error) {
234	if n == nil || n.FirstChild == nil {
235		return "", nil
236	}
237	var b bytes.Buffer
238	for c := n.FirstChild; c != nil; c = c.NextSibling {
239		if err := dumpLevel(&b, c, 0); err != nil {
240			return "", err
241		}
242	}
243	return b.String(), nil
244}
245
246var testDataDirs = []string{"testdata/webkit/", "testdata/go/"}
247
248func TestParser(t *testing.T) {
249	for _, testDataDir := range testDataDirs {
250		testFiles, err := filepath.Glob(testDataDir + "*.dat")
251		if err != nil {
252			t.Fatal(err)
253		}
254		for _, tf := range testFiles {
255			f, err := os.Open(tf)
256			if err != nil {
257				t.Fatal(err)
258			}
259			defer f.Close()
260			r := bufio.NewReader(f)
261
262			for i := 0; ; i++ {
263				ta, err := readParseTest(r)
264				if err == io.EOF {
265					break
266				}
267				if err != nil {
268					t.Fatal(err)
269				}
270				if parseTestBlacklist[ta.text] {
271					continue
272				}
273
274				err = testParseCase(ta.text, ta.want, ta.context, ParseOptionEnableScripting(ta.scripting))
275
276				if err != nil {
277					t.Errorf("%s test #%d %q, %s", tf, i, ta.text, err)
278				}
279			}
280		}
281	}
282}
283
284// Issue 16318
285func TestParserWithoutScripting(t *testing.T) {
286	text := `<noscript><img src='https://golang.org/doc/gopher/frontpage.png' /></noscript><p><img src='https://golang.org/doc/gopher/doc.png' /></p>`
287	want := `| <html>
288|   <head>
289|     <noscript>
290|   <body>
291|     <img>
292|       src="https://golang.org/doc/gopher/frontpage.png"
293|     <p>
294|       <img>
295|         src="https://golang.org/doc/gopher/doc.png"
296`
297
298	if err := testParseCase(text, want, "", ParseOptionEnableScripting(false)); err != nil {
299		t.Errorf("test with scripting is disabled, %q, %s", text, err)
300	}
301}
302
303// testParseCase tests one test case from the test files. If the test does not
304// pass, it returns an error that explains the failure.
305// text is the HTML to be parsed, want is a dump of the correct parse tree,
306// and context is the name of the context node, if any.
307func testParseCase(text, want, context string, opts ...ParseOption) (err error) {
308	defer func() {
309		if x := recover(); x != nil {
310			switch e := x.(type) {
311			case error:
312				err = e
313			default:
314				err = fmt.Errorf("%v", e)
315			}
316		}
317	}()
318
319	var doc *Node
320	if context == "" {
321		doc, err = ParseWithOptions(strings.NewReader(text), opts...)
322		if err != nil {
323			return err
324		}
325	} else {
326		namespace := ""
327		if i := strings.IndexByte(context, ' '); i >= 0 {
328			namespace, context = context[:i], context[i+1:]
329		}
330		contextNode := &Node{
331			Data:      context,
332			DataAtom:  atom.Lookup([]byte(context)),
333			Namespace: namespace,
334			Type:      ElementNode,
335		}
336		nodes, err := ParseFragmentWithOptions(strings.NewReader(text), contextNode, opts...)
337		if err != nil {
338			return err
339		}
340		doc = &Node{
341			Type: DocumentNode,
342		}
343		for _, n := range nodes {
344			doc.AppendChild(n)
345		}
346	}
347
348	if err := checkTreeConsistency(doc); err != nil {
349		return err
350	}
351
352	got, err := dump(doc)
353	if err != nil {
354		return err
355	}
356	// Compare the parsed tree to the #document section.
357	if got != want {
358		return fmt.Errorf("got vs want:\n----\n%s----\n%s----", got, want)
359	}
360
361	if renderTestBlacklist[text] || context != "" {
362		return nil
363	}
364
365	// Check that rendering and re-parsing results in an identical tree.
366	pr, pw := io.Pipe()
367	go func() {
368		pw.CloseWithError(Render(pw, doc))
369	}()
370	doc1, err := ParseWithOptions(pr, opts...)
371	if err != nil {
372		return err
373	}
374	got1, err := dump(doc1)
375	if err != nil {
376		return err
377	}
378	if got != got1 {
379		return fmt.Errorf("got vs got1:\n----\n%s----\n%s----", got, got1)
380	}
381
382	return nil
383}
384
// Some test inputs are simply skipped - we would otherwise fail the test. We
// blacklist such inputs from the parse test.
//
// The keys are the complete HTML inputs of the test cases to skip, compared
// verbatim against the text of each case read from the test files.
var parseTestBlacklist = map[string]bool{
	// See the a.Template TODO in inHeadIM.
	`<math><template><mo><template>`:                                     true,
	`<template><svg><foo><template><foreignObject><div></template><div>`: true,
}
392
// Some test inputs result in parse trees that are not 'well-formed' despite
// following the HTML5 recovery algorithms. Rendering and re-parsing such a
// tree will not result in an exact clone of that tree. We blacklist such
// inputs from the render test.
//
// The keys are the complete HTML inputs of the affected test cases; for
// these, testParseCase returns after comparing the parsed tree and skips the
// render and re-parse check.
var renderTestBlacklist = map[string]bool{
	// The second <a> will be reparented to the first <table>'s parent. This
	// results in an <a> whose parent is an <a>, which is not 'well-formed'.
	`<a><table><td><a><table></table><a></tr><a></table><b>X</b>C<a>Y`: true,
	// The same thing with a <p>:
	`<p><table></p>`: true,
	// More cases of <a> being reparented:
	`<a href="blah">aba<table><a href="foo">br<tr><td></td></tr>x</table>aoe`: true,
	`<a><table><a></table><p><a><div><a>`:                                     true,
	`<a><table><td><a><table></table><a></tr><a></table><a>`:                  true,
	`<template><a><table><a>`:                                                 true,
	// A similar reparenting situation involving <nobr>:
	`<!DOCTYPE html><body><b><nobr>1<table><nobr></b><i><nobr>2<nobr></i>3`: true,
	// A <plaintext> element is reparented, putting it before a table.
	// A <plaintext> element can't have anything after it in HTML.
	`<table><plaintext><td>`:                                   true,
	`<!doctype html><table><plaintext></plaintext>`:            true,
	`<!doctype html><table><tbody><plaintext></plaintext>`:     true,
	`<!doctype html><table><tbody><tr><plaintext></plaintext>`: true,
	// A form inside a table inside a form doesn't work either.
	`<!doctype html><form><table></form><form></table></form>`: true,
	// A script that ends at EOF may escape its own closing tag when rendered.
	`<!doctype html><script><!--<script `:          true,
	`<!doctype html><script><!--<script <`:         true,
	`<!doctype html><script><!--<script <a`:        true,
	`<!doctype html><script><!--<script </`:        true,
	`<!doctype html><script><!--<script </s`:       true,
	`<!doctype html><script><!--<script </script`:  true,
	`<!doctype html><script><!--<script </scripta`: true,
	`<!doctype html><script><!--<script -`:         true,
	`<!doctype html><script><!--<script -a`:        true,
	`<!doctype html><script><!--<script -<`:        true,
	`<!doctype html><script><!--<script --`:        true,
	`<!doctype html><script><!--<script --a`:       true,
	`<!doctype html><script><!--<script --<`:       true,
	`<script><!--<script `:                         true,
	`<script><!--<script <a`:                       true,
	`<script><!--<script </script`:                 true,
	`<script><!--<script </scripta`:                true,
	`<script><!--<script -`:                        true,
	`<script><!--<script -a`:                       true,
	`<script><!--<script --`:                       true,
	`<script><!--<script --a`:                      true,
	`<script><!--<script <`:                        true,
	`<script><!--<script </`:                       true,
	`<script><!--<script </s`:                      true,
	// Reconstructing the active formatting elements results in a <plaintext>
	// element that contains an <a> element.
	`<!doctype html><p><a><plaintext>b`:                       true,
	`<table><math><select><mi><select></table>`:               true,
	`<!doctype html><table><colgroup><plaintext></plaintext>`: true,
	`<!doctype html><svg><plaintext>a</plaintext>b`:           true,
}
450
451func TestNodeConsistency(t *testing.T) {
452	// inconsistentNode is a Node whose DataAtom and Data do not agree.
453	inconsistentNode := &Node{
454		Type:     ElementNode,
455		DataAtom: atom.Frameset,
456		Data:     "table",
457	}
458	if _, err := ParseFragment(strings.NewReader("<p>hello</p>"), inconsistentNode); err == nil {
459		t.Errorf("got nil error, want non-nil")
460	}
461}
462
463func TestParseFragmentWithNilContext(t *testing.T) {
464	// This shouldn't panic.
465	ParseFragment(strings.NewReader("<p>hello</p>"), nil)
466}
467
468func TestParseFragmentForeignContentTemplates(t *testing.T) {
469	srcs := []string{
470		"<math><html><template><mn><template></template></template>",
471		"<math><math><head><mi><template>",
472	}
473	for _, src := range srcs {
474		// The next line shouldn't infinite-loop.
475		ParseFragment(strings.NewReader(src), nil)
476	}
477}
478
479func BenchmarkParser(b *testing.B) {
480	buf, err := ioutil.ReadFile("testdata/go1.html")
481	if err != nil {
482		b.Fatalf("could not read testdata/go1.html: %v", err)
483	}
484	b.SetBytes(int64(len(buf)))
485	runtime.GC()
486	b.ReportAllocs()
487	b.ResetTimer()
488	for i := 0; i < b.N; i++ {
489		Parse(bytes.NewBuffer(buf))
490	}
491}
492