1// Copyright 2010 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package html
6
7import (
8	"bufio"
9	"bytes"
10	"errors"
11	"fmt"
12	"io"
13	"io/ioutil"
14	"os"
15	"path/filepath"
16	"runtime"
17	"sort"
18	"strings"
19	"testing"
20
21	"golang.org/x/net/html/atom"
22)
23
// testAttrs holds one test case as read from a test data file.
type testAttrs struct {
	// text is the body of the "#data" section with one trailing newline
	// removed.
	text string
	// want is the body of the "#document" section, copied verbatim.
	want string
	// context is the whitespace-trimmed line that follows a
	// "#document-fragment" header, or "" when the case has no such header.
	context string
	// scripting defaults to true and is cleared by a "#script-...-off"
	// header.
	scripting bool
}

// skipToSectionHeader consumes lines from r until it meets one whose first
// byte is '#', and returns that line. When body is non-nil, every line
// consumed before the header is appended to *body. The returned slice aliases
// r's internal buffer and is only valid until the next read from r.
func skipToSectionHeader(r *bufio.Reader, body *[]byte) ([]byte, error) {
	for {
		line, err := r.ReadSlice('\n')
		if err != nil {
			return nil, err
		}
		if line[0] == '#' {
			return line, nil
		}
		if body != nil {
			*body = append(*body, line...)
		}
	}
}

// readParseTest reads a single test case from r.
//
// A case consists of a "#data" section, an "#errors" section (whose content
// is discarded), an optional "#script-on" / "#script-off" section, an
// optional "#document-fragment" section, and a final "#document" section
// that is terminated by an empty line or by the end of the input. Any read
// error, including io.EOF before the "#document" section has been reached,
// is returned unchanged.
func readParseTest(r *bufio.Reader) (*testAttrs, error) {
	header, err := r.ReadSlice('\n')
	if err != nil {
		return nil, err
	}
	if string(header) != "#data\n" {
		return nil, fmt.Errorf(`got %q want "#data\n"`, header)
	}
	tc := &testAttrs{scripting: true}

	// Collect the HTML up to the next section header.
	var data []byte
	if header, err = skipToSectionHeader(r, &data); err != nil {
		return nil, err
	}
	tc.text = strings.TrimSuffix(string(data), "\n")

	// The expected parse errors are not checked; skip over them.
	if string(header) != "#errors\n" {
		return nil, fmt.Errorf(`got %q want "#errors\n"`, header)
	}
	if header, err = skipToSectionHeader(r, nil); err != nil {
		return nil, err
	}

	// Optional scripting flag.
	if name := string(header); strings.HasPrefix(name, "#script-") {
		if strings.HasSuffix(name, "-on\n") {
			tc.scripting = true
		} else if strings.HasSuffix(name, "-off\n") {
			tc.scripting = false
		} else {
			return nil, fmt.Errorf(`got %q, want "#script-on" or "#script-off"`, header)
		}
		if header, err = skipToSectionHeader(r, nil); err != nil {
			return nil, err
		}
	}

	// Optional fragment context: exactly one line, followed by the next
	// header.
	if string(header) == "#document-fragment\n" {
		if header, err = r.ReadSlice('\n'); err != nil {
			return nil, err
		}
		tc.context = strings.TrimSpace(string(header))
		if header, err = r.ReadSlice('\n'); err != nil {
			return nil, err
		}
	}

	// Read the dump of what the parse tree should be.
	if string(header) != "#document\n" {
		return nil, fmt.Errorf(`got %q want "#document\n"`, header)
	}
	var (
		tree    []byte
		line    []byte
		inQuote bool
	)
	for {
		line, err = r.ReadSlice('\n')
		if err != nil && err != io.EOF {
			return nil, err
		}
		// Track whether we are inside a quoted text node, because such a
		// node may itself contain empty lines that must not end the section.
		if content := bytes.Trim(line, "| \n"); len(content) > 0 {
			piped := line[0] == '|'
			if piped && content[0] == '"' {
				inQuote = true
			}
			// A lone '"' on a "| " line opens a quote; it does not close one.
			if content[len(content)-1] == '"' && !(piped && len(content) == 1) {
				inQuote = false
			}
		}
		if len(line) == 0 {
			// End of input.
			break
		}
		if !inQuote && string(line) == "\n" {
			// Blank separator line between test cases.
			break
		}
		tree = append(tree, line...)
	}
	tc.want = string(tree)
	return tc, nil
}
128
// dumpIndent writes the line prefix for a dumped node at the given nesting
// level: "| " followed by two spaces per level. A level of zero or less
// yields just "| ". Write errors are not reported.
func dumpIndent(w io.Writer, level int) {
	io.WriteString(w, "| ")
	for ; level > 0; level-- {
		io.WriteString(w, "  ")
	}
}
135
136type sortedAttributes []Attribute
137
138func (a sortedAttributes) Len() int {
139	return len(a)
140}
141
142func (a sortedAttributes) Less(i, j int) bool {
143	if a[i].Namespace != a[j].Namespace {
144		return a[i].Namespace < a[j].Namespace
145	}
146	return a[i].Key < a[j].Key
147}
148
149func (a sortedAttributes) Swap(i, j int) {
150	a[i], a[j] = a[j], a[i]
151}
152
153func dumpLevel(w io.Writer, n *Node, level int) error {
154	dumpIndent(w, level)
155	level++
156	switch n.Type {
157	case ErrorNode:
158		return errors.New("unexpected ErrorNode")
159	case DocumentNode:
160		return errors.New("unexpected DocumentNode")
161	case ElementNode:
162		if n.Namespace != "" {
163			fmt.Fprintf(w, "<%s %s>", n.Namespace, n.Data)
164		} else {
165			fmt.Fprintf(w, "<%s>", n.Data)
166		}
167		attr := sortedAttributes(n.Attr)
168		sort.Sort(attr)
169		for _, a := range attr {
170			io.WriteString(w, "\n")
171			dumpIndent(w, level)
172			if a.Namespace != "" {
173				fmt.Fprintf(w, `%s %s="%s"`, a.Namespace, a.Key, a.Val)
174			} else {
175				fmt.Fprintf(w, `%s="%s"`, a.Key, a.Val)
176			}
177		}
178		if n.Namespace == "" && n.DataAtom == atom.Template {
179			io.WriteString(w, "\n")
180			dumpIndent(w, level)
181			level++
182			io.WriteString(w, "content")
183		}
184	case TextNode:
185		fmt.Fprintf(w, `"%s"`, n.Data)
186	case CommentNode:
187		fmt.Fprintf(w, "<!-- %s -->", n.Data)
188	case DoctypeNode:
189		fmt.Fprintf(w, "<!DOCTYPE %s", n.Data)
190		if n.Attr != nil {
191			var p, s string
192			for _, a := range n.Attr {
193				switch a.Key {
194				case "public":
195					p = a.Val
196				case "system":
197					s = a.Val
198				}
199			}
200			if p != "" || s != "" {
201				fmt.Fprintf(w, ` "%s"`, p)
202				fmt.Fprintf(w, ` "%s"`, s)
203			}
204		}
205		io.WriteString(w, ">")
206	case scopeMarkerNode:
207		return errors.New("unexpected scopeMarkerNode")
208	default:
209		return errors.New("unknown node type")
210	}
211	io.WriteString(w, "\n")
212	for c := n.FirstChild; c != nil; c = c.NextSibling {
213		if err := dumpLevel(w, c, level); err != nil {
214			return err
215		}
216	}
217	return nil
218}
219
220func dump(n *Node) (string, error) {
221	if n == nil || n.FirstChild == nil {
222		return "", nil
223	}
224	var b bytes.Buffer
225	for c := n.FirstChild; c != nil; c = c.NextSibling {
226		if err := dumpLevel(&b, c, 0); err != nil {
227			return "", err
228		}
229	}
230	return b.String(), nil
231}
232
// testDataDirs lists the directories that TestParser searches for *.dat test
// files. Each entry ends in a slash because the glob pattern is formed by
// appending "*.dat" directly to it.
var testDataDirs = []string{
	"testdata/webkit/",
	"testdata/go/",
}
234
235func TestParser(t *testing.T) {
236	for _, testDataDir := range testDataDirs {
237		testFiles, err := filepath.Glob(testDataDir + "*.dat")
238		if err != nil {
239			t.Fatal(err)
240		}
241		for _, tf := range testFiles {
242			f, err := os.Open(tf)
243			if err != nil {
244				t.Fatal(err)
245			}
246			defer f.Close()
247			r := bufio.NewReader(f)
248
249			for i := 0; ; i++ {
250				ta, err := readParseTest(r)
251				if err == io.EOF {
252					break
253				}
254				if err != nil {
255					t.Fatal(err)
256				}
257
258				err = testParseCase(ta.text, ta.want, ta.context, ParseOptionEnableScripting(ta.scripting))
259
260				if err != nil {
261					t.Errorf("%s test #%d %q, %s", tf, i, ta.text, err)
262				}
263			}
264		}
265	}
266}
267
268// Issue 16318
269func TestParserWithoutScripting(t *testing.T) {
270	text := `<noscript><img src='https://golang.org/doc/gopher/frontpage.png' /></noscript><p><img src='https://golang.org/doc/gopher/doc.png' /></p>`
271	want := `| <html>
272|   <head>
273|     <noscript>
274|   <body>
275|     <img>
276|       src="https://golang.org/doc/gopher/frontpage.png"
277|     <p>
278|       <img>
279|         src="https://golang.org/doc/gopher/doc.png"
280`
281
282	if err := testParseCase(text, want, "", ParseOptionEnableScripting(false)); err != nil {
283		t.Errorf("test with scripting is disabled, %q, %s", text, err)
284	}
285}
286
287// testParseCase tests one test case from the test files. If the test does not
288// pass, it returns an error that explains the failure.
289// text is the HTML to be parsed, want is a dump of the correct parse tree,
290// and context is the name of the context node, if any.
291func testParseCase(text, want, context string, opts ...ParseOption) (err error) {
292	defer func() {
293		if x := recover(); x != nil {
294			switch e := x.(type) {
295			case error:
296				err = e
297			default:
298				err = fmt.Errorf("%v", e)
299			}
300		}
301	}()
302
303	var doc *Node
304	if context == "" {
305		doc, err = ParseWithOptions(strings.NewReader(text), opts...)
306		if err != nil {
307			return err
308		}
309	} else {
310		namespace := ""
311		if i := strings.IndexByte(context, ' '); i >= 0 {
312			namespace, context = context[:i], context[i+1:]
313		}
314		contextNode := &Node{
315			Data:      context,
316			DataAtom:  atom.Lookup([]byte(context)),
317			Namespace: namespace,
318			Type:      ElementNode,
319		}
320		nodes, err := ParseFragmentWithOptions(strings.NewReader(text), contextNode, opts...)
321		if err != nil {
322			return err
323		}
324		doc = &Node{
325			Type: DocumentNode,
326		}
327		for _, n := range nodes {
328			doc.AppendChild(n)
329		}
330	}
331
332	if err := checkTreeConsistency(doc); err != nil {
333		return err
334	}
335
336	got, err := dump(doc)
337	if err != nil {
338		return err
339	}
340	// Compare the parsed tree to the #document section.
341	if got != want {
342		return fmt.Errorf("got vs want:\n----\n%s----\n%s----", got, want)
343	}
344
345	if renderTestBlacklist[text] || context != "" {
346		return nil
347	}
348
349	// Check that rendering and re-parsing results in an identical tree.
350	pr, pw := io.Pipe()
351	go func() {
352		pw.CloseWithError(Render(pw, doc))
353	}()
354	doc1, err := ParseWithOptions(pr, opts...)
355	if err != nil {
356		return err
357	}
358	got1, err := dump(doc1)
359	if err != nil {
360		return err
361	}
362	if got != got1 {
363		return fmt.Errorf("got vs got1:\n----\n%s----\n%s----", got, got1)
364	}
365
366	return nil
367}
368
// renderTestBlacklist is the set of test inputs for which testParseCase
// skips its render-and-re-parse check.
//
// Some test inputs result in parse trees that are not 'well-formed' despite
// following the HTML5 recovery algorithms. Rendering and re-parsing such a
// tree will not result in an exact clone of that tree, so we blacklist such
// inputs from the render test.
var renderTestBlacklist = map[string]bool{
	// The second <a> will be reparented to the first <table>'s parent. This
	// results in an <a> whose parent is an <a>, which is not 'well-formed'.
	`<a><table><td><a><table></table><a></tr><a></table><b>X</b>C<a>Y`: true,
	// The same thing with a <p>:
	`<p><table></p>`: true,
	// More cases of <a> being reparented:
	`<a href="blah">aba<table><a href="foo">br<tr><td></td></tr>x</table>aoe`: true,
	`<a><table><a></table><p><a><div><a>`:                                     true,
	`<a><table><td><a><table></table><a></tr><a></table><a>`:                  true,
	`<template><a><table><a>`:                                                 true,
	// A similar reparenting situation involving <nobr>:
	`<!DOCTYPE html><body><b><nobr>1<table><nobr></b><i><nobr>2<nobr></i>3`: true,
	// A <plaintext> element is reparented, putting it before a table.
	// A <plaintext> element can't have anything after it in HTML.
	`<table><plaintext><td>`:                                   true,
	`<!doctype html><table><plaintext></plaintext>`:            true,
	`<!doctype html><table><tbody><plaintext></plaintext>`:     true,
	`<!doctype html><table><tbody><tr><plaintext></plaintext>`: true,
	// A form inside a table inside a form doesn't work either.
	`<!doctype html><form><table></form><form></table></form>`: true,
	// A script that ends at EOF may escape its own closing tag when rendered.
	`<!doctype html><script><!--<script `:          true,
	`<!doctype html><script><!--<script <`:         true,
	`<!doctype html><script><!--<script <a`:        true,
	`<!doctype html><script><!--<script </`:        true,
	`<!doctype html><script><!--<script </s`:       true,
	`<!doctype html><script><!--<script </script`:  true,
	`<!doctype html><script><!--<script </scripta`: true,
	`<!doctype html><script><!--<script -`:         true,
	`<!doctype html><script><!--<script -a`:        true,
	`<!doctype html><script><!--<script -<`:        true,
	`<!doctype html><script><!--<script --`:        true,
	`<!doctype html><script><!--<script --a`:       true,
	`<!doctype html><script><!--<script --<`:       true,
	`<script><!--<script `:                         true,
	`<script><!--<script <a`:                       true,
	`<script><!--<script </script`:                 true,
	`<script><!--<script </scripta`:                true,
	`<script><!--<script -`:                        true,
	`<script><!--<script -a`:                       true,
	`<script><!--<script --`:                       true,
	`<script><!--<script --a`:                      true,
	`<script><!--<script <`:                        true,
	`<script><!--<script </`:                       true,
	`<script><!--<script </s`:                      true,
	// Reconstructing the active formatting elements results in a <plaintext>
	// element that contains an <a> element.
	`<!doctype html><p><a><plaintext>b`:         true,
	`<table><math><select><mi><select></table>`: true,
}
424
425func TestNodeConsistency(t *testing.T) {
426	// inconsistentNode is a Node whose DataAtom and Data do not agree.
427	inconsistentNode := &Node{
428		Type:     ElementNode,
429		DataAtom: atom.Frameset,
430		Data:     "table",
431	}
432	if _, err := ParseFragment(strings.NewReader("<p>hello</p>"), inconsistentNode); err == nil {
433		t.Errorf("got nil error, want non-nil")
434	}
435}
436
437func TestParseFragmentWithNilContext(t *testing.T) {
438	// This shouldn't panic.
439	ParseFragment(strings.NewReader("<p>hello</p>"), nil)
440}
441
442func BenchmarkParser(b *testing.B) {
443	buf, err := ioutil.ReadFile("testdata/go1.html")
444	if err != nil {
445		b.Fatalf("could not read testdata/go1.html: %v", err)
446	}
447	b.SetBytes(int64(len(buf)))
448	runtime.GC()
449	b.ReportAllocs()
450	b.ResetTimer()
451	for i := 0; i < b.N; i++ {
452		Parse(bytes.NewBuffer(buf))
453	}
454}
455