1// Copyright 2010 The Go Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style 3// license that can be found in the LICENSE file. 4 5package html 6 7import ( 8 "bufio" 9 "bytes" 10 "errors" 11 "fmt" 12 "io" 13 "io/ioutil" 14 "os" 15 "path/filepath" 16 "runtime" 17 "sort" 18 "strings" 19 "testing" 20 21 "golang.org/x/net/html/atom" 22) 23 24type testAttrs struct { 25 text, want, context string 26 scripting bool 27} 28 29// readParseTest reads a single test case from r. 30func readParseTest(r *bufio.Reader) (*testAttrs, error) { 31 ta := &testAttrs{scripting: true} 32 line, err := r.ReadSlice('\n') 33 if err != nil { 34 return nil, err 35 } 36 var b []byte 37 38 // Read the HTML. 39 if string(line) != "#data\n" { 40 return nil, fmt.Errorf(`got %q want "#data\n"`, line) 41 } 42 for { 43 line, err = r.ReadSlice('\n') 44 if err != nil { 45 return nil, err 46 } 47 if line[0] == '#' { 48 break 49 } 50 b = append(b, line...) 51 } 52 ta.text = strings.TrimSuffix(string(b), "\n") 53 b = b[:0] 54 55 // Skip the error list. 56 if string(line) != "#errors\n" { 57 return nil, fmt.Errorf(`got %q want "#errors\n"`, line) 58 } 59 for { 60 line, err = r.ReadSlice('\n') 61 if err != nil { 62 return nil, err 63 } 64 if line[0] == '#' { 65 break 66 } 67 } 68 69 if ls := string(line); strings.HasPrefix(ls, "#script-") { 70 switch { 71 case strings.HasSuffix(ls, "-on\n"): 72 ta.scripting = true 73 case strings.HasSuffix(ls, "-off\n"): 74 ta.scripting = false 75 default: 76 return nil, fmt.Errorf(`got %q, want "#script-on" or "#script-off"`, line) 77 } 78 for { 79 line, err = r.ReadSlice('\n') 80 if err != nil { 81 return nil, err 82 } 83 if line[0] == '#' { 84 break 85 } 86 } 87 } 88 89 if string(line) == "#document-fragment\n" { 90 line, err = r.ReadSlice('\n') 91 if err != nil { 92 return nil, err 93 } 94 ta.context = strings.TrimSpace(string(line)) 95 line, err = r.ReadSlice('\n') 96 if err != nil { 97 return nil, err 98 } 99 } 100 101 // Read the dump of what the parse tree should be. 102 if string(line) != "#document\n" { 103 return nil, fmt.Errorf(`got %q want "#document\n"`, line) 104 } 105 inQuote := false 106 for { 107 line, err = r.ReadSlice('\n') 108 if err != nil && err != io.EOF { 109 return nil, err 110 } 111 trimmed := bytes.Trim(line, "| \n") 112 if len(trimmed) > 0 { 113 if line[0] == '|' && trimmed[0] == '"' { 114 inQuote = true 115 } 116 if trimmed[len(trimmed)-1] == '"' && !(line[0] == '|' && len(trimmed) == 1) { 117 inQuote = false 118 } 119 } 120 if len(line) == 0 || len(line) == 1 && line[0] == '\n' && !inQuote { 121 break 122 } 123 b = append(b, line...) 124 } 125 ta.want = string(b) 126 return ta, nil 127} 128 129func dumpIndent(w io.Writer, level int) { 130 io.WriteString(w, "| ") 131 for i := 0; i < level; i++ { 132 io.WriteString(w, " ") 133 } 134} 135 136type sortedAttributes []Attribute 137 138func (a sortedAttributes) Len() int { 139 return len(a) 140} 141 142func (a sortedAttributes) Less(i, j int) bool { 143 if a[i].Namespace != a[j].Namespace { 144 return a[i].Namespace < a[j].Namespace 145 } 146 return a[i].Key < a[j].Key 147} 148 149func (a sortedAttributes) Swap(i, j int) { 150 a[i], a[j] = a[j], a[i] 151} 152 153func dumpLevel(w io.Writer, n *Node, level int) error { 154 dumpIndent(w, level) 155 level++ 156 switch n.Type { 157 case ErrorNode: 158 return errors.New("unexpected ErrorNode") 159 case DocumentNode: 160 return errors.New("unexpected DocumentNode") 161 case ElementNode: 162 if n.Namespace != "" { 163 fmt.Fprintf(w, "<%s %s>", n.Namespace, n.Data) 164 } else { 165 fmt.Fprintf(w, "<%s>", n.Data) 166 } 167 attr := sortedAttributes(n.Attr) 168 sort.Sort(attr) 169 for _, a := range attr { 170 io.WriteString(w, "\n") 171 dumpIndent(w, level) 172 if a.Namespace != "" { 173 fmt.Fprintf(w, `%s %s="%s"`, a.Namespace, a.Key, a.Val) 174 } else { 175 fmt.Fprintf(w, `%s="%s"`, a.Key, a.Val) 176 } 177 } 178 if n.Namespace == "" && n.DataAtom == atom.Template { 179 io.WriteString(w, "\n") 180 dumpIndent(w, level) 181 level++ 182 io.WriteString(w, "content") 183 } 184 case TextNode: 185 fmt.Fprintf(w, `"%s"`, n.Data) 186 case CommentNode: 187 fmt.Fprintf(w, "<!-- %s -->", n.Data) 188 case DoctypeNode: 189 fmt.Fprintf(w, "<!DOCTYPE %s", n.Data) 190 if n.Attr != nil { 191 var p, s string 192 for _, a := range n.Attr { 193 switch a.Key { 194 case "public": 195 p = a.Val 196 case "system": 197 s = a.Val 198 } 199 } 200 if p != "" || s != "" { 201 fmt.Fprintf(w, ` "%s"`, p) 202 fmt.Fprintf(w, ` "%s"`, s) 203 } 204 } 205 io.WriteString(w, ">") 206 case scopeMarkerNode: 207 return errors.New("unexpected scopeMarkerNode") 208 default: 209 return errors.New("unknown node type") 210 } 211 io.WriteString(w, "\n") 212 for c := n.FirstChild; c != nil; c = c.NextSibling { 213 if err := dumpLevel(w, c, level); err != nil { 214 return err 215 } 216 } 217 return nil 218} 219 220func dump(n *Node) (string, error) { 221 if n == nil || n.FirstChild == nil { 222 return "", nil 223 } 224 var b bytes.Buffer 225 for c := n.FirstChild; c != nil; c = c.NextSibling { 226 if err := dumpLevel(&b, c, 0); err != nil { 227 return "", err 228 } 229 } 230 return b.String(), nil 231} 232 233var testDataDirs = []string{"testdata/webkit/", "testdata/go/"} 234 235func TestParser(t *testing.T) { 236 for _, testDataDir := range testDataDirs { 237 testFiles, err := filepath.Glob(testDataDir + "*.dat") 238 if err != nil { 239 t.Fatal(err) 240 } 241 for _, tf := range testFiles { 242 f, err := os.Open(tf) 243 if err != nil { 244 t.Fatal(err) 245 } 246 defer f.Close() 247 r := bufio.NewReader(f) 248 249 for i := 0; ; i++ { 250 ta, err := readParseTest(r) 251 if err == io.EOF { 252 break 253 } 254 if err != nil { 255 t.Fatal(err) 256 } 257 258 err = testParseCase(ta.text, ta.want, ta.context, ParseOptionEnableScripting(ta.scripting)) 259 260 if err != nil { 261 t.Errorf("%s test #%d %q, %s", tf, i, ta.text, err) 262 } 263 } 264 } 265 } 266} 267 268// Issue 16318 269func TestParserWithoutScripting(t *testing.T) { 270 text := `<noscript><img src='https://golang.org/doc/gopher/frontpage.png' /></noscript><p><img src='https://golang.org/doc/gopher/doc.png' /></p>` 271 want := `| <html> 272| <head> 273| <noscript> 274| <body> 275| <img> 276| src="https://golang.org/doc/gopher/frontpage.png" 277| <p> 278| <img> 279| src="https://golang.org/doc/gopher/doc.png" 280` 281 282 if err := testParseCase(text, want, "", ParseOptionEnableScripting(false)); err != nil { 283 t.Errorf("test with scripting is disabled, %q, %s", text, err) 284 } 285} 286 287// testParseCase tests one test case from the test files. If the test does not 288// pass, it returns an error that explains the failure. 289// text is the HTML to be parsed, want is a dump of the correct parse tree, 290// and context is the name of the context node, if any. 291func testParseCase(text, want, context string, opts ...ParseOption) (err error) { 292 defer func() { 293 if x := recover(); x != nil { 294 switch e := x.(type) { 295 case error: 296 err = e 297 default: 298 err = fmt.Errorf("%v", e) 299 } 300 } 301 }() 302 303 var doc *Node 304 if context == "" { 305 doc, err = ParseWithOptions(strings.NewReader(text), opts...) 306 if err != nil { 307 return err 308 } 309 } else { 310 namespace := "" 311 if i := strings.IndexByte(context, ' '); i >= 0 { 312 namespace, context = context[:i], context[i+1:] 313 } 314 contextNode := &Node{ 315 Data: context, 316 DataAtom: atom.Lookup([]byte(context)), 317 Namespace: namespace, 318 Type: ElementNode, 319 } 320 nodes, err := ParseFragmentWithOptions(strings.NewReader(text), contextNode, opts...) 321 if err != nil { 322 return err 323 } 324 doc = &Node{ 325 Type: DocumentNode, 326 } 327 for _, n := range nodes { 328 doc.AppendChild(n) 329 } 330 } 331 332 if err := checkTreeConsistency(doc); err != nil { 333 return err 334 } 335 336 got, err := dump(doc) 337 if err != nil { 338 return err 339 } 340 // Compare the parsed tree to the #document section. 341 if got != want { 342 return fmt.Errorf("got vs want:\n----\n%s----\n%s----", got, want) 343 } 344 345 if renderTestBlacklist[text] || context != "" { 346 return nil 347 } 348 349 // Check that rendering and re-parsing results in an identical tree. 350 pr, pw := io.Pipe() 351 go func() { 352 pw.CloseWithError(Render(pw, doc)) 353 }() 354 doc1, err := ParseWithOptions(pr, opts...) 355 if err != nil { 356 return err 357 } 358 got1, err := dump(doc1) 359 if err != nil { 360 return err 361 } 362 if got != got1 { 363 return fmt.Errorf("got vs got1:\n----\n%s----\n%s----", got, got1) 364 } 365 366 return nil 367} 368 369// Some test input result in parse trees are not 'well-formed' despite 370// following the HTML5 recovery algorithms. Rendering and re-parsing such a 371// tree will not result in an exact clone of that tree. We blacklist such 372// inputs from the render test. 373var renderTestBlacklist = map[string]bool{ 374 // The second <a> will be reparented to the first <table>'s parent. This 375 // results in an <a> whose parent is an <a>, which is not 'well-formed'. 376 `<a><table><td><a><table></table><a></tr><a></table><b>X</b>C<a>Y`: true, 377 // The same thing with a <p>: 378 `<p><table></p>`: true, 379 // More cases of <a> being reparented: 380 `<a href="blah">aba<table><a href="foo">br<tr><td></td></tr>x</table>aoe`: true, 381 `<a><table><a></table><p><a><div><a>`: true, 382 `<a><table><td><a><table></table><a></tr><a></table><a>`: true, 383 `<template><a><table><a>`: true, 384 // A similar reparenting situation involving <nobr>: 385 `<!DOCTYPE html><body><b><nobr>1<table><nobr></b><i><nobr>2<nobr></i>3`: true, 386 // A <plaintext> element is reparented, putting it before a table. 387 // A <plaintext> element can't have anything after it in HTML. 388 `<table><plaintext><td>`: true, 389 `<!doctype html><table><plaintext></plaintext>`: true, 390 `<!doctype html><table><tbody><plaintext></plaintext>`: true, 391 `<!doctype html><table><tbody><tr><plaintext></plaintext>`: true, 392 // A form inside a table inside a form doesn't work either. 393 `<!doctype html><form><table></form><form></table></form>`: true, 394 // A script that ends at EOF may escape its own closing tag when rendered. 395 `<!doctype html><script><!--<script `: true, 396 `<!doctype html><script><!--<script <`: true, 397 `<!doctype html><script><!--<script <a`: true, 398 `<!doctype html><script><!--<script </`: true, 399 `<!doctype html><script><!--<script </s`: true, 400 `<!doctype html><script><!--<script </script`: true, 401 `<!doctype html><script><!--<script </scripta`: true, 402 `<!doctype html><script><!--<script -`: true, 403 `<!doctype html><script><!--<script -a`: true, 404 `<!doctype html><script><!--<script -<`: true, 405 `<!doctype html><script><!--<script --`: true, 406 `<!doctype html><script><!--<script --a`: true, 407 `<!doctype html><script><!--<script --<`: true, 408 `<script><!--<script `: true, 409 `<script><!--<script <a`: true, 410 `<script><!--<script </script`: true, 411 `<script><!--<script </scripta`: true, 412 `<script><!--<script -`: true, 413 `<script><!--<script -a`: true, 414 `<script><!--<script --`: true, 415 `<script><!--<script --a`: true, 416 `<script><!--<script <`: true, 417 `<script><!--<script </`: true, 418 `<script><!--<script </s`: true, 419 // Reconstructing the active formatting elements results in a <plaintext> 420 // element that contains an <a> element. 421 `<!doctype html><p><a><plaintext>b`: true, 422 `<table><math><select><mi><select></table>`: true, 423} 424 425func TestNodeConsistency(t *testing.T) { 426 // inconsistentNode is a Node whose DataAtom and Data do not agree. 427 inconsistentNode := &Node{ 428 Type: ElementNode, 429 DataAtom: atom.Frameset, 430 Data: "table", 431 } 432 if _, err := ParseFragment(strings.NewReader("<p>hello</p>"), inconsistentNode); err == nil { 433 t.Errorf("got nil error, want non-nil") 434 } 435} 436 437func TestParseFragmentWithNilContext(t *testing.T) { 438 // This shouldn't panic. 439 ParseFragment(strings.NewReader("<p>hello</p>"), nil) 440} 441 442func BenchmarkParser(b *testing.B) { 443 buf, err := ioutil.ReadFile("testdata/go1.html") 444 if err != nil { 445 b.Fatalf("could not read testdata/go1.html: %v", err) 446 } 447 b.SetBytes(int64(len(buf))) 448 runtime.GC() 449 b.ReportAllocs() 450 b.ResetTimer() 451 for i := 0; i < b.N; i++ { 452 Parse(bytes.NewBuffer(buf)) 453 } 454} 455