// Copyright 2010 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package html

import (
	"errors"
	"fmt"
	"io"
	"strings"

	a "golang.org/x/net/html/atom"
)

// A parser implements the HTML5 parsing algorithm:
// https://html.spec.whatwg.org/multipage/syntax.html#tree-construction
type parser struct {
	// tokenizer provides the tokens for the parser.
	tokenizer *Tokenizer
	// tok is the most recently read token.
	tok Token
	// Self-closing tags like <hr/> are treated as start tags, except that
	// hasSelfClosingToken is set while they are being processed.
	hasSelfClosingToken bool
	// doc is the document root element.
	doc *Node
	// The stack of open elements (section 12.2.4.2) and active formatting
	// elements (section 12.2.4.3).
	oe, afe nodeStack
	// Element pointers (section 12.2.4.4).
	head, form *Node
	// Other parsing state flags (section 12.2.4.5).
	scripting, framesetOK bool
	// templateStack is the stack of template insertion modes.
	templateStack insertionModeStack
	// im is the current insertion mode.
	im insertionMode
	// originalIM is the insertion mode to go back to after completing a text
	// or inTableText insertion mode.
	originalIM insertionMode
	// fosterParenting is whether new elements should be inserted according to
	// the foster parenting rules (section 12.2.6.1).
	fosterParenting bool
	// quirks is whether the parser is operating in "quirks mode."
	quirks bool
	// fragment is whether the parser is parsing an HTML fragment.
	fragment bool
	// context is the context element when parsing an HTML fragment
	// (section 12.4).
	context *Node
}

// top returns the node reported by p.oe.top(), i.e. the top of the stack of
// open elements. When that call yields nil, top falls back to the document
// root p.doc, so callers always get a node to insert into.
func (p *parser) top() *Node {
	if n := p.oe.top(); n != nil {
		return n
	}
	return p.doc
}

// Stop tags for use in popUntil. These come from section 12.2.4.2.
62var ( 63 defaultScopeStopTags = map[string][]a.Atom{ 64 "": {a.Applet, a.Caption, a.Html, a.Table, a.Td, a.Th, a.Marquee, a.Object, a.Template}, 65 "math": {a.AnnotationXml, a.Mi, a.Mn, a.Mo, a.Ms, a.Mtext}, 66 "svg": {a.Desc, a.ForeignObject, a.Title}, 67 } 68) 69 70type scope int 71 72const ( 73 defaultScope scope = iota 74 listItemScope 75 buttonScope 76 tableScope 77 tableRowScope 78 tableBodyScope 79 selectScope 80) 81 82// popUntil pops the stack of open elements at the highest element whose tag 83// is in matchTags, provided there is no higher element in the scope's stop 84// tags (as defined in section 12.2.4.2). It returns whether or not there was 85// such an element. If there was not, popUntil leaves the stack unchanged. 86// 87// For example, the set of stop tags for table scope is: "html", "table". If 88// the stack was: 89// ["html", "body", "font", "table", "b", "i", "u"] 90// then popUntil(tableScope, "font") would return false, but 91// popUntil(tableScope, "i") would return true and the stack would become: 92// ["html", "body", "font", "table", "b"] 93// 94// If an element's tag is in both the stop tags and matchTags, then the stack 95// will be popped and the function returns true (provided, of course, there was 96// no higher element in the stack that was also in the stop tags). For example, 97// popUntil(tableScope, "table") returns true and leaves: 98// ["html", "body", "font"] 99func (p *parser) popUntil(s scope, matchTags ...a.Atom) bool { 100 if i := p.indexOfElementInScope(s, matchTags...); i != -1 { 101 p.oe = p.oe[:i] 102 return true 103 } 104 return false 105} 106 107// indexOfElementInScope returns the index in p.oe of the highest element whose 108// tag is in matchTags that is in scope. If no matching element is in scope, it 109// returns -1. 
110func (p *parser) indexOfElementInScope(s scope, matchTags ...a.Atom) int { 111 for i := len(p.oe) - 1; i >= 0; i-- { 112 tagAtom := p.oe[i].DataAtom 113 if p.oe[i].Namespace == "" { 114 for _, t := range matchTags { 115 if t == tagAtom { 116 return i 117 } 118 } 119 switch s { 120 case defaultScope: 121 // No-op. 122 case listItemScope: 123 if tagAtom == a.Ol || tagAtom == a.Ul { 124 return -1 125 } 126 case buttonScope: 127 if tagAtom == a.Button { 128 return -1 129 } 130 case tableScope: 131 if tagAtom == a.Html || tagAtom == a.Table || tagAtom == a.Template { 132 return -1 133 } 134 case selectScope: 135 if tagAtom != a.Optgroup && tagAtom != a.Option { 136 return -1 137 } 138 default: 139 panic("unreachable") 140 } 141 } 142 switch s { 143 case defaultScope, listItemScope, buttonScope: 144 for _, t := range defaultScopeStopTags[p.oe[i].Namespace] { 145 if t == tagAtom { 146 return -1 147 } 148 } 149 } 150 } 151 return -1 152} 153 154// elementInScope is like popUntil, except that it doesn't modify the stack of 155// open elements. 156func (p *parser) elementInScope(s scope, matchTags ...a.Atom) bool { 157 return p.indexOfElementInScope(s, matchTags...) != -1 158} 159 160// clearStackToContext pops elements off the stack of open elements until a 161// scope-defined element is found. 
162func (p *parser) clearStackToContext(s scope) { 163 for i := len(p.oe) - 1; i >= 0; i-- { 164 tagAtom := p.oe[i].DataAtom 165 switch s { 166 case tableScope: 167 if tagAtom == a.Html || tagAtom == a.Table || tagAtom == a.Template { 168 p.oe = p.oe[:i+1] 169 return 170 } 171 case tableRowScope: 172 if tagAtom == a.Html || tagAtom == a.Tr || tagAtom == a.Template { 173 p.oe = p.oe[:i+1] 174 return 175 } 176 case tableBodyScope: 177 if tagAtom == a.Html || tagAtom == a.Tbody || tagAtom == a.Tfoot || tagAtom == a.Thead || tagAtom == a.Template { 178 p.oe = p.oe[:i+1] 179 return 180 } 181 default: 182 panic("unreachable") 183 } 184 } 185} 186 187// parseGenericRawTextElements implements the generic raw text element parsing 188// algorithm defined in 12.2.6.2. 189// https://html.spec.whatwg.org/multipage/parsing.html#parsing-elements-that-contain-only-text 190// TODO: Since both RAWTEXT and RCDATA states are treated as tokenizer's part 191// officially, need to make tokenizer consider both states. 192func (p *parser) parseGenericRawTextElement() { 193 p.addElement() 194 p.originalIM = p.im 195 p.im = textIM 196} 197 198// generateImpliedEndTags pops nodes off the stack of open elements as long as 199// the top node has a tag name of dd, dt, li, optgroup, option, p, rb, rp, rt or rtc. 200// If exceptions are specified, nodes with that name will not be popped off. 201func (p *parser) generateImpliedEndTags(exceptions ...string) { 202 var i int 203loop: 204 for i = len(p.oe) - 1; i >= 0; i-- { 205 n := p.oe[i] 206 if n.Type != ElementNode { 207 break 208 } 209 switch n.DataAtom { 210 case a.Dd, a.Dt, a.Li, a.Optgroup, a.Option, a.P, a.Rb, a.Rp, a.Rt, a.Rtc: 211 for _, except := range exceptions { 212 if n.Data == except { 213 break loop 214 } 215 } 216 continue 217 } 218 break 219 } 220 221 p.oe = p.oe[:i+1] 222} 223 224// addChild adds a child node n to the top element, and pushes n onto the stack 225// of open elements if it is an element node. 
226func (p *parser) addChild(n *Node) { 227 if p.shouldFosterParent() { 228 p.fosterParent(n) 229 } else { 230 p.top().AppendChild(n) 231 } 232 233 if n.Type == ElementNode { 234 p.oe = append(p.oe, n) 235 } 236} 237 238// shouldFosterParent returns whether the next node to be added should be 239// foster parented. 240func (p *parser) shouldFosterParent() bool { 241 if p.fosterParenting { 242 switch p.top().DataAtom { 243 case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr: 244 return true 245 } 246 } 247 return false 248} 249 250// fosterParent adds a child node according to the foster parenting rules. 251// Section 12.2.6.1, "foster parenting". 252func (p *parser) fosterParent(n *Node) { 253 var table, parent, prev, template *Node 254 var i int 255 for i = len(p.oe) - 1; i >= 0; i-- { 256 if p.oe[i].DataAtom == a.Table { 257 table = p.oe[i] 258 break 259 } 260 } 261 262 var j int 263 for j = len(p.oe) - 1; j >= 0; j-- { 264 if p.oe[j].DataAtom == a.Template { 265 template = p.oe[j] 266 break 267 } 268 } 269 270 if template != nil && (table == nil || j > i) { 271 template.AppendChild(n) 272 return 273 } 274 275 if table == nil { 276 // The foster parent is the html element. 277 parent = p.oe[0] 278 } else { 279 parent = table.Parent 280 } 281 if parent == nil { 282 parent = p.oe[i-1] 283 } 284 285 if table != nil { 286 prev = table.PrevSibling 287 } else { 288 prev = parent.LastChild 289 } 290 if prev != nil && prev.Type == TextNode && n.Type == TextNode { 291 prev.Data += n.Data 292 return 293 } 294 295 parent.InsertBefore(n, table) 296} 297 298// addText adds text to the preceding node if it is a text node, or else it 299// calls addChild with a new text node. 
300func (p *parser) addText(text string) { 301 if text == "" { 302 return 303 } 304 305 if p.shouldFosterParent() { 306 p.fosterParent(&Node{ 307 Type: TextNode, 308 Data: text, 309 }) 310 return 311 } 312 313 t := p.top() 314 if n := t.LastChild; n != nil && n.Type == TextNode { 315 n.Data += text 316 return 317 } 318 p.addChild(&Node{ 319 Type: TextNode, 320 Data: text, 321 }) 322} 323 324// addElement adds a child element based on the current token. 325func (p *parser) addElement() { 326 p.addChild(&Node{ 327 Type: ElementNode, 328 DataAtom: p.tok.DataAtom, 329 Data: p.tok.Data, 330 Attr: p.tok.Attr, 331 }) 332} 333 334// Section 12.2.4.3. 335func (p *parser) addFormattingElement() { 336 tagAtom, attr := p.tok.DataAtom, p.tok.Attr 337 p.addElement() 338 339 // Implement the Noah's Ark clause, but with three per family instead of two. 340 identicalElements := 0 341findIdenticalElements: 342 for i := len(p.afe) - 1; i >= 0; i-- { 343 n := p.afe[i] 344 if n.Type == scopeMarkerNode { 345 break 346 } 347 if n.Type != ElementNode { 348 continue 349 } 350 if n.Namespace != "" { 351 continue 352 } 353 if n.DataAtom != tagAtom { 354 continue 355 } 356 if len(n.Attr) != len(attr) { 357 continue 358 } 359 compareAttributes: 360 for _, t0 := range n.Attr { 361 for _, t1 := range attr { 362 if t0.Key == t1.Key && t0.Namespace == t1.Namespace && t0.Val == t1.Val { 363 // Found a match for this attribute, continue with the next attribute. 364 continue compareAttributes 365 } 366 } 367 // If we get here, there is no attribute that matches a. 368 // Therefore the element is not identical to the new one. 369 continue findIdenticalElements 370 } 371 372 identicalElements++ 373 if identicalElements >= 3 { 374 p.afe.remove(n) 375 } 376 } 377 378 p.afe = append(p.afe, p.top()) 379} 380 381// Section 12.2.4.3. 
382func (p *parser) clearActiveFormattingElements() { 383 for { 384 if n := p.afe.pop(); len(p.afe) == 0 || n.Type == scopeMarkerNode { 385 return 386 } 387 } 388} 389 390// Section 12.2.4.3. 391func (p *parser) reconstructActiveFormattingElements() { 392 n := p.afe.top() 393 if n == nil { 394 return 395 } 396 if n.Type == scopeMarkerNode || p.oe.index(n) != -1 { 397 return 398 } 399 i := len(p.afe) - 1 400 for n.Type != scopeMarkerNode && p.oe.index(n) == -1 { 401 if i == 0 { 402 i = -1 403 break 404 } 405 i-- 406 n = p.afe[i] 407 } 408 for { 409 i++ 410 clone := p.afe[i].clone() 411 p.addChild(clone) 412 p.afe[i] = clone 413 if i == len(p.afe)-1 { 414 break 415 } 416 } 417} 418 419// Section 12.2.5. 420func (p *parser) acknowledgeSelfClosingTag() { 421 p.hasSelfClosingToken = false 422} 423 424// An insertion mode (section 12.2.4.1) is the state transition function from 425// a particular state in the HTML5 parser's state machine. It updates the 426// parser's fields depending on parser.tok (where ErrorToken means EOF). 427// It returns whether the token was consumed. 428type insertionMode func(*parser) bool 429 430// setOriginalIM sets the insertion mode to return to after completing a text or 431// inTableText insertion mode. 432// Section 12.2.4.1, "using the rules for". 433func (p *parser) setOriginalIM() { 434 if p.originalIM != nil { 435 panic("html: bad parser state: originalIM was set twice") 436 } 437 p.originalIM = p.im 438} 439 440// Section 12.2.4.1, "reset the insertion mode". 
func (p *parser) resetInsertionMode() {
	// Walk the stack of open elements from the top down; the first element
	// whose tag determines an insertion mode wins.
	for i := len(p.oe) - 1; i >= 0; i-- {
		n := p.oe[i]
		last := i == 0
		// At the bottom of the stack the fragment context element, when one
		// is set, stands in for the node.
		if last && p.context != nil {
			n = p.context
		}

		switch n.DataAtom {
		case a.Select:
			if !last {
				// Step down the stack from the select element, looking for
				// an enclosing template or table.
				for ancestor, first := n, p.oe[0]; ancestor != first; {
					ancestor = p.oe[p.oe.index(ancestor)-1]
					switch ancestor.DataAtom {
					case a.Template:
						p.im = inSelectIM
						return
					case a.Table:
						p.im = inSelectInTableIM
						return
					}
				}
			}
			p.im = inSelectIM
		case a.Td, a.Th:
			// TODO: remove this divergence from the HTML5 spec.
			//
			// See https://bugs.chromium.org/p/chromium/issues/detail?id=829668
			p.im = inCellIM
		case a.Tr:
			p.im = inRowIM
		case a.Tbody, a.Thead, a.Tfoot:
			p.im = inTableBodyIM
		case a.Caption:
			p.im = inCaptionIM
		case a.Colgroup:
			p.im = inColumnGroupIM
		case a.Table:
			p.im = inTableIM
		case a.Template:
			// TODO: remove this divergence from the HTML5 spec.
			if n.Namespace != "" {
				continue
			}
			p.im = p.templateStack.top()
		case a.Head:
			// TODO: remove this divergence from the HTML5 spec.
			//
			// See https://bugs.chromium.org/p/chromium/issues/detail?id=829668
			p.im = inHeadIM
		case a.Body:
			p.im = inBodyIM
		case a.Frameset:
			p.im = inFramesetIM
		case a.Html:
			if p.head == nil {
				p.im = beforeHeadIM
			} else {
				p.im = afterHeadIM
			}
		default:
			if last {
				p.im = inBodyIM
				return
			}
			continue
		}
		return
	}
}

// whitespace is the cutset passed to strings.TrimLeft by the insertion modes
// below when they strip leading whitespace from text tokens.
const whitespace = " \t\r\n\f"

// initialIM drops leading whitespace, appends comments and the doctype to
// the document, and for any other token switches to beforeHTMLIM in quirks
// mode without consuming the token.
//
// Section 12.2.6.4.1.
func initialIM(p *parser) bool {
	switch p.tok.Type {
	case TextToken:
		p.tok.Data = strings.TrimLeft(p.tok.Data, whitespace)
		if len(p.tok.Data) == 0 {
			// It was all whitespace, so ignore it.
			return true
		}
	case CommentToken:
		p.doc.AppendChild(&Node{
			Type: CommentNode,
			Data: p.tok.Data,
		})
		return true
	case DoctypeToken:
		n, quirks := parseDoctype(p.tok.Data)
		p.doc.AppendChild(n)
		p.quirks = quirks
		p.im = beforeHTMLIM
		return true
	}
	// Anything else: no doctype was seen, so fall back to quirks mode and
	// leave the token for beforeHTMLIM.
	p.quirks = true
	p.im = beforeHTMLIM
	return false
}

// beforeHTMLIM ignores doctypes, leading whitespace and most end tags,
// appends comments to the document, and inserts an html element either for
// an explicit <html> start tag or, for any other token, through an implied
// html start tag (in which case the token is not consumed).
//
// Section 12.2.6.4.2.
func beforeHTMLIM(p *parser) bool {
	switch p.tok.Type {
	case DoctypeToken:
		// Ignore the token.
		return true
	case TextToken:
		p.tok.Data = strings.TrimLeft(p.tok.Data, whitespace)
		if len(p.tok.Data) == 0 {
			// It was all whitespace, so ignore it.
			return true
		}
	case StartTagToken:
		if p.tok.DataAtom == a.Html {
			p.addElement()
			p.im = beforeHeadIM
			return true
		}
	case EndTagToken:
		switch p.tok.DataAtom {
		case a.Head, a.Body, a.Html, a.Br:
			p.parseImpliedToken(StartTagToken, a.Html, a.Html.String())
			return false
		default:
			// Ignore the token.
			return true
		}
	case CommentToken:
		p.doc.AppendChild(&Node{
			Type: CommentNode,
			Data: p.tok.Data,
		})
		return true
	}
	p.parseImpliedToken(StartTagToken, a.Html, a.Html.String())
	return false
}

// beforeHeadIM ignores doctypes, leading whitespace and most end tags,
// inserts comments, and opens the head element either for an explicit <head>
// start tag or, for any other token, through an implied head start tag (in
// which case the token is not consumed). An <html> start tag is delegated to
// inBodyIM.
//
// Section 12.2.6.4.3.
func beforeHeadIM(p *parser) bool {
	switch p.tok.Type {
	case TextToken:
		p.tok.Data = strings.TrimLeft(p.tok.Data, whitespace)
		if len(p.tok.Data) == 0 {
			// It was all whitespace, so ignore it.
			return true
		}
	case StartTagToken:
		switch p.tok.DataAtom {
		case a.Head:
			p.addElement()
			p.head = p.top()
			p.im = inHeadIM
			return true
		case a.Html:
			return inBodyIM(p)
		}
	case EndTagToken:
		switch p.tok.DataAtom {
		case a.Head, a.Body, a.Html, a.Br:
			p.parseImpliedToken(StartTagToken, a.Head, a.Head.String())
			return false
		default:
			// Ignore the token.
			return true
		}
	case CommentToken:
		p.addChild(&Node{
			Type: CommentNode,
			Data: p.tok.Data,
		})
		return true
	case DoctypeToken:
		// Ignore the token.
		return true
	}

	p.parseImpliedToken(StartTagToken, a.Head, a.Head.String())
	return false
}

// inHeadIM handles tokens while the head element is open. Tokens it does not
// handle itself cause an implied </head> end tag and are reported as not
// consumed.
//
// Section 12.2.6.4.4.
func inHeadIM(p *parser) bool {
	switch p.tok.Type {
	case TextToken:
		s := strings.TrimLeft(p.tok.Data, whitespace)
		if len(s) < len(p.tok.Data) {
			// Add the initial whitespace to the current node.
			p.addText(p.tok.Data[:len(p.tok.Data)-len(s)])
			if s == "" {
				return true
			}
			p.tok.Data = s
		}
	case StartTagToken:
		switch p.tok.DataAtom {
		case a.Html:
			return inBodyIM(p)
		case a.Base, a.Basefont, a.Bgsound, a.Link, a.Meta:
			// These elements are inserted and immediately popped again.
			p.addElement()
			p.oe.pop()
			p.acknowledgeSelfClosingTag()
			return true
		case a.Noscript:
			if p.scripting {
				p.parseGenericRawTextElement()
				return true
			}
			p.addElement()
			p.im = inHeadNoscriptIM
			// Don't let the tokenizer go into raw text mode when scripting is disabled.
			p.tokenizer.NextIsNotRawText()
			return true
		case a.Script, a.Title:
			p.addElement()
			p.setOriginalIM()
			p.im = textIM
			return true
		case a.Noframes, a.Style:
			p.parseGenericRawTextElement()
			return true
		case a.Head:
			// Ignore the token.
			return true
		case a.Template:
			p.addElement()
			p.afe = append(p.afe, &scopeMarker)
			p.framesetOK = false
			p.im = inTemplateIM
			p.templateStack = append(p.templateStack, inTemplateIM)
			return true
		}
	case EndTagToken:
		switch p.tok.DataAtom {
		case a.Head:
			p.oe.pop()
			p.im = afterHeadIM
			return true
		case a.Body, a.Html, a.Br:
			p.parseImpliedToken(EndTagToken, a.Head, a.Head.String())
			return false
		case a.Template:
			if !p.oe.contains(a.Template) {
				return true
			}
			// TODO: remove this divergence from the HTML5 spec.
			//
			// See https://bugs.chromium.org/p/chromium/issues/detail?id=829668
			p.generateImpliedEndTags()
			// Pop everything up to and including the highest HTML-namespace
			// template element.
			for i := len(p.oe) - 1; i >= 0; i-- {
				if n := p.oe[i]; n.Namespace == "" && n.DataAtom == a.Template {
					p.oe = p.oe[:i]
					break
				}
			}
			p.clearActiveFormattingElements()
			p.templateStack.pop()
			p.resetInsertionMode()
			return true
		default:
			// Ignore the token.
			return true
		}
	case CommentToken:
		p.addChild(&Node{
			Type: CommentNode,
			Data: p.tok.Data,
		})
		return true
	case DoctypeToken:
		// Ignore the token.
		return true
	}

	p.parseImpliedToken(EndTagToken, a.Head, a.Head.String())
	return false
}

// inHeadNoscriptIM handles tokens inside a noscript element in the head.
// Tokens that are neither ignored nor delegated pop the current node, switch
// back to inHeadIM and are only reported as consumed when the token is a
// noscript tag. It panics if the node left on top of the stack after the pop
// is not a head element.
//
// Section 12.2.6.4.5.
func inHeadNoscriptIM(p *parser) bool {
	switch p.tok.Type {
	case DoctypeToken:
		// Ignore the token.
		return true
	case StartTagToken:
		switch p.tok.DataAtom {
		case a.Html:
			return inBodyIM(p)
		case a.Basefont, a.Bgsound, a.Link, a.Meta, a.Noframes, a.Style:
			return inHeadIM(p)
		case a.Head, a.Noscript:
			// Ignore the token.
			return true
		}
	case EndTagToken:
		switch p.tok.DataAtom {
		case a.Noscript, a.Br:
			// Fall out of the switch to the pop below.
		default:
			// Ignore the token.
			return true
		}
	case TextToken:
		s := strings.TrimLeft(p.tok.Data, whitespace)
		if len(s) == 0 {
			// It was all whitespace.
			return inHeadIM(p)
		}
	case CommentToken:
		return inHeadIM(p)
	}
	p.oe.pop()
	if p.top().DataAtom != a.Head {
		panic("html: the new current node will be a head element.")
	}
	p.im = inHeadIM
	if p.tok.DataAtom == a.Noscript {
		return true
	}
	return false
}

// afterHeadIM handles tokens between the end of the head element and the
// start of the body or frameset. Tokens it does not handle itself cause an
// implied <body> start tag and are reported as not consumed.
//
// Section 12.2.6.4.6.
func afterHeadIM(p *parser) bool {
	switch p.tok.Type {
	case TextToken:
		s := strings.TrimLeft(p.tok.Data, whitespace)
		if len(s) < len(p.tok.Data) {
			// Add the initial whitespace to the current node.
			p.addText(p.tok.Data[:len(p.tok.Data)-len(s)])
			if s == "" {
				return true
			}
			p.tok.Data = s
		}
	case StartTagToken:
		switch p.tok.DataAtom {
		case a.Html:
			return inBodyIM(p)
		case a.Body:
			p.addElement()
			p.framesetOK = false
			p.im = inBodyIM
			return true
		case a.Frameset:
			p.addElement()
			p.im = inFramesetIM
			return true
		case a.Base, a.Basefont, a.Bgsound, a.Link, a.Meta, a.Noframes, a.Script, a.Style, a.Template, a.Title:
			// Push the head element back onto the stack while inHeadIM
			// processes the token; the deferred call removes it again when
			// this function returns.
			p.oe = append(p.oe, p.head)
			defer p.oe.remove(p.head)
			return inHeadIM(p)
		case a.Head:
			// Ignore the token.
			return true
		}
	case EndTagToken:
		switch p.tok.DataAtom {
		case a.Body, a.Html, a.Br:
			// Drop down to creating an implied <body> tag.
		case a.Template:
			return inHeadIM(p)
		default:
			// Ignore the token.
			return true
		}
	case CommentToken:
		p.addChild(&Node{
			Type: CommentNode,
			Data: p.tok.Data,
		})
		return true
	case DoctypeToken:
		// Ignore the token.
		return true
	}

	p.parseImpliedToken(StartTagToken, a.Body, a.Body.String())
	p.framesetOK = true
	return false
}

// copyAttributes copies attributes of src not found on dst to dst.
// Attributes are compared by Key only; an attribute of src whose Key already
// occurs on dst (or earlier in src) is skipped.
func copyAttributes(dst *Node, src Token) {
	if len(src.Attr) == 0 {
		return
	}
	// attr records every key present on dst, including those added below.
	attr := map[string]string{}
	for _, t := range dst.Attr {
		attr[t.Key] = t.Val
	}
	for _, t := range src.Attr {
		if _, ok := attr[t.Key]; !ok {
			dst.Attr = append(dst.Attr, t)
			attr[t.Key] = t.Val
		}
	}
}

// inBodyIM handles tokens while parsing body content. It returns whether the
// token was consumed; several branches rewrite p.tok or change the insertion
// mode and return false so that the token is not treated as consumed.
//
// Section 12.2.6.4.7.
func inBodyIM(p *parser) bool {
	switch p.tok.Type {
	case TextToken:
		d := p.tok.Data
		switch n := p.oe.top(); n.DataAtom {
		case a.Pre, a.Listing:
			if n.FirstChild == nil {
				// Ignore a newline at the start of a <pre> block.
				if d != "" && d[0] == '\r' {
					d = d[1:]
				}
				if d != "" && d[0] == '\n' {
					d = d[1:]
				}
			}
		}
		// NUL bytes are dropped from body text.
		d = strings.Replace(d, "\x00", "", -1)
		if d == "" {
			return true
		}
		p.reconstructActiveFormattingElements()
		p.addText(d)
		if p.framesetOK && strings.TrimLeft(d, whitespace) != "" {
			// There were non-whitespace characters inserted.
			p.framesetOK = false
		}
	case StartTagToken:
		switch p.tok.DataAtom {
		case a.Html:
			if p.oe.contains(a.Template) {
				return true
			}
			// Merge the attributes into the element at the bottom of the stack.
			copyAttributes(p.oe[0], p.tok)
		case a.Base, a.Basefont, a.Bgsound, a.Link, a.Meta, a.Noframes, a.Script, a.Style, a.Template, a.Title:
			return inHeadIM(p)
		case a.Body:
			if p.oe.contains(a.Template) {
				return true
			}
			if len(p.oe) >= 2 {
				body := p.oe[1]
				if body.Type == ElementNode && body.DataAtom == a.Body {
					p.framesetOK = false
					copyAttributes(body, p.tok)
				}
			}
		case a.Frameset:
			if !p.framesetOK || len(p.oe) < 2 || p.oe[1].DataAtom != a.Body {
				// Ignore the token.
				return true
			}
			// Detach the body element and replace it with the frameset.
			body := p.oe[1]
			if body.Parent != nil {
				body.Parent.RemoveChild(body)
			}
			p.oe = p.oe[:1]
			p.addElement()
			p.im = inFramesetIM
			return true
		case a.Address, a.Article, a.Aside, a.Blockquote, a.Center, a.Details, a.Dialog, a.Dir, a.Div, a.Dl, a.Fieldset, a.Figcaption, a.Figure, a.Footer, a.Header, a.Hgroup, a.Main, a.Menu, a.Nav, a.Ol, a.P, a.Section, a.Summary, a.Ul:
			p.popUntil(buttonScope, a.P)
			p.addElement()
		case a.H1, a.H2, a.H3, a.H4, a.H5, a.H6:
			p.popUntil(buttonScope, a.P)
			// A heading directly inside another heading closes the outer one.
			switch n := p.top(); n.DataAtom {
			case a.H1, a.H2, a.H3, a.H4, a.H5, a.H6:
				p.oe.pop()
			}
			p.addElement()
		case a.Pre, a.Listing:
			p.popUntil(buttonScope, a.P)
			p.addElement()
			// The newline, if any, will be dealt with by the TextToken case.
			p.framesetOK = false
		case a.Form:
			if p.form != nil && !p.oe.contains(a.Template) {
				// Ignore the token
				return true
			}
			p.popUntil(buttonScope, a.P)
			p.addElement()
			if !p.oe.contains(a.Template) {
				p.form = p.top()
			}
		case a.Li:
			p.framesetOK = false
			// Scan down the stack: an open li is popped (with everything
			// above it); address, div, p and non-special elements are
			// skipped; any other special element ends the scan.
			for i := len(p.oe) - 1; i >= 0; i-- {
				node := p.oe[i]
				switch node.DataAtom {
				case a.Li:
					p.oe = p.oe[:i]
				case a.Address, a.Div, a.P:
					continue
				default:
					if !isSpecialElement(node) {
						continue
					}
				}
				break
			}
			p.popUntil(buttonScope, a.P)
			p.addElement()
		case a.Dd, a.Dt:
			p.framesetOK = false
			// Same scan as for li, but closing an open dd or dt.
			for i := len(p.oe) - 1; i >= 0; i-- {
				node := p.oe[i]
				switch node.DataAtom {
				case a.Dd, a.Dt:
					p.oe = p.oe[:i]
				case a.Address, a.Div, a.P:
					continue
				default:
					if !isSpecialElement(node) {
						continue
					}
				}
				break
			}
			p.popUntil(buttonScope, a.P)
			p.addElement()
		case a.Plaintext:
			p.popUntil(buttonScope, a.P)
			p.addElement()
		case a.Button:
			p.popUntil(defaultScope, a.Button)
			p.reconstructActiveFormattingElements()
			p.addElement()
			p.framesetOK = false
		case a.A:
			// An <a> that is still active (after the last scope marker) is
			// closed before the new one is opened.
			for i := len(p.afe) - 1; i >= 0 && p.afe[i].Type != scopeMarkerNode; i-- {
				if n := p.afe[i]; n.Type == ElementNode && n.DataAtom == a.A {
					p.inBodyEndTagFormatting(a.A, "a")
					p.oe.remove(n)
					p.afe.remove(n)
					break
				}
			}
			p.reconstructActiveFormattingElements()
			p.addFormattingElement()
		case a.B, a.Big, a.Code, a.Em, a.Font, a.I, a.S, a.Small, a.Strike, a.Strong, a.Tt, a.U:
			p.reconstructActiveFormattingElements()
			p.addFormattingElement()
		case a.Nobr:
			p.reconstructActiveFormattingElements()
			if p.elementInScope(defaultScope, a.Nobr) {
				p.inBodyEndTagFormatting(a.Nobr, "nobr")
				p.reconstructActiveFormattingElements()
			}
			p.addFormattingElement()
		case a.Applet, a.Marquee, a.Object:
			p.reconstructActiveFormattingElements()
			p.addElement()
			p.afe = append(p.afe, &scopeMarker)
			p.framesetOK = false
		case a.Table:
			if !p.quirks {
				p.popUntil(buttonScope, a.P)
			}
			p.addElement()
			p.framesetOK = false
			p.im = inTableIM
			return true
		case a.Area, a.Br, a.Embed, a.Img, a.Input, a.Keygen, a.Wbr:
			p.reconstructActiveFormattingElements()
			p.addElement()
			p.oe.pop()
			p.acknowledgeSelfClosingTag()
			if p.tok.DataAtom == a.Input {
				for _, t := range p.tok.Attr {
					if t.Key == "type" {
						if strings.ToLower(t.Val) == "hidden" {
							// Skip setting framesetOK = false
							return true
						}
					}
				}
			}
			p.framesetOK = false
		case a.Param, a.Source, a.Track:
			p.addElement()
			p.oe.pop()
			p.acknowledgeSelfClosingTag()
		case a.Hr:
			p.popUntil(buttonScope, a.P)
			p.addElement()
			p.oe.pop()
			p.acknowledgeSelfClosingTag()
			p.framesetOK = false
		case a.Image:
			// Rewrite the token as an <img> start tag and report it as not
			// consumed.
			p.tok.DataAtom = a.Img
			p.tok.Data = a.Img.String()
			return false
		case a.Textarea:
			p.addElement()
			p.setOriginalIM()
			p.framesetOK = false
			p.im = textIM
		case a.Xmp:
			p.popUntil(buttonScope, a.P)
			p.reconstructActiveFormattingElements()
			p.framesetOK = false
			p.parseGenericRawTextElement()
		case a.Iframe:
			p.framesetOK = false
			p.parseGenericRawTextElement()
		case a.Noembed:
			p.parseGenericRawTextElement()
		case a.Noscript:
			if p.scripting {
				p.parseGenericRawTextElement()
				return true
			}
			p.reconstructActiveFormattingElements()
			p.addElement()
			// Don't let the tokenizer go into raw text mode when scripting is disabled.
			p.tokenizer.NextIsNotRawText()
		case a.Select:
			p.reconstructActiveFormattingElements()
			p.addElement()
			p.framesetOK = false
			p.im = inSelectIM
			return true
		case a.Optgroup, a.Option:
			if p.top().DataAtom == a.Option {
				p.oe.pop()
			}
			p.reconstructActiveFormattingElements()
			p.addElement()
		case a.Rb, a.Rtc:
			if p.elementInScope(defaultScope, a.Ruby) {
				p.generateImpliedEndTags()
			}
			p.addElement()
		case a.Rp, a.Rt:
			if p.elementInScope(defaultScope, a.Ruby) {
				p.generateImpliedEndTags("rtc")
			}
			p.addElement()
		case a.Math, a.Svg:
			p.reconstructActiveFormattingElements()
			if p.tok.DataAtom == a.Math {
				adjustAttributeNames(p.tok.Attr, mathMLAttributeAdjustments)
			} else {
				adjustAttributeNames(p.tok.Attr, svgAttributeAdjustments)
			}
			adjustForeignAttributes(p.tok.Attr)
			p.addElement()
			// The new element's namespace is the tag name of the token.
			p.top().Namespace = p.tok.Data
			if p.hasSelfClosingToken {
				p.oe.pop()
				p.acknowledgeSelfClosingTag()
			}
			return true
		case a.Caption, a.Col, a.Colgroup, a.Frame, a.Head, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr:
			// Ignore the token.
		default:
			p.reconstructActiveFormattingElements()
			p.addElement()
		}
	case EndTagToken:
		switch p.tok.DataAtom {
		case a.Body:
			if p.elementInScope(defaultScope, a.Body) {
				p.im = afterBodyIM
			}
		case a.Html:
			if p.elementInScope(defaultScope, a.Body) {
				p.parseImpliedToken(EndTagToken, a.Body, a.Body.String())
				return false
			}
			return true
		case a.Address, a.Article, a.Aside, a.Blockquote, a.Button, a.Center, a.Details, a.Dialog, a.Dir, a.Div, a.Dl, a.Fieldset, a.Figcaption, a.Figure, a.Footer, a.Header, a.Hgroup, a.Listing, a.Main, a.Menu, a.Nav, a.Ol, a.Pre, a.Section, a.Summary, a.Ul:
			p.popUntil(defaultScope, p.tok.DataAtom)
		case a.Form:
			if p.oe.contains(a.Template) {
				i := p.indexOfElementInScope(defaultScope, a.Form)
				if i == -1 {
					// Ignore the token.
					return true
				}
				p.generateImpliedEndTags()
				if p.oe[i].DataAtom != a.Form {
					// Ignore the token.
					return true
				}
				p.popUntil(defaultScope, a.Form)
			} else {
				// Outside templates the form element pointer is cleared
				// whether or not the end tag ends up being honored.
				node := p.form
				p.form = nil
				i := p.indexOfElementInScope(defaultScope, a.Form)
				if node == nil || i == -1 || p.oe[i] != node {
					// Ignore the token.
					return true
				}
				p.generateImpliedEndTags()
				p.oe.remove(node)
			}
		case a.P:
			if !p.elementInScope(buttonScope, a.P) {
				// No p element to close: open one first via an implied <p>
				// start tag.
				p.parseImpliedToken(StartTagToken, a.P, a.P.String())
			}
			p.popUntil(buttonScope, a.P)
		case a.Li:
			p.popUntil(listItemScope, a.Li)
		case a.Dd, a.Dt:
			p.popUntil(defaultScope, p.tok.DataAtom)
		case a.H1, a.H2, a.H3, a.H4, a.H5, a.H6:
			// Any heading end tag closes the nearest open heading of any level.
			p.popUntil(defaultScope, a.H1, a.H2, a.H3, a.H4, a.H5, a.H6)
		case a.A, a.B, a.Big, a.Code, a.Em, a.Font, a.I, a.Nobr, a.S, a.Small, a.Strike, a.Strong, a.Tt, a.U:
			p.inBodyEndTagFormatting(p.tok.DataAtom, p.tok.Data)
		case a.Applet, a.Marquee, a.Object:
			if p.popUntil(defaultScope, p.tok.DataAtom) {
				p.clearActiveFormattingElements()
			}
		case a.Br:
			// Turn </br> into a <br> start tag and report it as not consumed.
			p.tok.Type = StartTagToken
			return false
		case a.Template:
			return inHeadIM(p)
		default:
			p.inBodyEndTagOther(p.tok.DataAtom, p.tok.Data)
		}
	case CommentToken:
		p.addChild(&Node{
			Type: CommentNode,
			Data: p.tok.Data,
		})
	case ErrorToken:
		// TODO: remove this divergence from the HTML5 spec.
		if len(p.templateStack) > 0 {
			p.im = inTemplateIM
			return false
		}
		// Every path from here returns true; the loop only stops early at
		// the first open element that is not in the list below.
		for _, e := range p.oe {
			switch e.DataAtom {
			case a.Dd, a.Dt, a.Li, a.Optgroup, a.Option, a.P, a.Rb, a.Rp, a.Rt, a.Rtc, a.Tbody, a.Td, a.Tfoot, a.Th,
				a.Thead, a.Tr, a.Body, a.Html:
			default:
				return true
			}
		}
	}

	return true
}

// inBodyEndTagFormatting handles an end tag, identified by tagAtom and
// tagName, for a formatting element while in inBodyIM.
func (p *parser) inBodyEndTagFormatting(tagAtom a.Atom, tagName string) {
	// This is the "adoption agency" algorithm, described at
	// https://html.spec.whatwg.org/multipage/syntax.html#adoptionAgency

	// TODO: this is a fairly literal line-by-line translation of that algorithm.
	// Once the code successfully parses the comprehensive test suite, we should
	// refactor this code to be more idiomatic.

	// Steps 1-2
	if current := p.oe.top(); current.Data == tagName && p.afe.index(current) == -1 {
		p.oe.pop()
		return
	}

	// Steps 3-5. The outer loop.
	for i := 0; i < 8; i++ {
		// Step 6. Find the formatting element.
		var formattingElement *Node
		for j := len(p.afe) - 1; j >= 0; j-- {
			if p.afe[j].Type == scopeMarkerNode {
				break
			}
			if p.afe[j].DataAtom == tagAtom {
				formattingElement = p.afe[j]
				break
			}
		}
		if formattingElement == nil {
			p.inBodyEndTagOther(tagAtom, tagName)
			return
		}

		// Step 7. Ignore the tag if formatting element is not in the stack of open elements.
		feIndex := p.oe.index(formattingElement)
		if feIndex == -1 {
			p.afe.remove(formattingElement)
			return
		}
		// Step 8. Ignore the tag if formatting element is not in the scope.
		if !p.elementInScope(defaultScope, tagAtom) {
			// Ignore the tag.
			return
		}

		// Step 9. This step is omitted because it's just a parse error but no need to return.

		// Steps 10-11. Find the furthest block.
		var furthestBlock *Node
		for _, e := range p.oe[feIndex:] {
			if isSpecialElement(e) {
				furthestBlock = e
				break
			}
		}
		if furthestBlock == nil {
			// No furthest block: pop up to and including the formatting
			// element and drop it from the active formatting elements.
			e := p.oe.pop()
			for e != formattingElement {
				e = p.oe.pop()
			}
			p.afe.remove(e)
			return
		}

		// Steps 12-13. Find the common ancestor and bookmark node.
		commonAncestor := p.oe[feIndex-1]
		bookmark := p.afe.index(formattingElement)

		// Step 14. The inner loop. Find the lastNode to reparent.
		lastNode := furthestBlock
		node := furthestBlock
		x := p.oe.index(node)
		// Step 14.1.
		j := 0
		for {
			// Step 14.2.
			j++
			// Step 14.3.
			x--
			node = p.oe[x]
			// Step 14.4. Go to the next step if node is formatting element.
			if node == formattingElement {
				break
			}
			// Step 14.5. Remove node from the list of active formatting elements if
			// inner loop counter is greater than three and node is in the list of
			// active formatting elements.
			if ni := p.afe.index(node); j > 3 && ni > -1 {
				p.afe.remove(node)
				// If any element of the list of active formatting elements is removed,
				// we need to take care whether bookmark should be decremented or not.
				// This is because the value of bookmark may exceed the size of the
				// list by removing elements from the list.
				if ni <= bookmark {
					bookmark--
				}
				continue
			}
			// Step 14.6. Continue the next inner loop if node is not in the list of
			// active formatting elements.
			if p.afe.index(node) == -1 {
				p.oe.remove(node)
				continue
			}
			// Step 14.7.
			clone := node.clone()
			p.afe[p.afe.index(node)] = clone
			p.oe[p.oe.index(node)] = clone
			node = clone
			// Step 14.8.
			if lastNode == furthestBlock {
				bookmark = p.afe.index(node) + 1
			}
			// Step 14.9.
			if lastNode.Parent != nil {
				lastNode.Parent.RemoveChild(lastNode)
			}
			node.AppendChild(lastNode)
			// Step 14.10.
			lastNode = node
		}

		// Step 15. Reparent lastNode to the common ancestor,
		// or for misnested table nodes, to the foster parent.
		if lastNode.Parent != nil {
			lastNode.Parent.RemoveChild(lastNode)
		}
		switch commonAncestor.DataAtom {
		case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr:
			p.fosterParent(lastNode)
		default:
			commonAncestor.AppendChild(lastNode)
		}

		// Steps 16-18. Reparent nodes from the furthest block's children
		// to a clone of the formatting element.
		clone := formattingElement.clone()
		reparentChildren(clone, furthestBlock)
		furthestBlock.AppendChild(clone)

		// Step 19. Fix up the list of active formatting elements.
		if oldLoc := p.afe.index(formattingElement); oldLoc != -1 && oldLoc < bookmark {
			// Move the bookmark with the rest of the list.
			bookmark--
		}
		p.afe.remove(formattingElement)
		p.afe.insert(bookmark, clone)

		// Step 20. Fix up the stack of open elements.
		p.oe.remove(formattingElement)
		p.oe.insert(p.oe.index(furthestBlock)+1, clone)
	}
}

// inBodyEndTagOther performs the "any other end tag" algorithm for inBodyIM.
// "Any other end tag" handling from 12.2.6.5 The rules for parsing tokens in foreign content
// https://html.spec.whatwg.org/multipage/syntax.html#parsing-main-inforeign
//
// It scans the stack of open elements from the top: the first element with a
// matching tag is popped together with everything above it, and the scan
// gives up without changing the stack if a special element is reached first.
func (p *parser) inBodyEndTagOther(tagAtom a.Atom, tagName string) {
	for i := len(p.oe) - 1; i >= 0; i-- {
		// Two element nodes have the same tag if they have the same Data (a
		// string-typed field). As an optimization, for common HTML tags, each
		// Data string is assigned a unique, non-zero DataAtom (a uint32-typed
		// field), since integer comparison is faster than string comparison.
		// Uncommon (custom) tags get a zero DataAtom.
		//
		// The if condition here is equivalent to (p.oe[i].Data == tagName).
		if (p.oe[i].DataAtom == tagAtom) &&
			((tagAtom != 0) || (p.oe[i].Data == tagName)) {
			p.oe = p.oe[:i]
			break
		}
		if isSpecialElement(p.oe[i]) {
			break
		}
	}
}

// Section 12.2.6.4.8.
func textIM(p *parser) bool {
	switch p.tok.Type {
	case ErrorToken:
		p.oe.pop()
	case TextToken:
		d := p.tok.Data
		if n := p.oe.top(); n.DataAtom == a.Textarea && n.FirstChild == nil {
			// Ignore a newline at the start of a <textarea> block.
1371 if d != "" && d[0] == '\r' { 1372 d = d[1:] 1373 } 1374 if d != "" && d[0] == '\n' { 1375 d = d[1:] 1376 } 1377 } 1378 if d == "" { 1379 return true 1380 } 1381 p.addText(d) 1382 return true 1383 case EndTagToken: 1384 p.oe.pop() 1385 } 1386 p.im = p.originalIM 1387 p.originalIM = nil 1388 return p.tok.Type == EndTagToken 1389} 1390 1391// Section 12.2.6.4.9. 1392func inTableIM(p *parser) bool { 1393 switch p.tok.Type { 1394 case TextToken: 1395 p.tok.Data = strings.Replace(p.tok.Data, "\x00", "", -1) 1396 switch p.oe.top().DataAtom { 1397 case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr: 1398 if strings.Trim(p.tok.Data, whitespace) == "" { 1399 p.addText(p.tok.Data) 1400 return true 1401 } 1402 } 1403 case StartTagToken: 1404 switch p.tok.DataAtom { 1405 case a.Caption: 1406 p.clearStackToContext(tableScope) 1407 p.afe = append(p.afe, &scopeMarker) 1408 p.addElement() 1409 p.im = inCaptionIM 1410 return true 1411 case a.Colgroup: 1412 p.clearStackToContext(tableScope) 1413 p.addElement() 1414 p.im = inColumnGroupIM 1415 return true 1416 case a.Col: 1417 p.parseImpliedToken(StartTagToken, a.Colgroup, a.Colgroup.String()) 1418 return false 1419 case a.Tbody, a.Tfoot, a.Thead: 1420 p.clearStackToContext(tableScope) 1421 p.addElement() 1422 p.im = inTableBodyIM 1423 return true 1424 case a.Td, a.Th, a.Tr: 1425 p.parseImpliedToken(StartTagToken, a.Tbody, a.Tbody.String()) 1426 return false 1427 case a.Table: 1428 if p.popUntil(tableScope, a.Table) { 1429 p.resetInsertionMode() 1430 return false 1431 } 1432 // Ignore the token. 1433 return true 1434 case a.Style, a.Script, a.Template: 1435 return inHeadIM(p) 1436 case a.Input: 1437 for _, t := range p.tok.Attr { 1438 if t.Key == "type" && strings.ToLower(t.Val) == "hidden" { 1439 p.addElement() 1440 p.oe.pop() 1441 return true 1442 } 1443 } 1444 // Otherwise drop down to the default action. 1445 case a.Form: 1446 if p.oe.contains(a.Template) || p.form != nil { 1447 // Ignore the token. 
1448 return true 1449 } 1450 p.addElement() 1451 p.form = p.oe.pop() 1452 case a.Select: 1453 p.reconstructActiveFormattingElements() 1454 switch p.top().DataAtom { 1455 case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr: 1456 p.fosterParenting = true 1457 } 1458 p.addElement() 1459 p.fosterParenting = false 1460 p.framesetOK = false 1461 p.im = inSelectInTableIM 1462 return true 1463 } 1464 case EndTagToken: 1465 switch p.tok.DataAtom { 1466 case a.Table: 1467 if p.popUntil(tableScope, a.Table) { 1468 p.resetInsertionMode() 1469 return true 1470 } 1471 // Ignore the token. 1472 return true 1473 case a.Body, a.Caption, a.Col, a.Colgroup, a.Html, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr: 1474 // Ignore the token. 1475 return true 1476 case a.Template: 1477 return inHeadIM(p) 1478 } 1479 case CommentToken: 1480 p.addChild(&Node{ 1481 Type: CommentNode, 1482 Data: p.tok.Data, 1483 }) 1484 return true 1485 case DoctypeToken: 1486 // Ignore the token. 1487 return true 1488 case ErrorToken: 1489 return inBodyIM(p) 1490 } 1491 1492 p.fosterParenting = true 1493 defer func() { p.fosterParenting = false }() 1494 1495 return inBodyIM(p) 1496} 1497 1498// Section 12.2.6.4.11. 1499func inCaptionIM(p *parser) bool { 1500 switch p.tok.Type { 1501 case StartTagToken: 1502 switch p.tok.DataAtom { 1503 case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Td, a.Tfoot, a.Thead, a.Tr: 1504 if !p.popUntil(tableScope, a.Caption) { 1505 // Ignore the token. 
1506 return true 1507 } 1508 p.clearActiveFormattingElements() 1509 p.im = inTableIM 1510 return false 1511 case a.Select: 1512 p.reconstructActiveFormattingElements() 1513 p.addElement() 1514 p.framesetOK = false 1515 p.im = inSelectInTableIM 1516 return true 1517 } 1518 case EndTagToken: 1519 switch p.tok.DataAtom { 1520 case a.Caption: 1521 if p.popUntil(tableScope, a.Caption) { 1522 p.clearActiveFormattingElements() 1523 p.im = inTableIM 1524 } 1525 return true 1526 case a.Table: 1527 if !p.popUntil(tableScope, a.Caption) { 1528 // Ignore the token. 1529 return true 1530 } 1531 p.clearActiveFormattingElements() 1532 p.im = inTableIM 1533 return false 1534 case a.Body, a.Col, a.Colgroup, a.Html, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr: 1535 // Ignore the token. 1536 return true 1537 } 1538 } 1539 return inBodyIM(p) 1540} 1541 1542// Section 12.2.6.4.12. 1543func inColumnGroupIM(p *parser) bool { 1544 switch p.tok.Type { 1545 case TextToken: 1546 s := strings.TrimLeft(p.tok.Data, whitespace) 1547 if len(s) < len(p.tok.Data) { 1548 // Add the initial whitespace to the current node. 1549 p.addText(p.tok.Data[:len(p.tok.Data)-len(s)]) 1550 if s == "" { 1551 return true 1552 } 1553 p.tok.Data = s 1554 } 1555 case CommentToken: 1556 p.addChild(&Node{ 1557 Type: CommentNode, 1558 Data: p.tok.Data, 1559 }) 1560 return true 1561 case DoctypeToken: 1562 // Ignore the token. 1563 return true 1564 case StartTagToken: 1565 switch p.tok.DataAtom { 1566 case a.Html: 1567 return inBodyIM(p) 1568 case a.Col: 1569 p.addElement() 1570 p.oe.pop() 1571 p.acknowledgeSelfClosingTag() 1572 return true 1573 case a.Template: 1574 return inHeadIM(p) 1575 } 1576 case EndTagToken: 1577 switch p.tok.DataAtom { 1578 case a.Colgroup: 1579 if p.oe.top().DataAtom == a.Colgroup { 1580 p.oe.pop() 1581 p.im = inTableIM 1582 } 1583 return true 1584 case a.Col: 1585 // Ignore the token. 
1586 return true 1587 case a.Template: 1588 return inHeadIM(p) 1589 } 1590 case ErrorToken: 1591 return inBodyIM(p) 1592 } 1593 if p.oe.top().DataAtom != a.Colgroup { 1594 return true 1595 } 1596 p.oe.pop() 1597 p.im = inTableIM 1598 return false 1599} 1600 1601// Section 12.2.6.4.13. 1602func inTableBodyIM(p *parser) bool { 1603 switch p.tok.Type { 1604 case StartTagToken: 1605 switch p.tok.DataAtom { 1606 case a.Tr: 1607 p.clearStackToContext(tableBodyScope) 1608 p.addElement() 1609 p.im = inRowIM 1610 return true 1611 case a.Td, a.Th: 1612 p.parseImpliedToken(StartTagToken, a.Tr, a.Tr.String()) 1613 return false 1614 case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Tfoot, a.Thead: 1615 if p.popUntil(tableScope, a.Tbody, a.Thead, a.Tfoot) { 1616 p.im = inTableIM 1617 return false 1618 } 1619 // Ignore the token. 1620 return true 1621 } 1622 case EndTagToken: 1623 switch p.tok.DataAtom { 1624 case a.Tbody, a.Tfoot, a.Thead: 1625 if p.elementInScope(tableScope, p.tok.DataAtom) { 1626 p.clearStackToContext(tableBodyScope) 1627 p.oe.pop() 1628 p.im = inTableIM 1629 } 1630 return true 1631 case a.Table: 1632 if p.popUntil(tableScope, a.Tbody, a.Thead, a.Tfoot) { 1633 p.im = inTableIM 1634 return false 1635 } 1636 // Ignore the token. 1637 return true 1638 case a.Body, a.Caption, a.Col, a.Colgroup, a.Html, a.Td, a.Th, a.Tr: 1639 // Ignore the token. 1640 return true 1641 } 1642 case CommentToken: 1643 p.addChild(&Node{ 1644 Type: CommentNode, 1645 Data: p.tok.Data, 1646 }) 1647 return true 1648 } 1649 1650 return inTableIM(p) 1651} 1652 1653// Section 12.2.6.4.14. 
1654func inRowIM(p *parser) bool { 1655 switch p.tok.Type { 1656 case StartTagToken: 1657 switch p.tok.DataAtom { 1658 case a.Td, a.Th: 1659 p.clearStackToContext(tableRowScope) 1660 p.addElement() 1661 p.afe = append(p.afe, &scopeMarker) 1662 p.im = inCellIM 1663 return true 1664 case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Tfoot, a.Thead, a.Tr: 1665 if p.popUntil(tableScope, a.Tr) { 1666 p.im = inTableBodyIM 1667 return false 1668 } 1669 // Ignore the token. 1670 return true 1671 } 1672 case EndTagToken: 1673 switch p.tok.DataAtom { 1674 case a.Tr: 1675 if p.popUntil(tableScope, a.Tr) { 1676 p.im = inTableBodyIM 1677 return true 1678 } 1679 // Ignore the token. 1680 return true 1681 case a.Table: 1682 if p.popUntil(tableScope, a.Tr) { 1683 p.im = inTableBodyIM 1684 return false 1685 } 1686 // Ignore the token. 1687 return true 1688 case a.Tbody, a.Tfoot, a.Thead: 1689 if p.elementInScope(tableScope, p.tok.DataAtom) { 1690 p.parseImpliedToken(EndTagToken, a.Tr, a.Tr.String()) 1691 return false 1692 } 1693 // Ignore the token. 1694 return true 1695 case a.Body, a.Caption, a.Col, a.Colgroup, a.Html, a.Td, a.Th: 1696 // Ignore the token. 1697 return true 1698 } 1699 } 1700 1701 return inTableIM(p) 1702} 1703 1704// Section 12.2.6.4.15. 1705func inCellIM(p *parser) bool { 1706 switch p.tok.Type { 1707 case StartTagToken: 1708 switch p.tok.DataAtom { 1709 case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr: 1710 if p.popUntil(tableScope, a.Td, a.Th) { 1711 // Close the cell and reprocess. 1712 p.clearActiveFormattingElements() 1713 p.im = inRowIM 1714 return false 1715 } 1716 // Ignore the token. 1717 return true 1718 case a.Select: 1719 p.reconstructActiveFormattingElements() 1720 p.addElement() 1721 p.framesetOK = false 1722 p.im = inSelectInTableIM 1723 return true 1724 } 1725 case EndTagToken: 1726 switch p.tok.DataAtom { 1727 case a.Td, a.Th: 1728 if !p.popUntil(tableScope, p.tok.DataAtom) { 1729 // Ignore the token. 
1730 return true 1731 } 1732 p.clearActiveFormattingElements() 1733 p.im = inRowIM 1734 return true 1735 case a.Body, a.Caption, a.Col, a.Colgroup, a.Html: 1736 // Ignore the token. 1737 return true 1738 case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr: 1739 if !p.elementInScope(tableScope, p.tok.DataAtom) { 1740 // Ignore the token. 1741 return true 1742 } 1743 // Close the cell and reprocess. 1744 if p.popUntil(tableScope, a.Td, a.Th) { 1745 p.clearActiveFormattingElements() 1746 } 1747 p.im = inRowIM 1748 return false 1749 } 1750 } 1751 return inBodyIM(p) 1752} 1753 1754// Section 12.2.6.4.16. 1755func inSelectIM(p *parser) bool { 1756 switch p.tok.Type { 1757 case TextToken: 1758 p.addText(strings.Replace(p.tok.Data, "\x00", "", -1)) 1759 case StartTagToken: 1760 switch p.tok.DataAtom { 1761 case a.Html: 1762 return inBodyIM(p) 1763 case a.Option: 1764 if p.top().DataAtom == a.Option { 1765 p.oe.pop() 1766 } 1767 p.addElement() 1768 case a.Optgroup: 1769 if p.top().DataAtom == a.Option { 1770 p.oe.pop() 1771 } 1772 if p.top().DataAtom == a.Optgroup { 1773 p.oe.pop() 1774 } 1775 p.addElement() 1776 case a.Select: 1777 if !p.popUntil(selectScope, a.Select) { 1778 // Ignore the token. 1779 return true 1780 } 1781 p.resetInsertionMode() 1782 case a.Input, a.Keygen, a.Textarea: 1783 if p.elementInScope(selectScope, a.Select) { 1784 p.parseImpliedToken(EndTagToken, a.Select, a.Select.String()) 1785 return false 1786 } 1787 // In order to properly ignore <textarea>, we need to change the tokenizer mode. 1788 p.tokenizer.NextIsNotRawText() 1789 // Ignore the token. 
1790 return true 1791 case a.Script, a.Template: 1792 return inHeadIM(p) 1793 } 1794 case EndTagToken: 1795 switch p.tok.DataAtom { 1796 case a.Option: 1797 if p.top().DataAtom == a.Option { 1798 p.oe.pop() 1799 } 1800 case a.Optgroup: 1801 i := len(p.oe) - 1 1802 if p.oe[i].DataAtom == a.Option { 1803 i-- 1804 } 1805 if p.oe[i].DataAtom == a.Optgroup { 1806 p.oe = p.oe[:i] 1807 } 1808 case a.Select: 1809 if !p.popUntil(selectScope, a.Select) { 1810 // Ignore the token. 1811 return true 1812 } 1813 p.resetInsertionMode() 1814 case a.Template: 1815 return inHeadIM(p) 1816 } 1817 case CommentToken: 1818 p.addChild(&Node{ 1819 Type: CommentNode, 1820 Data: p.tok.Data, 1821 }) 1822 case DoctypeToken: 1823 // Ignore the token. 1824 return true 1825 case ErrorToken: 1826 return inBodyIM(p) 1827 } 1828 1829 return true 1830} 1831 1832// Section 12.2.6.4.17. 1833func inSelectInTableIM(p *parser) bool { 1834 switch p.tok.Type { 1835 case StartTagToken, EndTagToken: 1836 switch p.tok.DataAtom { 1837 case a.Caption, a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr, a.Td, a.Th: 1838 if p.tok.Type == EndTagToken && !p.elementInScope(tableScope, p.tok.DataAtom) { 1839 // Ignore the token. 1840 return true 1841 } 1842 // This is like p.popUntil(selectScope, a.Select), but it also 1843 // matches <math select>, not just <select>. Matching the MathML 1844 // tag is arguably incorrect (conceptually), but it mimics what 1845 // Chromium does. 1846 for i := len(p.oe) - 1; i >= 0; i-- { 1847 if n := p.oe[i]; n.DataAtom == a.Select { 1848 p.oe = p.oe[:i] 1849 break 1850 } 1851 } 1852 p.resetInsertionMode() 1853 return false 1854 } 1855 } 1856 return inSelectIM(p) 1857} 1858 1859// Section 12.2.6.4.18. 
1860func inTemplateIM(p *parser) bool { 1861 switch p.tok.Type { 1862 case TextToken, CommentToken, DoctypeToken: 1863 return inBodyIM(p) 1864 case StartTagToken: 1865 switch p.tok.DataAtom { 1866 case a.Base, a.Basefont, a.Bgsound, a.Link, a.Meta, a.Noframes, a.Script, a.Style, a.Template, a.Title: 1867 return inHeadIM(p) 1868 case a.Caption, a.Colgroup, a.Tbody, a.Tfoot, a.Thead: 1869 p.templateStack.pop() 1870 p.templateStack = append(p.templateStack, inTableIM) 1871 p.im = inTableIM 1872 return false 1873 case a.Col: 1874 p.templateStack.pop() 1875 p.templateStack = append(p.templateStack, inColumnGroupIM) 1876 p.im = inColumnGroupIM 1877 return false 1878 case a.Tr: 1879 p.templateStack.pop() 1880 p.templateStack = append(p.templateStack, inTableBodyIM) 1881 p.im = inTableBodyIM 1882 return false 1883 case a.Td, a.Th: 1884 p.templateStack.pop() 1885 p.templateStack = append(p.templateStack, inRowIM) 1886 p.im = inRowIM 1887 return false 1888 default: 1889 p.templateStack.pop() 1890 p.templateStack = append(p.templateStack, inBodyIM) 1891 p.im = inBodyIM 1892 return false 1893 } 1894 case EndTagToken: 1895 switch p.tok.DataAtom { 1896 case a.Template: 1897 return inHeadIM(p) 1898 default: 1899 // Ignore the token. 1900 return true 1901 } 1902 case ErrorToken: 1903 if !p.oe.contains(a.Template) { 1904 // Ignore the token. 1905 return true 1906 } 1907 // TODO: remove this divergence from the HTML5 spec. 1908 // 1909 // See https://bugs.chromium.org/p/chromium/issues/detail?id=829668 1910 p.generateImpliedEndTags() 1911 for i := len(p.oe) - 1; i >= 0; i-- { 1912 if n := p.oe[i]; n.Namespace == "" && n.DataAtom == a.Template { 1913 p.oe = p.oe[:i] 1914 break 1915 } 1916 } 1917 p.clearActiveFormattingElements() 1918 p.templateStack.pop() 1919 p.resetInsertionMode() 1920 return false 1921 } 1922 return false 1923} 1924 1925// Section 12.2.6.4.19. 1926func afterBodyIM(p *parser) bool { 1927 switch p.tok.Type { 1928 case ErrorToken: 1929 // Stop parsing. 
1930 return true 1931 case TextToken: 1932 s := strings.TrimLeft(p.tok.Data, whitespace) 1933 if len(s) == 0 { 1934 // It was all whitespace. 1935 return inBodyIM(p) 1936 } 1937 case StartTagToken: 1938 if p.tok.DataAtom == a.Html { 1939 return inBodyIM(p) 1940 } 1941 case EndTagToken: 1942 if p.tok.DataAtom == a.Html { 1943 if !p.fragment { 1944 p.im = afterAfterBodyIM 1945 } 1946 return true 1947 } 1948 case CommentToken: 1949 // The comment is attached to the <html> element. 1950 if len(p.oe) < 1 || p.oe[0].DataAtom != a.Html { 1951 panic("html: bad parser state: <html> element not found, in the after-body insertion mode") 1952 } 1953 p.oe[0].AppendChild(&Node{ 1954 Type: CommentNode, 1955 Data: p.tok.Data, 1956 }) 1957 return true 1958 } 1959 p.im = inBodyIM 1960 return false 1961} 1962 1963// Section 12.2.6.4.20. 1964func inFramesetIM(p *parser) bool { 1965 switch p.tok.Type { 1966 case CommentToken: 1967 p.addChild(&Node{ 1968 Type: CommentNode, 1969 Data: p.tok.Data, 1970 }) 1971 case TextToken: 1972 // Ignore all text but whitespace. 1973 s := strings.Map(func(c rune) rune { 1974 switch c { 1975 case ' ', '\t', '\n', '\f', '\r': 1976 return c 1977 } 1978 return -1 1979 }, p.tok.Data) 1980 if s != "" { 1981 p.addText(s) 1982 } 1983 case StartTagToken: 1984 switch p.tok.DataAtom { 1985 case a.Html: 1986 return inBodyIM(p) 1987 case a.Frameset: 1988 p.addElement() 1989 case a.Frame: 1990 p.addElement() 1991 p.oe.pop() 1992 p.acknowledgeSelfClosingTag() 1993 case a.Noframes: 1994 return inHeadIM(p) 1995 } 1996 case EndTagToken: 1997 switch p.tok.DataAtom { 1998 case a.Frameset: 1999 if p.oe.top().DataAtom != a.Html { 2000 p.oe.pop() 2001 if p.oe.top().DataAtom != a.Frameset { 2002 p.im = afterFramesetIM 2003 return true 2004 } 2005 } 2006 } 2007 default: 2008 // Ignore the token. 2009 } 2010 return true 2011} 2012 2013// Section 12.2.6.4.21. 
2014func afterFramesetIM(p *parser) bool { 2015 switch p.tok.Type { 2016 case CommentToken: 2017 p.addChild(&Node{ 2018 Type: CommentNode, 2019 Data: p.tok.Data, 2020 }) 2021 case TextToken: 2022 // Ignore all text but whitespace. 2023 s := strings.Map(func(c rune) rune { 2024 switch c { 2025 case ' ', '\t', '\n', '\f', '\r': 2026 return c 2027 } 2028 return -1 2029 }, p.tok.Data) 2030 if s != "" { 2031 p.addText(s) 2032 } 2033 case StartTagToken: 2034 switch p.tok.DataAtom { 2035 case a.Html: 2036 return inBodyIM(p) 2037 case a.Noframes: 2038 return inHeadIM(p) 2039 } 2040 case EndTagToken: 2041 switch p.tok.DataAtom { 2042 case a.Html: 2043 p.im = afterAfterFramesetIM 2044 return true 2045 } 2046 default: 2047 // Ignore the token. 2048 } 2049 return true 2050} 2051 2052// Section 12.2.6.4.22. 2053func afterAfterBodyIM(p *parser) bool { 2054 switch p.tok.Type { 2055 case ErrorToken: 2056 // Stop parsing. 2057 return true 2058 case TextToken: 2059 s := strings.TrimLeft(p.tok.Data, whitespace) 2060 if len(s) == 0 { 2061 // It was all whitespace. 2062 return inBodyIM(p) 2063 } 2064 case StartTagToken: 2065 if p.tok.DataAtom == a.Html { 2066 return inBodyIM(p) 2067 } 2068 case CommentToken: 2069 p.doc.AppendChild(&Node{ 2070 Type: CommentNode, 2071 Data: p.tok.Data, 2072 }) 2073 return true 2074 case DoctypeToken: 2075 return inBodyIM(p) 2076 } 2077 p.im = inBodyIM 2078 return false 2079} 2080 2081// Section 12.2.6.4.23. 2082func afterAfterFramesetIM(p *parser) bool { 2083 switch p.tok.Type { 2084 case CommentToken: 2085 p.doc.AppendChild(&Node{ 2086 Type: CommentNode, 2087 Data: p.tok.Data, 2088 }) 2089 case TextToken: 2090 // Ignore all text but whitespace. 
2091 s := strings.Map(func(c rune) rune { 2092 switch c { 2093 case ' ', '\t', '\n', '\f', '\r': 2094 return c 2095 } 2096 return -1 2097 }, p.tok.Data) 2098 if s != "" { 2099 p.tok.Data = s 2100 return inBodyIM(p) 2101 } 2102 case StartTagToken: 2103 switch p.tok.DataAtom { 2104 case a.Html: 2105 return inBodyIM(p) 2106 case a.Noframes: 2107 return inHeadIM(p) 2108 } 2109 case DoctypeToken: 2110 return inBodyIM(p) 2111 default: 2112 // Ignore the token. 2113 } 2114 return true 2115} 2116 2117const whitespaceOrNUL = whitespace + "\x00" 2118 2119// Section 12.2.6.5 2120func parseForeignContent(p *parser) bool { 2121 switch p.tok.Type { 2122 case TextToken: 2123 if p.framesetOK { 2124 p.framesetOK = strings.TrimLeft(p.tok.Data, whitespaceOrNUL) == "" 2125 } 2126 p.tok.Data = strings.Replace(p.tok.Data, "\x00", "\ufffd", -1) 2127 p.addText(p.tok.Data) 2128 case CommentToken: 2129 p.addChild(&Node{ 2130 Type: CommentNode, 2131 Data: p.tok.Data, 2132 }) 2133 case StartTagToken: 2134 if !p.fragment { 2135 b := breakout[p.tok.Data] 2136 if p.tok.DataAtom == a.Font { 2137 loop: 2138 for _, attr := range p.tok.Attr { 2139 switch attr.Key { 2140 case "color", "face", "size": 2141 b = true 2142 break loop 2143 } 2144 } 2145 } 2146 if b { 2147 for i := len(p.oe) - 1; i >= 0; i-- { 2148 n := p.oe[i] 2149 if n.Namespace == "" || htmlIntegrationPoint(n) || mathMLTextIntegrationPoint(n) { 2150 p.oe = p.oe[:i+1] 2151 break 2152 } 2153 } 2154 return false 2155 } 2156 } 2157 current := p.adjustedCurrentNode() 2158 switch current.Namespace { 2159 case "math": 2160 adjustAttributeNames(p.tok.Attr, mathMLAttributeAdjustments) 2161 case "svg": 2162 // Adjust SVG tag names. The tokenizer lower-cases tag names, but 2163 // SVG wants e.g. "foreignObject" with a capital second "O". 
2164 if x := svgTagNameAdjustments[p.tok.Data]; x != "" { 2165 p.tok.DataAtom = a.Lookup([]byte(x)) 2166 p.tok.Data = x 2167 } 2168 adjustAttributeNames(p.tok.Attr, svgAttributeAdjustments) 2169 default: 2170 panic("html: bad parser state: unexpected namespace") 2171 } 2172 adjustForeignAttributes(p.tok.Attr) 2173 namespace := current.Namespace 2174 p.addElement() 2175 p.top().Namespace = namespace 2176 if namespace != "" { 2177 // Don't let the tokenizer go into raw text mode in foreign content 2178 // (e.g. in an SVG <title> tag). 2179 p.tokenizer.NextIsNotRawText() 2180 } 2181 if p.hasSelfClosingToken { 2182 p.oe.pop() 2183 p.acknowledgeSelfClosingTag() 2184 } 2185 case EndTagToken: 2186 for i := len(p.oe) - 1; i >= 0; i-- { 2187 if p.oe[i].Namespace == "" { 2188 return p.im(p) 2189 } 2190 if strings.EqualFold(p.oe[i].Data, p.tok.Data) { 2191 p.oe = p.oe[:i] 2192 break 2193 } 2194 } 2195 return true 2196 default: 2197 // Ignore the token. 2198 } 2199 return true 2200} 2201 2202// Section 12.2.4.2. 2203func (p *parser) adjustedCurrentNode() *Node { 2204 if len(p.oe) == 1 && p.fragment && p.context != nil { 2205 return p.context 2206 } 2207 return p.oe.top() 2208} 2209 2210// Section 12.2.6. 
2211func (p *parser) inForeignContent() bool { 2212 if len(p.oe) == 0 { 2213 return false 2214 } 2215 n := p.adjustedCurrentNode() 2216 if n.Namespace == "" { 2217 return false 2218 } 2219 if mathMLTextIntegrationPoint(n) { 2220 if p.tok.Type == StartTagToken && p.tok.DataAtom != a.Mglyph && p.tok.DataAtom != a.Malignmark { 2221 return false 2222 } 2223 if p.tok.Type == TextToken { 2224 return false 2225 } 2226 } 2227 if n.Namespace == "math" && n.DataAtom == a.AnnotationXml && p.tok.Type == StartTagToken && p.tok.DataAtom == a.Svg { 2228 return false 2229 } 2230 if htmlIntegrationPoint(n) && (p.tok.Type == StartTagToken || p.tok.Type == TextToken) { 2231 return false 2232 } 2233 if p.tok.Type == ErrorToken { 2234 return false 2235 } 2236 return true 2237} 2238 2239// parseImpliedToken parses a token as though it had appeared in the parser's 2240// input. 2241func (p *parser) parseImpliedToken(t TokenType, dataAtom a.Atom, data string) { 2242 realToken, selfClosing := p.tok, p.hasSelfClosingToken 2243 p.tok = Token{ 2244 Type: t, 2245 DataAtom: dataAtom, 2246 Data: data, 2247 } 2248 p.hasSelfClosingToken = false 2249 p.parseCurrentToken() 2250 p.tok, p.hasSelfClosingToken = realToken, selfClosing 2251} 2252 2253// parseCurrentToken runs the current token through the parsing routines 2254// until it is consumed. 2255func (p *parser) parseCurrentToken() { 2256 if p.tok.Type == SelfClosingTagToken { 2257 p.hasSelfClosingToken = true 2258 p.tok.Type = StartTagToken 2259 } 2260 2261 consumed := false 2262 for !consumed { 2263 if p.inForeignContent() { 2264 consumed = parseForeignContent(p) 2265 } else { 2266 consumed = p.im(p) 2267 } 2268 } 2269 2270 if p.hasSelfClosingToken { 2271 // This is a parse error, but ignore it. 2272 p.hasSelfClosingToken = false 2273 } 2274} 2275 2276func (p *parser) parse() error { 2277 // Iterate until EOF. Any other error will cause an early return. 
2278 var err error 2279 for err != io.EOF { 2280 // CDATA sections are allowed only in foreign content. 2281 n := p.oe.top() 2282 p.tokenizer.AllowCDATA(n != nil && n.Namespace != "") 2283 // Read and parse the next token. 2284 p.tokenizer.Next() 2285 p.tok = p.tokenizer.Token() 2286 if p.tok.Type == ErrorToken { 2287 err = p.tokenizer.Err() 2288 if err != nil && err != io.EOF { 2289 return err 2290 } 2291 } 2292 p.parseCurrentToken() 2293 } 2294 return nil 2295} 2296 2297// Parse returns the parse tree for the HTML from the given Reader. 2298// 2299// It implements the HTML5 parsing algorithm 2300// (https://html.spec.whatwg.org/multipage/syntax.html#tree-construction), 2301// which is very complicated. The resultant tree can contain implicitly created 2302// nodes that have no explicit <tag> listed in r's data, and nodes' parents can 2303// differ from the nesting implied by a naive processing of start and end 2304// <tag>s. Conversely, explicit <tag>s in r's data can be silently dropped, 2305// with no corresponding node in the resulting tree. 2306// 2307// The input is assumed to be UTF-8 encoded. 2308func Parse(r io.Reader) (*Node, error) { 2309 return ParseWithOptions(r) 2310} 2311 2312// ParseFragment parses a fragment of HTML and returns the nodes that were 2313// found. If the fragment is the InnerHTML for an existing element, pass that 2314// element in context. 2315// 2316// It has the same intricacies as Parse. 2317func ParseFragment(r io.Reader, context *Node) ([]*Node, error) { 2318 return ParseFragmentWithOptions(r, context) 2319} 2320 2321// ParseOption configures a parser. 2322type ParseOption func(p *parser) 2323 2324// ParseOptionEnableScripting configures the scripting flag. 2325// https://html.spec.whatwg.org/multipage/webappapis.html#enabling-and-disabling-scripting 2326// 2327// By default, scripting is enabled. 
2328func ParseOptionEnableScripting(enable bool) ParseOption { 2329 return func(p *parser) { 2330 p.scripting = enable 2331 } 2332} 2333 2334// ParseWithOptions is like Parse, with options. 2335func ParseWithOptions(r io.Reader, opts ...ParseOption) (*Node, error) { 2336 p := &parser{ 2337 tokenizer: NewTokenizer(r), 2338 doc: &Node{ 2339 Type: DocumentNode, 2340 }, 2341 scripting: true, 2342 framesetOK: true, 2343 im: initialIM, 2344 } 2345 2346 for _, f := range opts { 2347 f(p) 2348 } 2349 2350 if err := p.parse(); err != nil { 2351 return nil, err 2352 } 2353 return p.doc, nil 2354} 2355 2356// ParseFragmentWithOptions is like ParseFragment, with options. 2357func ParseFragmentWithOptions(r io.Reader, context *Node, opts ...ParseOption) ([]*Node, error) { 2358 contextTag := "" 2359 if context != nil { 2360 if context.Type != ElementNode { 2361 return nil, errors.New("html: ParseFragment of non-element Node") 2362 } 2363 // The next check isn't just context.DataAtom.String() == context.Data because 2364 // it is valid to pass an element whose tag isn't a known atom. For example, 2365 // DataAtom == 0 and Data = "tagfromthefuture" is perfectly consistent. 
2366 if context.DataAtom != a.Lookup([]byte(context.Data)) { 2367 return nil, fmt.Errorf("html: inconsistent Node: DataAtom=%q, Data=%q", context.DataAtom, context.Data) 2368 } 2369 contextTag = context.DataAtom.String() 2370 } 2371 p := &parser{ 2372 doc: &Node{ 2373 Type: DocumentNode, 2374 }, 2375 scripting: true, 2376 fragment: true, 2377 context: context, 2378 } 2379 if context != nil && context.Namespace != "" { 2380 p.tokenizer = NewTokenizer(r) 2381 } else { 2382 p.tokenizer = NewTokenizerFragment(r, contextTag) 2383 } 2384 2385 for _, f := range opts { 2386 f(p) 2387 } 2388 2389 root := &Node{ 2390 Type: ElementNode, 2391 DataAtom: a.Html, 2392 Data: a.Html.String(), 2393 } 2394 p.doc.AppendChild(root) 2395 p.oe = nodeStack{root} 2396 if context != nil && context.DataAtom == a.Template { 2397 p.templateStack = append(p.templateStack, inTemplateIM) 2398 } 2399 p.resetInsertionMode() 2400 2401 for n := context; n != nil; n = n.Parent { 2402 if n.Type == ElementNode && n.DataAtom == a.Form { 2403 p.form = n 2404 break 2405 } 2406 } 2407 2408 if err := p.parse(); err != nil { 2409 return nil, err 2410 } 2411 2412 parent := p.doc 2413 if context != nil { 2414 parent = root 2415 } 2416 2417 var result []*Node 2418 for c := parent.FirstChild; c != nil; { 2419 next := c.NextSibling 2420 parent.RemoveChild(c) 2421 result = append(result, c) 2422 c = next 2423 } 2424 return result, nil 2425} 2426