// Copyright 2010 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package html

import (
	"errors"
	"fmt"
	"io"
	"strings"

	a "golang.org/x/net/html/atom"
)

// A parser implements the HTML5 parsing algorithm:
// https://html.spec.whatwg.org/multipage/syntax.html#tree-construction
type parser struct {
	// tokenizer provides the tokens for the parser.
	tokenizer *Tokenizer
	// tok is the most recently read token.
	tok Token
	// Self-closing tags like <hr/> are treated as start tags, except that
	// hasSelfClosingToken is set while they are being processed.
	hasSelfClosingToken bool
	// doc is the document root element.
	doc *Node
	// The stack of open elements (section 12.2.4.2) and active formatting
	// elements (section 12.2.4.3).
	oe, afe nodeStack
	// Element pointers (section 12.2.4.4).
	head, form *Node
	// Other parsing state flags (section 12.2.4.5).
	scripting, framesetOK bool
	// templateStack is the stack of template insertion modes.
	templateStack insertionModeStack
	// im is the current insertion mode.
	im insertionMode
	// originalIM is the insertion mode to go back to after completing a text
	// or inTableText insertion mode.
	originalIM insertionMode
	// fosterParenting is whether new elements should be inserted according to
	// the foster parenting rules (section 12.2.6.1).
	fosterParenting bool
	// quirks is whether the parser is operating in "quirks mode."
	quirks bool
	// fragment is whether the parser is parsing an HTML fragment.
	fragment bool
	// context is the context element when parsing an HTML fragment
	// (section 12.4).
	context *Node
}

// top returns the node at the top of the stack of open elements, falling back
// to the document root when p.oe.top() returns nil.
func (p *parser) top() *Node {
	if n := p.oe.top(); n != nil {
		return n
	}
	return p.doc
}

// Stop tags for use in popUntil. These come from section 12.2.4.2.
62var ( 63 defaultScopeStopTags = map[string][]a.Atom{ 64 "": {a.Applet, a.Caption, a.Html, a.Table, a.Td, a.Th, a.Marquee, a.Object, a.Template}, 65 "math": {a.AnnotationXml, a.Mi, a.Mn, a.Mo, a.Ms, a.Mtext}, 66 "svg": {a.Desc, a.ForeignObject, a.Title}, 67 } 68) 69 70type scope int 71 72const ( 73 defaultScope scope = iota 74 listItemScope 75 buttonScope 76 tableScope 77 tableRowScope 78 tableBodyScope 79 selectScope 80) 81 82// popUntil pops the stack of open elements at the highest element whose tag 83// is in matchTags, provided there is no higher element in the scope's stop 84// tags (as defined in section 12.2.4.2). It returns whether or not there was 85// such an element. If there was not, popUntil leaves the stack unchanged. 86// 87// For example, the set of stop tags for table scope is: "html", "table". If 88// the stack was: 89// ["html", "body", "font", "table", "b", "i", "u"] 90// then popUntil(tableScope, "font") would return false, but 91// popUntil(tableScope, "i") would return true and the stack would become: 92// ["html", "body", "font", "table", "b"] 93// 94// If an element's tag is in both the stop tags and matchTags, then the stack 95// will be popped and the function returns true (provided, of course, there was 96// no higher element in the stack that was also in the stop tags). For example, 97// popUntil(tableScope, "table") returns true and leaves: 98// ["html", "body", "font"] 99func (p *parser) popUntil(s scope, matchTags ...a.Atom) bool { 100 if i := p.indexOfElementInScope(s, matchTags...); i != -1 { 101 p.oe = p.oe[:i] 102 return true 103 } 104 return false 105} 106 107// indexOfElementInScope returns the index in p.oe of the highest element whose 108// tag is in matchTags that is in scope. If no matching element is in scope, it 109// returns -1. 
110func (p *parser) indexOfElementInScope(s scope, matchTags ...a.Atom) int { 111 for i := len(p.oe) - 1; i >= 0; i-- { 112 tagAtom := p.oe[i].DataAtom 113 if p.oe[i].Namespace == "" { 114 for _, t := range matchTags { 115 if t == tagAtom { 116 return i 117 } 118 } 119 switch s { 120 case defaultScope: 121 // No-op. 122 case listItemScope: 123 if tagAtom == a.Ol || tagAtom == a.Ul { 124 return -1 125 } 126 case buttonScope: 127 if tagAtom == a.Button { 128 return -1 129 } 130 case tableScope: 131 if tagAtom == a.Html || tagAtom == a.Table || tagAtom == a.Template { 132 return -1 133 } 134 case selectScope: 135 if tagAtom != a.Optgroup && tagAtom != a.Option { 136 return -1 137 } 138 default: 139 panic("unreachable") 140 } 141 } 142 switch s { 143 case defaultScope, listItemScope, buttonScope: 144 for _, t := range defaultScopeStopTags[p.oe[i].Namespace] { 145 if t == tagAtom { 146 return -1 147 } 148 } 149 } 150 } 151 return -1 152} 153 154// elementInScope is like popUntil, except that it doesn't modify the stack of 155// open elements. 156func (p *parser) elementInScope(s scope, matchTags ...a.Atom) bool { 157 return p.indexOfElementInScope(s, matchTags...) != -1 158} 159 160// clearStackToContext pops elements off the stack of open elements until a 161// scope-defined element is found. 
162func (p *parser) clearStackToContext(s scope) { 163 for i := len(p.oe) - 1; i >= 0; i-- { 164 tagAtom := p.oe[i].DataAtom 165 switch s { 166 case tableScope: 167 if tagAtom == a.Html || tagAtom == a.Table || tagAtom == a.Template { 168 p.oe = p.oe[:i+1] 169 return 170 } 171 case tableRowScope: 172 if tagAtom == a.Html || tagAtom == a.Tr || tagAtom == a.Template { 173 p.oe = p.oe[:i+1] 174 return 175 } 176 case tableBodyScope: 177 if tagAtom == a.Html || tagAtom == a.Tbody || tagAtom == a.Tfoot || tagAtom == a.Thead || tagAtom == a.Template { 178 p.oe = p.oe[:i+1] 179 return 180 } 181 default: 182 panic("unreachable") 183 } 184 } 185} 186 187// parseGenericRawTextElements implements the generic raw text element parsing 188// algorithm defined in 12.2.6.2. 189// https://html.spec.whatwg.org/multipage/parsing.html#parsing-elements-that-contain-only-text 190// TODO: Since both RAWTEXT and RCDATA states are treated as tokenizer's part 191// officially, need to make tokenizer consider both states. 192func (p *parser) parseGenericRawTextElement() { 193 p.addElement() 194 p.originalIM = p.im 195 p.im = textIM 196} 197 198// generateImpliedEndTags pops nodes off the stack of open elements as long as 199// the top node has a tag name of dd, dt, li, optgroup, option, p, rb, rp, rt or rtc. 200// If exceptions are specified, nodes with that name will not be popped off. 201func (p *parser) generateImpliedEndTags(exceptions ...string) { 202 var i int 203loop: 204 for i = len(p.oe) - 1; i >= 0; i-- { 205 n := p.oe[i] 206 if n.Type != ElementNode { 207 break 208 } 209 switch n.DataAtom { 210 case a.Dd, a.Dt, a.Li, a.Optgroup, a.Option, a.P, a.Rb, a.Rp, a.Rt, a.Rtc: 211 for _, except := range exceptions { 212 if n.Data == except { 213 break loop 214 } 215 } 216 continue 217 } 218 break 219 } 220 221 p.oe = p.oe[:i+1] 222} 223 224// addChild adds a child node n to the top element, and pushes n onto the stack 225// of open elements if it is an element node. 
226func (p *parser) addChild(n *Node) { 227 if p.shouldFosterParent() { 228 p.fosterParent(n) 229 } else { 230 p.top().AppendChild(n) 231 } 232 233 if n.Type == ElementNode { 234 p.oe = append(p.oe, n) 235 } 236} 237 238// shouldFosterParent returns whether the next node to be added should be 239// foster parented. 240func (p *parser) shouldFosterParent() bool { 241 if p.fosterParenting { 242 switch p.top().DataAtom { 243 case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr: 244 return true 245 } 246 } 247 return false 248} 249 250// fosterParent adds a child node according to the foster parenting rules. 251// Section 12.2.6.1, "foster parenting". 252func (p *parser) fosterParent(n *Node) { 253 var table, parent, prev, template *Node 254 var i int 255 for i = len(p.oe) - 1; i >= 0; i-- { 256 if p.oe[i].DataAtom == a.Table { 257 table = p.oe[i] 258 break 259 } 260 } 261 262 var j int 263 for j = len(p.oe) - 1; j >= 0; j-- { 264 if p.oe[j].DataAtom == a.Template { 265 template = p.oe[j] 266 break 267 } 268 } 269 270 if template != nil && (table == nil || j > i) { 271 template.AppendChild(n) 272 return 273 } 274 275 if table == nil { 276 // The foster parent is the html element. 277 parent = p.oe[0] 278 } else { 279 parent = table.Parent 280 } 281 if parent == nil { 282 parent = p.oe[i-1] 283 } 284 285 if table != nil { 286 prev = table.PrevSibling 287 } else { 288 prev = parent.LastChild 289 } 290 if prev != nil && prev.Type == TextNode && n.Type == TextNode { 291 prev.Data += n.Data 292 return 293 } 294 295 parent.InsertBefore(n, table) 296} 297 298// addText adds text to the preceding node if it is a text node, or else it 299// calls addChild with a new text node. 
300func (p *parser) addText(text string) { 301 if text == "" { 302 return 303 } 304 305 if p.shouldFosterParent() { 306 p.fosterParent(&Node{ 307 Type: TextNode, 308 Data: text, 309 }) 310 return 311 } 312 313 t := p.top() 314 if n := t.LastChild; n != nil && n.Type == TextNode { 315 n.Data += text 316 return 317 } 318 p.addChild(&Node{ 319 Type: TextNode, 320 Data: text, 321 }) 322} 323 324// addElement adds a child element based on the current token. 325func (p *parser) addElement() { 326 p.addChild(&Node{ 327 Type: ElementNode, 328 DataAtom: p.tok.DataAtom, 329 Data: p.tok.Data, 330 Attr: p.tok.Attr, 331 }) 332} 333 334// Section 12.2.4.3. 335func (p *parser) addFormattingElement() { 336 tagAtom, attr := p.tok.DataAtom, p.tok.Attr 337 p.addElement() 338 339 // Implement the Noah's Ark clause, but with three per family instead of two. 340 identicalElements := 0 341findIdenticalElements: 342 for i := len(p.afe) - 1; i >= 0; i-- { 343 n := p.afe[i] 344 if n.Type == scopeMarkerNode { 345 break 346 } 347 if n.Type != ElementNode { 348 continue 349 } 350 if n.Namespace != "" { 351 continue 352 } 353 if n.DataAtom != tagAtom { 354 continue 355 } 356 if len(n.Attr) != len(attr) { 357 continue 358 } 359 compareAttributes: 360 for _, t0 := range n.Attr { 361 for _, t1 := range attr { 362 if t0.Key == t1.Key && t0.Namespace == t1.Namespace && t0.Val == t1.Val { 363 // Found a match for this attribute, continue with the next attribute. 364 continue compareAttributes 365 } 366 } 367 // If we get here, there is no attribute that matches a. 368 // Therefore the element is not identical to the new one. 369 continue findIdenticalElements 370 } 371 372 identicalElements++ 373 if identicalElements >= 3 { 374 p.afe.remove(n) 375 } 376 } 377 378 p.afe = append(p.afe, p.top()) 379} 380 381// Section 12.2.4.3. 
382func (p *parser) clearActiveFormattingElements() { 383 for { 384 if n := p.afe.pop(); len(p.afe) == 0 || n.Type == scopeMarkerNode { 385 return 386 } 387 } 388} 389 390// Section 12.2.4.3. 391func (p *parser) reconstructActiveFormattingElements() { 392 n := p.afe.top() 393 if n == nil { 394 return 395 } 396 if n.Type == scopeMarkerNode || p.oe.index(n) != -1 { 397 return 398 } 399 i := len(p.afe) - 1 400 for n.Type != scopeMarkerNode && p.oe.index(n) == -1 { 401 if i == 0 { 402 i = -1 403 break 404 } 405 i-- 406 n = p.afe[i] 407 } 408 for { 409 i++ 410 clone := p.afe[i].clone() 411 p.addChild(clone) 412 p.afe[i] = clone 413 if i == len(p.afe)-1 { 414 break 415 } 416 } 417} 418 419// Section 12.2.5. 420func (p *parser) acknowledgeSelfClosingTag() { 421 p.hasSelfClosingToken = false 422} 423 424// An insertion mode (section 12.2.4.1) is the state transition function from 425// a particular state in the HTML5 parser's state machine. It updates the 426// parser's fields depending on parser.tok (where ErrorToken means EOF). 427// It returns whether the token was consumed. 428type insertionMode func(*parser) bool 429 430// setOriginalIM sets the insertion mode to return to after completing a text or 431// inTableText insertion mode. 432// Section 12.2.4.1, "using the rules for". 433func (p *parser) setOriginalIM() { 434 if p.originalIM != nil { 435 panic("html: bad parser state: originalIM was set twice") 436 } 437 p.originalIM = p.im 438} 439 440// Section 12.2.4.1, "reset the insertion mode". 
func (p *parser) resetInsertionMode() {
	// Walk the stack of open elements from the top down and pick the
	// insertion mode implied by the first element that determines one.
	for i := len(p.oe) - 1; i >= 0; i-- {
		n := p.oe[i]
		last := i == 0
		// For the bottom-most element, the fragment context element (when
		// there is one) is examined instead.
		if last && p.context != nil {
			n = p.context
		}

		switch n.DataAtom {
		case a.Select:
			if !last {
				// Look at the elements below the select: a template ends the
				// search, a table selects the "in select in table" mode.
				for ancestor, first := n, p.oe[0]; ancestor != first; {
					ancestor = p.oe[p.oe.index(ancestor)-1]
					switch ancestor.DataAtom {
					case a.Template:
						p.im = inSelectIM
						return
					case a.Table:
						p.im = inSelectInTableIM
						return
					}
				}
			}
			p.im = inSelectIM
		case a.Td, a.Th:
			// TODO: remove this divergence from the HTML5 spec.
			//
			// See https://bugs.chromium.org/p/chromium/issues/detail?id=829668
			p.im = inCellIM
		case a.Tr:
			p.im = inRowIM
		case a.Tbody, a.Thead, a.Tfoot:
			p.im = inTableBodyIM
		case a.Caption:
			p.im = inCaptionIM
		case a.Colgroup:
			p.im = inColumnGroupIM
		case a.Table:
			p.im = inTableIM
		case a.Template:
			// TODO: remove this divergence from the HTML5 spec.
			if n.Namespace != "" {
				continue
			}
			p.im = p.templateStack.top()
		case a.Head:
			// TODO: remove this divergence from the HTML5 spec.
			//
			// See https://bugs.chromium.org/p/chromium/issues/detail?id=829668
			p.im = inHeadIM
		case a.Body:
			p.im = inBodyIM
		case a.Frameset:
			p.im = inFramesetIM
		case a.Html:
			if p.head == nil {
				p.im = beforeHeadIM
			} else {
				p.im = afterHeadIM
			}
		default:
			if last {
				p.im = inBodyIM
				return
			}
			continue
		}
		return
	}
}

// whitespace is the set of characters trimmed from text tokens by the
// insertion modes below.
const whitespace = " \t\r\n\f"

// initialIM is the "initial" insertion mode. Like every insertion mode, it
// returns whether the token was consumed.
// Section 12.2.6.4.1.
func initialIM(p *parser) bool {
	switch p.tok.Type {
	case TextToken:
		p.tok.Data = strings.TrimLeft(p.tok.Data, whitespace)
		if len(p.tok.Data) == 0 {
			// It was all whitespace, so ignore it.
			return true
		}
	case CommentToken:
		p.doc.AppendChild(&Node{
			Type: CommentNode,
			Data: p.tok.Data,
		})
		return true
	case DoctypeToken:
		n, quirks := parseDoctype(p.tok.Data)
		p.doc.AppendChild(n)
		p.quirks = quirks
		p.im = beforeHTMLIM
		return true
	}
	// Any other token (no doctype seen): switch to quirks mode and leave the
	// token unconsumed for the "before html" insertion mode.
	p.quirks = true
	p.im = beforeHTMLIM
	return false
}

// beforeHTMLIM is the "before html" insertion mode.
// Section 12.2.6.4.2.
func beforeHTMLIM(p *parser) bool {
	switch p.tok.Type {
	case DoctypeToken:
		// Ignore the token.
		return true
	case TextToken:
		p.tok.Data = strings.TrimLeft(p.tok.Data, whitespace)
		if len(p.tok.Data) == 0 {
			// It was all whitespace, so ignore it.
			return true
		}
	case StartTagToken:
		if p.tok.DataAtom == a.Html {
			p.addElement()
			p.im = beforeHeadIM
			return true
		}
	case EndTagToken:
		switch p.tok.DataAtom {
		case a.Head, a.Body, a.Html, a.Br:
			p.parseImpliedToken(StartTagToken, a.Html, a.Html.String())
			return false
		default:
			// Ignore the token.
			return true
		}
	case CommentToken:
		p.doc.AppendChild(&Node{
			Type: CommentNode,
			Data: p.tok.Data,
		})
		return true
	}
	// Anything else: process an implied <html> start tag and leave the
	// current token unconsumed.
	p.parseImpliedToken(StartTagToken, a.Html, a.Html.String())
	return false
}

// beforeHeadIM is the "before head" insertion mode.
// Section 12.2.6.4.3.
func beforeHeadIM(p *parser) bool {
	switch p.tok.Type {
	case TextToken:
		p.tok.Data = strings.TrimLeft(p.tok.Data, whitespace)
		if len(p.tok.Data) == 0 {
			// It was all whitespace, so ignore it.
			return true
		}
	case StartTagToken:
		switch p.tok.DataAtom {
		case a.Head:
			p.addElement()
			p.head = p.top()
			p.im = inHeadIM
			return true
		case a.Html:
			return inBodyIM(p)
		}
	case EndTagToken:
		switch p.tok.DataAtom {
		case a.Head, a.Body, a.Html, a.Br:
			p.parseImpliedToken(StartTagToken, a.Head, a.Head.String())
			return false
		default:
			// Ignore the token.
			return true
		}
	case CommentToken:
		p.addChild(&Node{
			Type: CommentNode,
			Data: p.tok.Data,
		})
		return true
	case DoctypeToken:
		// Ignore the token.
		return true
	}

	// Anything else: process an implied <head> start tag and leave the
	// current token unconsumed.
	p.parseImpliedToken(StartTagToken, a.Head, a.Head.String())
	return false
}

// inHeadIM is the "in head" insertion mode.
// Section 12.2.6.4.4.
func inHeadIM(p *parser) bool {
	switch p.tok.Type {
	case TextToken:
		s := strings.TrimLeft(p.tok.Data, whitespace)
		if len(s) < len(p.tok.Data) {
			// Add the initial whitespace to the current node.
			p.addText(p.tok.Data[:len(p.tok.Data)-len(s)])
			if s == "" {
				return true
			}
			p.tok.Data = s
		}
	case StartTagToken:
		switch p.tok.DataAtom {
		case a.Html:
			return inBodyIM(p)
		case a.Base, a.Basefont, a.Bgsound, a.Link, a.Meta:
			// addElement pushed the element onto the stack of open elements;
			// pop it immediately so it does not stay open.
			p.addElement()
			p.oe.pop()
			p.acknowledgeSelfClosingTag()
			return true
		case a.Noscript:
			if p.scripting {
				p.parseGenericRawTextElement()
				return true
			}
			p.addElement()
			p.im = inHeadNoscriptIM
			// Don't let the tokenizer go into raw text mode when scripting is disabled.
			p.tokenizer.NextIsNotRawText()
			return true
		case a.Script, a.Title:
			p.addElement()
			p.setOriginalIM()
			p.im = textIM
			return true
		case a.Noframes, a.Style:
			p.parseGenericRawTextElement()
			return true
		case a.Head:
			// Ignore the token.
			return true
		case a.Template:
			p.addElement()
			p.afe = append(p.afe, &scopeMarker)
			p.framesetOK = false
			p.im = inTemplateIM
			p.templateStack = append(p.templateStack, inTemplateIM)
			return true
		}
	case EndTagToken:
		switch p.tok.DataAtom {
		case a.Head:
			p.oe.pop()
			p.im = afterHeadIM
			return true
		case a.Body, a.Html, a.Br:
			p.parseImpliedToken(EndTagToken, a.Head, a.Head.String())
			return false
		case a.Template:
			if !p.oe.contains(a.Template) {
				return true
			}
			// TODO: remove this divergence from the HTML5 spec.
			//
			// See https://bugs.chromium.org/p/chromium/issues/detail?id=829668
			p.generateImpliedEndTags()
			// Pop up to and including the highest HTML-namespace template.
			for i := len(p.oe) - 1; i >= 0; i-- {
				if n := p.oe[i]; n.Namespace == "" && n.DataAtom == a.Template {
					p.oe = p.oe[:i]
					break
				}
			}
			p.clearActiveFormattingElements()
			p.templateStack.pop()
			p.resetInsertionMode()
			return true
		default:
			// Ignore the token.
			return true
		}
	case CommentToken:
		p.addChild(&Node{
			Type: CommentNode,
			Data: p.tok.Data,
		})
		return true
	case DoctypeToken:
		// Ignore the token.
		return true
	}

	// Anything else: process an implied </head> end tag and leave the
	// current token unconsumed.
	p.parseImpliedToken(EndTagToken, a.Head, a.Head.String())
	return false
}

// inHeadNoscriptIM is the "in head noscript" insertion mode.
// Section 12.2.6.4.5.
func inHeadNoscriptIM(p *parser) bool {
	switch p.tok.Type {
	case DoctypeToken:
		// Ignore the token.
		return true
	case StartTagToken:
		switch p.tok.DataAtom {
		case a.Html:
			return inBodyIM(p)
		case a.Basefont, a.Bgsound, a.Link, a.Meta, a.Noframes, a.Style:
			return inHeadIM(p)
		case a.Head:
			// Ignore the token.
			return true
		case a.Noscript:
			// Don't let the tokenizer go into raw text mode even when a <noscript>
			// tag is in "in head noscript" insertion mode.
			p.tokenizer.NextIsNotRawText()
			// Ignore the token.
			return true
		}
	case EndTagToken:
		switch p.tok.DataAtom {
		case a.Noscript, a.Br:
			// Fall out of the switch to the common code below.
		default:
			// Ignore the token.
			return true
		}
	case TextToken:
		s := strings.TrimLeft(p.tok.Data, whitespace)
		if len(s) == 0 {
			// It was all whitespace.
			return inHeadIM(p)
		}
	case CommentToken:
		return inHeadIM(p)
	}
	// Leave the noscript element and go back to the "in head" mode.
	p.oe.pop()
	if p.top().DataAtom != a.Head {
		panic("html: the new current node will be a head element.")
	}
	p.im = inHeadIM
	// A noscript token is consumed here; any other token is left unconsumed.
	if p.tok.DataAtom == a.Noscript {
		return true
	}
	return false
}

// Section 12.2.6.4.6.
769func afterHeadIM(p *parser) bool { 770 switch p.tok.Type { 771 case TextToken: 772 s := strings.TrimLeft(p.tok.Data, whitespace) 773 if len(s) < len(p.tok.Data) { 774 // Add the initial whitespace to the current node. 775 p.addText(p.tok.Data[:len(p.tok.Data)-len(s)]) 776 if s == "" { 777 return true 778 } 779 p.tok.Data = s 780 } 781 case StartTagToken: 782 switch p.tok.DataAtom { 783 case a.Html: 784 return inBodyIM(p) 785 case a.Body: 786 p.addElement() 787 p.framesetOK = false 788 p.im = inBodyIM 789 return true 790 case a.Frameset: 791 p.addElement() 792 p.im = inFramesetIM 793 return true 794 case a.Base, a.Basefont, a.Bgsound, a.Link, a.Meta, a.Noframes, a.Script, a.Style, a.Template, a.Title: 795 p.oe = append(p.oe, p.head) 796 defer p.oe.remove(p.head) 797 return inHeadIM(p) 798 case a.Head: 799 // Ignore the token. 800 return true 801 } 802 case EndTagToken: 803 switch p.tok.DataAtom { 804 case a.Body, a.Html, a.Br: 805 // Drop down to creating an implied <body> tag. 806 case a.Template: 807 return inHeadIM(p) 808 default: 809 // Ignore the token. 810 return true 811 } 812 case CommentToken: 813 p.addChild(&Node{ 814 Type: CommentNode, 815 Data: p.tok.Data, 816 }) 817 return true 818 case DoctypeToken: 819 // Ignore the token. 820 return true 821 } 822 823 p.parseImpliedToken(StartTagToken, a.Body, a.Body.String()) 824 p.framesetOK = true 825 return false 826} 827 828// copyAttributes copies attributes of src not found on dst to dst. 829func copyAttributes(dst *Node, src Token) { 830 if len(src.Attr) == 0 { 831 return 832 } 833 attr := map[string]string{} 834 for _, t := range dst.Attr { 835 attr[t.Key] = t.Val 836 } 837 for _, t := range src.Attr { 838 if _, ok := attr[t.Key]; !ok { 839 dst.Attr = append(dst.Attr, t) 840 attr[t.Key] = t.Val 841 } 842 } 843} 844 845// Section 12.2.6.4.7. 
// inBodyIM is the "in body" insertion mode. Like every insertion mode, it
// returns whether the token was consumed.
func inBodyIM(p *parser) bool {
	switch p.tok.Type {
	case TextToken:
		d := p.tok.Data
		switch n := p.oe.top(); n.DataAtom {
		case a.Pre, a.Listing:
			if n.FirstChild == nil {
				// Ignore a newline at the start of a <pre> block.
				if d != "" && d[0] == '\r' {
					d = d[1:]
				}
				if d != "" && d[0] == '\n' {
					d = d[1:]
				}
			}
		}
		// Drop NUL characters from the text.
		d = strings.Replace(d, "\x00", "", -1)
		if d == "" {
			return true
		}
		p.reconstructActiveFormattingElements()
		p.addText(d)
		if p.framesetOK && strings.TrimLeft(d, whitespace) != "" {
			// There were non-whitespace characters inserted.
			p.framesetOK = false
		}
	case StartTagToken:
		switch p.tok.DataAtom {
		case a.Html:
			if p.oe.contains(a.Template) {
				return true
			}
			copyAttributes(p.oe[0], p.tok)
		case a.Base, a.Basefont, a.Bgsound, a.Link, a.Meta, a.Noframes, a.Script, a.Style, a.Template, a.Title:
			return inHeadIM(p)
		case a.Body:
			if p.oe.contains(a.Template) {
				return true
			}
			// Merge the attributes into the existing body element, which is
			// expected to be the second element on the stack.
			if len(p.oe) >= 2 {
				body := p.oe[1]
				if body.Type == ElementNode && body.DataAtom == a.Body {
					p.framesetOK = false
					copyAttributes(body, p.tok)
				}
			}
		case a.Frameset:
			if !p.framesetOK || len(p.oe) < 2 || p.oe[1].DataAtom != a.Body {
				// Ignore the token.
				return true
			}
			// Replace the body element with the new frameset element.
			body := p.oe[1]
			if body.Parent != nil {
				body.Parent.RemoveChild(body)
			}
			p.oe = p.oe[:1]
			p.addElement()
			p.im = inFramesetIM
			return true
		case a.Address, a.Article, a.Aside, a.Blockquote, a.Center, a.Details, a.Dialog, a.Dir, a.Div, a.Dl, a.Fieldset, a.Figcaption, a.Figure, a.Footer, a.Header, a.Hgroup, a.Main, a.Menu, a.Nav, a.Ol, a.P, a.Section, a.Summary, a.Ul:
			p.popUntil(buttonScope, a.P)
			p.addElement()
		case a.H1, a.H2, a.H3, a.H4, a.H5, a.H6:
			p.popUntil(buttonScope, a.P)
			// A heading directly inside another heading closes the outer one.
			switch n := p.top(); n.DataAtom {
			case a.H1, a.H2, a.H3, a.H4, a.H5, a.H6:
				p.oe.pop()
			}
			p.addElement()
		case a.Pre, a.Listing:
			p.popUntil(buttonScope, a.P)
			p.addElement()
			// The newline, if any, will be dealt with by the TextToken case.
			p.framesetOK = false
		case a.Form:
			if p.form != nil && !p.oe.contains(a.Template) {
				// Ignore the token
				return true
			}
			p.popUntil(buttonScope, a.P)
			p.addElement()
			if !p.oe.contains(a.Template) {
				p.form = p.top()
			}
		case a.Li:
			p.framesetOK = false
			// Walk down the stack: close an open li, skip address/div/p and
			// non-special elements, and stop at any other special element.
			for i := len(p.oe) - 1; i >= 0; i-- {
				node := p.oe[i]
				switch node.DataAtom {
				case a.Li:
					p.oe = p.oe[:i]
				case a.Address, a.Div, a.P:
					continue
				default:
					if !isSpecialElement(node) {
						continue
					}
				}
				break
			}
			p.popUntil(buttonScope, a.P)
			p.addElement()
		case a.Dd, a.Dt:
			p.framesetOK = false
			// Same walk as for li, but closing an open dd or dt.
			for i := len(p.oe) - 1; i >= 0; i-- {
				node := p.oe[i]
				switch node.DataAtom {
				case a.Dd, a.Dt:
					p.oe = p.oe[:i]
				case a.Address, a.Div, a.P:
					continue
				default:
					if !isSpecialElement(node) {
						continue
					}
				}
				break
			}
			p.popUntil(buttonScope, a.P)
			p.addElement()
		case a.Plaintext:
			p.popUntil(buttonScope, a.P)
			p.addElement()
		case a.Button:
			p.popUntil(defaultScope, a.Button)
			p.reconstructActiveFormattingElements()
			p.addElement()
			p.framesetOK = false
		case a.A:
			// An <a> that is still an active formatting element (above the
			// last scope marker) is closed before the new one is opened.
			for i := len(p.afe) - 1; i >= 0 && p.afe[i].Type != scopeMarkerNode; i-- {
				if n := p.afe[i]; n.Type == ElementNode && n.DataAtom == a.A {
					p.inBodyEndTagFormatting(a.A, "a")
					p.oe.remove(n)
					p.afe.remove(n)
					break
				}
			}
			p.reconstructActiveFormattingElements()
			p.addFormattingElement()
		case a.B, a.Big, a.Code, a.Em, a.Font, a.I, a.S, a.Small, a.Strike, a.Strong, a.Tt, a.U:
			p.reconstructActiveFormattingElements()
			p.addFormattingElement()
		case a.Nobr:
			p.reconstructActiveFormattingElements()
			if p.elementInScope(defaultScope, a.Nobr) {
				p.inBodyEndTagFormatting(a.Nobr, "nobr")
				p.reconstructActiveFormattingElements()
			}
			p.addFormattingElement()
		case a.Applet, a.Marquee, a.Object:
			p.reconstructActiveFormattingElements()
			p.addElement()
			p.afe = append(p.afe, &scopeMarker)
			p.framesetOK = false
		case a.Table:
			if !p.quirks {
				p.popUntil(buttonScope, a.P)
			}
			p.addElement()
			p.framesetOK = false
			p.im = inTableIM
			return true
		case a.Area, a.Br, a.Embed, a.Img, a.Input, a.Keygen, a.Wbr:
			p.reconstructActiveFormattingElements()
			p.addElement()
			p.oe.pop()
			p.acknowledgeSelfClosingTag()
			if p.tok.DataAtom == a.Input {
				for _, t := range p.tok.Attr {
					if t.Key == "type" {
						if strings.ToLower(t.Val) == "hidden" {
							// Skip setting framesetOK = false
							return true
						}
					}
				}
			}
			p.framesetOK = false
		case a.Param, a.Source, a.Track:
			p.addElement()
			p.oe.pop()
			p.acknowledgeSelfClosingTag()
		case a.Hr:
			p.popUntil(buttonScope, a.P)
			p.addElement()
			p.oe.pop()
			p.acknowledgeSelfClosingTag()
			p.framesetOK = false
		case a.Image:
			// Rewrite the token as an <img> start tag and report it as not
			// consumed.
			p.tok.DataAtom = a.Img
			p.tok.Data = a.Img.String()
			return false
		case a.Textarea:
			p.addElement()
			p.setOriginalIM()
			p.framesetOK = false
			p.im = textIM
		case a.Xmp:
			p.popUntil(buttonScope, a.P)
			p.reconstructActiveFormattingElements()
			p.framesetOK = false
			p.parseGenericRawTextElement()
		case a.Iframe:
			p.framesetOK = false
			p.parseGenericRawTextElement()
		case a.Noembed:
			p.parseGenericRawTextElement()
		case a.Noscript:
			if p.scripting {
				p.parseGenericRawTextElement()
				return true
			}
			p.reconstructActiveFormattingElements()
			p.addElement()
			// Don't let the tokenizer go into raw text mode when scripting is disabled.
			p.tokenizer.NextIsNotRawText()
		case a.Select:
			p.reconstructActiveFormattingElements()
			p.addElement()
			p.framesetOK = false
			p.im = inSelectIM
			return true
		case a.Optgroup, a.Option:
			if p.top().DataAtom == a.Option {
				p.oe.pop()
			}
			p.reconstructActiveFormattingElements()
			p.addElement()
		case a.Rb, a.Rtc:
			if p.elementInScope(defaultScope, a.Ruby) {
				p.generateImpliedEndTags()
			}
			p.addElement()
		case a.Rp, a.Rt:
			if p.elementInScope(defaultScope, a.Ruby) {
				p.generateImpliedEndTags("rtc")
			}
			p.addElement()
		case a.Math, a.Svg:
			p.reconstructActiveFormattingElements()
			if p.tok.DataAtom == a.Math {
				adjustAttributeNames(p.tok.Attr, mathMLAttributeAdjustments)
			} else {
				adjustAttributeNames(p.tok.Attr, svgAttributeAdjustments)
			}
			adjustForeignAttributes(p.tok.Attr)
			p.addElement()
			// The namespace of the new element is the tag name of the token.
			p.top().Namespace = p.tok.Data
			if p.hasSelfClosingToken {
				p.oe.pop()
				p.acknowledgeSelfClosingTag()
			}
			return true
		case a.Caption, a.Col, a.Colgroup, a.Frame, a.Head, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr:
			// Ignore the token.
		default:
			p.reconstructActiveFormattingElements()
			p.addElement()
		}
	case EndTagToken:
		switch p.tok.DataAtom {
		case a.Body:
			if p.elementInScope(defaultScope, a.Body) {
				p.im = afterBodyIM
			}
		case a.Html:
			if p.elementInScope(defaultScope, a.Body) {
				p.parseImpliedToken(EndTagToken, a.Body, a.Body.String())
				return false
			}
			return true
		case a.Address, a.Article, a.Aside, a.Blockquote, a.Button, a.Center, a.Details, a.Dialog, a.Dir, a.Div, a.Dl, a.Fieldset, a.Figcaption, a.Figure, a.Footer, a.Header, a.Hgroup, a.Listing, a.Main, a.Menu, a.Nav, a.Ol, a.Pre, a.Section, a.Summary, a.Ul:
			p.popUntil(defaultScope, p.tok.DataAtom)
		case a.Form:
			if p.oe.contains(a.Template) {
				i := p.indexOfElementInScope(defaultScope, a.Form)
				if i == -1 {
					// Ignore the token.
					return true
				}
				p.generateImpliedEndTags()
				if p.oe[i].DataAtom != a.Form {
					// Ignore the token.
					return true
				}
				p.popUntil(defaultScope, a.Form)
			} else {
				// Outside a template the form element pointer is cleared, and
				// the token only takes effect if that form is in scope.
				node := p.form
				p.form = nil
				i := p.indexOfElementInScope(defaultScope, a.Form)
				if node == nil || i == -1 || p.oe[i] != node {
					// Ignore the token.
					return true
				}
				p.generateImpliedEndTags()
				p.oe.remove(node)
			}
		case a.P:
			// A </p> with no p element in button scope first gets an implied
			// <p> start tag, so there is an element to close.
			if !p.elementInScope(buttonScope, a.P) {
				p.parseImpliedToken(StartTagToken, a.P, a.P.String())
			}
			p.popUntil(buttonScope, a.P)
		case a.Li:
			p.popUntil(listItemScope, a.Li)
		case a.Dd, a.Dt:
			p.popUntil(defaultScope, p.tok.DataAtom)
		case a.H1, a.H2, a.H3, a.H4, a.H5, a.H6:
			p.popUntil(defaultScope, a.H1, a.H2, a.H3, a.H4, a.H5, a.H6)
		case a.A, a.B, a.Big, a.Code, a.Em, a.Font, a.I, a.Nobr, a.S, a.Small, a.Strike, a.Strong, a.Tt, a.U:
			p.inBodyEndTagFormatting(p.tok.DataAtom, p.tok.Data)
		case a.Applet, a.Marquee, a.Object:
			if p.popUntil(defaultScope, p.tok.DataAtom) {
				p.clearActiveFormattingElements()
			}
		case a.Br:
			// Turn </br> into a start tag token and report it as not consumed.
			p.tok.Type = StartTagToken
			return false
		case a.Template:
			return inHeadIM(p)
		default:
			p.inBodyEndTagOther(p.tok.DataAtom, p.tok.Data)
		}
	case CommentToken:
		p.addChild(&Node{
			Type: CommentNode,
			Data: p.tok.Data,
		})
	case ErrorToken:
		// TODO: remove this divergence from the HTML5 spec.
		if len(p.templateStack) > 0 {
			p.im = inTemplateIM
			return false
		}
		// Return as soon as an open element is found that is not in the list
		// below.
		for _, e := range p.oe {
			switch e.DataAtom {
			case a.Dd, a.Dt, a.Li, a.Optgroup, a.Option, a.P, a.Rb, a.Rp, a.Rt, a.Rtc, a.Tbody, a.Td, a.Tfoot, a.Th,
				a.Thead, a.Tr, a.Body, a.Html:
			default:
				return true
			}
		}
	}

	return true
}

// inBodyEndTagFormatting handles an end tag for a formatting element named by
// tagAtom and tagName, using the "adoption agency" algorithm.
func (p *parser) inBodyEndTagFormatting(tagAtom a.Atom, tagName string) {
	// This is the "adoption agency" algorithm, described at
	// https://html.spec.whatwg.org/multipage/syntax.html#adoptionAgency

	// TODO: this is a fairly literal line-by-line translation of that algorithm.
	// Once the code successfully parses the comprehensive test suite, we should
	// refactor this code to be more idiomatic.

	// Steps 1-2
	if current := p.oe.top(); current.Data == tagName && p.afe.index(current) == -1 {
		p.oe.pop()
		return
	}

	// Steps 3-5. The outer loop.
	for i := 0; i < 8; i++ {
		// Step 6. Find the formatting element.
		var formattingElement *Node
		for j := len(p.afe) - 1; j >= 0; j-- {
			if p.afe[j].Type == scopeMarkerNode {
				break
			}
			if p.afe[j].DataAtom == tagAtom {
				formattingElement = p.afe[j]
				break
			}
		}
		if formattingElement == nil {
			p.inBodyEndTagOther(tagAtom, tagName)
			return
		}

		// Step 7. Ignore the tag if formatting element is not in the stack of open elements.
		feIndex := p.oe.index(formattingElement)
		if feIndex == -1 {
			p.afe.remove(formattingElement)
			return
		}
		// Step 8. Ignore the tag if formatting element is not in the scope.
		if !p.elementInScope(defaultScope, tagAtom) {
			// Ignore the tag.
			return
		}

		// Step 9. This step is omitted because it's just a parse error but no need to return.

		// Steps 10-11. Find the furthest block.
		var furthestBlock *Node
		for _, e := range p.oe[feIndex:] {
			if isSpecialElement(e) {
				furthestBlock = e
				break
			}
		}
		if furthestBlock == nil {
			// No furthest block: pop up to and including the formatting
			// element and drop it from the active formatting elements.
			e := p.oe.pop()
			for e != formattingElement {
				e = p.oe.pop()
			}
			p.afe.remove(e)
			return
		}

		// Steps 12-13. Find the common ancestor and bookmark node.
		commonAncestor := p.oe[feIndex-1]
		bookmark := p.afe.index(formattingElement)

		// Step 14. The inner loop. Find the lastNode to reparent.
		lastNode := furthestBlock
		node := furthestBlock
		x := p.oe.index(node)
		// Step 14.1.
		j := 0
		for {
			// Step 14.2.
			j++
			// Step 14.3.
			x--
			node = p.oe[x]
			// Step 14.4. Go to the next step if node is formatting element.
			if node == formattingElement {
				break
			}
			// Step 14.5. Remove node from the list of active formatting elements if
			// inner loop counter is greater than three and node is in the list of
			// active formatting elements.
			if ni := p.afe.index(node); j > 3 && ni > -1 {
				p.afe.remove(node)
				// If any element of the list of active formatting elements is removed,
				// we need to take care whether bookmark should be decremented or not.
				// This is because the value of bookmark may exceed the size of the
				// list by removing elements from the list.
				if ni <= bookmark {
					bookmark--
				}
				continue
			}
			// Step 14.6. Continue the next inner loop if node is not in the list of
			// active formatting elements.
			if p.afe.index(node) == -1 {
				p.oe.remove(node)
				continue
			}
			// Step 14.7.
			clone := node.clone()
			p.afe[p.afe.index(node)] = clone
			p.oe[p.oe.index(node)] = clone
			node = clone
			// Step 14.8.
			if lastNode == furthestBlock {
				bookmark = p.afe.index(node) + 1
			}
			// Step 14.9.
			if lastNode.Parent != nil {
				lastNode.Parent.RemoveChild(lastNode)
			}
			node.AppendChild(lastNode)
			// Step 14.10.
			lastNode = node
		}

		// Step 15. Reparent lastNode to the common ancestor,
		// or for misnested table nodes, to the foster parent.
		if lastNode.Parent != nil {
			lastNode.Parent.RemoveChild(lastNode)
		}
		switch commonAncestor.DataAtom {
		case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr:
			p.fosterParent(lastNode)
		default:
			commonAncestor.AppendChild(lastNode)
		}

		// Steps 16-18. Reparent nodes from the furthest block's children
		// to a clone of the formatting element.
		clone := formattingElement.clone()
		reparentChildren(clone, furthestBlock)
		furthestBlock.AppendChild(clone)

		// Step 19. Fix up the list of active formatting elements.
		if oldLoc := p.afe.index(formattingElement); oldLoc != -1 && oldLoc < bookmark {
			// Move the bookmark with the rest of the list.
			bookmark--
		}
		p.afe.remove(formattingElement)
		p.afe.insert(bookmark, clone)

		// Step 20. Fix up the stack of open elements.
		p.oe.remove(formattingElement)
		p.oe.insert(p.oe.index(furthestBlock)+1, clone)
	}
}

// inBodyEndTagOther performs the "any other end tag" algorithm for inBodyIM.
// "Any other end tag" handling from 12.2.6.5 The rules for parsing tokens in foreign content
// https://html.spec.whatwg.org/multipage/syntax.html#parsing-main-inforeign
func (p *parser) inBodyEndTagOther(tagAtom a.Atom, tagName string) {
	for i := len(p.oe) - 1; i >= 0; i-- {
		// Two element nodes have the same tag if they have the same Data (a
		// string-typed field). As an optimization, for common HTML tags, each
		// Data string is assigned a unique, non-zero DataAtom (a uint32-typed
		// field), since integer comparison is faster than string comparison.
		// Uncommon (custom) tags get a zero DataAtom.
		//
		// The if condition here is equivalent to (p.oe[i].Data == tagName).
		if (p.oe[i].DataAtom == tagAtom) &&
			((tagAtom != 0) || (p.oe[i].Data == tagName)) {
			p.oe = p.oe[:i]
			break
		}
		// Stop searching at the first special element.
		if isSpecialElement(p.oe[i]) {
			break
		}
	}
}

// Section 12.2.6.4.8.
func textIM(p *parser) bool {
	switch p.tok.Type {
	case ErrorToken:
		p.oe.pop()
	case TextToken:
		d := p.tok.Data
		if n := p.oe.top(); n.DataAtom == a.Textarea && n.FirstChild == nil {
			// Ignore a newline at the start of a <textarea> block.
1377 if d != "" && d[0] == '\r' { 1378 d = d[1:] 1379 } 1380 if d != "" && d[0] == '\n' { 1381 d = d[1:] 1382 } 1383 } 1384 if d == "" { 1385 return true 1386 } 1387 p.addText(d) 1388 return true 1389 case EndTagToken: 1390 p.oe.pop() 1391 } 1392 p.im = p.originalIM 1393 p.originalIM = nil 1394 return p.tok.Type == EndTagToken 1395} 1396 1397// Section 12.2.6.4.9. 1398func inTableIM(p *parser) bool { 1399 switch p.tok.Type { 1400 case TextToken: 1401 p.tok.Data = strings.Replace(p.tok.Data, "\x00", "", -1) 1402 switch p.oe.top().DataAtom { 1403 case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr: 1404 if strings.Trim(p.tok.Data, whitespace) == "" { 1405 p.addText(p.tok.Data) 1406 return true 1407 } 1408 } 1409 case StartTagToken: 1410 switch p.tok.DataAtom { 1411 case a.Caption: 1412 p.clearStackToContext(tableScope) 1413 p.afe = append(p.afe, &scopeMarker) 1414 p.addElement() 1415 p.im = inCaptionIM 1416 return true 1417 case a.Colgroup: 1418 p.clearStackToContext(tableScope) 1419 p.addElement() 1420 p.im = inColumnGroupIM 1421 return true 1422 case a.Col: 1423 p.parseImpliedToken(StartTagToken, a.Colgroup, a.Colgroup.String()) 1424 return false 1425 case a.Tbody, a.Tfoot, a.Thead: 1426 p.clearStackToContext(tableScope) 1427 p.addElement() 1428 p.im = inTableBodyIM 1429 return true 1430 case a.Td, a.Th, a.Tr: 1431 p.parseImpliedToken(StartTagToken, a.Tbody, a.Tbody.String()) 1432 return false 1433 case a.Table: 1434 if p.popUntil(tableScope, a.Table) { 1435 p.resetInsertionMode() 1436 return false 1437 } 1438 // Ignore the token. 1439 return true 1440 case a.Style, a.Script, a.Template: 1441 return inHeadIM(p) 1442 case a.Input: 1443 for _, t := range p.tok.Attr { 1444 if t.Key == "type" && strings.ToLower(t.Val) == "hidden" { 1445 p.addElement() 1446 p.oe.pop() 1447 return true 1448 } 1449 } 1450 // Otherwise drop down to the default action. 1451 case a.Form: 1452 if p.oe.contains(a.Template) || p.form != nil { 1453 // Ignore the token. 
1454 return true 1455 } 1456 p.addElement() 1457 p.form = p.oe.pop() 1458 case a.Select: 1459 p.reconstructActiveFormattingElements() 1460 switch p.top().DataAtom { 1461 case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr: 1462 p.fosterParenting = true 1463 } 1464 p.addElement() 1465 p.fosterParenting = false 1466 p.framesetOK = false 1467 p.im = inSelectInTableIM 1468 return true 1469 } 1470 case EndTagToken: 1471 switch p.tok.DataAtom { 1472 case a.Table: 1473 if p.popUntil(tableScope, a.Table) { 1474 p.resetInsertionMode() 1475 return true 1476 } 1477 // Ignore the token. 1478 return true 1479 case a.Body, a.Caption, a.Col, a.Colgroup, a.Html, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr: 1480 // Ignore the token. 1481 return true 1482 case a.Template: 1483 return inHeadIM(p) 1484 } 1485 case CommentToken: 1486 p.addChild(&Node{ 1487 Type: CommentNode, 1488 Data: p.tok.Data, 1489 }) 1490 return true 1491 case DoctypeToken: 1492 // Ignore the token. 1493 return true 1494 case ErrorToken: 1495 return inBodyIM(p) 1496 } 1497 1498 p.fosterParenting = true 1499 defer func() { p.fosterParenting = false }() 1500 1501 return inBodyIM(p) 1502} 1503 1504// Section 12.2.6.4.11. 1505func inCaptionIM(p *parser) bool { 1506 switch p.tok.Type { 1507 case StartTagToken: 1508 switch p.tok.DataAtom { 1509 case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Td, a.Tfoot, a.Thead, a.Tr: 1510 if !p.popUntil(tableScope, a.Caption) { 1511 // Ignore the token. 
1512 return true 1513 } 1514 p.clearActiveFormattingElements() 1515 p.im = inTableIM 1516 return false 1517 case a.Select: 1518 p.reconstructActiveFormattingElements() 1519 p.addElement() 1520 p.framesetOK = false 1521 p.im = inSelectInTableIM 1522 return true 1523 } 1524 case EndTagToken: 1525 switch p.tok.DataAtom { 1526 case a.Caption: 1527 if p.popUntil(tableScope, a.Caption) { 1528 p.clearActiveFormattingElements() 1529 p.im = inTableIM 1530 } 1531 return true 1532 case a.Table: 1533 if !p.popUntil(tableScope, a.Caption) { 1534 // Ignore the token. 1535 return true 1536 } 1537 p.clearActiveFormattingElements() 1538 p.im = inTableIM 1539 return false 1540 case a.Body, a.Col, a.Colgroup, a.Html, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr: 1541 // Ignore the token. 1542 return true 1543 } 1544 } 1545 return inBodyIM(p) 1546} 1547 1548// Section 12.2.6.4.12. 1549func inColumnGroupIM(p *parser) bool { 1550 switch p.tok.Type { 1551 case TextToken: 1552 s := strings.TrimLeft(p.tok.Data, whitespace) 1553 if len(s) < len(p.tok.Data) { 1554 // Add the initial whitespace to the current node. 1555 p.addText(p.tok.Data[:len(p.tok.Data)-len(s)]) 1556 if s == "" { 1557 return true 1558 } 1559 p.tok.Data = s 1560 } 1561 case CommentToken: 1562 p.addChild(&Node{ 1563 Type: CommentNode, 1564 Data: p.tok.Data, 1565 }) 1566 return true 1567 case DoctypeToken: 1568 // Ignore the token. 1569 return true 1570 case StartTagToken: 1571 switch p.tok.DataAtom { 1572 case a.Html: 1573 return inBodyIM(p) 1574 case a.Col: 1575 p.addElement() 1576 p.oe.pop() 1577 p.acknowledgeSelfClosingTag() 1578 return true 1579 case a.Template: 1580 return inHeadIM(p) 1581 } 1582 case EndTagToken: 1583 switch p.tok.DataAtom { 1584 case a.Colgroup: 1585 if p.oe.top().DataAtom == a.Colgroup { 1586 p.oe.pop() 1587 p.im = inTableIM 1588 } 1589 return true 1590 case a.Col: 1591 // Ignore the token. 
1592 return true 1593 case a.Template: 1594 return inHeadIM(p) 1595 } 1596 case ErrorToken: 1597 return inBodyIM(p) 1598 } 1599 if p.oe.top().DataAtom != a.Colgroup { 1600 return true 1601 } 1602 p.oe.pop() 1603 p.im = inTableIM 1604 return false 1605} 1606 1607// Section 12.2.6.4.13. 1608func inTableBodyIM(p *parser) bool { 1609 switch p.tok.Type { 1610 case StartTagToken: 1611 switch p.tok.DataAtom { 1612 case a.Tr: 1613 p.clearStackToContext(tableBodyScope) 1614 p.addElement() 1615 p.im = inRowIM 1616 return true 1617 case a.Td, a.Th: 1618 p.parseImpliedToken(StartTagToken, a.Tr, a.Tr.String()) 1619 return false 1620 case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Tfoot, a.Thead: 1621 if p.popUntil(tableScope, a.Tbody, a.Thead, a.Tfoot) { 1622 p.im = inTableIM 1623 return false 1624 } 1625 // Ignore the token. 1626 return true 1627 } 1628 case EndTagToken: 1629 switch p.tok.DataAtom { 1630 case a.Tbody, a.Tfoot, a.Thead: 1631 if p.elementInScope(tableScope, p.tok.DataAtom) { 1632 p.clearStackToContext(tableBodyScope) 1633 p.oe.pop() 1634 p.im = inTableIM 1635 } 1636 return true 1637 case a.Table: 1638 if p.popUntil(tableScope, a.Tbody, a.Thead, a.Tfoot) { 1639 p.im = inTableIM 1640 return false 1641 } 1642 // Ignore the token. 1643 return true 1644 case a.Body, a.Caption, a.Col, a.Colgroup, a.Html, a.Td, a.Th, a.Tr: 1645 // Ignore the token. 1646 return true 1647 } 1648 case CommentToken: 1649 p.addChild(&Node{ 1650 Type: CommentNode, 1651 Data: p.tok.Data, 1652 }) 1653 return true 1654 } 1655 1656 return inTableIM(p) 1657} 1658 1659// Section 12.2.6.4.14. 
1660func inRowIM(p *parser) bool { 1661 switch p.tok.Type { 1662 case StartTagToken: 1663 switch p.tok.DataAtom { 1664 case a.Td, a.Th: 1665 p.clearStackToContext(tableRowScope) 1666 p.addElement() 1667 p.afe = append(p.afe, &scopeMarker) 1668 p.im = inCellIM 1669 return true 1670 case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Tfoot, a.Thead, a.Tr: 1671 if p.popUntil(tableScope, a.Tr) { 1672 p.im = inTableBodyIM 1673 return false 1674 } 1675 // Ignore the token. 1676 return true 1677 } 1678 case EndTagToken: 1679 switch p.tok.DataAtom { 1680 case a.Tr: 1681 if p.popUntil(tableScope, a.Tr) { 1682 p.im = inTableBodyIM 1683 return true 1684 } 1685 // Ignore the token. 1686 return true 1687 case a.Table: 1688 if p.popUntil(tableScope, a.Tr) { 1689 p.im = inTableBodyIM 1690 return false 1691 } 1692 // Ignore the token. 1693 return true 1694 case a.Tbody, a.Tfoot, a.Thead: 1695 if p.elementInScope(tableScope, p.tok.DataAtom) { 1696 p.parseImpliedToken(EndTagToken, a.Tr, a.Tr.String()) 1697 return false 1698 } 1699 // Ignore the token. 1700 return true 1701 case a.Body, a.Caption, a.Col, a.Colgroup, a.Html, a.Td, a.Th: 1702 // Ignore the token. 1703 return true 1704 } 1705 } 1706 1707 return inTableIM(p) 1708} 1709 1710// Section 12.2.6.4.15. 1711func inCellIM(p *parser) bool { 1712 switch p.tok.Type { 1713 case StartTagToken: 1714 switch p.tok.DataAtom { 1715 case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr: 1716 if p.popUntil(tableScope, a.Td, a.Th) { 1717 // Close the cell and reprocess. 1718 p.clearActiveFormattingElements() 1719 p.im = inRowIM 1720 return false 1721 } 1722 // Ignore the token. 1723 return true 1724 case a.Select: 1725 p.reconstructActiveFormattingElements() 1726 p.addElement() 1727 p.framesetOK = false 1728 p.im = inSelectInTableIM 1729 return true 1730 } 1731 case EndTagToken: 1732 switch p.tok.DataAtom { 1733 case a.Td, a.Th: 1734 if !p.popUntil(tableScope, p.tok.DataAtom) { 1735 // Ignore the token. 
1736 return true 1737 } 1738 p.clearActiveFormattingElements() 1739 p.im = inRowIM 1740 return true 1741 case a.Body, a.Caption, a.Col, a.Colgroup, a.Html: 1742 // Ignore the token. 1743 return true 1744 case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr: 1745 if !p.elementInScope(tableScope, p.tok.DataAtom) { 1746 // Ignore the token. 1747 return true 1748 } 1749 // Close the cell and reprocess. 1750 if p.popUntil(tableScope, a.Td, a.Th) { 1751 p.clearActiveFormattingElements() 1752 } 1753 p.im = inRowIM 1754 return false 1755 } 1756 } 1757 return inBodyIM(p) 1758} 1759 1760// Section 12.2.6.4.16. 1761func inSelectIM(p *parser) bool { 1762 switch p.tok.Type { 1763 case TextToken: 1764 p.addText(strings.Replace(p.tok.Data, "\x00", "", -1)) 1765 case StartTagToken: 1766 switch p.tok.DataAtom { 1767 case a.Html: 1768 return inBodyIM(p) 1769 case a.Option: 1770 if p.top().DataAtom == a.Option { 1771 p.oe.pop() 1772 } 1773 p.addElement() 1774 case a.Optgroup: 1775 if p.top().DataAtom == a.Option { 1776 p.oe.pop() 1777 } 1778 if p.top().DataAtom == a.Optgroup { 1779 p.oe.pop() 1780 } 1781 p.addElement() 1782 case a.Select: 1783 if !p.popUntil(selectScope, a.Select) { 1784 // Ignore the token. 1785 return true 1786 } 1787 p.resetInsertionMode() 1788 case a.Input, a.Keygen, a.Textarea: 1789 if p.elementInScope(selectScope, a.Select) { 1790 p.parseImpliedToken(EndTagToken, a.Select, a.Select.String()) 1791 return false 1792 } 1793 // In order to properly ignore <textarea>, we need to change the tokenizer mode. 1794 p.tokenizer.NextIsNotRawText() 1795 // Ignore the token. 1796 return true 1797 case a.Script, a.Template: 1798 return inHeadIM(p) 1799 case a.Iframe, a.Noembed, a.Noframes, a.Noscript, a.Plaintext, a.Style, a.Title, a.Xmp: 1800 // Don't let the tokenizer go into raw text mode when there are raw tags 1801 // to be ignored. These tags should be ignored from the tokenizer 1802 // properly. 1803 p.tokenizer.NextIsNotRawText() 1804 // Ignore the token. 
1805 return true 1806 } 1807 case EndTagToken: 1808 switch p.tok.DataAtom { 1809 case a.Option: 1810 if p.top().DataAtom == a.Option { 1811 p.oe.pop() 1812 } 1813 case a.Optgroup: 1814 i := len(p.oe) - 1 1815 if p.oe[i].DataAtom == a.Option { 1816 i-- 1817 } 1818 if p.oe[i].DataAtom == a.Optgroup { 1819 p.oe = p.oe[:i] 1820 } 1821 case a.Select: 1822 if !p.popUntil(selectScope, a.Select) { 1823 // Ignore the token. 1824 return true 1825 } 1826 p.resetInsertionMode() 1827 case a.Template: 1828 return inHeadIM(p) 1829 } 1830 case CommentToken: 1831 p.addChild(&Node{ 1832 Type: CommentNode, 1833 Data: p.tok.Data, 1834 }) 1835 case DoctypeToken: 1836 // Ignore the token. 1837 return true 1838 case ErrorToken: 1839 return inBodyIM(p) 1840 } 1841 1842 return true 1843} 1844 1845// Section 12.2.6.4.17. 1846func inSelectInTableIM(p *parser) bool { 1847 switch p.tok.Type { 1848 case StartTagToken, EndTagToken: 1849 switch p.tok.DataAtom { 1850 case a.Caption, a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr, a.Td, a.Th: 1851 if p.tok.Type == EndTagToken && !p.elementInScope(tableScope, p.tok.DataAtom) { 1852 // Ignore the token. 1853 return true 1854 } 1855 // This is like p.popUntil(selectScope, a.Select), but it also 1856 // matches <math select>, not just <select>. Matching the MathML 1857 // tag is arguably incorrect (conceptually), but it mimics what 1858 // Chromium does. 1859 for i := len(p.oe) - 1; i >= 0; i-- { 1860 if n := p.oe[i]; n.DataAtom == a.Select { 1861 p.oe = p.oe[:i] 1862 break 1863 } 1864 } 1865 p.resetInsertionMode() 1866 return false 1867 } 1868 } 1869 return inSelectIM(p) 1870} 1871 1872// Section 12.2.6.4.18. 
1873func inTemplateIM(p *parser) bool { 1874 switch p.tok.Type { 1875 case TextToken, CommentToken, DoctypeToken: 1876 return inBodyIM(p) 1877 case StartTagToken: 1878 switch p.tok.DataAtom { 1879 case a.Base, a.Basefont, a.Bgsound, a.Link, a.Meta, a.Noframes, a.Script, a.Style, a.Template, a.Title: 1880 return inHeadIM(p) 1881 case a.Caption, a.Colgroup, a.Tbody, a.Tfoot, a.Thead: 1882 p.templateStack.pop() 1883 p.templateStack = append(p.templateStack, inTableIM) 1884 p.im = inTableIM 1885 return false 1886 case a.Col: 1887 p.templateStack.pop() 1888 p.templateStack = append(p.templateStack, inColumnGroupIM) 1889 p.im = inColumnGroupIM 1890 return false 1891 case a.Tr: 1892 p.templateStack.pop() 1893 p.templateStack = append(p.templateStack, inTableBodyIM) 1894 p.im = inTableBodyIM 1895 return false 1896 case a.Td, a.Th: 1897 p.templateStack.pop() 1898 p.templateStack = append(p.templateStack, inRowIM) 1899 p.im = inRowIM 1900 return false 1901 default: 1902 p.templateStack.pop() 1903 p.templateStack = append(p.templateStack, inBodyIM) 1904 p.im = inBodyIM 1905 return false 1906 } 1907 case EndTagToken: 1908 switch p.tok.DataAtom { 1909 case a.Template: 1910 return inHeadIM(p) 1911 default: 1912 // Ignore the token. 1913 return true 1914 } 1915 case ErrorToken: 1916 if !p.oe.contains(a.Template) { 1917 // Ignore the token. 1918 return true 1919 } 1920 // TODO: remove this divergence from the HTML5 spec. 1921 // 1922 // See https://bugs.chromium.org/p/chromium/issues/detail?id=829668 1923 p.generateImpliedEndTags() 1924 for i := len(p.oe) - 1; i >= 0; i-- { 1925 if n := p.oe[i]; n.Namespace == "" && n.DataAtom == a.Template { 1926 p.oe = p.oe[:i] 1927 break 1928 } 1929 } 1930 p.clearActiveFormattingElements() 1931 p.templateStack.pop() 1932 p.resetInsertionMode() 1933 return false 1934 } 1935 return false 1936} 1937 1938// Section 12.2.6.4.19. 1939func afterBodyIM(p *parser) bool { 1940 switch p.tok.Type { 1941 case ErrorToken: 1942 // Stop parsing. 
1943 return true 1944 case TextToken: 1945 s := strings.TrimLeft(p.tok.Data, whitespace) 1946 if len(s) == 0 { 1947 // It was all whitespace. 1948 return inBodyIM(p) 1949 } 1950 case StartTagToken: 1951 if p.tok.DataAtom == a.Html { 1952 return inBodyIM(p) 1953 } 1954 case EndTagToken: 1955 if p.tok.DataAtom == a.Html { 1956 if !p.fragment { 1957 p.im = afterAfterBodyIM 1958 } 1959 return true 1960 } 1961 case CommentToken: 1962 // The comment is attached to the <html> element. 1963 if len(p.oe) < 1 || p.oe[0].DataAtom != a.Html { 1964 panic("html: bad parser state: <html> element not found, in the after-body insertion mode") 1965 } 1966 p.oe[0].AppendChild(&Node{ 1967 Type: CommentNode, 1968 Data: p.tok.Data, 1969 }) 1970 return true 1971 } 1972 p.im = inBodyIM 1973 return false 1974} 1975 1976// Section 12.2.6.4.20. 1977func inFramesetIM(p *parser) bool { 1978 switch p.tok.Type { 1979 case CommentToken: 1980 p.addChild(&Node{ 1981 Type: CommentNode, 1982 Data: p.tok.Data, 1983 }) 1984 case TextToken: 1985 // Ignore all text but whitespace. 1986 s := strings.Map(func(c rune) rune { 1987 switch c { 1988 case ' ', '\t', '\n', '\f', '\r': 1989 return c 1990 } 1991 return -1 1992 }, p.tok.Data) 1993 if s != "" { 1994 p.addText(s) 1995 } 1996 case StartTagToken: 1997 switch p.tok.DataAtom { 1998 case a.Html: 1999 return inBodyIM(p) 2000 case a.Frameset: 2001 p.addElement() 2002 case a.Frame: 2003 p.addElement() 2004 p.oe.pop() 2005 p.acknowledgeSelfClosingTag() 2006 case a.Noframes: 2007 return inHeadIM(p) 2008 } 2009 case EndTagToken: 2010 switch p.tok.DataAtom { 2011 case a.Frameset: 2012 if p.oe.top().DataAtom != a.Html { 2013 p.oe.pop() 2014 if p.oe.top().DataAtom != a.Frameset { 2015 p.im = afterFramesetIM 2016 return true 2017 } 2018 } 2019 } 2020 default: 2021 // Ignore the token. 2022 } 2023 return true 2024} 2025 2026// Section 12.2.6.4.21. 
2027func afterFramesetIM(p *parser) bool { 2028 switch p.tok.Type { 2029 case CommentToken: 2030 p.addChild(&Node{ 2031 Type: CommentNode, 2032 Data: p.tok.Data, 2033 }) 2034 case TextToken: 2035 // Ignore all text but whitespace. 2036 s := strings.Map(func(c rune) rune { 2037 switch c { 2038 case ' ', '\t', '\n', '\f', '\r': 2039 return c 2040 } 2041 return -1 2042 }, p.tok.Data) 2043 if s != "" { 2044 p.addText(s) 2045 } 2046 case StartTagToken: 2047 switch p.tok.DataAtom { 2048 case a.Html: 2049 return inBodyIM(p) 2050 case a.Noframes: 2051 return inHeadIM(p) 2052 } 2053 case EndTagToken: 2054 switch p.tok.DataAtom { 2055 case a.Html: 2056 p.im = afterAfterFramesetIM 2057 return true 2058 } 2059 default: 2060 // Ignore the token. 2061 } 2062 return true 2063} 2064 2065// Section 12.2.6.4.22. 2066func afterAfterBodyIM(p *parser) bool { 2067 switch p.tok.Type { 2068 case ErrorToken: 2069 // Stop parsing. 2070 return true 2071 case TextToken: 2072 s := strings.TrimLeft(p.tok.Data, whitespace) 2073 if len(s) == 0 { 2074 // It was all whitespace. 2075 return inBodyIM(p) 2076 } 2077 case StartTagToken: 2078 if p.tok.DataAtom == a.Html { 2079 return inBodyIM(p) 2080 } 2081 case CommentToken: 2082 p.doc.AppendChild(&Node{ 2083 Type: CommentNode, 2084 Data: p.tok.Data, 2085 }) 2086 return true 2087 case DoctypeToken: 2088 return inBodyIM(p) 2089 } 2090 p.im = inBodyIM 2091 return false 2092} 2093 2094// Section 12.2.6.4.23. 2095func afterAfterFramesetIM(p *parser) bool { 2096 switch p.tok.Type { 2097 case CommentToken: 2098 p.doc.AppendChild(&Node{ 2099 Type: CommentNode, 2100 Data: p.tok.Data, 2101 }) 2102 case TextToken: 2103 // Ignore all text but whitespace. 
2104 s := strings.Map(func(c rune) rune { 2105 switch c { 2106 case ' ', '\t', '\n', '\f', '\r': 2107 return c 2108 } 2109 return -1 2110 }, p.tok.Data) 2111 if s != "" { 2112 p.tok.Data = s 2113 return inBodyIM(p) 2114 } 2115 case StartTagToken: 2116 switch p.tok.DataAtom { 2117 case a.Html: 2118 return inBodyIM(p) 2119 case a.Noframes: 2120 return inHeadIM(p) 2121 } 2122 case DoctypeToken: 2123 return inBodyIM(p) 2124 default: 2125 // Ignore the token. 2126 } 2127 return true 2128} 2129 2130const whitespaceOrNUL = whitespace + "\x00" 2131 2132// Section 12.2.6.5 2133func parseForeignContent(p *parser) bool { 2134 switch p.tok.Type { 2135 case TextToken: 2136 if p.framesetOK { 2137 p.framesetOK = strings.TrimLeft(p.tok.Data, whitespaceOrNUL) == "" 2138 } 2139 p.tok.Data = strings.Replace(p.tok.Data, "\x00", "\ufffd", -1) 2140 p.addText(p.tok.Data) 2141 case CommentToken: 2142 p.addChild(&Node{ 2143 Type: CommentNode, 2144 Data: p.tok.Data, 2145 }) 2146 case StartTagToken: 2147 if !p.fragment { 2148 b := breakout[p.tok.Data] 2149 if p.tok.DataAtom == a.Font { 2150 loop: 2151 for _, attr := range p.tok.Attr { 2152 switch attr.Key { 2153 case "color", "face", "size": 2154 b = true 2155 break loop 2156 } 2157 } 2158 } 2159 if b { 2160 for i := len(p.oe) - 1; i >= 0; i-- { 2161 n := p.oe[i] 2162 if n.Namespace == "" || htmlIntegrationPoint(n) || mathMLTextIntegrationPoint(n) { 2163 p.oe = p.oe[:i+1] 2164 break 2165 } 2166 } 2167 return false 2168 } 2169 } 2170 current := p.adjustedCurrentNode() 2171 switch current.Namespace { 2172 case "math": 2173 adjustAttributeNames(p.tok.Attr, mathMLAttributeAdjustments) 2174 case "svg": 2175 // Adjust SVG tag names. The tokenizer lower-cases tag names, but 2176 // SVG wants e.g. "foreignObject" with a capital second "O". 
2177 if x := svgTagNameAdjustments[p.tok.Data]; x != "" { 2178 p.tok.DataAtom = a.Lookup([]byte(x)) 2179 p.tok.Data = x 2180 } 2181 adjustAttributeNames(p.tok.Attr, svgAttributeAdjustments) 2182 default: 2183 panic("html: bad parser state: unexpected namespace") 2184 } 2185 adjustForeignAttributes(p.tok.Attr) 2186 namespace := current.Namespace 2187 p.addElement() 2188 p.top().Namespace = namespace 2189 if namespace != "" { 2190 // Don't let the tokenizer go into raw text mode in foreign content 2191 // (e.g. in an SVG <title> tag). 2192 p.tokenizer.NextIsNotRawText() 2193 } 2194 if p.hasSelfClosingToken { 2195 p.oe.pop() 2196 p.acknowledgeSelfClosingTag() 2197 } 2198 case EndTagToken: 2199 for i := len(p.oe) - 1; i >= 0; i-- { 2200 if p.oe[i].Namespace == "" { 2201 return p.im(p) 2202 } 2203 if strings.EqualFold(p.oe[i].Data, p.tok.Data) { 2204 p.oe = p.oe[:i] 2205 break 2206 } 2207 } 2208 return true 2209 default: 2210 // Ignore the token. 2211 } 2212 return true 2213} 2214 2215// Section 12.2.4.2. 2216func (p *parser) adjustedCurrentNode() *Node { 2217 if len(p.oe) == 1 && p.fragment && p.context != nil { 2218 return p.context 2219 } 2220 return p.oe.top() 2221} 2222 2223// Section 12.2.6. 
2224func (p *parser) inForeignContent() bool { 2225 if len(p.oe) == 0 { 2226 return false 2227 } 2228 n := p.adjustedCurrentNode() 2229 if n.Namespace == "" { 2230 return false 2231 } 2232 if mathMLTextIntegrationPoint(n) { 2233 if p.tok.Type == StartTagToken && p.tok.DataAtom != a.Mglyph && p.tok.DataAtom != a.Malignmark { 2234 return false 2235 } 2236 if p.tok.Type == TextToken { 2237 return false 2238 } 2239 } 2240 if n.Namespace == "math" && n.DataAtom == a.AnnotationXml && p.tok.Type == StartTagToken && p.tok.DataAtom == a.Svg { 2241 return false 2242 } 2243 if htmlIntegrationPoint(n) && (p.tok.Type == StartTagToken || p.tok.Type == TextToken) { 2244 return false 2245 } 2246 if p.tok.Type == ErrorToken { 2247 return false 2248 } 2249 return true 2250} 2251 2252// parseImpliedToken parses a token as though it had appeared in the parser's 2253// input. 2254func (p *parser) parseImpliedToken(t TokenType, dataAtom a.Atom, data string) { 2255 realToken, selfClosing := p.tok, p.hasSelfClosingToken 2256 p.tok = Token{ 2257 Type: t, 2258 DataAtom: dataAtom, 2259 Data: data, 2260 } 2261 p.hasSelfClosingToken = false 2262 p.parseCurrentToken() 2263 p.tok, p.hasSelfClosingToken = realToken, selfClosing 2264} 2265 2266// parseCurrentToken runs the current token through the parsing routines 2267// until it is consumed. 2268func (p *parser) parseCurrentToken() { 2269 if p.tok.Type == SelfClosingTagToken { 2270 p.hasSelfClosingToken = true 2271 p.tok.Type = StartTagToken 2272 } 2273 2274 consumed := false 2275 for !consumed { 2276 if p.inForeignContent() { 2277 consumed = parseForeignContent(p) 2278 } else { 2279 consumed = p.im(p) 2280 } 2281 } 2282 2283 if p.hasSelfClosingToken { 2284 // This is a parse error, but ignore it. 2285 p.hasSelfClosingToken = false 2286 } 2287} 2288 2289func (p *parser) parse() error { 2290 // Iterate until EOF. Any other error will cause an early return. 
2291 var err error 2292 for err != io.EOF { 2293 // CDATA sections are allowed only in foreign content. 2294 n := p.oe.top() 2295 p.tokenizer.AllowCDATA(n != nil && n.Namespace != "") 2296 // Read and parse the next token. 2297 p.tokenizer.Next() 2298 p.tok = p.tokenizer.Token() 2299 if p.tok.Type == ErrorToken { 2300 err = p.tokenizer.Err() 2301 if err != nil && err != io.EOF { 2302 return err 2303 } 2304 } 2305 p.parseCurrentToken() 2306 } 2307 return nil 2308} 2309 2310// Parse returns the parse tree for the HTML from the given Reader. 2311// 2312// It implements the HTML5 parsing algorithm 2313// (https://html.spec.whatwg.org/multipage/syntax.html#tree-construction), 2314// which is very complicated. The resultant tree can contain implicitly created 2315// nodes that have no explicit <tag> listed in r's data, and nodes' parents can 2316// differ from the nesting implied by a naive processing of start and end 2317// <tag>s. Conversely, explicit <tag>s in r's data can be silently dropped, 2318// with no corresponding node in the resulting tree. 2319// 2320// The input is assumed to be UTF-8 encoded. 2321func Parse(r io.Reader) (*Node, error) { 2322 return ParseWithOptions(r) 2323} 2324 2325// ParseFragment parses a fragment of HTML and returns the nodes that were 2326// found. If the fragment is the InnerHTML for an existing element, pass that 2327// element in context. 2328// 2329// It has the same intricacies as Parse. 2330func ParseFragment(r io.Reader, context *Node) ([]*Node, error) { 2331 return ParseFragmentWithOptions(r, context) 2332} 2333 2334// ParseOption configures a parser. 2335type ParseOption func(p *parser) 2336 2337// ParseOptionEnableScripting configures the scripting flag. 2338// https://html.spec.whatwg.org/multipage/webappapis.html#enabling-and-disabling-scripting 2339// 2340// By default, scripting is enabled. 
2341func ParseOptionEnableScripting(enable bool) ParseOption { 2342 return func(p *parser) { 2343 p.scripting = enable 2344 } 2345} 2346 2347// ParseWithOptions is like Parse, with options. 2348func ParseWithOptions(r io.Reader, opts ...ParseOption) (*Node, error) { 2349 p := &parser{ 2350 tokenizer: NewTokenizer(r), 2351 doc: &Node{ 2352 Type: DocumentNode, 2353 }, 2354 scripting: true, 2355 framesetOK: true, 2356 im: initialIM, 2357 } 2358 2359 for _, f := range opts { 2360 f(p) 2361 } 2362 2363 if err := p.parse(); err != nil { 2364 return nil, err 2365 } 2366 return p.doc, nil 2367} 2368 2369// ParseFragmentWithOptions is like ParseFragment, with options. 2370func ParseFragmentWithOptions(r io.Reader, context *Node, opts ...ParseOption) ([]*Node, error) { 2371 contextTag := "" 2372 if context != nil { 2373 if context.Type != ElementNode { 2374 return nil, errors.New("html: ParseFragment of non-element Node") 2375 } 2376 // The next check isn't just context.DataAtom.String() == context.Data because 2377 // it is valid to pass an element whose tag isn't a known atom. For example, 2378 // DataAtom == 0 and Data = "tagfromthefuture" is perfectly consistent. 
2379 if context.DataAtom != a.Lookup([]byte(context.Data)) { 2380 return nil, fmt.Errorf("html: inconsistent Node: DataAtom=%q, Data=%q", context.DataAtom, context.Data) 2381 } 2382 contextTag = context.DataAtom.String() 2383 } 2384 p := &parser{ 2385 doc: &Node{ 2386 Type: DocumentNode, 2387 }, 2388 scripting: true, 2389 fragment: true, 2390 context: context, 2391 } 2392 if context != nil && context.Namespace != "" { 2393 p.tokenizer = NewTokenizer(r) 2394 } else { 2395 p.tokenizer = NewTokenizerFragment(r, contextTag) 2396 } 2397 2398 for _, f := range opts { 2399 f(p) 2400 } 2401 2402 root := &Node{ 2403 Type: ElementNode, 2404 DataAtom: a.Html, 2405 Data: a.Html.String(), 2406 } 2407 p.doc.AppendChild(root) 2408 p.oe = nodeStack{root} 2409 if context != nil && context.DataAtom == a.Template { 2410 p.templateStack = append(p.templateStack, inTemplateIM) 2411 } 2412 p.resetInsertionMode() 2413 2414 for n := context; n != nil; n = n.Parent { 2415 if n.Type == ElementNode && n.DataAtom == a.Form { 2416 p.form = n 2417 break 2418 } 2419 } 2420 2421 if err := p.parse(); err != nil { 2422 return nil, err 2423 } 2424 2425 parent := p.doc 2426 if context != nil { 2427 parent = root 2428 } 2429 2430 var result []*Node 2431 for c := parent.FirstChild; c != nil; { 2432 next := c.NextSibling 2433 parent.RemoveChild(c) 2434 result = append(result, c) 2435 c = next 2436 } 2437 return result, nil 2438} 2439