1from __future__ import absolute_import, division, unicode_literals 2from pip._vendor.six import with_metaclass, viewkeys 3 4import types 5 6from . import _inputstream 7from . import _tokenizer 8 9from . import treebuilders 10from .treebuilders.base import Marker 11 12from . import _utils 13from .constants import ( 14 spaceCharacters, asciiUpper2Lower, 15 specialElements, headingElements, cdataElements, rcdataElements, 16 tokenTypes, tagTokenTypes, 17 namespaces, 18 htmlIntegrationPointElements, mathmlTextIntegrationPointElements, 19 adjustForeignAttributes as adjustForeignAttributesMap, 20 adjustMathMLAttributes, adjustSVGAttributes, 21 E, 22 _ReparseException 23) 24 25 26def parse(doc, treebuilder="etree", namespaceHTMLElements=True, **kwargs): 27 """Parse an HTML document as a string or file-like object into a tree 28 29 :arg doc: the document to parse as a string or file-like object 30 31 :arg treebuilder: the treebuilder to use when parsing 32 33 :arg namespaceHTMLElements: whether or not to namespace HTML elements 34 35 :returns: parsed tree 36 37 Example: 38 39 >>> from html5lib.html5parser import parse 40 >>> parse('<html><body><p>This is a doc</p></body></html>') 41 <Element u'{http://www.w3.org/1999/xhtml}html' at 0x7feac4909db0> 42 43 """ 44 tb = treebuilders.getTreeBuilder(treebuilder) 45 p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements) 46 return p.parse(doc, **kwargs) 47 48 49def parseFragment(doc, container="div", treebuilder="etree", namespaceHTMLElements=True, **kwargs): 50 """Parse an HTML fragment as a string or file-like object into a tree 51 52 :arg doc: the fragment to parse as a string or file-like object 53 54 :arg container: the container context to parse the fragment in 55 56 :arg treebuilder: the treebuilder to use when parsing 57 58 :arg namespaceHTMLElements: whether or not to namespace HTML elements 59 60 :returns: parsed tree 61 62 Example: 63 64 >>> from html5lib.html5libparser import parseFragment 65 >>> parseFragment('<b>this is a fragment</b>') 66 <Element u'DOCUMENT_FRAGMENT' at 0x7feac484b090> 67 68 """ 69 tb = treebuilders.getTreeBuilder(treebuilder) 70 p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements) 71 return p.parseFragment(doc, container=container, **kwargs) 72 73 74def method_decorator_metaclass(function): 75 class Decorated(type): 76 def __new__(meta, classname, bases, classDict): 77 for attributeName, attribute in classDict.items(): 78 if isinstance(attribute, types.FunctionType): 79 attribute = function(attribute) 80 81 classDict[attributeName] = attribute 82 return type.__new__(meta, classname, bases, classDict) 83 return Decorated 84 85 86class HTMLParser(object): 87 """HTML parser 88 89 Generates a tree structure from a stream of (possibly malformed) HTML. 90 91 """ 92 93 def __init__(self, tree=None, strict=False, namespaceHTMLElements=True, debug=False): 94 """ 95 :arg tree: a treebuilder class controlling the type of tree that will be 96 returned. Built in treebuilders can be accessed through 97 html5lib.treebuilders.getTreeBuilder(treeType) 98 99 :arg strict: raise an exception when a parse error is encountered 100 101 :arg namespaceHTMLElements: whether or not to namespace HTML elements 102 103 :arg debug: whether or not to enable debug mode which logs things 104 105 Example: 106 107 >>> from html5lib.html5parser import HTMLParser 108 >>> parser = HTMLParser() # generates parser with etree builder 109 >>> parser = HTMLParser('lxml', strict=True) # generates parser with lxml builder which is strict 110 111 """ 112 113 # Raise an exception on the first error encountered 114 self.strict = strict 115 116 if tree is None: 117 tree = treebuilders.getTreeBuilder("etree") 118 self.tree = tree(namespaceHTMLElements) 119 self.errors = [] 120 121 self.phases = {name: cls(self, self.tree) for name, cls in 122 getPhases(debug).items()} 123 124 def _parse(self, stream, innerHTML=False, container="div", scripting=False, **kwargs): 125 126 self.innerHTMLMode = innerHTML 127 self.container = container 128 self.scripting = scripting 129 self.tokenizer = _tokenizer.HTMLTokenizer(stream, parser=self, **kwargs) 130 self.reset() 131 132 try: 133 self.mainLoop() 134 except _ReparseException: 135 self.reset() 136 self.mainLoop() 137 138 def reset(self): 139 self.tree.reset() 140 self.firstStartTag = False 141 self.errors = [] 142 self.log = [] # only used with debug mode 143 # "quirks" / "limited quirks" / "no quirks" 144 self.compatMode = "no quirks" 145 146 if self.innerHTMLMode: 147 self.innerHTML = self.container.lower() 148 149 if self.innerHTML in cdataElements: 150 self.tokenizer.state = self.tokenizer.rcdataState 151 elif self.innerHTML in rcdataElements: 152 self.tokenizer.state = self.tokenizer.rawtextState 153 elif self.innerHTML == 'plaintext': 154 self.tokenizer.state = self.tokenizer.plaintextState 155 else: 156 # state already is data state 157 # self.tokenizer.state = self.tokenizer.dataState 158 pass 159 self.phase = self.phases["beforeHtml"] 160 self.phase.insertHtmlElement() 161 self.resetInsertionMode() 162 else: 163 self.innerHTML = False # pylint:disable=redefined-variable-type 164 self.phase = self.phases["initial"] 165 166 self.lastPhase = None 167 168 self.beforeRCDataPhase = None 169 170 self.framesetOK = True 171 172 @property 173 def documentEncoding(self): 174 """Name of the character encoding that was used to decode the input stream, or 175 :obj:`None` if that is not determined yet 176 177 """ 178 if not hasattr(self, 'tokenizer'): 179 return None 180 return self.tokenizer.stream.charEncoding[0].name 181 182 def isHTMLIntegrationPoint(self, element): 183 if (element.name == "annotation-xml" and 184 element.namespace == namespaces["mathml"]): 185 return ("encoding" in element.attributes and 186 element.attributes["encoding"].translate( 187 asciiUpper2Lower) in 188 ("text/html", "application/xhtml+xml")) 189 else: 190 return (element.namespace, element.name) in htmlIntegrationPointElements 191 192 def isMathMLTextIntegrationPoint(self, element): 193 return (element.namespace, element.name) in mathmlTextIntegrationPointElements 194 195 def mainLoop(self): 196 CharactersToken = tokenTypes["Characters"] 197 SpaceCharactersToken = tokenTypes["SpaceCharacters"] 198 StartTagToken = tokenTypes["StartTag"] 199 EndTagToken = tokenTypes["EndTag"] 200 CommentToken = tokenTypes["Comment"] 201 DoctypeToken = tokenTypes["Doctype"] 202 ParseErrorToken = tokenTypes["ParseError"] 203 204 for token in self.tokenizer: 205 prev_token = None 206 new_token = token 207 while new_token is not None: 208 prev_token = new_token 209 currentNode = self.tree.openElements[-1] if self.tree.openElements else None 210 currentNodeNamespace = currentNode.namespace if currentNode else None 211 currentNodeName = currentNode.name if currentNode else None 212 213 type = new_token["type"] 214 215 if type == ParseErrorToken: 216 self.parseError(new_token["data"], new_token.get("datavars", {})) 217 new_token = None 218 else: 219 if (len(self.tree.openElements) == 0 or 220 currentNodeNamespace == self.tree.defaultNamespace or 221 (self.isMathMLTextIntegrationPoint(currentNode) and 222 ((type == StartTagToken and 223 token["name"] not in frozenset(["mglyph", "malignmark"])) or 224 type in (CharactersToken, SpaceCharactersToken))) or 225 (currentNodeNamespace == namespaces["mathml"] and 226 currentNodeName == "annotation-xml" and 227 type == StartTagToken and 228 token["name"] == "svg") or 229 (self.isHTMLIntegrationPoint(currentNode) and 230 type in (StartTagToken, CharactersToken, SpaceCharactersToken))): 231 phase = self.phase 232 else: 233 phase = self.phases["inForeignContent"] 234 235 if type == CharactersToken: 236 new_token = phase.processCharacters(new_token) 237 elif type == SpaceCharactersToken: 238 new_token = phase.processSpaceCharacters(new_token) 239 elif type == StartTagToken: 240 new_token = phase.processStartTag(new_token) 241 elif type == EndTagToken: 242 new_token = phase.processEndTag(new_token) 243 elif type == CommentToken: 244 new_token = phase.processComment(new_token) 245 elif type == DoctypeToken: 246 new_token = phase.processDoctype(new_token) 247 248 if (type == StartTagToken and prev_token["selfClosing"] and 249 not prev_token["selfClosingAcknowledged"]): 250 self.parseError("non-void-element-with-trailing-solidus", 251 {"name": prev_token["name"]}) 252 253 # When the loop finishes it's EOF 254 reprocess = True 255 phases = [] 256 while reprocess: 257 phases.append(self.phase) 258 reprocess = self.phase.processEOF() 259 if reprocess: 260 assert self.phase not in phases 261 262 def parse(self, stream, *args, **kwargs): 263 """Parse a HTML document into a well-formed tree 264 265 :arg stream: a file-like object or string containing the HTML to be parsed 266 267 The optional encoding parameter must be a string that indicates 268 the encoding. If specified, that encoding will be used, 269 regardless of any BOM or later declaration (such as in a meta 270 element). 271 272 :arg scripting: treat noscript elements as if JavaScript was turned on 273 274 :returns: parsed tree 275 276 Example: 277 278 >>> from html5lib.html5parser import HTMLParser 279 >>> parser = HTMLParser() 280 >>> parser.parse('<html><body><p>This is a doc</p></body></html>') 281 <Element u'{http://www.w3.org/1999/xhtml}html' at 0x7feac4909db0> 282 283 """ 284 self._parse(stream, False, None, *args, **kwargs) 285 return self.tree.getDocument() 286 287 def parseFragment(self, stream, *args, **kwargs): 288 """Parse a HTML fragment into a well-formed tree fragment 289 290 :arg container: name of the element we're setting the innerHTML 291 property if set to None, default to 'div' 292 293 :arg stream: a file-like object or string containing the HTML to be parsed 294 295 The optional encoding parameter must be a string that indicates 296 the encoding. If specified, that encoding will be used, 297 regardless of any BOM or later declaration (such as in a meta 298 element) 299 300 :arg scripting: treat noscript elements as if JavaScript was turned on 301 302 :returns: parsed tree 303 304 Example: 305 306 >>> from html5lib.html5libparser import HTMLParser 307 >>> parser = HTMLParser() 308 >>> parser.parseFragment('<b>this is a fragment</b>') 309 <Element u'DOCUMENT_FRAGMENT' at 0x7feac484b090> 310 311 """ 312 self._parse(stream, True, *args, **kwargs) 313 return self.tree.getFragment() 314 315 def parseError(self, errorcode="XXX-undefined-error", datavars=None): 316 # XXX The idea is to make errorcode mandatory. 317 if datavars is None: 318 datavars = {} 319 self.errors.append((self.tokenizer.stream.position(), errorcode, datavars)) 320 if self.strict: 321 raise ParseError(E[errorcode] % datavars) 322 323 def adjustMathMLAttributes(self, token): 324 adjust_attributes(token, adjustMathMLAttributes) 325 326 def adjustSVGAttributes(self, token): 327 adjust_attributes(token, adjustSVGAttributes) 328 329 def adjustForeignAttributes(self, token): 330 adjust_attributes(token, adjustForeignAttributesMap) 331 332 def reparseTokenNormal(self, token): 333 # pylint:disable=unused-argument 334 self.parser.phase() 335 336 def resetInsertionMode(self): 337 # The name of this method is mostly historical. (It's also used in the 338 # specification.) 339 last = False 340 newModes = { 341 "select": "inSelect", 342 "td": "inCell", 343 "th": "inCell", 344 "tr": "inRow", 345 "tbody": "inTableBody", 346 "thead": "inTableBody", 347 "tfoot": "inTableBody", 348 "caption": "inCaption", 349 "colgroup": "inColumnGroup", 350 "table": "inTable", 351 "head": "inBody", 352 "body": "inBody", 353 "frameset": "inFrameset", 354 "html": "beforeHead" 355 } 356 for node in self.tree.openElements[::-1]: 357 nodeName = node.name 358 new_phase = None 359 if node == self.tree.openElements[0]: 360 assert self.innerHTML 361 last = True 362 nodeName = self.innerHTML 363 # Check for conditions that should only happen in the innerHTML 364 # case 365 if nodeName in ("select", "colgroup", "head", "html"): 366 assert self.innerHTML 367 368 if not last and node.namespace != self.tree.defaultNamespace: 369 continue 370 371 if nodeName in newModes: 372 new_phase = self.phases[newModes[nodeName]] 373 break 374 elif last: 375 new_phase = self.phases["inBody"] 376 break 377 378 self.phase = new_phase 379 380 def parseRCDataRawtext(self, token, contentType): 381 # Generic RCDATA/RAWTEXT Parsing algorithm 382 assert contentType in ("RAWTEXT", "RCDATA") 383 384 self.tree.insertElement(token) 385 386 if contentType == "RAWTEXT": 387 self.tokenizer.state = self.tokenizer.rawtextState 388 else: 389 self.tokenizer.state = self.tokenizer.rcdataState 390 391 self.originalPhase = self.phase 392 393 self.phase = self.phases["text"] 394 395 396@_utils.memoize 397def getPhases(debug): 398 def log(function): 399 """Logger that records which phase processes each token""" 400 type_names = {value: key for key, value in tokenTypes.items()} 401 402 def wrapped(self, *args, **kwargs): 403 if function.__name__.startswith("process") and len(args) > 0: 404 token = args[0] 405 info = {"type": type_names[token['type']]} 406 if token['type'] in tagTokenTypes: 407 info["name"] = token['name'] 408 409 self.parser.log.append((self.parser.tokenizer.state.__name__, 410 self.parser.phase.__class__.__name__, 411 self.__class__.__name__, 412 function.__name__, 413 info)) 414 return function(self, *args, **kwargs) 415 else: 416 return function(self, *args, **kwargs) 417 return wrapped 418 419 def getMetaclass(use_metaclass, metaclass_func): 420 if use_metaclass: 421 return method_decorator_metaclass(metaclass_func) 422 else: 423 return type 424 425 # pylint:disable=unused-argument 426 class Phase(with_metaclass(getMetaclass(debug, log))): 427 """Base class for helper object that implements each phase of processing 428 """ 429 __slots__ = ("parser", "tree", "__startTagCache", "__endTagCache") 430 431 def __init__(self, parser, tree): 432 self.parser = parser 433 self.tree = tree 434 self.__startTagCache = {} 435 self.__endTagCache = {} 436 437 def processEOF(self): 438 raise NotImplementedError 439 440 def processComment(self, token): 441 # For most phases the following is correct. Where it's not it will be 442 # overridden. 443 self.tree.insertComment(token, self.tree.openElements[-1]) 444 445 def processDoctype(self, token): 446 self.parser.parseError("unexpected-doctype") 447 448 def processCharacters(self, token): 449 self.tree.insertText(token["data"]) 450 451 def processSpaceCharacters(self, token): 452 self.tree.insertText(token["data"]) 453 454 def processStartTag(self, token): 455 # Note the caching is done here rather than BoundMethodDispatcher as doing it there 456 # requires a circular reference to the Phase, and this ends up with a significant 457 # (CPython 2.7, 3.8) GC cost when parsing many short inputs 458 name = token["name"] 459 # In Py2, using `in` is quicker in general than try/except KeyError 460 # In Py3, `in` is quicker when there are few cache hits (typically short inputs) 461 if name in self.__startTagCache: 462 func = self.__startTagCache[name] 463 else: 464 func = self.__startTagCache[name] = self.startTagHandler[name] 465 # bound the cache size in case we get loads of unknown tags 466 while len(self.__startTagCache) > len(self.startTagHandler) * 1.1: 467 # this makes the eviction policy random on Py < 3.7 and FIFO >= 3.7 468 self.__startTagCache.pop(next(iter(self.__startTagCache))) 469 return func(token) 470 471 def startTagHtml(self, token): 472 if not self.parser.firstStartTag and token["name"] == "html": 473 self.parser.parseError("non-html-root") 474 # XXX Need a check here to see if the first start tag token emitted is 475 # this token... If it's not, invoke self.parser.parseError(). 476 for attr, value in token["data"].items(): 477 if attr not in self.tree.openElements[0].attributes: 478 self.tree.openElements[0].attributes[attr] = value 479 self.parser.firstStartTag = False 480 481 def processEndTag(self, token): 482 # Note the caching is done here rather than BoundMethodDispatcher as doing it there 483 # requires a circular reference to the Phase, and this ends up with a significant 484 # (CPython 2.7, 3.8) GC cost when parsing many short inputs 485 name = token["name"] 486 # In Py2, using `in` is quicker in general than try/except KeyError 487 # In Py3, `in` is quicker when there are few cache hits (typically short inputs) 488 if name in self.__endTagCache: 489 func = self.__endTagCache[name] 490 else: 491 func = self.__endTagCache[name] = self.endTagHandler[name] 492 # bound the cache size in case we get loads of unknown tags 493 while len(self.__endTagCache) > len(self.endTagHandler) * 1.1: 494 # this makes the eviction policy random on Py < 3.7 and FIFO >= 3.7 495 self.__endTagCache.pop(next(iter(self.__endTagCache))) 496 return func(token) 497 498 class InitialPhase(Phase): 499 __slots__ = tuple() 500 501 def processSpaceCharacters(self, token): 502 pass 503 504 def processComment(self, token): 505 self.tree.insertComment(token, self.tree.document) 506 507 def processDoctype(self, token): 508 name = token["name"] 509 publicId = token["publicId"] 510 systemId = token["systemId"] 511 correct = token["correct"] 512 513 if (name != "html" or publicId is not None or 514 systemId is not None and systemId != "about:legacy-compat"): 515 self.parser.parseError("unknown-doctype") 516 517 if publicId is None: 518 publicId = "" 519 520 self.tree.insertDoctype(token) 521 522 if publicId != "": 523 publicId = publicId.translate(asciiUpper2Lower) 524 525 if (not correct or token["name"] != "html" or 526 publicId.startswith( 527 ("+//silmaril//dtd html pro v0r11 19970101//", 528 "-//advasoft ltd//dtd html 3.0 aswedit + extensions//", 529 "-//as//dtd html 3.0 aswedit + extensions//", 530 "-//ietf//dtd html 2.0 level 1//", 531 "-//ietf//dtd html 2.0 level 2//", 532 "-//ietf//dtd html 2.0 strict level 1//", 533 "-//ietf//dtd html 2.0 strict level 2//", 534 "-//ietf//dtd html 2.0 strict//", 535 "-//ietf//dtd html 2.0//", 536 "-//ietf//dtd html 2.1e//", 537 "-//ietf//dtd html 3.0//", 538 "-//ietf//dtd html 3.2 final//", 539 "-//ietf//dtd html 3.2//", 540 "-//ietf//dtd html 3//", 541 "-//ietf//dtd html level 0//", 542 "-//ietf//dtd html level 1//", 543 "-//ietf//dtd html level 2//", 544 "-//ietf//dtd html level 3//", 545 "-//ietf//dtd html strict level 0//", 546 "-//ietf//dtd html strict level 1//", 547 "-//ietf//dtd html strict level 2//", 548 "-//ietf//dtd html strict level 3//", 549 "-//ietf//dtd html strict//", 550 "-//ietf//dtd html//", 551 "-//metrius//dtd metrius presentational//", 552 "-//microsoft//dtd internet explorer 2.0 html strict//", 553 "-//microsoft//dtd internet explorer 2.0 html//", 554 "-//microsoft//dtd internet explorer 2.0 tables//", 555 "-//microsoft//dtd internet explorer 3.0 html strict//", 556 "-//microsoft//dtd internet explorer 3.0 html//", 557 "-//microsoft//dtd internet explorer 3.0 tables//", 558 "-//netscape comm. corp.//dtd html//", 559 "-//netscape comm. corp.//dtd strict html//", 560 "-//o'reilly and associates//dtd html 2.0//", 561 "-//o'reilly and associates//dtd html extended 1.0//", 562 "-//o'reilly and associates//dtd html extended relaxed 1.0//", 563 "-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//", 564 "-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//", 565 "-//spyglass//dtd html 2.0 extended//", 566 "-//sq//dtd html 2.0 hotmetal + extensions//", 567 "-//sun microsystems corp.//dtd hotjava html//", 568 "-//sun microsystems corp.//dtd hotjava strict html//", 569 "-//w3c//dtd html 3 1995-03-24//", 570 "-//w3c//dtd html 3.2 draft//", 571 "-//w3c//dtd html 3.2 final//", 572 "-//w3c//dtd html 3.2//", 573 "-//w3c//dtd html 3.2s draft//", 574 "-//w3c//dtd html 4.0 frameset//", 575 "-//w3c//dtd html 4.0 transitional//", 576 "-//w3c//dtd html experimental 19960712//", 577 "-//w3c//dtd html experimental 970421//", 578 "-//w3c//dtd w3 html//", 579 "-//w3o//dtd w3 html 3.0//", 580 "-//webtechs//dtd mozilla html 2.0//", 581 "-//webtechs//dtd mozilla html//")) or 582 publicId in ("-//w3o//dtd w3 html strict 3.0//en//", 583 "-/w3c/dtd html 4.0 transitional/en", 584 "html") or 585 publicId.startswith( 586 ("-//w3c//dtd html 4.01 frameset//", 587 "-//w3c//dtd html 4.01 transitional//")) and 588 systemId is None or 589 systemId and systemId.lower() == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"): 590 self.parser.compatMode = "quirks" 591 elif (publicId.startswith( 592 ("-//w3c//dtd xhtml 1.0 frameset//", 593 "-//w3c//dtd xhtml 1.0 transitional//")) or 594 publicId.startswith( 595 ("-//w3c//dtd html 4.01 frameset//", 596 "-//w3c//dtd html 4.01 transitional//")) and 597 systemId is not None): 598 self.parser.compatMode = "limited quirks" 599 600 self.parser.phase = self.parser.phases["beforeHtml"] 601 602 def anythingElse(self): 603 self.parser.compatMode = "quirks" 604 self.parser.phase = self.parser.phases["beforeHtml"] 605 606 def processCharacters(self, token): 607 self.parser.parseError("expected-doctype-but-got-chars") 608 self.anythingElse() 609 return token 610 611 def processStartTag(self, token): 612 self.parser.parseError("expected-doctype-but-got-start-tag", 613 {"name": token["name"]}) 614 self.anythingElse() 615 return token 616 617 def processEndTag(self, token): 618 self.parser.parseError("expected-doctype-but-got-end-tag", 619 {"name": token["name"]}) 620 self.anythingElse() 621 return token 622 623 def processEOF(self): 624 self.parser.parseError("expected-doctype-but-got-eof") 625 self.anythingElse() 626 return True 627 628 class BeforeHtmlPhase(Phase): 629 __slots__ = tuple() 630 631 # helper methods 632 def insertHtmlElement(self): 633 self.tree.insertRoot(impliedTagToken("html", "StartTag")) 634 self.parser.phase = self.parser.phases["beforeHead"] 635 636 # other 637 def processEOF(self): 638 self.insertHtmlElement() 639 return True 640 641 def processComment(self, token): 642 self.tree.insertComment(token, self.tree.document) 643 644 def processSpaceCharacters(self, token): 645 pass 646 647 def processCharacters(self, token): 648 self.insertHtmlElement() 649 return token 650 651 def processStartTag(self, token): 652 if token["name"] == "html": 653 self.parser.firstStartTag = True 654 self.insertHtmlElement() 655 return token 656 657 def processEndTag(self, token): 658 if token["name"] not in ("head", "body", "html", "br"): 659 self.parser.parseError("unexpected-end-tag-before-html", 660 {"name": token["name"]}) 661 else: 662 self.insertHtmlElement() 663 return token 664 665 class BeforeHeadPhase(Phase): 666 __slots__ = tuple() 667 668 def processEOF(self): 669 self.startTagHead(impliedTagToken("head", "StartTag")) 670 return True 671 672 def processSpaceCharacters(self, token): 673 pass 674 675 def processCharacters(self, token): 676 self.startTagHead(impliedTagToken("head", "StartTag")) 677 return token 678 679 def startTagHtml(self, token): 680 return self.parser.phases["inBody"].processStartTag(token) 681 682 def startTagHead(self, token): 683 self.tree.insertElement(token) 684 self.tree.headPointer = self.tree.openElements[-1] 685 self.parser.phase = self.parser.phases["inHead"] 686 687 def startTagOther(self, token): 688 self.startTagHead(impliedTagToken("head", "StartTag")) 689 return token 690 691 def endTagImplyHead(self, token): 692 self.startTagHead(impliedTagToken("head", "StartTag")) 693 return token 694 695 def endTagOther(self, token): 696 self.parser.parseError("end-tag-after-implied-root", 697 {"name": token["name"]}) 698 699 startTagHandler = _utils.MethodDispatcher([ 700 ("html", startTagHtml), 701 ("head", startTagHead) 702 ]) 703 startTagHandler.default = startTagOther 704 705 endTagHandler = _utils.MethodDispatcher([ 706 (("head", "body", "html", "br"), endTagImplyHead) 707 ]) 708 endTagHandler.default = endTagOther 709 710 class InHeadPhase(Phase): 711 __slots__ = tuple() 712 713 # the real thing 714 def processEOF(self): 715 self.anythingElse() 716 return True 717 718 def processCharacters(self, token): 719 self.anythingElse() 720 return token 721 722 def startTagHtml(self, token): 723 return self.parser.phases["inBody"].processStartTag(token) 724 725 def startTagHead(self, token): 726 self.parser.parseError("two-heads-are-not-better-than-one") 727 728 def startTagBaseLinkCommand(self, token): 729 self.tree.insertElement(token) 730 self.tree.openElements.pop() 731 token["selfClosingAcknowledged"] = True 732 733 def startTagMeta(self, token): 734 self.tree.insertElement(token) 735 self.tree.openElements.pop() 736 token["selfClosingAcknowledged"] = True 737 738 attributes = token["data"] 739 if self.parser.tokenizer.stream.charEncoding[1] == "tentative": 740 if "charset" in attributes: 741 self.parser.tokenizer.stream.changeEncoding(attributes["charset"]) 742 elif ("content" in attributes and 743 "http-equiv" in attributes and 744 attributes["http-equiv"].lower() == "content-type"): 745 # Encoding it as UTF-8 here is a hack, as really we should pass 746 # the abstract Unicode string, and just use the 747 # ContentAttrParser on that, but using UTF-8 allows all chars 748 # to be encoded and as a ASCII-superset works. 749 data = _inputstream.EncodingBytes(attributes["content"].encode("utf-8")) 750 parser = _inputstream.ContentAttrParser(data) 751 codec = parser.parse() 752 self.parser.tokenizer.stream.changeEncoding(codec) 753 754 def startTagTitle(self, token): 755 self.parser.parseRCDataRawtext(token, "RCDATA") 756 757 def startTagNoFramesStyle(self, token): 758 # Need to decide whether to implement the scripting-disabled case 759 self.parser.parseRCDataRawtext(token, "RAWTEXT") 760 761 def startTagNoscript(self, token): 762 if self.parser.scripting: 763 self.parser.parseRCDataRawtext(token, "RAWTEXT") 764 else: 765 self.tree.insertElement(token) 766 self.parser.phase = self.parser.phases["inHeadNoscript"] 767 768 def startTagScript(self, token): 769 self.tree.insertElement(token) 770 self.parser.tokenizer.state = self.parser.tokenizer.scriptDataState 771 self.parser.originalPhase = self.parser.phase 772 self.parser.phase = self.parser.phases["text"] 773 774 def startTagOther(self, token): 775 self.anythingElse() 776 return token 777 778 def endTagHead(self, token): 779 node = self.parser.tree.openElements.pop() 780 assert node.name == "head", "Expected head got %s" % node.name 781 self.parser.phase = self.parser.phases["afterHead"] 782 783 def endTagHtmlBodyBr(self, token): 784 self.anythingElse() 785 return token 786 787 def endTagOther(self, token): 788 self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) 789 790 def anythingElse(self): 791 self.endTagHead(impliedTagToken("head")) 792 793 startTagHandler = _utils.MethodDispatcher([ 794 ("html", startTagHtml), 795 ("title", startTagTitle), 796 (("noframes", "style"), startTagNoFramesStyle), 797 ("noscript", startTagNoscript), 798 ("script", startTagScript), 799 (("base", "basefont", "bgsound", "command", "link"), 800 startTagBaseLinkCommand), 801 ("meta", startTagMeta), 802 ("head", startTagHead) 803 ]) 804 startTagHandler.default = startTagOther 805 806 endTagHandler = _utils.MethodDispatcher([ 807 ("head", endTagHead), 808 (("br", "html", "body"), endTagHtmlBodyBr) 809 ]) 810 endTagHandler.default = endTagOther 811 812 class InHeadNoscriptPhase(Phase): 813 __slots__ = tuple() 814 815 def processEOF(self): 816 self.parser.parseError("eof-in-head-noscript") 817 self.anythingElse() 818 return True 819 820 def processComment(self, token): 821 return self.parser.phases["inHead"].processComment(token) 822 823 def processCharacters(self, token): 824 self.parser.parseError("char-in-head-noscript") 825 self.anythingElse() 826 return token 827 828 def processSpaceCharacters(self, token): 829 return self.parser.phases["inHead"].processSpaceCharacters(token) 830 831 def startTagHtml(self, token): 832 return self.parser.phases["inBody"].processStartTag(token) 833 834 def startTagBaseLinkCommand(self, token): 835 return self.parser.phases["inHead"].processStartTag(token) 836 837 def startTagHeadNoscript(self, token): 838 self.parser.parseError("unexpected-start-tag", {"name": token["name"]}) 839 840 def startTagOther(self, token): 841 self.parser.parseError("unexpected-inhead-noscript-tag", {"name": token["name"]}) 842 self.anythingElse() 843 return token 844 845 def endTagNoscript(self, token): 846 node = self.parser.tree.openElements.pop() 847 assert node.name == "noscript", "Expected noscript got %s" % node.name 848 self.parser.phase = self.parser.phases["inHead"] 849 850 def endTagBr(self, token): 851 self.parser.parseError("unexpected-inhead-noscript-tag", {"name": token["name"]}) 852 self.anythingElse() 853 return token 854 855 def endTagOther(self, token): 856 self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) 857 858 def anythingElse(self): 859 # Caller must raise parse error first! 860 self.endTagNoscript(impliedTagToken("noscript")) 861 862 startTagHandler = _utils.MethodDispatcher([ 863 ("html", startTagHtml), 864 (("basefont", "bgsound", "link", "meta", "noframes", "style"), startTagBaseLinkCommand), 865 (("head", "noscript"), startTagHeadNoscript), 866 ]) 867 startTagHandler.default = startTagOther 868 869 endTagHandler = _utils.MethodDispatcher([ 870 ("noscript", endTagNoscript), 871 ("br", endTagBr), 872 ]) 873 endTagHandler.default = endTagOther 874 875 class AfterHeadPhase(Phase): 876 __slots__ = tuple() 877 878 def processEOF(self): 879 self.anythingElse() 880 return True 881 882 def processCharacters(self, token): 883 self.anythingElse() 884 return token 885 886 def startTagHtml(self, token): 887 return self.parser.phases["inBody"].processStartTag(token) 888 889 def startTagBody(self, token): 890 self.parser.framesetOK = False 891 self.tree.insertElement(token) 892 self.parser.phase = self.parser.phases["inBody"] 893 894 def startTagFrameset(self, token): 895 self.tree.insertElement(token) 896 self.parser.phase = self.parser.phases["inFrameset"] 897 898 def startTagFromHead(self, token): 899 self.parser.parseError("unexpected-start-tag-out-of-my-head", 900 {"name": token["name"]}) 901 self.tree.openElements.append(self.tree.headPointer) 902 self.parser.phases["inHead"].processStartTag(token) 903 for node in self.tree.openElements[::-1]: 904 if node.name == "head": 905 self.tree.openElements.remove(node) 906 break 907 908 def startTagHead(self, token): 909 self.parser.parseError("unexpected-start-tag", {"name": token["name"]}) 910 911 def startTagOther(self, token): 912 self.anythingElse() 913 return token 914 915 def endTagHtmlBodyBr(self, token): 916 self.anythingElse() 917 return token 918 919 def endTagOther(self, token): 920 self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) 921 922 def anythingElse(self): 923 self.tree.insertElement(impliedTagToken("body", "StartTag")) 924 self.parser.phase = self.parser.phases["inBody"] 925 self.parser.framesetOK = True 926 927 startTagHandler = _utils.MethodDispatcher([ 928 ("html", startTagHtml), 929 ("body", startTagBody), 930 ("frameset", startTagFrameset), 931 (("base", "basefont", "bgsound", "link", "meta", "noframes", "script", 932 "style", "title"), 933 startTagFromHead), 934 ("head", startTagHead) 935 ]) 936 startTagHandler.default = startTagOther 937 endTagHandler = _utils.MethodDispatcher([(("body", "html", "br"), 938 endTagHtmlBodyBr)]) 939 endTagHandler.default = endTagOther 940 941 class InBodyPhase(Phase): 942 # http://www.whatwg.org/specs/web-apps/current-work/#parsing-main-inbody 943 # the really-really-really-very crazy mode 944 __slots__ = ("processSpaceCharacters",) 945 946 def __init__(self, *args, **kwargs): 947 super(InBodyPhase, self).__init__(*args, **kwargs) 948 # Set this to the default handler 949 self.processSpaceCharacters = self.processSpaceCharactersNonPre 950 951 def isMatchingFormattingElement(self, node1, node2): 952 return (node1.name == node2.name and 953 node1.namespace == node2.namespace and 954 node1.attributes == node2.attributes) 955 956 # helper 957 def addFormattingElement(self, token): 958 self.tree.insertElement(token) 959 element = self.tree.openElements[-1] 960 961 matchingElements = [] 962 for node in self.tree.activeFormattingElements[::-1]: 963 if node is Marker: 964 break 965 elif self.isMatchingFormattingElement(node, element): 966 matchingElements.append(node) 967 968 assert len(matchingElements) <= 3 969 if len(matchingElements) == 3: 970 self.tree.activeFormattingElements.remove(matchingElements[-1]) 971 self.tree.activeFormattingElements.append(element) 972 973 # the real deal 974 def processEOF(self): 975 allowed_elements = frozenset(("dd", "dt", "li", "p", "tbody", "td", 976 "tfoot", "th", "thead", "tr", "body", 977 "html")) 978 for node in self.tree.openElements[::-1]: 979 if node.name not in allowed_elements: 980 self.parser.parseError("expected-closing-tag-but-got-eof") 981 break 982 # Stop parsing 983 984 def processSpaceCharactersDropNewline(self, token): 985 # Sometimes (start of <pre>, <listing>, and <textarea> blocks) we 986 # want to drop leading newlines 987 data = token["data"] 988 self.processSpaceCharacters = self.processSpaceCharactersNonPre 989 if (data.startswith("\n") and 990 self.tree.openElements[-1].name in ("pre", "listing", "textarea") and 991 not self.tree.openElements[-1].hasContent()): 992 data = data[1:] 993 if data: 994 self.tree.reconstructActiveFormattingElements() 995 self.tree.insertText(data) 996 997 def processCharacters(self, token): 998 if token["data"] == "\u0000": 999 # The tokenizer should always emit null on its own 1000 return 1001 self.tree.reconstructActiveFormattingElements() 1002 self.tree.insertText(token["data"]) 1003 # This must be bad for performance 1004 if (self.parser.framesetOK and 1005 any([char not in spaceCharacters 1006 for char in token["data"]])): 1007 self.parser.framesetOK = False 1008 1009 def processSpaceCharactersNonPre(self, token): 1010 self.tree.reconstructActiveFormattingElements() 1011 self.tree.insertText(token["data"]) 1012 1013 def startTagProcessInHead(self, token): 1014 return self.parser.phases["inHead"].processStartTag(token) 1015 1016 def startTagBody(self, token): 1017 self.parser.parseError("unexpected-start-tag", {"name": "body"}) 1018 if (len(self.tree.openElements) == 1 or 1019 self.tree.openElements[1].name != "body"): 1020 assert self.parser.innerHTML 1021 else: 1022 self.parser.framesetOK = False 1023 for attr, value in token["data"].items(): 1024 if attr not in self.tree.openElements[1].attributes: 1025 self.tree.openElements[1].attributes[attr] = value 1026 1027 def startTagFrameset(self, token): 1028 self.parser.parseError("unexpected-start-tag", {"name": "frameset"}) 1029 if (len(self.tree.openElements) == 1 or self.tree.openElements[1].name != "body"): 1030 assert self.parser.innerHTML 1031 elif not self.parser.framesetOK: 1032 pass 1033 else: 1034 if self.tree.openElements[1].parent: 1035 self.tree.openElements[1].parent.removeChild(self.tree.openElements[1]) 1036 while self.tree.openElements[-1].name != "html": 1037 self.tree.openElements.pop() 1038 self.tree.insertElement(token) 1039 self.parser.phase = self.parser.phases["inFrameset"] 1040 1041 def startTagCloseP(self, token): 1042 if self.tree.elementInScope("p", variant="button"): 1043 self.endTagP(impliedTagToken("p")) 1044 self.tree.insertElement(token) 1045 1046 def startTagPreListing(self, token): 1047 if self.tree.elementInScope("p", variant="button"): 1048 self.endTagP(impliedTagToken("p")) 1049 self.tree.insertElement(token) 1050 self.parser.framesetOK = False 1051 self.processSpaceCharacters = self.processSpaceCharactersDropNewline 1052 1053 def startTagForm(self, token): 1054 if self.tree.formPointer: 1055 self.parser.parseError("unexpected-start-tag", {"name": "form"}) 1056 else: 1057 if self.tree.elementInScope("p", variant="button"): 1058 self.endTagP(impliedTagToken("p")) 1059 self.tree.insertElement(token) 1060 self.tree.formPointer = self.tree.openElements[-1] 1061 1062 def startTagListItem(self, token): 1063 self.parser.framesetOK = False 1064 1065 stopNamesMap = {"li": ["li"], 1066 "dt": ["dt", "dd"], 1067 "dd": ["dt", "dd"]} 1068 stopNames = stopNamesMap[token["name"]] 1069 for node in reversed(self.tree.openElements): 1070 if node.name in stopNames: 1071 self.parser.phase.processEndTag( 1072 impliedTagToken(node.name, "EndTag")) 1073 break 1074 if (node.nameTuple in specialElements and 1075 node.name not in ("address", "div", "p")): 1076 break 1077 1078 if self.tree.elementInScope("p", variant="button"): 1079 self.parser.phase.processEndTag( 1080 impliedTagToken("p", "EndTag")) 1081 1082 self.tree.insertElement(token) 1083 1084 def startTagPlaintext(self, token): 1085 if self.tree.elementInScope("p", variant="button"): 1086 self.endTagP(impliedTagToken("p")) 1087 self.tree.insertElement(token) 1088 self.parser.tokenizer.state = self.parser.tokenizer.plaintextState 1089 1090 def startTagHeading(self, token): 1091 if self.tree.elementInScope("p", variant="button"): 1092 self.endTagP(impliedTagToken("p")) 1093 if self.tree.openElements[-1].name in headingElements: 1094 self.parser.parseError("unexpected-start-tag", {"name": token["name"]}) 1095 self.tree.openElements.pop() 1096 self.tree.insertElement(token) 1097 1098 def startTagA(self, token): 1099 afeAElement = self.tree.elementInActiveFormattingElements("a") 1100 if afeAElement: 1101 self.parser.parseError("unexpected-start-tag-implies-end-tag", 1102 {"startName": "a", "endName": "a"}) 1103 self.endTagFormatting(impliedTagToken("a")) 1104 if afeAElement in self.tree.openElements: 1105 self.tree.openElements.remove(afeAElement) 1106 if afeAElement in self.tree.activeFormattingElements: 1107 self.tree.activeFormattingElements.remove(afeAElement) 1108 self.tree.reconstructActiveFormattingElements() 1109 self.addFormattingElement(token) 1110 1111 def startTagFormatting(self, token): 1112 self.tree.reconstructActiveFormattingElements() 1113 self.addFormattingElement(token) 1114 1115 def startTagNobr(self, token): 1116 self.tree.reconstructActiveFormattingElements() 1117 if self.tree.elementInScope("nobr"): 1118 self.parser.parseError("unexpected-start-tag-implies-end-tag", 1119 {"startName": "nobr", "endName": "nobr"}) 1120 self.processEndTag(impliedTagToken("nobr")) 1121 # XXX Need tests that trigger the following 1122 self.tree.reconstructActiveFormattingElements() 1123 self.addFormattingElement(token) 1124 1125 def startTagButton(self, token): 1126 if self.tree.elementInScope("button"): 1127 self.parser.parseError("unexpected-start-tag-implies-end-tag", 1128 {"startName": "button", "endName": "button"}) 1129 self.processEndTag(impliedTagToken("button")) 1130 return token 1131 else: 1132 self.tree.reconstructActiveFormattingElements() 1133 self.tree.insertElement(token) 1134 self.parser.framesetOK = False 1135 1136 def startTagAppletMarqueeObject(self, token): 1137 self.tree.reconstructActiveFormattingElements() 1138 self.tree.insertElement(token) 1139 self.tree.activeFormattingElements.append(Marker) 1140 self.parser.framesetOK = False 1141 1142 def startTagXmp(self, token): 1143 if self.tree.elementInScope("p", variant="button"): 1144 self.endTagP(impliedTagToken("p")) 1145 self.tree.reconstructActiveFormattingElements() 1146 self.parser.framesetOK = False 1147 self.parser.parseRCDataRawtext(token, "RAWTEXT") 1148 1149 def startTagTable(self, token): 1150 if self.parser.compatMode != "quirks": 1151 if self.tree.elementInScope("p", variant="button"): 1152 self.processEndTag(impliedTagToken("p")) 1153 self.tree.insertElement(token) 1154 self.parser.framesetOK = False 1155 self.parser.phase = self.parser.phases["inTable"] 1156 1157 def startTagVoidFormatting(self, token): 1158 self.tree.reconstructActiveFormattingElements() 1159 self.tree.insertElement(token) 1160 self.tree.openElements.pop() 1161 token["selfClosingAcknowledged"] = True 1162 self.parser.framesetOK = False 1163 1164 def startTagInput(self, token): 1165 framesetOK = self.parser.framesetOK 1166 self.startTagVoidFormatting(token) 1167 if ("type" in token["data"] and 1168 token["data"]["type"].translate(asciiUpper2Lower) == "hidden"): 1169 # input type=hidden doesn't change framesetOK 1170 self.parser.framesetOK = framesetOK 1171 1172 def startTagParamSource(self, token): 1173 self.tree.insertElement(token) 1174 self.tree.openElements.pop() 1175 token["selfClosingAcknowledged"] = True 1176 1177 def startTagHr(self, token): 1178 if self.tree.elementInScope("p", variant="button"): 1179 self.endTagP(impliedTagToken("p")) 1180 self.tree.insertElement(token) 1181 self.tree.openElements.pop() 1182 token["selfClosingAcknowledged"] = True 1183 self.parser.framesetOK = False 1184 1185 def startTagImage(self, token): 1186 # No really... 1187 self.parser.parseError("unexpected-start-tag-treated-as", 1188 {"originalName": "image", "newName": "img"}) 1189 self.processStartTag(impliedTagToken("img", "StartTag", 1190 attributes=token["data"], 1191 selfClosing=token["selfClosing"])) 1192 1193 def startTagIsIndex(self, token): 1194 self.parser.parseError("deprecated-tag", {"name": "isindex"}) 1195 if self.tree.formPointer: 1196 return 1197 form_attrs = {} 1198 if "action" in token["data"]: 1199 form_attrs["action"] = token["data"]["action"] 1200 self.processStartTag(impliedTagToken("form", "StartTag", 1201 attributes=form_attrs)) 1202 self.processStartTag(impliedTagToken("hr", "StartTag")) 1203 self.processStartTag(impliedTagToken("label", "StartTag")) 1204 # XXX Localization ... 1205 if "prompt" in token["data"]: 1206 prompt = token["data"]["prompt"] 1207 else: 1208 prompt = "This is a searchable index. Enter search keywords: " 1209 self.processCharacters( 1210 {"type": tokenTypes["Characters"], "data": prompt}) 1211 attributes = token["data"].copy() 1212 if "action" in attributes: 1213 del attributes["action"] 1214 if "prompt" in attributes: 1215 del attributes["prompt"] 1216 attributes["name"] = "isindex" 1217 self.processStartTag(impliedTagToken("input", "StartTag", 1218 attributes=attributes, 1219 selfClosing=token["selfClosing"])) 1220 self.processEndTag(impliedTagToken("label")) 1221 self.processStartTag(impliedTagToken("hr", "StartTag")) 1222 self.processEndTag(impliedTagToken("form")) 1223 1224 def startTagTextarea(self, token): 1225 self.tree.insertElement(token) 1226 self.parser.tokenizer.state = self.parser.tokenizer.rcdataState 1227 self.processSpaceCharacters = self.processSpaceCharactersDropNewline 1228 self.parser.framesetOK = False 1229 1230 def startTagIFrame(self, token): 1231 self.parser.framesetOK = False 1232 self.startTagRawtext(token) 1233 1234 def startTagNoscript(self, token): 1235 if self.parser.scripting: 1236 self.startTagRawtext(token) 1237 else: 1238 self.startTagOther(token) 1239 1240 def startTagRawtext(self, token): 1241 """iframe, noembed noframes, noscript(if scripting enabled)""" 1242 self.parser.parseRCDataRawtext(token, "RAWTEXT") 1243 1244 def startTagOpt(self, token): 1245 if self.tree.openElements[-1].name == "option": 1246 self.parser.phase.processEndTag(impliedTagToken("option")) 1247 self.tree.reconstructActiveFormattingElements() 1248 self.parser.tree.insertElement(token) 1249 1250 def startTagSelect(self, token): 1251 self.tree.reconstructActiveFormattingElements() 1252 self.tree.insertElement(token) 1253 self.parser.framesetOK = False 1254 if self.parser.phase in (self.parser.phases["inTable"], 1255 self.parser.phases["inCaption"], 1256 self.parser.phases["inColumnGroup"], 1257 self.parser.phases["inTableBody"], 1258 self.parser.phases["inRow"], 1259 self.parser.phases["inCell"]): 1260 self.parser.phase = self.parser.phases["inSelectInTable"] 1261 else: 1262 self.parser.phase = self.parser.phases["inSelect"] 1263 1264 def startTagRpRt(self, token): 1265 if self.tree.elementInScope("ruby"): 1266 self.tree.generateImpliedEndTags() 1267 if self.tree.openElements[-1].name != "ruby": 1268 self.parser.parseError() 1269 self.tree.insertElement(token) 1270 1271 def startTagMath(self, token): 1272 self.tree.reconstructActiveFormattingElements() 1273 self.parser.adjustMathMLAttributes(token) 1274 self.parser.adjustForeignAttributes(token) 1275 token["namespace"] = namespaces["mathml"] 1276 self.tree.insertElement(token) 1277 # Need to get the parse error right for the case where the token 1278 # has a namespace not equal to the xmlns attribute 1279 if token["selfClosing"]: 1280 self.tree.openElements.pop() 1281 token["selfClosingAcknowledged"] = True 1282 1283 def startTagSvg(self, token): 1284 self.tree.reconstructActiveFormattingElements() 1285 self.parser.adjustSVGAttributes(token) 1286 self.parser.adjustForeignAttributes(token) 1287 token["namespace"] = namespaces["svg"] 1288 self.tree.insertElement(token) 1289 # Need to get the parse error right for the case where the token 1290 # has a namespace not equal to the xmlns attribute 1291 if token["selfClosing"]: 1292 self.tree.openElements.pop() 1293 token["selfClosingAcknowledged"] = True 1294 1295 def startTagMisplaced(self, token): 1296 """ Elements that should be children of other elements that have a 1297 different insertion mode; here they are ignored 1298 "caption", "col", "colgroup", "frame", "frameset", "head", 1299 "option", "optgroup", "tbody", "td", "tfoot", "th", "thead", 1300 "tr", "noscript" 1301 """ 1302 self.parser.parseError("unexpected-start-tag-ignored", {"name": token["name"]}) 1303 1304 def startTagOther(self, token): 1305 self.tree.reconstructActiveFormattingElements() 1306 self.tree.insertElement(token) 1307 1308 def endTagP(self, token): 1309 if not self.tree.elementInScope("p", variant="button"): 1310 self.startTagCloseP(impliedTagToken("p", "StartTag")) 1311 self.parser.parseError("unexpected-end-tag", {"name": "p"}) 1312 self.endTagP(impliedTagToken("p", "EndTag")) 1313 else: 1314 self.tree.generateImpliedEndTags("p") 1315 if self.tree.openElements[-1].name != "p": 1316 self.parser.parseError("unexpected-end-tag", {"name": "p"}) 1317 node = self.tree.openElements.pop() 1318 while node.name != "p": 1319 node = self.tree.openElements.pop() 1320 1321 def endTagBody(self, token): 1322 if not self.tree.elementInScope("body"): 1323 self.parser.parseError() 1324 return 1325 elif self.tree.openElements[-1].name != "body": 1326 for node in self.tree.openElements[2:]: 1327 if node.name not in frozenset(("dd", "dt", "li", "optgroup", 1328 "option", "p", "rp", "rt", 1329 "tbody", "td", "tfoot", 1330 "th", "thead", "tr", "body", 1331 "html")): 1332 # Not sure this is the correct name for the parse error 1333 self.parser.parseError( 1334 "expected-one-end-tag-but-got-another", 1335 {"gotName": "body", "expectedName": node.name}) 1336 break 1337 self.parser.phase = self.parser.phases["afterBody"] 1338 1339 def endTagHtml(self, token): 1340 # We repeat the test for the body end tag token being ignored here 1341 if self.tree.elementInScope("body"): 1342 self.endTagBody(impliedTagToken("body")) 1343 return token 1344 1345 def endTagBlock(self, token): 1346 # Put us back in the right whitespace handling mode 1347 if token["name"] == "pre": 1348 self.processSpaceCharacters = self.processSpaceCharactersNonPre 1349 inScope = self.tree.elementInScope(token["name"]) 1350 if inScope: 1351 self.tree.generateImpliedEndTags() 1352 if self.tree.openElements[-1].name != token["name"]: 1353 self.parser.parseError("end-tag-too-early", {"name": token["name"]}) 1354 if inScope: 1355 node = self.tree.openElements.pop() 1356 while node.name != token["name"]: 1357 node = self.tree.openElements.pop() 1358 1359 def endTagForm(self, token): 1360 node = self.tree.formPointer 1361 self.tree.formPointer = None 1362 if node is None or not self.tree.elementInScope(node): 1363 self.parser.parseError("unexpected-end-tag", 1364 {"name": "form"}) 1365 else: 1366 self.tree.generateImpliedEndTags() 1367 if self.tree.openElements[-1] != node: 1368 self.parser.parseError("end-tag-too-early-ignored", 1369 {"name": "form"}) 1370 self.tree.openElements.remove(node) 1371 1372 def endTagListItem(self, token): 1373 if token["name"] == "li": 1374 variant = "list" 1375 else: 1376 variant = None 1377 if not self.tree.elementInScope(token["name"], variant=variant): 1378 self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) 1379 else: 1380 self.tree.generateImpliedEndTags(exclude=token["name"]) 1381 if self.tree.openElements[-1].name != token["name"]: 1382 self.parser.parseError( 1383 "end-tag-too-early", 1384 {"name": token["name"]}) 1385 node = self.tree.openElements.pop() 1386 while node.name != token["name"]: 1387 node = self.tree.openElements.pop() 1388 1389 def endTagHeading(self, token): 1390 for item in headingElements: 1391 if self.tree.elementInScope(item): 1392 self.tree.generateImpliedEndTags() 1393 break 1394 if self.tree.openElements[-1].name != token["name"]: 1395 self.parser.parseError("end-tag-too-early", {"name": token["name"]}) 1396 1397 for item in headingElements: 1398 if self.tree.elementInScope(item): 1399 item = self.tree.openElements.pop() 1400 while item.name not in headingElements: 1401 item = self.tree.openElements.pop() 1402 break 1403 1404 def endTagFormatting(self, token): 1405 """The much-feared adoption agency algorithm""" 1406 # http://svn.whatwg.org/webapps/complete.html#adoptionAgency revision 7867 1407 # XXX Better parseError messages appreciated. 1408 1409 # Step 1 1410 outerLoopCounter = 0 1411 1412 # Step 2 1413 while outerLoopCounter < 8: 1414 1415 # Step 3 1416 outerLoopCounter += 1 1417 1418 # Step 4: 1419 1420 # Let the formatting element be the last element in 1421 # the list of active formatting elements that: 1422 # - is between the end of the list and the last scope 1423 # marker in the list, if any, or the start of the list 1424 # otherwise, and 1425 # - has the same tag name as the token. 1426 formattingElement = self.tree.elementInActiveFormattingElements( 1427 token["name"]) 1428 if (not formattingElement or 1429 (formattingElement in self.tree.openElements and 1430 not self.tree.elementInScope(formattingElement.name))): 1431 # If there is no such node, then abort these steps 1432 # and instead act as described in the "any other 1433 # end tag" entry below. 1434 self.endTagOther(token) 1435 return 1436 1437 # Otherwise, if there is such a node, but that node is 1438 # not in the stack of open elements, then this is a 1439 # parse error; remove the element from the list, and 1440 # abort these steps. 1441 elif formattingElement not in self.tree.openElements: 1442 self.parser.parseError("adoption-agency-1.2", {"name": token["name"]}) 1443 self.tree.activeFormattingElements.remove(formattingElement) 1444 return 1445 1446 # Otherwise, if there is such a node, and that node is 1447 # also in the stack of open elements, but the element 1448 # is not in scope, then this is a parse error; ignore 1449 # the token, and abort these steps. 1450 elif not self.tree.elementInScope(formattingElement.name): 1451 self.parser.parseError("adoption-agency-4.4", {"name": token["name"]}) 1452 return 1453 1454 # Otherwise, there is a formatting element and that 1455 # element is in the stack and is in scope. If the 1456 # element is not the current node, this is a parse 1457 # error. In any case, proceed with the algorithm as 1458 # written in the following steps. 1459 else: 1460 if formattingElement != self.tree.openElements[-1]: 1461 self.parser.parseError("adoption-agency-1.3", {"name": token["name"]}) 1462 1463 # Step 5: 1464 1465 # Let the furthest block be the topmost node in the 1466 # stack of open elements that is lower in the stack 1467 # than the formatting element, and is an element in 1468 # the special category. There might not be one. 1469 afeIndex = self.tree.openElements.index(formattingElement) 1470 furthestBlock = None 1471 for element in self.tree.openElements[afeIndex:]: 1472 if element.nameTuple in specialElements: 1473 furthestBlock = element 1474 break 1475 1476 # Step 6: 1477 1478 # If there is no furthest block, then the UA must 1479 # first pop all the nodes from the bottom of the stack 1480 # of open elements, from the current node up to and 1481 # including the formatting element, then remove the 1482 # formatting element from the list of active 1483 # formatting elements, and finally abort these steps. 1484 if furthestBlock is None: 1485 element = self.tree.openElements.pop() 1486 while element != formattingElement: 1487 element = self.tree.openElements.pop() 1488 self.tree.activeFormattingElements.remove(element) 1489 return 1490 1491 # Step 7 1492 commonAncestor = self.tree.openElements[afeIndex - 1] 1493 1494 # Step 8: 1495 # The bookmark is supposed to help us identify where to reinsert 1496 # nodes in step 15. We have to ensure that we reinsert nodes after 1497 # the node before the active formatting element. Note the bookmark 1498 # can move in step 9.7 1499 bookmark = self.tree.activeFormattingElements.index(formattingElement) 1500 1501 # Step 9 1502 lastNode = node = furthestBlock 1503 innerLoopCounter = 0 1504 1505 index = self.tree.openElements.index(node) 1506 while innerLoopCounter < 3: 1507 innerLoopCounter += 1 1508 # Node is element before node in open elements 1509 index -= 1 1510 node = self.tree.openElements[index] 1511 if node not in self.tree.activeFormattingElements: 1512 self.tree.openElements.remove(node) 1513 continue 1514 # Step 9.6 1515 if node == formattingElement: 1516 break 1517 # Step 9.7 1518 if lastNode == furthestBlock: 1519 bookmark = self.tree.activeFormattingElements.index(node) + 1 1520 # Step 9.8 1521 clone = node.cloneNode() 1522 # Replace node with clone 1523 self.tree.activeFormattingElements[ 1524 self.tree.activeFormattingElements.index(node)] = clone 1525 self.tree.openElements[ 1526 self.tree.openElements.index(node)] = clone 1527 node = clone 1528 # Step 9.9 1529 # Remove lastNode from its parents, if any 1530 if lastNode.parent: 1531 lastNode.parent.removeChild(lastNode) 1532 node.appendChild(lastNode) 1533 # Step 9.10 1534 lastNode = node 1535 1536 # Step 10 1537 # Foster parent lastNode if commonAncestor is a 1538 # table, tbody, tfoot, thead, or tr we need to foster 1539 # parent the lastNode 1540 if lastNode.parent: 1541 lastNode.parent.removeChild(lastNode) 1542 1543 if commonAncestor.name in frozenset(("table", "tbody", "tfoot", "thead", "tr")): 1544 parent, insertBefore = self.tree.getTableMisnestedNodePosition() 1545 parent.insertBefore(lastNode, insertBefore) 1546 else: 1547 commonAncestor.appendChild(lastNode) 1548 1549 # Step 11 1550 clone = formattingElement.cloneNode() 1551 1552 # Step 12 1553 furthestBlock.reparentChildren(clone) 1554 1555 # Step 13 1556 furthestBlock.appendChild(clone) 1557 1558 # Step 14 1559 self.tree.activeFormattingElements.remove(formattingElement) 1560 self.tree.activeFormattingElements.insert(bookmark, clone) 1561 1562 # Step 15 1563 self.tree.openElements.remove(formattingElement) 1564 self.tree.openElements.insert( 1565 self.tree.openElements.index(furthestBlock) + 1, clone) 1566 1567 def endTagAppletMarqueeObject(self, token): 1568 if self.tree.elementInScope(token["name"]): 1569 self.tree.generateImpliedEndTags() 1570 if self.tree.openElements[-1].name != token["name"]: 1571 self.parser.parseError("end-tag-too-early", {"name": token["name"]}) 1572 1573 if self.tree.elementInScope(token["name"]): 1574 element = self.tree.openElements.pop() 1575 while element.name != token["name"]: 1576 element = self.tree.openElements.pop() 1577 self.tree.clearActiveFormattingElements() 1578 1579 def endTagBr(self, token): 1580 self.parser.parseError("unexpected-end-tag-treated-as", 1581 {"originalName": "br", "newName": "br element"}) 1582 self.tree.reconstructActiveFormattingElements() 1583 self.tree.insertElement(impliedTagToken("br", "StartTag")) 1584 self.tree.openElements.pop() 1585 1586 def endTagOther(self, token): 1587 for node in self.tree.openElements[::-1]: 1588 if node.name == token["name"]: 1589 self.tree.generateImpliedEndTags(exclude=token["name"]) 1590 if self.tree.openElements[-1].name != token["name"]: 1591 self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) 1592 while self.tree.openElements.pop() != node: 1593 pass 1594 break 1595 else: 1596 if node.nameTuple in specialElements: 1597 self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) 1598 break 1599 1600 startTagHandler = _utils.MethodDispatcher([ 1601 ("html", Phase.startTagHtml), 1602 (("base", "basefont", "bgsound", "command", "link", "meta", 1603 "script", "style", "title"), 1604 startTagProcessInHead), 1605 ("body", startTagBody), 1606 ("frameset", startTagFrameset), 1607 (("address", "article", "aside", "blockquote", "center", "details", 1608 "dir", "div", "dl", "fieldset", "figcaption", "figure", 1609 "footer", "header", "hgroup", "main", "menu", "nav", "ol", "p", 1610 "section", "summary", "ul"), 1611 startTagCloseP), 1612 (headingElements, startTagHeading), 1613 (("pre", "listing"), startTagPreListing), 1614 ("form", startTagForm), 1615 (("li", "dd", "dt"), startTagListItem), 1616 ("plaintext", startTagPlaintext), 1617 ("a", startTagA), 1618 (("b", "big", "code", "em", "font", "i", "s", "small", "strike", 1619 "strong", "tt", "u"), startTagFormatting), 1620 ("nobr", startTagNobr), 1621 ("button", startTagButton), 1622 (("applet", "marquee", "object"), startTagAppletMarqueeObject), 1623 ("xmp", startTagXmp), 1624 ("table", startTagTable), 1625 (("area", "br", "embed", "img", "keygen", "wbr"), 1626 startTagVoidFormatting), 1627 (("param", "source", "track"), startTagParamSource), 1628 ("input", startTagInput), 1629 ("hr", startTagHr), 1630 ("image", startTagImage), 1631 ("isindex", startTagIsIndex), 1632 ("textarea", startTagTextarea), 1633 ("iframe", startTagIFrame), 1634 ("noscript", startTagNoscript), 1635 (("noembed", "noframes"), startTagRawtext), 1636 ("select", startTagSelect), 1637 (("rp", "rt"), startTagRpRt), 1638 (("option", "optgroup"), startTagOpt), 1639 (("math"), startTagMath), 1640 (("svg"), startTagSvg), 1641 (("caption", "col", "colgroup", "frame", "head", 1642 "tbody", "td", "tfoot", "th", "thead", 1643 "tr"), startTagMisplaced) 1644 ]) 1645 startTagHandler.default = startTagOther 1646 1647 endTagHandler = _utils.MethodDispatcher([ 1648 ("body", endTagBody), 1649 ("html", endTagHtml), 1650 (("address", "article", "aside", "blockquote", "button", "center", 1651 "details", "dialog", "dir", "div", "dl", "fieldset", "figcaption", "figure", 1652 "footer", "header", "hgroup", "listing", "main", "menu", "nav", "ol", "pre", 1653 "section", "summary", "ul"), endTagBlock), 1654 ("form", endTagForm), 1655 ("p", endTagP), 1656 (("dd", "dt", "li"), endTagListItem), 1657 (headingElements, endTagHeading), 1658 (("a", "b", "big", "code", "em", "font", "i", "nobr", "s", "small", 1659 "strike", "strong", "tt", "u"), endTagFormatting), 1660 (("applet", "marquee", "object"), endTagAppletMarqueeObject), 1661 ("br", endTagBr), 1662 ]) 1663 endTagHandler.default = endTagOther 1664 1665 class TextPhase(Phase): 1666 __slots__ = tuple() 1667 1668 def processCharacters(self, token): 1669 self.tree.insertText(token["data"]) 1670 1671 def processEOF(self): 1672 self.parser.parseError("expected-named-closing-tag-but-got-eof", 1673 {"name": self.tree.openElements[-1].name}) 1674 self.tree.openElements.pop() 1675 self.parser.phase = self.parser.originalPhase 1676 return True 1677 1678 def startTagOther(self, token): 1679 assert False, "Tried to process start tag %s in RCDATA/RAWTEXT mode" % token['name'] 1680 1681 def endTagScript(self, token): 1682 node = self.tree.openElements.pop() 1683 assert node.name == "script" 1684 self.parser.phase = self.parser.originalPhase 1685 # The rest of this method is all stuff that only happens if 1686 # document.write works 1687 1688 def endTagOther(self, token): 1689 self.tree.openElements.pop() 1690 self.parser.phase = self.parser.originalPhase 1691 1692 startTagHandler = _utils.MethodDispatcher([]) 1693 startTagHandler.default = startTagOther 1694 endTagHandler = _utils.MethodDispatcher([ 1695 ("script", endTagScript)]) 1696 endTagHandler.default = endTagOther 1697 1698 class InTablePhase(Phase): 1699 # http://www.whatwg.org/specs/web-apps/current-work/#in-table 1700 __slots__ = tuple() 1701 1702 # helper methods 1703 def clearStackToTableContext(self): 1704 # "clear the stack back to a table context" 1705 while self.tree.openElements[-1].name not in ("table", "html"): 1706 # self.parser.parseError("unexpected-implied-end-tag-in-table", 1707 # {"name": self.tree.openElements[-1].name}) 1708 self.tree.openElements.pop() 1709 # When the current node is <html> it's an innerHTML case 1710 1711 # processing methods 1712 def processEOF(self): 1713 if self.tree.openElements[-1].name != "html": 1714 self.parser.parseError("eof-in-table") 1715 else: 1716 assert self.parser.innerHTML 1717 # Stop parsing 1718 1719 def processSpaceCharacters(self, token): 1720 originalPhase = self.parser.phase 1721 self.parser.phase = self.parser.phases["inTableText"] 1722 self.parser.phase.originalPhase = originalPhase 1723 self.parser.phase.processSpaceCharacters(token) 1724 1725 def processCharacters(self, token): 1726 originalPhase = self.parser.phase 1727 self.parser.phase = self.parser.phases["inTableText"] 1728 self.parser.phase.originalPhase = originalPhase 1729 self.parser.phase.processCharacters(token) 1730 1731 def insertText(self, token): 1732 # If we get here there must be at least one non-whitespace character 1733 # Do the table magic! 1734 self.tree.insertFromTable = True 1735 self.parser.phases["inBody"].processCharacters(token) 1736 self.tree.insertFromTable = False 1737 1738 def startTagCaption(self, token): 1739 self.clearStackToTableContext() 1740 self.tree.activeFormattingElements.append(Marker) 1741 self.tree.insertElement(token) 1742 self.parser.phase = self.parser.phases["inCaption"] 1743 1744 def startTagColgroup(self, token): 1745 self.clearStackToTableContext() 1746 self.tree.insertElement(token) 1747 self.parser.phase = self.parser.phases["inColumnGroup"] 1748 1749 def startTagCol(self, token): 1750 self.startTagColgroup(impliedTagToken("colgroup", "StartTag")) 1751 return token 1752 1753 def startTagRowGroup(self, token): 1754 self.clearStackToTableContext() 1755 self.tree.insertElement(token) 1756 self.parser.phase = self.parser.phases["inTableBody"] 1757 1758 def startTagImplyTbody(self, token): 1759 self.startTagRowGroup(impliedTagToken("tbody", "StartTag")) 1760 return token 1761 1762 def startTagTable(self, token): 1763 self.parser.parseError("unexpected-start-tag-implies-end-tag", 1764 {"startName": "table", "endName": "table"}) 1765 self.parser.phase.processEndTag(impliedTagToken("table")) 1766 if not self.parser.innerHTML: 1767 return token 1768 1769 def startTagStyleScript(self, token): 1770 return self.parser.phases["inHead"].processStartTag(token) 1771 1772 def startTagInput(self, token): 1773 if ("type" in token["data"] and 1774 token["data"]["type"].translate(asciiUpper2Lower) == "hidden"): 1775 self.parser.parseError("unexpected-hidden-input-in-table") 1776 self.tree.insertElement(token) 1777 # XXX associate with form 1778 self.tree.openElements.pop() 1779 else: 1780 self.startTagOther(token) 1781 1782 def startTagForm(self, token): 1783 self.parser.parseError("unexpected-form-in-table") 1784 if self.tree.formPointer is None: 1785 self.tree.insertElement(token) 1786 self.tree.formPointer = self.tree.openElements[-1] 1787 self.tree.openElements.pop() 1788 1789 def startTagOther(self, token): 1790 self.parser.parseError("unexpected-start-tag-implies-table-voodoo", {"name": token["name"]}) 1791 # Do the table magic! 1792 self.tree.insertFromTable = True 1793 self.parser.phases["inBody"].processStartTag(token) 1794 self.tree.insertFromTable = False 1795 1796 def endTagTable(self, token): 1797 if self.tree.elementInScope("table", variant="table"): 1798 self.tree.generateImpliedEndTags() 1799 if self.tree.openElements[-1].name != "table": 1800 self.parser.parseError("end-tag-too-early-named", 1801 {"gotName": "table", 1802 "expectedName": self.tree.openElements[-1].name}) 1803 while self.tree.openElements[-1].name != "table": 1804 self.tree.openElements.pop() 1805 self.tree.openElements.pop() 1806 self.parser.resetInsertionMode() 1807 else: 1808 # innerHTML case 1809 assert self.parser.innerHTML 1810 self.parser.parseError() 1811 1812 def endTagIgnore(self, token): 1813 self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) 1814 1815 def endTagOther(self, token): 1816 self.parser.parseError("unexpected-end-tag-implies-table-voodoo", {"name": token["name"]}) 1817 # Do the table magic! 1818 self.tree.insertFromTable = True 1819 self.parser.phases["inBody"].processEndTag(token) 1820 self.tree.insertFromTable = False 1821 1822 startTagHandler = _utils.MethodDispatcher([ 1823 ("html", Phase.startTagHtml), 1824 ("caption", startTagCaption), 1825 ("colgroup", startTagColgroup), 1826 ("col", startTagCol), 1827 (("tbody", "tfoot", "thead"), startTagRowGroup), 1828 (("td", "th", "tr"), startTagImplyTbody), 1829 ("table", startTagTable), 1830 (("style", "script"), startTagStyleScript), 1831 ("input", startTagInput), 1832 ("form", startTagForm) 1833 ]) 1834 startTagHandler.default = startTagOther 1835 1836 endTagHandler = _utils.MethodDispatcher([ 1837 ("table", endTagTable), 1838 (("body", "caption", "col", "colgroup", "html", "tbody", "td", 1839 "tfoot", "th", "thead", "tr"), endTagIgnore) 1840 ]) 1841 endTagHandler.default = endTagOther 1842 1843 class InTableTextPhase(Phase): 1844 __slots__ = ("originalPhase", "characterTokens") 1845 1846 def __init__(self, *args, **kwargs): 1847 super(InTableTextPhase, self).__init__(*args, **kwargs) 1848 self.originalPhase = None 1849 self.characterTokens = [] 1850 1851 def flushCharacters(self): 1852 data = "".join([item["data"] for item in self.characterTokens]) 1853 if any([item not in spaceCharacters for item in data]): 1854 token = {"type": tokenTypes["Characters"], "data": data} 1855 self.parser.phases["inTable"].insertText(token) 1856 elif data: 1857 self.tree.insertText(data) 1858 self.characterTokens = [] 1859 1860 def processComment(self, token): 1861 self.flushCharacters() 1862 self.parser.phase = self.originalPhase 1863 return token 1864 1865 def processEOF(self): 1866 self.flushCharacters() 1867 self.parser.phase = self.originalPhase 1868 return True 1869 1870 def processCharacters(self, token): 1871 if token["data"] == "\u0000": 1872 return 1873 self.characterTokens.append(token) 1874 1875 def processSpaceCharacters(self, token): 1876 # pretty sure we should never reach here 1877 self.characterTokens.append(token) 1878 # assert False 1879 1880 def processStartTag(self, token): 1881 self.flushCharacters() 1882 self.parser.phase = self.originalPhase 1883 return token 1884 1885 def processEndTag(self, token): 1886 self.flushCharacters() 1887 self.parser.phase = self.originalPhase 1888 return token 1889 1890 class InCaptionPhase(Phase): 1891 # http://www.whatwg.org/specs/web-apps/current-work/#in-caption 1892 __slots__ = tuple() 1893 1894 def ignoreEndTagCaption(self): 1895 return not self.tree.elementInScope("caption", variant="table") 1896 1897 def processEOF(self): 1898 self.parser.phases["inBody"].processEOF() 1899 1900 def processCharacters(self, token): 1901 return self.parser.phases["inBody"].processCharacters(token) 1902 1903 def startTagTableElement(self, token): 1904 self.parser.parseError() 1905 # XXX Have to duplicate logic here to find out if the tag is ignored 1906 ignoreEndTag = self.ignoreEndTagCaption() 1907 self.parser.phase.processEndTag(impliedTagToken("caption")) 1908 if not ignoreEndTag: 1909 return token 1910 1911 def startTagOther(self, token): 1912 return self.parser.phases["inBody"].processStartTag(token) 1913 1914 def endTagCaption(self, token): 1915 if not self.ignoreEndTagCaption(): 1916 # AT this code is quite similar to endTagTable in "InTable" 1917 self.tree.generateImpliedEndTags() 1918 if self.tree.openElements[-1].name != "caption": 1919 self.parser.parseError("expected-one-end-tag-but-got-another", 1920 {"gotName": "caption", 1921 "expectedName": self.tree.openElements[-1].name}) 1922 while self.tree.openElements[-1].name != "caption": 1923 self.tree.openElements.pop() 1924 self.tree.openElements.pop() 1925 self.tree.clearActiveFormattingElements() 1926 self.parser.phase = self.parser.phases["inTable"] 1927 else: 1928 # innerHTML case 1929 assert self.parser.innerHTML 1930 self.parser.parseError() 1931 1932 def endTagTable(self, token): 1933 self.parser.parseError() 1934 ignoreEndTag = self.ignoreEndTagCaption() 1935 self.parser.phase.processEndTag(impliedTagToken("caption")) 1936 if not ignoreEndTag: 1937 return token 1938 1939 def endTagIgnore(self, token): 1940 self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) 1941 1942 def endTagOther(self, token): 1943 return self.parser.phases["inBody"].processEndTag(token) 1944 1945 startTagHandler = _utils.MethodDispatcher([ 1946 ("html", Phase.startTagHtml), 1947 (("caption", "col", "colgroup", "tbody", "td", "tfoot", "th", 1948 "thead", "tr"), startTagTableElement) 1949 ]) 1950 startTagHandler.default = startTagOther 1951 1952 endTagHandler = _utils.MethodDispatcher([ 1953 ("caption", endTagCaption), 1954 ("table", endTagTable), 1955 (("body", "col", "colgroup", "html", "tbody", "td", "tfoot", "th", 1956 "thead", "tr"), endTagIgnore) 1957 ]) 1958 endTagHandler.default = endTagOther 1959 1960 class InColumnGroupPhase(Phase): 1961 # http://www.whatwg.org/specs/web-apps/current-work/#in-column 1962 __slots__ = tuple() 1963 1964 def ignoreEndTagColgroup(self): 1965 return self.tree.openElements[-1].name == "html" 1966 1967 def processEOF(self): 1968 if self.tree.openElements[-1].name == "html": 1969 assert self.parser.innerHTML 1970 return 1971 else: 1972 ignoreEndTag = self.ignoreEndTagColgroup() 1973 self.endTagColgroup(impliedTagToken("colgroup")) 1974 if not ignoreEndTag: 1975 return True 1976 1977 def processCharacters(self, token): 1978 ignoreEndTag = self.ignoreEndTagColgroup() 1979 self.endTagColgroup(impliedTagToken("colgroup")) 1980 if not ignoreEndTag: 1981 return token 1982 1983 def startTagCol(self, token): 1984 self.tree.insertElement(token) 1985 self.tree.openElements.pop() 1986 token["selfClosingAcknowledged"] = True 1987 1988 def startTagOther(self, token): 1989 ignoreEndTag = self.ignoreEndTagColgroup() 1990 self.endTagColgroup(impliedTagToken("colgroup")) 1991 if not ignoreEndTag: 1992 return token 1993 1994 def endTagColgroup(self, token): 1995 if self.ignoreEndTagColgroup(): 1996 # innerHTML case 1997 assert self.parser.innerHTML 1998 self.parser.parseError() 1999 else: 2000 self.tree.openElements.pop() 2001 self.parser.phase = self.parser.phases["inTable"] 2002 2003 def endTagCol(self, token): 2004 self.parser.parseError("no-end-tag", {"name": "col"}) 2005 2006 def endTagOther(self, token): 2007 ignoreEndTag = self.ignoreEndTagColgroup() 2008 self.endTagColgroup(impliedTagToken("colgroup")) 2009 if not ignoreEndTag: 2010 return token 2011 2012 startTagHandler = _utils.MethodDispatcher([ 2013 ("html", Phase.startTagHtml), 2014 ("col", startTagCol) 2015 ]) 2016 startTagHandler.default = startTagOther 2017 2018 endTagHandler = _utils.MethodDispatcher([ 2019 ("colgroup", endTagColgroup), 2020 ("col", endTagCol) 2021 ]) 2022 endTagHandler.default = endTagOther 2023 2024 class InTableBodyPhase(Phase): 2025 # http://www.whatwg.org/specs/web-apps/current-work/#in-table0 2026 __slots__ = tuple() 2027 2028 # helper methods 2029 def clearStackToTableBodyContext(self): 2030 while self.tree.openElements[-1].name not in ("tbody", "tfoot", 2031 "thead", "html"): 2032 # self.parser.parseError("unexpected-implied-end-tag-in-table", 2033 # {"name": self.tree.openElements[-1].name}) 2034 self.tree.openElements.pop() 2035 if self.tree.openElements[-1].name == "html": 2036 assert self.parser.innerHTML 2037 2038 # the rest 2039 def processEOF(self): 2040 self.parser.phases["inTable"].processEOF() 2041 2042 def processSpaceCharacters(self, token): 2043 return self.parser.phases["inTable"].processSpaceCharacters(token) 2044 2045 def processCharacters(self, token): 2046 return self.parser.phases["inTable"].processCharacters(token) 2047 2048 def startTagTr(self, token): 2049 self.clearStackToTableBodyContext() 2050 self.tree.insertElement(token) 2051 self.parser.phase = self.parser.phases["inRow"] 2052 2053 def startTagTableCell(self, token): 2054 self.parser.parseError("unexpected-cell-in-table-body", 2055 {"name": token["name"]}) 2056 self.startTagTr(impliedTagToken("tr", "StartTag")) 2057 return token 2058 2059 def startTagTableOther(self, token): 2060 # XXX AT Any ideas on how to share this with endTagTable? 2061 if (self.tree.elementInScope("tbody", variant="table") or 2062 self.tree.elementInScope("thead", variant="table") or 2063 self.tree.elementInScope("tfoot", variant="table")): 2064 self.clearStackToTableBodyContext() 2065 self.endTagTableRowGroup( 2066 impliedTagToken(self.tree.openElements[-1].name)) 2067 return token 2068 else: 2069 # innerHTML case 2070 assert self.parser.innerHTML 2071 self.parser.parseError() 2072 2073 def startTagOther(self, token): 2074 return self.parser.phases["inTable"].processStartTag(token) 2075 2076 def endTagTableRowGroup(self, token): 2077 if self.tree.elementInScope(token["name"], variant="table"): 2078 self.clearStackToTableBodyContext() 2079 self.tree.openElements.pop() 2080 self.parser.phase = self.parser.phases["inTable"] 2081 else: 2082 self.parser.parseError("unexpected-end-tag-in-table-body", 2083 {"name": token["name"]}) 2084 2085 def endTagTable(self, token): 2086 if (self.tree.elementInScope("tbody", variant="table") or 2087 self.tree.elementInScope("thead", variant="table") or 2088 self.tree.elementInScope("tfoot", variant="table")): 2089 self.clearStackToTableBodyContext() 2090 self.endTagTableRowGroup( 2091 impliedTagToken(self.tree.openElements[-1].name)) 2092 return token 2093 else: 2094 # innerHTML case 2095 assert self.parser.innerHTML 2096 self.parser.parseError() 2097 2098 def endTagIgnore(self, token): 2099 self.parser.parseError("unexpected-end-tag-in-table-body", 2100 {"name": token["name"]}) 2101 2102 def endTagOther(self, token): 2103 return self.parser.phases["inTable"].processEndTag(token) 2104 2105 startTagHandler = _utils.MethodDispatcher([ 2106 ("html", Phase.startTagHtml), 2107 ("tr", startTagTr), 2108 (("td", "th"), startTagTableCell), 2109 (("caption", "col", "colgroup", "tbody", "tfoot", "thead"), 2110 startTagTableOther) 2111 ]) 2112 startTagHandler.default = startTagOther 2113 2114 endTagHandler = _utils.MethodDispatcher([ 2115 (("tbody", "tfoot", "thead"), endTagTableRowGroup), 2116 ("table", endTagTable), 2117 (("body", "caption", "col", "colgroup", "html", "td", "th", 2118 "tr"), endTagIgnore) 2119 ]) 2120 endTagHandler.default = endTagOther 2121 2122 class InRowPhase(Phase): 2123 # http://www.whatwg.org/specs/web-apps/current-work/#in-row 2124 __slots__ = tuple() 2125 2126 # helper methods (XXX unify this with other table helper methods) 2127 def clearStackToTableRowContext(self): 2128 while self.tree.openElements[-1].name not in ("tr", "html"): 2129 self.parser.parseError("unexpected-implied-end-tag-in-table-row", 2130 {"name": self.tree.openElements[-1].name}) 2131 self.tree.openElements.pop() 2132 2133 def ignoreEndTagTr(self): 2134 return not self.tree.elementInScope("tr", variant="table") 2135 2136 # the rest 2137 def processEOF(self): 2138 self.parser.phases["inTable"].processEOF() 2139 2140 def processSpaceCharacters(self, token): 2141 return self.parser.phases["inTable"].processSpaceCharacters(token) 2142 2143 def processCharacters(self, token): 2144 return self.parser.phases["inTable"].processCharacters(token) 2145 2146 def startTagTableCell(self, token): 2147 self.clearStackToTableRowContext() 2148 self.tree.insertElement(token) 2149 self.parser.phase = self.parser.phases["inCell"] 2150 self.tree.activeFormattingElements.append(Marker) 2151 2152 def startTagTableOther(self, token): 2153 ignoreEndTag = self.ignoreEndTagTr() 2154 self.endTagTr(impliedTagToken("tr")) 2155 # XXX how are we sure it's always ignored in the innerHTML case? 2156 if not ignoreEndTag: 2157 return token 2158 2159 def startTagOther(self, token): 2160 return self.parser.phases["inTable"].processStartTag(token) 2161 2162 def endTagTr(self, token): 2163 if not self.ignoreEndTagTr(): 2164 self.clearStackToTableRowContext() 2165 self.tree.openElements.pop() 2166 self.parser.phase = self.parser.phases["inTableBody"] 2167 else: 2168 # innerHTML case 2169 assert self.parser.innerHTML 2170 self.parser.parseError() 2171 2172 def endTagTable(self, token): 2173 ignoreEndTag = self.ignoreEndTagTr() 2174 self.endTagTr(impliedTagToken("tr")) 2175 # Reprocess the current tag if the tr end tag was not ignored 2176 # XXX how are we sure it's always ignored in the innerHTML case? 2177 if not ignoreEndTag: 2178 return token 2179 2180 def endTagTableRowGroup(self, token): 2181 if self.tree.elementInScope(token["name"], variant="table"): 2182 self.endTagTr(impliedTagToken("tr")) 2183 return token 2184 else: 2185 self.parser.parseError() 2186 2187 def endTagIgnore(self, token): 2188 self.parser.parseError("unexpected-end-tag-in-table-row", 2189 {"name": token["name"]}) 2190 2191 def endTagOther(self, token): 2192 return self.parser.phases["inTable"].processEndTag(token) 2193 2194 startTagHandler = _utils.MethodDispatcher([ 2195 ("html", Phase.startTagHtml), 2196 (("td", "th"), startTagTableCell), 2197 (("caption", "col", "colgroup", "tbody", "tfoot", "thead", 2198 "tr"), startTagTableOther) 2199 ]) 2200 startTagHandler.default = startTagOther 2201 2202 endTagHandler = _utils.MethodDispatcher([ 2203 ("tr", endTagTr), 2204 ("table", endTagTable), 2205 (("tbody", "tfoot", "thead"), endTagTableRowGroup), 2206 (("body", "caption", "col", "colgroup", "html", "td", "th"), 2207 endTagIgnore) 2208 ]) 2209 endTagHandler.default = endTagOther 2210 2211 class InCellPhase(Phase): 2212 # http://www.whatwg.org/specs/web-apps/current-work/#in-cell 2213 __slots__ = tuple() 2214 2215 # helper 2216 def closeCell(self): 2217 if self.tree.elementInScope("td", variant="table"): 2218 self.endTagTableCell(impliedTagToken("td")) 2219 elif self.tree.elementInScope("th", variant="table"): 2220 self.endTagTableCell(impliedTagToken("th")) 2221 2222 # the rest 2223 def processEOF(self): 2224 self.parser.phases["inBody"].processEOF() 2225 2226 def processCharacters(self, token): 2227 return self.parser.phases["inBody"].processCharacters(token) 2228 2229 def startTagTableOther(self, token): 2230 if (self.tree.elementInScope("td", variant="table") or 2231 self.tree.elementInScope("th", variant="table")): 2232 self.closeCell() 2233 return token 2234 else: 2235 # innerHTML case 2236 assert self.parser.innerHTML 2237 self.parser.parseError() 2238 2239 def startTagOther(self, token): 2240 return self.parser.phases["inBody"].processStartTag(token) 2241 2242 def endTagTableCell(self, token): 2243 if self.tree.elementInScope(token["name"], variant="table"): 2244 self.tree.generateImpliedEndTags(token["name"]) 2245 if self.tree.openElements[-1].name != token["name"]: 2246 self.parser.parseError("unexpected-cell-end-tag", 2247 {"name": token["name"]}) 2248 while True: 2249 node = self.tree.openElements.pop() 2250 if node.name == token["name"]: 2251 break 2252 else: 2253 self.tree.openElements.pop() 2254 self.tree.clearActiveFormattingElements() 2255 self.parser.phase = self.parser.phases["inRow"] 2256 else: 2257 self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) 2258 2259 def endTagIgnore(self, token): 2260 self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) 2261 2262 def endTagImply(self, token): 2263 if self.tree.elementInScope(token["name"], variant="table"): 2264 self.closeCell() 2265 return token 2266 else: 2267 # sometimes innerHTML case 2268 self.parser.parseError() 2269 2270 def endTagOther(self, token): 2271 return self.parser.phases["inBody"].processEndTag(token) 2272 2273 startTagHandler = _utils.MethodDispatcher([ 2274 ("html", Phase.startTagHtml), 2275 (("caption", "col", "colgroup", "tbody", "td", "tfoot", "th", 2276 "thead", "tr"), startTagTableOther) 2277 ]) 2278 startTagHandler.default = startTagOther 2279 2280 endTagHandler = _utils.MethodDispatcher([ 2281 (("td", "th"), endTagTableCell), 2282 (("body", "caption", "col", "colgroup", "html"), endTagIgnore), 2283 (("table", "tbody", "tfoot", "thead", "tr"), endTagImply) 2284 ]) 2285 endTagHandler.default = endTagOther 2286 2287 class InSelectPhase(Phase): 2288 __slots__ = tuple() 2289 2290 # http://www.whatwg.org/specs/web-apps/current-work/#in-select 2291 def processEOF(self): 2292 if self.tree.openElements[-1].name != "html": 2293 self.parser.parseError("eof-in-select") 2294 else: 2295 assert self.parser.innerHTML 2296 2297 def processCharacters(self, token): 2298 if token["data"] == "\u0000": 2299 return 2300 self.tree.insertText(token["data"]) 2301 2302 def startTagOption(self, token): 2303 # We need to imply </option> if <option> is the current node. 2304 if self.tree.openElements[-1].name == "option": 2305 self.tree.openElements.pop() 2306 self.tree.insertElement(token) 2307 2308 def startTagOptgroup(self, token): 2309 if self.tree.openElements[-1].name == "option": 2310 self.tree.openElements.pop() 2311 if self.tree.openElements[-1].name == "optgroup": 2312 self.tree.openElements.pop() 2313 self.tree.insertElement(token) 2314 2315 def startTagSelect(self, token): 2316 self.parser.parseError("unexpected-select-in-select") 2317 self.endTagSelect(impliedTagToken("select")) 2318 2319 def startTagInput(self, token): 2320 self.parser.parseError("unexpected-input-in-select") 2321 if self.tree.elementInScope("select", variant="select"): 2322 self.endTagSelect(impliedTagToken("select")) 2323 return token 2324 else: 2325 assert self.parser.innerHTML 2326 2327 def startTagScript(self, token): 2328 return self.parser.phases["inHead"].processStartTag(token) 2329 2330 def startTagOther(self, token): 2331 self.parser.parseError("unexpected-start-tag-in-select", 2332 {"name": token["name"]}) 2333 2334 def endTagOption(self, token): 2335 if self.tree.openElements[-1].name == "option": 2336 self.tree.openElements.pop() 2337 else: 2338 self.parser.parseError("unexpected-end-tag-in-select", 2339 {"name": "option"}) 2340 2341 def endTagOptgroup(self, token): 2342 # </optgroup> implicitly closes <option> 2343 if (self.tree.openElements[-1].name == "option" and 2344 self.tree.openElements[-2].name == "optgroup"): 2345 self.tree.openElements.pop() 2346 # It also closes </optgroup> 2347 if self.tree.openElements[-1].name == "optgroup": 2348 self.tree.openElements.pop() 2349 # But nothing else 2350 else: 2351 self.parser.parseError("unexpected-end-tag-in-select", 2352 {"name": "optgroup"}) 2353 2354 def endTagSelect(self, token): 2355 if self.tree.elementInScope("select", variant="select"): 2356 node = self.tree.openElements.pop() 2357 while node.name != "select": 2358 node = self.tree.openElements.pop() 2359 self.parser.resetInsertionMode() 2360 else: 2361 # innerHTML case 2362 assert self.parser.innerHTML 2363 self.parser.parseError() 2364 2365 def endTagOther(self, token): 2366 self.parser.parseError("unexpected-end-tag-in-select", 2367 {"name": token["name"]}) 2368 2369 startTagHandler = _utils.MethodDispatcher([ 2370 ("html", Phase.startTagHtml), 2371 ("option", startTagOption), 2372 ("optgroup", startTagOptgroup), 2373 ("select", startTagSelect), 2374 (("input", "keygen", "textarea"), startTagInput), 2375 ("script", startTagScript) 2376 ]) 2377 startTagHandler.default = startTagOther 2378 2379 endTagHandler = _utils.MethodDispatcher([ 2380 ("option", endTagOption), 2381 ("optgroup", endTagOptgroup), 2382 ("select", endTagSelect) 2383 ]) 2384 endTagHandler.default = endTagOther 2385 2386 class InSelectInTablePhase(Phase): 2387 __slots__ = tuple() 2388 2389 def processEOF(self): 2390 self.parser.phases["inSelect"].processEOF() 2391 2392 def processCharacters(self, token): 2393 return self.parser.phases["inSelect"].processCharacters(token) 2394 2395 def startTagTable(self, token): 2396 self.parser.parseError("unexpected-table-element-start-tag-in-select-in-table", {"name": token["name"]}) 2397 self.endTagOther(impliedTagToken("select")) 2398 return token 2399 2400 def startTagOther(self, token): 2401 return self.parser.phases["inSelect"].processStartTag(token) 2402 2403 def endTagTable(self, token): 2404 self.parser.parseError("unexpected-table-element-end-tag-in-select-in-table", {"name": token["name"]}) 2405 if self.tree.elementInScope(token["name"], variant="table"): 2406 self.endTagOther(impliedTagToken("select")) 2407 return token 2408 2409 def endTagOther(self, token): 2410 return self.parser.phases["inSelect"].processEndTag(token) 2411 2412 startTagHandler = _utils.MethodDispatcher([ 2413 (("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"), 2414 startTagTable) 2415 ]) 2416 startTagHandler.default = startTagOther 2417 2418 endTagHandler = _utils.MethodDispatcher([ 2419 (("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"), 2420 endTagTable) 2421 ]) 2422 endTagHandler.default = endTagOther 2423 2424 class InForeignContentPhase(Phase): 2425 __slots__ = tuple() 2426 2427 breakoutElements = frozenset(["b", "big", "blockquote", "body", "br", 2428 "center", "code", "dd", "div", "dl", "dt", 2429 "em", "embed", "h1", "h2", "h3", 2430 "h4", "h5", "h6", "head", "hr", "i", "img", 2431 "li", "listing", "menu", "meta", "nobr", 2432 "ol", "p", "pre", "ruby", "s", "small", 2433 "span", "strong", "strike", "sub", "sup", 2434 "table", "tt", "u", "ul", "var"]) 2435 2436 def adjustSVGTagNames(self, token): 2437 replacements = {"altglyph": "altGlyph", 2438 "altglyphdef": "altGlyphDef", 2439 "altglyphitem": "altGlyphItem", 2440 "animatecolor": "animateColor", 2441 "animatemotion": "animateMotion", 2442 "animatetransform": "animateTransform", 2443 "clippath": "clipPath", 2444 "feblend": "feBlend", 2445 "fecolormatrix": "feColorMatrix", 2446 "fecomponenttransfer": "feComponentTransfer", 2447 "fecomposite": "feComposite", 2448 "feconvolvematrix": "feConvolveMatrix", 2449 "fediffuselighting": "feDiffuseLighting", 2450 "fedisplacementmap": "feDisplacementMap", 2451 "fedistantlight": "feDistantLight", 2452 "feflood": "feFlood", 2453 "fefunca": "feFuncA", 2454 "fefuncb": "feFuncB", 2455 "fefuncg": "feFuncG", 2456 "fefuncr": "feFuncR", 2457 "fegaussianblur": "feGaussianBlur", 2458 "feimage": "feImage", 2459 "femerge": "feMerge", 2460 "femergenode": "feMergeNode", 2461 "femorphology": "feMorphology", 2462 "feoffset": "feOffset", 2463 "fepointlight": "fePointLight", 2464 "fespecularlighting": "feSpecularLighting", 2465 "fespotlight": "feSpotLight", 2466 "fetile": "feTile", 2467 "feturbulence": "feTurbulence", 2468 "foreignobject": "foreignObject", 2469 "glyphref": "glyphRef", 2470 "lineargradient": "linearGradient", 2471 "radialgradient": "radialGradient", 2472 "textpath": "textPath"} 2473 2474 if token["name"] in replacements: 2475 token["name"] = replacements[token["name"]] 2476 2477 def processCharacters(self, token): 2478 if token["data"] == "\u0000": 2479 token["data"] = "\uFFFD" 2480 elif (self.parser.framesetOK and 2481 any(char not in spaceCharacters for char in token["data"])): 2482 self.parser.framesetOK = False 2483 Phase.processCharacters(self, token) 2484 2485 def processStartTag(self, token): 2486 currentNode = self.tree.openElements[-1] 2487 if (token["name"] in self.breakoutElements or 2488 (token["name"] == "font" and 2489 set(token["data"].keys()) & {"color", "face", "size"})): 2490 self.parser.parseError("unexpected-html-element-in-foreign-content", 2491 {"name": token["name"]}) 2492 while (self.tree.openElements[-1].namespace != 2493 self.tree.defaultNamespace and 2494 not self.parser.isHTMLIntegrationPoint(self.tree.openElements[-1]) and 2495 not self.parser.isMathMLTextIntegrationPoint(self.tree.openElements[-1])): 2496 self.tree.openElements.pop() 2497 return token 2498 2499 else: 2500 if currentNode.namespace == namespaces["mathml"]: 2501 self.parser.adjustMathMLAttributes(token) 2502 elif currentNode.namespace == namespaces["svg"]: 2503 self.adjustSVGTagNames(token) 2504 self.parser.adjustSVGAttributes(token) 2505 self.parser.adjustForeignAttributes(token) 2506 token["namespace"] = currentNode.namespace 2507 self.tree.insertElement(token) 2508 if token["selfClosing"]: 2509 self.tree.openElements.pop() 2510 token["selfClosingAcknowledged"] = True 2511 2512 def processEndTag(self, token): 2513 nodeIndex = len(self.tree.openElements) - 1 2514 node = self.tree.openElements[-1] 2515 if node.name.translate(asciiUpper2Lower) != token["name"]: 2516 self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) 2517 2518 while True: 2519 if node.name.translate(asciiUpper2Lower) == token["name"]: 2520 # XXX this isn't in the spec but it seems necessary 2521 if self.parser.phase == self.parser.phases["inTableText"]: 2522 self.parser.phase.flushCharacters() 2523 self.parser.phase = self.parser.phase.originalPhase 2524 while self.tree.openElements.pop() != node: 2525 assert self.tree.openElements 2526 new_token = None 2527 break 2528 nodeIndex -= 1 2529 2530 node = self.tree.openElements[nodeIndex] 2531 if node.namespace != self.tree.defaultNamespace: 2532 continue 2533 else: 2534 new_token = self.parser.phase.processEndTag(token) 2535 break 2536 return new_token 2537 2538 class AfterBodyPhase(Phase): 2539 __slots__ = tuple() 2540 2541 def processEOF(self): 2542 # Stop parsing 2543 pass 2544 2545 def processComment(self, token): 2546 # This is needed because data is to be appended to the <html> element 2547 # here and not to whatever is currently open. 2548 self.tree.insertComment(token, self.tree.openElements[0]) 2549 2550 def processCharacters(self, token): 2551 self.parser.parseError("unexpected-char-after-body") 2552 self.parser.phase = self.parser.phases["inBody"] 2553 return token 2554 2555 def startTagHtml(self, token): 2556 return self.parser.phases["inBody"].processStartTag(token) 2557 2558 def startTagOther(self, token): 2559 self.parser.parseError("unexpected-start-tag-after-body", 2560 {"name": token["name"]}) 2561 self.parser.phase = self.parser.phases["inBody"] 2562 return token 2563 2564 def endTagHtml(self, name): 2565 if self.parser.innerHTML: 2566 self.parser.parseError("unexpected-end-tag-after-body-innerhtml") 2567 else: 2568 self.parser.phase = self.parser.phases["afterAfterBody"] 2569 2570 def endTagOther(self, token): 2571 self.parser.parseError("unexpected-end-tag-after-body", 2572 {"name": token["name"]}) 2573 self.parser.phase = self.parser.phases["inBody"] 2574 return token 2575 2576 startTagHandler = _utils.MethodDispatcher([ 2577 ("html", startTagHtml) 2578 ]) 2579 startTagHandler.default = startTagOther 2580 2581 endTagHandler = _utils.MethodDispatcher([("html", endTagHtml)]) 2582 endTagHandler.default = endTagOther 2583 2584 class InFramesetPhase(Phase): 2585 # http://www.whatwg.org/specs/web-apps/current-work/#in-frameset 2586 __slots__ = tuple() 2587 2588 def processEOF(self): 2589 if self.tree.openElements[-1].name != "html": 2590 self.parser.parseError("eof-in-frameset") 2591 else: 2592 assert self.parser.innerHTML 2593 2594 def processCharacters(self, token): 2595 self.parser.parseError("unexpected-char-in-frameset") 2596 2597 def startTagFrameset(self, token): 2598 self.tree.insertElement(token) 2599 2600 def startTagFrame(self, token): 2601 self.tree.insertElement(token) 2602 self.tree.openElements.pop() 2603 2604 def startTagNoframes(self, token): 2605 return self.parser.phases["inBody"].processStartTag(token) 2606 2607 def startTagOther(self, token): 2608 self.parser.parseError("unexpected-start-tag-in-frameset", 2609 {"name": token["name"]}) 2610 2611 def endTagFrameset(self, token): 2612 if self.tree.openElements[-1].name == "html": 2613 # innerHTML case 2614 self.parser.parseError("unexpected-frameset-in-frameset-innerhtml") 2615 else: 2616 self.tree.openElements.pop() 2617 if (not self.parser.innerHTML and 2618 self.tree.openElements[-1].name != "frameset"): 2619 # If we're not in innerHTML mode and the current node is not a 2620 # "frameset" element (anymore) then switch. 2621 self.parser.phase = self.parser.phases["afterFrameset"] 2622 2623 def endTagOther(self, token): 2624 self.parser.parseError("unexpected-end-tag-in-frameset", 2625 {"name": token["name"]}) 2626 2627 startTagHandler = _utils.MethodDispatcher([ 2628 ("html", Phase.startTagHtml), 2629 ("frameset", startTagFrameset), 2630 ("frame", startTagFrame), 2631 ("noframes", startTagNoframes) 2632 ]) 2633 startTagHandler.default = startTagOther 2634 2635 endTagHandler = _utils.MethodDispatcher([ 2636 ("frameset", endTagFrameset) 2637 ]) 2638 endTagHandler.default = endTagOther 2639 2640 class AfterFramesetPhase(Phase): 2641 # http://www.whatwg.org/specs/web-apps/current-work/#after3 2642 __slots__ = tuple() 2643 2644 def processEOF(self): 2645 # Stop parsing 2646 pass 2647 2648 def processCharacters(self, token): 2649 self.parser.parseError("unexpected-char-after-frameset") 2650 2651 def startTagNoframes(self, token): 2652 return self.parser.phases["inHead"].processStartTag(token) 2653 2654 def startTagOther(self, token): 2655 self.parser.parseError("unexpected-start-tag-after-frameset", 2656 {"name": token["name"]}) 2657 2658 def endTagHtml(self, token): 2659 self.parser.phase = self.parser.phases["afterAfterFrameset"] 2660 2661 def endTagOther(self, token): 2662 self.parser.parseError("unexpected-end-tag-after-frameset", 2663 {"name": token["name"]}) 2664 2665 startTagHandler = _utils.MethodDispatcher([ 2666 ("html", Phase.startTagHtml), 2667 ("noframes", startTagNoframes) 2668 ]) 2669 startTagHandler.default = startTagOther 2670 2671 endTagHandler = _utils.MethodDispatcher([ 2672 ("html", endTagHtml) 2673 ]) 2674 endTagHandler.default = endTagOther 2675 2676 class AfterAfterBodyPhase(Phase): 2677 __slots__ = tuple() 2678 2679 def processEOF(self): 2680 pass 2681 2682 def processComment(self, token): 2683 self.tree.insertComment(token, self.tree.document) 2684 2685 def processSpaceCharacters(self, token): 2686 return self.parser.phases["inBody"].processSpaceCharacters(token) 2687 2688 def processCharacters(self, token): 2689 self.parser.parseError("expected-eof-but-got-char") 2690 self.parser.phase = self.parser.phases["inBody"] 2691 return token 2692 2693 def startTagHtml(self, token): 2694 return self.parser.phases["inBody"].processStartTag(token) 2695 2696 def startTagOther(self, token): 2697 self.parser.parseError("expected-eof-but-got-start-tag", 2698 {"name": token["name"]}) 2699 self.parser.phase = self.parser.phases["inBody"] 2700 return token 2701 2702 def processEndTag(self, token): 2703 self.parser.parseError("expected-eof-but-got-end-tag", 2704 {"name": token["name"]}) 2705 self.parser.phase = self.parser.phases["inBody"] 2706 return token 2707 2708 startTagHandler = _utils.MethodDispatcher([ 2709 ("html", startTagHtml) 2710 ]) 2711 startTagHandler.default = startTagOther 2712 2713 class AfterAfterFramesetPhase(Phase): 2714 __slots__ = tuple() 2715 2716 def processEOF(self): 2717 pass 2718 2719 def processComment(self, token): 2720 self.tree.insertComment(token, self.tree.document) 2721 2722 def processSpaceCharacters(self, token): 2723 return self.parser.phases["inBody"].processSpaceCharacters(token) 2724 2725 def processCharacters(self, token): 2726 self.parser.parseError("expected-eof-but-got-char") 2727 2728 def startTagHtml(self, token): 2729 return self.parser.phases["inBody"].processStartTag(token) 2730 2731 def startTagNoFrames(self, token): 2732 return self.parser.phases["inHead"].processStartTag(token) 2733 2734 def startTagOther(self, token): 2735 self.parser.parseError("expected-eof-but-got-start-tag", 2736 {"name": token["name"]}) 2737 2738 def processEndTag(self, token): 2739 self.parser.parseError("expected-eof-but-got-end-tag", 2740 {"name": token["name"]}) 2741 2742 startTagHandler = _utils.MethodDispatcher([ 2743 ("html", startTagHtml), 2744 ("noframes", startTagNoFrames) 2745 ]) 2746 startTagHandler.default = startTagOther 2747 2748 # pylint:enable=unused-argument 2749 2750 return { 2751 "initial": InitialPhase, 2752 "beforeHtml": BeforeHtmlPhase, 2753 "beforeHead": BeforeHeadPhase, 2754 "inHead": InHeadPhase, 2755 "inHeadNoscript": InHeadNoscriptPhase, 2756 "afterHead": AfterHeadPhase, 2757 "inBody": InBodyPhase, 2758 "text": TextPhase, 2759 "inTable": InTablePhase, 2760 "inTableText": InTableTextPhase, 2761 "inCaption": InCaptionPhase, 2762 "inColumnGroup": InColumnGroupPhase, 2763 "inTableBody": InTableBodyPhase, 2764 "inRow": InRowPhase, 2765 "inCell": InCellPhase, 2766 "inSelect": InSelectPhase, 2767 "inSelectInTable": InSelectInTablePhase, 2768 "inForeignContent": InForeignContentPhase, 2769 "afterBody": AfterBodyPhase, 2770 "inFrameset": InFramesetPhase, 2771 "afterFrameset": AfterFramesetPhase, 2772 "afterAfterBody": AfterAfterBodyPhase, 2773 "afterAfterFrameset": AfterAfterFramesetPhase, 2774 # XXX after after frameset 2775 } 2776 2777 2778def adjust_attributes(token, replacements): 2779 needs_adjustment = viewkeys(token['data']) & viewkeys(replacements) 2780 if needs_adjustment: 2781 token['data'] = type(token['data'])((replacements.get(k, k), v) 2782 for k, v in token['data'].items()) 2783 2784 2785def impliedTagToken(name, type="EndTag", attributes=None, 2786 selfClosing=False): 2787 if attributes is None: 2788 attributes = {} 2789 return {"type": tokenTypes[type], "name": name, "data": attributes, 2790 "selfClosing": selfClosing} 2791 2792 2793class ParseError(Exception): 2794 """Error in parsed document""" 2795 pass 2796