1from __future__ import absolute_import, division, unicode_literals
2from pip._vendor.six import with_metaclass, viewkeys
3
4import types
5
6from . import _inputstream
7from . import _tokenizer
8
9from . import treebuilders
10from .treebuilders.base import Marker
11
12from . import _utils
13from .constants import (
14    spaceCharacters, asciiUpper2Lower,
15    specialElements, headingElements, cdataElements, rcdataElements,
16    tokenTypes, tagTokenTypes,
17    namespaces,
18    htmlIntegrationPointElements, mathmlTextIntegrationPointElements,
19    adjustForeignAttributes as adjustForeignAttributesMap,
20    adjustMathMLAttributes, adjustSVGAttributes,
21    E,
22    _ReparseException
23)
24
25
def parse(doc, treebuilder="etree", namespaceHTMLElements=True, **kwargs):
    """Parse a complete HTML document into a tree

    Convenience wrapper: looks up the requested treebuilder, builds a one-off
    :class:`HTMLParser` around it and parses ``doc`` with that parser.

    :arg doc: the document to parse as a string or file-like object

    :arg treebuilder: name of the treebuilder to use when parsing; it is
        resolved through ``treebuilders.getTreeBuilder``

    :arg namespaceHTMLElements: whether or not to namespace HTML elements

    :arg kwargs: passed through unchanged to :meth:`HTMLParser.parse`

    :returns: parsed tree

    Example:

    >>> from html5lib.html5parser import parse
    >>> parse('<html><body><p>This is a doc</p></body></html>')
    <Element u'{http://www.w3.org/1999/xhtml}html' at 0x7feac4909db0>

    """
    builder_class = treebuilders.getTreeBuilder(treebuilder)
    parser = HTMLParser(builder_class, namespaceHTMLElements=namespaceHTMLElements)
    return parser.parse(doc, **kwargs)
47
48
def parseFragment(doc, container="div", treebuilder="etree", namespaceHTMLElements=True, **kwargs):
    """Parse an HTML fragment into a tree fragment

    Convenience wrapper: looks up the requested treebuilder, builds a one-off
    :class:`HTMLParser` around it and parses ``doc`` as a fragment.

    :arg doc: the fragment to parse as a string or file-like object

    :arg container: the container context to parse the fragment in

    :arg treebuilder: name of the treebuilder to use when parsing; it is
        resolved through ``treebuilders.getTreeBuilder``

    :arg namespaceHTMLElements: whether or not to namespace HTML elements

    :arg kwargs: passed through unchanged to :meth:`HTMLParser.parseFragment`

    :returns: parsed tree

    Example:

    >>> from html5lib.html5parser import parseFragment
    >>> parseFragment('<b>this is a fragment</b>')
    <Element u'DOCUMENT_FRAGMENT' at 0x7feac484b090>

    """
    builder_class = treebuilders.getTreeBuilder(treebuilder)
    parser = HTMLParser(builder_class, namespaceHTMLElements=namespaceHTMLElements)
    return parser.parseFragment(doc, container=container, **kwargs)
72
73
def method_decorator_metaclass(function):
    """Build a metaclass that decorates every plain function of a class

    :arg function: decorator applied to each entry of the class namespace
        that is a ``types.FunctionType``; every other entry is left as it is

    :returns: a ``type`` subclass that can be used as a metaclass
    """
    class Decorated(type):
        def __new__(meta, classname, bases, classDict):
            # Collect the function-valued entries first, then overwrite them
            # in place so the namespace handed to type.__new__ carries the
            # decorated versions.
            plain_functions = [(key, value) for key, value in classDict.items()
                               if isinstance(value, types.FunctionType)]
            for key, value in plain_functions:
                classDict[key] = function(value)
            return type.__new__(meta, classname, bases, classDict)
    return Decorated
84
85
86class HTMLParser(object):
87    """HTML parser
88
89    Generates a tree structure from a stream of (possibly malformed) HTML.
90
91    """
92
93    def __init__(self, tree=None, strict=False, namespaceHTMLElements=True, debug=False):
94        """
95        :arg tree: a treebuilder class controlling the type of tree that will be
96            returned. Built in treebuilders can be accessed through
97            html5lib.treebuilders.getTreeBuilder(treeType)
98
99        :arg strict: raise an exception when a parse error is encountered
100
101        :arg namespaceHTMLElements: whether or not to namespace HTML elements
102
103        :arg debug: whether or not to enable debug mode which logs things
104
105        Example:
106
107        >>> from html5lib.html5parser import HTMLParser
108        >>> parser = HTMLParser()                     # generates parser with etree builder
109        >>> parser = HTMLParser('lxml', strict=True)  # generates parser with lxml builder which is strict
110
111        """
112
113        # Raise an exception on the first error encountered
114        self.strict = strict
115
116        if tree is None:
117            tree = treebuilders.getTreeBuilder("etree")
118        self.tree = tree(namespaceHTMLElements)
119        self.errors = []
120
121        self.phases = {name: cls(self, self.tree) for name, cls in
122                       getPhases(debug).items()}
123
124    def _parse(self, stream, innerHTML=False, container="div", scripting=False, **kwargs):
125
126        self.innerHTMLMode = innerHTML
127        self.container = container
128        self.scripting = scripting
129        self.tokenizer = _tokenizer.HTMLTokenizer(stream, parser=self, **kwargs)
130        self.reset()
131
132        try:
133            self.mainLoop()
134        except _ReparseException:
135            self.reset()
136            self.mainLoop()
137
138    def reset(self):
139        self.tree.reset()
140        self.firstStartTag = False
141        self.errors = []
142        self.log = []  # only used with debug mode
143        # "quirks" / "limited quirks" / "no quirks"
144        self.compatMode = "no quirks"
145
146        if self.innerHTMLMode:
147            self.innerHTML = self.container.lower()
148
149            if self.innerHTML in cdataElements:
150                self.tokenizer.state = self.tokenizer.rcdataState
151            elif self.innerHTML in rcdataElements:
152                self.tokenizer.state = self.tokenizer.rawtextState
153            elif self.innerHTML == 'plaintext':
154                self.tokenizer.state = self.tokenizer.plaintextState
155            else:
156                # state already is data state
157                # self.tokenizer.state = self.tokenizer.dataState
158                pass
159            self.phase = self.phases["beforeHtml"]
160            self.phase.insertHtmlElement()
161            self.resetInsertionMode()
162        else:
163            self.innerHTML = False  # pylint:disable=redefined-variable-type
164            self.phase = self.phases["initial"]
165
166        self.lastPhase = None
167
168        self.beforeRCDataPhase = None
169
170        self.framesetOK = True
171
172    @property
173    def documentEncoding(self):
174        """Name of the character encoding that was used to decode the input stream, or
175        :obj:`None` if that is not determined yet
176
177        """
178        if not hasattr(self, 'tokenizer'):
179            return None
180        return self.tokenizer.stream.charEncoding[0].name
181
182    def isHTMLIntegrationPoint(self, element):
183        if (element.name == "annotation-xml" and
184                element.namespace == namespaces["mathml"]):
185            return ("encoding" in element.attributes and
186                    element.attributes["encoding"].translate(
187                        asciiUpper2Lower) in
188                    ("text/html", "application/xhtml+xml"))
189        else:
190            return (element.namespace, element.name) in htmlIntegrationPointElements
191
192    def isMathMLTextIntegrationPoint(self, element):
193        return (element.namespace, element.name) in mathmlTextIntegrationPointElements
194
195    def mainLoop(self):
196        CharactersToken = tokenTypes["Characters"]
197        SpaceCharactersToken = tokenTypes["SpaceCharacters"]
198        StartTagToken = tokenTypes["StartTag"]
199        EndTagToken = tokenTypes["EndTag"]
200        CommentToken = tokenTypes["Comment"]
201        DoctypeToken = tokenTypes["Doctype"]
202        ParseErrorToken = tokenTypes["ParseError"]
203
204        for token in self.tokenizer:
205            prev_token = None
206            new_token = token
207            while new_token is not None:
208                prev_token = new_token
209                currentNode = self.tree.openElements[-1] if self.tree.openElements else None
210                currentNodeNamespace = currentNode.namespace if currentNode else None
211                currentNodeName = currentNode.name if currentNode else None
212
213                type = new_token["type"]
214
215                if type == ParseErrorToken:
216                    self.parseError(new_token["data"], new_token.get("datavars", {}))
217                    new_token = None
218                else:
219                    if (len(self.tree.openElements) == 0 or
220                        currentNodeNamespace == self.tree.defaultNamespace or
221                        (self.isMathMLTextIntegrationPoint(currentNode) and
222                         ((type == StartTagToken and
223                           token["name"] not in frozenset(["mglyph", "malignmark"])) or
224                          type in (CharactersToken, SpaceCharactersToken))) or
225                        (currentNodeNamespace == namespaces["mathml"] and
226                         currentNodeName == "annotation-xml" and
227                         type == StartTagToken and
228                         token["name"] == "svg") or
229                        (self.isHTMLIntegrationPoint(currentNode) and
230                         type in (StartTagToken, CharactersToken, SpaceCharactersToken))):
231                        phase = self.phase
232                    else:
233                        phase = self.phases["inForeignContent"]
234
235                    if type == CharactersToken:
236                        new_token = phase.processCharacters(new_token)
237                    elif type == SpaceCharactersToken:
238                        new_token = phase.processSpaceCharacters(new_token)
239                    elif type == StartTagToken:
240                        new_token = phase.processStartTag(new_token)
241                    elif type == EndTagToken:
242                        new_token = phase.processEndTag(new_token)
243                    elif type == CommentToken:
244                        new_token = phase.processComment(new_token)
245                    elif type == DoctypeToken:
246                        new_token = phase.processDoctype(new_token)
247
248            if (type == StartTagToken and prev_token["selfClosing"] and
249                    not prev_token["selfClosingAcknowledged"]):
250                self.parseError("non-void-element-with-trailing-solidus",
251                                {"name": prev_token["name"]})
252
253        # When the loop finishes it's EOF
254        reprocess = True
255        phases = []
256        while reprocess:
257            phases.append(self.phase)
258            reprocess = self.phase.processEOF()
259            if reprocess:
260                assert self.phase not in phases
261
262    def parse(self, stream, *args, **kwargs):
263        """Parse a HTML document into a well-formed tree
264
265        :arg stream: a file-like object or string containing the HTML to be parsed
266
267            The optional encoding parameter must be a string that indicates
268            the encoding.  If specified, that encoding will be used,
269            regardless of any BOM or later declaration (such as in a meta
270            element).
271
272        :arg scripting: treat noscript elements as if JavaScript was turned on
273
274        :returns: parsed tree
275
276        Example:
277
278        >>> from html5lib.html5parser import HTMLParser
279        >>> parser = HTMLParser()
280        >>> parser.parse('<html><body><p>This is a doc</p></body></html>')
281        <Element u'{http://www.w3.org/1999/xhtml}html' at 0x7feac4909db0>
282
283        """
284        self._parse(stream, False, None, *args, **kwargs)
285        return self.tree.getDocument()
286
287    def parseFragment(self, stream, *args, **kwargs):
288        """Parse a HTML fragment into a well-formed tree fragment
289
290        :arg container: name of the element we're setting the innerHTML
291            property if set to None, default to 'div'
292
293        :arg stream: a file-like object or string containing the HTML to be parsed
294
295            The optional encoding parameter must be a string that indicates
296            the encoding.  If specified, that encoding will be used,
297            regardless of any BOM or later declaration (such as in a meta
298            element)
299
300        :arg scripting: treat noscript elements as if JavaScript was turned on
301
302        :returns: parsed tree
303
304        Example:
305
306        >>> from html5lib.html5libparser import HTMLParser
307        >>> parser = HTMLParser()
308        >>> parser.parseFragment('<b>this is a fragment</b>')
309        <Element u'DOCUMENT_FRAGMENT' at 0x7feac484b090>
310
311        """
312        self._parse(stream, True, *args, **kwargs)
313        return self.tree.getFragment()
314
315    def parseError(self, errorcode="XXX-undefined-error", datavars=None):
316        # XXX The idea is to make errorcode mandatory.
317        if datavars is None:
318            datavars = {}
319        self.errors.append((self.tokenizer.stream.position(), errorcode, datavars))
320        if self.strict:
321            raise ParseError(E[errorcode] % datavars)
322
    def adjustMathMLAttributes(self, token):
        """Run ``adjust_attributes`` on ``token`` with the MathML table

        Inside the body, ``adjustMathMLAttributes`` is the module-level table
        imported from ``.constants``, not this method.

        NOTE(review): ``adjust_attributes`` is defined outside this view; it
        presumably rewrites attribute names in ``token["data"]`` -- confirm.
        """
        adjust_attributes(token, adjustMathMLAttributes)
325
    def adjustSVGAttributes(self, token):
        """Run ``adjust_attributes`` on ``token`` with the SVG table

        Inside the body, ``adjustSVGAttributes`` is the module-level table
        imported from ``.constants``, not this method.

        NOTE(review): ``adjust_attributes`` is defined outside this view; it
        presumably rewrites attribute names in ``token["data"]`` -- confirm.
        """
        adjust_attributes(token, adjustSVGAttributes)
328
    def adjustForeignAttributes(self, token):
        """Run ``adjust_attributes`` on ``token`` with the foreign-attribute table

        The table is ``constants.adjustForeignAttributes``, imported under the
        alias ``adjustForeignAttributesMap`` so it does not clash with this
        method's name.

        NOTE(review): ``adjust_attributes`` is defined outside this view; it
        presumably rewrites attribute names in ``token["data"]`` -- confirm.
        """
        adjust_attributes(token, adjustForeignAttributesMap)
331
    def reparseTokenNormal(self, token):
        # NOTE(review): this reads ``self.parser``, but nothing visible in
        # this class ever assigns a ``parser`` attribute on HTMLParser (it is
        # the Phase helper objects that carry ``.parser``), so a call would
        # presumably fail with AttributeError. This looks like leftover code
        # -- confirm whether anything still calls it before relying on it.
        # pylint:disable=unused-argument
        self.parser.phase()
335
336    def resetInsertionMode(self):
337        # The name of this method is mostly historical. (It's also used in the
338        # specification.)
339        last = False
340        newModes = {
341            "select": "inSelect",
342            "td": "inCell",
343            "th": "inCell",
344            "tr": "inRow",
345            "tbody": "inTableBody",
346            "thead": "inTableBody",
347            "tfoot": "inTableBody",
348            "caption": "inCaption",
349            "colgroup": "inColumnGroup",
350            "table": "inTable",
351            "head": "inBody",
352            "body": "inBody",
353            "frameset": "inFrameset",
354            "html": "beforeHead"
355        }
356        for node in self.tree.openElements[::-1]:
357            nodeName = node.name
358            new_phase = None
359            if node == self.tree.openElements[0]:
360                assert self.innerHTML
361                last = True
362                nodeName = self.innerHTML
363            # Check for conditions that should only happen in the innerHTML
364            # case
365            if nodeName in ("select", "colgroup", "head", "html"):
366                assert self.innerHTML
367
368            if not last and node.namespace != self.tree.defaultNamespace:
369                continue
370
371            if nodeName in newModes:
372                new_phase = self.phases[newModes[nodeName]]
373                break
374            elif last:
375                new_phase = self.phases["inBody"]
376                break
377
378        self.phase = new_phase
379
380    def parseRCDataRawtext(self, token, contentType):
381        # Generic RCDATA/RAWTEXT Parsing algorithm
382        assert contentType in ("RAWTEXT", "RCDATA")
383
384        self.tree.insertElement(token)
385
386        if contentType == "RAWTEXT":
387            self.tokenizer.state = self.tokenizer.rawtextState
388        else:
389            self.tokenizer.state = self.tokenizer.rcdataState
390
391        self.originalPhase = self.phase
392
393        self.phase = self.phases["text"]
394
395
396@_utils.memoize
397def getPhases(debug):
398    def log(function):
399        """Logger that records which phase processes each token"""
400        type_names = {value: key for key, value in tokenTypes.items()}
401
402        def wrapped(self, *args, **kwargs):
403            if function.__name__.startswith("process") and len(args) > 0:
404                token = args[0]
405                info = {"type": type_names[token['type']]}
406                if token['type'] in tagTokenTypes:
407                    info["name"] = token['name']
408
409                self.parser.log.append((self.parser.tokenizer.state.__name__,
410                                        self.parser.phase.__class__.__name__,
411                                        self.__class__.__name__,
412                                        function.__name__,
413                                        info))
414                return function(self, *args, **kwargs)
415            else:
416                return function(self, *args, **kwargs)
417        return wrapped
418
419    def getMetaclass(use_metaclass, metaclass_func):
420        if use_metaclass:
421            return method_decorator_metaclass(metaclass_func)
422        else:
423            return type
424
425    # pylint:disable=unused-argument
426    class Phase(with_metaclass(getMetaclass(debug, log))):
427        """Base class for helper object that implements each phase of processing
428        """
429        __slots__ = ("parser", "tree", "__startTagCache", "__endTagCache")
430
431        def __init__(self, parser, tree):
432            self.parser = parser
433            self.tree = tree
434            self.__startTagCache = {}
435            self.__endTagCache = {}
436
437        def processEOF(self):
438            raise NotImplementedError
439
440        def processComment(self, token):
441            # For most phases the following is correct. Where it's not it will be
442            # overridden.
443            self.tree.insertComment(token, self.tree.openElements[-1])
444
445        def processDoctype(self, token):
446            self.parser.parseError("unexpected-doctype")
447
448        def processCharacters(self, token):
449            self.tree.insertText(token["data"])
450
451        def processSpaceCharacters(self, token):
452            self.tree.insertText(token["data"])
453
454        def processStartTag(self, token):
455            # Note the caching is done here rather than BoundMethodDispatcher as doing it there
456            # requires a circular reference to the Phase, and this ends up with a significant
457            # (CPython 2.7, 3.8) GC cost when parsing many short inputs
458            name = token["name"]
459            # In Py2, using `in` is quicker in general than try/except KeyError
460            # In Py3, `in` is quicker when there are few cache hits (typically short inputs)
461            if name in self.__startTagCache:
462                func = self.__startTagCache[name]
463            else:
464                func = self.__startTagCache[name] = self.startTagHandler[name]
465                # bound the cache size in case we get loads of unknown tags
466                while len(self.__startTagCache) > len(self.startTagHandler) * 1.1:
467                    # this makes the eviction policy random on Py < 3.7 and FIFO >= 3.7
468                    self.__startTagCache.pop(next(iter(self.__startTagCache)))
469            return func(token)
470
471        def startTagHtml(self, token):
472            if not self.parser.firstStartTag and token["name"] == "html":
473                self.parser.parseError("non-html-root")
474            # XXX Need a check here to see if the first start tag token emitted is
475            # this token... If it's not, invoke self.parser.parseError().
476            for attr, value in token["data"].items():
477                if attr not in self.tree.openElements[0].attributes:
478                    self.tree.openElements[0].attributes[attr] = value
479            self.parser.firstStartTag = False
480
481        def processEndTag(self, token):
482            # Note the caching is done here rather than BoundMethodDispatcher as doing it there
483            # requires a circular reference to the Phase, and this ends up with a significant
484            # (CPython 2.7, 3.8) GC cost when parsing many short inputs
485            name = token["name"]
486            # In Py2, using `in` is quicker in general than try/except KeyError
487            # In Py3, `in` is quicker when there are few cache hits (typically short inputs)
488            if name in self.__endTagCache:
489                func = self.__endTagCache[name]
490            else:
491                func = self.__endTagCache[name] = self.endTagHandler[name]
492                # bound the cache size in case we get loads of unknown tags
493                while len(self.__endTagCache) > len(self.endTagHandler) * 1.1:
494                    # this makes the eviction policy random on Py < 3.7 and FIFO >= 3.7
495                    self.__endTagCache.pop(next(iter(self.__endTagCache)))
496            return func(token)
497
498    class InitialPhase(Phase):
499        __slots__ = tuple()
500
501        def processSpaceCharacters(self, token):
502            pass
503
504        def processComment(self, token):
505            self.tree.insertComment(token, self.tree.document)
506
507        def processDoctype(self, token):
508            name = token["name"]
509            publicId = token["publicId"]
510            systemId = token["systemId"]
511            correct = token["correct"]
512
513            if (name != "html" or publicId is not None or
514                    systemId is not None and systemId != "about:legacy-compat"):
515                self.parser.parseError("unknown-doctype")
516
517            if publicId is None:
518                publicId = ""
519
520            self.tree.insertDoctype(token)
521
522            if publicId != "":
523                publicId = publicId.translate(asciiUpper2Lower)
524
525            if (not correct or token["name"] != "html" or
526                    publicId.startswith(
527                        ("+//silmaril//dtd html pro v0r11 19970101//",
528                         "-//advasoft ltd//dtd html 3.0 aswedit + extensions//",
529                         "-//as//dtd html 3.0 aswedit + extensions//",
530                         "-//ietf//dtd html 2.0 level 1//",
531                         "-//ietf//dtd html 2.0 level 2//",
532                         "-//ietf//dtd html 2.0 strict level 1//",
533                         "-//ietf//dtd html 2.0 strict level 2//",
534                         "-//ietf//dtd html 2.0 strict//",
535                         "-//ietf//dtd html 2.0//",
536                         "-//ietf//dtd html 2.1e//",
537                         "-//ietf//dtd html 3.0//",
538                         "-//ietf//dtd html 3.2 final//",
539                         "-//ietf//dtd html 3.2//",
540                         "-//ietf//dtd html 3//",
541                         "-//ietf//dtd html level 0//",
542                         "-//ietf//dtd html level 1//",
543                         "-//ietf//dtd html level 2//",
544                         "-//ietf//dtd html level 3//",
545                         "-//ietf//dtd html strict level 0//",
546                         "-//ietf//dtd html strict level 1//",
547                         "-//ietf//dtd html strict level 2//",
548                         "-//ietf//dtd html strict level 3//",
549                         "-//ietf//dtd html strict//",
550                         "-//ietf//dtd html//",
551                         "-//metrius//dtd metrius presentational//",
552                         "-//microsoft//dtd internet explorer 2.0 html strict//",
553                         "-//microsoft//dtd internet explorer 2.0 html//",
554                         "-//microsoft//dtd internet explorer 2.0 tables//",
555                         "-//microsoft//dtd internet explorer 3.0 html strict//",
556                         "-//microsoft//dtd internet explorer 3.0 html//",
557                         "-//microsoft//dtd internet explorer 3.0 tables//",
558                         "-//netscape comm. corp.//dtd html//",
559                         "-//netscape comm. corp.//dtd strict html//",
560                         "-//o'reilly and associates//dtd html 2.0//",
561                         "-//o'reilly and associates//dtd html extended 1.0//",
562                         "-//o'reilly and associates//dtd html extended relaxed 1.0//",
563                         "-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//",
564                         "-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//",
565                         "-//spyglass//dtd html 2.0 extended//",
566                         "-//sq//dtd html 2.0 hotmetal + extensions//",
567                         "-//sun microsystems corp.//dtd hotjava html//",
568                         "-//sun microsystems corp.//dtd hotjava strict html//",
569                         "-//w3c//dtd html 3 1995-03-24//",
570                         "-//w3c//dtd html 3.2 draft//",
571                         "-//w3c//dtd html 3.2 final//",
572                         "-//w3c//dtd html 3.2//",
573                         "-//w3c//dtd html 3.2s draft//",
574                         "-//w3c//dtd html 4.0 frameset//",
575                         "-//w3c//dtd html 4.0 transitional//",
576                         "-//w3c//dtd html experimental 19960712//",
577                         "-//w3c//dtd html experimental 970421//",
578                         "-//w3c//dtd w3 html//",
579                         "-//w3o//dtd w3 html 3.0//",
580                         "-//webtechs//dtd mozilla html 2.0//",
581                         "-//webtechs//dtd mozilla html//")) or
582                    publicId in ("-//w3o//dtd w3 html strict 3.0//en//",
583                                 "-/w3c/dtd html 4.0 transitional/en",
584                                 "html") or
585                    publicId.startswith(
586                        ("-//w3c//dtd html 4.01 frameset//",
587                         "-//w3c//dtd html 4.01 transitional//")) and
588                    systemId is None or
589                    systemId and systemId.lower() == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"):
590                self.parser.compatMode = "quirks"
591            elif (publicId.startswith(
592                    ("-//w3c//dtd xhtml 1.0 frameset//",
593                     "-//w3c//dtd xhtml 1.0 transitional//")) or
594                  publicId.startswith(
595                      ("-//w3c//dtd html 4.01 frameset//",
596                       "-//w3c//dtd html 4.01 transitional//")) and
597                  systemId is not None):
598                self.parser.compatMode = "limited quirks"
599
600            self.parser.phase = self.parser.phases["beforeHtml"]
601
602        def anythingElse(self):
603            self.parser.compatMode = "quirks"
604            self.parser.phase = self.parser.phases["beforeHtml"]
605
606        def processCharacters(self, token):
607            self.parser.parseError("expected-doctype-but-got-chars")
608            self.anythingElse()
609            return token
610
611        def processStartTag(self, token):
612            self.parser.parseError("expected-doctype-but-got-start-tag",
613                                   {"name": token["name"]})
614            self.anythingElse()
615            return token
616
617        def processEndTag(self, token):
618            self.parser.parseError("expected-doctype-but-got-end-tag",
619                                   {"name": token["name"]})
620            self.anythingElse()
621            return token
622
623        def processEOF(self):
624            self.parser.parseError("expected-doctype-but-got-eof")
625            self.anythingElse()
626            return True
627
628    class BeforeHtmlPhase(Phase):
629        __slots__ = tuple()
630
631        # helper methods
632        def insertHtmlElement(self):
633            self.tree.insertRoot(impliedTagToken("html", "StartTag"))
634            self.parser.phase = self.parser.phases["beforeHead"]
635
636        # other
637        def processEOF(self):
638            self.insertHtmlElement()
639            return True
640
641        def processComment(self, token):
642            self.tree.insertComment(token, self.tree.document)
643
644        def processSpaceCharacters(self, token):
645            pass
646
647        def processCharacters(self, token):
648            self.insertHtmlElement()
649            return token
650
651        def processStartTag(self, token):
652            if token["name"] == "html":
653                self.parser.firstStartTag = True
654            self.insertHtmlElement()
655            return token
656
657        def processEndTag(self, token):
658            if token["name"] not in ("head", "body", "html", "br"):
659                self.parser.parseError("unexpected-end-tag-before-html",
660                                       {"name": token["name"]})
661            else:
662                self.insertHtmlElement()
663                return token
664
665    class BeforeHeadPhase(Phase):
666        __slots__ = tuple()
667
668        def processEOF(self):
669            self.startTagHead(impliedTagToken("head", "StartTag"))
670            return True
671
672        def processSpaceCharacters(self, token):
673            pass
674
675        def processCharacters(self, token):
676            self.startTagHead(impliedTagToken("head", "StartTag"))
677            return token
678
679        def startTagHtml(self, token):
680            return self.parser.phases["inBody"].processStartTag(token)
681
682        def startTagHead(self, token):
683            self.tree.insertElement(token)
684            self.tree.headPointer = self.tree.openElements[-1]
685            self.parser.phase = self.parser.phases["inHead"]
686
687        def startTagOther(self, token):
688            self.startTagHead(impliedTagToken("head", "StartTag"))
689            return token
690
691        def endTagImplyHead(self, token):
692            self.startTagHead(impliedTagToken("head", "StartTag"))
693            return token
694
695        def endTagOther(self, token):
696            self.parser.parseError("end-tag-after-implied-root",
697                                   {"name": token["name"]})
698
699        startTagHandler = _utils.MethodDispatcher([
700            ("html", startTagHtml),
701            ("head", startTagHead)
702        ])
703        startTagHandler.default = startTagOther
704
705        endTagHandler = _utils.MethodDispatcher([
706            (("head", "body", "html", "br"), endTagImplyHead)
707        ])
708        endTagHandler.default = endTagOther
709
710    class InHeadPhase(Phase):
711        __slots__ = tuple()
712
713        # the real thing
        def processEOF(self):
            """Handle end of input while in the head

            Runs ``anythingElse()`` (defined further down in this class) and
            returns True, which makes the main loop call ``processEOF`` again
            on whatever phase is current by then.
            """
            self.anythingElse()
            return True
717
        def processCharacters(self, token):
            """Handle non-whitespace character data in the head

            Runs ``anythingElse()`` and returns the token so the main loop
            dispatches it again.
            """
            self.anythingElse()
            return token
721
        def startTagHtml(self, token):
            """Hand an ``<html>`` start tag to the "inBody" phase and return its result"""
            return self.parser.phases["inBody"].processStartTag(token)
724
        def startTagHead(self, token):
            """Report a second ``<head>`` start tag as a parse error; the token is dropped"""
            self.parser.parseError("two-heads-are-not-better-than-one")
727
        def startTagBaseLinkCommand(self, token):
            """Insert a childless head element and close it straight away"""
            self.tree.insertElement(token)
            # The element takes no content, so remove it from the stack of
            # open elements immediately.
            self.tree.openElements.pop()
            # Tells the main loop that a trailing "/>" on this tag is fine.
            token["selfClosingAcknowledged"] = True
732
733        def startTagMeta(self, token):
734            self.tree.insertElement(token)
735            self.tree.openElements.pop()
736            token["selfClosingAcknowledged"] = True
737
738            attributes = token["data"]
739            if self.parser.tokenizer.stream.charEncoding[1] == "tentative":
740                if "charset" in attributes:
741                    self.parser.tokenizer.stream.changeEncoding(attributes["charset"])
742                elif ("content" in attributes and
743                      "http-equiv" in attributes and
744                      attributes["http-equiv"].lower() == "content-type"):
745                    # Encoding it as UTF-8 here is a hack, as really we should pass
746                    # the abstract Unicode string, and just use the
747                    # ContentAttrParser on that, but using UTF-8 allows all chars
748                    # to be encoded and as a ASCII-superset works.
749                    data = _inputstream.EncodingBytes(attributes["content"].encode("utf-8"))
750                    parser = _inputstream.ContentAttrParser(data)
751                    codec = parser.parse()
752                    self.parser.tokenizer.stream.changeEncoding(codec)
753
754        def startTagTitle(self, token):
755            self.parser.parseRCDataRawtext(token, "RCDATA")
756
757        def startTagNoFramesStyle(self, token):
758            # Need to decide whether to implement the scripting-disabled case
759            self.parser.parseRCDataRawtext(token, "RAWTEXT")
760
761        def startTagNoscript(self, token):
762            if self.parser.scripting:
763                self.parser.parseRCDataRawtext(token, "RAWTEXT")
764            else:
765                self.tree.insertElement(token)
766                self.parser.phase = self.parser.phases["inHeadNoscript"]
767
768        def startTagScript(self, token):
769            self.tree.insertElement(token)
770            self.parser.tokenizer.state = self.parser.tokenizer.scriptDataState
771            self.parser.originalPhase = self.parser.phase
772            self.parser.phase = self.parser.phases["text"]
773
774        def startTagOther(self, token):
775            self.anythingElse()
776            return token
777
778        def endTagHead(self, token):
779            node = self.parser.tree.openElements.pop()
780            assert node.name == "head", "Expected head got %s" % node.name
781            self.parser.phase = self.parser.phases["afterHead"]
782
783        def endTagHtmlBodyBr(self, token):
784            self.anythingElse()
785            return token
786
787        def endTagOther(self, token):
788            self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
789
790        def anythingElse(self):
791            self.endTagHead(impliedTagToken("head"))
792
793        startTagHandler = _utils.MethodDispatcher([
794            ("html", startTagHtml),
795            ("title", startTagTitle),
796            (("noframes", "style"), startTagNoFramesStyle),
797            ("noscript", startTagNoscript),
798            ("script", startTagScript),
799            (("base", "basefont", "bgsound", "command", "link"),
800             startTagBaseLinkCommand),
801            ("meta", startTagMeta),
802            ("head", startTagHead)
803        ])
804        startTagHandler.default = startTagOther
805
806        endTagHandler = _utils.MethodDispatcher([
807            ("head", endTagHead),
808            (("br", "html", "body"), endTagHtmlBodyBr)
809        ])
810        endTagHandler.default = endTagOther
811
812    class InHeadNoscriptPhase(Phase):
813        __slots__ = tuple()
814
815        def processEOF(self):
816            self.parser.parseError("eof-in-head-noscript")
817            self.anythingElse()
818            return True
819
820        def processComment(self, token):
821            return self.parser.phases["inHead"].processComment(token)
822
823        def processCharacters(self, token):
824            self.parser.parseError("char-in-head-noscript")
825            self.anythingElse()
826            return token
827
828        def processSpaceCharacters(self, token):
829            return self.parser.phases["inHead"].processSpaceCharacters(token)
830
831        def startTagHtml(self, token):
832            return self.parser.phases["inBody"].processStartTag(token)
833
834        def startTagBaseLinkCommand(self, token):
835            return self.parser.phases["inHead"].processStartTag(token)
836
837        def startTagHeadNoscript(self, token):
838            self.parser.parseError("unexpected-start-tag", {"name": token["name"]})
839
840        def startTagOther(self, token):
841            self.parser.parseError("unexpected-inhead-noscript-tag", {"name": token["name"]})
842            self.anythingElse()
843            return token
844
845        def endTagNoscript(self, token):
846            node = self.parser.tree.openElements.pop()
847            assert node.name == "noscript", "Expected noscript got %s" % node.name
848            self.parser.phase = self.parser.phases["inHead"]
849
850        def endTagBr(self, token):
851            self.parser.parseError("unexpected-inhead-noscript-tag", {"name": token["name"]})
852            self.anythingElse()
853            return token
854
855        def endTagOther(self, token):
856            self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
857
858        def anythingElse(self):
859            # Caller must raise parse error first!
860            self.endTagNoscript(impliedTagToken("noscript"))
861
862        startTagHandler = _utils.MethodDispatcher([
863            ("html", startTagHtml),
864            (("basefont", "bgsound", "link", "meta", "noframes", "style"), startTagBaseLinkCommand),
865            (("head", "noscript"), startTagHeadNoscript),
866        ])
867        startTagHandler.default = startTagOther
868
869        endTagHandler = _utils.MethodDispatcher([
870            ("noscript", endTagNoscript),
871            ("br", endTagBr),
872        ])
873        endTagHandler.default = endTagOther
874
875    class AfterHeadPhase(Phase):
876        __slots__ = tuple()
877
878        def processEOF(self):
879            self.anythingElse()
880            return True
881
882        def processCharacters(self, token):
883            self.anythingElse()
884            return token
885
886        def startTagHtml(self, token):
887            return self.parser.phases["inBody"].processStartTag(token)
888
889        def startTagBody(self, token):
890            self.parser.framesetOK = False
891            self.tree.insertElement(token)
892            self.parser.phase = self.parser.phases["inBody"]
893
894        def startTagFrameset(self, token):
895            self.tree.insertElement(token)
896            self.parser.phase = self.parser.phases["inFrameset"]
897
898        def startTagFromHead(self, token):
899            self.parser.parseError("unexpected-start-tag-out-of-my-head",
900                                   {"name": token["name"]})
901            self.tree.openElements.append(self.tree.headPointer)
902            self.parser.phases["inHead"].processStartTag(token)
903            for node in self.tree.openElements[::-1]:
904                if node.name == "head":
905                    self.tree.openElements.remove(node)
906                    break
907
908        def startTagHead(self, token):
909            self.parser.parseError("unexpected-start-tag", {"name": token["name"]})
910
911        def startTagOther(self, token):
912            self.anythingElse()
913            return token
914
915        def endTagHtmlBodyBr(self, token):
916            self.anythingElse()
917            return token
918
919        def endTagOther(self, token):
920            self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
921
922        def anythingElse(self):
923            self.tree.insertElement(impliedTagToken("body", "StartTag"))
924            self.parser.phase = self.parser.phases["inBody"]
925            self.parser.framesetOK = True
926
927        startTagHandler = _utils.MethodDispatcher([
928            ("html", startTagHtml),
929            ("body", startTagBody),
930            ("frameset", startTagFrameset),
931            (("base", "basefont", "bgsound", "link", "meta", "noframes", "script",
932              "style", "title"),
933             startTagFromHead),
934            ("head", startTagHead)
935        ])
936        startTagHandler.default = startTagOther
937        endTagHandler = _utils.MethodDispatcher([(("body", "html", "br"),
938                                                  endTagHtmlBodyBr)])
939        endTagHandler.default = endTagOther
940
941    class InBodyPhase(Phase):
942        # http://www.whatwg.org/specs/web-apps/current-work/#parsing-main-inbody
943        # the really-really-really-very crazy mode
944        __slots__ = ("processSpaceCharacters",)
945
        def __init__(self, *args, **kwargs):
            """Initialise the phase and install the default whitespace handler.

            All positional and keyword arguments are forwarded unchanged to
            the base-class initialiser.
            """
            super(InBodyPhase, self).__init__(*args, **kwargs)
            # processSpaceCharacters is a per-instance slot that other
            # handlers in this class rebind (e.g. to
            # processSpaceCharactersDropNewline); start with the default,
            # non-<pre> handler.
            self.processSpaceCharacters = self.processSpaceCharactersNonPre
950
951        def isMatchingFormattingElement(self, node1, node2):
952            return (node1.name == node2.name and
953                    node1.namespace == node2.namespace and
954                    node1.attributes == node2.attributes)
955
956        # helper
957        def addFormattingElement(self, token):
958            self.tree.insertElement(token)
959            element = self.tree.openElements[-1]
960
961            matchingElements = []
962            for node in self.tree.activeFormattingElements[::-1]:
963                if node is Marker:
964                    break
965                elif self.isMatchingFormattingElement(node, element):
966                    matchingElements.append(node)
967
968            assert len(matchingElements) <= 3
969            if len(matchingElements) == 3:
970                self.tree.activeFormattingElements.remove(matchingElements[-1])
971            self.tree.activeFormattingElements.append(element)
972
973        # the real deal
974        def processEOF(self):
975            allowed_elements = frozenset(("dd", "dt", "li", "p", "tbody", "td",
976                                          "tfoot", "th", "thead", "tr", "body",
977                                          "html"))
978            for node in self.tree.openElements[::-1]:
979                if node.name not in allowed_elements:
980                    self.parser.parseError("expected-closing-tag-but-got-eof")
981                    break
982            # Stop parsing
983
984        def processSpaceCharactersDropNewline(self, token):
985            # Sometimes (start of <pre>, <listing>, and <textarea> blocks) we
986            # want to drop leading newlines
987            data = token["data"]
988            self.processSpaceCharacters = self.processSpaceCharactersNonPre
989            if (data.startswith("\n") and
990                self.tree.openElements[-1].name in ("pre", "listing", "textarea") and
991                    not self.tree.openElements[-1].hasContent()):
992                data = data[1:]
993            if data:
994                self.tree.reconstructActiveFormattingElements()
995                self.tree.insertText(data)
996
997        def processCharacters(self, token):
998            if token["data"] == "\u0000":
999                # The tokenizer should always emit null on its own
1000                return
1001            self.tree.reconstructActiveFormattingElements()
1002            self.tree.insertText(token["data"])
1003            # This must be bad for performance
1004            if (self.parser.framesetOK and
1005                any([char not in spaceCharacters
1006                     for char in token["data"]])):
1007                self.parser.framesetOK = False
1008
1009        def processSpaceCharactersNonPre(self, token):
1010            self.tree.reconstructActiveFormattingElements()
1011            self.tree.insertText(token["data"])
1012
1013        def startTagProcessInHead(self, token):
1014            return self.parser.phases["inHead"].processStartTag(token)
1015
1016        def startTagBody(self, token):
1017            self.parser.parseError("unexpected-start-tag", {"name": "body"})
1018            if (len(self.tree.openElements) == 1 or
1019                    self.tree.openElements[1].name != "body"):
1020                assert self.parser.innerHTML
1021            else:
1022                self.parser.framesetOK = False
1023                for attr, value in token["data"].items():
1024                    if attr not in self.tree.openElements[1].attributes:
1025                        self.tree.openElements[1].attributes[attr] = value
1026
1027        def startTagFrameset(self, token):
1028            self.parser.parseError("unexpected-start-tag", {"name": "frameset"})
1029            if (len(self.tree.openElements) == 1 or self.tree.openElements[1].name != "body"):
1030                assert self.parser.innerHTML
1031            elif not self.parser.framesetOK:
1032                pass
1033            else:
1034                if self.tree.openElements[1].parent:
1035                    self.tree.openElements[1].parent.removeChild(self.tree.openElements[1])
1036                while self.tree.openElements[-1].name != "html":
1037                    self.tree.openElements.pop()
1038                self.tree.insertElement(token)
1039                self.parser.phase = self.parser.phases["inFrameset"]
1040
1041        def startTagCloseP(self, token):
1042            if self.tree.elementInScope("p", variant="button"):
1043                self.endTagP(impliedTagToken("p"))
1044            self.tree.insertElement(token)
1045
1046        def startTagPreListing(self, token):
1047            if self.tree.elementInScope("p", variant="button"):
1048                self.endTagP(impliedTagToken("p"))
1049            self.tree.insertElement(token)
1050            self.parser.framesetOK = False
1051            self.processSpaceCharacters = self.processSpaceCharactersDropNewline
1052
1053        def startTagForm(self, token):
1054            if self.tree.formPointer:
1055                self.parser.parseError("unexpected-start-tag", {"name": "form"})
1056            else:
1057                if self.tree.elementInScope("p", variant="button"):
1058                    self.endTagP(impliedTagToken("p"))
1059                self.tree.insertElement(token)
1060                self.tree.formPointer = self.tree.openElements[-1]
1061
1062        def startTagListItem(self, token):
1063            self.parser.framesetOK = False
1064
1065            stopNamesMap = {"li": ["li"],
1066                            "dt": ["dt", "dd"],
1067                            "dd": ["dt", "dd"]}
1068            stopNames = stopNamesMap[token["name"]]
1069            for node in reversed(self.tree.openElements):
1070                if node.name in stopNames:
1071                    self.parser.phase.processEndTag(
1072                        impliedTagToken(node.name, "EndTag"))
1073                    break
1074                if (node.nameTuple in specialElements and
1075                        node.name not in ("address", "div", "p")):
1076                    break
1077
1078            if self.tree.elementInScope("p", variant="button"):
1079                self.parser.phase.processEndTag(
1080                    impliedTagToken("p", "EndTag"))
1081
1082            self.tree.insertElement(token)
1083
1084        def startTagPlaintext(self, token):
1085            if self.tree.elementInScope("p", variant="button"):
1086                self.endTagP(impliedTagToken("p"))
1087            self.tree.insertElement(token)
1088            self.parser.tokenizer.state = self.parser.tokenizer.plaintextState
1089
1090        def startTagHeading(self, token):
1091            if self.tree.elementInScope("p", variant="button"):
1092                self.endTagP(impliedTagToken("p"))
1093            if self.tree.openElements[-1].name in headingElements:
1094                self.parser.parseError("unexpected-start-tag", {"name": token["name"]})
1095                self.tree.openElements.pop()
1096            self.tree.insertElement(token)
1097
1098        def startTagA(self, token):
1099            afeAElement = self.tree.elementInActiveFormattingElements("a")
1100            if afeAElement:
1101                self.parser.parseError("unexpected-start-tag-implies-end-tag",
1102                                       {"startName": "a", "endName": "a"})
1103                self.endTagFormatting(impliedTagToken("a"))
1104                if afeAElement in self.tree.openElements:
1105                    self.tree.openElements.remove(afeAElement)
1106                if afeAElement in self.tree.activeFormattingElements:
1107                    self.tree.activeFormattingElements.remove(afeAElement)
1108            self.tree.reconstructActiveFormattingElements()
1109            self.addFormattingElement(token)
1110
1111        def startTagFormatting(self, token):
1112            self.tree.reconstructActiveFormattingElements()
1113            self.addFormattingElement(token)
1114
1115        def startTagNobr(self, token):
1116            self.tree.reconstructActiveFormattingElements()
1117            if self.tree.elementInScope("nobr"):
1118                self.parser.parseError("unexpected-start-tag-implies-end-tag",
1119                                       {"startName": "nobr", "endName": "nobr"})
1120                self.processEndTag(impliedTagToken("nobr"))
1121                # XXX Need tests that trigger the following
1122                self.tree.reconstructActiveFormattingElements()
1123            self.addFormattingElement(token)
1124
1125        def startTagButton(self, token):
1126            if self.tree.elementInScope("button"):
1127                self.parser.parseError("unexpected-start-tag-implies-end-tag",
1128                                       {"startName": "button", "endName": "button"})
1129                self.processEndTag(impliedTagToken("button"))
1130                return token
1131            else:
1132                self.tree.reconstructActiveFormattingElements()
1133                self.tree.insertElement(token)
1134                self.parser.framesetOK = False
1135
1136        def startTagAppletMarqueeObject(self, token):
1137            self.tree.reconstructActiveFormattingElements()
1138            self.tree.insertElement(token)
1139            self.tree.activeFormattingElements.append(Marker)
1140            self.parser.framesetOK = False
1141
1142        def startTagXmp(self, token):
1143            if self.tree.elementInScope("p", variant="button"):
1144                self.endTagP(impliedTagToken("p"))
1145            self.tree.reconstructActiveFormattingElements()
1146            self.parser.framesetOK = False
1147            self.parser.parseRCDataRawtext(token, "RAWTEXT")
1148
1149        def startTagTable(self, token):
1150            if self.parser.compatMode != "quirks":
1151                if self.tree.elementInScope("p", variant="button"):
1152                    self.processEndTag(impliedTagToken("p"))
1153            self.tree.insertElement(token)
1154            self.parser.framesetOK = False
1155            self.parser.phase = self.parser.phases["inTable"]
1156
1157        def startTagVoidFormatting(self, token):
1158            self.tree.reconstructActiveFormattingElements()
1159            self.tree.insertElement(token)
1160            self.tree.openElements.pop()
1161            token["selfClosingAcknowledged"] = True
1162            self.parser.framesetOK = False
1163
1164        def startTagInput(self, token):
1165            framesetOK = self.parser.framesetOK
1166            self.startTagVoidFormatting(token)
1167            if ("type" in token["data"] and
1168                    token["data"]["type"].translate(asciiUpper2Lower) == "hidden"):
1169                # input type=hidden doesn't change framesetOK
1170                self.parser.framesetOK = framesetOK
1171
1172        def startTagParamSource(self, token):
1173            self.tree.insertElement(token)
1174            self.tree.openElements.pop()
1175            token["selfClosingAcknowledged"] = True
1176
1177        def startTagHr(self, token):
1178            if self.tree.elementInScope("p", variant="button"):
1179                self.endTagP(impliedTagToken("p"))
1180            self.tree.insertElement(token)
1181            self.tree.openElements.pop()
1182            token["selfClosingAcknowledged"] = True
1183            self.parser.framesetOK = False
1184
1185        def startTagImage(self, token):
1186            # No really...
1187            self.parser.parseError("unexpected-start-tag-treated-as",
1188                                   {"originalName": "image", "newName": "img"})
1189            self.processStartTag(impliedTagToken("img", "StartTag",
1190                                                 attributes=token["data"],
1191                                                 selfClosing=token["selfClosing"]))
1192
1193        def startTagIsIndex(self, token):
1194            self.parser.parseError("deprecated-tag", {"name": "isindex"})
1195            if self.tree.formPointer:
1196                return
1197            form_attrs = {}
1198            if "action" in token["data"]:
1199                form_attrs["action"] = token["data"]["action"]
1200            self.processStartTag(impliedTagToken("form", "StartTag",
1201                                                 attributes=form_attrs))
1202            self.processStartTag(impliedTagToken("hr", "StartTag"))
1203            self.processStartTag(impliedTagToken("label", "StartTag"))
1204            # XXX Localization ...
1205            if "prompt" in token["data"]:
1206                prompt = token["data"]["prompt"]
1207            else:
1208                prompt = "This is a searchable index. Enter search keywords: "
1209            self.processCharacters(
1210                {"type": tokenTypes["Characters"], "data": prompt})
1211            attributes = token["data"].copy()
1212            if "action" in attributes:
1213                del attributes["action"]
1214            if "prompt" in attributes:
1215                del attributes["prompt"]
1216            attributes["name"] = "isindex"
1217            self.processStartTag(impliedTagToken("input", "StartTag",
1218                                                 attributes=attributes,
1219                                                 selfClosing=token["selfClosing"]))
1220            self.processEndTag(impliedTagToken("label"))
1221            self.processStartTag(impliedTagToken("hr", "StartTag"))
1222            self.processEndTag(impliedTagToken("form"))
1223
1224        def startTagTextarea(self, token):
1225            self.tree.insertElement(token)
1226            self.parser.tokenizer.state = self.parser.tokenizer.rcdataState
1227            self.processSpaceCharacters = self.processSpaceCharactersDropNewline
1228            self.parser.framesetOK = False
1229
1230        def startTagIFrame(self, token):
1231            self.parser.framesetOK = False
1232            self.startTagRawtext(token)
1233
1234        def startTagNoscript(self, token):
1235            if self.parser.scripting:
1236                self.startTagRawtext(token)
1237            else:
1238                self.startTagOther(token)
1239
1240        def startTagRawtext(self, token):
1241            """iframe, noembed noframes, noscript(if scripting enabled)"""
1242            self.parser.parseRCDataRawtext(token, "RAWTEXT")
1243
1244        def startTagOpt(self, token):
1245            if self.tree.openElements[-1].name == "option":
1246                self.parser.phase.processEndTag(impliedTagToken("option"))
1247            self.tree.reconstructActiveFormattingElements()
1248            self.parser.tree.insertElement(token)
1249
1250        def startTagSelect(self, token):
1251            self.tree.reconstructActiveFormattingElements()
1252            self.tree.insertElement(token)
1253            self.parser.framesetOK = False
1254            if self.parser.phase in (self.parser.phases["inTable"],
1255                                     self.parser.phases["inCaption"],
1256                                     self.parser.phases["inColumnGroup"],
1257                                     self.parser.phases["inTableBody"],
1258                                     self.parser.phases["inRow"],
1259                                     self.parser.phases["inCell"]):
1260                self.parser.phase = self.parser.phases["inSelectInTable"]
1261            else:
1262                self.parser.phase = self.parser.phases["inSelect"]
1263
1264        def startTagRpRt(self, token):
1265            if self.tree.elementInScope("ruby"):
1266                self.tree.generateImpliedEndTags()
1267                if self.tree.openElements[-1].name != "ruby":
1268                    self.parser.parseError()
1269            self.tree.insertElement(token)
1270
1271        def startTagMath(self, token):
1272            self.tree.reconstructActiveFormattingElements()
1273            self.parser.adjustMathMLAttributes(token)
1274            self.parser.adjustForeignAttributes(token)
1275            token["namespace"] = namespaces["mathml"]
1276            self.tree.insertElement(token)
1277            # Need to get the parse error right for the case where the token
1278            # has a namespace not equal to the xmlns attribute
1279            if token["selfClosing"]:
1280                self.tree.openElements.pop()
1281                token["selfClosingAcknowledged"] = True
1282
1283        def startTagSvg(self, token):
1284            self.tree.reconstructActiveFormattingElements()
1285            self.parser.adjustSVGAttributes(token)
1286            self.parser.adjustForeignAttributes(token)
1287            token["namespace"] = namespaces["svg"]
1288            self.tree.insertElement(token)
1289            # Need to get the parse error right for the case where the token
1290            # has a namespace not equal to the xmlns attribute
1291            if token["selfClosing"]:
1292                self.tree.openElements.pop()
1293                token["selfClosingAcknowledged"] = True
1294
1295        def startTagMisplaced(self, token):
1296            """ Elements that should be children of other elements that have a
1297            different insertion mode; here they are ignored
1298            "caption", "col", "colgroup", "frame", "frameset", "head",
1299            "option", "optgroup", "tbody", "td", "tfoot", "th", "thead",
1300            "tr", "noscript"
1301            """
1302            self.parser.parseError("unexpected-start-tag-ignored", {"name": token["name"]})
1303
1304        def startTagOther(self, token):
1305            self.tree.reconstructActiveFormattingElements()
1306            self.tree.insertElement(token)
1307
1308        def endTagP(self, token):
1309            if not self.tree.elementInScope("p", variant="button"):
1310                self.startTagCloseP(impliedTagToken("p", "StartTag"))
1311                self.parser.parseError("unexpected-end-tag", {"name": "p"})
1312                self.endTagP(impliedTagToken("p", "EndTag"))
1313            else:
1314                self.tree.generateImpliedEndTags("p")
1315                if self.tree.openElements[-1].name != "p":
1316                    self.parser.parseError("unexpected-end-tag", {"name": "p"})
1317                node = self.tree.openElements.pop()
1318                while node.name != "p":
1319                    node = self.tree.openElements.pop()
1320
1321        def endTagBody(self, token):
1322            if not self.tree.elementInScope("body"):
1323                self.parser.parseError()
1324                return
1325            elif self.tree.openElements[-1].name != "body":
1326                for node in self.tree.openElements[2:]:
1327                    if node.name not in frozenset(("dd", "dt", "li", "optgroup",
1328                                                   "option", "p", "rp", "rt",
1329                                                   "tbody", "td", "tfoot",
1330                                                   "th", "thead", "tr", "body",
1331                                                   "html")):
1332                        # Not sure this is the correct name for the parse error
1333                        self.parser.parseError(
1334                            "expected-one-end-tag-but-got-another",
1335                            {"gotName": "body", "expectedName": node.name})
1336                        break
1337            self.parser.phase = self.parser.phases["afterBody"]
1338
1339        def endTagHtml(self, token):
1340            # We repeat the test for the body end tag token being ignored here
1341            if self.tree.elementInScope("body"):
1342                self.endTagBody(impliedTagToken("body"))
1343                return token
1344
1345        def endTagBlock(self, token):
1346            # Put us back in the right whitespace handling mode
1347            if token["name"] == "pre":
1348                self.processSpaceCharacters = self.processSpaceCharactersNonPre
1349            inScope = self.tree.elementInScope(token["name"])
1350            if inScope:
1351                self.tree.generateImpliedEndTags()
1352            if self.tree.openElements[-1].name != token["name"]:
1353                self.parser.parseError("end-tag-too-early", {"name": token["name"]})
1354            if inScope:
1355                node = self.tree.openElements.pop()
1356                while node.name != token["name"]:
1357                    node = self.tree.openElements.pop()
1358
1359        def endTagForm(self, token):
1360            node = self.tree.formPointer
1361            self.tree.formPointer = None
1362            if node is None or not self.tree.elementInScope(node):
1363                self.parser.parseError("unexpected-end-tag",
1364                                       {"name": "form"})
1365            else:
1366                self.tree.generateImpliedEndTags()
1367                if self.tree.openElements[-1] != node:
1368                    self.parser.parseError("end-tag-too-early-ignored",
1369                                           {"name": "form"})
1370                self.tree.openElements.remove(node)
1371
1372        def endTagListItem(self, token):
1373            if token["name"] == "li":
1374                variant = "list"
1375            else:
1376                variant = None
1377            if not self.tree.elementInScope(token["name"], variant=variant):
1378                self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
1379            else:
1380                self.tree.generateImpliedEndTags(exclude=token["name"])
1381                if self.tree.openElements[-1].name != token["name"]:
1382                    self.parser.parseError(
1383                        "end-tag-too-early",
1384                        {"name": token["name"]})
1385                node = self.tree.openElements.pop()
1386                while node.name != token["name"]:
1387                    node = self.tree.openElements.pop()
1388
1389        def endTagHeading(self, token):
1390            for item in headingElements:
1391                if self.tree.elementInScope(item):
1392                    self.tree.generateImpliedEndTags()
1393                    break
1394            if self.tree.openElements[-1].name != token["name"]:
1395                self.parser.parseError("end-tag-too-early", {"name": token["name"]})
1396
1397            for item in headingElements:
1398                if self.tree.elementInScope(item):
1399                    item = self.tree.openElements.pop()
1400                    while item.name not in headingElements:
1401                        item = self.tree.openElements.pop()
1402                    break
1403
        def endTagFormatting(self, token):
            """The much-feared adoption agency algorithm

            Handles the end tags that ``endTagHandler`` routes to this method.
            The outer loop runs at most eight times; each pass either returns
            early (no usable formatting element, or no furthest block) or
            re-parents nodes around the furthest block and loops again.

            :arg token: the end tag token; only ``token["name"]`` is read here

            :returns: None
            """
            # http://svn.whatwg.org/webapps/complete.html#adoptionAgency revision 7867
            # XXX Better parseError messages appreciated.

            # Step 1
            outerLoopCounter = 0

            # Step 2
            while outerLoopCounter < 8:

                # Step 3
                outerLoopCounter += 1

                # Step 4:

                # Let the formatting element be the last element in
                # the list of active formatting elements that:
                # - is between the end of the list and the last scope
                # marker in the list, if any, or the start of the list
                # otherwise, and
                # - has the same tag name as the token.
                formattingElement = self.tree.elementInActiveFormattingElements(
                    token["name"])
                # Besides the "no such node" case, an element that is on the
                # stack but not in scope is also handed to endTagOther here.
                if (not formattingElement or
                    (formattingElement in self.tree.openElements and
                     not self.tree.elementInScope(formattingElement.name))):
                    # If there is no such node, then abort these steps
                    # and instead act as described in the "any other
                    # end tag" entry below.
                    self.endTagOther(token)
                    return

                # Otherwise, if there is such a node, but that node is
                # not in the stack of open elements, then this is a
                # parse error; remove the element from the list, and
                # abort these steps.
                elif formattingElement not in self.tree.openElements:
                    self.parser.parseError("adoption-agency-1.2", {"name": token["name"]})
                    self.tree.activeFormattingElements.remove(formattingElement)
                    return

                # Otherwise, if there is such a node, and that node is
                # also in the stack of open elements, but the element
                # is not in scope, then this is a parse error; ignore
                # the token, and abort these steps.
                # NOTE(review): the first branch above already catches
                # "on the stack but not in scope", so this branch looks
                # unreachable -- confirm before relying on or removing it.
                elif not self.tree.elementInScope(formattingElement.name):
                    self.parser.parseError("adoption-agency-4.4", {"name": token["name"]})
                    return

                # Otherwise, there is a formatting element and that
                # element is in the stack and is in scope. If the
                # element is not the current node, this is a parse
                # error. In any case, proceed with the algorithm as
                # written in the following steps.
                else:
                    if formattingElement != self.tree.openElements[-1]:
                        self.parser.parseError("adoption-agency-1.3", {"name": token["name"]})

                # Step 5:

                # Let the furthest block be the topmost node in the
                # stack of open elements that is lower in the stack
                # than the formatting element, and is an element in
                # the special category. There might not be one.
                afeIndex = self.tree.openElements.index(formattingElement)
                furthestBlock = None
                # NOTE(review): the slice starts at the formatting element
                # itself, so it is examined too.
                for element in self.tree.openElements[afeIndex:]:
                    if element.nameTuple in specialElements:
                        furthestBlock = element
                        break

                # Step 6:

                # If there is no furthest block, then the UA must
                # first pop all the nodes from the bottom of the stack
                # of open elements, from the current node up to and
                # including the formatting element, then remove the
                # formatting element from the list of active
                # formatting elements, and finally abort these steps.
                if furthestBlock is None:
                    element = self.tree.openElements.pop()
                    while element != formattingElement:
                        element = self.tree.openElements.pop()
                    self.tree.activeFormattingElements.remove(element)
                    return

                # Step 7
                # The common ancestor is the stack entry directly before the
                # formatting element.
                commonAncestor = self.tree.openElements[afeIndex - 1]

                # Step 8:
                # The bookmark is supposed to help us identify where to reinsert
                # nodes in step 15. We have to ensure that we reinsert nodes after
                # the node before the active formatting element. Note the bookmark
                # can move in step 9.7
                bookmark = self.tree.activeFormattingElements.index(formattingElement)

                # Step 9
                lastNode = node = furthestBlock
                innerLoopCounter = 0

                index = self.tree.openElements.index(node)
                # The inner loop is capped at three iterations.
                while innerLoopCounter < 3:
                    innerLoopCounter += 1
                    # Node is element before node in open elements
                    index -= 1
                    node = self.tree.openElements[index]
                    if node not in self.tree.activeFormattingElements:
                        # Removing the node shifts later entries down by one,
                        # so the next "index -= 1" lands on its predecessor.
                        self.tree.openElements.remove(node)
                        continue
                    # Step 9.6
                    if node == formattingElement:
                        break
                    # Step 9.7
                    if lastNode == furthestBlock:
                        bookmark = self.tree.activeFormattingElements.index(node) + 1
                    # Step 9.8
                    clone = node.cloneNode()
                    # Replace node with clone
                    self.tree.activeFormattingElements[
                        self.tree.activeFormattingElements.index(node)] = clone
                    self.tree.openElements[
                        self.tree.openElements.index(node)] = clone
                    node = clone
                    # Step 9.9
                    # Remove lastNode from its parents, if any
                    if lastNode.parent:
                        lastNode.parent.removeChild(lastNode)
                    node.appendChild(lastNode)
                    # Step 9.10
                    lastNode = node

                # Step 10
                # Foster parent lastNode if commonAncestor is a
                # table, tbody, tfoot, thead, or tr we need to foster
                # parent the lastNode
                if lastNode.parent:
                    lastNode.parent.removeChild(lastNode)

                if commonAncestor.name in frozenset(("table", "tbody", "tfoot", "thead", "tr")):
                    parent, insertBefore = self.tree.getTableMisnestedNodePosition()
                    parent.insertBefore(lastNode, insertBefore)
                else:
                    commonAncestor.appendChild(lastNode)

                # Step 11
                clone = formattingElement.cloneNode()

                # Step 12
                furthestBlock.reparentChildren(clone)

                # Step 13
                furthestBlock.appendChild(clone)

                # Step 14
                # The clone takes the formatting element's place in the
                # active list, at the bookmarked position.
                self.tree.activeFormattingElements.remove(formattingElement)
                self.tree.activeFormattingElements.insert(bookmark, clone)

                # Step 15
                # On the stack, the clone goes immediately after the furthest
                # block.
                self.tree.openElements.remove(formattingElement)
                self.tree.openElements.insert(
                    self.tree.openElements.index(furthestBlock) + 1, clone)
1566
1567        def endTagAppletMarqueeObject(self, token):
1568            if self.tree.elementInScope(token["name"]):
1569                self.tree.generateImpliedEndTags()
1570            if self.tree.openElements[-1].name != token["name"]:
1571                self.parser.parseError("end-tag-too-early", {"name": token["name"]})
1572
1573            if self.tree.elementInScope(token["name"]):
1574                element = self.tree.openElements.pop()
1575                while element.name != token["name"]:
1576                    element = self.tree.openElements.pop()
1577                self.tree.clearActiveFormattingElements()
1578
1579        def endTagBr(self, token):
1580            self.parser.parseError("unexpected-end-tag-treated-as",
1581                                   {"originalName": "br", "newName": "br element"})
1582            self.tree.reconstructActiveFormattingElements()
1583            self.tree.insertElement(impliedTagToken("br", "StartTag"))
1584            self.tree.openElements.pop()
1585
1586        def endTagOther(self, token):
1587            for node in self.tree.openElements[::-1]:
1588                if node.name == token["name"]:
1589                    self.tree.generateImpliedEndTags(exclude=token["name"])
1590                    if self.tree.openElements[-1].name != token["name"]:
1591                        self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
1592                    while self.tree.openElements.pop() != node:
1593                        pass
1594                    break
1595                else:
1596                    if node.nameTuple in specialElements:
1597                        self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
1598                        break
1599
        # Start-tag dispatch table for this phase: each entry pairs a tag name
        # (or a tuple of tag names) with the handler function used for it.
        # Names that are not listed fall back to ``startTagOther`` through the
        # ``default`` attribute set right after the table.
        startTagHandler = _utils.MethodDispatcher([
            ("html", Phase.startTagHtml),
            (("base", "basefont", "bgsound", "command", "link", "meta",
              "script", "style", "title"),
             startTagProcessInHead),
            ("body", startTagBody),
            ("frameset", startTagFrameset),
            (("address", "article", "aside", "blockquote", "center", "details",
              "dir", "div", "dl", "fieldset", "figcaption", "figure",
              "footer", "header", "hgroup", "main", "menu", "nav", "ol", "p",
              "section", "summary", "ul"),
             startTagCloseP),
            (headingElements, startTagHeading),
            (("pre", "listing"), startTagPreListing),
            ("form", startTagForm),
            (("li", "dd", "dt"), startTagListItem),
            ("plaintext", startTagPlaintext),
            ("a", startTagA),
            (("b", "big", "code", "em", "font", "i", "s", "small", "strike",
              "strong", "tt", "u"), startTagFormatting),
            ("nobr", startTagNobr),
            ("button", startTagButton),
            (("applet", "marquee", "object"), startTagAppletMarqueeObject),
            ("xmp", startTagXmp),
            ("table", startTagTable),
            (("area", "br", "embed", "img", "keygen", "wbr"),
             startTagVoidFormatting),
            (("param", "source", "track"), startTagParamSource),
            ("input", startTagInput),
            ("hr", startTagHr),
            ("image", startTagImage),
            ("isindex", startTagIsIndex),
            ("textarea", startTagTextarea),
            ("iframe", startTagIFrame),
            ("noscript", startTagNoscript),
            (("noembed", "noframes"), startTagRawtext),
            ("select", startTagSelect),
            (("rp", "rt"), startTagRpRt),
            (("option", "optgroup"), startTagOpt),
            # NOTE: ("math") and ("svg") are parenthesised strings, not
            # one-element tuples, so they are single-name entries like "html".
            (("math"), startTagMath),
            (("svg"), startTagSvg),
            (("caption", "col", "colgroup", "frame", "head",
              "tbody", "td", "tfoot", "th", "thead",
              "tr"), startTagMisplaced)
        ])
        startTagHandler.default = startTagOther
1646
        # End-tag dispatch table for this phase: each entry pairs a tag name
        # (or a tuple of tag names) with the handler function used for it.
        # Names that are not listed fall back to ``endTagOther`` through the
        # ``default`` attribute set right after the table.
        endTagHandler = _utils.MethodDispatcher([
            ("body", endTagBody),
            ("html", endTagHtml),
            (("address", "article", "aside", "blockquote", "button", "center",
              "details", "dialog", "dir", "div", "dl", "fieldset", "figcaption", "figure",
              "footer", "header", "hgroup", "listing", "main", "menu", "nav", "ol", "pre",
              "section", "summary", "ul"), endTagBlock),
            ("form", endTagForm),
            ("p", endTagP),
            (("dd", "dt", "li"), endTagListItem),
            (headingElements, endTagHeading),
            (("a", "b", "big", "code", "em", "font", "i", "nobr", "s", "small",
              "strike", "strong", "tt", "u"), endTagFormatting),
            (("applet", "marquee", "object"), endTagAppletMarqueeObject),
            ("br", endTagBr),
        ])
        endTagHandler.default = endTagOther
1664
1665    class TextPhase(Phase):
1666        __slots__ = tuple()
1667
1668        def processCharacters(self, token):
1669            self.tree.insertText(token["data"])
1670
1671        def processEOF(self):
1672            self.parser.parseError("expected-named-closing-tag-but-got-eof",
1673                                   {"name": self.tree.openElements[-1].name})
1674            self.tree.openElements.pop()
1675            self.parser.phase = self.parser.originalPhase
1676            return True
1677
1678        def startTagOther(self, token):
1679            assert False, "Tried to process start tag %s in RCDATA/RAWTEXT mode" % token['name']
1680
1681        def endTagScript(self, token):
1682            node = self.tree.openElements.pop()
1683            assert node.name == "script"
1684            self.parser.phase = self.parser.originalPhase
1685            # The rest of this method is all stuff that only happens if
1686            # document.write works
1687
1688        def endTagOther(self, token):
1689            self.tree.openElements.pop()
1690            self.parser.phase = self.parser.originalPhase
1691
1692        startTagHandler = _utils.MethodDispatcher([])
1693        startTagHandler.default = startTagOther
1694        endTagHandler = _utils.MethodDispatcher([
1695            ("script", endTagScript)])
1696        endTagHandler.default = endTagOther
1697
1698    class InTablePhase(Phase):
1699        # http://www.whatwg.org/specs/web-apps/current-work/#in-table
1700        __slots__ = tuple()
1701
1702        # helper methods
1703        def clearStackToTableContext(self):
1704            # "clear the stack back to a table context"
1705            while self.tree.openElements[-1].name not in ("table", "html"):
1706                # self.parser.parseError("unexpected-implied-end-tag-in-table",
1707                #  {"name":  self.tree.openElements[-1].name})
1708                self.tree.openElements.pop()
1709            # When the current node is <html> it's an innerHTML case
1710
1711        # processing methods
1712        def processEOF(self):
1713            if self.tree.openElements[-1].name != "html":
1714                self.parser.parseError("eof-in-table")
1715            else:
1716                assert self.parser.innerHTML
1717            # Stop parsing
1718
1719        def processSpaceCharacters(self, token):
1720            originalPhase = self.parser.phase
1721            self.parser.phase = self.parser.phases["inTableText"]
1722            self.parser.phase.originalPhase = originalPhase
1723            self.parser.phase.processSpaceCharacters(token)
1724
1725        def processCharacters(self, token):
1726            originalPhase = self.parser.phase
1727            self.parser.phase = self.parser.phases["inTableText"]
1728            self.parser.phase.originalPhase = originalPhase
1729            self.parser.phase.processCharacters(token)
1730
1731        def insertText(self, token):
1732            # If we get here there must be at least one non-whitespace character
1733            # Do the table magic!
1734            self.tree.insertFromTable = True
1735            self.parser.phases["inBody"].processCharacters(token)
1736            self.tree.insertFromTable = False
1737
1738        def startTagCaption(self, token):
1739            self.clearStackToTableContext()
1740            self.tree.activeFormattingElements.append(Marker)
1741            self.tree.insertElement(token)
1742            self.parser.phase = self.parser.phases["inCaption"]
1743
1744        def startTagColgroup(self, token):
1745            self.clearStackToTableContext()
1746            self.tree.insertElement(token)
1747            self.parser.phase = self.parser.phases["inColumnGroup"]
1748
1749        def startTagCol(self, token):
1750            self.startTagColgroup(impliedTagToken("colgroup", "StartTag"))
1751            return token
1752
1753        def startTagRowGroup(self, token):
1754            self.clearStackToTableContext()
1755            self.tree.insertElement(token)
1756            self.parser.phase = self.parser.phases["inTableBody"]
1757
1758        def startTagImplyTbody(self, token):
1759            self.startTagRowGroup(impliedTagToken("tbody", "StartTag"))
1760            return token
1761
1762        def startTagTable(self, token):
1763            self.parser.parseError("unexpected-start-tag-implies-end-tag",
1764                                   {"startName": "table", "endName": "table"})
1765            self.parser.phase.processEndTag(impliedTagToken("table"))
1766            if not self.parser.innerHTML:
1767                return token
1768
1769        def startTagStyleScript(self, token):
1770            return self.parser.phases["inHead"].processStartTag(token)
1771
1772        def startTagInput(self, token):
1773            if ("type" in token["data"] and
1774                    token["data"]["type"].translate(asciiUpper2Lower) == "hidden"):
1775                self.parser.parseError("unexpected-hidden-input-in-table")
1776                self.tree.insertElement(token)
1777                # XXX associate with form
1778                self.tree.openElements.pop()
1779            else:
1780                self.startTagOther(token)
1781
1782        def startTagForm(self, token):
1783            self.parser.parseError("unexpected-form-in-table")
1784            if self.tree.formPointer is None:
1785                self.tree.insertElement(token)
1786                self.tree.formPointer = self.tree.openElements[-1]
1787                self.tree.openElements.pop()
1788
1789        def startTagOther(self, token):
1790            self.parser.parseError("unexpected-start-tag-implies-table-voodoo", {"name": token["name"]})
1791            # Do the table magic!
1792            self.tree.insertFromTable = True
1793            self.parser.phases["inBody"].processStartTag(token)
1794            self.tree.insertFromTable = False
1795
1796        def endTagTable(self, token):
1797            if self.tree.elementInScope("table", variant="table"):
1798                self.tree.generateImpliedEndTags()
1799                if self.tree.openElements[-1].name != "table":
1800                    self.parser.parseError("end-tag-too-early-named",
1801                                           {"gotName": "table",
1802                                            "expectedName": self.tree.openElements[-1].name})
1803                while self.tree.openElements[-1].name != "table":
1804                    self.tree.openElements.pop()
1805                self.tree.openElements.pop()
1806                self.parser.resetInsertionMode()
1807            else:
1808                # innerHTML case
1809                assert self.parser.innerHTML
1810                self.parser.parseError()
1811
1812        def endTagIgnore(self, token):
1813            self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
1814
1815        def endTagOther(self, token):
1816            self.parser.parseError("unexpected-end-tag-implies-table-voodoo", {"name": token["name"]})
1817            # Do the table magic!
1818            self.tree.insertFromTable = True
1819            self.parser.phases["inBody"].processEndTag(token)
1820            self.tree.insertFromTable = False
1821
1822        startTagHandler = _utils.MethodDispatcher([
1823            ("html", Phase.startTagHtml),
1824            ("caption", startTagCaption),
1825            ("colgroup", startTagColgroup),
1826            ("col", startTagCol),
1827            (("tbody", "tfoot", "thead"), startTagRowGroup),
1828            (("td", "th", "tr"), startTagImplyTbody),
1829            ("table", startTagTable),
1830            (("style", "script"), startTagStyleScript),
1831            ("input", startTagInput),
1832            ("form", startTagForm)
1833        ])
1834        startTagHandler.default = startTagOther
1835
1836        endTagHandler = _utils.MethodDispatcher([
1837            ("table", endTagTable),
1838            (("body", "caption", "col", "colgroup", "html", "tbody", "td",
1839              "tfoot", "th", "thead", "tr"), endTagIgnore)
1840        ])
1841        endTagHandler.default = endTagOther
1842
1843    class InTableTextPhase(Phase):
1844        __slots__ = ("originalPhase", "characterTokens")
1845
1846        def __init__(self, *args, **kwargs):
1847            super(InTableTextPhase, self).__init__(*args, **kwargs)
1848            self.originalPhase = None
1849            self.characterTokens = []
1850
1851        def flushCharacters(self):
1852            data = "".join([item["data"] for item in self.characterTokens])
1853            if any([item not in spaceCharacters for item in data]):
1854                token = {"type": tokenTypes["Characters"], "data": data}
1855                self.parser.phases["inTable"].insertText(token)
1856            elif data:
1857                self.tree.insertText(data)
1858            self.characterTokens = []
1859
1860        def processComment(self, token):
1861            self.flushCharacters()
1862            self.parser.phase = self.originalPhase
1863            return token
1864
1865        def processEOF(self):
1866            self.flushCharacters()
1867            self.parser.phase = self.originalPhase
1868            return True
1869
1870        def processCharacters(self, token):
1871            if token["data"] == "\u0000":
1872                return
1873            self.characterTokens.append(token)
1874
1875        def processSpaceCharacters(self, token):
1876            # pretty sure we should never reach here
1877            self.characterTokens.append(token)
1878    #        assert False
1879
1880        def processStartTag(self, token):
1881            self.flushCharacters()
1882            self.parser.phase = self.originalPhase
1883            return token
1884
1885        def processEndTag(self, token):
1886            self.flushCharacters()
1887            self.parser.phase = self.originalPhase
1888            return token
1889
1890    class InCaptionPhase(Phase):
1891        # http://www.whatwg.org/specs/web-apps/current-work/#in-caption
1892        __slots__ = tuple()
1893
1894        def ignoreEndTagCaption(self):
1895            return not self.tree.elementInScope("caption", variant="table")
1896
1897        def processEOF(self):
1898            self.parser.phases["inBody"].processEOF()
1899
1900        def processCharacters(self, token):
1901            return self.parser.phases["inBody"].processCharacters(token)
1902
1903        def startTagTableElement(self, token):
1904            self.parser.parseError()
1905            # XXX Have to duplicate logic here to find out if the tag is ignored
1906            ignoreEndTag = self.ignoreEndTagCaption()
1907            self.parser.phase.processEndTag(impliedTagToken("caption"))
1908            if not ignoreEndTag:
1909                return token
1910
1911        def startTagOther(self, token):
1912            return self.parser.phases["inBody"].processStartTag(token)
1913
1914        def endTagCaption(self, token):
1915            if not self.ignoreEndTagCaption():
1916                # AT this code is quite similar to endTagTable in "InTable"
1917                self.tree.generateImpliedEndTags()
1918                if self.tree.openElements[-1].name != "caption":
1919                    self.parser.parseError("expected-one-end-tag-but-got-another",
1920                                           {"gotName": "caption",
1921                                            "expectedName": self.tree.openElements[-1].name})
1922                while self.tree.openElements[-1].name != "caption":
1923                    self.tree.openElements.pop()
1924                self.tree.openElements.pop()
1925                self.tree.clearActiveFormattingElements()
1926                self.parser.phase = self.parser.phases["inTable"]
1927            else:
1928                # innerHTML case
1929                assert self.parser.innerHTML
1930                self.parser.parseError()
1931
1932        def endTagTable(self, token):
1933            self.parser.parseError()
1934            ignoreEndTag = self.ignoreEndTagCaption()
1935            self.parser.phase.processEndTag(impliedTagToken("caption"))
1936            if not ignoreEndTag:
1937                return token
1938
1939        def endTagIgnore(self, token):
1940            self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
1941
1942        def endTagOther(self, token):
1943            return self.parser.phases["inBody"].processEndTag(token)
1944
1945        startTagHandler = _utils.MethodDispatcher([
1946            ("html", Phase.startTagHtml),
1947            (("caption", "col", "colgroup", "tbody", "td", "tfoot", "th",
1948              "thead", "tr"), startTagTableElement)
1949        ])
1950        startTagHandler.default = startTagOther
1951
1952        endTagHandler = _utils.MethodDispatcher([
1953            ("caption", endTagCaption),
1954            ("table", endTagTable),
1955            (("body", "col", "colgroup", "html", "tbody", "td", "tfoot", "th",
1956              "thead", "tr"), endTagIgnore)
1957        ])
1958        endTagHandler.default = endTagOther
1959
1960    class InColumnGroupPhase(Phase):
1961        # http://www.whatwg.org/specs/web-apps/current-work/#in-column
1962        __slots__ = tuple()
1963
1964        def ignoreEndTagColgroup(self):
1965            return self.tree.openElements[-1].name == "html"
1966
1967        def processEOF(self):
1968            if self.tree.openElements[-1].name == "html":
1969                assert self.parser.innerHTML
1970                return
1971            else:
1972                ignoreEndTag = self.ignoreEndTagColgroup()
1973                self.endTagColgroup(impliedTagToken("colgroup"))
1974                if not ignoreEndTag:
1975                    return True
1976
1977        def processCharacters(self, token):
1978            ignoreEndTag = self.ignoreEndTagColgroup()
1979            self.endTagColgroup(impliedTagToken("colgroup"))
1980            if not ignoreEndTag:
1981                return token
1982
1983        def startTagCol(self, token):
1984            self.tree.insertElement(token)
1985            self.tree.openElements.pop()
1986            token["selfClosingAcknowledged"] = True
1987
1988        def startTagOther(self, token):
1989            ignoreEndTag = self.ignoreEndTagColgroup()
1990            self.endTagColgroup(impliedTagToken("colgroup"))
1991            if not ignoreEndTag:
1992                return token
1993
1994        def endTagColgroup(self, token):
1995            if self.ignoreEndTagColgroup():
1996                # innerHTML case
1997                assert self.parser.innerHTML
1998                self.parser.parseError()
1999            else:
2000                self.tree.openElements.pop()
2001                self.parser.phase = self.parser.phases["inTable"]
2002
2003        def endTagCol(self, token):
2004            self.parser.parseError("no-end-tag", {"name": "col"})
2005
2006        def endTagOther(self, token):
2007            ignoreEndTag = self.ignoreEndTagColgroup()
2008            self.endTagColgroup(impliedTagToken("colgroup"))
2009            if not ignoreEndTag:
2010                return token
2011
2012        startTagHandler = _utils.MethodDispatcher([
2013            ("html", Phase.startTagHtml),
2014            ("col", startTagCol)
2015        ])
2016        startTagHandler.default = startTagOther
2017
2018        endTagHandler = _utils.MethodDispatcher([
2019            ("colgroup", endTagColgroup),
2020            ("col", endTagCol)
2021        ])
2022        endTagHandler.default = endTagOther
2023
2024    class InTableBodyPhase(Phase):
2025        # http://www.whatwg.org/specs/web-apps/current-work/#in-table0
2026        __slots__ = tuple()
2027
2028        # helper methods
2029        def clearStackToTableBodyContext(self):
2030            while self.tree.openElements[-1].name not in ("tbody", "tfoot",
2031                                                          "thead", "html"):
2032                # self.parser.parseError("unexpected-implied-end-tag-in-table",
2033                #  {"name": self.tree.openElements[-1].name})
2034                self.tree.openElements.pop()
2035            if self.tree.openElements[-1].name == "html":
2036                assert self.parser.innerHTML
2037
2038        # the rest
2039        def processEOF(self):
2040            self.parser.phases["inTable"].processEOF()
2041
2042        def processSpaceCharacters(self, token):
2043            return self.parser.phases["inTable"].processSpaceCharacters(token)
2044
2045        def processCharacters(self, token):
2046            return self.parser.phases["inTable"].processCharacters(token)
2047
2048        def startTagTr(self, token):
2049            self.clearStackToTableBodyContext()
2050            self.tree.insertElement(token)
2051            self.parser.phase = self.parser.phases["inRow"]
2052
2053        def startTagTableCell(self, token):
2054            self.parser.parseError("unexpected-cell-in-table-body",
2055                                   {"name": token["name"]})
2056            self.startTagTr(impliedTagToken("tr", "StartTag"))
2057            return token
2058
2059        def startTagTableOther(self, token):
2060            # XXX AT Any ideas on how to share this with endTagTable?
2061            if (self.tree.elementInScope("tbody", variant="table") or
2062                self.tree.elementInScope("thead", variant="table") or
2063                    self.tree.elementInScope("tfoot", variant="table")):
2064                self.clearStackToTableBodyContext()
2065                self.endTagTableRowGroup(
2066                    impliedTagToken(self.tree.openElements[-1].name))
2067                return token
2068            else:
2069                # innerHTML case
2070                assert self.parser.innerHTML
2071                self.parser.parseError()
2072
2073        def startTagOther(self, token):
2074            return self.parser.phases["inTable"].processStartTag(token)
2075
2076        def endTagTableRowGroup(self, token):
2077            if self.tree.elementInScope(token["name"], variant="table"):
2078                self.clearStackToTableBodyContext()
2079                self.tree.openElements.pop()
2080                self.parser.phase = self.parser.phases["inTable"]
2081            else:
2082                self.parser.parseError("unexpected-end-tag-in-table-body",
2083                                       {"name": token["name"]})
2084
2085        def endTagTable(self, token):
2086            if (self.tree.elementInScope("tbody", variant="table") or
2087                self.tree.elementInScope("thead", variant="table") or
2088                    self.tree.elementInScope("tfoot", variant="table")):
2089                self.clearStackToTableBodyContext()
2090                self.endTagTableRowGroup(
2091                    impliedTagToken(self.tree.openElements[-1].name))
2092                return token
2093            else:
2094                # innerHTML case
2095                assert self.parser.innerHTML
2096                self.parser.parseError()
2097
2098        def endTagIgnore(self, token):
2099            self.parser.parseError("unexpected-end-tag-in-table-body",
2100                                   {"name": token["name"]})
2101
2102        def endTagOther(self, token):
2103            return self.parser.phases["inTable"].processEndTag(token)
2104
        # Start-tag dispatch table: each entry pairs a tag name (or a tuple
        # of names) with the handler method for it.  ``default`` is set to
        # ``startTagOther`` (presumably the fallback for unlisted names;
        # confirm against ``_utils.MethodDispatcher``).
        startTagHandler = _utils.MethodDispatcher([
            ("html", Phase.startTagHtml),
            ("tr", startTagTr),
            (("td", "th"), startTagTableCell),
            (("caption", "col", "colgroup", "tbody", "tfoot", "thead"),
             startTagTableOther)
        ])
        startTagHandler.default = startTagOther

        # End-tag dispatch table, same layout; ``default`` is ``endTagOther``.
        endTagHandler = _utils.MethodDispatcher([
            (("tbody", "tfoot", "thead"), endTagTableRowGroup),
            ("table", endTagTable),
            (("body", "caption", "col", "colgroup", "html", "td", "th",
              "tr"), endTagIgnore)
        ])
        endTagHandler.default = endTagOther
2121
2122    class InRowPhase(Phase):
2123        # http://www.whatwg.org/specs/web-apps/current-work/#in-row
2124        __slots__ = tuple()
2125
2126        # helper methods (XXX unify this with other table helper methods)
2127        def clearStackToTableRowContext(self):
2128            while self.tree.openElements[-1].name not in ("tr", "html"):
2129                self.parser.parseError("unexpected-implied-end-tag-in-table-row",
2130                                       {"name": self.tree.openElements[-1].name})
2131                self.tree.openElements.pop()
2132
2133        def ignoreEndTagTr(self):
2134            return not self.tree.elementInScope("tr", variant="table")
2135
2136        # the rest
2137        def processEOF(self):
2138            self.parser.phases["inTable"].processEOF()
2139
2140        def processSpaceCharacters(self, token):
2141            return self.parser.phases["inTable"].processSpaceCharacters(token)
2142
2143        def processCharacters(self, token):
2144            return self.parser.phases["inTable"].processCharacters(token)
2145
2146        def startTagTableCell(self, token):
2147            self.clearStackToTableRowContext()
2148            self.tree.insertElement(token)
2149            self.parser.phase = self.parser.phases["inCell"]
2150            self.tree.activeFormattingElements.append(Marker)
2151
2152        def startTagTableOther(self, token):
2153            ignoreEndTag = self.ignoreEndTagTr()
2154            self.endTagTr(impliedTagToken("tr"))
2155            # XXX how are we sure it's always ignored in the innerHTML case?
2156            if not ignoreEndTag:
2157                return token
2158
2159        def startTagOther(self, token):
2160            return self.parser.phases["inTable"].processStartTag(token)
2161
2162        def endTagTr(self, token):
2163            if not self.ignoreEndTagTr():
2164                self.clearStackToTableRowContext()
2165                self.tree.openElements.pop()
2166                self.parser.phase = self.parser.phases["inTableBody"]
2167            else:
2168                # innerHTML case
2169                assert self.parser.innerHTML
2170                self.parser.parseError()
2171
2172        def endTagTable(self, token):
2173            ignoreEndTag = self.ignoreEndTagTr()
2174            self.endTagTr(impliedTagToken("tr"))
2175            # Reprocess the current tag if the tr end tag was not ignored
2176            # XXX how are we sure it's always ignored in the innerHTML case?
2177            if not ignoreEndTag:
2178                return token
2179
2180        def endTagTableRowGroup(self, token):
2181            if self.tree.elementInScope(token["name"], variant="table"):
2182                self.endTagTr(impliedTagToken("tr"))
2183                return token
2184            else:
2185                self.parser.parseError()
2186
2187        def endTagIgnore(self, token):
2188            self.parser.parseError("unexpected-end-tag-in-table-row",
2189                                   {"name": token["name"]})
2190
2191        def endTagOther(self, token):
2192            return self.parser.phases["inTable"].processEndTag(token)
2193
2194        startTagHandler = _utils.MethodDispatcher([
2195            ("html", Phase.startTagHtml),
2196            (("td", "th"), startTagTableCell),
2197            (("caption", "col", "colgroup", "tbody", "tfoot", "thead",
2198              "tr"), startTagTableOther)
2199        ])
2200        startTagHandler.default = startTagOther
2201
2202        endTagHandler = _utils.MethodDispatcher([
2203            ("tr", endTagTr),
2204            ("table", endTagTable),
2205            (("tbody", "tfoot", "thead"), endTagTableRowGroup),
2206            (("body", "caption", "col", "colgroup", "html", "td", "th"),
2207             endTagIgnore)
2208        ])
2209        endTagHandler.default = endTagOther
2210
2211    class InCellPhase(Phase):
2212        # http://www.whatwg.org/specs/web-apps/current-work/#in-cell
2213        __slots__ = tuple()
2214
2215        # helper
2216        def closeCell(self):
2217            if self.tree.elementInScope("td", variant="table"):
2218                self.endTagTableCell(impliedTagToken("td"))
2219            elif self.tree.elementInScope("th", variant="table"):
2220                self.endTagTableCell(impliedTagToken("th"))
2221
2222        # the rest
2223        def processEOF(self):
2224            self.parser.phases["inBody"].processEOF()
2225
2226        def processCharacters(self, token):
2227            return self.parser.phases["inBody"].processCharacters(token)
2228
2229        def startTagTableOther(self, token):
2230            if (self.tree.elementInScope("td", variant="table") or
2231                    self.tree.elementInScope("th", variant="table")):
2232                self.closeCell()
2233                return token
2234            else:
2235                # innerHTML case
2236                assert self.parser.innerHTML
2237                self.parser.parseError()
2238
2239        def startTagOther(self, token):
2240            return self.parser.phases["inBody"].processStartTag(token)
2241
2242        def endTagTableCell(self, token):
2243            if self.tree.elementInScope(token["name"], variant="table"):
2244                self.tree.generateImpliedEndTags(token["name"])
2245                if self.tree.openElements[-1].name != token["name"]:
2246                    self.parser.parseError("unexpected-cell-end-tag",
2247                                           {"name": token["name"]})
2248                    while True:
2249                        node = self.tree.openElements.pop()
2250                        if node.name == token["name"]:
2251                            break
2252                else:
2253                    self.tree.openElements.pop()
2254                self.tree.clearActiveFormattingElements()
2255                self.parser.phase = self.parser.phases["inRow"]
2256            else:
2257                self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
2258
2259        def endTagIgnore(self, token):
2260            self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
2261
2262        def endTagImply(self, token):
2263            if self.tree.elementInScope(token["name"], variant="table"):
2264                self.closeCell()
2265                return token
2266            else:
2267                # sometimes innerHTML case
2268                self.parser.parseError()
2269
2270        def endTagOther(self, token):
2271            return self.parser.phases["inBody"].processEndTag(token)
2272
2273        startTagHandler = _utils.MethodDispatcher([
2274            ("html", Phase.startTagHtml),
2275            (("caption", "col", "colgroup", "tbody", "td", "tfoot", "th",
2276              "thead", "tr"), startTagTableOther)
2277        ])
2278        startTagHandler.default = startTagOther
2279
2280        endTagHandler = _utils.MethodDispatcher([
2281            (("td", "th"), endTagTableCell),
2282            (("body", "caption", "col", "colgroup", "html"), endTagIgnore),
2283            (("table", "tbody", "tfoot", "thead", "tr"), endTagImply)
2284        ])
2285        endTagHandler.default = endTagOther
2286
2287    class InSelectPhase(Phase):
2288        __slots__ = tuple()
2289
2290        # http://www.whatwg.org/specs/web-apps/current-work/#in-select
2291        def processEOF(self):
2292            if self.tree.openElements[-1].name != "html":
2293                self.parser.parseError("eof-in-select")
2294            else:
2295                assert self.parser.innerHTML
2296
2297        def processCharacters(self, token):
2298            if token["data"] == "\u0000":
2299                return
2300            self.tree.insertText(token["data"])
2301
2302        def startTagOption(self, token):
2303            # We need to imply </option> if <option> is the current node.
2304            if self.tree.openElements[-1].name == "option":
2305                self.tree.openElements.pop()
2306            self.tree.insertElement(token)
2307
2308        def startTagOptgroup(self, token):
2309            if self.tree.openElements[-1].name == "option":
2310                self.tree.openElements.pop()
2311            if self.tree.openElements[-1].name == "optgroup":
2312                self.tree.openElements.pop()
2313            self.tree.insertElement(token)
2314
2315        def startTagSelect(self, token):
2316            self.parser.parseError("unexpected-select-in-select")
2317            self.endTagSelect(impliedTagToken("select"))
2318
2319        def startTagInput(self, token):
2320            self.parser.parseError("unexpected-input-in-select")
2321            if self.tree.elementInScope("select", variant="select"):
2322                self.endTagSelect(impliedTagToken("select"))
2323                return token
2324            else:
2325                assert self.parser.innerHTML
2326
2327        def startTagScript(self, token):
2328            return self.parser.phases["inHead"].processStartTag(token)
2329
2330        def startTagOther(self, token):
2331            self.parser.parseError("unexpected-start-tag-in-select",
2332                                   {"name": token["name"]})
2333
2334        def endTagOption(self, token):
2335            if self.tree.openElements[-1].name == "option":
2336                self.tree.openElements.pop()
2337            else:
2338                self.parser.parseError("unexpected-end-tag-in-select",
2339                                       {"name": "option"})
2340
2341        def endTagOptgroup(self, token):
2342            # </optgroup> implicitly closes <option>
2343            if (self.tree.openElements[-1].name == "option" and
2344                    self.tree.openElements[-2].name == "optgroup"):
2345                self.tree.openElements.pop()
2346            # It also closes </optgroup>
2347            if self.tree.openElements[-1].name == "optgroup":
2348                self.tree.openElements.pop()
2349            # But nothing else
2350            else:
2351                self.parser.parseError("unexpected-end-tag-in-select",
2352                                       {"name": "optgroup"})
2353
2354        def endTagSelect(self, token):
2355            if self.tree.elementInScope("select", variant="select"):
2356                node = self.tree.openElements.pop()
2357                while node.name != "select":
2358                    node = self.tree.openElements.pop()
2359                self.parser.resetInsertionMode()
2360            else:
2361                # innerHTML case
2362                assert self.parser.innerHTML
2363                self.parser.parseError()
2364
2365        def endTagOther(self, token):
2366            self.parser.parseError("unexpected-end-tag-in-select",
2367                                   {"name": token["name"]})
2368
2369        startTagHandler = _utils.MethodDispatcher([
2370            ("html", Phase.startTagHtml),
2371            ("option", startTagOption),
2372            ("optgroup", startTagOptgroup),
2373            ("select", startTagSelect),
2374            (("input", "keygen", "textarea"), startTagInput),
2375            ("script", startTagScript)
2376        ])
2377        startTagHandler.default = startTagOther
2378
2379        endTagHandler = _utils.MethodDispatcher([
2380            ("option", endTagOption),
2381            ("optgroup", endTagOptgroup),
2382            ("select", endTagSelect)
2383        ])
2384        endTagHandler.default = endTagOther
2385
2386    class InSelectInTablePhase(Phase):
2387        __slots__ = tuple()
2388
2389        def processEOF(self):
2390            self.parser.phases["inSelect"].processEOF()
2391
2392        def processCharacters(self, token):
2393            return self.parser.phases["inSelect"].processCharacters(token)
2394
2395        def startTagTable(self, token):
2396            self.parser.parseError("unexpected-table-element-start-tag-in-select-in-table", {"name": token["name"]})
2397            self.endTagOther(impliedTagToken("select"))
2398            return token
2399
2400        def startTagOther(self, token):
2401            return self.parser.phases["inSelect"].processStartTag(token)
2402
2403        def endTagTable(self, token):
2404            self.parser.parseError("unexpected-table-element-end-tag-in-select-in-table", {"name": token["name"]})
2405            if self.tree.elementInScope(token["name"], variant="table"):
2406                self.endTagOther(impliedTagToken("select"))
2407                return token
2408
2409        def endTagOther(self, token):
2410            return self.parser.phases["inSelect"].processEndTag(token)
2411
2412        startTagHandler = _utils.MethodDispatcher([
2413            (("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"),
2414             startTagTable)
2415        ])
2416        startTagHandler.default = startTagOther
2417
2418        endTagHandler = _utils.MethodDispatcher([
2419            (("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"),
2420             endTagTable)
2421        ])
2422        endTagHandler.default = endTagOther
2423
2424    class InForeignContentPhase(Phase):
2425        __slots__ = tuple()
2426
2427        breakoutElements = frozenset(["b", "big", "blockquote", "body", "br",
2428                                      "center", "code", "dd", "div", "dl", "dt",
2429                                      "em", "embed", "h1", "h2", "h3",
2430                                      "h4", "h5", "h6", "head", "hr", "i", "img",
2431                                      "li", "listing", "menu", "meta", "nobr",
2432                                      "ol", "p", "pre", "ruby", "s", "small",
2433                                      "span", "strong", "strike", "sub", "sup",
2434                                      "table", "tt", "u", "ul", "var"])
2435
2436        def adjustSVGTagNames(self, token):
2437            replacements = {"altglyph": "altGlyph",
2438                            "altglyphdef": "altGlyphDef",
2439                            "altglyphitem": "altGlyphItem",
2440                            "animatecolor": "animateColor",
2441                            "animatemotion": "animateMotion",
2442                            "animatetransform": "animateTransform",
2443                            "clippath": "clipPath",
2444                            "feblend": "feBlend",
2445                            "fecolormatrix": "feColorMatrix",
2446                            "fecomponenttransfer": "feComponentTransfer",
2447                            "fecomposite": "feComposite",
2448                            "feconvolvematrix": "feConvolveMatrix",
2449                            "fediffuselighting": "feDiffuseLighting",
2450                            "fedisplacementmap": "feDisplacementMap",
2451                            "fedistantlight": "feDistantLight",
2452                            "feflood": "feFlood",
2453                            "fefunca": "feFuncA",
2454                            "fefuncb": "feFuncB",
2455                            "fefuncg": "feFuncG",
2456                            "fefuncr": "feFuncR",
2457                            "fegaussianblur": "feGaussianBlur",
2458                            "feimage": "feImage",
2459                            "femerge": "feMerge",
2460                            "femergenode": "feMergeNode",
2461                            "femorphology": "feMorphology",
2462                            "feoffset": "feOffset",
2463                            "fepointlight": "fePointLight",
2464                            "fespecularlighting": "feSpecularLighting",
2465                            "fespotlight": "feSpotLight",
2466                            "fetile": "feTile",
2467                            "feturbulence": "feTurbulence",
2468                            "foreignobject": "foreignObject",
2469                            "glyphref": "glyphRef",
2470                            "lineargradient": "linearGradient",
2471                            "radialgradient": "radialGradient",
2472                            "textpath": "textPath"}
2473
2474            if token["name"] in replacements:
2475                token["name"] = replacements[token["name"]]
2476
2477        def processCharacters(self, token):
2478            if token["data"] == "\u0000":
2479                token["data"] = "\uFFFD"
2480            elif (self.parser.framesetOK and
2481                  any(char not in spaceCharacters for char in token["data"])):
2482                self.parser.framesetOK = False
2483            Phase.processCharacters(self, token)
2484
2485        def processStartTag(self, token):
2486            currentNode = self.tree.openElements[-1]
2487            if (token["name"] in self.breakoutElements or
2488                (token["name"] == "font" and
2489                 set(token["data"].keys()) & {"color", "face", "size"})):
2490                self.parser.parseError("unexpected-html-element-in-foreign-content",
2491                                       {"name": token["name"]})
2492                while (self.tree.openElements[-1].namespace !=
2493                       self.tree.defaultNamespace and
2494                       not self.parser.isHTMLIntegrationPoint(self.tree.openElements[-1]) and
2495                       not self.parser.isMathMLTextIntegrationPoint(self.tree.openElements[-1])):
2496                    self.tree.openElements.pop()
2497                return token
2498
2499            else:
2500                if currentNode.namespace == namespaces["mathml"]:
2501                    self.parser.adjustMathMLAttributes(token)
2502                elif currentNode.namespace == namespaces["svg"]:
2503                    self.adjustSVGTagNames(token)
2504                    self.parser.adjustSVGAttributes(token)
2505                self.parser.adjustForeignAttributes(token)
2506                token["namespace"] = currentNode.namespace
2507                self.tree.insertElement(token)
2508                if token["selfClosing"]:
2509                    self.tree.openElements.pop()
2510                    token["selfClosingAcknowledged"] = True
2511
2512        def processEndTag(self, token):
2513            nodeIndex = len(self.tree.openElements) - 1
2514            node = self.tree.openElements[-1]
2515            if node.name.translate(asciiUpper2Lower) != token["name"]:
2516                self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
2517
2518            while True:
2519                if node.name.translate(asciiUpper2Lower) == token["name"]:
2520                    # XXX this isn't in the spec but it seems necessary
2521                    if self.parser.phase == self.parser.phases["inTableText"]:
2522                        self.parser.phase.flushCharacters()
2523                        self.parser.phase = self.parser.phase.originalPhase
2524                    while self.tree.openElements.pop() != node:
2525                        assert self.tree.openElements
2526                    new_token = None
2527                    break
2528                nodeIndex -= 1
2529
2530                node = self.tree.openElements[nodeIndex]
2531                if node.namespace != self.tree.defaultNamespace:
2532                    continue
2533                else:
2534                    new_token = self.parser.phase.processEndTag(token)
2535                    break
2536            return new_token
2537
2538    class AfterBodyPhase(Phase):
2539        __slots__ = tuple()
2540
2541        def processEOF(self):
2542            # Stop parsing
2543            pass
2544
2545        def processComment(self, token):
2546            # This is needed because data is to be appended to the <html> element
2547            # here and not to whatever is currently open.
2548            self.tree.insertComment(token, self.tree.openElements[0])
2549
2550        def processCharacters(self, token):
2551            self.parser.parseError("unexpected-char-after-body")
2552            self.parser.phase = self.parser.phases["inBody"]
2553            return token
2554
2555        def startTagHtml(self, token):
2556            return self.parser.phases["inBody"].processStartTag(token)
2557
2558        def startTagOther(self, token):
2559            self.parser.parseError("unexpected-start-tag-after-body",
2560                                   {"name": token["name"]})
2561            self.parser.phase = self.parser.phases["inBody"]
2562            return token
2563
2564        def endTagHtml(self, name):
2565            if self.parser.innerHTML:
2566                self.parser.parseError("unexpected-end-tag-after-body-innerhtml")
2567            else:
2568                self.parser.phase = self.parser.phases["afterAfterBody"]
2569
2570        def endTagOther(self, token):
2571            self.parser.parseError("unexpected-end-tag-after-body",
2572                                   {"name": token["name"]})
2573            self.parser.phase = self.parser.phases["inBody"]
2574            return token
2575
2576        startTagHandler = _utils.MethodDispatcher([
2577            ("html", startTagHtml)
2578        ])
2579        startTagHandler.default = startTagOther
2580
2581        endTagHandler = _utils.MethodDispatcher([("html", endTagHtml)])
2582        endTagHandler.default = endTagOther
2583
2584    class InFramesetPhase(Phase):
2585        # http://www.whatwg.org/specs/web-apps/current-work/#in-frameset
2586        __slots__ = tuple()
2587
2588        def processEOF(self):
2589            if self.tree.openElements[-1].name != "html":
2590                self.parser.parseError("eof-in-frameset")
2591            else:
2592                assert self.parser.innerHTML
2593
2594        def processCharacters(self, token):
2595            self.parser.parseError("unexpected-char-in-frameset")
2596
2597        def startTagFrameset(self, token):
2598            self.tree.insertElement(token)
2599
2600        def startTagFrame(self, token):
2601            self.tree.insertElement(token)
2602            self.tree.openElements.pop()
2603
2604        def startTagNoframes(self, token):
2605            return self.parser.phases["inBody"].processStartTag(token)
2606
2607        def startTagOther(self, token):
2608            self.parser.parseError("unexpected-start-tag-in-frameset",
2609                                   {"name": token["name"]})
2610
2611        def endTagFrameset(self, token):
2612            if self.tree.openElements[-1].name == "html":
2613                # innerHTML case
2614                self.parser.parseError("unexpected-frameset-in-frameset-innerhtml")
2615            else:
2616                self.tree.openElements.pop()
2617            if (not self.parser.innerHTML and
2618                    self.tree.openElements[-1].name != "frameset"):
2619                # If we're not in innerHTML mode and the current node is not a
2620                # "frameset" element (anymore) then switch.
2621                self.parser.phase = self.parser.phases["afterFrameset"]
2622
2623        def endTagOther(self, token):
2624            self.parser.parseError("unexpected-end-tag-in-frameset",
2625                                   {"name": token["name"]})
2626
2627        startTagHandler = _utils.MethodDispatcher([
2628            ("html", Phase.startTagHtml),
2629            ("frameset", startTagFrameset),
2630            ("frame", startTagFrame),
2631            ("noframes", startTagNoframes)
2632        ])
2633        startTagHandler.default = startTagOther
2634
2635        endTagHandler = _utils.MethodDispatcher([
2636            ("frameset", endTagFrameset)
2637        ])
2638        endTagHandler.default = endTagOther
2639
2640    class AfterFramesetPhase(Phase):
2641        # http://www.whatwg.org/specs/web-apps/current-work/#after3
2642        __slots__ = tuple()
2643
2644        def processEOF(self):
2645            # Stop parsing
2646            pass
2647
2648        def processCharacters(self, token):
2649            self.parser.parseError("unexpected-char-after-frameset")
2650
2651        def startTagNoframes(self, token):
2652            return self.parser.phases["inHead"].processStartTag(token)
2653
2654        def startTagOther(self, token):
2655            self.parser.parseError("unexpected-start-tag-after-frameset",
2656                                   {"name": token["name"]})
2657
2658        def endTagHtml(self, token):
2659            self.parser.phase = self.parser.phases["afterAfterFrameset"]
2660
2661        def endTagOther(self, token):
2662            self.parser.parseError("unexpected-end-tag-after-frameset",
2663                                   {"name": token["name"]})
2664
2665        startTagHandler = _utils.MethodDispatcher([
2666            ("html", Phase.startTagHtml),
2667            ("noframes", startTagNoframes)
2668        ])
2669        startTagHandler.default = startTagOther
2670
2671        endTagHandler = _utils.MethodDispatcher([
2672            ("html", endTagHtml)
2673        ])
2674        endTagHandler.default = endTagOther
2675
2676    class AfterAfterBodyPhase(Phase):
2677        __slots__ = tuple()
2678
2679        def processEOF(self):
2680            pass
2681
2682        def processComment(self, token):
2683            self.tree.insertComment(token, self.tree.document)
2684
2685        def processSpaceCharacters(self, token):
2686            return self.parser.phases["inBody"].processSpaceCharacters(token)
2687
2688        def processCharacters(self, token):
2689            self.parser.parseError("expected-eof-but-got-char")
2690            self.parser.phase = self.parser.phases["inBody"]
2691            return token
2692
2693        def startTagHtml(self, token):
2694            return self.parser.phases["inBody"].processStartTag(token)
2695
2696        def startTagOther(self, token):
2697            self.parser.parseError("expected-eof-but-got-start-tag",
2698                                   {"name": token["name"]})
2699            self.parser.phase = self.parser.phases["inBody"]
2700            return token
2701
2702        def processEndTag(self, token):
2703            self.parser.parseError("expected-eof-but-got-end-tag",
2704                                   {"name": token["name"]})
2705            self.parser.phase = self.parser.phases["inBody"]
2706            return token
2707
2708        startTagHandler = _utils.MethodDispatcher([
2709            ("html", startTagHtml)
2710        ])
2711        startTagHandler.default = startTagOther
2712
2713    class AfterAfterFramesetPhase(Phase):
2714        __slots__ = tuple()
2715
2716        def processEOF(self):
2717            pass
2718
2719        def processComment(self, token):
2720            self.tree.insertComment(token, self.tree.document)
2721
2722        def processSpaceCharacters(self, token):
2723            return self.parser.phases["inBody"].processSpaceCharacters(token)
2724
2725        def processCharacters(self, token):
2726            self.parser.parseError("expected-eof-but-got-char")
2727
2728        def startTagHtml(self, token):
2729            return self.parser.phases["inBody"].processStartTag(token)
2730
2731        def startTagNoFrames(self, token):
2732            return self.parser.phases["inHead"].processStartTag(token)
2733
2734        def startTagOther(self, token):
2735            self.parser.parseError("expected-eof-but-got-start-tag",
2736                                   {"name": token["name"]})
2737
2738        def processEndTag(self, token):
2739            self.parser.parseError("expected-eof-but-got-end-tag",
2740                                   {"name": token["name"]})
2741
2742        startTagHandler = _utils.MethodDispatcher([
2743            ("html", startTagHtml),
2744            ("noframes", startTagNoFrames)
2745        ])
2746        startTagHandler.default = startTagOther
2747
2748    # pylint:enable=unused-argument
2749
2750    return {
2751        "initial": InitialPhase,
2752        "beforeHtml": BeforeHtmlPhase,
2753        "beforeHead": BeforeHeadPhase,
2754        "inHead": InHeadPhase,
2755        "inHeadNoscript": InHeadNoscriptPhase,
2756        "afterHead": AfterHeadPhase,
2757        "inBody": InBodyPhase,
2758        "text": TextPhase,
2759        "inTable": InTablePhase,
2760        "inTableText": InTableTextPhase,
2761        "inCaption": InCaptionPhase,
2762        "inColumnGroup": InColumnGroupPhase,
2763        "inTableBody": InTableBodyPhase,
2764        "inRow": InRowPhase,
2765        "inCell": InCellPhase,
2766        "inSelect": InSelectPhase,
2767        "inSelectInTable": InSelectInTablePhase,
2768        "inForeignContent": InForeignContentPhase,
2769        "afterBody": AfterBodyPhase,
2770        "inFrameset": InFramesetPhase,
2771        "afterFrameset": AfterFramesetPhase,
2772        "afterAfterBody": AfterAfterBodyPhase,
2773        "afterAfterFrameset": AfterAfterFramesetPhase,
2774        # XXX after after frameset
2775    }
2776
2777
def adjust_attributes(token, replacements):
    """Rename attribute keys of ``token['data']`` using ``replacements``.

    Nothing happens unless at least one attribute name is a key of
    ``replacements``.  Otherwise ``token['data']`` is rebuilt as a new
    mapping of the same type, with each matching key replaced by its mapped
    value and all other keys and all values kept.

    :arg token: token dict whose ``'data'`` entry maps attribute names to values

    :arg replacements: mapping from original attribute name to new name
    """
    attributes = token['data']
    if not (viewkeys(attributes) & viewkeys(replacements)):
        return
    renamed = ((replacements.get(name, name), value)
               for name, value in attributes.items())
    token['data'] = type(attributes)(renamed)
2783
2784
def impliedTagToken(name, type="EndTag", attributes=None,
                    selfClosing=False):
    """Build a tag token dict with the given name.

    :arg name: tag name stored under ``"name"``

    :arg type: key looked up in ``tokenTypes`` for the ``"type"`` entry

    :arg attributes: mapping stored under ``"data"``; a fresh empty dict is
        used when this is ``None``

    :arg selfClosing: value stored under ``"selfClosing"``

    :returns: dict with the keys ``"type"``, ``"name"``, ``"data"`` and
        ``"selfClosing"``
    """
    token = {"type": tokenTypes[type], "name": name}
    token["data"] = {} if attributes is None else attributes
    token["selfClosing"] = selfClosing
    return token
2791
2792
class ParseError(Exception):
    """Exception type representing an error in a parsed document."""
2796