1from __future__ import absolute_import, division, unicode_literals
2
3from pip9._vendor.six import unichr as chr
4
5from collections import deque
6
7from .constants import spaceCharacters
8from .constants import entities
9from .constants import asciiLetters, asciiUpper2Lower
10from .constants import digits, hexDigits, EOF
11from .constants import tokenTypes, tagTokenTypes
12from .constants import replacementCharacters
13
14from ._inputstream import HTMLInputStream
15
16from ._trie import Trie
17
18entitiesTrie = Trie(entities)
19
20
21class HTMLTokenizer(object):
22    """ This class takes care of tokenizing HTML.
23
24    * self.currentToken
25      Holds the token that is currently being processed.
26
27    * self.state
28      Holds a reference to the method to be invoked... XXX
29
30    * self.stream
31      Points to HTMLInputStream object.
32    """
33
34    def __init__(self, stream, parser=None, **kwargs):
35
36        self.stream = HTMLInputStream(stream, **kwargs)
37        self.parser = parser
38
39        # Setup the initial tokenizer state
40        self.escapeFlag = False
41        self.lastFourChars = []
42        self.state = self.dataState
43        self.escape = False
44
45        # The current token being created
46        self.currentToken = None
47        super(HTMLTokenizer, self).__init__()
48
49    def __iter__(self):
50        """ This is where the magic happens.
51
52        We do our usually processing through the states and when we have a token
53        to return we yield the token which pauses processing until the next token
54        is requested.
55        """
56        self.tokenQueue = deque([])
57        # Start processing. When EOF is reached self.state will return False
58        # instead of True and the loop will terminate.
59        while self.state():
60            while self.stream.errors:
61                yield {"type": tokenTypes["ParseError"], "data": self.stream.errors.pop(0)}
62            while self.tokenQueue:
63                yield self.tokenQueue.popleft()
64
65    def consumeNumberEntity(self, isHex):
66        """This function returns either U+FFFD or the character based on the
67        decimal or hexadecimal representation. It also discards ";" if present.
68        If not present self.tokenQueue.append({"type": tokenTypes["ParseError"]}) is invoked.
69        """
70
71        allowed = digits
72        radix = 10
73        if isHex:
74            allowed = hexDigits
75            radix = 16
76
77        charStack = []
78
79        # Consume all the characters that are in range while making sure we
80        # don't hit an EOF.
81        c = self.stream.char()
82        while c in allowed and c is not EOF:
83            charStack.append(c)
84            c = self.stream.char()
85
86        # Convert the set of characters consumed to an int.
87        charAsInt = int("".join(charStack), radix)
88
89        # Certain characters get replaced with others
90        if charAsInt in replacementCharacters:
91            char = replacementCharacters[charAsInt]
92            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
93                                    "illegal-codepoint-for-numeric-entity",
94                                    "datavars": {"charAsInt": charAsInt}})
95        elif ((0xD800 <= charAsInt <= 0xDFFF) or
96              (charAsInt > 0x10FFFF)):
97            char = "\uFFFD"
98            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
99                                    "illegal-codepoint-for-numeric-entity",
100                                    "datavars": {"charAsInt": charAsInt}})
101        else:
102            # Should speed up this check somehow (e.g. move the set to a constant)
103            if ((0x0001 <= charAsInt <= 0x0008) or
104                (0x000E <= charAsInt <= 0x001F) or
105                (0x007F <= charAsInt <= 0x009F) or
106                (0xFDD0 <= charAsInt <= 0xFDEF) or
107                charAsInt in frozenset([0x000B, 0xFFFE, 0xFFFF, 0x1FFFE,
108                                        0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
109                                        0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE,
110                                        0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
111                                        0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE,
112                                        0x9FFFF, 0xAFFFE, 0xAFFFF, 0xBFFFE,
113                                        0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE,
114                                        0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE,
115                                        0xFFFFF, 0x10FFFE, 0x10FFFF])):
116                self.tokenQueue.append({"type": tokenTypes["ParseError"],
117                                        "data":
118                                        "illegal-codepoint-for-numeric-entity",
119                                        "datavars": {"charAsInt": charAsInt}})
120            try:
121                # Try/except needed as UCS-2 Python builds' unichar only works
122                # within the BMP.
123                char = chr(charAsInt)
124            except ValueError:
125                v = charAsInt - 0x10000
126                char = chr(0xD800 | (v >> 10)) + chr(0xDC00 | (v & 0x3FF))
127
128        # Discard the ; if present. Otherwise, put it back on the queue and
129        # invoke parseError on parser.
130        if c != ";":
131            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
132                                    "numeric-entity-without-semicolon"})
133            self.stream.unget(c)
134
135        return char
136
137    def consumeEntity(self, allowedChar=None, fromAttribute=False):
138        # Initialise to the default output for when no entity is matched
139        output = "&"
140
141        charStack = [self.stream.char()]
142        if (charStack[0] in spaceCharacters or charStack[0] in (EOF, "<", "&") or
143                (allowedChar is not None and allowedChar == charStack[0])):
144            self.stream.unget(charStack[0])
145
146        elif charStack[0] == "#":
147            # Read the next character to see if it's hex or decimal
148            hex = False
149            charStack.append(self.stream.char())
150            if charStack[-1] in ("x", "X"):
151                hex = True
152                charStack.append(self.stream.char())
153
154            # charStack[-1] should be the first digit
155            if (hex and charStack[-1] in hexDigits) \
156                    or (not hex and charStack[-1] in digits):
157                # At least one digit found, so consume the whole number
158                self.stream.unget(charStack[-1])
159                output = self.consumeNumberEntity(hex)
160            else:
161                # No digits found
162                self.tokenQueue.append({"type": tokenTypes["ParseError"],
163                                        "data": "expected-numeric-entity"})
164                self.stream.unget(charStack.pop())
165                output = "&" + "".join(charStack)
166
167        else:
168            # At this point in the process might have named entity. Entities
169            # are stored in the global variable "entities".
170            #
171            # Consume characters and compare to these to a substring of the
172            # entity names in the list until the substring no longer matches.
173            while (charStack[-1] is not EOF):
174                if not entitiesTrie.has_keys_with_prefix("".join(charStack)):
175                    break
176                charStack.append(self.stream.char())
177
178            # At this point we have a string that starts with some characters
179            # that may match an entity
180            # Try to find the longest entity the string will match to take care
181            # of &noti for instance.
182            try:
183                entityName = entitiesTrie.longest_prefix("".join(charStack[:-1]))
184                entityLength = len(entityName)
185            except KeyError:
186                entityName = None
187
188            if entityName is not None:
189                if entityName[-1] != ";":
190                    self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
191                                            "named-entity-without-semicolon"})
192                if (entityName[-1] != ";" and fromAttribute and
193                    (charStack[entityLength] in asciiLetters or
194                     charStack[entityLength] in digits or
195                     charStack[entityLength] == "=")):
196                    self.stream.unget(charStack.pop())
197                    output = "&" + "".join(charStack)
198                else:
199                    output = entities[entityName]
200                    self.stream.unget(charStack.pop())
201                    output += "".join(charStack[entityLength:])
202            else:
203                self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
204                                        "expected-named-entity"})
205                self.stream.unget(charStack.pop())
206                output = "&" + "".join(charStack)
207
208        if fromAttribute:
209            self.currentToken["data"][-1][1] += output
210        else:
211            if output in spaceCharacters:
212                tokenType = "SpaceCharacters"
213            else:
214                tokenType = "Characters"
215            self.tokenQueue.append({"type": tokenTypes[tokenType], "data": output})
216
217    def processEntityInAttribute(self, allowedChar):
218        """This method replaces the need for "entityInAttributeValueState".
219        """
220        self.consumeEntity(allowedChar=allowedChar, fromAttribute=True)
221
222    def emitCurrentToken(self):
223        """This method is a generic handler for emitting the tags. It also sets
224        the state to "data" because that's what's needed after a token has been
225        emitted.
226        """
227        token = self.currentToken
228        # Add token to the queue to be yielded
229        if (token["type"] in tagTokenTypes):
230            token["name"] = token["name"].translate(asciiUpper2Lower)
231            if token["type"] == tokenTypes["EndTag"]:
232                if token["data"]:
233                    self.tokenQueue.append({"type": tokenTypes["ParseError"],
234                                            "data": "attributes-in-end-tag"})
235                if token["selfClosing"]:
236                    self.tokenQueue.append({"type": tokenTypes["ParseError"],
237                                            "data": "self-closing-flag-on-end-tag"})
238        self.tokenQueue.append(token)
239        self.state = self.dataState
240
241    # Below are the various tokenizer states worked out.
242    def dataState(self):
243        data = self.stream.char()
244        if data == "&":
245            self.state = self.entityDataState
246        elif data == "<":
247            self.state = self.tagOpenState
248        elif data == "\u0000":
249            self.tokenQueue.append({"type": tokenTypes["ParseError"],
250                                    "data": "invalid-codepoint"})
251            self.tokenQueue.append({"type": tokenTypes["Characters"],
252                                    "data": "\u0000"})
253        elif data is EOF:
254            # Tokenization ends.
255            return False
256        elif data in spaceCharacters:
257            # Directly after emitting a token you switch back to the "data
258            # state". At that point spaceCharacters are important so they are
259            # emitted separately.
260            self.tokenQueue.append({"type": tokenTypes["SpaceCharacters"], "data":
261                                    data + self.stream.charsUntil(spaceCharacters, True)})
262            # No need to update lastFourChars here, since the first space will
263            # have already been appended to lastFourChars and will have broken
264            # any <!-- or --> sequences
265        else:
266            chars = self.stream.charsUntil(("&", "<", "\u0000"))
267            self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
268                                    data + chars})
269        return True
270
271    def entityDataState(self):
272        self.consumeEntity()
273        self.state = self.dataState
274        return True
275
276    def rcdataState(self):
277        data = self.stream.char()
278        if data == "&":
279            self.state = self.characterReferenceInRcdata
280        elif data == "<":
281            self.state = self.rcdataLessThanSignState
282        elif data == EOF:
283            # Tokenization ends.
284            return False
285        elif data == "\u0000":
286            self.tokenQueue.append({"type": tokenTypes["ParseError"],
287                                    "data": "invalid-codepoint"})
288            self.tokenQueue.append({"type": tokenTypes["Characters"],
289                                    "data": "\uFFFD"})
290        elif data in spaceCharacters:
291            # Directly after emitting a token you switch back to the "data
292            # state". At that point spaceCharacters are important so they are
293            # emitted separately.
294            self.tokenQueue.append({"type": tokenTypes["SpaceCharacters"], "data":
295                                    data + self.stream.charsUntil(spaceCharacters, True)})
296            # No need to update lastFourChars here, since the first space will
297            # have already been appended to lastFourChars and will have broken
298            # any <!-- or --> sequences
299        else:
300            chars = self.stream.charsUntil(("&", "<", "\u0000"))
301            self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
302                                    data + chars})
303        return True
304
305    def characterReferenceInRcdata(self):
306        self.consumeEntity()
307        self.state = self.rcdataState
308        return True
309
310    def rawtextState(self):
311        data = self.stream.char()
312        if data == "<":
313            self.state = self.rawtextLessThanSignState
314        elif data == "\u0000":
315            self.tokenQueue.append({"type": tokenTypes["ParseError"],
316                                    "data": "invalid-codepoint"})
317            self.tokenQueue.append({"type": tokenTypes["Characters"],
318                                    "data": "\uFFFD"})
319        elif data == EOF:
320            # Tokenization ends.
321            return False
322        else:
323            chars = self.stream.charsUntil(("<", "\u0000"))
324            self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
325                                    data + chars})
326        return True
327
328    def scriptDataState(self):
329        data = self.stream.char()
330        if data == "<":
331            self.state = self.scriptDataLessThanSignState
332        elif data == "\u0000":
333            self.tokenQueue.append({"type": tokenTypes["ParseError"],
334                                    "data": "invalid-codepoint"})
335            self.tokenQueue.append({"type": tokenTypes["Characters"],
336                                    "data": "\uFFFD"})
337        elif data == EOF:
338            # Tokenization ends.
339            return False
340        else:
341            chars = self.stream.charsUntil(("<", "\u0000"))
342            self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
343                                    data + chars})
344        return True
345
346    def plaintextState(self):
347        data = self.stream.char()
348        if data == EOF:
349            # Tokenization ends.
350            return False
351        elif data == "\u0000":
352            self.tokenQueue.append({"type": tokenTypes["ParseError"],
353                                    "data": "invalid-codepoint"})
354            self.tokenQueue.append({"type": tokenTypes["Characters"],
355                                    "data": "\uFFFD"})
356        else:
357            self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
358                                    data + self.stream.charsUntil("\u0000")})
359        return True
360
361    def tagOpenState(self):
362        data = self.stream.char()
363        if data == "!":
364            self.state = self.markupDeclarationOpenState
365        elif data == "/":
366            self.state = self.closeTagOpenState
367        elif data in asciiLetters:
368            self.currentToken = {"type": tokenTypes["StartTag"],
369                                 "name": data, "data": [],
370                                 "selfClosing": False,
371                                 "selfClosingAcknowledged": False}
372            self.state = self.tagNameState
373        elif data == ">":
374            # XXX In theory it could be something besides a tag name. But
375            # do we really care?
376            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
377                                    "expected-tag-name-but-got-right-bracket"})
378            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<>"})
379            self.state = self.dataState
380        elif data == "?":
381            # XXX In theory it could be something besides a tag name. But
382            # do we really care?
383            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
384                                    "expected-tag-name-but-got-question-mark"})
385            self.stream.unget(data)
386            self.state = self.bogusCommentState
387        else:
388            # XXX
389            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
390                                    "expected-tag-name"})
391            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
392            self.stream.unget(data)
393            self.state = self.dataState
394        return True
395
396    def closeTagOpenState(self):
397        data = self.stream.char()
398        if data in asciiLetters:
399            self.currentToken = {"type": tokenTypes["EndTag"], "name": data,
400                                 "data": [], "selfClosing": False}
401            self.state = self.tagNameState
402        elif data == ">":
403            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
404                                    "expected-closing-tag-but-got-right-bracket"})
405            self.state = self.dataState
406        elif data is EOF:
407            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
408                                    "expected-closing-tag-but-got-eof"})
409            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
410            self.state = self.dataState
411        else:
412            # XXX data can be _'_...
413            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
414                                    "expected-closing-tag-but-got-char",
415                                    "datavars": {"data": data}})
416            self.stream.unget(data)
417            self.state = self.bogusCommentState
418        return True
419
420    def tagNameState(self):
421        data = self.stream.char()
422        if data in spaceCharacters:
423            self.state = self.beforeAttributeNameState
424        elif data == ">":
425            self.emitCurrentToken()
426        elif data is EOF:
427            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
428                                    "eof-in-tag-name"})
429            self.state = self.dataState
430        elif data == "/":
431            self.state = self.selfClosingStartTagState
432        elif data == "\u0000":
433            self.tokenQueue.append({"type": tokenTypes["ParseError"],
434                                    "data": "invalid-codepoint"})
435            self.currentToken["name"] += "\uFFFD"
436        else:
437            self.currentToken["name"] += data
438            # (Don't use charsUntil here, because tag names are
439            # very short and it's faster to not do anything fancy)
440        return True
441
442    def rcdataLessThanSignState(self):
443        data = self.stream.char()
444        if data == "/":
445            self.temporaryBuffer = ""
446            self.state = self.rcdataEndTagOpenState
447        else:
448            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
449            self.stream.unget(data)
450            self.state = self.rcdataState
451        return True
452
453    def rcdataEndTagOpenState(self):
454        data = self.stream.char()
455        if data in asciiLetters:
456            self.temporaryBuffer += data
457            self.state = self.rcdataEndTagNameState
458        else:
459            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
460            self.stream.unget(data)
461            self.state = self.rcdataState
462        return True
463
464    def rcdataEndTagNameState(self):
465        appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
466        data = self.stream.char()
467        if data in spaceCharacters and appropriate:
468            self.currentToken = {"type": tokenTypes["EndTag"],
469                                 "name": self.temporaryBuffer,
470                                 "data": [], "selfClosing": False}
471            self.state = self.beforeAttributeNameState
472        elif data == "/" and appropriate:
473            self.currentToken = {"type": tokenTypes["EndTag"],
474                                 "name": self.temporaryBuffer,
475                                 "data": [], "selfClosing": False}
476            self.state = self.selfClosingStartTagState
477        elif data == ">" and appropriate:
478            self.currentToken = {"type": tokenTypes["EndTag"],
479                                 "name": self.temporaryBuffer,
480                                 "data": [], "selfClosing": False}
481            self.emitCurrentToken()
482            self.state = self.dataState
483        elif data in asciiLetters:
484            self.temporaryBuffer += data
485        else:
486            self.tokenQueue.append({"type": tokenTypes["Characters"],
487                                    "data": "</" + self.temporaryBuffer})
488            self.stream.unget(data)
489            self.state = self.rcdataState
490        return True
491
492    def rawtextLessThanSignState(self):
493        data = self.stream.char()
494        if data == "/":
495            self.temporaryBuffer = ""
496            self.state = self.rawtextEndTagOpenState
497        else:
498            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
499            self.stream.unget(data)
500            self.state = self.rawtextState
501        return True
502
503    def rawtextEndTagOpenState(self):
504        data = self.stream.char()
505        if data in asciiLetters:
506            self.temporaryBuffer += data
507            self.state = self.rawtextEndTagNameState
508        else:
509            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
510            self.stream.unget(data)
511            self.state = self.rawtextState
512        return True
513
514    def rawtextEndTagNameState(self):
515        appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
516        data = self.stream.char()
517        if data in spaceCharacters and appropriate:
518            self.currentToken = {"type": tokenTypes["EndTag"],
519                                 "name": self.temporaryBuffer,
520                                 "data": [], "selfClosing": False}
521            self.state = self.beforeAttributeNameState
522        elif data == "/" and appropriate:
523            self.currentToken = {"type": tokenTypes["EndTag"],
524                                 "name": self.temporaryBuffer,
525                                 "data": [], "selfClosing": False}
526            self.state = self.selfClosingStartTagState
527        elif data == ">" and appropriate:
528            self.currentToken = {"type": tokenTypes["EndTag"],
529                                 "name": self.temporaryBuffer,
530                                 "data": [], "selfClosing": False}
531            self.emitCurrentToken()
532            self.state = self.dataState
533        elif data in asciiLetters:
534            self.temporaryBuffer += data
535        else:
536            self.tokenQueue.append({"type": tokenTypes["Characters"],
537                                    "data": "</" + self.temporaryBuffer})
538            self.stream.unget(data)
539            self.state = self.rawtextState
540        return True
541
542    def scriptDataLessThanSignState(self):
543        data = self.stream.char()
544        if data == "/":
545            self.temporaryBuffer = ""
546            self.state = self.scriptDataEndTagOpenState
547        elif data == "!":
548            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<!"})
549            self.state = self.scriptDataEscapeStartState
550        else:
551            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
552            self.stream.unget(data)
553            self.state = self.scriptDataState
554        return True
555
556    def scriptDataEndTagOpenState(self):
557        data = self.stream.char()
558        if data in asciiLetters:
559            self.temporaryBuffer += data
560            self.state = self.scriptDataEndTagNameState
561        else:
562            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
563            self.stream.unget(data)
564            self.state = self.scriptDataState
565        return True
566
567    def scriptDataEndTagNameState(self):
568        appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
569        data = self.stream.char()
570        if data in spaceCharacters and appropriate:
571            self.currentToken = {"type": tokenTypes["EndTag"],
572                                 "name": self.temporaryBuffer,
573                                 "data": [], "selfClosing": False}
574            self.state = self.beforeAttributeNameState
575        elif data == "/" and appropriate:
576            self.currentToken = {"type": tokenTypes["EndTag"],
577                                 "name": self.temporaryBuffer,
578                                 "data": [], "selfClosing": False}
579            self.state = self.selfClosingStartTagState
580        elif data == ">" and appropriate:
581            self.currentToken = {"type": tokenTypes["EndTag"],
582                                 "name": self.temporaryBuffer,
583                                 "data": [], "selfClosing": False}
584            self.emitCurrentToken()
585            self.state = self.dataState
586        elif data in asciiLetters:
587            self.temporaryBuffer += data
588        else:
589            self.tokenQueue.append({"type": tokenTypes["Characters"],
590                                    "data": "</" + self.temporaryBuffer})
591            self.stream.unget(data)
592            self.state = self.scriptDataState
593        return True
594
595    def scriptDataEscapeStartState(self):
596        data = self.stream.char()
597        if data == "-":
598            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
599            self.state = self.scriptDataEscapeStartDashState
600        else:
601            self.stream.unget(data)
602            self.state = self.scriptDataState
603        return True
604
605    def scriptDataEscapeStartDashState(self):
606        data = self.stream.char()
607        if data == "-":
608            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
609            self.state = self.scriptDataEscapedDashDashState
610        else:
611            self.stream.unget(data)
612            self.state = self.scriptDataState
613        return True
614
615    def scriptDataEscapedState(self):
616        data = self.stream.char()
617        if data == "-":
618            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
619            self.state = self.scriptDataEscapedDashState
620        elif data == "<":
621            self.state = self.scriptDataEscapedLessThanSignState
622        elif data == "\u0000":
623            self.tokenQueue.append({"type": tokenTypes["ParseError"],
624                                    "data": "invalid-codepoint"})
625            self.tokenQueue.append({"type": tokenTypes["Characters"],
626                                    "data": "\uFFFD"})
627        elif data == EOF:
628            self.state = self.dataState
629        else:
630            chars = self.stream.charsUntil(("<", "-", "\u0000"))
631            self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
632                                    data + chars})
633        return True
634
635    def scriptDataEscapedDashState(self):
636        data = self.stream.char()
637        if data == "-":
638            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
639            self.state = self.scriptDataEscapedDashDashState
640        elif data == "<":
641            self.state = self.scriptDataEscapedLessThanSignState
642        elif data == "\u0000":
643            self.tokenQueue.append({"type": tokenTypes["ParseError"],
644                                    "data": "invalid-codepoint"})
645            self.tokenQueue.append({"type": tokenTypes["Characters"],
646                                    "data": "\uFFFD"})
647            self.state = self.scriptDataEscapedState
648        elif data == EOF:
649            self.state = self.dataState
650        else:
651            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
652            self.state = self.scriptDataEscapedState
653        return True
654
655    def scriptDataEscapedDashDashState(self):
656        data = self.stream.char()
657        if data == "-":
658            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
659        elif data == "<":
660            self.state = self.scriptDataEscapedLessThanSignState
661        elif data == ">":
662            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": ">"})
663            self.state = self.scriptDataState
664        elif data == "\u0000":
665            self.tokenQueue.append({"type": tokenTypes["ParseError"],
666                                    "data": "invalid-codepoint"})
667            self.tokenQueue.append({"type": tokenTypes["Characters"],
668                                    "data": "\uFFFD"})
669            self.state = self.scriptDataEscapedState
670        elif data == EOF:
671            self.state = self.dataState
672        else:
673            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
674            self.state = self.scriptDataEscapedState
675        return True
676
677    def scriptDataEscapedLessThanSignState(self):
678        data = self.stream.char()
679        if data == "/":
680            self.temporaryBuffer = ""
681            self.state = self.scriptDataEscapedEndTagOpenState
682        elif data in asciiLetters:
683            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<" + data})
684            self.temporaryBuffer = data
685            self.state = self.scriptDataDoubleEscapeStartState
686        else:
687            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
688            self.stream.unget(data)
689            self.state = self.scriptDataEscapedState
690        return True
691
692    def scriptDataEscapedEndTagOpenState(self):
693        data = self.stream.char()
694        if data in asciiLetters:
695            self.temporaryBuffer = data
696            self.state = self.scriptDataEscapedEndTagNameState
697        else:
698            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
699            self.stream.unget(data)
700            self.state = self.scriptDataEscapedState
701        return True
702
703    def scriptDataEscapedEndTagNameState(self):
704        appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
705        data = self.stream.char()
706        if data in spaceCharacters and appropriate:
707            self.currentToken = {"type": tokenTypes["EndTag"],
708                                 "name": self.temporaryBuffer,
709                                 "data": [], "selfClosing": False}
710            self.state = self.beforeAttributeNameState
711        elif data == "/" and appropriate:
712            self.currentToken = {"type": tokenTypes["EndTag"],
713                                 "name": self.temporaryBuffer,
714                                 "data": [], "selfClosing": False}
715            self.state = self.selfClosingStartTagState
716        elif data == ">" and appropriate:
717            self.currentToken = {"type": tokenTypes["EndTag"],
718                                 "name": self.temporaryBuffer,
719                                 "data": [], "selfClosing": False}
720            self.emitCurrentToken()
721            self.state = self.dataState
722        elif data in asciiLetters:
723            self.temporaryBuffer += data
724        else:
725            self.tokenQueue.append({"type": tokenTypes["Characters"],
726                                    "data": "</" + self.temporaryBuffer})
727            self.stream.unget(data)
728            self.state = self.scriptDataEscapedState
729        return True
730
731    def scriptDataDoubleEscapeStartState(self):
732        data = self.stream.char()
733        if data in (spaceCharacters | frozenset(("/", ">"))):
734            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
735            if self.temporaryBuffer.lower() == "script":
736                self.state = self.scriptDataDoubleEscapedState
737            else:
738                self.state = self.scriptDataEscapedState
739        elif data in asciiLetters:
740            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
741            self.temporaryBuffer += data
742        else:
743            self.stream.unget(data)
744            self.state = self.scriptDataEscapedState
745        return True
746
747    def scriptDataDoubleEscapedState(self):
748        data = self.stream.char()
749        if data == "-":
750            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
751            self.state = self.scriptDataDoubleEscapedDashState
752        elif data == "<":
753            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
754            self.state = self.scriptDataDoubleEscapedLessThanSignState
755        elif data == "\u0000":
756            self.tokenQueue.append({"type": tokenTypes["ParseError"],
757                                    "data": "invalid-codepoint"})
758            self.tokenQueue.append({"type": tokenTypes["Characters"],
759                                    "data": "\uFFFD"})
760        elif data == EOF:
761            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
762                                    "eof-in-script-in-script"})
763            self.state = self.dataState
764        else:
765            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
766        return True
767
768    def scriptDataDoubleEscapedDashState(self):
769        data = self.stream.char()
770        if data == "-":
771            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
772            self.state = self.scriptDataDoubleEscapedDashDashState
773        elif data == "<":
774            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
775            self.state = self.scriptDataDoubleEscapedLessThanSignState
776        elif data == "\u0000":
777            self.tokenQueue.append({"type": tokenTypes["ParseError"],
778                                    "data": "invalid-codepoint"})
779            self.tokenQueue.append({"type": tokenTypes["Characters"],
780                                    "data": "\uFFFD"})
781            self.state = self.scriptDataDoubleEscapedState
782        elif data == EOF:
783            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
784                                    "eof-in-script-in-script"})
785            self.state = self.dataState
786        else:
787            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
788            self.state = self.scriptDataDoubleEscapedState
789        return True
790
791    def scriptDataDoubleEscapedDashDashState(self):
792        data = self.stream.char()
793        if data == "-":
794            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
795        elif data == "<":
796            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
797            self.state = self.scriptDataDoubleEscapedLessThanSignState
798        elif data == ">":
799            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": ">"})
800            self.state = self.scriptDataState
801        elif data == "\u0000":
802            self.tokenQueue.append({"type": tokenTypes["ParseError"],
803                                    "data": "invalid-codepoint"})
804            self.tokenQueue.append({"type": tokenTypes["Characters"],
805                                    "data": "\uFFFD"})
806            self.state = self.scriptDataDoubleEscapedState
807        elif data == EOF:
808            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
809                                    "eof-in-script-in-script"})
810            self.state = self.dataState
811        else:
812            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
813            self.state = self.scriptDataDoubleEscapedState
814        return True
815
816    def scriptDataDoubleEscapedLessThanSignState(self):
817        data = self.stream.char()
818        if data == "/":
819            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "/"})
820            self.temporaryBuffer = ""
821            self.state = self.scriptDataDoubleEscapeEndState
822        else:
823            self.stream.unget(data)
824            self.state = self.scriptDataDoubleEscapedState
825        return True
826
827    def scriptDataDoubleEscapeEndState(self):
828        data = self.stream.char()
829        if data in (spaceCharacters | frozenset(("/", ">"))):
830            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
831            if self.temporaryBuffer.lower() == "script":
832                self.state = self.scriptDataEscapedState
833            else:
834                self.state = self.scriptDataDoubleEscapedState
835        elif data in asciiLetters:
836            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
837            self.temporaryBuffer += data
838        else:
839            self.stream.unget(data)
840            self.state = self.scriptDataDoubleEscapedState
841        return True
842
843    def beforeAttributeNameState(self):
844        data = self.stream.char()
845        if data in spaceCharacters:
846            self.stream.charsUntil(spaceCharacters, True)
847        elif data in asciiLetters:
848            self.currentToken["data"].append([data, ""])
849            self.state = self.attributeNameState
850        elif data == ">":
851            self.emitCurrentToken()
852        elif data == "/":
853            self.state = self.selfClosingStartTagState
854        elif data in ("'", '"', "=", "<"):
855            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
856                                    "invalid-character-in-attribute-name"})
857            self.currentToken["data"].append([data, ""])
858            self.state = self.attributeNameState
859        elif data == "\u0000":
860            self.tokenQueue.append({"type": tokenTypes["ParseError"],
861                                    "data": "invalid-codepoint"})
862            self.currentToken["data"].append(["\uFFFD", ""])
863            self.state = self.attributeNameState
864        elif data is EOF:
865            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
866                                    "expected-attribute-name-but-got-eof"})
867            self.state = self.dataState
868        else:
869            self.currentToken["data"].append([data, ""])
870            self.state = self.attributeNameState
871        return True
872
873    def attributeNameState(self):
874        data = self.stream.char()
875        leavingThisState = True
876        emitToken = False
877        if data == "=":
878            self.state = self.beforeAttributeValueState
879        elif data in asciiLetters:
880            self.currentToken["data"][-1][0] += data +\
881                self.stream.charsUntil(asciiLetters, True)
882            leavingThisState = False
883        elif data == ">":
884            # XXX If we emit here the attributes are converted to a dict
885            # without being checked and when the code below runs we error
886            # because data is a dict not a list
887            emitToken = True
888        elif data in spaceCharacters:
889            self.state = self.afterAttributeNameState
890        elif data == "/":
891            self.state = self.selfClosingStartTagState
892        elif data == "\u0000":
893            self.tokenQueue.append({"type": tokenTypes["ParseError"],
894                                    "data": "invalid-codepoint"})
895            self.currentToken["data"][-1][0] += "\uFFFD"
896            leavingThisState = False
897        elif data in ("'", '"', "<"):
898            self.tokenQueue.append({"type": tokenTypes["ParseError"],
899                                    "data":
900                                    "invalid-character-in-attribute-name"})
901            self.currentToken["data"][-1][0] += data
902            leavingThisState = False
903        elif data is EOF:
904            self.tokenQueue.append({"type": tokenTypes["ParseError"],
905                                    "data": "eof-in-attribute-name"})
906            self.state = self.dataState
907        else:
908            self.currentToken["data"][-1][0] += data
909            leavingThisState = False
910
911        if leavingThisState:
912            # Attributes are not dropped at this stage. That happens when the
913            # start tag token is emitted so values can still be safely appended
914            # to attributes, but we do want to report the parse error in time.
915            self.currentToken["data"][-1][0] = (
916                self.currentToken["data"][-1][0].translate(asciiUpper2Lower))
917            for name, _ in self.currentToken["data"][:-1]:
918                if self.currentToken["data"][-1][0] == name:
919                    self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
920                                            "duplicate-attribute"})
921                    break
922            # XXX Fix for above XXX
923            if emitToken:
924                self.emitCurrentToken()
925        return True
926
927    def afterAttributeNameState(self):
928        data = self.stream.char()
929        if data in spaceCharacters:
930            self.stream.charsUntil(spaceCharacters, True)
931        elif data == "=":
932            self.state = self.beforeAttributeValueState
933        elif data == ">":
934            self.emitCurrentToken()
935        elif data in asciiLetters:
936            self.currentToken["data"].append([data, ""])
937            self.state = self.attributeNameState
938        elif data == "/":
939            self.state = self.selfClosingStartTagState
940        elif data == "\u0000":
941            self.tokenQueue.append({"type": tokenTypes["ParseError"],
942                                    "data": "invalid-codepoint"})
943            self.currentToken["data"].append(["\uFFFD", ""])
944            self.state = self.attributeNameState
945        elif data in ("'", '"', "<"):
946            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
947                                    "invalid-character-after-attribute-name"})
948            self.currentToken["data"].append([data, ""])
949            self.state = self.attributeNameState
950        elif data is EOF:
951            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
952                                    "expected-end-of-tag-but-got-eof"})
953            self.state = self.dataState
954        else:
955            self.currentToken["data"].append([data, ""])
956            self.state = self.attributeNameState
957        return True
958
959    def beforeAttributeValueState(self):
960        data = self.stream.char()
961        if data in spaceCharacters:
962            self.stream.charsUntil(spaceCharacters, True)
963        elif data == "\"":
964            self.state = self.attributeValueDoubleQuotedState
965        elif data == "&":
966            self.state = self.attributeValueUnQuotedState
967            self.stream.unget(data)
968        elif data == "'":
969            self.state = self.attributeValueSingleQuotedState
970        elif data == ">":
971            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
972                                    "expected-attribute-value-but-got-right-bracket"})
973            self.emitCurrentToken()
974        elif data == "\u0000":
975            self.tokenQueue.append({"type": tokenTypes["ParseError"],
976                                    "data": "invalid-codepoint"})
977            self.currentToken["data"][-1][1] += "\uFFFD"
978            self.state = self.attributeValueUnQuotedState
979        elif data in ("=", "<", "`"):
980            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
981                                    "equals-in-unquoted-attribute-value"})
982            self.currentToken["data"][-1][1] += data
983            self.state = self.attributeValueUnQuotedState
984        elif data is EOF:
985            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
986                                    "expected-attribute-value-but-got-eof"})
987            self.state = self.dataState
988        else:
989            self.currentToken["data"][-1][1] += data
990            self.state = self.attributeValueUnQuotedState
991        return True
992
993    def attributeValueDoubleQuotedState(self):
994        data = self.stream.char()
995        if data == "\"":
996            self.state = self.afterAttributeValueState
997        elif data == "&":
998            self.processEntityInAttribute('"')
999        elif data == "\u0000":
1000            self.tokenQueue.append({"type": tokenTypes["ParseError"],
1001                                    "data": "invalid-codepoint"})
1002            self.currentToken["data"][-1][1] += "\uFFFD"
1003        elif data is EOF:
1004            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1005                                    "eof-in-attribute-value-double-quote"})
1006            self.state = self.dataState
1007        else:
1008            self.currentToken["data"][-1][1] += data +\
1009                self.stream.charsUntil(("\"", "&", "\u0000"))
1010        return True
1011
1012    def attributeValueSingleQuotedState(self):
1013        data = self.stream.char()
1014        if data == "'":
1015            self.state = self.afterAttributeValueState
1016        elif data == "&":
1017            self.processEntityInAttribute("'")
1018        elif data == "\u0000":
1019            self.tokenQueue.append({"type": tokenTypes["ParseError"],
1020                                    "data": "invalid-codepoint"})
1021            self.currentToken["data"][-1][1] += "\uFFFD"
1022        elif data is EOF:
1023            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1024                                    "eof-in-attribute-value-single-quote"})
1025            self.state = self.dataState
1026        else:
1027            self.currentToken["data"][-1][1] += data +\
1028                self.stream.charsUntil(("'", "&", "\u0000"))
1029        return True
1030
1031    def attributeValueUnQuotedState(self):
1032        data = self.stream.char()
1033        if data in spaceCharacters:
1034            self.state = self.beforeAttributeNameState
1035        elif data == "&":
1036            self.processEntityInAttribute(">")
1037        elif data == ">":
1038            self.emitCurrentToken()
1039        elif data in ('"', "'", "=", "<", "`"):
1040            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1041                                    "unexpected-character-in-unquoted-attribute-value"})
1042            self.currentToken["data"][-1][1] += data
1043        elif data == "\u0000":
1044            self.tokenQueue.append({"type": tokenTypes["ParseError"],
1045                                    "data": "invalid-codepoint"})
1046            self.currentToken["data"][-1][1] += "\uFFFD"
1047        elif data is EOF:
1048            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1049                                    "eof-in-attribute-value-no-quotes"})
1050            self.state = self.dataState
1051        else:
1052            self.currentToken["data"][-1][1] += data + self.stream.charsUntil(
1053                frozenset(("&", ">", '"', "'", "=", "<", "`", "\u0000")) | spaceCharacters)
1054        return True
1055
1056    def afterAttributeValueState(self):
1057        data = self.stream.char()
1058        if data in spaceCharacters:
1059            self.state = self.beforeAttributeNameState
1060        elif data == ">":
1061            self.emitCurrentToken()
1062        elif data == "/":
1063            self.state = self.selfClosingStartTagState
1064        elif data is EOF:
1065            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1066                                    "unexpected-EOF-after-attribute-value"})
1067            self.stream.unget(data)
1068            self.state = self.dataState
1069        else:
1070            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1071                                    "unexpected-character-after-attribute-value"})
1072            self.stream.unget(data)
1073            self.state = self.beforeAttributeNameState
1074        return True
1075
1076    def selfClosingStartTagState(self):
1077        data = self.stream.char()
1078        if data == ">":
1079            self.currentToken["selfClosing"] = True
1080            self.emitCurrentToken()
1081        elif data is EOF:
1082            self.tokenQueue.append({"type": tokenTypes["ParseError"],
1083                                    "data":
1084                                    "unexpected-EOF-after-solidus-in-tag"})
1085            self.stream.unget(data)
1086            self.state = self.dataState
1087        else:
1088            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1089                                    "unexpected-character-after-solidus-in-tag"})
1090            self.stream.unget(data)
1091            self.state = self.beforeAttributeNameState
1092        return True
1093
1094    def bogusCommentState(self):
1095        # Make a new comment token and give it as value all the characters
1096        # until the first > or EOF (charsUntil checks for EOF automatically)
1097        # and emit it.
1098        data = self.stream.charsUntil(">")
1099        data = data.replace("\u0000", "\uFFFD")
1100        self.tokenQueue.append(
1101            {"type": tokenTypes["Comment"], "data": data})
1102
1103        # Eat the character directly after the bogus comment which is either a
1104        # ">" or an EOF.
1105        self.stream.char()
1106        self.state = self.dataState
1107        return True
1108
1109    def markupDeclarationOpenState(self):
1110        charStack = [self.stream.char()]
1111        if charStack[-1] == "-":
1112            charStack.append(self.stream.char())
1113            if charStack[-1] == "-":
1114                self.currentToken = {"type": tokenTypes["Comment"], "data": ""}
1115                self.state = self.commentStartState
1116                return True
1117        elif charStack[-1] in ('d', 'D'):
1118            matched = True
1119            for expected in (('o', 'O'), ('c', 'C'), ('t', 'T'),
1120                             ('y', 'Y'), ('p', 'P'), ('e', 'E')):
1121                charStack.append(self.stream.char())
1122                if charStack[-1] not in expected:
1123                    matched = False
1124                    break
1125            if matched:
1126                self.currentToken = {"type": tokenTypes["Doctype"],
1127                                     "name": "",
1128                                     "publicId": None, "systemId": None,
1129                                     "correct": True}
1130                self.state = self.doctypeState
1131                return True
1132        elif (charStack[-1] == "[" and
1133              self.parser is not None and
1134              self.parser.tree.openElements and
1135              self.parser.tree.openElements[-1].namespace != self.parser.tree.defaultNamespace):
1136            matched = True
1137            for expected in ["C", "D", "A", "T", "A", "["]:
1138                charStack.append(self.stream.char())
1139                if charStack[-1] != expected:
1140                    matched = False
1141                    break
1142            if matched:
1143                self.state = self.cdataSectionState
1144                return True
1145
1146        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1147                                "expected-dashes-or-doctype"})
1148
1149        while charStack:
1150            self.stream.unget(charStack.pop())
1151        self.state = self.bogusCommentState
1152        return True
1153
1154    def commentStartState(self):
1155        data = self.stream.char()
1156        if data == "-":
1157            self.state = self.commentStartDashState
1158        elif data == "\u0000":
1159            self.tokenQueue.append({"type": tokenTypes["ParseError"],
1160                                    "data": "invalid-codepoint"})
1161            self.currentToken["data"] += "\uFFFD"
1162        elif data == ">":
1163            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1164                                    "incorrect-comment"})
1165            self.tokenQueue.append(self.currentToken)
1166            self.state = self.dataState
1167        elif data is EOF:
1168            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1169                                    "eof-in-comment"})
1170            self.tokenQueue.append(self.currentToken)
1171            self.state = self.dataState
1172        else:
1173            self.currentToken["data"] += data
1174            self.state = self.commentState
1175        return True
1176
1177    def commentStartDashState(self):
1178        data = self.stream.char()
1179        if data == "-":
1180            self.state = self.commentEndState
1181        elif data == "\u0000":
1182            self.tokenQueue.append({"type": tokenTypes["ParseError"],
1183                                    "data": "invalid-codepoint"})
1184            self.currentToken["data"] += "-\uFFFD"
1185        elif data == ">":
1186            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1187                                    "incorrect-comment"})
1188            self.tokenQueue.append(self.currentToken)
1189            self.state = self.dataState
1190        elif data is EOF:
1191            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1192                                    "eof-in-comment"})
1193            self.tokenQueue.append(self.currentToken)
1194            self.state = self.dataState
1195        else:
1196            self.currentToken["data"] += "-" + data
1197            self.state = self.commentState
1198        return True
1199
1200    def commentState(self):
1201        data = self.stream.char()
1202        if data == "-":
1203            self.state = self.commentEndDashState
1204        elif data == "\u0000":
1205            self.tokenQueue.append({"type": tokenTypes["ParseError"],
1206                                    "data": "invalid-codepoint"})
1207            self.currentToken["data"] += "\uFFFD"
1208        elif data is EOF:
1209            self.tokenQueue.append({"type": tokenTypes["ParseError"],
1210                                    "data": "eof-in-comment"})
1211            self.tokenQueue.append(self.currentToken)
1212            self.state = self.dataState
1213        else:
1214            self.currentToken["data"] += data + \
1215                self.stream.charsUntil(("-", "\u0000"))
1216        return True
1217
1218    def commentEndDashState(self):
1219        data = self.stream.char()
1220        if data == "-":
1221            self.state = self.commentEndState
1222        elif data == "\u0000":
1223            self.tokenQueue.append({"type": tokenTypes["ParseError"],
1224                                    "data": "invalid-codepoint"})
1225            self.currentToken["data"] += "-\uFFFD"
1226            self.state = self.commentState
1227        elif data is EOF:
1228            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1229                                    "eof-in-comment-end-dash"})
1230            self.tokenQueue.append(self.currentToken)
1231            self.state = self.dataState
1232        else:
1233            self.currentToken["data"] += "-" + data
1234            self.state = self.commentState
1235        return True
1236
1237    def commentEndState(self):
1238        data = self.stream.char()
1239        if data == ">":
1240            self.tokenQueue.append(self.currentToken)
1241            self.state = self.dataState
1242        elif data == "\u0000":
1243            self.tokenQueue.append({"type": tokenTypes["ParseError"],
1244                                    "data": "invalid-codepoint"})
1245            self.currentToken["data"] += "--\uFFFD"
1246            self.state = self.commentState
1247        elif data == "!":
1248            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1249                                    "unexpected-bang-after-double-dash-in-comment"})
1250            self.state = self.commentEndBangState
1251        elif data == "-":
1252            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1253                                    "unexpected-dash-after-double-dash-in-comment"})
1254            self.currentToken["data"] += data
1255        elif data is EOF:
1256            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1257                                    "eof-in-comment-double-dash"})
1258            self.tokenQueue.append(self.currentToken)
1259            self.state = self.dataState
1260        else:
1261            # XXX
1262            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1263                                    "unexpected-char-in-comment"})
1264            self.currentToken["data"] += "--" + data
1265            self.state = self.commentState
1266        return True
1267
1268    def commentEndBangState(self):
1269        data = self.stream.char()
1270        if data == ">":
1271            self.tokenQueue.append(self.currentToken)
1272            self.state = self.dataState
1273        elif data == "-":
1274            self.currentToken["data"] += "--!"
1275            self.state = self.commentEndDashState
1276        elif data == "\u0000":
1277            self.tokenQueue.append({"type": tokenTypes["ParseError"],
1278                                    "data": "invalid-codepoint"})
1279            self.currentToken["data"] += "--!\uFFFD"
1280            self.state = self.commentState
1281        elif data is EOF:
1282            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1283                                    "eof-in-comment-end-bang-state"})
1284            self.tokenQueue.append(self.currentToken)
1285            self.state = self.dataState
1286        else:
1287            self.currentToken["data"] += "--!" + data
1288            self.state = self.commentState
1289        return True
1290
1291    def doctypeState(self):
1292        data = self.stream.char()
1293        if data in spaceCharacters:
1294            self.state = self.beforeDoctypeNameState
1295        elif data is EOF:
1296            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1297                                    "expected-doctype-name-but-got-eof"})
1298            self.currentToken["correct"] = False
1299            self.tokenQueue.append(self.currentToken)
1300            self.state = self.dataState
1301        else:
1302            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1303                                    "need-space-after-doctype"})
1304            self.stream.unget(data)
1305            self.state = self.beforeDoctypeNameState
1306        return True
1307
1308    def beforeDoctypeNameState(self):
1309        data = self.stream.char()
1310        if data in spaceCharacters:
1311            pass
1312        elif data == ">":
1313            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1314                                    "expected-doctype-name-but-got-right-bracket"})
1315            self.currentToken["correct"] = False
1316            self.tokenQueue.append(self.currentToken)
1317            self.state = self.dataState
1318        elif data == "\u0000":
1319            self.tokenQueue.append({"type": tokenTypes["ParseError"],
1320                                    "data": "invalid-codepoint"})
1321            self.currentToken["name"] = "\uFFFD"
1322            self.state = self.doctypeNameState
1323        elif data is EOF:
1324            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1325                                    "expected-doctype-name-but-got-eof"})
1326            self.currentToken["correct"] = False
1327            self.tokenQueue.append(self.currentToken)
1328            self.state = self.dataState
1329        else:
1330            self.currentToken["name"] = data
1331            self.state = self.doctypeNameState
1332        return True
1333
1334    def doctypeNameState(self):
1335        data = self.stream.char()
1336        if data in spaceCharacters:
1337            self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower)
1338            self.state = self.afterDoctypeNameState
1339        elif data == ">":
1340            self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower)
1341            self.tokenQueue.append(self.currentToken)
1342            self.state = self.dataState
1343        elif data == "\u0000":
1344            self.tokenQueue.append({"type": tokenTypes["ParseError"],
1345                                    "data": "invalid-codepoint"})
1346            self.currentToken["name"] += "\uFFFD"
1347            self.state = self.doctypeNameState
1348        elif data is EOF:
1349            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1350                                    "eof-in-doctype-name"})
1351            self.currentToken["correct"] = False
1352            self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower)
1353            self.tokenQueue.append(self.currentToken)
1354            self.state = self.dataState
1355        else:
1356            self.currentToken["name"] += data
1357        return True
1358
1359    def afterDoctypeNameState(self):
1360        data = self.stream.char()
1361        if data in spaceCharacters:
1362            pass
1363        elif data == ">":
1364            self.tokenQueue.append(self.currentToken)
1365            self.state = self.dataState
1366        elif data is EOF:
1367            self.currentToken["correct"] = False
1368            self.stream.unget(data)
1369            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1370                                    "eof-in-doctype"})
1371            self.tokenQueue.append(self.currentToken)
1372            self.state = self.dataState
1373        else:
1374            if data in ("p", "P"):
1375                matched = True
1376                for expected in (("u", "U"), ("b", "B"), ("l", "L"),
1377                                 ("i", "I"), ("c", "C")):
1378                    data = self.stream.char()
1379                    if data not in expected:
1380                        matched = False
1381                        break
1382                if matched:
1383                    self.state = self.afterDoctypePublicKeywordState
1384                    return True
1385            elif data in ("s", "S"):
1386                matched = True
1387                for expected in (("y", "Y"), ("s", "S"), ("t", "T"),
1388                                 ("e", "E"), ("m", "M")):
1389                    data = self.stream.char()
1390                    if data not in expected:
1391                        matched = False
1392                        break
1393                if matched:
1394                    self.state = self.afterDoctypeSystemKeywordState
1395                    return True
1396
1397            # All the characters read before the current 'data' will be
1398            # [a-zA-Z], so they're garbage in the bogus doctype and can be
1399            # discarded; only the latest character might be '>' or EOF
1400            # and needs to be ungetted
1401            self.stream.unget(data)
1402            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1403                                    "expected-space-or-right-bracket-in-doctype", "datavars":
1404                                    {"data": data}})
1405            self.currentToken["correct"] = False
1406            self.state = self.bogusDoctypeState
1407
1408        return True
1409
1410    def afterDoctypePublicKeywordState(self):
1411        data = self.stream.char()
1412        if data in spaceCharacters:
1413            self.state = self.beforeDoctypePublicIdentifierState
1414        elif data in ("'", '"'):
1415            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1416                                    "unexpected-char-in-doctype"})
1417            self.stream.unget(data)
1418            self.state = self.beforeDoctypePublicIdentifierState
1419        elif data is EOF:
1420            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1421                                    "eof-in-doctype"})
1422            self.currentToken["correct"] = False
1423            self.tokenQueue.append(self.currentToken)
1424            self.state = self.dataState
1425        else:
1426            self.stream.unget(data)
1427            self.state = self.beforeDoctypePublicIdentifierState
1428        return True
1429
1430    def beforeDoctypePublicIdentifierState(self):
1431        data = self.stream.char()
1432        if data in spaceCharacters:
1433            pass
1434        elif data == "\"":
1435            self.currentToken["publicId"] = ""
1436            self.state = self.doctypePublicIdentifierDoubleQuotedState
1437        elif data == "'":
1438            self.currentToken["publicId"] = ""
1439            self.state = self.doctypePublicIdentifierSingleQuotedState
1440        elif data == ">":
1441            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1442                                    "unexpected-end-of-doctype"})
1443            self.currentToken["correct"] = False
1444            self.tokenQueue.append(self.currentToken)
1445            self.state = self.dataState
1446        elif data is EOF:
1447            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1448                                    "eof-in-doctype"})
1449            self.currentToken["correct"] = False
1450            self.tokenQueue.append(self.currentToken)
1451            self.state = self.dataState
1452        else:
1453            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1454                                    "unexpected-char-in-doctype"})
1455            self.currentToken["correct"] = False
1456            self.state = self.bogusDoctypeState
1457        return True
1458
1459    def doctypePublicIdentifierDoubleQuotedState(self):
1460        data = self.stream.char()
1461        if data == "\"":
1462            self.state = self.afterDoctypePublicIdentifierState
1463        elif data == "\u0000":
1464            self.tokenQueue.append({"type": tokenTypes["ParseError"],
1465                                    "data": "invalid-codepoint"})
1466            self.currentToken["publicId"] += "\uFFFD"
1467        elif data == ">":
1468            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1469                                    "unexpected-end-of-doctype"})
1470            self.currentToken["correct"] = False
1471            self.tokenQueue.append(self.currentToken)
1472            self.state = self.dataState
1473        elif data is EOF:
1474            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1475                                    "eof-in-doctype"})
1476            self.currentToken["correct"] = False
1477            self.tokenQueue.append(self.currentToken)
1478            self.state = self.dataState
1479        else:
1480            self.currentToken["publicId"] += data
1481        return True
1482
1483    def doctypePublicIdentifierSingleQuotedState(self):
1484        data = self.stream.char()
1485        if data == "'":
1486            self.state = self.afterDoctypePublicIdentifierState
1487        elif data == "\u0000":
1488            self.tokenQueue.append({"type": tokenTypes["ParseError"],
1489                                    "data": "invalid-codepoint"})
1490            self.currentToken["publicId"] += "\uFFFD"
1491        elif data == ">":
1492            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1493                                    "unexpected-end-of-doctype"})
1494            self.currentToken["correct"] = False
1495            self.tokenQueue.append(self.currentToken)
1496            self.state = self.dataState
1497        elif data is EOF:
1498            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1499                                    "eof-in-doctype"})
1500            self.currentToken["correct"] = False
1501            self.tokenQueue.append(self.currentToken)
1502            self.state = self.dataState
1503        else:
1504            self.currentToken["publicId"] += data
1505        return True
1506
1507    def afterDoctypePublicIdentifierState(self):
1508        data = self.stream.char()
1509        if data in spaceCharacters:
1510            self.state = self.betweenDoctypePublicAndSystemIdentifiersState
1511        elif data == ">":
1512            self.tokenQueue.append(self.currentToken)
1513            self.state = self.dataState
1514        elif data == '"':
1515            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1516                                    "unexpected-char-in-doctype"})
1517            self.currentToken["systemId"] = ""
1518            self.state = self.doctypeSystemIdentifierDoubleQuotedState
1519        elif data == "'":
1520            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1521                                    "unexpected-char-in-doctype"})
1522            self.currentToken["systemId"] = ""
1523            self.state = self.doctypeSystemIdentifierSingleQuotedState
1524        elif data is EOF:
1525            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1526                                    "eof-in-doctype"})
1527            self.currentToken["correct"] = False
1528            self.tokenQueue.append(self.currentToken)
1529            self.state = self.dataState
1530        else:
1531            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1532                                    "unexpected-char-in-doctype"})
1533            self.currentToken["correct"] = False
1534            self.state = self.bogusDoctypeState
1535        return True
1536
1537    def betweenDoctypePublicAndSystemIdentifiersState(self):
1538        data = self.stream.char()
1539        if data in spaceCharacters:
1540            pass
1541        elif data == ">":
1542            self.tokenQueue.append(self.currentToken)
1543            self.state = self.dataState
1544        elif data == '"':
1545            self.currentToken["systemId"] = ""
1546            self.state = self.doctypeSystemIdentifierDoubleQuotedState
1547        elif data == "'":
1548            self.currentToken["systemId"] = ""
1549            self.state = self.doctypeSystemIdentifierSingleQuotedState
1550        elif data == EOF:
1551            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1552                                    "eof-in-doctype"})
1553            self.currentToken["correct"] = False
1554            self.tokenQueue.append(self.currentToken)
1555            self.state = self.dataState
1556        else:
1557            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1558                                    "unexpected-char-in-doctype"})
1559            self.currentToken["correct"] = False
1560            self.state = self.bogusDoctypeState
1561        return True
1562
1563    def afterDoctypeSystemKeywordState(self):
1564        data = self.stream.char()
1565        if data in spaceCharacters:
1566            self.state = self.beforeDoctypeSystemIdentifierState
1567        elif data in ("'", '"'):
1568            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1569                                    "unexpected-char-in-doctype"})
1570            self.stream.unget(data)
1571            self.state = self.beforeDoctypeSystemIdentifierState
1572        elif data is EOF:
1573            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1574                                    "eof-in-doctype"})
1575            self.currentToken["correct"] = False
1576            self.tokenQueue.append(self.currentToken)
1577            self.state = self.dataState
1578        else:
1579            self.stream.unget(data)
1580            self.state = self.beforeDoctypeSystemIdentifierState
1581        return True
1582
1583    def beforeDoctypeSystemIdentifierState(self):
1584        data = self.stream.char()
1585        if data in spaceCharacters:
1586            pass
1587        elif data == "\"":
1588            self.currentToken["systemId"] = ""
1589            self.state = self.doctypeSystemIdentifierDoubleQuotedState
1590        elif data == "'":
1591            self.currentToken["systemId"] = ""
1592            self.state = self.doctypeSystemIdentifierSingleQuotedState
1593        elif data == ">":
1594            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1595                                    "unexpected-char-in-doctype"})
1596            self.currentToken["correct"] = False
1597            self.tokenQueue.append(self.currentToken)
1598            self.state = self.dataState
1599        elif data is EOF:
1600            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1601                                    "eof-in-doctype"})
1602            self.currentToken["correct"] = False
1603            self.tokenQueue.append(self.currentToken)
1604            self.state = self.dataState
1605        else:
1606            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1607                                    "unexpected-char-in-doctype"})
1608            self.currentToken["correct"] = False
1609            self.state = self.bogusDoctypeState
1610        return True
1611
1612    def doctypeSystemIdentifierDoubleQuotedState(self):
1613        data = self.stream.char()
1614        if data == "\"":
1615            self.state = self.afterDoctypeSystemIdentifierState
1616        elif data == "\u0000":
1617            self.tokenQueue.append({"type": tokenTypes["ParseError"],
1618                                    "data": "invalid-codepoint"})
1619            self.currentToken["systemId"] += "\uFFFD"
1620        elif data == ">":
1621            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1622                                    "unexpected-end-of-doctype"})
1623            self.currentToken["correct"] = False
1624            self.tokenQueue.append(self.currentToken)
1625            self.state = self.dataState
1626        elif data is EOF:
1627            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1628                                    "eof-in-doctype"})
1629            self.currentToken["correct"] = False
1630            self.tokenQueue.append(self.currentToken)
1631            self.state = self.dataState
1632        else:
1633            self.currentToken["systemId"] += data
1634        return True
1635
1636    def doctypeSystemIdentifierSingleQuotedState(self):
1637        data = self.stream.char()
1638        if data == "'":
1639            self.state = self.afterDoctypeSystemIdentifierState
1640        elif data == "\u0000":
1641            self.tokenQueue.append({"type": tokenTypes["ParseError"],
1642                                    "data": "invalid-codepoint"})
1643            self.currentToken["systemId"] += "\uFFFD"
1644        elif data == ">":
1645            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1646                                    "unexpected-end-of-doctype"})
1647            self.currentToken["correct"] = False
1648            self.tokenQueue.append(self.currentToken)
1649            self.state = self.dataState
1650        elif data is EOF:
1651            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1652                                    "eof-in-doctype"})
1653            self.currentToken["correct"] = False
1654            self.tokenQueue.append(self.currentToken)
1655            self.state = self.dataState
1656        else:
1657            self.currentToken["systemId"] += data
1658        return True
1659
1660    def afterDoctypeSystemIdentifierState(self):
1661        data = self.stream.char()
1662        if data in spaceCharacters:
1663            pass
1664        elif data == ">":
1665            self.tokenQueue.append(self.currentToken)
1666            self.state = self.dataState
1667        elif data is EOF:
1668            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1669                                    "eof-in-doctype"})
1670            self.currentToken["correct"] = False
1671            self.tokenQueue.append(self.currentToken)
1672            self.state = self.dataState
1673        else:
1674            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1675                                    "unexpected-char-in-doctype"})
1676            self.state = self.bogusDoctypeState
1677        return True
1678
1679    def bogusDoctypeState(self):
1680        data = self.stream.char()
1681        if data == ">":
1682            self.tokenQueue.append(self.currentToken)
1683            self.state = self.dataState
1684        elif data is EOF:
1685            # XXX EMIT
1686            self.stream.unget(data)
1687            self.tokenQueue.append(self.currentToken)
1688            self.state = self.dataState
1689        else:
1690            pass
1691        return True
1692
1693    def cdataSectionState(self):
1694        data = []
1695        while True:
1696            data.append(self.stream.charsUntil("]"))
1697            data.append(self.stream.charsUntil(">"))
1698            char = self.stream.char()
1699            if char == EOF:
1700                break
1701            else:
1702                assert char == ">"
1703                if data[-1][-2:] == "]]":
1704                    data[-1] = data[-1][:-2]
1705                    break
1706                else:
1707                    data.append(char)
1708
1709        data = "".join(data)  # pylint:disable=redefined-variable-type
1710        # Deal with null here rather than in the parser
1711        nullCount = data.count("\u0000")
1712        if nullCount > 0:
1713            for _ in range(nullCount):
1714                self.tokenQueue.append({"type": tokenTypes["ParseError"],
1715                                        "data": "invalid-codepoint"})
1716            data = data.replace("\u0000", "\uFFFD")
1717        if data:
1718            self.tokenQueue.append({"type": tokenTypes["Characters"],
1719                                    "data": data})
1720        self.state = self.dataState
1721        return True
1722