1from __future__ import absolute_import, division, unicode_literals
2
3from pip._vendor.six import unichr as chr
4
5from collections import deque, OrderedDict
6from sys import version_info
7
8from .constants import spaceCharacters
9from .constants import entities
10from .constants import asciiLetters, asciiUpper2Lower
11from .constants import digits, hexDigits, EOF
12from .constants import tokenTypes, tagTokenTypes
13from .constants import replacementCharacters
14
15from ._inputstream import HTMLInputStream
16
17from ._trie import Trie
18
19entitiesTrie = Trie(entities)
20
21if version_info >= (3, 7):
22    attributeMap = dict
23else:
24    attributeMap = OrderedDict
25
26
27class HTMLTokenizer(object):
28    """ This class takes care of tokenizing HTML.
29
30    * self.currentToken
31      Holds the token that is currently being processed.
32
33    * self.state
34      Holds a reference to the method to be invoked... XXX
35
36    * self.stream
37      Points to HTMLInputStream object.
38    """
39
40    def __init__(self, stream, parser=None, **kwargs):
41
42        self.stream = HTMLInputStream(stream, **kwargs)
43        self.parser = parser
44
45        # Setup the initial tokenizer state
46        self.escapeFlag = False
47        self.lastFourChars = []
48        self.state = self.dataState
49        self.escape = False
50
51        # The current token being created
52        self.currentToken = None
53        super(HTMLTokenizer, self).__init__()
54
55    def __iter__(self):
56        """ This is where the magic happens.
57
58        We do our usually processing through the states and when we have a token
59        to return we yield the token which pauses processing until the next token
60        is requested.
61        """
62        self.tokenQueue = deque([])
63        # Start processing. When EOF is reached self.state will return False
64        # instead of True and the loop will terminate.
65        while self.state():
66            while self.stream.errors:
67                yield {"type": tokenTypes["ParseError"], "data": self.stream.errors.pop(0)}
68            while self.tokenQueue:
69                yield self.tokenQueue.popleft()
70
71    def consumeNumberEntity(self, isHex):
72        """This function returns either U+FFFD or the character based on the
73        decimal or hexadecimal representation. It also discards ";" if present.
74        If not present self.tokenQueue.append({"type": tokenTypes["ParseError"]}) is invoked.
75        """
76
77        allowed = digits
78        radix = 10
79        if isHex:
80            allowed = hexDigits
81            radix = 16
82
83        charStack = []
84
85        # Consume all the characters that are in range while making sure we
86        # don't hit an EOF.
87        c = self.stream.char()
88        while c in allowed and c is not EOF:
89            charStack.append(c)
90            c = self.stream.char()
91
92        # Convert the set of characters consumed to an int.
93        charAsInt = int("".join(charStack), radix)
94
95        # Certain characters get replaced with others
96        if charAsInt in replacementCharacters:
97            char = replacementCharacters[charAsInt]
98            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
99                                    "illegal-codepoint-for-numeric-entity",
100                                    "datavars": {"charAsInt": charAsInt}})
101        elif ((0xD800 <= charAsInt <= 0xDFFF) or
102              (charAsInt > 0x10FFFF)):
103            char = "\uFFFD"
104            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
105                                    "illegal-codepoint-for-numeric-entity",
106                                    "datavars": {"charAsInt": charAsInt}})
107        else:
108            # Should speed up this check somehow (e.g. move the set to a constant)
109            if ((0x0001 <= charAsInt <= 0x0008) or
110                (0x000E <= charAsInt <= 0x001F) or
111                (0x007F <= charAsInt <= 0x009F) or
112                (0xFDD0 <= charAsInt <= 0xFDEF) or
113                charAsInt in frozenset([0x000B, 0xFFFE, 0xFFFF, 0x1FFFE,
114                                        0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
115                                        0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE,
116                                        0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
117                                        0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE,
118                                        0x9FFFF, 0xAFFFE, 0xAFFFF, 0xBFFFE,
119                                        0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE,
120                                        0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE,
121                                        0xFFFFF, 0x10FFFE, 0x10FFFF])):
122                self.tokenQueue.append({"type": tokenTypes["ParseError"],
123                                        "data":
124                                        "illegal-codepoint-for-numeric-entity",
125                                        "datavars": {"charAsInt": charAsInt}})
126            try:
127                # Try/except needed as UCS-2 Python builds' unichar only works
128                # within the BMP.
129                char = chr(charAsInt)
130            except ValueError:
131                v = charAsInt - 0x10000
132                char = chr(0xD800 | (v >> 10)) + chr(0xDC00 | (v & 0x3FF))
133
134        # Discard the ; if present. Otherwise, put it back on the queue and
135        # invoke parseError on parser.
136        if c != ";":
137            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
138                                    "numeric-entity-without-semicolon"})
139            self.stream.unget(c)
140
141        return char
142
143    def consumeEntity(self, allowedChar=None, fromAttribute=False):
144        # Initialise to the default output for when no entity is matched
145        output = "&"
146
147        charStack = [self.stream.char()]
148        if (charStack[0] in spaceCharacters or charStack[0] in (EOF, "<", "&") or
149                (allowedChar is not None and allowedChar == charStack[0])):
150            self.stream.unget(charStack[0])
151
152        elif charStack[0] == "#":
153            # Read the next character to see if it's hex or decimal
154            hex = False
155            charStack.append(self.stream.char())
156            if charStack[-1] in ("x", "X"):
157                hex = True
158                charStack.append(self.stream.char())
159
160            # charStack[-1] should be the first digit
161            if (hex and charStack[-1] in hexDigits) \
162                    or (not hex and charStack[-1] in digits):
163                # At least one digit found, so consume the whole number
164                self.stream.unget(charStack[-1])
165                output = self.consumeNumberEntity(hex)
166            else:
167                # No digits found
168                self.tokenQueue.append({"type": tokenTypes["ParseError"],
169                                        "data": "expected-numeric-entity"})
170                self.stream.unget(charStack.pop())
171                output = "&" + "".join(charStack)
172
173        else:
174            # At this point in the process might have named entity. Entities
175            # are stored in the global variable "entities".
176            #
177            # Consume characters and compare to these to a substring of the
178            # entity names in the list until the substring no longer matches.
179            while (charStack[-1] is not EOF):
180                if not entitiesTrie.has_keys_with_prefix("".join(charStack)):
181                    break
182                charStack.append(self.stream.char())
183
184            # At this point we have a string that starts with some characters
185            # that may match an entity
186            # Try to find the longest entity the string will match to take care
187            # of &noti for instance.
188            try:
189                entityName = entitiesTrie.longest_prefix("".join(charStack[:-1]))
190                entityLength = len(entityName)
191            except KeyError:
192                entityName = None
193
194            if entityName is not None:
195                if entityName[-1] != ";":
196                    self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
197                                            "named-entity-without-semicolon"})
198                if (entityName[-1] != ";" and fromAttribute and
199                    (charStack[entityLength] in asciiLetters or
200                     charStack[entityLength] in digits or
201                     charStack[entityLength] == "=")):
202                    self.stream.unget(charStack.pop())
203                    output = "&" + "".join(charStack)
204                else:
205                    output = entities[entityName]
206                    self.stream.unget(charStack.pop())
207                    output += "".join(charStack[entityLength:])
208            else:
209                self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
210                                        "expected-named-entity"})
211                self.stream.unget(charStack.pop())
212                output = "&" + "".join(charStack)
213
214        if fromAttribute:
215            self.currentToken["data"][-1][1] += output
216        else:
217            if output in spaceCharacters:
218                tokenType = "SpaceCharacters"
219            else:
220                tokenType = "Characters"
221            self.tokenQueue.append({"type": tokenTypes[tokenType], "data": output})
222
223    def processEntityInAttribute(self, allowedChar):
224        """This method replaces the need for "entityInAttributeValueState".
225        """
226        self.consumeEntity(allowedChar=allowedChar, fromAttribute=True)
227
228    def emitCurrentToken(self):
229        """This method is a generic handler for emitting the tags. It also sets
230        the state to "data" because that's what's needed after a token has been
231        emitted.
232        """
233        token = self.currentToken
234        # Add token to the queue to be yielded
235        if (token["type"] in tagTokenTypes):
236            token["name"] = token["name"].translate(asciiUpper2Lower)
237            if token["type"] == tokenTypes["StartTag"]:
238                raw = token["data"]
239                data = attributeMap(raw)
240                if len(raw) > len(data):
241                    # we had some duplicated attribute, fix so first wins
242                    data.update(raw[::-1])
243                token["data"] = data
244
245            if token["type"] == tokenTypes["EndTag"]:
246                if token["data"]:
247                    self.tokenQueue.append({"type": tokenTypes["ParseError"],
248                                            "data": "attributes-in-end-tag"})
249                if token["selfClosing"]:
250                    self.tokenQueue.append({"type": tokenTypes["ParseError"],
251                                            "data": "self-closing-flag-on-end-tag"})
252        self.tokenQueue.append(token)
253        self.state = self.dataState
254
255    # Below are the various tokenizer states worked out.
256    def dataState(self):
257        data = self.stream.char()
258        if data == "&":
259            self.state = self.entityDataState
260        elif data == "<":
261            self.state = self.tagOpenState
262        elif data == "\u0000":
263            self.tokenQueue.append({"type": tokenTypes["ParseError"],
264                                    "data": "invalid-codepoint"})
265            self.tokenQueue.append({"type": tokenTypes["Characters"],
266                                    "data": "\u0000"})
267        elif data is EOF:
268            # Tokenization ends.
269            return False
270        elif data in spaceCharacters:
271            # Directly after emitting a token you switch back to the "data
272            # state". At that point spaceCharacters are important so they are
273            # emitted separately.
274            self.tokenQueue.append({"type": tokenTypes["SpaceCharacters"], "data":
275                                    data + self.stream.charsUntil(spaceCharacters, True)})
276            # No need to update lastFourChars here, since the first space will
277            # have already been appended to lastFourChars and will have broken
278            # any <!-- or --> sequences
279        else:
280            chars = self.stream.charsUntil(("&", "<", "\u0000"))
281            self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
282                                    data + chars})
283        return True
284
285    def entityDataState(self):
286        self.consumeEntity()
287        self.state = self.dataState
288        return True
289
290    def rcdataState(self):
291        data = self.stream.char()
292        if data == "&":
293            self.state = self.characterReferenceInRcdata
294        elif data == "<":
295            self.state = self.rcdataLessThanSignState
296        elif data == EOF:
297            # Tokenization ends.
298            return False
299        elif data == "\u0000":
300            self.tokenQueue.append({"type": tokenTypes["ParseError"],
301                                    "data": "invalid-codepoint"})
302            self.tokenQueue.append({"type": tokenTypes["Characters"],
303                                    "data": "\uFFFD"})
304        elif data in spaceCharacters:
305            # Directly after emitting a token you switch back to the "data
306            # state". At that point spaceCharacters are important so they are
307            # emitted separately.
308            self.tokenQueue.append({"type": tokenTypes["SpaceCharacters"], "data":
309                                    data + self.stream.charsUntil(spaceCharacters, True)})
310            # No need to update lastFourChars here, since the first space will
311            # have already been appended to lastFourChars and will have broken
312            # any <!-- or --> sequences
313        else:
314            chars = self.stream.charsUntil(("&", "<", "\u0000"))
315            self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
316                                    data + chars})
317        return True
318
319    def characterReferenceInRcdata(self):
320        self.consumeEntity()
321        self.state = self.rcdataState
322        return True
323
324    def rawtextState(self):
325        data = self.stream.char()
326        if data == "<":
327            self.state = self.rawtextLessThanSignState
328        elif data == "\u0000":
329            self.tokenQueue.append({"type": tokenTypes["ParseError"],
330                                    "data": "invalid-codepoint"})
331            self.tokenQueue.append({"type": tokenTypes["Characters"],
332                                    "data": "\uFFFD"})
333        elif data == EOF:
334            # Tokenization ends.
335            return False
336        else:
337            chars = self.stream.charsUntil(("<", "\u0000"))
338            self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
339                                    data + chars})
340        return True
341
342    def scriptDataState(self):
343        data = self.stream.char()
344        if data == "<":
345            self.state = self.scriptDataLessThanSignState
346        elif data == "\u0000":
347            self.tokenQueue.append({"type": tokenTypes["ParseError"],
348                                    "data": "invalid-codepoint"})
349            self.tokenQueue.append({"type": tokenTypes["Characters"],
350                                    "data": "\uFFFD"})
351        elif data == EOF:
352            # Tokenization ends.
353            return False
354        else:
355            chars = self.stream.charsUntil(("<", "\u0000"))
356            self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
357                                    data + chars})
358        return True
359
360    def plaintextState(self):
361        data = self.stream.char()
362        if data == EOF:
363            # Tokenization ends.
364            return False
365        elif data == "\u0000":
366            self.tokenQueue.append({"type": tokenTypes["ParseError"],
367                                    "data": "invalid-codepoint"})
368            self.tokenQueue.append({"type": tokenTypes["Characters"],
369                                    "data": "\uFFFD"})
370        else:
371            self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
372                                    data + self.stream.charsUntil("\u0000")})
373        return True
374
375    def tagOpenState(self):
376        data = self.stream.char()
377        if data == "!":
378            self.state = self.markupDeclarationOpenState
379        elif data == "/":
380            self.state = self.closeTagOpenState
381        elif data in asciiLetters:
382            self.currentToken = {"type": tokenTypes["StartTag"],
383                                 "name": data, "data": [],
384                                 "selfClosing": False,
385                                 "selfClosingAcknowledged": False}
386            self.state = self.tagNameState
387        elif data == ">":
388            # XXX In theory it could be something besides a tag name. But
389            # do we really care?
390            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
391                                    "expected-tag-name-but-got-right-bracket"})
392            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<>"})
393            self.state = self.dataState
394        elif data == "?":
395            # XXX In theory it could be something besides a tag name. But
396            # do we really care?
397            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
398                                    "expected-tag-name-but-got-question-mark"})
399            self.stream.unget(data)
400            self.state = self.bogusCommentState
401        else:
402            # XXX
403            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
404                                    "expected-tag-name"})
405            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
406            self.stream.unget(data)
407            self.state = self.dataState
408        return True
409
410    def closeTagOpenState(self):
411        data = self.stream.char()
412        if data in asciiLetters:
413            self.currentToken = {"type": tokenTypes["EndTag"], "name": data,
414                                 "data": [], "selfClosing": False}
415            self.state = self.tagNameState
416        elif data == ">":
417            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
418                                    "expected-closing-tag-but-got-right-bracket"})
419            self.state = self.dataState
420        elif data is EOF:
421            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
422                                    "expected-closing-tag-but-got-eof"})
423            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
424            self.state = self.dataState
425        else:
426            # XXX data can be _'_...
427            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
428                                    "expected-closing-tag-but-got-char",
429                                    "datavars": {"data": data}})
430            self.stream.unget(data)
431            self.state = self.bogusCommentState
432        return True
433
434    def tagNameState(self):
435        data = self.stream.char()
436        if data in spaceCharacters:
437            self.state = self.beforeAttributeNameState
438        elif data == ">":
439            self.emitCurrentToken()
440        elif data is EOF:
441            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
442                                    "eof-in-tag-name"})
443            self.state = self.dataState
444        elif data == "/":
445            self.state = self.selfClosingStartTagState
446        elif data == "\u0000":
447            self.tokenQueue.append({"type": tokenTypes["ParseError"],
448                                    "data": "invalid-codepoint"})
449            self.currentToken["name"] += "\uFFFD"
450        else:
451            self.currentToken["name"] += data
452            # (Don't use charsUntil here, because tag names are
453            # very short and it's faster to not do anything fancy)
454        return True
455
456    def rcdataLessThanSignState(self):
457        data = self.stream.char()
458        if data == "/":
459            self.temporaryBuffer = ""
460            self.state = self.rcdataEndTagOpenState
461        else:
462            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
463            self.stream.unget(data)
464            self.state = self.rcdataState
465        return True
466
467    def rcdataEndTagOpenState(self):
468        data = self.stream.char()
469        if data in asciiLetters:
470            self.temporaryBuffer += data
471            self.state = self.rcdataEndTagNameState
472        else:
473            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
474            self.stream.unget(data)
475            self.state = self.rcdataState
476        return True
477
478    def rcdataEndTagNameState(self):
479        appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
480        data = self.stream.char()
481        if data in spaceCharacters and appropriate:
482            self.currentToken = {"type": tokenTypes["EndTag"],
483                                 "name": self.temporaryBuffer,
484                                 "data": [], "selfClosing": False}
485            self.state = self.beforeAttributeNameState
486        elif data == "/" and appropriate:
487            self.currentToken = {"type": tokenTypes["EndTag"],
488                                 "name": self.temporaryBuffer,
489                                 "data": [], "selfClosing": False}
490            self.state = self.selfClosingStartTagState
491        elif data == ">" and appropriate:
492            self.currentToken = {"type": tokenTypes["EndTag"],
493                                 "name": self.temporaryBuffer,
494                                 "data": [], "selfClosing": False}
495            self.emitCurrentToken()
496            self.state = self.dataState
497        elif data in asciiLetters:
498            self.temporaryBuffer += data
499        else:
500            self.tokenQueue.append({"type": tokenTypes["Characters"],
501                                    "data": "</" + self.temporaryBuffer})
502            self.stream.unget(data)
503            self.state = self.rcdataState
504        return True
505
506    def rawtextLessThanSignState(self):
507        data = self.stream.char()
508        if data == "/":
509            self.temporaryBuffer = ""
510            self.state = self.rawtextEndTagOpenState
511        else:
512            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
513            self.stream.unget(data)
514            self.state = self.rawtextState
515        return True
516
517    def rawtextEndTagOpenState(self):
518        data = self.stream.char()
519        if data in asciiLetters:
520            self.temporaryBuffer += data
521            self.state = self.rawtextEndTagNameState
522        else:
523            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
524            self.stream.unget(data)
525            self.state = self.rawtextState
526        return True
527
528    def rawtextEndTagNameState(self):
529        appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
530        data = self.stream.char()
531        if data in spaceCharacters and appropriate:
532            self.currentToken = {"type": tokenTypes["EndTag"],
533                                 "name": self.temporaryBuffer,
534                                 "data": [], "selfClosing": False}
535            self.state = self.beforeAttributeNameState
536        elif data == "/" and appropriate:
537            self.currentToken = {"type": tokenTypes["EndTag"],
538                                 "name": self.temporaryBuffer,
539                                 "data": [], "selfClosing": False}
540            self.state = self.selfClosingStartTagState
541        elif data == ">" and appropriate:
542            self.currentToken = {"type": tokenTypes["EndTag"],
543                                 "name": self.temporaryBuffer,
544                                 "data": [], "selfClosing": False}
545            self.emitCurrentToken()
546            self.state = self.dataState
547        elif data in asciiLetters:
548            self.temporaryBuffer += data
549        else:
550            self.tokenQueue.append({"type": tokenTypes["Characters"],
551                                    "data": "</" + self.temporaryBuffer})
552            self.stream.unget(data)
553            self.state = self.rawtextState
554        return True
555
556    def scriptDataLessThanSignState(self):
557        data = self.stream.char()
558        if data == "/":
559            self.temporaryBuffer = ""
560            self.state = self.scriptDataEndTagOpenState
561        elif data == "!":
562            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<!"})
563            self.state = self.scriptDataEscapeStartState
564        else:
565            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
566            self.stream.unget(data)
567            self.state = self.scriptDataState
568        return True
569
570    def scriptDataEndTagOpenState(self):
571        data = self.stream.char()
572        if data in asciiLetters:
573            self.temporaryBuffer += data
574            self.state = self.scriptDataEndTagNameState
575        else:
576            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
577            self.stream.unget(data)
578            self.state = self.scriptDataState
579        return True
580
581    def scriptDataEndTagNameState(self):
582        appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
583        data = self.stream.char()
584        if data in spaceCharacters and appropriate:
585            self.currentToken = {"type": tokenTypes["EndTag"],
586                                 "name": self.temporaryBuffer,
587                                 "data": [], "selfClosing": False}
588            self.state = self.beforeAttributeNameState
589        elif data == "/" and appropriate:
590            self.currentToken = {"type": tokenTypes["EndTag"],
591                                 "name": self.temporaryBuffer,
592                                 "data": [], "selfClosing": False}
593            self.state = self.selfClosingStartTagState
594        elif data == ">" and appropriate:
595            self.currentToken = {"type": tokenTypes["EndTag"],
596                                 "name": self.temporaryBuffer,
597                                 "data": [], "selfClosing": False}
598            self.emitCurrentToken()
599            self.state = self.dataState
600        elif data in asciiLetters:
601            self.temporaryBuffer += data
602        else:
603            self.tokenQueue.append({"type": tokenTypes["Characters"],
604                                    "data": "</" + self.temporaryBuffer})
605            self.stream.unget(data)
606            self.state = self.scriptDataState
607        return True
608
609    def scriptDataEscapeStartState(self):
610        data = self.stream.char()
611        if data == "-":
612            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
613            self.state = self.scriptDataEscapeStartDashState
614        else:
615            self.stream.unget(data)
616            self.state = self.scriptDataState
617        return True
618
619    def scriptDataEscapeStartDashState(self):
620        data = self.stream.char()
621        if data == "-":
622            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
623            self.state = self.scriptDataEscapedDashDashState
624        else:
625            self.stream.unget(data)
626            self.state = self.scriptDataState
627        return True
628
629    def scriptDataEscapedState(self):
630        data = self.stream.char()
631        if data == "-":
632            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
633            self.state = self.scriptDataEscapedDashState
634        elif data == "<":
635            self.state = self.scriptDataEscapedLessThanSignState
636        elif data == "\u0000":
637            self.tokenQueue.append({"type": tokenTypes["ParseError"],
638                                    "data": "invalid-codepoint"})
639            self.tokenQueue.append({"type": tokenTypes["Characters"],
640                                    "data": "\uFFFD"})
641        elif data == EOF:
642            self.state = self.dataState
643        else:
644            chars = self.stream.charsUntil(("<", "-", "\u0000"))
645            self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
646                                    data + chars})
647        return True
648
649    def scriptDataEscapedDashState(self):
650        data = self.stream.char()
651        if data == "-":
652            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
653            self.state = self.scriptDataEscapedDashDashState
654        elif data == "<":
655            self.state = self.scriptDataEscapedLessThanSignState
656        elif data == "\u0000":
657            self.tokenQueue.append({"type": tokenTypes["ParseError"],
658                                    "data": "invalid-codepoint"})
659            self.tokenQueue.append({"type": tokenTypes["Characters"],
660                                    "data": "\uFFFD"})
661            self.state = self.scriptDataEscapedState
662        elif data == EOF:
663            self.state = self.dataState
664        else:
665            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
666            self.state = self.scriptDataEscapedState
667        return True
668
669    def scriptDataEscapedDashDashState(self):
670        data = self.stream.char()
671        if data == "-":
672            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
673        elif data == "<":
674            self.state = self.scriptDataEscapedLessThanSignState
675        elif data == ">":
676            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": ">"})
677            self.state = self.scriptDataState
678        elif data == "\u0000":
679            self.tokenQueue.append({"type": tokenTypes["ParseError"],
680                                    "data": "invalid-codepoint"})
681            self.tokenQueue.append({"type": tokenTypes["Characters"],
682                                    "data": "\uFFFD"})
683            self.state = self.scriptDataEscapedState
684        elif data == EOF:
685            self.state = self.dataState
686        else:
687            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
688            self.state = self.scriptDataEscapedState
689        return True
690
691    def scriptDataEscapedLessThanSignState(self):
692        data = self.stream.char()
693        if data == "/":
694            self.temporaryBuffer = ""
695            self.state = self.scriptDataEscapedEndTagOpenState
696        elif data in asciiLetters:
697            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<" + data})
698            self.temporaryBuffer = data
699            self.state = self.scriptDataDoubleEscapeStartState
700        else:
701            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
702            self.stream.unget(data)
703            self.state = self.scriptDataEscapedState
704        return True
705
706    def scriptDataEscapedEndTagOpenState(self):
707        data = self.stream.char()
708        if data in asciiLetters:
709            self.temporaryBuffer = data
710            self.state = self.scriptDataEscapedEndTagNameState
711        else:
712            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
713            self.stream.unget(data)
714            self.state = self.scriptDataEscapedState
715        return True
716
717    def scriptDataEscapedEndTagNameState(self):
718        appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
719        data = self.stream.char()
720        if data in spaceCharacters and appropriate:
721            self.currentToken = {"type": tokenTypes["EndTag"],
722                                 "name": self.temporaryBuffer,
723                                 "data": [], "selfClosing": False}
724            self.state = self.beforeAttributeNameState
725        elif data == "/" and appropriate:
726            self.currentToken = {"type": tokenTypes["EndTag"],
727                                 "name": self.temporaryBuffer,
728                                 "data": [], "selfClosing": False}
729            self.state = self.selfClosingStartTagState
730        elif data == ">" and appropriate:
731            self.currentToken = {"type": tokenTypes["EndTag"],
732                                 "name": self.temporaryBuffer,
733                                 "data": [], "selfClosing": False}
734            self.emitCurrentToken()
735            self.state = self.dataState
736        elif data in asciiLetters:
737            self.temporaryBuffer += data
738        else:
739            self.tokenQueue.append({"type": tokenTypes["Characters"],
740                                    "data": "</" + self.temporaryBuffer})
741            self.stream.unget(data)
742            self.state = self.scriptDataEscapedState
743        return True
744
745    def scriptDataDoubleEscapeStartState(self):
746        data = self.stream.char()
747        if data in (spaceCharacters | frozenset(("/", ">"))):
748            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
749            if self.temporaryBuffer.lower() == "script":
750                self.state = self.scriptDataDoubleEscapedState
751            else:
752                self.state = self.scriptDataEscapedState
753        elif data in asciiLetters:
754            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
755            self.temporaryBuffer += data
756        else:
757            self.stream.unget(data)
758            self.state = self.scriptDataEscapedState
759        return True
760
761    def scriptDataDoubleEscapedState(self):
762        data = self.stream.char()
763        if data == "-":
764            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
765            self.state = self.scriptDataDoubleEscapedDashState
766        elif data == "<":
767            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
768            self.state = self.scriptDataDoubleEscapedLessThanSignState
769        elif data == "\u0000":
770            self.tokenQueue.append({"type": tokenTypes["ParseError"],
771                                    "data": "invalid-codepoint"})
772            self.tokenQueue.append({"type": tokenTypes["Characters"],
773                                    "data": "\uFFFD"})
774        elif data == EOF:
775            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
776                                    "eof-in-script-in-script"})
777            self.state = self.dataState
778        else:
779            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
780        return True
781
782    def scriptDataDoubleEscapedDashState(self):
783        data = self.stream.char()
784        if data == "-":
785            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
786            self.state = self.scriptDataDoubleEscapedDashDashState
787        elif data == "<":
788            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
789            self.state = self.scriptDataDoubleEscapedLessThanSignState
790        elif data == "\u0000":
791            self.tokenQueue.append({"type": tokenTypes["ParseError"],
792                                    "data": "invalid-codepoint"})
793            self.tokenQueue.append({"type": tokenTypes["Characters"],
794                                    "data": "\uFFFD"})
795            self.state = self.scriptDataDoubleEscapedState
796        elif data == EOF:
797            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
798                                    "eof-in-script-in-script"})
799            self.state = self.dataState
800        else:
801            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
802            self.state = self.scriptDataDoubleEscapedState
803        return True
804
805    def scriptDataDoubleEscapedDashDashState(self):
806        data = self.stream.char()
807        if data == "-":
808            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
809        elif data == "<":
810            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
811            self.state = self.scriptDataDoubleEscapedLessThanSignState
812        elif data == ">":
813            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": ">"})
814            self.state = self.scriptDataState
815        elif data == "\u0000":
816            self.tokenQueue.append({"type": tokenTypes["ParseError"],
817                                    "data": "invalid-codepoint"})
818            self.tokenQueue.append({"type": tokenTypes["Characters"],
819                                    "data": "\uFFFD"})
820            self.state = self.scriptDataDoubleEscapedState
821        elif data == EOF:
822            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
823                                    "eof-in-script-in-script"})
824            self.state = self.dataState
825        else:
826            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
827            self.state = self.scriptDataDoubleEscapedState
828        return True
829
830    def scriptDataDoubleEscapedLessThanSignState(self):
831        data = self.stream.char()
832        if data == "/":
833            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "/"})
834            self.temporaryBuffer = ""
835            self.state = self.scriptDataDoubleEscapeEndState
836        else:
837            self.stream.unget(data)
838            self.state = self.scriptDataDoubleEscapedState
839        return True
840
841    def scriptDataDoubleEscapeEndState(self):
842        data = self.stream.char()
843        if data in (spaceCharacters | frozenset(("/", ">"))):
844            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
845            if self.temporaryBuffer.lower() == "script":
846                self.state = self.scriptDataEscapedState
847            else:
848                self.state = self.scriptDataDoubleEscapedState
849        elif data in asciiLetters:
850            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
851            self.temporaryBuffer += data
852        else:
853            self.stream.unget(data)
854            self.state = self.scriptDataDoubleEscapedState
855        return True
856
857    def beforeAttributeNameState(self):
858        data = self.stream.char()
859        if data in spaceCharacters:
860            self.stream.charsUntil(spaceCharacters, True)
861        elif data in asciiLetters:
862            self.currentToken["data"].append([data, ""])
863            self.state = self.attributeNameState
864        elif data == ">":
865            self.emitCurrentToken()
866        elif data == "/":
867            self.state = self.selfClosingStartTagState
868        elif data in ("'", '"', "=", "<"):
869            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
870                                    "invalid-character-in-attribute-name"})
871            self.currentToken["data"].append([data, ""])
872            self.state = self.attributeNameState
873        elif data == "\u0000":
874            self.tokenQueue.append({"type": tokenTypes["ParseError"],
875                                    "data": "invalid-codepoint"})
876            self.currentToken["data"].append(["\uFFFD", ""])
877            self.state = self.attributeNameState
878        elif data is EOF:
879            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
880                                    "expected-attribute-name-but-got-eof"})
881            self.state = self.dataState
882        else:
883            self.currentToken["data"].append([data, ""])
884            self.state = self.attributeNameState
885        return True
886
887    def attributeNameState(self):
888        data = self.stream.char()
889        leavingThisState = True
890        emitToken = False
891        if data == "=":
892            self.state = self.beforeAttributeValueState
893        elif data in asciiLetters:
894            self.currentToken["data"][-1][0] += data +\
895                self.stream.charsUntil(asciiLetters, True)
896            leavingThisState = False
897        elif data == ">":
898            # XXX If we emit here the attributes are converted to a dict
899            # without being checked and when the code below runs we error
900            # because data is a dict not a list
901            emitToken = True
902        elif data in spaceCharacters:
903            self.state = self.afterAttributeNameState
904        elif data == "/":
905            self.state = self.selfClosingStartTagState
906        elif data == "\u0000":
907            self.tokenQueue.append({"type": tokenTypes["ParseError"],
908                                    "data": "invalid-codepoint"})
909            self.currentToken["data"][-1][0] += "\uFFFD"
910            leavingThisState = False
911        elif data in ("'", '"', "<"):
912            self.tokenQueue.append({"type": tokenTypes["ParseError"],
913                                    "data":
914                                    "invalid-character-in-attribute-name"})
915            self.currentToken["data"][-1][0] += data
916            leavingThisState = False
917        elif data is EOF:
918            self.tokenQueue.append({"type": tokenTypes["ParseError"],
919                                    "data": "eof-in-attribute-name"})
920            self.state = self.dataState
921        else:
922            self.currentToken["data"][-1][0] += data
923            leavingThisState = False
924
925        if leavingThisState:
926            # Attributes are not dropped at this stage. That happens when the
927            # start tag token is emitted so values can still be safely appended
928            # to attributes, but we do want to report the parse error in time.
929            self.currentToken["data"][-1][0] = (
930                self.currentToken["data"][-1][0].translate(asciiUpper2Lower))
931            for name, _ in self.currentToken["data"][:-1]:
932                if self.currentToken["data"][-1][0] == name:
933                    self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
934                                            "duplicate-attribute"})
935                    break
936            # XXX Fix for above XXX
937            if emitToken:
938                self.emitCurrentToken()
939        return True
940
941    def afterAttributeNameState(self):
942        data = self.stream.char()
943        if data in spaceCharacters:
944            self.stream.charsUntil(spaceCharacters, True)
945        elif data == "=":
946            self.state = self.beforeAttributeValueState
947        elif data == ">":
948            self.emitCurrentToken()
949        elif data in asciiLetters:
950            self.currentToken["data"].append([data, ""])
951            self.state = self.attributeNameState
952        elif data == "/":
953            self.state = self.selfClosingStartTagState
954        elif data == "\u0000":
955            self.tokenQueue.append({"type": tokenTypes["ParseError"],
956                                    "data": "invalid-codepoint"})
957            self.currentToken["data"].append(["\uFFFD", ""])
958            self.state = self.attributeNameState
959        elif data in ("'", '"', "<"):
960            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
961                                    "invalid-character-after-attribute-name"})
962            self.currentToken["data"].append([data, ""])
963            self.state = self.attributeNameState
964        elif data is EOF:
965            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
966                                    "expected-end-of-tag-but-got-eof"})
967            self.state = self.dataState
968        else:
969            self.currentToken["data"].append([data, ""])
970            self.state = self.attributeNameState
971        return True
972
973    def beforeAttributeValueState(self):
974        data = self.stream.char()
975        if data in spaceCharacters:
976            self.stream.charsUntil(spaceCharacters, True)
977        elif data == "\"":
978            self.state = self.attributeValueDoubleQuotedState
979        elif data == "&":
980            self.state = self.attributeValueUnQuotedState
981            self.stream.unget(data)
982        elif data == "'":
983            self.state = self.attributeValueSingleQuotedState
984        elif data == ">":
985            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
986                                    "expected-attribute-value-but-got-right-bracket"})
987            self.emitCurrentToken()
988        elif data == "\u0000":
989            self.tokenQueue.append({"type": tokenTypes["ParseError"],
990                                    "data": "invalid-codepoint"})
991            self.currentToken["data"][-1][1] += "\uFFFD"
992            self.state = self.attributeValueUnQuotedState
993        elif data in ("=", "<", "`"):
994            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
995                                    "equals-in-unquoted-attribute-value"})
996            self.currentToken["data"][-1][1] += data
997            self.state = self.attributeValueUnQuotedState
998        elif data is EOF:
999            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1000                                    "expected-attribute-value-but-got-eof"})
1001            self.state = self.dataState
1002        else:
1003            self.currentToken["data"][-1][1] += data
1004            self.state = self.attributeValueUnQuotedState
1005        return True
1006
1007    def attributeValueDoubleQuotedState(self):
1008        data = self.stream.char()
1009        if data == "\"":
1010            self.state = self.afterAttributeValueState
1011        elif data == "&":
1012            self.processEntityInAttribute('"')
1013        elif data == "\u0000":
1014            self.tokenQueue.append({"type": tokenTypes["ParseError"],
1015                                    "data": "invalid-codepoint"})
1016            self.currentToken["data"][-1][1] += "\uFFFD"
1017        elif data is EOF:
1018            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1019                                    "eof-in-attribute-value-double-quote"})
1020            self.state = self.dataState
1021        else:
1022            self.currentToken["data"][-1][1] += data +\
1023                self.stream.charsUntil(("\"", "&", "\u0000"))
1024        return True
1025
1026    def attributeValueSingleQuotedState(self):
1027        data = self.stream.char()
1028        if data == "'":
1029            self.state = self.afterAttributeValueState
1030        elif data == "&":
1031            self.processEntityInAttribute("'")
1032        elif data == "\u0000":
1033            self.tokenQueue.append({"type": tokenTypes["ParseError"],
1034                                    "data": "invalid-codepoint"})
1035            self.currentToken["data"][-1][1] += "\uFFFD"
1036        elif data is EOF:
1037            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1038                                    "eof-in-attribute-value-single-quote"})
1039            self.state = self.dataState
1040        else:
1041            self.currentToken["data"][-1][1] += data +\
1042                self.stream.charsUntil(("'", "&", "\u0000"))
1043        return True
1044
1045    def attributeValueUnQuotedState(self):
1046        data = self.stream.char()
1047        if data in spaceCharacters:
1048            self.state = self.beforeAttributeNameState
1049        elif data == "&":
1050            self.processEntityInAttribute(">")
1051        elif data == ">":
1052            self.emitCurrentToken()
1053        elif data in ('"', "'", "=", "<", "`"):
1054            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1055                                    "unexpected-character-in-unquoted-attribute-value"})
1056            self.currentToken["data"][-1][1] += data
1057        elif data == "\u0000":
1058            self.tokenQueue.append({"type": tokenTypes["ParseError"],
1059                                    "data": "invalid-codepoint"})
1060            self.currentToken["data"][-1][1] += "\uFFFD"
1061        elif data is EOF:
1062            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1063                                    "eof-in-attribute-value-no-quotes"})
1064            self.state = self.dataState
1065        else:
1066            self.currentToken["data"][-1][1] += data + self.stream.charsUntil(
1067                frozenset(("&", ">", '"', "'", "=", "<", "`", "\u0000")) | spaceCharacters)
1068        return True
1069
1070    def afterAttributeValueState(self):
1071        data = self.stream.char()
1072        if data in spaceCharacters:
1073            self.state = self.beforeAttributeNameState
1074        elif data == ">":
1075            self.emitCurrentToken()
1076        elif data == "/":
1077            self.state = self.selfClosingStartTagState
1078        elif data is EOF:
1079            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1080                                    "unexpected-EOF-after-attribute-value"})
1081            self.stream.unget(data)
1082            self.state = self.dataState
1083        else:
1084            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1085                                    "unexpected-character-after-attribute-value"})
1086            self.stream.unget(data)
1087            self.state = self.beforeAttributeNameState
1088        return True
1089
1090    def selfClosingStartTagState(self):
1091        data = self.stream.char()
1092        if data == ">":
1093            self.currentToken["selfClosing"] = True
1094            self.emitCurrentToken()
1095        elif data is EOF:
1096            self.tokenQueue.append({"type": tokenTypes["ParseError"],
1097                                    "data":
1098                                    "unexpected-EOF-after-solidus-in-tag"})
1099            self.stream.unget(data)
1100            self.state = self.dataState
1101        else:
1102            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1103                                    "unexpected-character-after-solidus-in-tag"})
1104            self.stream.unget(data)
1105            self.state = self.beforeAttributeNameState
1106        return True
1107
1108    def bogusCommentState(self):
1109        # Make a new comment token and give it as value all the characters
1110        # until the first > or EOF (charsUntil checks for EOF automatically)
1111        # and emit it.
1112        data = self.stream.charsUntil(">")
1113        data = data.replace("\u0000", "\uFFFD")
1114        self.tokenQueue.append(
1115            {"type": tokenTypes["Comment"], "data": data})
1116
1117        # Eat the character directly after the bogus comment which is either a
1118        # ">" or an EOF.
1119        self.stream.char()
1120        self.state = self.dataState
1121        return True
1122
1123    def markupDeclarationOpenState(self):
1124        charStack = [self.stream.char()]
1125        if charStack[-1] == "-":
1126            charStack.append(self.stream.char())
1127            if charStack[-1] == "-":
1128                self.currentToken = {"type": tokenTypes["Comment"], "data": ""}
1129                self.state = self.commentStartState
1130                return True
1131        elif charStack[-1] in ('d', 'D'):
1132            matched = True
1133            for expected in (('o', 'O'), ('c', 'C'), ('t', 'T'),
1134                             ('y', 'Y'), ('p', 'P'), ('e', 'E')):
1135                charStack.append(self.stream.char())
1136                if charStack[-1] not in expected:
1137                    matched = False
1138                    break
1139            if matched:
1140                self.currentToken = {"type": tokenTypes["Doctype"],
1141                                     "name": "",
1142                                     "publicId": None, "systemId": None,
1143                                     "correct": True}
1144                self.state = self.doctypeState
1145                return True
1146        elif (charStack[-1] == "[" and
1147              self.parser is not None and
1148              self.parser.tree.openElements and
1149              self.parser.tree.openElements[-1].namespace != self.parser.tree.defaultNamespace):
1150            matched = True
1151            for expected in ["C", "D", "A", "T", "A", "["]:
1152                charStack.append(self.stream.char())
1153                if charStack[-1] != expected:
1154                    matched = False
1155                    break
1156            if matched:
1157                self.state = self.cdataSectionState
1158                return True
1159
1160        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1161                                "expected-dashes-or-doctype"})
1162
1163        while charStack:
1164            self.stream.unget(charStack.pop())
1165        self.state = self.bogusCommentState
1166        return True
1167
1168    def commentStartState(self):
1169        data = self.stream.char()
1170        if data == "-":
1171            self.state = self.commentStartDashState
1172        elif data == "\u0000":
1173            self.tokenQueue.append({"type": tokenTypes["ParseError"],
1174                                    "data": "invalid-codepoint"})
1175            self.currentToken["data"] += "\uFFFD"
1176        elif data == ">":
1177            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1178                                    "incorrect-comment"})
1179            self.tokenQueue.append(self.currentToken)
1180            self.state = self.dataState
1181        elif data is EOF:
1182            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1183                                    "eof-in-comment"})
1184            self.tokenQueue.append(self.currentToken)
1185            self.state = self.dataState
1186        else:
1187            self.currentToken["data"] += data
1188            self.state = self.commentState
1189        return True
1190
1191    def commentStartDashState(self):
1192        data = self.stream.char()
1193        if data == "-":
1194            self.state = self.commentEndState
1195        elif data == "\u0000":
1196            self.tokenQueue.append({"type": tokenTypes["ParseError"],
1197                                    "data": "invalid-codepoint"})
1198            self.currentToken["data"] += "-\uFFFD"
1199        elif data == ">":
1200            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1201                                    "incorrect-comment"})
1202            self.tokenQueue.append(self.currentToken)
1203            self.state = self.dataState
1204        elif data is EOF:
1205            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1206                                    "eof-in-comment"})
1207            self.tokenQueue.append(self.currentToken)
1208            self.state = self.dataState
1209        else:
1210            self.currentToken["data"] += "-" + data
1211            self.state = self.commentState
1212        return True
1213
1214    def commentState(self):
1215        data = self.stream.char()
1216        if data == "-":
1217            self.state = self.commentEndDashState
1218        elif data == "\u0000":
1219            self.tokenQueue.append({"type": tokenTypes["ParseError"],
1220                                    "data": "invalid-codepoint"})
1221            self.currentToken["data"] += "\uFFFD"
1222        elif data is EOF:
1223            self.tokenQueue.append({"type": tokenTypes["ParseError"],
1224                                    "data": "eof-in-comment"})
1225            self.tokenQueue.append(self.currentToken)
1226            self.state = self.dataState
1227        else:
1228            self.currentToken["data"] += data + \
1229                self.stream.charsUntil(("-", "\u0000"))
1230        return True
1231
1232    def commentEndDashState(self):
1233        data = self.stream.char()
1234        if data == "-":
1235            self.state = self.commentEndState
1236        elif data == "\u0000":
1237            self.tokenQueue.append({"type": tokenTypes["ParseError"],
1238                                    "data": "invalid-codepoint"})
1239            self.currentToken["data"] += "-\uFFFD"
1240            self.state = self.commentState
1241        elif data is EOF:
1242            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1243                                    "eof-in-comment-end-dash"})
1244            self.tokenQueue.append(self.currentToken)
1245            self.state = self.dataState
1246        else:
1247            self.currentToken["data"] += "-" + data
1248            self.state = self.commentState
1249        return True
1250
1251    def commentEndState(self):
1252        data = self.stream.char()
1253        if data == ">":
1254            self.tokenQueue.append(self.currentToken)
1255            self.state = self.dataState
1256        elif data == "\u0000":
1257            self.tokenQueue.append({"type": tokenTypes["ParseError"],
1258                                    "data": "invalid-codepoint"})
1259            self.currentToken["data"] += "--\uFFFD"
1260            self.state = self.commentState
1261        elif data == "!":
1262            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1263                                    "unexpected-bang-after-double-dash-in-comment"})
1264            self.state = self.commentEndBangState
1265        elif data == "-":
1266            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1267                                    "unexpected-dash-after-double-dash-in-comment"})
1268            self.currentToken["data"] += data
1269        elif data is EOF:
1270            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1271                                    "eof-in-comment-double-dash"})
1272            self.tokenQueue.append(self.currentToken)
1273            self.state = self.dataState
1274        else:
1275            # XXX
1276            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1277                                    "unexpected-char-in-comment"})
1278            self.currentToken["data"] += "--" + data
1279            self.state = self.commentState
1280        return True
1281
1282    def commentEndBangState(self):
1283        data = self.stream.char()
1284        if data == ">":
1285            self.tokenQueue.append(self.currentToken)
1286            self.state = self.dataState
1287        elif data == "-":
1288            self.currentToken["data"] += "--!"
1289            self.state = self.commentEndDashState
1290        elif data == "\u0000":
1291            self.tokenQueue.append({"type": tokenTypes["ParseError"],
1292                                    "data": "invalid-codepoint"})
1293            self.currentToken["data"] += "--!\uFFFD"
1294            self.state = self.commentState
1295        elif data is EOF:
1296            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1297                                    "eof-in-comment-end-bang-state"})
1298            self.tokenQueue.append(self.currentToken)
1299            self.state = self.dataState
1300        else:
1301            self.currentToken["data"] += "--!" + data
1302            self.state = self.commentState
1303        return True
1304
1305    def doctypeState(self):
1306        data = self.stream.char()
1307        if data in spaceCharacters:
1308            self.state = self.beforeDoctypeNameState
1309        elif data is EOF:
1310            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1311                                    "expected-doctype-name-but-got-eof"})
1312            self.currentToken["correct"] = False
1313            self.tokenQueue.append(self.currentToken)
1314            self.state = self.dataState
1315        else:
1316            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1317                                    "need-space-after-doctype"})
1318            self.stream.unget(data)
1319            self.state = self.beforeDoctypeNameState
1320        return True
1321
1322    def beforeDoctypeNameState(self):
1323        data = self.stream.char()
1324        if data in spaceCharacters:
1325            pass
1326        elif data == ">":
1327            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1328                                    "expected-doctype-name-but-got-right-bracket"})
1329            self.currentToken["correct"] = False
1330            self.tokenQueue.append(self.currentToken)
1331            self.state = self.dataState
1332        elif data == "\u0000":
1333            self.tokenQueue.append({"type": tokenTypes["ParseError"],
1334                                    "data": "invalid-codepoint"})
1335            self.currentToken["name"] = "\uFFFD"
1336            self.state = self.doctypeNameState
1337        elif data is EOF:
1338            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1339                                    "expected-doctype-name-but-got-eof"})
1340            self.currentToken["correct"] = False
1341            self.tokenQueue.append(self.currentToken)
1342            self.state = self.dataState
1343        else:
1344            self.currentToken["name"] = data
1345            self.state = self.doctypeNameState
1346        return True
1347
1348    def doctypeNameState(self):
1349        data = self.stream.char()
1350        if data in spaceCharacters:
1351            self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower)
1352            self.state = self.afterDoctypeNameState
1353        elif data == ">":
1354            self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower)
1355            self.tokenQueue.append(self.currentToken)
1356            self.state = self.dataState
1357        elif data == "\u0000":
1358            self.tokenQueue.append({"type": tokenTypes["ParseError"],
1359                                    "data": "invalid-codepoint"})
1360            self.currentToken["name"] += "\uFFFD"
1361            self.state = self.doctypeNameState
1362        elif data is EOF:
1363            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1364                                    "eof-in-doctype-name"})
1365            self.currentToken["correct"] = False
1366            self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower)
1367            self.tokenQueue.append(self.currentToken)
1368            self.state = self.dataState
1369        else:
1370            self.currentToken["name"] += data
1371        return True
1372
1373    def afterDoctypeNameState(self):
1374        data = self.stream.char()
1375        if data in spaceCharacters:
1376            pass
1377        elif data == ">":
1378            self.tokenQueue.append(self.currentToken)
1379            self.state = self.dataState
1380        elif data is EOF:
1381            self.currentToken["correct"] = False
1382            self.stream.unget(data)
1383            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1384                                    "eof-in-doctype"})
1385            self.tokenQueue.append(self.currentToken)
1386            self.state = self.dataState
1387        else:
1388            if data in ("p", "P"):
1389                matched = True
1390                for expected in (("u", "U"), ("b", "B"), ("l", "L"),
1391                                 ("i", "I"), ("c", "C")):
1392                    data = self.stream.char()
1393                    if data not in expected:
1394                        matched = False
1395                        break
1396                if matched:
1397                    self.state = self.afterDoctypePublicKeywordState
1398                    return True
1399            elif data in ("s", "S"):
1400                matched = True
1401                for expected in (("y", "Y"), ("s", "S"), ("t", "T"),
1402                                 ("e", "E"), ("m", "M")):
1403                    data = self.stream.char()
1404                    if data not in expected:
1405                        matched = False
1406                        break
1407                if matched:
1408                    self.state = self.afterDoctypeSystemKeywordState
1409                    return True
1410
1411            # All the characters read before the current 'data' will be
1412            # [a-zA-Z], so they're garbage in the bogus doctype and can be
1413            # discarded; only the latest character might be '>' or EOF
1414            # and needs to be ungetted
1415            self.stream.unget(data)
1416            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1417                                    "expected-space-or-right-bracket-in-doctype", "datavars":
1418                                    {"data": data}})
1419            self.currentToken["correct"] = False
1420            self.state = self.bogusDoctypeState
1421
1422        return True
1423
1424    def afterDoctypePublicKeywordState(self):
1425        data = self.stream.char()
1426        if data in spaceCharacters:
1427            self.state = self.beforeDoctypePublicIdentifierState
1428        elif data in ("'", '"'):
1429            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1430                                    "unexpected-char-in-doctype"})
1431            self.stream.unget(data)
1432            self.state = self.beforeDoctypePublicIdentifierState
1433        elif data is EOF:
1434            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1435                                    "eof-in-doctype"})
1436            self.currentToken["correct"] = False
1437            self.tokenQueue.append(self.currentToken)
1438            self.state = self.dataState
1439        else:
1440            self.stream.unget(data)
1441            self.state = self.beforeDoctypePublicIdentifierState
1442        return True
1443
1444    def beforeDoctypePublicIdentifierState(self):
1445        data = self.stream.char()
1446        if data in spaceCharacters:
1447            pass
1448        elif data == "\"":
1449            self.currentToken["publicId"] = ""
1450            self.state = self.doctypePublicIdentifierDoubleQuotedState
1451        elif data == "'":
1452            self.currentToken["publicId"] = ""
1453            self.state = self.doctypePublicIdentifierSingleQuotedState
1454        elif data == ">":
1455            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1456                                    "unexpected-end-of-doctype"})
1457            self.currentToken["correct"] = False
1458            self.tokenQueue.append(self.currentToken)
1459            self.state = self.dataState
1460        elif data is EOF:
1461            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1462                                    "eof-in-doctype"})
1463            self.currentToken["correct"] = False
1464            self.tokenQueue.append(self.currentToken)
1465            self.state = self.dataState
1466        else:
1467            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1468                                    "unexpected-char-in-doctype"})
1469            self.currentToken["correct"] = False
1470            self.state = self.bogusDoctypeState
1471        return True
1472
1473    def doctypePublicIdentifierDoubleQuotedState(self):
1474        data = self.stream.char()
1475        if data == "\"":
1476            self.state = self.afterDoctypePublicIdentifierState
1477        elif data == "\u0000":
1478            self.tokenQueue.append({"type": tokenTypes["ParseError"],
1479                                    "data": "invalid-codepoint"})
1480            self.currentToken["publicId"] += "\uFFFD"
1481        elif data == ">":
1482            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1483                                    "unexpected-end-of-doctype"})
1484            self.currentToken["correct"] = False
1485            self.tokenQueue.append(self.currentToken)
1486            self.state = self.dataState
1487        elif data is EOF:
1488            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1489                                    "eof-in-doctype"})
1490            self.currentToken["correct"] = False
1491            self.tokenQueue.append(self.currentToken)
1492            self.state = self.dataState
1493        else:
1494            self.currentToken["publicId"] += data
1495        return True
1496
1497    def doctypePublicIdentifierSingleQuotedState(self):
1498        data = self.stream.char()
1499        if data == "'":
1500            self.state = self.afterDoctypePublicIdentifierState
1501        elif data == "\u0000":
1502            self.tokenQueue.append({"type": tokenTypes["ParseError"],
1503                                    "data": "invalid-codepoint"})
1504            self.currentToken["publicId"] += "\uFFFD"
1505        elif data == ">":
1506            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1507                                    "unexpected-end-of-doctype"})
1508            self.currentToken["correct"] = False
1509            self.tokenQueue.append(self.currentToken)
1510            self.state = self.dataState
1511        elif data is EOF:
1512            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1513                                    "eof-in-doctype"})
1514            self.currentToken["correct"] = False
1515            self.tokenQueue.append(self.currentToken)
1516            self.state = self.dataState
1517        else:
1518            self.currentToken["publicId"] += data
1519        return True
1520
1521    def afterDoctypePublicIdentifierState(self):
1522        data = self.stream.char()
1523        if data in spaceCharacters:
1524            self.state = self.betweenDoctypePublicAndSystemIdentifiersState
1525        elif data == ">":
1526            self.tokenQueue.append(self.currentToken)
1527            self.state = self.dataState
1528        elif data == '"':
1529            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1530                                    "unexpected-char-in-doctype"})
1531            self.currentToken["systemId"] = ""
1532            self.state = self.doctypeSystemIdentifierDoubleQuotedState
1533        elif data == "'":
1534            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1535                                    "unexpected-char-in-doctype"})
1536            self.currentToken["systemId"] = ""
1537            self.state = self.doctypeSystemIdentifierSingleQuotedState
1538        elif data is EOF:
1539            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1540                                    "eof-in-doctype"})
1541            self.currentToken["correct"] = False
1542            self.tokenQueue.append(self.currentToken)
1543            self.state = self.dataState
1544        else:
1545            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1546                                    "unexpected-char-in-doctype"})
1547            self.currentToken["correct"] = False
1548            self.state = self.bogusDoctypeState
1549        return True
1550
1551    def betweenDoctypePublicAndSystemIdentifiersState(self):
1552        data = self.stream.char()
1553        if data in spaceCharacters:
1554            pass
1555        elif data == ">":
1556            self.tokenQueue.append(self.currentToken)
1557            self.state = self.dataState
1558        elif data == '"':
1559            self.currentToken["systemId"] = ""
1560            self.state = self.doctypeSystemIdentifierDoubleQuotedState
1561        elif data == "'":
1562            self.currentToken["systemId"] = ""
1563            self.state = self.doctypeSystemIdentifierSingleQuotedState
1564        elif data == EOF:
1565            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1566                                    "eof-in-doctype"})
1567            self.currentToken["correct"] = False
1568            self.tokenQueue.append(self.currentToken)
1569            self.state = self.dataState
1570        else:
1571            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1572                                    "unexpected-char-in-doctype"})
1573            self.currentToken["correct"] = False
1574            self.state = self.bogusDoctypeState
1575        return True
1576
1577    def afterDoctypeSystemKeywordState(self):
1578        data = self.stream.char()
1579        if data in spaceCharacters:
1580            self.state = self.beforeDoctypeSystemIdentifierState
1581        elif data in ("'", '"'):
1582            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1583                                    "unexpected-char-in-doctype"})
1584            self.stream.unget(data)
1585            self.state = self.beforeDoctypeSystemIdentifierState
1586        elif data is EOF:
1587            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1588                                    "eof-in-doctype"})
1589            self.currentToken["correct"] = False
1590            self.tokenQueue.append(self.currentToken)
1591            self.state = self.dataState
1592        else:
1593            self.stream.unget(data)
1594            self.state = self.beforeDoctypeSystemIdentifierState
1595        return True
1596
1597    def beforeDoctypeSystemIdentifierState(self):
1598        data = self.stream.char()
1599        if data in spaceCharacters:
1600            pass
1601        elif data == "\"":
1602            self.currentToken["systemId"] = ""
1603            self.state = self.doctypeSystemIdentifierDoubleQuotedState
1604        elif data == "'":
1605            self.currentToken["systemId"] = ""
1606            self.state = self.doctypeSystemIdentifierSingleQuotedState
1607        elif data == ">":
1608            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1609                                    "unexpected-char-in-doctype"})
1610            self.currentToken["correct"] = False
1611            self.tokenQueue.append(self.currentToken)
1612            self.state = self.dataState
1613        elif data is EOF:
1614            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1615                                    "eof-in-doctype"})
1616            self.currentToken["correct"] = False
1617            self.tokenQueue.append(self.currentToken)
1618            self.state = self.dataState
1619        else:
1620            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1621                                    "unexpected-char-in-doctype"})
1622            self.currentToken["correct"] = False
1623            self.state = self.bogusDoctypeState
1624        return True
1625
1626    def doctypeSystemIdentifierDoubleQuotedState(self):
1627        data = self.stream.char()
1628        if data == "\"":
1629            self.state = self.afterDoctypeSystemIdentifierState
1630        elif data == "\u0000":
1631            self.tokenQueue.append({"type": tokenTypes["ParseError"],
1632                                    "data": "invalid-codepoint"})
1633            self.currentToken["systemId"] += "\uFFFD"
1634        elif data == ">":
1635            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1636                                    "unexpected-end-of-doctype"})
1637            self.currentToken["correct"] = False
1638            self.tokenQueue.append(self.currentToken)
1639            self.state = self.dataState
1640        elif data is EOF:
1641            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1642                                    "eof-in-doctype"})
1643            self.currentToken["correct"] = False
1644            self.tokenQueue.append(self.currentToken)
1645            self.state = self.dataState
1646        else:
1647            self.currentToken["systemId"] += data
1648        return True
1649
1650    def doctypeSystemIdentifierSingleQuotedState(self):
1651        data = self.stream.char()
1652        if data == "'":
1653            self.state = self.afterDoctypeSystemIdentifierState
1654        elif data == "\u0000":
1655            self.tokenQueue.append({"type": tokenTypes["ParseError"],
1656                                    "data": "invalid-codepoint"})
1657            self.currentToken["systemId"] += "\uFFFD"
1658        elif data == ">":
1659            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1660                                    "unexpected-end-of-doctype"})
1661            self.currentToken["correct"] = False
1662            self.tokenQueue.append(self.currentToken)
1663            self.state = self.dataState
1664        elif data is EOF:
1665            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1666                                    "eof-in-doctype"})
1667            self.currentToken["correct"] = False
1668            self.tokenQueue.append(self.currentToken)
1669            self.state = self.dataState
1670        else:
1671            self.currentToken["systemId"] += data
1672        return True
1673
1674    def afterDoctypeSystemIdentifierState(self):
1675        data = self.stream.char()
1676        if data in spaceCharacters:
1677            pass
1678        elif data == ">":
1679            self.tokenQueue.append(self.currentToken)
1680            self.state = self.dataState
1681        elif data is EOF:
1682            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1683                                    "eof-in-doctype"})
1684            self.currentToken["correct"] = False
1685            self.tokenQueue.append(self.currentToken)
1686            self.state = self.dataState
1687        else:
1688            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1689                                    "unexpected-char-in-doctype"})
1690            self.state = self.bogusDoctypeState
1691        return True
1692
1693    def bogusDoctypeState(self):
1694        data = self.stream.char()
1695        if data == ">":
1696            self.tokenQueue.append(self.currentToken)
1697            self.state = self.dataState
1698        elif data is EOF:
1699            # XXX EMIT
1700            self.stream.unget(data)
1701            self.tokenQueue.append(self.currentToken)
1702            self.state = self.dataState
1703        else:
1704            pass
1705        return True
1706
1707    def cdataSectionState(self):
1708        data = []
1709        while True:
1710            data.append(self.stream.charsUntil("]"))
1711            data.append(self.stream.charsUntil(">"))
1712            char = self.stream.char()
1713            if char == EOF:
1714                break
1715            else:
1716                assert char == ">"
1717                if data[-1][-2:] == "]]":
1718                    data[-1] = data[-1][:-2]
1719                    break
1720                else:
1721                    data.append(char)
1722
1723        data = "".join(data)  # pylint:disable=redefined-variable-type
1724        # Deal with null here rather than in the parser
1725        nullCount = data.count("\u0000")
1726        if nullCount > 0:
1727            for _ in range(nullCount):
1728                self.tokenQueue.append({"type": tokenTypes["ParseError"],
1729                                        "data": "invalid-codepoint"})
1730            data = data.replace("\u0000", "\uFFFD")
1731        if data:
1732            self.tokenQueue.append({"type": tokenTypes["Characters"],
1733                                    "data": data})
1734        self.state = self.dataState
1735        return True
1736