1from __future__ import absolute_import, division, unicode_literals 2 3from pip._vendor.six import unichr as chr 4 5from collections import deque, OrderedDict 6from sys import version_info 7 8from .constants import spaceCharacters 9from .constants import entities 10from .constants import asciiLetters, asciiUpper2Lower 11from .constants import digits, hexDigits, EOF 12from .constants import tokenTypes, tagTokenTypes 13from .constants import replacementCharacters 14 15from ._inputstream import HTMLInputStream 16 17from ._trie import Trie 18 19entitiesTrie = Trie(entities) 20 21if version_info >= (3, 7): 22 attributeMap = dict 23else: 24 attributeMap = OrderedDict 25 26 27class HTMLTokenizer(object): 28 """ This class takes care of tokenizing HTML. 29 30 * self.currentToken 31 Holds the token that is currently being processed. 32 33 * self.state 34 Holds a reference to the method to be invoked... XXX 35 36 * self.stream 37 Points to HTMLInputStream object. 38 """ 39 40 def __init__(self, stream, parser=None, **kwargs): 41 42 self.stream = HTMLInputStream(stream, **kwargs) 43 self.parser = parser 44 45 # Setup the initial tokenizer state 46 self.escapeFlag = False 47 self.lastFourChars = [] 48 self.state = self.dataState 49 self.escape = False 50 51 # The current token being created 52 self.currentToken = None 53 super(HTMLTokenizer, self).__init__() 54 55 def __iter__(self): 56 """ This is where the magic happens. 57 58 We do our usually processing through the states and when we have a token 59 to return we yield the token which pauses processing until the next token 60 is requested. 61 """ 62 self.tokenQueue = deque([]) 63 # Start processing. When EOF is reached self.state will return False 64 # instead of True and the loop will terminate. 65 while self.state(): 66 while self.stream.errors: 67 yield {"type": tokenTypes["ParseError"], "data": self.stream.errors.pop(0)} 68 while self.tokenQueue: 69 yield self.tokenQueue.popleft() 70 71 def consumeNumberEntity(self, isHex): 72 """This function returns either U+FFFD or the character based on the 73 decimal or hexadecimal representation. It also discards ";" if present. 74 If not present self.tokenQueue.append({"type": tokenTypes["ParseError"]}) is invoked. 75 """ 76 77 allowed = digits 78 radix = 10 79 if isHex: 80 allowed = hexDigits 81 radix = 16 82 83 charStack = [] 84 85 # Consume all the characters that are in range while making sure we 86 # don't hit an EOF. 87 c = self.stream.char() 88 while c in allowed and c is not EOF: 89 charStack.append(c) 90 c = self.stream.char() 91 92 # Convert the set of characters consumed to an int. 93 charAsInt = int("".join(charStack), radix) 94 95 # Certain characters get replaced with others 96 if charAsInt in replacementCharacters: 97 char = replacementCharacters[charAsInt] 98 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 99 "illegal-codepoint-for-numeric-entity", 100 "datavars": {"charAsInt": charAsInt}}) 101 elif ((0xD800 <= charAsInt <= 0xDFFF) or 102 (charAsInt > 0x10FFFF)): 103 char = "\uFFFD" 104 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 105 "illegal-codepoint-for-numeric-entity", 106 "datavars": {"charAsInt": charAsInt}}) 107 else: 108 # Should speed up this check somehow (e.g. move the set to a constant) 109 if ((0x0001 <= charAsInt <= 0x0008) or 110 (0x000E <= charAsInt <= 0x001F) or 111 (0x007F <= charAsInt <= 0x009F) or 112 (0xFDD0 <= charAsInt <= 0xFDEF) or 113 charAsInt in frozenset([0x000B, 0xFFFE, 0xFFFF, 0x1FFFE, 114 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 115 0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 116 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE, 117 0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 118 0x9FFFF, 0xAFFFE, 0xAFFFF, 0xBFFFE, 119 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 120 0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 121 0xFFFFF, 0x10FFFE, 0x10FFFF])): 122 self.tokenQueue.append({"type": tokenTypes["ParseError"], 123 "data": 124 "illegal-codepoint-for-numeric-entity", 125 "datavars": {"charAsInt": charAsInt}}) 126 try: 127 # Try/except needed as UCS-2 Python builds' unichar only works 128 # within the BMP. 129 char = chr(charAsInt) 130 except ValueError: 131 v = charAsInt - 0x10000 132 char = chr(0xD800 | (v >> 10)) + chr(0xDC00 | (v & 0x3FF)) 133 134 # Discard the ; if present. Otherwise, put it back on the queue and 135 # invoke parseError on parser. 136 if c != ";": 137 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 138 "numeric-entity-without-semicolon"}) 139 self.stream.unget(c) 140 141 return char 142 143 def consumeEntity(self, allowedChar=None, fromAttribute=False): 144 # Initialise to the default output for when no entity is matched 145 output = "&" 146 147 charStack = [self.stream.char()] 148 if (charStack[0] in spaceCharacters or charStack[0] in (EOF, "<", "&") or 149 (allowedChar is not None and allowedChar == charStack[0])): 150 self.stream.unget(charStack[0]) 151 152 elif charStack[0] == "#": 153 # Read the next character to see if it's hex or decimal 154 hex = False 155 charStack.append(self.stream.char()) 156 if charStack[-1] in ("x", "X"): 157 hex = True 158 charStack.append(self.stream.char()) 159 160 # charStack[-1] should be the first digit 161 if (hex and charStack[-1] in hexDigits) \ 162 or (not hex and charStack[-1] in digits): 163 # At least one digit found, so consume the whole number 164 self.stream.unget(charStack[-1]) 165 output = self.consumeNumberEntity(hex) 166 else: 167 # No digits found 168 self.tokenQueue.append({"type": tokenTypes["ParseError"], 169 "data": "expected-numeric-entity"}) 170 self.stream.unget(charStack.pop()) 171 output = "&" + "".join(charStack) 172 173 else: 174 # At this point in the process might have named entity. Entities 175 # are stored in the global variable "entities". 176 # 177 # Consume characters and compare to these to a substring of the 178 # entity names in the list until the substring no longer matches. 179 while (charStack[-1] is not EOF): 180 if not entitiesTrie.has_keys_with_prefix("".join(charStack)): 181 break 182 charStack.append(self.stream.char()) 183 184 # At this point we have a string that starts with some characters 185 # that may match an entity 186 # Try to find the longest entity the string will match to take care 187 # of ¬i for instance. 188 try: 189 entityName = entitiesTrie.longest_prefix("".join(charStack[:-1])) 190 entityLength = len(entityName) 191 except KeyError: 192 entityName = None 193 194 if entityName is not None: 195 if entityName[-1] != ";": 196 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 197 "named-entity-without-semicolon"}) 198 if (entityName[-1] != ";" and fromAttribute and 199 (charStack[entityLength] in asciiLetters or 200 charStack[entityLength] in digits or 201 charStack[entityLength] == "=")): 202 self.stream.unget(charStack.pop()) 203 output = "&" + "".join(charStack) 204 else: 205 output = entities[entityName] 206 self.stream.unget(charStack.pop()) 207 output += "".join(charStack[entityLength:]) 208 else: 209 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 210 "expected-named-entity"}) 211 self.stream.unget(charStack.pop()) 212 output = "&" + "".join(charStack) 213 214 if fromAttribute: 215 self.currentToken["data"][-1][1] += output 216 else: 217 if output in spaceCharacters: 218 tokenType = "SpaceCharacters" 219 else: 220 tokenType = "Characters" 221 self.tokenQueue.append({"type": tokenTypes[tokenType], "data": output}) 222 223 def processEntityInAttribute(self, allowedChar): 224 """This method replaces the need for "entityInAttributeValueState". 225 """ 226 self.consumeEntity(allowedChar=allowedChar, fromAttribute=True) 227 228 def emitCurrentToken(self): 229 """This method is a generic handler for emitting the tags. It also sets 230 the state to "data" because that's what's needed after a token has been 231 emitted. 232 """ 233 token = self.currentToken 234 # Add token to the queue to be yielded 235 if (token["type"] in tagTokenTypes): 236 token["name"] = token["name"].translate(asciiUpper2Lower) 237 if token["type"] == tokenTypes["StartTag"]: 238 raw = token["data"] 239 data = attributeMap(raw) 240 if len(raw) > len(data): 241 # we had some duplicated attribute, fix so first wins 242 data.update(raw[::-1]) 243 token["data"] = data 244 245 if token["type"] == tokenTypes["EndTag"]: 246 if token["data"]: 247 self.tokenQueue.append({"type": tokenTypes["ParseError"], 248 "data": "attributes-in-end-tag"}) 249 if token["selfClosing"]: 250 self.tokenQueue.append({"type": tokenTypes["ParseError"], 251 "data": "self-closing-flag-on-end-tag"}) 252 self.tokenQueue.append(token) 253 self.state = self.dataState 254 255 # Below are the various tokenizer states worked out. 256 def dataState(self): 257 data = self.stream.char() 258 if data == "&": 259 self.state = self.entityDataState 260 elif data == "<": 261 self.state = self.tagOpenState 262 elif data == "\u0000": 263 self.tokenQueue.append({"type": tokenTypes["ParseError"], 264 "data": "invalid-codepoint"}) 265 self.tokenQueue.append({"type": tokenTypes["Characters"], 266 "data": "\u0000"}) 267 elif data is EOF: 268 # Tokenization ends. 269 return False 270 elif data in spaceCharacters: 271 # Directly after emitting a token you switch back to the "data 272 # state". At that point spaceCharacters are important so they are 273 # emitted separately. 274 self.tokenQueue.append({"type": tokenTypes["SpaceCharacters"], "data": 275 data + self.stream.charsUntil(spaceCharacters, True)}) 276 # No need to update lastFourChars here, since the first space will 277 # have already been appended to lastFourChars and will have broken 278 # any <!-- or --> sequences 279 else: 280 chars = self.stream.charsUntil(("&", "<", "\u0000")) 281 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": 282 data + chars}) 283 return True 284 285 def entityDataState(self): 286 self.consumeEntity() 287 self.state = self.dataState 288 return True 289 290 def rcdataState(self): 291 data = self.stream.char() 292 if data == "&": 293 self.state = self.characterReferenceInRcdata 294 elif data == "<": 295 self.state = self.rcdataLessThanSignState 296 elif data == EOF: 297 # Tokenization ends. 298 return False 299 elif data == "\u0000": 300 self.tokenQueue.append({"type": tokenTypes["ParseError"], 301 "data": "invalid-codepoint"}) 302 self.tokenQueue.append({"type": tokenTypes["Characters"], 303 "data": "\uFFFD"}) 304 elif data in spaceCharacters: 305 # Directly after emitting a token you switch back to the "data 306 # state". At that point spaceCharacters are important so they are 307 # emitted separately. 308 self.tokenQueue.append({"type": tokenTypes["SpaceCharacters"], "data": 309 data + self.stream.charsUntil(spaceCharacters, True)}) 310 # No need to update lastFourChars here, since the first space will 311 # have already been appended to lastFourChars and will have broken 312 # any <!-- or --> sequences 313 else: 314 chars = self.stream.charsUntil(("&", "<", "\u0000")) 315 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": 316 data + chars}) 317 return True 318 319 def characterReferenceInRcdata(self): 320 self.consumeEntity() 321 self.state = self.rcdataState 322 return True 323 324 def rawtextState(self): 325 data = self.stream.char() 326 if data == "<": 327 self.state = self.rawtextLessThanSignState 328 elif data == "\u0000": 329 self.tokenQueue.append({"type": tokenTypes["ParseError"], 330 "data": "invalid-codepoint"}) 331 self.tokenQueue.append({"type": tokenTypes["Characters"], 332 "data": "\uFFFD"}) 333 elif data == EOF: 334 # Tokenization ends. 335 return False 336 else: 337 chars = self.stream.charsUntil(("<", "\u0000")) 338 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": 339 data + chars}) 340 return True 341 342 def scriptDataState(self): 343 data = self.stream.char() 344 if data == "<": 345 self.state = self.scriptDataLessThanSignState 346 elif data == "\u0000": 347 self.tokenQueue.append({"type": tokenTypes["ParseError"], 348 "data": "invalid-codepoint"}) 349 self.tokenQueue.append({"type": tokenTypes["Characters"], 350 "data": "\uFFFD"}) 351 elif data == EOF: 352 # Tokenization ends. 353 return False 354 else: 355 chars = self.stream.charsUntil(("<", "\u0000")) 356 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": 357 data + chars}) 358 return True 359 360 def plaintextState(self): 361 data = self.stream.char() 362 if data == EOF: 363 # Tokenization ends. 364 return False 365 elif data == "\u0000": 366 self.tokenQueue.append({"type": tokenTypes["ParseError"], 367 "data": "invalid-codepoint"}) 368 self.tokenQueue.append({"type": tokenTypes["Characters"], 369 "data": "\uFFFD"}) 370 else: 371 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": 372 data + self.stream.charsUntil("\u0000")}) 373 return True 374 375 def tagOpenState(self): 376 data = self.stream.char() 377 if data == "!": 378 self.state = self.markupDeclarationOpenState 379 elif data == "/": 380 self.state = self.closeTagOpenState 381 elif data in asciiLetters: 382 self.currentToken = {"type": tokenTypes["StartTag"], 383 "name": data, "data": [], 384 "selfClosing": False, 385 "selfClosingAcknowledged": False} 386 self.state = self.tagNameState 387 elif data == ">": 388 # XXX In theory it could be something besides a tag name. But 389 # do we really care? 390 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 391 "expected-tag-name-but-got-right-bracket"}) 392 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<>"}) 393 self.state = self.dataState 394 elif data == "?": 395 # XXX In theory it could be something besides a tag name. But 396 # do we really care? 397 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 398 "expected-tag-name-but-got-question-mark"}) 399 self.stream.unget(data) 400 self.state = self.bogusCommentState 401 else: 402 # XXX 403 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 404 "expected-tag-name"}) 405 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"}) 406 self.stream.unget(data) 407 self.state = self.dataState 408 return True 409 410 def closeTagOpenState(self): 411 data = self.stream.char() 412 if data in asciiLetters: 413 self.currentToken = {"type": tokenTypes["EndTag"], "name": data, 414 "data": [], "selfClosing": False} 415 self.state = self.tagNameState 416 elif data == ">": 417 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 418 "expected-closing-tag-but-got-right-bracket"}) 419 self.state = self.dataState 420 elif data is EOF: 421 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 422 "expected-closing-tag-but-got-eof"}) 423 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"}) 424 self.state = self.dataState 425 else: 426 # XXX data can be _'_... 427 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 428 "expected-closing-tag-but-got-char", 429 "datavars": {"data": data}}) 430 self.stream.unget(data) 431 self.state = self.bogusCommentState 432 return True 433 434 def tagNameState(self): 435 data = self.stream.char() 436 if data in spaceCharacters: 437 self.state = self.beforeAttributeNameState 438 elif data == ">": 439 self.emitCurrentToken() 440 elif data is EOF: 441 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 442 "eof-in-tag-name"}) 443 self.state = self.dataState 444 elif data == "/": 445 self.state = self.selfClosingStartTagState 446 elif data == "\u0000": 447 self.tokenQueue.append({"type": tokenTypes["ParseError"], 448 "data": "invalid-codepoint"}) 449 self.currentToken["name"] += "\uFFFD" 450 else: 451 self.currentToken["name"] += data 452 # (Don't use charsUntil here, because tag names are 453 # very short and it's faster to not do anything fancy) 454 return True 455 456 def rcdataLessThanSignState(self): 457 data = self.stream.char() 458 if data == "/": 459 self.temporaryBuffer = "" 460 self.state = self.rcdataEndTagOpenState 461 else: 462 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"}) 463 self.stream.unget(data) 464 self.state = self.rcdataState 465 return True 466 467 def rcdataEndTagOpenState(self): 468 data = self.stream.char() 469 if data in asciiLetters: 470 self.temporaryBuffer += data 471 self.state = self.rcdataEndTagNameState 472 else: 473 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"}) 474 self.stream.unget(data) 475 self.state = self.rcdataState 476 return True 477 478 def rcdataEndTagNameState(self): 479 appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower() 480 data = self.stream.char() 481 if data in spaceCharacters and appropriate: 482 self.currentToken = {"type": tokenTypes["EndTag"], 483 "name": self.temporaryBuffer, 484 "data": [], "selfClosing": False} 485 self.state = self.beforeAttributeNameState 486 elif data == "/" and appropriate: 487 self.currentToken = {"type": tokenTypes["EndTag"], 488 "name": self.temporaryBuffer, 489 "data": [], "selfClosing": False} 490 self.state = self.selfClosingStartTagState 491 elif data == ">" and appropriate: 492 self.currentToken = {"type": tokenTypes["EndTag"], 493 "name": self.temporaryBuffer, 494 "data": [], "selfClosing": False} 495 self.emitCurrentToken() 496 self.state = self.dataState 497 elif data in asciiLetters: 498 self.temporaryBuffer += data 499 else: 500 self.tokenQueue.append({"type": tokenTypes["Characters"], 501 "data": "</" + self.temporaryBuffer}) 502 self.stream.unget(data) 503 self.state = self.rcdataState 504 return True 505 506 def rawtextLessThanSignState(self): 507 data = self.stream.char() 508 if data == "/": 509 self.temporaryBuffer = "" 510 self.state = self.rawtextEndTagOpenState 511 else: 512 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"}) 513 self.stream.unget(data) 514 self.state = self.rawtextState 515 return True 516 517 def rawtextEndTagOpenState(self): 518 data = self.stream.char() 519 if data in asciiLetters: 520 self.temporaryBuffer += data 521 self.state = self.rawtextEndTagNameState 522 else: 523 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"}) 524 self.stream.unget(data) 525 self.state = self.rawtextState 526 return True 527 528 def rawtextEndTagNameState(self): 529 appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower() 530 data = self.stream.char() 531 if data in spaceCharacters and appropriate: 532 self.currentToken = {"type": tokenTypes["EndTag"], 533 "name": self.temporaryBuffer, 534 "data": [], "selfClosing": False} 535 self.state = self.beforeAttributeNameState 536 elif data == "/" and appropriate: 537 self.currentToken = {"type": tokenTypes["EndTag"], 538 "name": self.temporaryBuffer, 539 "data": [], "selfClosing": False} 540 self.state = self.selfClosingStartTagState 541 elif data == ">" and appropriate: 542 self.currentToken = {"type": tokenTypes["EndTag"], 543 "name": self.temporaryBuffer, 544 "data": [], "selfClosing": False} 545 self.emitCurrentToken() 546 self.state = self.dataState 547 elif data in asciiLetters: 548 self.temporaryBuffer += data 549 else: 550 self.tokenQueue.append({"type": tokenTypes["Characters"], 551 "data": "</" + self.temporaryBuffer}) 552 self.stream.unget(data) 553 self.state = self.rawtextState 554 return True 555 556 def scriptDataLessThanSignState(self): 557 data = self.stream.char() 558 if data == "/": 559 self.temporaryBuffer = "" 560 self.state = self.scriptDataEndTagOpenState 561 elif data == "!": 562 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<!"}) 563 self.state = self.scriptDataEscapeStartState 564 else: 565 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"}) 566 self.stream.unget(data) 567 self.state = self.scriptDataState 568 return True 569 570 def scriptDataEndTagOpenState(self): 571 data = self.stream.char() 572 if data in asciiLetters: 573 self.temporaryBuffer += data 574 self.state = self.scriptDataEndTagNameState 575 else: 576 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"}) 577 self.stream.unget(data) 578 self.state = self.scriptDataState 579 return True 580 581 def scriptDataEndTagNameState(self): 582 appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower() 583 data = self.stream.char() 584 if data in spaceCharacters and appropriate: 585 self.currentToken = {"type": tokenTypes["EndTag"], 586 "name": self.temporaryBuffer, 587 "data": [], "selfClosing": False} 588 self.state = self.beforeAttributeNameState 589 elif data == "/" and appropriate: 590 self.currentToken = {"type": tokenTypes["EndTag"], 591 "name": self.temporaryBuffer, 592 "data": [], "selfClosing": False} 593 self.state = self.selfClosingStartTagState 594 elif data == ">" and appropriate: 595 self.currentToken = {"type": tokenTypes["EndTag"], 596 "name": self.temporaryBuffer, 597 "data": [], "selfClosing": False} 598 self.emitCurrentToken() 599 self.state = self.dataState 600 elif data in asciiLetters: 601 self.temporaryBuffer += data 602 else: 603 self.tokenQueue.append({"type": tokenTypes["Characters"], 604 "data": "</" + self.temporaryBuffer}) 605 self.stream.unget(data) 606 self.state = self.scriptDataState 607 return True 608 609 def scriptDataEscapeStartState(self): 610 data = self.stream.char() 611 if data == "-": 612 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"}) 613 self.state = self.scriptDataEscapeStartDashState 614 else: 615 self.stream.unget(data) 616 self.state = self.scriptDataState 617 return True 618 619 def scriptDataEscapeStartDashState(self): 620 data = self.stream.char() 621 if data == "-": 622 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"}) 623 self.state = self.scriptDataEscapedDashDashState 624 else: 625 self.stream.unget(data) 626 self.state = self.scriptDataState 627 return True 628 629 def scriptDataEscapedState(self): 630 data = self.stream.char() 631 if data == "-": 632 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"}) 633 self.state = self.scriptDataEscapedDashState 634 elif data == "<": 635 self.state = self.scriptDataEscapedLessThanSignState 636 elif data == "\u0000": 637 self.tokenQueue.append({"type": tokenTypes["ParseError"], 638 "data": "invalid-codepoint"}) 639 self.tokenQueue.append({"type": tokenTypes["Characters"], 640 "data": "\uFFFD"}) 641 elif data == EOF: 642 self.state = self.dataState 643 else: 644 chars = self.stream.charsUntil(("<", "-", "\u0000")) 645 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": 646 data + chars}) 647 return True 648 649 def scriptDataEscapedDashState(self): 650 data = self.stream.char() 651 if data == "-": 652 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"}) 653 self.state = self.scriptDataEscapedDashDashState 654 elif data == "<": 655 self.state = self.scriptDataEscapedLessThanSignState 656 elif data == "\u0000": 657 self.tokenQueue.append({"type": tokenTypes["ParseError"], 658 "data": "invalid-codepoint"}) 659 self.tokenQueue.append({"type": tokenTypes["Characters"], 660 "data": "\uFFFD"}) 661 self.state = self.scriptDataEscapedState 662 elif data == EOF: 663 self.state = self.dataState 664 else: 665 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) 666 self.state = self.scriptDataEscapedState 667 return True 668 669 def scriptDataEscapedDashDashState(self): 670 data = self.stream.char() 671 if data == "-": 672 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"}) 673 elif data == "<": 674 self.state = self.scriptDataEscapedLessThanSignState 675 elif data == ">": 676 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": ">"}) 677 self.state = self.scriptDataState 678 elif data == "\u0000": 679 self.tokenQueue.append({"type": tokenTypes["ParseError"], 680 "data": "invalid-codepoint"}) 681 self.tokenQueue.append({"type": tokenTypes["Characters"], 682 "data": "\uFFFD"}) 683 self.state = self.scriptDataEscapedState 684 elif data == EOF: 685 self.state = self.dataState 686 else: 687 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) 688 self.state = self.scriptDataEscapedState 689 return True 690 691 def scriptDataEscapedLessThanSignState(self): 692 data = self.stream.char() 693 if data == "/": 694 self.temporaryBuffer = "" 695 self.state = self.scriptDataEscapedEndTagOpenState 696 elif data in asciiLetters: 697 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<" + data}) 698 self.temporaryBuffer = data 699 self.state = self.scriptDataDoubleEscapeStartState 700 else: 701 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"}) 702 self.stream.unget(data) 703 self.state = self.scriptDataEscapedState 704 return True 705 706 def scriptDataEscapedEndTagOpenState(self): 707 data = self.stream.char() 708 if data in asciiLetters: 709 self.temporaryBuffer = data 710 self.state = self.scriptDataEscapedEndTagNameState 711 else: 712 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"}) 713 self.stream.unget(data) 714 self.state = self.scriptDataEscapedState 715 return True 716 717 def scriptDataEscapedEndTagNameState(self): 718 appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower() 719 data = self.stream.char() 720 if data in spaceCharacters and appropriate: 721 self.currentToken = {"type": tokenTypes["EndTag"], 722 "name": self.temporaryBuffer, 723 "data": [], "selfClosing": False} 724 self.state = self.beforeAttributeNameState 725 elif data == "/" and appropriate: 726 self.currentToken = {"type": tokenTypes["EndTag"], 727 "name": self.temporaryBuffer, 728 "data": [], "selfClosing": False} 729 self.state = self.selfClosingStartTagState 730 elif data == ">" and appropriate: 731 self.currentToken = {"type": tokenTypes["EndTag"], 732 "name": self.temporaryBuffer, 733 "data": [], "selfClosing": False} 734 self.emitCurrentToken() 735 self.state = self.dataState 736 elif data in asciiLetters: 737 self.temporaryBuffer += data 738 else: 739 self.tokenQueue.append({"type": tokenTypes["Characters"], 740 "data": "</" + self.temporaryBuffer}) 741 self.stream.unget(data) 742 self.state = self.scriptDataEscapedState 743 return True 744 745 def scriptDataDoubleEscapeStartState(self): 746 data = self.stream.char() 747 if data in (spaceCharacters | frozenset(("/", ">"))): 748 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) 749 if self.temporaryBuffer.lower() == "script": 750 self.state = self.scriptDataDoubleEscapedState 751 else: 752 self.state = self.scriptDataEscapedState 753 elif data in asciiLetters: 754 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) 755 self.temporaryBuffer += data 756 else: 757 self.stream.unget(data) 758 self.state = self.scriptDataEscapedState 759 return True 760 761 def scriptDataDoubleEscapedState(self): 762 data = self.stream.char() 763 if data == "-": 764 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"}) 765 self.state = self.scriptDataDoubleEscapedDashState 766 elif data == "<": 767 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"}) 768 self.state = self.scriptDataDoubleEscapedLessThanSignState 769 elif data == "\u0000": 770 self.tokenQueue.append({"type": tokenTypes["ParseError"], 771 "data": "invalid-codepoint"}) 772 self.tokenQueue.append({"type": tokenTypes["Characters"], 773 "data": "\uFFFD"}) 774 elif data == EOF: 775 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 776 "eof-in-script-in-script"}) 777 self.state = self.dataState 778 else: 779 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) 780 return True 781 782 def scriptDataDoubleEscapedDashState(self): 783 data = self.stream.char() 784 if data == "-": 785 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"}) 786 self.state = self.scriptDataDoubleEscapedDashDashState 787 elif data == "<": 788 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"}) 789 self.state = self.scriptDataDoubleEscapedLessThanSignState 790 elif data == "\u0000": 791 self.tokenQueue.append({"type": tokenTypes["ParseError"], 792 "data": "invalid-codepoint"}) 793 self.tokenQueue.append({"type": tokenTypes["Characters"], 794 "data": "\uFFFD"}) 795 self.state = self.scriptDataDoubleEscapedState 796 elif data == EOF: 797 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 798 "eof-in-script-in-script"}) 799 self.state = self.dataState 800 else: 801 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) 802 self.state = self.scriptDataDoubleEscapedState 803 return True 804 805 def scriptDataDoubleEscapedDashDashState(self): 806 data = self.stream.char() 807 if data == "-": 808 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"}) 809 elif data == "<": 810 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"}) 811 self.state = self.scriptDataDoubleEscapedLessThanSignState 812 elif data == ">": 813 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": ">"}) 814 self.state = self.scriptDataState 815 elif data == "\u0000": 816 self.tokenQueue.append({"type": tokenTypes["ParseError"], 817 "data": "invalid-codepoint"}) 818 self.tokenQueue.append({"type": tokenTypes["Characters"], 819 "data": "\uFFFD"}) 820 self.state = self.scriptDataDoubleEscapedState 821 elif data == EOF: 822 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 823 "eof-in-script-in-script"}) 824 self.state = self.dataState 825 else: 826 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) 827 self.state = self.scriptDataDoubleEscapedState 828 return True 829 830 def scriptDataDoubleEscapedLessThanSignState(self): 831 data = self.stream.char() 832 if data == "/": 833 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "/"}) 834 self.temporaryBuffer = "" 835 self.state = self.scriptDataDoubleEscapeEndState 836 else: 837 self.stream.unget(data) 838 self.state = self.scriptDataDoubleEscapedState 839 return True 840 841 def scriptDataDoubleEscapeEndState(self): 842 data = self.stream.char() 843 if data in (spaceCharacters | frozenset(("/", ">"))): 844 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) 845 if self.temporaryBuffer.lower() == "script": 846 self.state = self.scriptDataEscapedState 847 else: 848 self.state = self.scriptDataDoubleEscapedState 849 elif data in asciiLetters: 850 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) 851 self.temporaryBuffer += data 852 else: 853 self.stream.unget(data) 854 self.state = self.scriptDataDoubleEscapedState 855 return True 856 857 def beforeAttributeNameState(self): 858 data = self.stream.char() 859 if data in spaceCharacters: 860 self.stream.charsUntil(spaceCharacters, True) 861 elif data in asciiLetters: 862 self.currentToken["data"].append([data, ""]) 863 self.state = self.attributeNameState 864 elif data == ">": 865 self.emitCurrentToken() 866 elif data == "/": 867 self.state = self.selfClosingStartTagState 868 elif data in ("'", '"', "=", "<"): 869 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 870 "invalid-character-in-attribute-name"}) 871 self.currentToken["data"].append([data, ""]) 872 self.state = self.attributeNameState 873 elif data == "\u0000": 874 self.tokenQueue.append({"type": tokenTypes["ParseError"], 875 "data": "invalid-codepoint"}) 876 self.currentToken["data"].append(["\uFFFD", ""]) 877 self.state = self.attributeNameState 878 elif data is EOF: 879 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 880 "expected-attribute-name-but-got-eof"}) 881 self.state = self.dataState 882 else: 883 self.currentToken["data"].append([data, ""]) 884 self.state = self.attributeNameState 885 return True 886 887 def attributeNameState(self): 888 data = self.stream.char() 889 leavingThisState = True 890 emitToken = False 891 if data == "=": 892 self.state = self.beforeAttributeValueState 893 elif data in asciiLetters: 894 self.currentToken["data"][-1][0] += data +\ 895 self.stream.charsUntil(asciiLetters, True) 896 leavingThisState = False 897 elif data == ">": 898 # XXX If we emit here the attributes are converted to a dict 899 # without being checked and when the code below runs we error 900 # because data is a dict not a list 901 emitToken = True 902 elif data in spaceCharacters: 903 self.state = self.afterAttributeNameState 904 elif data == "/": 905 self.state = self.selfClosingStartTagState 906 elif data == "\u0000": 907 self.tokenQueue.append({"type": tokenTypes["ParseError"], 908 "data": "invalid-codepoint"}) 909 self.currentToken["data"][-1][0] += "\uFFFD" 910 leavingThisState = False 911 elif data in ("'", '"', "<"): 912 self.tokenQueue.append({"type": tokenTypes["ParseError"], 913 "data": 914 "invalid-character-in-attribute-name"}) 915 self.currentToken["data"][-1][0] += data 916 leavingThisState = False 917 elif data is EOF: 918 self.tokenQueue.append({"type": tokenTypes["ParseError"], 919 "data": "eof-in-attribute-name"}) 920 self.state = self.dataState 921 else: 922 self.currentToken["data"][-1][0] += data 923 leavingThisState = False 924 925 if leavingThisState: 926 # Attributes are not dropped at this stage. That happens when the 927 # start tag token is emitted so values can still be safely appended 928 # to attributes, but we do want to report the parse error in time. 929 self.currentToken["data"][-1][0] = ( 930 self.currentToken["data"][-1][0].translate(asciiUpper2Lower)) 931 for name, _ in self.currentToken["data"][:-1]: 932 if self.currentToken["data"][-1][0] == name: 933 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 934 "duplicate-attribute"}) 935 break 936 # XXX Fix for above XXX 937 if emitToken: 938 self.emitCurrentToken() 939 return True 940 941 def afterAttributeNameState(self): 942 data = self.stream.char() 943 if data in spaceCharacters: 944 self.stream.charsUntil(spaceCharacters, True) 945 elif data == "=": 946 self.state = self.beforeAttributeValueState 947 elif data == ">": 948 self.emitCurrentToken() 949 elif data in asciiLetters: 950 self.currentToken["data"].append([data, ""]) 951 self.state = self.attributeNameState 952 elif data == "/": 953 self.state = self.selfClosingStartTagState 954 elif data == "\u0000": 955 self.tokenQueue.append({"type": tokenTypes["ParseError"], 956 "data": "invalid-codepoint"}) 957 self.currentToken["data"].append(["\uFFFD", ""]) 958 self.state = self.attributeNameState 959 elif data in ("'", '"', "<"): 960 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 961 "invalid-character-after-attribute-name"}) 962 self.currentToken["data"].append([data, ""]) 963 self.state = self.attributeNameState 964 elif data is EOF: 965 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 966 "expected-end-of-tag-but-got-eof"}) 967 self.state = self.dataState 968 else: 969 self.currentToken["data"].append([data, ""]) 970 self.state = self.attributeNameState 971 return True 972 973 def beforeAttributeValueState(self): 974 data = self.stream.char() 975 if data in spaceCharacters: 976 self.stream.charsUntil(spaceCharacters, True) 977 elif data == "\"": 978 self.state = self.attributeValueDoubleQuotedState 979 elif data == "&": 980 self.state = self.attributeValueUnQuotedState 981 self.stream.unget(data) 982 elif data == "'": 983 self.state = self.attributeValueSingleQuotedState 984 elif data == ">": 985 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 986 "expected-attribute-value-but-got-right-bracket"}) 987 self.emitCurrentToken() 988 elif data == "\u0000": 989 self.tokenQueue.append({"type": tokenTypes["ParseError"], 990 "data": "invalid-codepoint"}) 991 self.currentToken["data"][-1][1] += "\uFFFD" 992 self.state = self.attributeValueUnQuotedState 993 elif data in ("=", "<", "`"): 994 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 995 "equals-in-unquoted-attribute-value"}) 996 self.currentToken["data"][-1][1] += data 997 self.state = self.attributeValueUnQuotedState 998 elif data is EOF: 999 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1000 "expected-attribute-value-but-got-eof"}) 1001 self.state = self.dataState 1002 else: 1003 self.currentToken["data"][-1][1] += data 1004 self.state = self.attributeValueUnQuotedState 1005 return True 1006 1007 def attributeValueDoubleQuotedState(self): 1008 data = self.stream.char() 1009 if data == "\"": 1010 self.state = self.afterAttributeValueState 1011 elif data == "&": 1012 self.processEntityInAttribute('"') 1013 elif data == "\u0000": 1014 self.tokenQueue.append({"type": tokenTypes["ParseError"], 1015 "data": "invalid-codepoint"}) 1016 self.currentToken["data"][-1][1] += "\uFFFD" 1017 elif data is EOF: 1018 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1019 "eof-in-attribute-value-double-quote"}) 1020 self.state = self.dataState 1021 else: 1022 self.currentToken["data"][-1][1] += data +\ 1023 self.stream.charsUntil(("\"", "&", "\u0000")) 1024 return True 1025 1026 def attributeValueSingleQuotedState(self): 1027 data = self.stream.char() 1028 if data == "'": 1029 self.state = self.afterAttributeValueState 1030 elif data == "&": 1031 self.processEntityInAttribute("'") 1032 elif data == "\u0000": 1033 self.tokenQueue.append({"type": tokenTypes["ParseError"], 1034 "data": "invalid-codepoint"}) 1035 self.currentToken["data"][-1][1] += "\uFFFD" 1036 elif data is EOF: 1037 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1038 "eof-in-attribute-value-single-quote"}) 1039 self.state = self.dataState 1040 else: 1041 self.currentToken["data"][-1][1] += data +\ 1042 self.stream.charsUntil(("'", "&", "\u0000")) 1043 return True 1044 1045 def attributeValueUnQuotedState(self): 1046 data = self.stream.char() 1047 if data in spaceCharacters: 1048 self.state = self.beforeAttributeNameState 1049 elif data == "&": 1050 self.processEntityInAttribute(">") 1051 elif data == ">": 1052 self.emitCurrentToken() 1053 elif data in ('"', "'", "=", "<", "`"): 1054 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1055 "unexpected-character-in-unquoted-attribute-value"}) 1056 self.currentToken["data"][-1][1] += data 1057 elif data == "\u0000": 1058 self.tokenQueue.append({"type": tokenTypes["ParseError"], 1059 "data": "invalid-codepoint"}) 1060 self.currentToken["data"][-1][1] += "\uFFFD" 1061 elif data is EOF: 1062 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1063 "eof-in-attribute-value-no-quotes"}) 1064 self.state = self.dataState 1065 else: 1066 self.currentToken["data"][-1][1] += data + self.stream.charsUntil( 1067 frozenset(("&", ">", '"', "'", "=", "<", "`", "\u0000")) | spaceCharacters) 1068 return True 1069 1070 def afterAttributeValueState(self): 1071 data = self.stream.char() 1072 if data in spaceCharacters: 1073 self.state = self.beforeAttributeNameState 1074 elif data == ">": 1075 self.emitCurrentToken() 1076 elif data == "/": 1077 self.state = self.selfClosingStartTagState 1078 elif data is EOF: 1079 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1080 "unexpected-EOF-after-attribute-value"}) 1081 self.stream.unget(data) 1082 self.state = self.dataState 1083 else: 1084 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1085 "unexpected-character-after-attribute-value"}) 1086 self.stream.unget(data) 1087 self.state = self.beforeAttributeNameState 1088 return True 1089 1090 def selfClosingStartTagState(self): 1091 data = self.stream.char() 1092 if data == ">": 1093 self.currentToken["selfClosing"] = True 1094 self.emitCurrentToken() 1095 elif data is EOF: 1096 self.tokenQueue.append({"type": tokenTypes["ParseError"], 1097 "data": 1098 "unexpected-EOF-after-solidus-in-tag"}) 1099 self.stream.unget(data) 1100 self.state = self.dataState 1101 else: 1102 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1103 "unexpected-character-after-solidus-in-tag"}) 1104 self.stream.unget(data) 1105 self.state = self.beforeAttributeNameState 1106 return True 1107 1108 def bogusCommentState(self): 1109 # Make a new comment token and give it as value all the characters 1110 # until the first > or EOF (charsUntil checks for EOF automatically) 1111 # and emit it. 1112 data = self.stream.charsUntil(">") 1113 data = data.replace("\u0000", "\uFFFD") 1114 self.tokenQueue.append( 1115 {"type": tokenTypes["Comment"], "data": data}) 1116 1117 # Eat the character directly after the bogus comment which is either a 1118 # ">" or an EOF. 1119 self.stream.char() 1120 self.state = self.dataState 1121 return True 1122 1123 def markupDeclarationOpenState(self): 1124 charStack = [self.stream.char()] 1125 if charStack[-1] == "-": 1126 charStack.append(self.stream.char()) 1127 if charStack[-1] == "-": 1128 self.currentToken = {"type": tokenTypes["Comment"], "data": ""} 1129 self.state = self.commentStartState 1130 return True 1131 elif charStack[-1] in ('d', 'D'): 1132 matched = True 1133 for expected in (('o', 'O'), ('c', 'C'), ('t', 'T'), 1134 ('y', 'Y'), ('p', 'P'), ('e', 'E')): 1135 charStack.append(self.stream.char()) 1136 if charStack[-1] not in expected: 1137 matched = False 1138 break 1139 if matched: 1140 self.currentToken = {"type": tokenTypes["Doctype"], 1141 "name": "", 1142 "publicId": None, "systemId": None, 1143 "correct": True} 1144 self.state = self.doctypeState 1145 return True 1146 elif (charStack[-1] == "[" and 1147 self.parser is not None and 1148 self.parser.tree.openElements and 1149 self.parser.tree.openElements[-1].namespace != self.parser.tree.defaultNamespace): 1150 matched = True 1151 for expected in ["C", "D", "A", "T", "A", "["]: 1152 charStack.append(self.stream.char()) 1153 if charStack[-1] != expected: 1154 matched = False 1155 break 1156 if matched: 1157 self.state = self.cdataSectionState 1158 return True 1159 1160 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1161 "expected-dashes-or-doctype"}) 1162 1163 while charStack: 1164 self.stream.unget(charStack.pop()) 1165 self.state = self.bogusCommentState 1166 return True 1167 1168 def commentStartState(self): 1169 data = self.stream.char() 1170 if data == "-": 1171 self.state = self.commentStartDashState 1172 elif data == "\u0000": 1173 self.tokenQueue.append({"type": tokenTypes["ParseError"], 1174 "data": "invalid-codepoint"}) 1175 self.currentToken["data"] += "\uFFFD" 1176 elif data == ">": 1177 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1178 "incorrect-comment"}) 1179 self.tokenQueue.append(self.currentToken) 1180 self.state = self.dataState 1181 elif data is EOF: 1182 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1183 "eof-in-comment"}) 1184 self.tokenQueue.append(self.currentToken) 1185 self.state = self.dataState 1186 else: 1187 self.currentToken["data"] += data 1188 self.state = self.commentState 1189 return True 1190 1191 def commentStartDashState(self): 1192 data = self.stream.char() 1193 if data == "-": 1194 self.state = self.commentEndState 1195 elif data == "\u0000": 1196 self.tokenQueue.append({"type": tokenTypes["ParseError"], 1197 "data": "invalid-codepoint"}) 1198 self.currentToken["data"] += "-\uFFFD" 1199 elif data == ">": 1200 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1201 "incorrect-comment"}) 1202 self.tokenQueue.append(self.currentToken) 1203 self.state = self.dataState 1204 elif data is EOF: 1205 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1206 "eof-in-comment"}) 1207 self.tokenQueue.append(self.currentToken) 1208 self.state = self.dataState 1209 else: 1210 self.currentToken["data"] += "-" + data 1211 self.state = self.commentState 1212 return True 1213 1214 def commentState(self): 1215 data = self.stream.char() 1216 if data == "-": 1217 self.state = self.commentEndDashState 1218 elif data == "\u0000": 1219 self.tokenQueue.append({"type": tokenTypes["ParseError"], 1220 "data": "invalid-codepoint"}) 1221 self.currentToken["data"] += "\uFFFD" 1222 elif data is EOF: 1223 self.tokenQueue.append({"type": tokenTypes["ParseError"], 1224 "data": "eof-in-comment"}) 1225 self.tokenQueue.append(self.currentToken) 1226 self.state = self.dataState 1227 else: 1228 self.currentToken["data"] += data + \ 1229 self.stream.charsUntil(("-", "\u0000")) 1230 return True 1231 1232 def commentEndDashState(self): 1233 data = self.stream.char() 1234 if data == "-": 1235 self.state = self.commentEndState 1236 elif data == "\u0000": 1237 self.tokenQueue.append({"type": tokenTypes["ParseError"], 1238 "data": "invalid-codepoint"}) 1239 self.currentToken["data"] += "-\uFFFD" 1240 self.state = self.commentState 1241 elif data is EOF: 1242 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1243 "eof-in-comment-end-dash"}) 1244 self.tokenQueue.append(self.currentToken) 1245 self.state = self.dataState 1246 else: 1247 self.currentToken["data"] += "-" + data 1248 self.state = self.commentState 1249 return True 1250 1251 def commentEndState(self): 1252 data = self.stream.char() 1253 if data == ">": 1254 self.tokenQueue.append(self.currentToken) 1255 self.state = self.dataState 1256 elif data == "\u0000": 1257 self.tokenQueue.append({"type": tokenTypes["ParseError"], 1258 "data": "invalid-codepoint"}) 1259 self.currentToken["data"] += "--\uFFFD" 1260 self.state = self.commentState 1261 elif data == "!": 1262 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1263 "unexpected-bang-after-double-dash-in-comment"}) 1264 self.state = self.commentEndBangState 1265 elif data == "-": 1266 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1267 "unexpected-dash-after-double-dash-in-comment"}) 1268 self.currentToken["data"] += data 1269 elif data is EOF: 1270 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1271 "eof-in-comment-double-dash"}) 1272 self.tokenQueue.append(self.currentToken) 1273 self.state = self.dataState 1274 else: 1275 # XXX 1276 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1277 "unexpected-char-in-comment"}) 1278 self.currentToken["data"] += "--" + data 1279 self.state = self.commentState 1280 return True 1281 1282 def commentEndBangState(self): 1283 data = self.stream.char() 1284 if data == ">": 1285 self.tokenQueue.append(self.currentToken) 1286 self.state = self.dataState 1287 elif data == "-": 1288 self.currentToken["data"] += "--!" 1289 self.state = self.commentEndDashState 1290 elif data == "\u0000": 1291 self.tokenQueue.append({"type": tokenTypes["ParseError"], 1292 "data": "invalid-codepoint"}) 1293 self.currentToken["data"] += "--!\uFFFD" 1294 self.state = self.commentState 1295 elif data is EOF: 1296 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1297 "eof-in-comment-end-bang-state"}) 1298 self.tokenQueue.append(self.currentToken) 1299 self.state = self.dataState 1300 else: 1301 self.currentToken["data"] += "--!" + data 1302 self.state = self.commentState 1303 return True 1304 1305 def doctypeState(self): 1306 data = self.stream.char() 1307 if data in spaceCharacters: 1308 self.state = self.beforeDoctypeNameState 1309 elif data is EOF: 1310 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1311 "expected-doctype-name-but-got-eof"}) 1312 self.currentToken["correct"] = False 1313 self.tokenQueue.append(self.currentToken) 1314 self.state = self.dataState 1315 else: 1316 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1317 "need-space-after-doctype"}) 1318 self.stream.unget(data) 1319 self.state = self.beforeDoctypeNameState 1320 return True 1321 1322 def beforeDoctypeNameState(self): 1323 data = self.stream.char() 1324 if data in spaceCharacters: 1325 pass 1326 elif data == ">": 1327 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1328 "expected-doctype-name-but-got-right-bracket"}) 1329 self.currentToken["correct"] = False 1330 self.tokenQueue.append(self.currentToken) 1331 self.state = self.dataState 1332 elif data == "\u0000": 1333 self.tokenQueue.append({"type": tokenTypes["ParseError"], 1334 "data": "invalid-codepoint"}) 1335 self.currentToken["name"] = "\uFFFD" 1336 self.state = self.doctypeNameState 1337 elif data is EOF: 1338 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1339 "expected-doctype-name-but-got-eof"}) 1340 self.currentToken["correct"] = False 1341 self.tokenQueue.append(self.currentToken) 1342 self.state = self.dataState 1343 else: 1344 self.currentToken["name"] = data 1345 self.state = self.doctypeNameState 1346 return True 1347 1348 def doctypeNameState(self): 1349 data = self.stream.char() 1350 if data in spaceCharacters: 1351 self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower) 1352 self.state = self.afterDoctypeNameState 1353 elif data == ">": 1354 self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower) 1355 self.tokenQueue.append(self.currentToken) 1356 self.state = self.dataState 1357 elif data == "\u0000": 1358 self.tokenQueue.append({"type": tokenTypes["ParseError"], 1359 "data": "invalid-codepoint"}) 1360 self.currentToken["name"] += "\uFFFD" 1361 self.state = self.doctypeNameState 1362 elif data is EOF: 1363 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1364 "eof-in-doctype-name"}) 1365 self.currentToken["correct"] = False 1366 self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower) 1367 self.tokenQueue.append(self.currentToken) 1368 self.state = self.dataState 1369 else: 1370 self.currentToken["name"] += data 1371 return True 1372 1373 def afterDoctypeNameState(self): 1374 data = self.stream.char() 1375 if data in spaceCharacters: 1376 pass 1377 elif data == ">": 1378 self.tokenQueue.append(self.currentToken) 1379 self.state = self.dataState 1380 elif data is EOF: 1381 self.currentToken["correct"] = False 1382 self.stream.unget(data) 1383 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1384 "eof-in-doctype"}) 1385 self.tokenQueue.append(self.currentToken) 1386 self.state = self.dataState 1387 else: 1388 if data in ("p", "P"): 1389 matched = True 1390 for expected in (("u", "U"), ("b", "B"), ("l", "L"), 1391 ("i", "I"), ("c", "C")): 1392 data = self.stream.char() 1393 if data not in expected: 1394 matched = False 1395 break 1396 if matched: 1397 self.state = self.afterDoctypePublicKeywordState 1398 return True 1399 elif data in ("s", "S"): 1400 matched = True 1401 for expected in (("y", "Y"), ("s", "S"), ("t", "T"), 1402 ("e", "E"), ("m", "M")): 1403 data = self.stream.char() 1404 if data not in expected: 1405 matched = False 1406 break 1407 if matched: 1408 self.state = self.afterDoctypeSystemKeywordState 1409 return True 1410 1411 # All the characters read before the current 'data' will be 1412 # [a-zA-Z], so they're garbage in the bogus doctype and can be 1413 # discarded; only the latest character might be '>' or EOF 1414 # and needs to be ungetted 1415 self.stream.unget(data) 1416 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1417 "expected-space-or-right-bracket-in-doctype", "datavars": 1418 {"data": data}}) 1419 self.currentToken["correct"] = False 1420 self.state = self.bogusDoctypeState 1421 1422 return True 1423 1424 def afterDoctypePublicKeywordState(self): 1425 data = self.stream.char() 1426 if data in spaceCharacters: 1427 self.state = self.beforeDoctypePublicIdentifierState 1428 elif data in ("'", '"'): 1429 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1430 "unexpected-char-in-doctype"}) 1431 self.stream.unget(data) 1432 self.state = self.beforeDoctypePublicIdentifierState 1433 elif data is EOF: 1434 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1435 "eof-in-doctype"}) 1436 self.currentToken["correct"] = False 1437 self.tokenQueue.append(self.currentToken) 1438 self.state = self.dataState 1439 else: 1440 self.stream.unget(data) 1441 self.state = self.beforeDoctypePublicIdentifierState 1442 return True 1443 1444 def beforeDoctypePublicIdentifierState(self): 1445 data = self.stream.char() 1446 if data in spaceCharacters: 1447 pass 1448 elif data == "\"": 1449 self.currentToken["publicId"] = "" 1450 self.state = self.doctypePublicIdentifierDoubleQuotedState 1451 elif data == "'": 1452 self.currentToken["publicId"] = "" 1453 self.state = self.doctypePublicIdentifierSingleQuotedState 1454 elif data == ">": 1455 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1456 "unexpected-end-of-doctype"}) 1457 self.currentToken["correct"] = False 1458 self.tokenQueue.append(self.currentToken) 1459 self.state = self.dataState 1460 elif data is EOF: 1461 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1462 "eof-in-doctype"}) 1463 self.currentToken["correct"] = False 1464 self.tokenQueue.append(self.currentToken) 1465 self.state = self.dataState 1466 else: 1467 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1468 "unexpected-char-in-doctype"}) 1469 self.currentToken["correct"] = False 1470 self.state = self.bogusDoctypeState 1471 return True 1472 1473 def doctypePublicIdentifierDoubleQuotedState(self): 1474 data = self.stream.char() 1475 if data == "\"": 1476 self.state = self.afterDoctypePublicIdentifierState 1477 elif data == "\u0000": 1478 self.tokenQueue.append({"type": tokenTypes["ParseError"], 1479 "data": "invalid-codepoint"}) 1480 self.currentToken["publicId"] += "\uFFFD" 1481 elif data == ">": 1482 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1483 "unexpected-end-of-doctype"}) 1484 self.currentToken["correct"] = False 1485 self.tokenQueue.append(self.currentToken) 1486 self.state = self.dataState 1487 elif data is EOF: 1488 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1489 "eof-in-doctype"}) 1490 self.currentToken["correct"] = False 1491 self.tokenQueue.append(self.currentToken) 1492 self.state = self.dataState 1493 else: 1494 self.currentToken["publicId"] += data 1495 return True 1496 1497 def doctypePublicIdentifierSingleQuotedState(self): 1498 data = self.stream.char() 1499 if data == "'": 1500 self.state = self.afterDoctypePublicIdentifierState 1501 elif data == "\u0000": 1502 self.tokenQueue.append({"type": tokenTypes["ParseError"], 1503 "data": "invalid-codepoint"}) 1504 self.currentToken["publicId"] += "\uFFFD" 1505 elif data == ">": 1506 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1507 "unexpected-end-of-doctype"}) 1508 self.currentToken["correct"] = False 1509 self.tokenQueue.append(self.currentToken) 1510 self.state = self.dataState 1511 elif data is EOF: 1512 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1513 "eof-in-doctype"}) 1514 self.currentToken["correct"] = False 1515 self.tokenQueue.append(self.currentToken) 1516 self.state = self.dataState 1517 else: 1518 self.currentToken["publicId"] += data 1519 return True 1520 1521 def afterDoctypePublicIdentifierState(self): 1522 data = self.stream.char() 1523 if data in spaceCharacters: 1524 self.state = self.betweenDoctypePublicAndSystemIdentifiersState 1525 elif data == ">": 1526 self.tokenQueue.append(self.currentToken) 1527 self.state = self.dataState 1528 elif data == '"': 1529 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1530 "unexpected-char-in-doctype"}) 1531 self.currentToken["systemId"] = "" 1532 self.state = self.doctypeSystemIdentifierDoubleQuotedState 1533 elif data == "'": 1534 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1535 "unexpected-char-in-doctype"}) 1536 self.currentToken["systemId"] = "" 1537 self.state = self.doctypeSystemIdentifierSingleQuotedState 1538 elif data is EOF: 1539 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1540 "eof-in-doctype"}) 1541 self.currentToken["correct"] = False 1542 self.tokenQueue.append(self.currentToken) 1543 self.state = self.dataState 1544 else: 1545 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1546 "unexpected-char-in-doctype"}) 1547 self.currentToken["correct"] = False 1548 self.state = self.bogusDoctypeState 1549 return True 1550 1551 def betweenDoctypePublicAndSystemIdentifiersState(self): 1552 data = self.stream.char() 1553 if data in spaceCharacters: 1554 pass 1555 elif data == ">": 1556 self.tokenQueue.append(self.currentToken) 1557 self.state = self.dataState 1558 elif data == '"': 1559 self.currentToken["systemId"] = "" 1560 self.state = self.doctypeSystemIdentifierDoubleQuotedState 1561 elif data == "'": 1562 self.currentToken["systemId"] = "" 1563 self.state = self.doctypeSystemIdentifierSingleQuotedState 1564 elif data == EOF: 1565 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1566 "eof-in-doctype"}) 1567 self.currentToken["correct"] = False 1568 self.tokenQueue.append(self.currentToken) 1569 self.state = self.dataState 1570 else: 1571 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1572 "unexpected-char-in-doctype"}) 1573 self.currentToken["correct"] = False 1574 self.state = self.bogusDoctypeState 1575 return True 1576 1577 def afterDoctypeSystemKeywordState(self): 1578 data = self.stream.char() 1579 if data in spaceCharacters: 1580 self.state = self.beforeDoctypeSystemIdentifierState 1581 elif data in ("'", '"'): 1582 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1583 "unexpected-char-in-doctype"}) 1584 self.stream.unget(data) 1585 self.state = self.beforeDoctypeSystemIdentifierState 1586 elif data is EOF: 1587 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1588 "eof-in-doctype"}) 1589 self.currentToken["correct"] = False 1590 self.tokenQueue.append(self.currentToken) 1591 self.state = self.dataState 1592 else: 1593 self.stream.unget(data) 1594 self.state = self.beforeDoctypeSystemIdentifierState 1595 return True 1596 1597 def beforeDoctypeSystemIdentifierState(self): 1598 data = self.stream.char() 1599 if data in spaceCharacters: 1600 pass 1601 elif data == "\"": 1602 self.currentToken["systemId"] = "" 1603 self.state = self.doctypeSystemIdentifierDoubleQuotedState 1604 elif data == "'": 1605 self.currentToken["systemId"] = "" 1606 self.state = self.doctypeSystemIdentifierSingleQuotedState 1607 elif data == ">": 1608 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1609 "unexpected-char-in-doctype"}) 1610 self.currentToken["correct"] = False 1611 self.tokenQueue.append(self.currentToken) 1612 self.state = self.dataState 1613 elif data is EOF: 1614 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1615 "eof-in-doctype"}) 1616 self.currentToken["correct"] = False 1617 self.tokenQueue.append(self.currentToken) 1618 self.state = self.dataState 1619 else: 1620 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1621 "unexpected-char-in-doctype"}) 1622 self.currentToken["correct"] = False 1623 self.state = self.bogusDoctypeState 1624 return True 1625 1626 def doctypeSystemIdentifierDoubleQuotedState(self): 1627 data = self.stream.char() 1628 if data == "\"": 1629 self.state = self.afterDoctypeSystemIdentifierState 1630 elif data == "\u0000": 1631 self.tokenQueue.append({"type": tokenTypes["ParseError"], 1632 "data": "invalid-codepoint"}) 1633 self.currentToken["systemId"] += "\uFFFD" 1634 elif data == ">": 1635 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1636 "unexpected-end-of-doctype"}) 1637 self.currentToken["correct"] = False 1638 self.tokenQueue.append(self.currentToken) 1639 self.state = self.dataState 1640 elif data is EOF: 1641 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1642 "eof-in-doctype"}) 1643 self.currentToken["correct"] = False 1644 self.tokenQueue.append(self.currentToken) 1645 self.state = self.dataState 1646 else: 1647 self.currentToken["systemId"] += data 1648 return True 1649 1650 def doctypeSystemIdentifierSingleQuotedState(self): 1651 data = self.stream.char() 1652 if data == "'": 1653 self.state = self.afterDoctypeSystemIdentifierState 1654 elif data == "\u0000": 1655 self.tokenQueue.append({"type": tokenTypes["ParseError"], 1656 "data": "invalid-codepoint"}) 1657 self.currentToken["systemId"] += "\uFFFD" 1658 elif data == ">": 1659 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1660 "unexpected-end-of-doctype"}) 1661 self.currentToken["correct"] = False 1662 self.tokenQueue.append(self.currentToken) 1663 self.state = self.dataState 1664 elif data is EOF: 1665 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1666 "eof-in-doctype"}) 1667 self.currentToken["correct"] = False 1668 self.tokenQueue.append(self.currentToken) 1669 self.state = self.dataState 1670 else: 1671 self.currentToken["systemId"] += data 1672 return True 1673 1674 def afterDoctypeSystemIdentifierState(self): 1675 data = self.stream.char() 1676 if data in spaceCharacters: 1677 pass 1678 elif data == ">": 1679 self.tokenQueue.append(self.currentToken) 1680 self.state = self.dataState 1681 elif data is EOF: 1682 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1683 "eof-in-doctype"}) 1684 self.currentToken["correct"] = False 1685 self.tokenQueue.append(self.currentToken) 1686 self.state = self.dataState 1687 else: 1688 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1689 "unexpected-char-in-doctype"}) 1690 self.state = self.bogusDoctypeState 1691 return True 1692 1693 def bogusDoctypeState(self): 1694 data = self.stream.char() 1695 if data == ">": 1696 self.tokenQueue.append(self.currentToken) 1697 self.state = self.dataState 1698 elif data is EOF: 1699 # XXX EMIT 1700 self.stream.unget(data) 1701 self.tokenQueue.append(self.currentToken) 1702 self.state = self.dataState 1703 else: 1704 pass 1705 return True 1706 1707 def cdataSectionState(self): 1708 data = [] 1709 while True: 1710 data.append(self.stream.charsUntil("]")) 1711 data.append(self.stream.charsUntil(">")) 1712 char = self.stream.char() 1713 if char == EOF: 1714 break 1715 else: 1716 assert char == ">" 1717 if data[-1][-2:] == "]]": 1718 data[-1] = data[-1][:-2] 1719 break 1720 else: 1721 data.append(char) 1722 1723 data = "".join(data) # pylint:disable=redefined-variable-type 1724 # Deal with null here rather than in the parser 1725 nullCount = data.count("\u0000") 1726 if nullCount > 0: 1727 for _ in range(nullCount): 1728 self.tokenQueue.append({"type": tokenTypes["ParseError"], 1729 "data": "invalid-codepoint"}) 1730 data = data.replace("\u0000", "\uFFFD") 1731 if data: 1732 self.tokenQueue.append({"type": tokenTypes["Characters"], 1733 "data": data}) 1734 self.state = self.dataState 1735 return True 1736