1from __future__ import absolute_import, division, unicode_literals 2 3from pip9._vendor.six import unichr as chr 4 5from collections import deque 6 7from .constants import spaceCharacters 8from .constants import entities 9from .constants import asciiLetters, asciiUpper2Lower 10from .constants import digits, hexDigits, EOF 11from .constants import tokenTypes, tagTokenTypes 12from .constants import replacementCharacters 13 14from ._inputstream import HTMLInputStream 15 16from ._trie import Trie 17 18entitiesTrie = Trie(entities) 19 20 21class HTMLTokenizer(object): 22 """ This class takes care of tokenizing HTML. 23 24 * self.currentToken 25 Holds the token that is currently being processed. 26 27 * self.state 28 Holds a reference to the method to be invoked... XXX 29 30 * self.stream 31 Points to HTMLInputStream object. 32 """ 33 34 def __init__(self, stream, parser=None, **kwargs): 35 36 self.stream = HTMLInputStream(stream, **kwargs) 37 self.parser = parser 38 39 # Setup the initial tokenizer state 40 self.escapeFlag = False 41 self.lastFourChars = [] 42 self.state = self.dataState 43 self.escape = False 44 45 # The current token being created 46 self.currentToken = None 47 super(HTMLTokenizer, self).__init__() 48 49 def __iter__(self): 50 """ This is where the magic happens. 51 52 We do our usually processing through the states and when we have a token 53 to return we yield the token which pauses processing until the next token 54 is requested. 55 """ 56 self.tokenQueue = deque([]) 57 # Start processing. When EOF is reached self.state will return False 58 # instead of True and the loop will terminate. 59 while self.state(): 60 while self.stream.errors: 61 yield {"type": tokenTypes["ParseError"], "data": self.stream.errors.pop(0)} 62 while self.tokenQueue: 63 yield self.tokenQueue.popleft() 64 65 def consumeNumberEntity(self, isHex): 66 """This function returns either U+FFFD or the character based on the 67 decimal or hexadecimal representation. It also discards ";" if present. 68 If not present self.tokenQueue.append({"type": tokenTypes["ParseError"]}) is invoked. 69 """ 70 71 allowed = digits 72 radix = 10 73 if isHex: 74 allowed = hexDigits 75 radix = 16 76 77 charStack = [] 78 79 # Consume all the characters that are in range while making sure we 80 # don't hit an EOF. 81 c = self.stream.char() 82 while c in allowed and c is not EOF: 83 charStack.append(c) 84 c = self.stream.char() 85 86 # Convert the set of characters consumed to an int. 87 charAsInt = int("".join(charStack), radix) 88 89 # Certain characters get replaced with others 90 if charAsInt in replacementCharacters: 91 char = replacementCharacters[charAsInt] 92 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 93 "illegal-codepoint-for-numeric-entity", 94 "datavars": {"charAsInt": charAsInt}}) 95 elif ((0xD800 <= charAsInt <= 0xDFFF) or 96 (charAsInt > 0x10FFFF)): 97 char = "\uFFFD" 98 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 99 "illegal-codepoint-for-numeric-entity", 100 "datavars": {"charAsInt": charAsInt}}) 101 else: 102 # Should speed up this check somehow (e.g. move the set to a constant) 103 if ((0x0001 <= charAsInt <= 0x0008) or 104 (0x000E <= charAsInt <= 0x001F) or 105 (0x007F <= charAsInt <= 0x009F) or 106 (0xFDD0 <= charAsInt <= 0xFDEF) or 107 charAsInt in frozenset([0x000B, 0xFFFE, 0xFFFF, 0x1FFFE, 108 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 109 0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 110 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE, 111 0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 112 0x9FFFF, 0xAFFFE, 0xAFFFF, 0xBFFFE, 113 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 114 0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 115 0xFFFFF, 0x10FFFE, 0x10FFFF])): 116 self.tokenQueue.append({"type": tokenTypes["ParseError"], 117 "data": 118 "illegal-codepoint-for-numeric-entity", 119 "datavars": {"charAsInt": charAsInt}}) 120 try: 121 # Try/except needed as UCS-2 Python builds' unichar only works 122 # within the BMP. 123 char = chr(charAsInt) 124 except ValueError: 125 v = charAsInt - 0x10000 126 char = chr(0xD800 | (v >> 10)) + chr(0xDC00 | (v & 0x3FF)) 127 128 # Discard the ; if present. Otherwise, put it back on the queue and 129 # invoke parseError on parser. 130 if c != ";": 131 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 132 "numeric-entity-without-semicolon"}) 133 self.stream.unget(c) 134 135 return char 136 137 def consumeEntity(self, allowedChar=None, fromAttribute=False): 138 # Initialise to the default output for when no entity is matched 139 output = "&" 140 141 charStack = [self.stream.char()] 142 if (charStack[0] in spaceCharacters or charStack[0] in (EOF, "<", "&") or 143 (allowedChar is not None and allowedChar == charStack[0])): 144 self.stream.unget(charStack[0]) 145 146 elif charStack[0] == "#": 147 # Read the next character to see if it's hex or decimal 148 hex = False 149 charStack.append(self.stream.char()) 150 if charStack[-1] in ("x", "X"): 151 hex = True 152 charStack.append(self.stream.char()) 153 154 # charStack[-1] should be the first digit 155 if (hex and charStack[-1] in hexDigits) \ 156 or (not hex and charStack[-1] in digits): 157 # At least one digit found, so consume the whole number 158 self.stream.unget(charStack[-1]) 159 output = self.consumeNumberEntity(hex) 160 else: 161 # No digits found 162 self.tokenQueue.append({"type": tokenTypes["ParseError"], 163 "data": "expected-numeric-entity"}) 164 self.stream.unget(charStack.pop()) 165 output = "&" + "".join(charStack) 166 167 else: 168 # At this point in the process might have named entity. Entities 169 # are stored in the global variable "entities". 170 # 171 # Consume characters and compare to these to a substring of the 172 # entity names in the list until the substring no longer matches. 173 while (charStack[-1] is not EOF): 174 if not entitiesTrie.has_keys_with_prefix("".join(charStack)): 175 break 176 charStack.append(self.stream.char()) 177 178 # At this point we have a string that starts with some characters 179 # that may match an entity 180 # Try to find the longest entity the string will match to take care 181 # of ¬i for instance. 182 try: 183 entityName = entitiesTrie.longest_prefix("".join(charStack[:-1])) 184 entityLength = len(entityName) 185 except KeyError: 186 entityName = None 187 188 if entityName is not None: 189 if entityName[-1] != ";": 190 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 191 "named-entity-without-semicolon"}) 192 if (entityName[-1] != ";" and fromAttribute and 193 (charStack[entityLength] in asciiLetters or 194 charStack[entityLength] in digits or 195 charStack[entityLength] == "=")): 196 self.stream.unget(charStack.pop()) 197 output = "&" + "".join(charStack) 198 else: 199 output = entities[entityName] 200 self.stream.unget(charStack.pop()) 201 output += "".join(charStack[entityLength:]) 202 else: 203 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 204 "expected-named-entity"}) 205 self.stream.unget(charStack.pop()) 206 output = "&" + "".join(charStack) 207 208 if fromAttribute: 209 self.currentToken["data"][-1][1] += output 210 else: 211 if output in spaceCharacters: 212 tokenType = "SpaceCharacters" 213 else: 214 tokenType = "Characters" 215 self.tokenQueue.append({"type": tokenTypes[tokenType], "data": output}) 216 217 def processEntityInAttribute(self, allowedChar): 218 """This method replaces the need for "entityInAttributeValueState". 219 """ 220 self.consumeEntity(allowedChar=allowedChar, fromAttribute=True) 221 222 def emitCurrentToken(self): 223 """This method is a generic handler for emitting the tags. It also sets 224 the state to "data" because that's what's needed after a token has been 225 emitted. 226 """ 227 token = self.currentToken 228 # Add token to the queue to be yielded 229 if (token["type"] in tagTokenTypes): 230 token["name"] = token["name"].translate(asciiUpper2Lower) 231 if token["type"] == tokenTypes["EndTag"]: 232 if token["data"]: 233 self.tokenQueue.append({"type": tokenTypes["ParseError"], 234 "data": "attributes-in-end-tag"}) 235 if token["selfClosing"]: 236 self.tokenQueue.append({"type": tokenTypes["ParseError"], 237 "data": "self-closing-flag-on-end-tag"}) 238 self.tokenQueue.append(token) 239 self.state = self.dataState 240 241 # Below are the various tokenizer states worked out. 242 def dataState(self): 243 data = self.stream.char() 244 if data == "&": 245 self.state = self.entityDataState 246 elif data == "<": 247 self.state = self.tagOpenState 248 elif data == "\u0000": 249 self.tokenQueue.append({"type": tokenTypes["ParseError"], 250 "data": "invalid-codepoint"}) 251 self.tokenQueue.append({"type": tokenTypes["Characters"], 252 "data": "\u0000"}) 253 elif data is EOF: 254 # Tokenization ends. 255 return False 256 elif data in spaceCharacters: 257 # Directly after emitting a token you switch back to the "data 258 # state". At that point spaceCharacters are important so they are 259 # emitted separately. 260 self.tokenQueue.append({"type": tokenTypes["SpaceCharacters"], "data": 261 data + self.stream.charsUntil(spaceCharacters, True)}) 262 # No need to update lastFourChars here, since the first space will 263 # have already been appended to lastFourChars and will have broken 264 # any <!-- or --> sequences 265 else: 266 chars = self.stream.charsUntil(("&", "<", "\u0000")) 267 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": 268 data + chars}) 269 return True 270 271 def entityDataState(self): 272 self.consumeEntity() 273 self.state = self.dataState 274 return True 275 276 def rcdataState(self): 277 data = self.stream.char() 278 if data == "&": 279 self.state = self.characterReferenceInRcdata 280 elif data == "<": 281 self.state = self.rcdataLessThanSignState 282 elif data == EOF: 283 # Tokenization ends. 284 return False 285 elif data == "\u0000": 286 self.tokenQueue.append({"type": tokenTypes["ParseError"], 287 "data": "invalid-codepoint"}) 288 self.tokenQueue.append({"type": tokenTypes["Characters"], 289 "data": "\uFFFD"}) 290 elif data in spaceCharacters: 291 # Directly after emitting a token you switch back to the "data 292 # state". At that point spaceCharacters are important so they are 293 # emitted separately. 294 self.tokenQueue.append({"type": tokenTypes["SpaceCharacters"], "data": 295 data + self.stream.charsUntil(spaceCharacters, True)}) 296 # No need to update lastFourChars here, since the first space will 297 # have already been appended to lastFourChars and will have broken 298 # any <!-- or --> sequences 299 else: 300 chars = self.stream.charsUntil(("&", "<", "\u0000")) 301 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": 302 data + chars}) 303 return True 304 305 def characterReferenceInRcdata(self): 306 self.consumeEntity() 307 self.state = self.rcdataState 308 return True 309 310 def rawtextState(self): 311 data = self.stream.char() 312 if data == "<": 313 self.state = self.rawtextLessThanSignState 314 elif data == "\u0000": 315 self.tokenQueue.append({"type": tokenTypes["ParseError"], 316 "data": "invalid-codepoint"}) 317 self.tokenQueue.append({"type": tokenTypes["Characters"], 318 "data": "\uFFFD"}) 319 elif data == EOF: 320 # Tokenization ends. 321 return False 322 else: 323 chars = self.stream.charsUntil(("<", "\u0000")) 324 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": 325 data + chars}) 326 return True 327 328 def scriptDataState(self): 329 data = self.stream.char() 330 if data == "<": 331 self.state = self.scriptDataLessThanSignState 332 elif data == "\u0000": 333 self.tokenQueue.append({"type": tokenTypes["ParseError"], 334 "data": "invalid-codepoint"}) 335 self.tokenQueue.append({"type": tokenTypes["Characters"], 336 "data": "\uFFFD"}) 337 elif data == EOF: 338 # Tokenization ends. 339 return False 340 else: 341 chars = self.stream.charsUntil(("<", "\u0000")) 342 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": 343 data + chars}) 344 return True 345 346 def plaintextState(self): 347 data = self.stream.char() 348 if data == EOF: 349 # Tokenization ends. 350 return False 351 elif data == "\u0000": 352 self.tokenQueue.append({"type": tokenTypes["ParseError"], 353 "data": "invalid-codepoint"}) 354 self.tokenQueue.append({"type": tokenTypes["Characters"], 355 "data": "\uFFFD"}) 356 else: 357 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": 358 data + self.stream.charsUntil("\u0000")}) 359 return True 360 361 def tagOpenState(self): 362 data = self.stream.char() 363 if data == "!": 364 self.state = self.markupDeclarationOpenState 365 elif data == "/": 366 self.state = self.closeTagOpenState 367 elif data in asciiLetters: 368 self.currentToken = {"type": tokenTypes["StartTag"], 369 "name": data, "data": [], 370 "selfClosing": False, 371 "selfClosingAcknowledged": False} 372 self.state = self.tagNameState 373 elif data == ">": 374 # XXX In theory it could be something besides a tag name. But 375 # do we really care? 376 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 377 "expected-tag-name-but-got-right-bracket"}) 378 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<>"}) 379 self.state = self.dataState 380 elif data == "?": 381 # XXX In theory it could be something besides a tag name. But 382 # do we really care? 383 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 384 "expected-tag-name-but-got-question-mark"}) 385 self.stream.unget(data) 386 self.state = self.bogusCommentState 387 else: 388 # XXX 389 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 390 "expected-tag-name"}) 391 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"}) 392 self.stream.unget(data) 393 self.state = self.dataState 394 return True 395 396 def closeTagOpenState(self): 397 data = self.stream.char() 398 if data in asciiLetters: 399 self.currentToken = {"type": tokenTypes["EndTag"], "name": data, 400 "data": [], "selfClosing": False} 401 self.state = self.tagNameState 402 elif data == ">": 403 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 404 "expected-closing-tag-but-got-right-bracket"}) 405 self.state = self.dataState 406 elif data is EOF: 407 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 408 "expected-closing-tag-but-got-eof"}) 409 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"}) 410 self.state = self.dataState 411 else: 412 # XXX data can be _'_... 413 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 414 "expected-closing-tag-but-got-char", 415 "datavars": {"data": data}}) 416 self.stream.unget(data) 417 self.state = self.bogusCommentState 418 return True 419 420 def tagNameState(self): 421 data = self.stream.char() 422 if data in spaceCharacters: 423 self.state = self.beforeAttributeNameState 424 elif data == ">": 425 self.emitCurrentToken() 426 elif data is EOF: 427 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 428 "eof-in-tag-name"}) 429 self.state = self.dataState 430 elif data == "/": 431 self.state = self.selfClosingStartTagState 432 elif data == "\u0000": 433 self.tokenQueue.append({"type": tokenTypes["ParseError"], 434 "data": "invalid-codepoint"}) 435 self.currentToken["name"] += "\uFFFD" 436 else: 437 self.currentToken["name"] += data 438 # (Don't use charsUntil here, because tag names are 439 # very short and it's faster to not do anything fancy) 440 return True 441 442 def rcdataLessThanSignState(self): 443 data = self.stream.char() 444 if data == "/": 445 self.temporaryBuffer = "" 446 self.state = self.rcdataEndTagOpenState 447 else: 448 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"}) 449 self.stream.unget(data) 450 self.state = self.rcdataState 451 return True 452 453 def rcdataEndTagOpenState(self): 454 data = self.stream.char() 455 if data in asciiLetters: 456 self.temporaryBuffer += data 457 self.state = self.rcdataEndTagNameState 458 else: 459 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"}) 460 self.stream.unget(data) 461 self.state = self.rcdataState 462 return True 463 464 def rcdataEndTagNameState(self): 465 appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower() 466 data = self.stream.char() 467 if data in spaceCharacters and appropriate: 468 self.currentToken = {"type": tokenTypes["EndTag"], 469 "name": self.temporaryBuffer, 470 "data": [], "selfClosing": False} 471 self.state = self.beforeAttributeNameState 472 elif data == "/" and appropriate: 473 self.currentToken = {"type": tokenTypes["EndTag"], 474 "name": self.temporaryBuffer, 475 "data": [], "selfClosing": False} 476 self.state = self.selfClosingStartTagState 477 elif data == ">" and appropriate: 478 self.currentToken = {"type": tokenTypes["EndTag"], 479 "name": self.temporaryBuffer, 480 "data": [], "selfClosing": False} 481 self.emitCurrentToken() 482 self.state = self.dataState 483 elif data in asciiLetters: 484 self.temporaryBuffer += data 485 else: 486 self.tokenQueue.append({"type": tokenTypes["Characters"], 487 "data": "</" + self.temporaryBuffer}) 488 self.stream.unget(data) 489 self.state = self.rcdataState 490 return True 491 492 def rawtextLessThanSignState(self): 493 data = self.stream.char() 494 if data == "/": 495 self.temporaryBuffer = "" 496 self.state = self.rawtextEndTagOpenState 497 else: 498 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"}) 499 self.stream.unget(data) 500 self.state = self.rawtextState 501 return True 502 503 def rawtextEndTagOpenState(self): 504 data = self.stream.char() 505 if data in asciiLetters: 506 self.temporaryBuffer += data 507 self.state = self.rawtextEndTagNameState 508 else: 509 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"}) 510 self.stream.unget(data) 511 self.state = self.rawtextState 512 return True 513 514 def rawtextEndTagNameState(self): 515 appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower() 516 data = self.stream.char() 517 if data in spaceCharacters and appropriate: 518 self.currentToken = {"type": tokenTypes["EndTag"], 519 "name": self.temporaryBuffer, 520 "data": [], "selfClosing": False} 521 self.state = self.beforeAttributeNameState 522 elif data == "/" and appropriate: 523 self.currentToken = {"type": tokenTypes["EndTag"], 524 "name": self.temporaryBuffer, 525 "data": [], "selfClosing": False} 526 self.state = self.selfClosingStartTagState 527 elif data == ">" and appropriate: 528 self.currentToken = {"type": tokenTypes["EndTag"], 529 "name": self.temporaryBuffer, 530 "data": [], "selfClosing": False} 531 self.emitCurrentToken() 532 self.state = self.dataState 533 elif data in asciiLetters: 534 self.temporaryBuffer += data 535 else: 536 self.tokenQueue.append({"type": tokenTypes["Characters"], 537 "data": "</" + self.temporaryBuffer}) 538 self.stream.unget(data) 539 self.state = self.rawtextState 540 return True 541 542 def scriptDataLessThanSignState(self): 543 data = self.stream.char() 544 if data == "/": 545 self.temporaryBuffer = "" 546 self.state = self.scriptDataEndTagOpenState 547 elif data == "!": 548 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<!"}) 549 self.state = self.scriptDataEscapeStartState 550 else: 551 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"}) 552 self.stream.unget(data) 553 self.state = self.scriptDataState 554 return True 555 556 def scriptDataEndTagOpenState(self): 557 data = self.stream.char() 558 if data in asciiLetters: 559 self.temporaryBuffer += data 560 self.state = self.scriptDataEndTagNameState 561 else: 562 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"}) 563 self.stream.unget(data) 564 self.state = self.scriptDataState 565 return True 566 567 def scriptDataEndTagNameState(self): 568 appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower() 569 data = self.stream.char() 570 if data in spaceCharacters and appropriate: 571 self.currentToken = {"type": tokenTypes["EndTag"], 572 "name": self.temporaryBuffer, 573 "data": [], "selfClosing": False} 574 self.state = self.beforeAttributeNameState 575 elif data == "/" and appropriate: 576 self.currentToken = {"type": tokenTypes["EndTag"], 577 "name": self.temporaryBuffer, 578 "data": [], "selfClosing": False} 579 self.state = self.selfClosingStartTagState 580 elif data == ">" and appropriate: 581 self.currentToken = {"type": tokenTypes["EndTag"], 582 "name": self.temporaryBuffer, 583 "data": [], "selfClosing": False} 584 self.emitCurrentToken() 585 self.state = self.dataState 586 elif data in asciiLetters: 587 self.temporaryBuffer += data 588 else: 589 self.tokenQueue.append({"type": tokenTypes["Characters"], 590 "data": "</" + self.temporaryBuffer}) 591 self.stream.unget(data) 592 self.state = self.scriptDataState 593 return True 594 595 def scriptDataEscapeStartState(self): 596 data = self.stream.char() 597 if data == "-": 598 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"}) 599 self.state = self.scriptDataEscapeStartDashState 600 else: 601 self.stream.unget(data) 602 self.state = self.scriptDataState 603 return True 604 605 def scriptDataEscapeStartDashState(self): 606 data = self.stream.char() 607 if data == "-": 608 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"}) 609 self.state = self.scriptDataEscapedDashDashState 610 else: 611 self.stream.unget(data) 612 self.state = self.scriptDataState 613 return True 614 615 def scriptDataEscapedState(self): 616 data = self.stream.char() 617 if data == "-": 618 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"}) 619 self.state = self.scriptDataEscapedDashState 620 elif data == "<": 621 self.state = self.scriptDataEscapedLessThanSignState 622 elif data == "\u0000": 623 self.tokenQueue.append({"type": tokenTypes["ParseError"], 624 "data": "invalid-codepoint"}) 625 self.tokenQueue.append({"type": tokenTypes["Characters"], 626 "data": "\uFFFD"}) 627 elif data == EOF: 628 self.state = self.dataState 629 else: 630 chars = self.stream.charsUntil(("<", "-", "\u0000")) 631 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": 632 data + chars}) 633 return True 634 635 def scriptDataEscapedDashState(self): 636 data = self.stream.char() 637 if data == "-": 638 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"}) 639 self.state = self.scriptDataEscapedDashDashState 640 elif data == "<": 641 self.state = self.scriptDataEscapedLessThanSignState 642 elif data == "\u0000": 643 self.tokenQueue.append({"type": tokenTypes["ParseError"], 644 "data": "invalid-codepoint"}) 645 self.tokenQueue.append({"type": tokenTypes["Characters"], 646 "data": "\uFFFD"}) 647 self.state = self.scriptDataEscapedState 648 elif data == EOF: 649 self.state = self.dataState 650 else: 651 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) 652 self.state = self.scriptDataEscapedState 653 return True 654 655 def scriptDataEscapedDashDashState(self): 656 data = self.stream.char() 657 if data == "-": 658 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"}) 659 elif data == "<": 660 self.state = self.scriptDataEscapedLessThanSignState 661 elif data == ">": 662 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": ">"}) 663 self.state = self.scriptDataState 664 elif data == "\u0000": 665 self.tokenQueue.append({"type": tokenTypes["ParseError"], 666 "data": "invalid-codepoint"}) 667 self.tokenQueue.append({"type": tokenTypes["Characters"], 668 "data": "\uFFFD"}) 669 self.state = self.scriptDataEscapedState 670 elif data == EOF: 671 self.state = self.dataState 672 else: 673 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) 674 self.state = self.scriptDataEscapedState 675 return True 676 677 def scriptDataEscapedLessThanSignState(self): 678 data = self.stream.char() 679 if data == "/": 680 self.temporaryBuffer = "" 681 self.state = self.scriptDataEscapedEndTagOpenState 682 elif data in asciiLetters: 683 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<" + data}) 684 self.temporaryBuffer = data 685 self.state = self.scriptDataDoubleEscapeStartState 686 else: 687 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"}) 688 self.stream.unget(data) 689 self.state = self.scriptDataEscapedState 690 return True 691 692 def scriptDataEscapedEndTagOpenState(self): 693 data = self.stream.char() 694 if data in asciiLetters: 695 self.temporaryBuffer = data 696 self.state = self.scriptDataEscapedEndTagNameState 697 else: 698 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"}) 699 self.stream.unget(data) 700 self.state = self.scriptDataEscapedState 701 return True 702 703 def scriptDataEscapedEndTagNameState(self): 704 appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower() 705 data = self.stream.char() 706 if data in spaceCharacters and appropriate: 707 self.currentToken = {"type": tokenTypes["EndTag"], 708 "name": self.temporaryBuffer, 709 "data": [], "selfClosing": False} 710 self.state = self.beforeAttributeNameState 711 elif data == "/" and appropriate: 712 self.currentToken = {"type": tokenTypes["EndTag"], 713 "name": self.temporaryBuffer, 714 "data": [], "selfClosing": False} 715 self.state = self.selfClosingStartTagState 716 elif data == ">" and appropriate: 717 self.currentToken = {"type": tokenTypes["EndTag"], 718 "name": self.temporaryBuffer, 719 "data": [], "selfClosing": False} 720 self.emitCurrentToken() 721 self.state = self.dataState 722 elif data in asciiLetters: 723 self.temporaryBuffer += data 724 else: 725 self.tokenQueue.append({"type": tokenTypes["Characters"], 726 "data": "</" + self.temporaryBuffer}) 727 self.stream.unget(data) 728 self.state = self.scriptDataEscapedState 729 return True 730 731 def scriptDataDoubleEscapeStartState(self): 732 data = self.stream.char() 733 if data in (spaceCharacters | frozenset(("/", ">"))): 734 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) 735 if self.temporaryBuffer.lower() == "script": 736 self.state = self.scriptDataDoubleEscapedState 737 else: 738 self.state = self.scriptDataEscapedState 739 elif data in asciiLetters: 740 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) 741 self.temporaryBuffer += data 742 else: 743 self.stream.unget(data) 744 self.state = self.scriptDataEscapedState 745 return True 746 747 def scriptDataDoubleEscapedState(self): 748 data = self.stream.char() 749 if data == "-": 750 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"}) 751 self.state = self.scriptDataDoubleEscapedDashState 752 elif data == "<": 753 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"}) 754 self.state = self.scriptDataDoubleEscapedLessThanSignState 755 elif data == "\u0000": 756 self.tokenQueue.append({"type": tokenTypes["ParseError"], 757 "data": "invalid-codepoint"}) 758 self.tokenQueue.append({"type": tokenTypes["Characters"], 759 "data": "\uFFFD"}) 760 elif data == EOF: 761 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 762 "eof-in-script-in-script"}) 763 self.state = self.dataState 764 else: 765 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) 766 return True 767 768 def scriptDataDoubleEscapedDashState(self): 769 data = self.stream.char() 770 if data == "-": 771 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"}) 772 self.state = self.scriptDataDoubleEscapedDashDashState 773 elif data == "<": 774 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"}) 775 self.state = self.scriptDataDoubleEscapedLessThanSignState 776 elif data == "\u0000": 777 self.tokenQueue.append({"type": tokenTypes["ParseError"], 778 "data": "invalid-codepoint"}) 779 self.tokenQueue.append({"type": tokenTypes["Characters"], 780 "data": "\uFFFD"}) 781 self.state = self.scriptDataDoubleEscapedState 782 elif data == EOF: 783 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 784 "eof-in-script-in-script"}) 785 self.state = self.dataState 786 else: 787 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) 788 self.state = self.scriptDataDoubleEscapedState 789 return True 790 791 def scriptDataDoubleEscapedDashDashState(self): 792 data = self.stream.char() 793 if data == "-": 794 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"}) 795 elif data == "<": 796 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"}) 797 self.state = self.scriptDataDoubleEscapedLessThanSignState 798 elif data == ">": 799 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": ">"}) 800 self.state = self.scriptDataState 801 elif data == "\u0000": 802 self.tokenQueue.append({"type": tokenTypes["ParseError"], 803 "data": "invalid-codepoint"}) 804 self.tokenQueue.append({"type": tokenTypes["Characters"], 805 "data": "\uFFFD"}) 806 self.state = self.scriptDataDoubleEscapedState 807 elif data == EOF: 808 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 809 "eof-in-script-in-script"}) 810 self.state = self.dataState 811 else: 812 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) 813 self.state = self.scriptDataDoubleEscapedState 814 return True 815 816 def scriptDataDoubleEscapedLessThanSignState(self): 817 data = self.stream.char() 818 if data == "/": 819 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "/"}) 820 self.temporaryBuffer = "" 821 self.state = self.scriptDataDoubleEscapeEndState 822 else: 823 self.stream.unget(data) 824 self.state = self.scriptDataDoubleEscapedState 825 return True 826 827 def scriptDataDoubleEscapeEndState(self): 828 data = self.stream.char() 829 if data in (spaceCharacters | frozenset(("/", ">"))): 830 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) 831 if self.temporaryBuffer.lower() == "script": 832 self.state = self.scriptDataEscapedState 833 else: 834 self.state = self.scriptDataDoubleEscapedState 835 elif data in asciiLetters: 836 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) 837 self.temporaryBuffer += data 838 else: 839 self.stream.unget(data) 840 self.state = self.scriptDataDoubleEscapedState 841 return True 842 843 def beforeAttributeNameState(self): 844 data = self.stream.char() 845 if data in spaceCharacters: 846 self.stream.charsUntil(spaceCharacters, True) 847 elif data in asciiLetters: 848 self.currentToken["data"].append([data, ""]) 849 self.state = self.attributeNameState 850 elif data == ">": 851 self.emitCurrentToken() 852 elif data == "/": 853 self.state = self.selfClosingStartTagState 854 elif data in ("'", '"', "=", "<"): 855 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 856 "invalid-character-in-attribute-name"}) 857 self.currentToken["data"].append([data, ""]) 858 self.state = self.attributeNameState 859 elif data == "\u0000": 860 self.tokenQueue.append({"type": tokenTypes["ParseError"], 861 "data": "invalid-codepoint"}) 862 self.currentToken["data"].append(["\uFFFD", ""]) 863 self.state = self.attributeNameState 864 elif data is EOF: 865 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 866 "expected-attribute-name-but-got-eof"}) 867 self.state = self.dataState 868 else: 869 self.currentToken["data"].append([data, ""]) 870 self.state = self.attributeNameState 871 return True 872 873 def attributeNameState(self): 874 data = self.stream.char() 875 leavingThisState = True 876 emitToken = False 877 if data == "=": 878 self.state = self.beforeAttributeValueState 879 elif data in asciiLetters: 880 self.currentToken["data"][-1][0] += data +\ 881 self.stream.charsUntil(asciiLetters, True) 882 leavingThisState = False 883 elif data == ">": 884 # XXX If we emit here the attributes are converted to a dict 885 # without being checked and when the code below runs we error 886 # because data is a dict not a list 887 emitToken = True 888 elif data in spaceCharacters: 889 self.state = self.afterAttributeNameState 890 elif data == "/": 891 self.state = self.selfClosingStartTagState 892 elif data == "\u0000": 893 self.tokenQueue.append({"type": tokenTypes["ParseError"], 894 "data": "invalid-codepoint"}) 895 self.currentToken["data"][-1][0] += "\uFFFD" 896 leavingThisState = False 897 elif data in ("'", '"', "<"): 898 self.tokenQueue.append({"type": tokenTypes["ParseError"], 899 "data": 900 "invalid-character-in-attribute-name"}) 901 self.currentToken["data"][-1][0] += data 902 leavingThisState = False 903 elif data is EOF: 904 self.tokenQueue.append({"type": tokenTypes["ParseError"], 905 "data": "eof-in-attribute-name"}) 906 self.state = self.dataState 907 else: 908 self.currentToken["data"][-1][0] += data 909 leavingThisState = False 910 911 if leavingThisState: 912 # Attributes are not dropped at this stage. That happens when the 913 # start tag token is emitted so values can still be safely appended 914 # to attributes, but we do want to report the parse error in time. 915 self.currentToken["data"][-1][0] = ( 916 self.currentToken["data"][-1][0].translate(asciiUpper2Lower)) 917 for name, _ in self.currentToken["data"][:-1]: 918 if self.currentToken["data"][-1][0] == name: 919 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 920 "duplicate-attribute"}) 921 break 922 # XXX Fix for above XXX 923 if emitToken: 924 self.emitCurrentToken() 925 return True 926 927 def afterAttributeNameState(self): 928 data = self.stream.char() 929 if data in spaceCharacters: 930 self.stream.charsUntil(spaceCharacters, True) 931 elif data == "=": 932 self.state = self.beforeAttributeValueState 933 elif data == ">": 934 self.emitCurrentToken() 935 elif data in asciiLetters: 936 self.currentToken["data"].append([data, ""]) 937 self.state = self.attributeNameState 938 elif data == "/": 939 self.state = self.selfClosingStartTagState 940 elif data == "\u0000": 941 self.tokenQueue.append({"type": tokenTypes["ParseError"], 942 "data": "invalid-codepoint"}) 943 self.currentToken["data"].append(["\uFFFD", ""]) 944 self.state = self.attributeNameState 945 elif data in ("'", '"', "<"): 946 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 947 "invalid-character-after-attribute-name"}) 948 self.currentToken["data"].append([data, ""]) 949 self.state = self.attributeNameState 950 elif data is EOF: 951 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 952 "expected-end-of-tag-but-got-eof"}) 953 self.state = self.dataState 954 else: 955 self.currentToken["data"].append([data, ""]) 956 self.state = self.attributeNameState 957 return True 958 959 def beforeAttributeValueState(self): 960 data = self.stream.char() 961 if data in spaceCharacters: 962 self.stream.charsUntil(spaceCharacters, True) 963 elif data == "\"": 964 self.state = self.attributeValueDoubleQuotedState 965 elif data == "&": 966 self.state = self.attributeValueUnQuotedState 967 self.stream.unget(data) 968 elif data == "'": 969 self.state = self.attributeValueSingleQuotedState 970 elif data == ">": 971 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 972 "expected-attribute-value-but-got-right-bracket"}) 973 self.emitCurrentToken() 974 elif data == "\u0000": 975 self.tokenQueue.append({"type": tokenTypes["ParseError"], 976 "data": "invalid-codepoint"}) 977 self.currentToken["data"][-1][1] += "\uFFFD" 978 self.state = self.attributeValueUnQuotedState 979 elif data in ("=", "<", "`"): 980 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 981 "equals-in-unquoted-attribute-value"}) 982 self.currentToken["data"][-1][1] += data 983 self.state = self.attributeValueUnQuotedState 984 elif data is EOF: 985 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 986 "expected-attribute-value-but-got-eof"}) 987 self.state = self.dataState 988 else: 989 self.currentToken["data"][-1][1] += data 990 self.state = self.attributeValueUnQuotedState 991 return True 992 993 def attributeValueDoubleQuotedState(self): 994 data = self.stream.char() 995 if data == "\"": 996 self.state = self.afterAttributeValueState 997 elif data == "&": 998 self.processEntityInAttribute('"') 999 elif data == "\u0000": 1000 self.tokenQueue.append({"type": tokenTypes["ParseError"], 1001 "data": "invalid-codepoint"}) 1002 self.currentToken["data"][-1][1] += "\uFFFD" 1003 elif data is EOF: 1004 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1005 "eof-in-attribute-value-double-quote"}) 1006 self.state = self.dataState 1007 else: 1008 self.currentToken["data"][-1][1] += data +\ 1009 self.stream.charsUntil(("\"", "&", "\u0000")) 1010 return True 1011 1012 def attributeValueSingleQuotedState(self): 1013 data = self.stream.char() 1014 if data == "'": 1015 self.state = self.afterAttributeValueState 1016 elif data == "&": 1017 self.processEntityInAttribute("'") 1018 elif data == "\u0000": 1019 self.tokenQueue.append({"type": tokenTypes["ParseError"], 1020 "data": "invalid-codepoint"}) 1021 self.currentToken["data"][-1][1] += "\uFFFD" 1022 elif data is EOF: 1023 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1024 "eof-in-attribute-value-single-quote"}) 1025 self.state = self.dataState 1026 else: 1027 self.currentToken["data"][-1][1] += data +\ 1028 self.stream.charsUntil(("'", "&", "\u0000")) 1029 return True 1030 1031 def attributeValueUnQuotedState(self): 1032 data = self.stream.char() 1033 if data in spaceCharacters: 1034 self.state = self.beforeAttributeNameState 1035 elif data == "&": 1036 self.processEntityInAttribute(">") 1037 elif data == ">": 1038 self.emitCurrentToken() 1039 elif data in ('"', "'", "=", "<", "`"): 1040 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1041 "unexpected-character-in-unquoted-attribute-value"}) 1042 self.currentToken["data"][-1][1] += data 1043 elif data == "\u0000": 1044 self.tokenQueue.append({"type": tokenTypes["ParseError"], 1045 "data": "invalid-codepoint"}) 1046 self.currentToken["data"][-1][1] += "\uFFFD" 1047 elif data is EOF: 1048 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1049 "eof-in-attribute-value-no-quotes"}) 1050 self.state = self.dataState 1051 else: 1052 self.currentToken["data"][-1][1] += data + self.stream.charsUntil( 1053 frozenset(("&", ">", '"', "'", "=", "<", "`", "\u0000")) | spaceCharacters) 1054 return True 1055 1056 def afterAttributeValueState(self): 1057 data = self.stream.char() 1058 if data in spaceCharacters: 1059 self.state = self.beforeAttributeNameState 1060 elif data == ">": 1061 self.emitCurrentToken() 1062 elif data == "/": 1063 self.state = self.selfClosingStartTagState 1064 elif data is EOF: 1065 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1066 "unexpected-EOF-after-attribute-value"}) 1067 self.stream.unget(data) 1068 self.state = self.dataState 1069 else: 1070 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1071 "unexpected-character-after-attribute-value"}) 1072 self.stream.unget(data) 1073 self.state = self.beforeAttributeNameState 1074 return True 1075 1076 def selfClosingStartTagState(self): 1077 data = self.stream.char() 1078 if data == ">": 1079 self.currentToken["selfClosing"] = True 1080 self.emitCurrentToken() 1081 elif data is EOF: 1082 self.tokenQueue.append({"type": tokenTypes["ParseError"], 1083 "data": 1084 "unexpected-EOF-after-solidus-in-tag"}) 1085 self.stream.unget(data) 1086 self.state = self.dataState 1087 else: 1088 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1089 "unexpected-character-after-solidus-in-tag"}) 1090 self.stream.unget(data) 1091 self.state = self.beforeAttributeNameState 1092 return True 1093 1094 def bogusCommentState(self): 1095 # Make a new comment token and give it as value all the characters 1096 # until the first > or EOF (charsUntil checks for EOF automatically) 1097 # and emit it. 1098 data = self.stream.charsUntil(">") 1099 data = data.replace("\u0000", "\uFFFD") 1100 self.tokenQueue.append( 1101 {"type": tokenTypes["Comment"], "data": data}) 1102 1103 # Eat the character directly after the bogus comment which is either a 1104 # ">" or an EOF. 1105 self.stream.char() 1106 self.state = self.dataState 1107 return True 1108 1109 def markupDeclarationOpenState(self): 1110 charStack = [self.stream.char()] 1111 if charStack[-1] == "-": 1112 charStack.append(self.stream.char()) 1113 if charStack[-1] == "-": 1114 self.currentToken = {"type": tokenTypes["Comment"], "data": ""} 1115 self.state = self.commentStartState 1116 return True 1117 elif charStack[-1] in ('d', 'D'): 1118 matched = True 1119 for expected in (('o', 'O'), ('c', 'C'), ('t', 'T'), 1120 ('y', 'Y'), ('p', 'P'), ('e', 'E')): 1121 charStack.append(self.stream.char()) 1122 if charStack[-1] not in expected: 1123 matched = False 1124 break 1125 if matched: 1126 self.currentToken = {"type": tokenTypes["Doctype"], 1127 "name": "", 1128 "publicId": None, "systemId": None, 1129 "correct": True} 1130 self.state = self.doctypeState 1131 return True 1132 elif (charStack[-1] == "[" and 1133 self.parser is not None and 1134 self.parser.tree.openElements and 1135 self.parser.tree.openElements[-1].namespace != self.parser.tree.defaultNamespace): 1136 matched = True 1137 for expected in ["C", "D", "A", "T", "A", "["]: 1138 charStack.append(self.stream.char()) 1139 if charStack[-1] != expected: 1140 matched = False 1141 break 1142 if matched: 1143 self.state = self.cdataSectionState 1144 return True 1145 1146 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1147 "expected-dashes-or-doctype"}) 1148 1149 while charStack: 1150 self.stream.unget(charStack.pop()) 1151 self.state = self.bogusCommentState 1152 return True 1153 1154 def commentStartState(self): 1155 data = self.stream.char() 1156 if data == "-": 1157 self.state = self.commentStartDashState 1158 elif data == "\u0000": 1159 self.tokenQueue.append({"type": tokenTypes["ParseError"], 1160 "data": "invalid-codepoint"}) 1161 self.currentToken["data"] += "\uFFFD" 1162 elif data == ">": 1163 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1164 "incorrect-comment"}) 1165 self.tokenQueue.append(self.currentToken) 1166 self.state = self.dataState 1167 elif data is EOF: 1168 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1169 "eof-in-comment"}) 1170 self.tokenQueue.append(self.currentToken) 1171 self.state = self.dataState 1172 else: 1173 self.currentToken["data"] += data 1174 self.state = self.commentState 1175 return True 1176 1177 def commentStartDashState(self): 1178 data = self.stream.char() 1179 if data == "-": 1180 self.state = self.commentEndState 1181 elif data == "\u0000": 1182 self.tokenQueue.append({"type": tokenTypes["ParseError"], 1183 "data": "invalid-codepoint"}) 1184 self.currentToken["data"] += "-\uFFFD" 1185 elif data == ">": 1186 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1187 "incorrect-comment"}) 1188 self.tokenQueue.append(self.currentToken) 1189 self.state = self.dataState 1190 elif data is EOF: 1191 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1192 "eof-in-comment"}) 1193 self.tokenQueue.append(self.currentToken) 1194 self.state = self.dataState 1195 else: 1196 self.currentToken["data"] += "-" + data 1197 self.state = self.commentState 1198 return True 1199 1200 def commentState(self): 1201 data = self.stream.char() 1202 if data == "-": 1203 self.state = self.commentEndDashState 1204 elif data == "\u0000": 1205 self.tokenQueue.append({"type": tokenTypes["ParseError"], 1206 "data": "invalid-codepoint"}) 1207 self.currentToken["data"] += "\uFFFD" 1208 elif data is EOF: 1209 self.tokenQueue.append({"type": tokenTypes["ParseError"], 1210 "data": "eof-in-comment"}) 1211 self.tokenQueue.append(self.currentToken) 1212 self.state = self.dataState 1213 else: 1214 self.currentToken["data"] += data + \ 1215 self.stream.charsUntil(("-", "\u0000")) 1216 return True 1217 1218 def commentEndDashState(self): 1219 data = self.stream.char() 1220 if data == "-": 1221 self.state = self.commentEndState 1222 elif data == "\u0000": 1223 self.tokenQueue.append({"type": tokenTypes["ParseError"], 1224 "data": "invalid-codepoint"}) 1225 self.currentToken["data"] += "-\uFFFD" 1226 self.state = self.commentState 1227 elif data is EOF: 1228 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1229 "eof-in-comment-end-dash"}) 1230 self.tokenQueue.append(self.currentToken) 1231 self.state = self.dataState 1232 else: 1233 self.currentToken["data"] += "-" + data 1234 self.state = self.commentState 1235 return True 1236 1237 def commentEndState(self): 1238 data = self.stream.char() 1239 if data == ">": 1240 self.tokenQueue.append(self.currentToken) 1241 self.state = self.dataState 1242 elif data == "\u0000": 1243 self.tokenQueue.append({"type": tokenTypes["ParseError"], 1244 "data": "invalid-codepoint"}) 1245 self.currentToken["data"] += "--\uFFFD" 1246 self.state = self.commentState 1247 elif data == "!": 1248 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1249 "unexpected-bang-after-double-dash-in-comment"}) 1250 self.state = self.commentEndBangState 1251 elif data == "-": 1252 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1253 "unexpected-dash-after-double-dash-in-comment"}) 1254 self.currentToken["data"] += data 1255 elif data is EOF: 1256 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1257 "eof-in-comment-double-dash"}) 1258 self.tokenQueue.append(self.currentToken) 1259 self.state = self.dataState 1260 else: 1261 # XXX 1262 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1263 "unexpected-char-in-comment"}) 1264 self.currentToken["data"] += "--" + data 1265 self.state = self.commentState 1266 return True 1267 1268 def commentEndBangState(self): 1269 data = self.stream.char() 1270 if data == ">": 1271 self.tokenQueue.append(self.currentToken) 1272 self.state = self.dataState 1273 elif data == "-": 1274 self.currentToken["data"] += "--!" 1275 self.state = self.commentEndDashState 1276 elif data == "\u0000": 1277 self.tokenQueue.append({"type": tokenTypes["ParseError"], 1278 "data": "invalid-codepoint"}) 1279 self.currentToken["data"] += "--!\uFFFD" 1280 self.state = self.commentState 1281 elif data is EOF: 1282 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1283 "eof-in-comment-end-bang-state"}) 1284 self.tokenQueue.append(self.currentToken) 1285 self.state = self.dataState 1286 else: 1287 self.currentToken["data"] += "--!" + data 1288 self.state = self.commentState 1289 return True 1290 1291 def doctypeState(self): 1292 data = self.stream.char() 1293 if data in spaceCharacters: 1294 self.state = self.beforeDoctypeNameState 1295 elif data is EOF: 1296 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1297 "expected-doctype-name-but-got-eof"}) 1298 self.currentToken["correct"] = False 1299 self.tokenQueue.append(self.currentToken) 1300 self.state = self.dataState 1301 else: 1302 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1303 "need-space-after-doctype"}) 1304 self.stream.unget(data) 1305 self.state = self.beforeDoctypeNameState 1306 return True 1307 1308 def beforeDoctypeNameState(self): 1309 data = self.stream.char() 1310 if data in spaceCharacters: 1311 pass 1312 elif data == ">": 1313 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1314 "expected-doctype-name-but-got-right-bracket"}) 1315 self.currentToken["correct"] = False 1316 self.tokenQueue.append(self.currentToken) 1317 self.state = self.dataState 1318 elif data == "\u0000": 1319 self.tokenQueue.append({"type": tokenTypes["ParseError"], 1320 "data": "invalid-codepoint"}) 1321 self.currentToken["name"] = "\uFFFD" 1322 self.state = self.doctypeNameState 1323 elif data is EOF: 1324 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1325 "expected-doctype-name-but-got-eof"}) 1326 self.currentToken["correct"] = False 1327 self.tokenQueue.append(self.currentToken) 1328 self.state = self.dataState 1329 else: 1330 self.currentToken["name"] = data 1331 self.state = self.doctypeNameState 1332 return True 1333 1334 def doctypeNameState(self): 1335 data = self.stream.char() 1336 if data in spaceCharacters: 1337 self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower) 1338 self.state = self.afterDoctypeNameState 1339 elif data == ">": 1340 self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower) 1341 self.tokenQueue.append(self.currentToken) 1342 self.state = self.dataState 1343 elif data == "\u0000": 1344 self.tokenQueue.append({"type": tokenTypes["ParseError"], 1345 "data": "invalid-codepoint"}) 1346 self.currentToken["name"] += "\uFFFD" 1347 self.state = self.doctypeNameState 1348 elif data is EOF: 1349 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1350 "eof-in-doctype-name"}) 1351 self.currentToken["correct"] = False 1352 self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower) 1353 self.tokenQueue.append(self.currentToken) 1354 self.state = self.dataState 1355 else: 1356 self.currentToken["name"] += data 1357 return True 1358 1359 def afterDoctypeNameState(self): 1360 data = self.stream.char() 1361 if data in spaceCharacters: 1362 pass 1363 elif data == ">": 1364 self.tokenQueue.append(self.currentToken) 1365 self.state = self.dataState 1366 elif data is EOF: 1367 self.currentToken["correct"] = False 1368 self.stream.unget(data) 1369 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1370 "eof-in-doctype"}) 1371 self.tokenQueue.append(self.currentToken) 1372 self.state = self.dataState 1373 else: 1374 if data in ("p", "P"): 1375 matched = True 1376 for expected in (("u", "U"), ("b", "B"), ("l", "L"), 1377 ("i", "I"), ("c", "C")): 1378 data = self.stream.char() 1379 if data not in expected: 1380 matched = False 1381 break 1382 if matched: 1383 self.state = self.afterDoctypePublicKeywordState 1384 return True 1385 elif data in ("s", "S"): 1386 matched = True 1387 for expected in (("y", "Y"), ("s", "S"), ("t", "T"), 1388 ("e", "E"), ("m", "M")): 1389 data = self.stream.char() 1390 if data not in expected: 1391 matched = False 1392 break 1393 if matched: 1394 self.state = self.afterDoctypeSystemKeywordState 1395 return True 1396 1397 # All the characters read before the current 'data' will be 1398 # [a-zA-Z], so they're garbage in the bogus doctype and can be 1399 # discarded; only the latest character might be '>' or EOF 1400 # and needs to be ungetted 1401 self.stream.unget(data) 1402 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1403 "expected-space-or-right-bracket-in-doctype", "datavars": 1404 {"data": data}}) 1405 self.currentToken["correct"] = False 1406 self.state = self.bogusDoctypeState 1407 1408 return True 1409 1410 def afterDoctypePublicKeywordState(self): 1411 data = self.stream.char() 1412 if data in spaceCharacters: 1413 self.state = self.beforeDoctypePublicIdentifierState 1414 elif data in ("'", '"'): 1415 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1416 "unexpected-char-in-doctype"}) 1417 self.stream.unget(data) 1418 self.state = self.beforeDoctypePublicIdentifierState 1419 elif data is EOF: 1420 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1421 "eof-in-doctype"}) 1422 self.currentToken["correct"] = False 1423 self.tokenQueue.append(self.currentToken) 1424 self.state = self.dataState 1425 else: 1426 self.stream.unget(data) 1427 self.state = self.beforeDoctypePublicIdentifierState 1428 return True 1429 1430 def beforeDoctypePublicIdentifierState(self): 1431 data = self.stream.char() 1432 if data in spaceCharacters: 1433 pass 1434 elif data == "\"": 1435 self.currentToken["publicId"] = "" 1436 self.state = self.doctypePublicIdentifierDoubleQuotedState 1437 elif data == "'": 1438 self.currentToken["publicId"] = "" 1439 self.state = self.doctypePublicIdentifierSingleQuotedState 1440 elif data == ">": 1441 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1442 "unexpected-end-of-doctype"}) 1443 self.currentToken["correct"] = False 1444 self.tokenQueue.append(self.currentToken) 1445 self.state = self.dataState 1446 elif data is EOF: 1447 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1448 "eof-in-doctype"}) 1449 self.currentToken["correct"] = False 1450 self.tokenQueue.append(self.currentToken) 1451 self.state = self.dataState 1452 else: 1453 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1454 "unexpected-char-in-doctype"}) 1455 self.currentToken["correct"] = False 1456 self.state = self.bogusDoctypeState 1457 return True 1458 1459 def doctypePublicIdentifierDoubleQuotedState(self): 1460 data = self.stream.char() 1461 if data == "\"": 1462 self.state = self.afterDoctypePublicIdentifierState 1463 elif data == "\u0000": 1464 self.tokenQueue.append({"type": tokenTypes["ParseError"], 1465 "data": "invalid-codepoint"}) 1466 self.currentToken["publicId"] += "\uFFFD" 1467 elif data == ">": 1468 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1469 "unexpected-end-of-doctype"}) 1470 self.currentToken["correct"] = False 1471 self.tokenQueue.append(self.currentToken) 1472 self.state = self.dataState 1473 elif data is EOF: 1474 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1475 "eof-in-doctype"}) 1476 self.currentToken["correct"] = False 1477 self.tokenQueue.append(self.currentToken) 1478 self.state = self.dataState 1479 else: 1480 self.currentToken["publicId"] += data 1481 return True 1482 1483 def doctypePublicIdentifierSingleQuotedState(self): 1484 data = self.stream.char() 1485 if data == "'": 1486 self.state = self.afterDoctypePublicIdentifierState 1487 elif data == "\u0000": 1488 self.tokenQueue.append({"type": tokenTypes["ParseError"], 1489 "data": "invalid-codepoint"}) 1490 self.currentToken["publicId"] += "\uFFFD" 1491 elif data == ">": 1492 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1493 "unexpected-end-of-doctype"}) 1494 self.currentToken["correct"] = False 1495 self.tokenQueue.append(self.currentToken) 1496 self.state = self.dataState 1497 elif data is EOF: 1498 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1499 "eof-in-doctype"}) 1500 self.currentToken["correct"] = False 1501 self.tokenQueue.append(self.currentToken) 1502 self.state = self.dataState 1503 else: 1504 self.currentToken["publicId"] += data 1505 return True 1506 1507 def afterDoctypePublicIdentifierState(self): 1508 data = self.stream.char() 1509 if data in spaceCharacters: 1510 self.state = self.betweenDoctypePublicAndSystemIdentifiersState 1511 elif data == ">": 1512 self.tokenQueue.append(self.currentToken) 1513 self.state = self.dataState 1514 elif data == '"': 1515 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1516 "unexpected-char-in-doctype"}) 1517 self.currentToken["systemId"] = "" 1518 self.state = self.doctypeSystemIdentifierDoubleQuotedState 1519 elif data == "'": 1520 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1521 "unexpected-char-in-doctype"}) 1522 self.currentToken["systemId"] = "" 1523 self.state = self.doctypeSystemIdentifierSingleQuotedState 1524 elif data is EOF: 1525 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1526 "eof-in-doctype"}) 1527 self.currentToken["correct"] = False 1528 self.tokenQueue.append(self.currentToken) 1529 self.state = self.dataState 1530 else: 1531 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1532 "unexpected-char-in-doctype"}) 1533 self.currentToken["correct"] = False 1534 self.state = self.bogusDoctypeState 1535 return True 1536 1537 def betweenDoctypePublicAndSystemIdentifiersState(self): 1538 data = self.stream.char() 1539 if data in spaceCharacters: 1540 pass 1541 elif data == ">": 1542 self.tokenQueue.append(self.currentToken) 1543 self.state = self.dataState 1544 elif data == '"': 1545 self.currentToken["systemId"] = "" 1546 self.state = self.doctypeSystemIdentifierDoubleQuotedState 1547 elif data == "'": 1548 self.currentToken["systemId"] = "" 1549 self.state = self.doctypeSystemIdentifierSingleQuotedState 1550 elif data == EOF: 1551 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1552 "eof-in-doctype"}) 1553 self.currentToken["correct"] = False 1554 self.tokenQueue.append(self.currentToken) 1555 self.state = self.dataState 1556 else: 1557 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1558 "unexpected-char-in-doctype"}) 1559 self.currentToken["correct"] = False 1560 self.state = self.bogusDoctypeState 1561 return True 1562 1563 def afterDoctypeSystemKeywordState(self): 1564 data = self.stream.char() 1565 if data in spaceCharacters: 1566 self.state = self.beforeDoctypeSystemIdentifierState 1567 elif data in ("'", '"'): 1568 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1569 "unexpected-char-in-doctype"}) 1570 self.stream.unget(data) 1571 self.state = self.beforeDoctypeSystemIdentifierState 1572 elif data is EOF: 1573 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1574 "eof-in-doctype"}) 1575 self.currentToken["correct"] = False 1576 self.tokenQueue.append(self.currentToken) 1577 self.state = self.dataState 1578 else: 1579 self.stream.unget(data) 1580 self.state = self.beforeDoctypeSystemIdentifierState 1581 return True 1582 1583 def beforeDoctypeSystemIdentifierState(self): 1584 data = self.stream.char() 1585 if data in spaceCharacters: 1586 pass 1587 elif data == "\"": 1588 self.currentToken["systemId"] = "" 1589 self.state = self.doctypeSystemIdentifierDoubleQuotedState 1590 elif data == "'": 1591 self.currentToken["systemId"] = "" 1592 self.state = self.doctypeSystemIdentifierSingleQuotedState 1593 elif data == ">": 1594 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1595 "unexpected-char-in-doctype"}) 1596 self.currentToken["correct"] = False 1597 self.tokenQueue.append(self.currentToken) 1598 self.state = self.dataState 1599 elif data is EOF: 1600 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1601 "eof-in-doctype"}) 1602 self.currentToken["correct"] = False 1603 self.tokenQueue.append(self.currentToken) 1604 self.state = self.dataState 1605 else: 1606 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1607 "unexpected-char-in-doctype"}) 1608 self.currentToken["correct"] = False 1609 self.state = self.bogusDoctypeState 1610 return True 1611 1612 def doctypeSystemIdentifierDoubleQuotedState(self): 1613 data = self.stream.char() 1614 if data == "\"": 1615 self.state = self.afterDoctypeSystemIdentifierState 1616 elif data == "\u0000": 1617 self.tokenQueue.append({"type": tokenTypes["ParseError"], 1618 "data": "invalid-codepoint"}) 1619 self.currentToken["systemId"] += "\uFFFD" 1620 elif data == ">": 1621 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1622 "unexpected-end-of-doctype"}) 1623 self.currentToken["correct"] = False 1624 self.tokenQueue.append(self.currentToken) 1625 self.state = self.dataState 1626 elif data is EOF: 1627 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1628 "eof-in-doctype"}) 1629 self.currentToken["correct"] = False 1630 self.tokenQueue.append(self.currentToken) 1631 self.state = self.dataState 1632 else: 1633 self.currentToken["systemId"] += data 1634 return True 1635 1636 def doctypeSystemIdentifierSingleQuotedState(self): 1637 data = self.stream.char() 1638 if data == "'": 1639 self.state = self.afterDoctypeSystemIdentifierState 1640 elif data == "\u0000": 1641 self.tokenQueue.append({"type": tokenTypes["ParseError"], 1642 "data": "invalid-codepoint"}) 1643 self.currentToken["systemId"] += "\uFFFD" 1644 elif data == ">": 1645 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1646 "unexpected-end-of-doctype"}) 1647 self.currentToken["correct"] = False 1648 self.tokenQueue.append(self.currentToken) 1649 self.state = self.dataState 1650 elif data is EOF: 1651 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1652 "eof-in-doctype"}) 1653 self.currentToken["correct"] = False 1654 self.tokenQueue.append(self.currentToken) 1655 self.state = self.dataState 1656 else: 1657 self.currentToken["systemId"] += data 1658 return True 1659 1660 def afterDoctypeSystemIdentifierState(self): 1661 data = self.stream.char() 1662 if data in spaceCharacters: 1663 pass 1664 elif data == ">": 1665 self.tokenQueue.append(self.currentToken) 1666 self.state = self.dataState 1667 elif data is EOF: 1668 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1669 "eof-in-doctype"}) 1670 self.currentToken["correct"] = False 1671 self.tokenQueue.append(self.currentToken) 1672 self.state = self.dataState 1673 else: 1674 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1675 "unexpected-char-in-doctype"}) 1676 self.state = self.bogusDoctypeState 1677 return True 1678 1679 def bogusDoctypeState(self): 1680 data = self.stream.char() 1681 if data == ">": 1682 self.tokenQueue.append(self.currentToken) 1683 self.state = self.dataState 1684 elif data is EOF: 1685 # XXX EMIT 1686 self.stream.unget(data) 1687 self.tokenQueue.append(self.currentToken) 1688 self.state = self.dataState 1689 else: 1690 pass 1691 return True 1692 1693 def cdataSectionState(self): 1694 data = [] 1695 while True: 1696 data.append(self.stream.charsUntil("]")) 1697 data.append(self.stream.charsUntil(">")) 1698 char = self.stream.char() 1699 if char == EOF: 1700 break 1701 else: 1702 assert char == ">" 1703 if data[-1][-2:] == "]]": 1704 data[-1] = data[-1][:-2] 1705 break 1706 else: 1707 data.append(char) 1708 1709 data = "".join(data) # pylint:disable=redefined-variable-type 1710 # Deal with null here rather than in the parser 1711 nullCount = data.count("\u0000") 1712 if nullCount > 0: 1713 for _ in range(nullCount): 1714 self.tokenQueue.append({"type": tokenTypes["ParseError"], 1715 "data": "invalid-codepoint"}) 1716 data = data.replace("\u0000", "\uFFFD") 1717 if data: 1718 self.tokenQueue.append({"type": tokenTypes["Characters"], 1719 "data": data}) 1720 self.state = self.dataState 1721 return True 1722