1 /* 2 * Copyright (c) 2005-2007 Henri Sivonen 3 * Copyright (c) 2007-2017 Mozilla Foundation 4 * Portions of comments Copyright 2004-2010 Apple Computer, Inc., Mozilla 5 * Foundation, and Opera Software ASA. 6 * 7 * Permission is hereby granted, free of charge, to any person obtaining a 8 * copy of this software and associated documentation files (the "Software"), 9 * to deal in the Software without restriction, including without limitation 10 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 11 * and/or sell copies of the Software, and to permit persons to whom the 12 * Software is furnished to do so, subject to the following conditions: 13 * 14 * The above copyright notice and this permission notice shall be included in 15 * all copies or substantial portions of the Software. 16 * 17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 22 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 23 * DEALINGS IN THE SOFTWARE. 24 */ 25 26 /* 27 * The comments following this one that use the same comment syntax as this 28 * comment are quotes from the WHATWG HTML 5 spec as of 2 June 2007 29 * amended as of June 18 2008 and May 31 2010. 30 * That document came with this statement: 31 * "© Copyright 2004-2010 Apple Computer, Inc., Mozilla Foundation, and 32 * Opera Software ASA. You are granted a license to use, reproduce and 33 * create derivative works of this document." 
34 */ 35 36 package nu.validator.htmlparser.impl; 37 38 import org.xml.sax.ErrorHandler; 39 import org.xml.sax.Locator; 40 import org.xml.sax.ext.Locator2; 41 import org.xml.sax.SAXException; 42 import org.xml.sax.SAXParseException; 43 44 import nu.validator.htmlparser.annotation.Auto; 45 import nu.validator.htmlparser.annotation.CharacterName; 46 import nu.validator.htmlparser.annotation.Const; 47 import nu.validator.htmlparser.annotation.Inline; 48 import nu.validator.htmlparser.annotation.Local; 49 import nu.validator.htmlparser.annotation.NoLength; 50 import nu.validator.htmlparser.common.EncodingDeclarationHandler; 51 import nu.validator.htmlparser.common.Interner; 52 import nu.validator.htmlparser.common.TokenHandler; 53 import nu.validator.htmlparser.common.XmlViolationPolicy; 54 55 /** 56 * An implementation of 57 * https://html.spec.whatwg.org/multipage/syntax.html#tokenization 58 * 59 * This class implements the <code>Locator</code> interface. This is not an 60 * incidental implementation detail: Users of this class are encouraged to make 61 * use of the <code>Locator</code> nature. 62 * 63 * By default, the tokenizer may report data that XML 1.0 bans. The tokenizer 64 * can be configured to treat these conditions as fatal or to coerce the infoset 65 * to something that XML 1.0 allows. 
66 * 67 * @version $Id$ 68 * @author hsivonen 69 */ 70 public class Tokenizer implements Locator, Locator2 { 71 72 private static final int DATA_AND_RCDATA_MASK = ~1; 73 74 public static final int DATA = 0; 75 76 public static final int RCDATA = 1; 77 78 public static final int SCRIPT_DATA = 2; 79 80 public static final int RAWTEXT = 3; 81 82 public static final int SCRIPT_DATA_ESCAPED = 4; 83 84 public static final int ATTRIBUTE_VALUE_DOUBLE_QUOTED = 5; 85 86 public static final int ATTRIBUTE_VALUE_SINGLE_QUOTED = 6; 87 88 public static final int ATTRIBUTE_VALUE_UNQUOTED = 7; 89 90 public static final int PLAINTEXT = 8; 91 92 public static final int TAG_OPEN = 9; 93 94 public static final int CLOSE_TAG_OPEN = 10; 95 96 public static final int TAG_NAME = 11; 97 98 public static final int BEFORE_ATTRIBUTE_NAME = 12; 99 100 public static final int ATTRIBUTE_NAME = 13; 101 102 public static final int AFTER_ATTRIBUTE_NAME = 14; 103 104 public static final int BEFORE_ATTRIBUTE_VALUE = 15; 105 106 public static final int AFTER_ATTRIBUTE_VALUE_QUOTED = 16; 107 108 public static final int BOGUS_COMMENT = 17; 109 110 public static final int MARKUP_DECLARATION_OPEN = 18; 111 112 public static final int DOCTYPE = 19; 113 114 public static final int BEFORE_DOCTYPE_NAME = 20; 115 116 public static final int DOCTYPE_NAME = 21; 117 118 public static final int AFTER_DOCTYPE_NAME = 22; 119 120 public static final int BEFORE_DOCTYPE_PUBLIC_IDENTIFIER = 23; 121 122 public static final int DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED = 24; 123 124 public static final int DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED = 25; 125 126 public static final int AFTER_DOCTYPE_PUBLIC_IDENTIFIER = 26; 127 128 public static final int BEFORE_DOCTYPE_SYSTEM_IDENTIFIER = 27; 129 130 public static final int DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED = 28; 131 132 public static final int DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED = 29; 133 134 public static final int AFTER_DOCTYPE_SYSTEM_IDENTIFIER = 30; 135 136 
public static final int BOGUS_DOCTYPE = 31; 137 138 public static final int COMMENT_START = 32; 139 140 public static final int COMMENT_START_DASH = 33; 141 142 public static final int COMMENT = 34; 143 144 public static final int COMMENT_END_DASH = 35; 145 146 public static final int COMMENT_END = 36; 147 148 public static final int COMMENT_END_BANG = 37; 149 150 public static final int NON_DATA_END_TAG_NAME = 38; 151 152 public static final int MARKUP_DECLARATION_HYPHEN = 39; 153 154 public static final int MARKUP_DECLARATION_OCTYPE = 40; 155 156 public static final int DOCTYPE_UBLIC = 41; 157 158 public static final int DOCTYPE_YSTEM = 42; 159 160 public static final int AFTER_DOCTYPE_PUBLIC_KEYWORD = 43; 161 162 public static final int BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS = 44; 163 164 public static final int AFTER_DOCTYPE_SYSTEM_KEYWORD = 45; 165 166 public static final int CONSUME_CHARACTER_REFERENCE = 46; 167 168 public static final int CONSUME_NCR = 47; 169 170 public static final int CHARACTER_REFERENCE_TAIL = 48; 171 172 public static final int HEX_NCR_LOOP = 49; 173 174 public static final int DECIMAL_NRC_LOOP = 50; 175 176 public static final int HANDLE_NCR_VALUE = 51; 177 178 public static final int HANDLE_NCR_VALUE_RECONSUME = 52; 179 180 public static final int CHARACTER_REFERENCE_HILO_LOOKUP = 53; 181 182 public static final int SELF_CLOSING_START_TAG = 54; 183 184 public static final int CDATA_START = 55; 185 186 public static final int CDATA_SECTION = 56; 187 188 public static final int CDATA_RSQB = 57; 189 190 public static final int CDATA_RSQB_RSQB = 58; 191 192 public static final int SCRIPT_DATA_LESS_THAN_SIGN = 59; 193 194 public static final int SCRIPT_DATA_ESCAPE_START = 60; 195 196 public static final int SCRIPT_DATA_ESCAPE_START_DASH = 61; 197 198 public static final int SCRIPT_DATA_ESCAPED_DASH = 62; 199 200 public static final int SCRIPT_DATA_ESCAPED_DASH_DASH = 63; 201 202 public static final int BOGUS_COMMENT_HYPHEN = 64; 203 
204 public static final int RAWTEXT_RCDATA_LESS_THAN_SIGN = 65; 205 206 public static final int SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN = 66; 207 208 public static final int SCRIPT_DATA_DOUBLE_ESCAPE_START = 67; 209 210 public static final int SCRIPT_DATA_DOUBLE_ESCAPED = 68; 211 212 public static final int SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN = 69; 213 214 public static final int SCRIPT_DATA_DOUBLE_ESCAPED_DASH = 70; 215 216 public static final int SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH = 71; 217 218 public static final int SCRIPT_DATA_DOUBLE_ESCAPE_END = 72; 219 220 public static final int PROCESSING_INSTRUCTION = 73; 221 222 public static final int PROCESSING_INSTRUCTION_QUESTION_MARK = 74; 223 224 /** 225 * Magic value for UTF-16 operations. 226 */ 227 private static final int LEAD_OFFSET = (0xD800 - (0x10000 >> 10)); 228 229 /** 230 * UTF-16 code unit array containing less than and greater than for emitting 231 * those characters on certain parse errors. 232 */ 233 private static final @NoLength char[] LT_GT = { '<', '>' }; 234 235 /** 236 * UTF-16 code unit array containing less than and solidus for emitting 237 * those characters on certain parse errors. 238 */ 239 private static final @NoLength char[] LT_SOLIDUS = { '<', '/' }; 240 241 /** 242 * UTF-16 code unit array containing ]] for emitting those characters on 243 * state transitions. 244 */ 245 private static final @NoLength char[] RSQB_RSQB = { ']', ']' }; 246 247 /** 248 * Array version of U+FFFD. 249 */ 250 private static final @NoLength char[] REPLACEMENT_CHARACTER = { '\uFFFD' }; 251 252 // [NOCPP[ 253 254 /** 255 * Array version of space. 256 */ 257 private static final @NoLength char[] SPACE = { ' ' }; 258 259 // ]NOCPP] 260 261 /** 262 * Array version of line feed. 
263 */ 264 private static final @NoLength char[] LF = { '\n' }; 265 266 /** 267 * "CDATA[" as <code>char[]</code> 268 */ 269 private static final @NoLength char[] CDATA_LSQB = { 'C', 'D', 'A', 'T', 270 'A', '[' }; 271 272 /** 273 * "octype" as <code>char[]</code> 274 */ 275 private static final @NoLength char[] OCTYPE = { 'o', 'c', 't', 'y', 'p', 276 'e' }; 277 278 /** 279 * "ublic" as <code>char[]</code> 280 */ 281 private static final @NoLength char[] UBLIC = { 'u', 'b', 'l', 'i', 'c' }; 282 283 /** 284 * "ystem" as <code>char[]</code> 285 */ 286 private static final @NoLength char[] YSTEM = { 'y', 's', 't', 'e', 'm' }; 287 288 private static final char[] TITLE_ARR = { 't', 'i', 't', 'l', 'e' }; 289 290 private static final char[] SCRIPT_ARR = { 's', 'c', 'r', 'i', 'p', 't' }; 291 292 private static final char[] STYLE_ARR = { 's', 't', 'y', 'l', 'e' }; 293 294 private static final char[] PLAINTEXT_ARR = { 'p', 'l', 'a', 'i', 'n', 't', 295 'e', 'x', 't' }; 296 297 private static final char[] XMP_ARR = { 'x', 'm', 'p' }; 298 299 private static final char[] TEXTAREA_ARR = { 't', 'e', 'x', 't', 'a', 'r', 300 'e', 'a' }; 301 302 private static final char[] IFRAME_ARR = { 'i', 'f', 'r', 'a', 'm', 'e' }; 303 304 private static final char[] NOEMBED_ARR = { 'n', 'o', 'e', 'm', 'b', 'e', 305 'd' }; 306 307 private static final char[] NOSCRIPT_ARR = { 'n', 'o', 's', 'c', 'r', 'i', 308 'p', 't' }; 309 310 private static final char[] NOFRAMES_ARR = { 'n', 'o', 'f', 'r', 'a', 'm', 311 'e', 's' }; 312 313 /** 314 * The token handler. 315 */ 316 protected final TokenHandler tokenHandler; 317 318 protected EncodingDeclarationHandler encodingDeclarationHandler; 319 320 // [NOCPP[ 321 322 /** 323 * The error handler. 324 */ 325 protected ErrorHandler errorHandler; 326 327 // ]NOCPP] 328 329 /** 330 * Whether the previous char read was CR. 
331 */ 332 protected boolean lastCR; 333 334 protected int stateSave; 335 336 private int returnStateSave; 337 338 protected int index; 339 340 private boolean forceQuirks; 341 342 private char additional; 343 344 private int entCol; 345 346 private int firstCharKey; 347 348 private int lo; 349 350 private int hi; 351 352 private int candidate; 353 354 private int charRefBufMark; 355 356 protected int value; 357 358 private boolean seenDigits; 359 360 protected int cstart; 361 362 /** 363 * The SAX public id for the resource being tokenized. (Only passed to back 364 * as part of locator data.) 365 */ 366 private String publicId; 367 368 /** 369 * The SAX system id for the resource being tokenized. (Only passed to back 370 * as part of locator data.) 371 */ 372 private String systemId; 373 374 /** 375 * Buffer for bufferable things other than those that fit the description 376 * of <code>charRefBuf</code>. 377 */ 378 private @Auto char[] strBuf; 379 380 /** 381 * Number of significant <code>char</code>s in <code>strBuf</code>. 382 */ 383 private int strBufLen; 384 385 /** 386 * Buffer for characters that might form a character reference but may 387 * end up not forming one. 388 */ 389 private final @Auto char[] charRefBuf; 390 391 /** 392 * Number of significant <code>char</code>s in <code>charRefBuf</code>. 393 */ 394 private int charRefBufLen; 395 396 /** 397 * Buffer for expanding NCRs falling into the Basic Multilingual Plane. 398 */ 399 private final @Auto char[] bmpChar; 400 401 /** 402 * Buffer for expanding astral NCRs. 403 */ 404 private final @Auto char[] astralChar; 405 406 /** 407 * The element whose end tag closes the current CDATA or RCDATA element. 408 */ 409 protected ElementName endTagExpectation = null; 410 411 private char[] endTagExpectationAsArray; // not @Auto! 
412 413 /** 414 * <code>true</code> if tokenizing an end tag 415 */ 416 protected boolean endTag; 417 418 /** 419 * <code>true</code> iff the current element/attribute name contains 420 * a hyphen. 421 */ 422 private boolean containsHyphen; 423 424 /** 425 * The current tag token name. One of 426 * 1) null, 427 * 2) non-owning reference to nonInternedTagName 428 * 3) non-owning reference to a pre-interned ElementName 429 */ 430 private ElementName tagName = null; 431 432 /** 433 * The recycled ElementName instance for the non-pre-interned cases. 434 */ 435 private ElementName nonInternedTagName = null; 436 437 /** 438 * The current attribute name. 439 */ 440 protected AttributeName attributeName = null; 441 442 // CPPONLY: private AttributeName nonInternedAttributeName = null; 443 444 // [NOCPP[ 445 446 /** 447 * Whether comment tokens are emitted. 448 */ 449 private boolean wantsComments = false; 450 451 /** 452 * Whether the stream is past the first 1024 bytes. 453 */ 454 private boolean metaBoundaryPassed; 455 456 // ]NOCPP] 457 458 /** 459 * The name of the current doctype token. 460 */ 461 private @Local String doctypeName; 462 463 /** 464 * The public id of the current doctype token. 465 */ 466 private String publicIdentifier; 467 468 /** 469 * The system id of the current doctype token. 470 */ 471 private String systemIdentifier; 472 473 /** 474 * The attribute holder. 475 */ 476 private HtmlAttributes attributes; 477 478 // [NOCPP[ 479 480 /** 481 * The policy for vertical tab and form feed. 482 */ 483 private XmlViolationPolicy contentSpacePolicy = XmlViolationPolicy.ALTER_INFOSET; 484 485 /** 486 * The policy for comments. 
487 */ 488 private XmlViolationPolicy commentPolicy = XmlViolationPolicy.ALTER_INFOSET; 489 490 private XmlViolationPolicy xmlnsPolicy = XmlViolationPolicy.ALTER_INFOSET; 491 492 private XmlViolationPolicy namePolicy = XmlViolationPolicy.ALTER_INFOSET; 493 494 private int mappingLangToXmlLang; 495 496 // ]NOCPP] 497 498 private final boolean newAttributesEachTime; 499 500 private boolean shouldSuspend; 501 502 protected boolean confident; 503 504 private int line; 505 506 /* 507 * The line number of the current attribute. First set to the line of the 508 * attribute name and if there is a value, set to the line the value 509 * started on. 510 */ 511 // CPPONLY: private int attributeLine; 512 513 private Interner interner; 514 515 // CPPONLY: private boolean viewingXmlSource; 516 517 // [NOCPP[ 518 519 protected LocatorImpl ampersandLocation; 520 Tokenizer(TokenHandler tokenHandler, boolean newAttributesEachTime)521 public Tokenizer(TokenHandler tokenHandler, boolean newAttributesEachTime) { 522 this.tokenHandler = tokenHandler; 523 this.encodingDeclarationHandler = null; 524 this.lastCR = false; 525 this.stateSave = 0; 526 this.returnStateSave = 0; 527 this.index = 0; 528 this.forceQuirks = false; 529 this.additional = '\u0000'; 530 this.entCol = 0; 531 this.firstCharKey = 0; 532 this.lo = 0; 533 this.hi = 0; 534 this.candidate = 0; 535 this.charRefBufMark = 0; 536 this.value = 0; 537 this.seenDigits = false; 538 this.cstart = 0; 539 this.strBufLen = 0; 540 this.newAttributesEachTime = newAttributesEachTime; 541 // ∳ is the longest valid char ref and 542 // the semicolon never gets appended to the buffer. 
543 this.charRefBuf = new char[32]; 544 this.charRefBufLen = 0; 545 this.bmpChar = new char[1]; 546 this.astralChar = new char[2]; 547 this.endTagExpectation = null; 548 this.endTagExpectationAsArray = null; 549 this.endTag = false; 550 this.containsHyphen = false; 551 this.tagName = null; 552 this.nonInternedTagName = new ElementName(); 553 this.attributeName = null; 554 // CPPONLY: this.nonInternedAttributeName = new AttributeName(); 555 this.doctypeName = null; 556 this.publicIdentifier = null; 557 this.systemIdentifier = null; 558 this.attributes = null; 559 this.shouldSuspend = false; 560 this.confident = false; 561 this.line = 0; 562 // CPPONLY: this.attributeLine = 0; 563 this.interner = null; 564 } 565 566 // ]NOCPP] 567 568 /** 569 * The constructor. 570 * 571 * @param tokenHandler 572 * the handler for receiving tokens 573 */ Tokenizer(TokenHandler tokenHandler )574 public Tokenizer(TokenHandler tokenHandler 575 // CPPONLY: , boolean viewingXmlSource 576 ) { 577 this.tokenHandler = tokenHandler; 578 this.encodingDeclarationHandler = null; 579 // [NOCPP[ 580 this.newAttributesEachTime = false; 581 // ]NOCPP] 582 this.lastCR = false; 583 this.stateSave = 0; 584 this.returnStateSave = 0; 585 this.index = 0; 586 this.forceQuirks = false; 587 this.additional = '\u0000'; 588 this.entCol = 0; 589 this.firstCharKey = 0; 590 this.lo = 0; 591 this.hi = 0; 592 this.candidate = 0; 593 this.charRefBufMark = 0; 594 this.value = 0; 595 this.seenDigits = false; 596 this.cstart = 0; 597 this.strBufLen = 0; 598 // ∳ is the longest valid char ref and 599 // the semicolon never gets appended to the buffer. 
600 this.charRefBuf = new char[32]; 601 this.charRefBufLen = 0; 602 this.bmpChar = new char[1]; 603 this.astralChar = new char[2]; 604 this.endTagExpectation = null; 605 this.endTagExpectationAsArray = null; 606 this.endTag = false; 607 this.containsHyphen = false; 608 this.tagName = null; 609 this.nonInternedTagName = new ElementName(); 610 this.attributeName = null; 611 // CPPONLY: this.nonInternedAttributeName = new AttributeName(); 612 this.doctypeName = null; 613 this.publicIdentifier = null; 614 this.systemIdentifier = null; 615 // [NOCPP[ 616 this.attributes = null; 617 // ]NOCPP] 618 // CPPONLY: this.attributes = tokenHandler.HasBuilder() ? new HtmlAttributes(mappingLangToXmlLang) : null; 619 // CPPONLY: this.newAttributesEachTime = !tokenHandler.HasBuilder(); 620 this.shouldSuspend = false; 621 this.confident = false; 622 this.line = 0; 623 // CPPONLY: this.attributeLine = 0; 624 this.interner = null; 625 // CPPONLY: this.viewingXmlSource = viewingXmlSource; 626 } 627 setInterner(Interner interner)628 public void setInterner(Interner interner) { 629 this.interner = interner; 630 } 631 initLocation(String newPublicId, String newSystemId)632 public void initLocation(String newPublicId, String newSystemId) { 633 this.systemId = newSystemId; 634 this.publicId = newPublicId; 635 636 } 637 638 // CPPONLY: boolean isViewingXmlSource() { 639 // CPPONLY: return viewingXmlSource; 640 // CPPONLY: } 641 642 // [NOCPP[ 643 644 /** 645 * Returns the mappingLangToXmlLang. 646 * 647 * @return the mappingLangToXmlLang 648 */ isMappingLangToXmlLang()649 public boolean isMappingLangToXmlLang() { 650 return mappingLangToXmlLang == AttributeName.HTML_LANG; 651 } 652 653 /** 654 * Sets the mappingLangToXmlLang. 655 * 656 * @param mappingLangToXmlLang 657 * the mappingLangToXmlLang to set 658 */ setMappingLangToXmlLang(boolean mappingLangToXmlLang)659 public void setMappingLangToXmlLang(boolean mappingLangToXmlLang) { 660 this.mappingLangToXmlLang = mappingLangToXmlLang ? 
AttributeName.HTML_LANG 661 : AttributeName.HTML; 662 } 663 664 /** 665 * Sets the error handler. 666 * 667 * @see org.xml.sax.XMLReader#setErrorHandler(org.xml.sax.ErrorHandler) 668 */ setErrorHandler(ErrorHandler eh)669 public void setErrorHandler(ErrorHandler eh) { 670 this.errorHandler = eh; 671 } 672 getErrorHandler()673 public ErrorHandler getErrorHandler() { 674 return this.errorHandler; 675 } 676 677 /** 678 * Sets the commentPolicy. 679 * 680 * @param commentPolicy 681 * the commentPolicy to set 682 */ setCommentPolicy(XmlViolationPolicy commentPolicy)683 public void setCommentPolicy(XmlViolationPolicy commentPolicy) { 684 this.commentPolicy = commentPolicy; 685 } 686 687 /** 688 * Sets the contentNonXmlCharPolicy. 689 * 690 * @param contentNonXmlCharPolicy 691 * the contentNonXmlCharPolicy to set 692 */ setContentNonXmlCharPolicy( XmlViolationPolicy contentNonXmlCharPolicy)693 public void setContentNonXmlCharPolicy( 694 XmlViolationPolicy contentNonXmlCharPolicy) { 695 if (contentNonXmlCharPolicy != XmlViolationPolicy.ALLOW) { 696 throw new IllegalArgumentException( 697 "Must use ErrorReportingTokenizer to set contentNonXmlCharPolicy to non-ALLOW."); 698 } 699 } 700 701 /** 702 * Sets the contentSpacePolicy. 703 * 704 * @param contentSpacePolicy 705 * the contentSpacePolicy to set 706 */ setContentSpacePolicy(XmlViolationPolicy contentSpacePolicy)707 public void setContentSpacePolicy(XmlViolationPolicy contentSpacePolicy) { 708 this.contentSpacePolicy = contentSpacePolicy; 709 } 710 711 /** 712 * Sets the xmlnsPolicy. 
713 * 714 * @param xmlnsPolicy 715 * the xmlnsPolicy to set 716 */ setXmlnsPolicy(XmlViolationPolicy xmlnsPolicy)717 public void setXmlnsPolicy(XmlViolationPolicy xmlnsPolicy) { 718 if (xmlnsPolicy == XmlViolationPolicy.FATAL) { 719 throw new IllegalArgumentException("Can't use FATAL here."); 720 } 721 this.xmlnsPolicy = xmlnsPolicy; 722 } 723 setNamePolicy(XmlViolationPolicy namePolicy)724 public void setNamePolicy(XmlViolationPolicy namePolicy) { 725 this.namePolicy = namePolicy; 726 } 727 728 // ]NOCPP] 729 730 // For the token handler to call 731 732 /** 733 * Sets the tokenizer state and the associated element name. This should 734 * only ever used to put the tokenizer into one of the states that have 735 * a special end tag expectation. 736 * 737 * @param specialTokenizerState 738 * the tokenizer state to set 739 */ setState(int specialTokenizerState)740 public void setState(int specialTokenizerState) { 741 this.stateSave = specialTokenizerState; 742 this.endTagExpectation = null; 743 this.endTagExpectationAsArray = null; 744 } 745 746 // [NOCPP[ 747 748 /** 749 * Sets the tokenizer state and the associated element name. This should 750 * only ever used to put the tokenizer into one of the states that have 751 * a special end tag expectation. For use from the tokenizer test harness. 
752 * 753 * @param specialTokenizerState 754 * the tokenizer state to set 755 * @param endTagExpectation 756 * the expected end tag for transitioning back to normal 757 */ setStateAndEndTagExpectation(int specialTokenizerState, @Local String endTagExpectation)758 public void setStateAndEndTagExpectation(int specialTokenizerState, 759 @Local String endTagExpectation) { 760 this.stateSave = specialTokenizerState; 761 if (specialTokenizerState == Tokenizer.DATA) { 762 return; 763 } 764 @Auto char[] asArray = Portability.newCharArrayFromLocal(endTagExpectation); 765 this.endTagExpectation = ElementName.elementNameByBuffer(asArray, 766 asArray.length, interner); 767 assert this.endTagExpectation != null; 768 endTagExpectationToArray(); 769 } 770 771 // ]NOCPP] 772 773 /** 774 * Sets the tokenizer state and the associated element name. This should 775 * only ever used to put the tokenizer into one of the states that have 776 * a special end tag expectation. 777 * 778 * @param specialTokenizerState 779 * the tokenizer state to set 780 * @param endTagExpectation 781 * the expected end tag for transitioning back to normal 782 */ setStateAndEndTagExpectation(int specialTokenizerState, ElementName endTagExpectation)783 public void setStateAndEndTagExpectation(int specialTokenizerState, 784 ElementName endTagExpectation) { 785 this.stateSave = specialTokenizerState; 786 this.endTagExpectation = endTagExpectation; 787 endTagExpectationToArray(); 788 } 789 endTagExpectationToArray()790 private void endTagExpectationToArray() { 791 switch (endTagExpectation.getGroup()) { 792 case TreeBuilder.TITLE: 793 endTagExpectationAsArray = TITLE_ARR; 794 return; 795 case TreeBuilder.SCRIPT: 796 endTagExpectationAsArray = SCRIPT_ARR; 797 return; 798 case TreeBuilder.STYLE: 799 endTagExpectationAsArray = STYLE_ARR; 800 return; 801 case TreeBuilder.PLAINTEXT: 802 endTagExpectationAsArray = PLAINTEXT_ARR; 803 return; 804 case TreeBuilder.XMP: 805 endTagExpectationAsArray = XMP_ARR; 806 return; 
807 case TreeBuilder.TEXTAREA: 808 endTagExpectationAsArray = TEXTAREA_ARR; 809 return; 810 case TreeBuilder.IFRAME: 811 endTagExpectationAsArray = IFRAME_ARR; 812 return; 813 case TreeBuilder.NOEMBED: 814 endTagExpectationAsArray = NOEMBED_ARR; 815 return; 816 case TreeBuilder.NOSCRIPT: 817 endTagExpectationAsArray = NOSCRIPT_ARR; 818 return; 819 case TreeBuilder.NOFRAMES: 820 endTagExpectationAsArray = NOFRAMES_ARR; 821 return; 822 default: 823 assert false: "Bad end tag expectation."; 824 return; 825 } 826 } 827 828 /** 829 * For C++ use only. 830 */ setLineNumber(int line)831 public void setLineNumber(int line) { 832 // CPPONLY: this.attributeLine = line; // XXX is this needed? 833 this.line = line; 834 } 835 836 // start Locator impl 837 838 /** 839 * @see org.xml.sax.Locator#getLineNumber() 840 */ getLineNumber()841 @Inline public int getLineNumber() { 842 return line; 843 } 844 845 // [NOCPP[ 846 847 /** 848 * @see org.xml.sax.Locator#getColumnNumber() 849 */ getColumnNumber()850 @Inline public int getColumnNumber() { 851 return -1; 852 } 853 854 /** 855 * @see org.xml.sax.Locator#getPublicId() 856 */ getPublicId()857 public String getPublicId() { 858 return publicId; 859 } 860 861 /** 862 * @see org.xml.sax.Locator#getSystemId() 863 */ getSystemId()864 public String getSystemId() { 865 return systemId; 866 } 867 868 /** 869 * @see org.xml.sax.ext.Locator2#getXMLVersion() 870 */ getXMLVersion()871 public String getXMLVersion() { 872 return "1.0"; 873 } 874 875 /** 876 * @see org.xml.sax.ext.Locator2#getXMLVersion() 877 */ getEncoding()878 public String getEncoding() { 879 try { 880 return encodingDeclarationHandler == null ? 
null : encodingDeclarationHandler.getCharacterEncoding(); 881 } catch (SAXException e) { 882 return null; 883 } 884 } 885 886 // end Locator impl 887 888 // end public API 889 notifyAboutMetaBoundary()890 public void notifyAboutMetaBoundary() { 891 metaBoundaryPassed = true; 892 } 893 894 // ]NOCPP] 895 emptyAttributes()896 HtmlAttributes emptyAttributes() { 897 // [NOCPP[ 898 if (newAttributesEachTime) { 899 return new HtmlAttributes(mappingLangToXmlLang); 900 } else { 901 // ]NOCPP] 902 return HtmlAttributes.EMPTY_ATTRIBUTES; 903 // [NOCPP[ 904 } 905 // ]NOCPP] 906 } 907 appendCharRefBuf(char c)908 @Inline private void appendCharRefBuf(char c) { 909 // CPPONLY: assert charRefBufLen < charRefBuf.length: 910 // CPPONLY: "RELEASE: Attempted to overrun charRefBuf!"; 911 charRefBuf[charRefBufLen++] = c; 912 } 913 emitOrAppendCharRefBuf(int returnState)914 private void emitOrAppendCharRefBuf(int returnState) throws SAXException { 915 if ((returnState & DATA_AND_RCDATA_MASK) != 0) { 916 appendCharRefBufToStrBuf(); 917 } else { 918 if (charRefBufLen > 0) { 919 tokenHandler.characters(charRefBuf, 0, charRefBufLen); 920 charRefBufLen = 0; 921 } 922 } 923 } 924 clearStrBufAfterUse()925 @Inline private void clearStrBufAfterUse() { 926 strBufLen = 0; 927 } 928 clearStrBufBeforeUse()929 @Inline private void clearStrBufBeforeUse() { 930 assert strBufLen == 0: "strBufLen not reset after previous use!"; 931 strBufLen = 0; // no-op in the absence of bugs 932 } 933 clearStrBufAfterOneHyphen()934 @Inline private void clearStrBufAfterOneHyphen() { 935 assert strBufLen == 1: "strBufLen length not one!"; 936 assert strBuf[0] == '-': "strBuf does not start with a hyphen!"; 937 strBufLen = 0; 938 } 939 940 /** 941 * Appends to the buffer. 
942 * 943 * @param c 944 * the UTF-16 code unit to append 945 */ appendStrBuf(char c)946 @Inline private void appendStrBuf(char c) { 947 // CPPONLY: assert strBufLen < strBuf.length: "Previous buffer length insufficient."; 948 // CPPONLY: if (strBufLen == strBuf.length) { 949 // CPPONLY: if (!EnsureBufferSpace(1)) { 950 // CPPONLY: assert false: "RELEASE: Unable to recover from buffer reallocation failure"; 951 // CPPONLY: } // TODO: Add telemetry when outer if fires but inner does not 952 // CPPONLY: } 953 strBuf[strBufLen++] = c; 954 } 955 956 /** 957 * The buffer as a String. Currently only used for error reporting. 958 * 959 * <p> 960 * C++ memory note: The return value must be released. 961 * 962 * @return the buffer as a string 963 */ strBufToString()964 protected String strBufToString() { 965 String str = Portability.newStringFromBuffer(strBuf, 0, strBufLen 966 // CPPONLY: , tokenHandler, !newAttributesEachTime && attributeName == AttributeName.CLASS 967 ); 968 clearStrBufAfterUse(); 969 return str; 970 } 971 972 /** 973 * Returns the buffer as a local name. The return value is released in 974 * emitDoctypeToken(). 975 * 976 * @return the buffer as local name 977 */ strBufToDoctypeName()978 private void strBufToDoctypeName() { 979 doctypeName = Portability.newLocalNameFromBuffer(strBuf, strBufLen, interner); 980 clearStrBufAfterUse(); 981 } 982 983 /** 984 * Emits the buffer as character tokens. 
985 * 986 * @throws SAXException 987 * if the token handler threw 988 */ emitStrBuf()989 private void emitStrBuf() throws SAXException { 990 if (strBufLen > 0) { 991 tokenHandler.characters(strBuf, 0, strBufLen); 992 clearStrBufAfterUse(); 993 } 994 } 995 appendSecondHyphenToBogusComment()996 @Inline private void appendSecondHyphenToBogusComment() throws SAXException { 997 // [NOCPP[ 998 switch (commentPolicy) { 999 case ALTER_INFOSET: 1000 appendStrBuf(' '); 1001 // CPPONLY: MOZ_FALLTHROUGH; 1002 case ALLOW: 1003 warn("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment."); 1004 // ]NOCPP] 1005 appendStrBuf('-'); 1006 // [NOCPP[ 1007 break; 1008 case FATAL: 1009 fatal("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment."); 1010 break; 1011 } 1012 // ]NOCPP] 1013 } 1014 1015 // [NOCPP[ maybeAppendSpaceToBogusComment()1016 private void maybeAppendSpaceToBogusComment() throws SAXException { 1017 switch (commentPolicy) { 1018 case ALTER_INFOSET: 1019 appendStrBuf(' '); 1020 // CPPONLY: MOZ_FALLTHROUGH; 1021 case ALLOW: 1022 warn("The document is not mappable to XML 1.0 due to a trailing hyphen in a comment."); 1023 break; 1024 case FATAL: 1025 fatal("The document is not mappable to XML 1.0 due to a trailing hyphen in a comment."); 1026 break; 1027 } 1028 } 1029 1030 // ]NOCPP] 1031 adjustDoubleHyphenAndAppendToStrBufAndErr(char c)1032 @Inline private void adjustDoubleHyphenAndAppendToStrBufAndErr(char c) 1033 throws SAXException { 1034 errConsecutiveHyphens(); 1035 // [NOCPP[ 1036 switch (commentPolicy) { 1037 case ALTER_INFOSET: 1038 strBufLen--; 1039 // WARNING!!! This expands the worst case of the buffer length 1040 // given the length of input! 
1041 appendStrBuf(' '); 1042 appendStrBuf('-'); 1043 // CPPONLY: MOZ_FALLTHROUGH; 1044 case ALLOW: 1045 warn("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment."); 1046 // ]NOCPP] 1047 appendStrBuf(c); 1048 // [NOCPP[ 1049 break; 1050 case FATAL: 1051 fatal("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment."); 1052 break; 1053 } 1054 // ]NOCPP] 1055 } 1056 appendStrBuf(@oLength char[] buffer, int offset, int length)1057 private void appendStrBuf(@NoLength char[] buffer, int offset, int length) throws SAXException { 1058 int newLen = Portability.checkedAdd(strBufLen, length); 1059 // CPPONLY: assert newLen <= strBuf.length: "Previous buffer length insufficient."; 1060 // CPPONLY: if (strBuf.length < newLen) { 1061 // CPPONLY: if (!EnsureBufferSpace(length)) { 1062 // CPPONLY: assert false: "RELEASE: Unable to recover from buffer reallocation failure"; 1063 // CPPONLY: } // TODO: Add telemetry when outer if fires but inner does not 1064 // CPPONLY: } 1065 System.arraycopy(buffer, offset, strBuf, strBufLen, length); 1066 strBufLen = newLen; 1067 } 1068 1069 /** 1070 * Append the contents of the char reference buffer to the main one. 1071 */ appendCharRefBufToStrBuf()1072 @Inline private void appendCharRefBufToStrBuf() throws SAXException { 1073 appendStrBuf(charRefBuf, 0, charRefBufLen); 1074 charRefBufLen = 0; 1075 } 1076 1077 /** 1078 * Emits the current comment token. 1079 * 1080 * @param pos 1081 * TODO 1082 * 1083 * @throws SAXException 1084 */ emitComment(int provisionalHyphens, int pos)1085 private void emitComment(int provisionalHyphens, int pos) 1086 throws SAXException { 1087 // [NOCPP[ 1088 if (wantsComments) { 1089 // ]NOCPP] 1090 tokenHandler.comment(strBuf, 0, strBufLen 1091 - provisionalHyphens); 1092 // [NOCPP[ 1093 } 1094 // ]NOCPP] 1095 clearStrBufAfterUse(); 1096 cstart = pos + 1; 1097 } 1098 1099 /** 1100 * Flushes coalesced character tokens. 
     *
     * If <code>pos</code> is greater than <code>cstart</code>, the chars in
     * the range from <code>cstart</code> (inclusive) to <code>pos</code>
     * (exclusive) are passed to the token handler. In every case
     * <code>cstart</code> is then set to <code>Integer.MAX_VALUE</code>, which
     * marks that no run is currently being coalesced.
     *
     * @param buf
     *            the buffer holding the run of characters
     * @param pos
     *            the index just past the last char of the run
     *
     * @throws SAXException
     *             if the token handler threw
     */
    protected void flushChars(@NoLength char[] buf, int pos)
            throws SAXException {
        if (pos > cstart) {
            tokenHandler.characters(buf, cstart, pos - cstart);
        }
        cstart = Integer.MAX_VALUE;
    }

    /**
     * Reports a condition that would make the infoset incompatible with XML
     * 1.0 as fatal.
     *
     * The error handler, if one is set, is notified first; the exception is
     * then thrown unconditionally, so this method never returns normally.
     *
     * @param message
     *            the message
     * @throws SAXException
     *             if the error handler threw
     * @throws SAXParseException
     *             always (unless the error handler threw first); carries
     *             <code>message</code> and this tokenizer as the locator
     */
    public void fatal(String message) throws SAXException {
        SAXParseException spe = new SAXParseException(message, this);
        if (errorHandler != null) {
            errorHandler.fatalError(spe);
        }
        throw spe;
    }

    /**
     * Reports a Parse Error.
     *
     * Does nothing when no error handler is set.
     *
     * @param message
     *            the message
     * @throws SAXException
     *             if the error handler threw
     */
    public void err(String message) throws SAXException {
        if (errorHandler == null) {
            return;
        }
        SAXParseException spe = new SAXParseException(message, this);
        errorHandler.error(spe);
    }

    /**
     * Reports an error, preferring the error handler of the tree builder.
     *
     * If the token handler is a <code>TreeBuilder</code>, its error handler is
     * used; when that is unavailable, the tokenizer's own error handler is
     * used instead. Does nothing when neither handler is set.
     *
     * @param message
     *            the message
     * @throws SAXException
     *             if the chosen error handler threw
     */
    public void errTreeBuilder(String message) throws SAXException {
        ErrorHandler eh = null;
        if (tokenHandler instanceof TreeBuilder<?>) {
            TreeBuilder<?> treeBuilder = (TreeBuilder<?>) tokenHandler;
            eh = treeBuilder.getErrorHandler();
        }
        if (eh == null) {
            // Fall back to the tokenizer's own handler.
            eh = errorHandler;
        }
        if (eh == null) {
            return;
        }
        SAXParseException spe = new SAXParseException(message, this);
        eh.error(spe);
    }

    /**
     * Reports a warning
     *
     * Does nothing when no error handler is set.
     *
     * @param message
     *            the message
     * @throws SAXException
     *             if the error handler threw
     */
    public void warn(String message) throws SAXException {
        if (errorHandler == null) {
            return;
        }
        SAXParseException spe = new SAXParseException(message, this);
        errorHandler.warning(spe);
    }

    /**
     * Resolves the tag name accumulated in <code>strBuf</code> into
     * <code>tagName</code>, then resets <code>containsHyphen</code> and clears
     * the buffer.
     *
     * A name containing a hyphen is either <code>annotation-xml</code> or is
     * stored in the reusable <code>nonInternedTagName</code>. A name without a
     * hyphen is looked up with <code>ElementName.elementNameByBuffer</code>;
     * when the lookup returns <code>null</code>,
     * <code>nonInternedTagName</code> is used as well.
     */
    private void strBufToElementNameString() {
        if (containsHyphen) {
            // We've got a custom element or annotation-xml.
            @Local String annotationName = ElementName.ANNOTATION_XML.getName();
            if (Portability.localEqualsBuffer(annotationName, strBuf, strBufLen)) {
                tagName = ElementName.ANNOTATION_XML;
            } else {
                nonInternedTagName.setNameForNonInterned(Portability.newLocalNameFromBuffer(strBuf, strBufLen,
                        interner)
                        // CPPONLY: , true
                        );
                tagName = nonInternedTagName;
            }
        } else {
            tagName = ElementName.elementNameByBuffer(strBuf, strBufLen, interner);
            if (tagName == null) {
                // No predefined element name matched the buffer.
                nonInternedTagName.setNameForNonInterned(Portability.newLocalNameFromBuffer(strBuf, strBufLen,
                        interner)
                        // CPPONLY: , false
                        );
                tagName = nonInternedTagName;
            }
        }
        containsHyphen = false;
        clearStrBufAfterUse();
    }

    /**
     * Emits the current tag token (start tag or end tag, depending on
     * <code>endTag</code>) to the token handler and resets the per-tag state.
     *
     * @param selfClosing
     *            whether the tag was written with a trailing solidus; passed on
     *            to <code>startTag</code> for start tags and to
     *            <code>maybeErrSlashInEndTag</code>
     * @param pos
     *            the current position in the input buffer; <code>cstart</code>
     *            is set to the index right after it
     * @return the value of <code>stateSave</code> after the token handler has
     *         run; it is set to <code>Tokenizer.DATA</code> before the call,
     *         but the token handler may have changed it
     * @throws SAXException
     *             if error reporting or the token handler threw
     */
    private int emitCurrentTagToken(boolean selfClosing, int pos)
            throws SAXException {
        cstart = pos + 1;
        maybeErrSlashInEndTag(selfClosing);
        stateSave = Tokenizer.DATA;
        // Never hand a null attribute holder to the callbacks below.
        HtmlAttributes attrs = (attributes == null ? HtmlAttributes.EMPTY_ATTRIBUTES
                : attributes);
        if (endTag) {
            /*
             * When an end tag token is emitted, the content model flag must be
             * switched to the PCDATA state.
             */
            maybeErrAttributesOnEndTag(attrs);
            // CPPONLY: if (!viewingXmlSource) {
            tokenHandler.endTag(tagName);
            // CPPONLY: }
            // CPPONLY: if (newAttributesEachTime) {
            // CPPONLY:     Portability.delete(attributes);
            // CPPONLY:     attributes = null;
            // CPPONLY: }
        } else {
            // CPPONLY: if (viewingXmlSource) {
            // CPPONLY:     assert newAttributesEachTime;
            // CPPONLY:     Portability.delete(attributes);
            // CPPONLY:     attributes = null;
            // CPPONLY: } else {
            tokenHandler.startTag(tagName, attrs, selfClosing);
            // CPPONLY: }
        }
        tagName = null;
        if (newAttributesEachTime) {
            attributes = null;
        } else {
            // NOTE(review): attributes is dereferenced without a null check
            // here; presumably it is always allocated up front when
            // newAttributesEachTime is false - confirm in the initialization
            // code.
            attributes.clear(mappingLangToXmlLang);
        }
        /*
         * The token handler may have called setStateAndEndTagExpectation
         * and changed stateSave since the start of this method.
         */
        return stateSave;
    }

    /**
     * Resolves the attribute name accumulated in <code>strBuf</code> into
     * <code>attributeName</code>, clears the buffer, makes sure an attribute
     * holder exists, and drops the name if the holder already contains it.
     *
     * After this method returns, <code>attributeName</code> is
     * <code>null</code> if the attribute was a duplicate.
     *
     * @throws SAXException
     *             if reporting the duplicate attribute threw
     */
    private void attributeNameComplete() throws SAXException {
        attributeName = AttributeName.nameByBuffer(strBuf, strBufLen, interner);
        if (attributeName == null) {
            // Not a predefined attribute name; create one from the buffer.
            // [NOCPP[
            attributeName = AttributeName.createAttributeName(
                    Portability.newLocalNameFromBuffer(strBuf, strBufLen,
                            interner),
                    namePolicy != XmlViolationPolicy.ALLOW);
            // ]NOCPP]
            // CPPONLY:     nonInternedAttributeName.setNameForNonInterned(Portability.newLocalNameFromBuffer(strBuf, strBufLen, interner));
            // CPPONLY:     attributeName = nonInternedAttributeName;
        }
        clearStrBufAfterUse();

        if (attributes == null) {
            attributes = new HtmlAttributes(mappingLangToXmlLang);
        }

        /*
         * When the user agent leaves the attribute name state (and before
         * emitting the tag token, if appropriate), the complete attribute's
         * name must be compared to the other attributes on the same token; if
         * there is already an attribute on the token with the exact same name,
         * then this is a parse error and the new attribute must be dropped,
         * along with the value that gets associated with it (if any).
         */
        if (attributes.contains(attributeName)) {
            errDuplicateAttribute();
            // A null name tells the add-attribute methods to drop the value.
            attributeName = null;
        }
    }

    /**
     * Adds the current attribute to <code>attributes</code> with an empty
     * string as its value and resets <code>attributeName</code> to
     * <code>null</code>.
     *
     * When <code>attributeName</code> is already <code>null</code> (for
     * example after <code>attributeNameComplete()</code> detected a
     * duplicate), nothing is added and the buffer is cleared instead.
     *
     * @throws SAXException
     *             if error or warning reporting threw
     */
    private void addAttributeWithoutValue() throws SAXException {
        noteAttributeWithoutValue();

        // [NOCPP[
        if (metaBoundaryPassed && AttributeName.CHARSET == attributeName
                && ElementName.META == tagName) {
            err("A \u201Ccharset\u201D attribute on a \u201Cmeta\u201D element found after the first 1024 bytes.");
        }
        // ]NOCPP]
        if (attributeName != null) {
            // [NOCPP[
            if (AttributeName.SRC == attributeName
                    || AttributeName.HREF == attributeName) {
                warn("Attribute \u201C"
                        + attributeName.getLocal(AttributeName.HTML)
                        + "\u201D without an explicit value seen. The attribute may be dropped by IE7.");
            }
            // ]NOCPP]
            attributes.addAttribute(attributeName,
                    Portability.newEmptyString()
                    // [NOCPP[
                    , xmlnsPolicy
            // ]NOCPP]
            // CPPONLY: , attributeLine
            );
            attributeName = null;
        } else {
            clearStrBufAfterUse();
        }
    }

    /**
     * Adds the current attribute to <code>attributes</code> with the value
     * obtained from <code>strBufToString()</code> and resets
     * <code>attributeName</code> to <code>null</code>.
     *
     * When <code>attributeName</code> is already <code>null</code>, the
     * attribute is a duplicate: nothing is added and the buffered value is
     * discarded.
     *
     * @throws SAXException
     *             if error reporting threw
     */
    private void addAttributeWithValue() throws SAXException {
        // [NOCPP[
        if (metaBoundaryPassed && ElementName.META == tagName
                && AttributeName.CHARSET == attributeName) {
            err("A \u201Ccharset\u201D attribute on a \u201Cmeta\u201D element found after the first 1024 bytes.");
        }
        // ]NOCPP]
        if (attributeName != null) {
            String val = strBufToString(); // Ownership transferred to
            // HtmlAttributes
            // CPPONLY: if (mViewSource) {
            // CPPONLY:   mViewSource.MaybeLinkifyAttributeValue(attributeName, val);
            // CPPONLY: }
            attributes.addAttribute(attributeName, val
            // [NOCPP[
                    , xmlnsPolicy
            // ]NOCPP]
            // CPPONLY: , attributeLine
            );
            attributeName = null;
        } else {
            // We have a duplicate attribute. Explicitly discard its value.
            clearStrBufAfterUse();
        }
    }

    // [NOCPP[

    /**
     * Empty hook that {@link #start()} calls after the token handler has been
     * told that tokenization starts. It intentionally does nothing here.
     *
     * @throws SAXException
     *             never thrown by this implementation
     */
    protected void startErrorReporting() throws SAXException {

    }

    // ]NOCPP]

    /**
     * Starts tokenization: initializes the tokenizer, notifies the token
     * handler through <code>startTokenization</code>, and finally calls
     * {@link #startErrorReporting()}.
     *
     * @throws SAXException
     *             if initialization, the token handler or the hook threw
     */
    public void start() throws SAXException {
        initializeWithoutStarting();
        tokenHandler.startTokenization(this);
        // [NOCPP[
        startErrorReporting();
        // ]NOCPP]
    }

    /**
     * Tokenizes the unread part of <code>buffer</code>, from
     * <code>buffer.getStart()</code> up to <code>buffer.getEnd()</code>,
     * resuming from the saved state.
     *
     * On return the start of the buffer has been advanced: to the end of the
     * buffer when the whole buffer was consumed, otherwise to the index right
     * after the last position the state loop reported.
     *
     * @param buffer
     *            the buffer to tokenize
     * @return the value of <code>lastCR</code> after the state loop has run;
     *         it is reset to <code>false</code> on entry
     * @throws SAXException
     *             if the state loop or the token handler threw
     */
    public boolean tokenizeBuffer(UTF16Buffer buffer) throws SAXException {
        int state = stateSave;
        int returnState = returnStateSave;
        char c = '\u0000';
        shouldSuspend = false;
        lastCR = false;

        int start = buffer.getStart();
        int end = buffer.getEnd();

        // In C++, the caller of tokenizeBuffer needs to do this explicitly.
        // [NOCPP[
        ensureBufferSpace(end - start);
        // ]NOCPP]

        /**
         * The index of the last <code>char</code> read from <code>buf</code>.
         */
        int pos = start - 1;

        /**
         * The index of the first <code>char</code> in <code>buf</code> that is
         * part of a coalesced run of character tokens or
         * <code>Integer.MAX_VALUE</code> if there is not a current run being
         * coalesced.
         */
        // The comment above describes cstart, which the switch below
        // initializes depending on the state being resumed.
        switch (state) {
            case DATA:
            case RCDATA:
            case SCRIPT_DATA:
            case PLAINTEXT:
            case RAWTEXT:
            case CDATA_SECTION:
            case SCRIPT_DATA_ESCAPED:
            case SCRIPT_DATA_ESCAPE_START:
            case SCRIPT_DATA_ESCAPE_START_DASH:
            case SCRIPT_DATA_ESCAPED_DASH:
            case SCRIPT_DATA_ESCAPED_DASH_DASH:
            case SCRIPT_DATA_DOUBLE_ESCAPE_START:
            case SCRIPT_DATA_DOUBLE_ESCAPED:
            case SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN:
            case SCRIPT_DATA_DOUBLE_ESCAPED_DASH:
            case SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH:
            case SCRIPT_DATA_DOUBLE_ESCAPE_END:
                cstart = start;
                break;
            default:
                cstart = Integer.MAX_VALUE;
                break;
        }

        /**
         * The number of <code>char</code>s in <code>buf</code> that have
         * meaning. (The rest of the array is garbage and should not be
         * examined.)
         */
        // CPPONLY: if (mViewSource) {
        // CPPONLY:   mViewSource.SetBuffer(buffer);
        // CPPONLY:   pos = stateLoop(state, c, pos, buffer.getBuffer(), false, returnState, buffer.getEnd());
        // CPPONLY:   mViewSource.DropBuffer((pos == buffer.getEnd()) ? pos : pos + 1);
        // CPPONLY: } else {
        // CPPONLY:   pos = stateLoop(state, c, pos, buffer.getBuffer(), false, returnState, buffer.getEnd());
        // CPPONLY: }
        // [NOCPP[
        pos = stateLoop(state, c, pos, buffer.getBuffer(), false, returnState,
                end);
        // ]NOCPP]
        if (pos == end) {
            // exiting due to end of buffer
            buffer.setStart(pos);
        } else {
            buffer.setStart(pos + 1);
        }
        return lastCR;
    }

    // [NOCPP[

    /**
     * Makes sure that the token handler and <code>strBuf</code> can hold the
     * worst-case amount of data produced by tokenizing
     * <code>inputLength</code> more chars.
     *
     * The worst case is the current buffer contents plus the input length plus
     * the char reference buffer length plus two. The token handler is asked to
     * reserve that much. For <code>strBuf</code> the figure is doubled when
     * the comment policy is <code>ALTER_INFOSET</code>. <code>strBuf</code> is
     * allocated if it does not exist yet, or replaced by a larger array
     * holding the same contents if it is too small.
     *
     * NOTE(review): the arithmetic uses plain <code>int</code> operations with
     * no overflow check - confirm that input sizes keep it in range.
     *
     * @param inputLength
     *            the number of input chars about to be tokenized
     * @throws SAXException
     *             if the token handler threw
     */
    private void ensureBufferSpace(int inputLength) throws SAXException {
        // Add 2 to account for emissions of LT_GT, LT_SOLIDUS and RSQB_RSQB.
        // Adding to the general worst case instead of only the
        // TreeBuilder-exposed worst case to avoid re-introducing a bug when
        // unifying the tokenizer and tree builder buffers in the future.
        int worstCase = strBufLen + inputLength + charRefBufLen + 2;
        tokenHandler.ensureBufferSpace(worstCase);
        if (commentPolicy == XmlViolationPolicy.ALTER_INFOSET) {
            // When altering infoset, if the comment contents are consecutive
            // hyphens, each hyphen generates a space, too. These buffer
            // contents never get emitted as characters() to the tokenHandler,
            // which is why this calculation happens after the call to
            // ensureBufferSpace on tokenHandler.
            worstCase *= 2;
        }
        if (strBuf == null) {
            // Add an arbitrary small value to avoid immediate reallocation
            // once there are a few characters in the buffer.
            strBuf = new char[worstCase + 128];
        } else if (worstCase > strBuf.length) {
            // HotSpot reportedly allocates memory with 8-byte accuracy, so
            // there's no point in trying to do math here to avoid slop.
            // Maybe we should add some small constant to worstCase here
            // but not doing that without profiling. In C++ with jemalloc,
            // the corresponding method should do math to round up here
            // to avoid slop.
            char[] newBuf = new char[worstCase];
            // Only the first strBufLen chars carry data worth preserving.
            System.arraycopy(strBuf, 0, newBuf, 0, strBufLen);
            strBuf = newBuf;
        }
    }
    // ]NOCPP]

    @SuppressWarnings("unused") private int stateLoop(int state, char c,
            int pos, @NoLength char[] buf, boolean reconsume, int returnState,
            int endPos) throws SAXException {
        /*
         * Idioms used in this code:
         *
         *
         * Consuming the next input character
         *
         * To consume the next input character, the code does this: if (++pos ==
         * endPos) { break stateloop; } c = checkChar(buf, pos);
         *
         *
         * Staying in a state
         *
         * When there's a state that the tokenizer may stay in over multiple
         * input characters, the state has a wrapper |for(;;)| loop and staying
         * in the state continues the loop.
         *
         *
         * Switching to another state
         *
         * To switch to another state, the code sets the state variable to the
         * magic number of the new state. Then it either continues stateloop or
         * breaks out of the state's own wrapper loop if the target state is
         * right after the current state in source order. (This is a partial
         * workaround for Java's lack of goto.)
         *
         *
         * Reconsume support
         *
         * The spec sometimes says that an input character is reconsumed in
         * another state. If a state can ever be entered so that an input
         * character can be reconsumed in it, the state's code starts with an
         * |if (reconsume)| that sets reconsume to false and skips over the
         * normal code for consuming a new character.
         *
         * To reconsume the current character in another state, the code sets
         * |reconsume| to true and then switches to the other state.
         *
         *
         * Emitting character tokens
         *
         * This method emits character tokens lazily.
Whenever a new range of 1508 * character tokens starts, the field cstart must be set to the start 1509 * index of the range. The flushChars() method must be called at the end 1510 * of a range to flush it. 1511 * 1512 * 1513 * U+0000 handling 1514 * 1515 * The various states have to handle the replacement of U+0000 with 1516 * U+FFFD. However, if U+0000 would be reconsumed in another state, the 1517 * replacement doesn't need to happen, because it's handled by the 1518 * reconsuming state. 1519 * 1520 * 1521 * LF handling 1522 * 1523 * Every state needs to increment the line number upon LF unless the LF 1524 * gets reconsumed by another state which increments the line number. 1525 * 1526 * 1527 * CR handling 1528 * 1529 * Every state needs to handle CR unless the CR gets reconsumed and is 1530 * handled by the reconsuming state. The CR needs to be handled as if it 1531 * were and LF, the lastCR field must be set to true and then this 1532 * method must return. The IO driver will then swallow the next 1533 * character if it is an LF to coalesce CRLF. 1534 */ 1535 stateloop: for (;;) { 1536 switch (state) { 1537 case DATA: 1538 dataloop: for (;;) { 1539 if (reconsume) { 1540 reconsume = false; 1541 } else { 1542 if (++pos == endPos) { 1543 break stateloop; 1544 } 1545 c = checkChar(buf, pos); 1546 } 1547 switch (c) { 1548 case '&': 1549 /* 1550 * U+0026 AMPERSAND (&) Switch to the character 1551 * reference in data state. 1552 */ 1553 flushChars(buf, pos); 1554 assert charRefBufLen == 0: "charRefBufLen not reset after previous use!"; 1555 appendCharRefBuf(c); 1556 setAdditionalAndRememberAmpersandLocation('\u0000'); 1557 returnState = state; 1558 state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos); 1559 continue stateloop; 1560 case '<': 1561 /* 1562 * U+003C LESS-THAN SIGN (<) Switch to the tag 1563 * open state. 
1564 */ 1565 flushChars(buf, pos); 1566 1567 state = transition(state, Tokenizer.TAG_OPEN, reconsume, pos); 1568 break dataloop; // FALL THROUGH continue 1569 // stateloop; 1570 case '\u0000': 1571 emitReplacementCharacter(buf, pos); 1572 continue; 1573 case '\r': 1574 emitCarriageReturn(buf, pos); 1575 break stateloop; 1576 case '\n': 1577 silentLineFeed(); 1578 // CPPONLY: MOZ_FALLTHROUGH; 1579 default: 1580 /* 1581 * Anything else Emit the input character as a 1582 * character token. 1583 * 1584 * Stay in the data state. 1585 */ 1586 continue; 1587 } 1588 } 1589 // CPPONLY: MOZ_FALLTHROUGH; 1590 case TAG_OPEN: 1591 tagopenloop: for (;;) { 1592 /* 1593 * The behavior of this state depends on the content 1594 * model flag. 1595 */ 1596 if (++pos == endPos) { 1597 break stateloop; 1598 } 1599 c = checkChar(buf, pos); 1600 /* 1601 * If the content model flag is set to the PCDATA state 1602 * Consume the next input character: 1603 */ 1604 if (c >= 'A' && c <= 'Z') { 1605 /* 1606 * U+0041 LATIN CAPITAL LETTER A through to U+005A 1607 * LATIN CAPITAL LETTER Z Create a new start tag 1608 * token, 1609 */ 1610 endTag = false; 1611 /* 1612 * set its tag name to the lowercase version of the 1613 * input character (add 0x0020 to the character's 1614 * code point), 1615 */ 1616 clearStrBufBeforeUse(); 1617 appendStrBuf((char) (c + 0x20)); 1618 containsHyphen = false; 1619 /* then switch to the tag name state. */ 1620 state = transition(state, Tokenizer.TAG_NAME, reconsume, pos); 1621 /* 1622 * (Don't emit the token yet; further details will 1623 * be filled in before it is emitted.) 
1624 */ 1625 break tagopenloop; 1626 // continue stateloop; 1627 } else if (c >= 'a' && c <= 'z') { 1628 /* 1629 * U+0061 LATIN SMALL LETTER A through to U+007A 1630 * LATIN SMALL LETTER Z Create a new start tag 1631 * token, 1632 */ 1633 endTag = false; 1634 /* 1635 * set its tag name to the input character, 1636 */ 1637 clearStrBufBeforeUse(); 1638 appendStrBuf(c); 1639 containsHyphen = false; 1640 /* then switch to the tag name state. */ 1641 state = transition(state, Tokenizer.TAG_NAME, reconsume, pos); 1642 /* 1643 * (Don't emit the token yet; further details will 1644 * be filled in before it is emitted.) 1645 */ 1646 break tagopenloop; 1647 // continue stateloop; 1648 } 1649 switch (c) { 1650 case '!': 1651 /* 1652 * U+0021 EXCLAMATION MARK (!) Switch to the 1653 * markup declaration open state. 1654 */ 1655 state = transition(state, Tokenizer.MARKUP_DECLARATION_OPEN, reconsume, pos); 1656 continue stateloop; 1657 case '/': 1658 /* 1659 * U+002F SOLIDUS (/) Switch to the close tag 1660 * open state. 1661 */ 1662 state = transition(state, Tokenizer.CLOSE_TAG_OPEN, reconsume, pos); 1663 continue stateloop; 1664 case '?': 1665 // CPPONLY: if (viewingXmlSource) { 1666 // CPPONLY: state = transition(state, 1667 // CPPONLY: Tokenizer.PROCESSING_INSTRUCTION, 1668 // CPPONLY: reconsume, 1669 // CPPONLY: pos); 1670 // CPPONLY: continue stateloop; 1671 // CPPONLY: } 1672 /* 1673 * U+003F QUESTION MARK (?) Parse error. 1674 */ 1675 errProcessingInstruction(); 1676 /* 1677 * Switch to the bogus comment state. 1678 */ 1679 clearStrBufBeforeUse(); 1680 appendStrBuf(c); 1681 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); 1682 continue stateloop; 1683 case '>': 1684 /* 1685 * U+003E GREATER-THAN SIGN (>) Parse error. 1686 */ 1687 errLtGt(); 1688 /* 1689 * Emit a U+003C LESS-THAN SIGN character token 1690 * and a U+003E GREATER-THAN SIGN character 1691 * token. 1692 */ 1693 tokenHandler.characters(Tokenizer.LT_GT, 0, 2); 1694 /* Switch to the data state. 
*/ 1695 cstart = pos + 1; 1696 state = transition(state, Tokenizer.DATA, reconsume, pos); 1697 continue stateloop; 1698 default: 1699 /* 1700 * Anything else Parse error. 1701 */ 1702 errBadCharAfterLt(c); 1703 /* 1704 * Emit a U+003C LESS-THAN SIGN character token 1705 */ 1706 tokenHandler.characters(Tokenizer.LT_GT, 0, 1); 1707 /* 1708 * and reconsume the current input character in 1709 * the data state. 1710 */ 1711 cstart = pos; 1712 reconsume = true; 1713 state = transition(state, Tokenizer.DATA, reconsume, pos); 1714 continue stateloop; 1715 } 1716 } 1717 // CPPONLY: MOZ_FALLTHROUGH; 1718 case TAG_NAME: 1719 tagnameloop: for (;;) { 1720 if (++pos == endPos) { 1721 break stateloop; 1722 } 1723 c = checkChar(buf, pos); 1724 /* 1725 * Consume the next input character: 1726 */ 1727 switch (c) { 1728 case '\r': 1729 silentCarriageReturn(); 1730 strBufToElementNameString(); 1731 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); 1732 break stateloop; 1733 case '\n': 1734 silentLineFeed(); 1735 // CPPONLY: MOZ_FALLTHROUGH; 1736 case ' ': 1737 case '\t': 1738 case '\u000C': 1739 /* 1740 * U+0009 CHARACTER TABULATION U+000A LINE FEED 1741 * (LF) U+000C FORM FEED (FF) U+0020 SPACE 1742 * Switch to the before attribute name state. 1743 */ 1744 strBufToElementNameString(); 1745 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); 1746 break tagnameloop; 1747 // continue stateloop; 1748 case '/': 1749 /* 1750 * U+002F SOLIDUS (/) Switch to the self-closing 1751 * start tag state. 1752 */ 1753 strBufToElementNameString(); 1754 state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos); 1755 continue stateloop; 1756 case '>': 1757 /* 1758 * U+003E GREATER-THAN SIGN (>) Emit the current 1759 * tag token. 
1760 */ 1761 strBufToElementNameString(); 1762 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos); 1763 if (shouldSuspend) { 1764 break stateloop; 1765 } 1766 /* 1767 * Switch to the data state. 1768 */ 1769 continue stateloop; 1770 case '\u0000': 1771 c = '\uFFFD'; 1772 // CPPONLY: MOZ_FALLTHROUGH; 1773 default: 1774 if (c >= 'A' && c <= 'Z') { 1775 /* 1776 * U+0041 LATIN CAPITAL LETTER A through to 1777 * U+005A LATIN CAPITAL LETTER Z Append the 1778 * lowercase version of the current input 1779 * character (add 0x0020 to the character's 1780 * code point) to the current tag token's 1781 * tag name. 1782 */ 1783 c += 0x20; 1784 } else if (c == '-') { 1785 containsHyphen = true; 1786 } 1787 /* 1788 * Anything else Append the current input 1789 * character to the current tag token's tag 1790 * name. 1791 */ 1792 appendStrBuf(c); 1793 /* 1794 * Stay in the tag name state. 1795 */ 1796 continue; 1797 } 1798 } 1799 // CPPONLY: MOZ_FALLTHROUGH; 1800 case BEFORE_ATTRIBUTE_NAME: 1801 beforeattributenameloop: for (;;) { 1802 if (reconsume) { 1803 reconsume = false; 1804 } else { 1805 if (++pos == endPos) { 1806 break stateloop; 1807 } 1808 c = checkChar(buf, pos); 1809 } 1810 /* 1811 * Consume the next input character: 1812 */ 1813 switch (c) { 1814 case '\r': 1815 silentCarriageReturn(); 1816 break stateloop; 1817 case '\n': 1818 silentLineFeed(); 1819 // CPPONLY: MOZ_FALLTHROUGH; 1820 case ' ': 1821 case '\t': 1822 case '\u000C': 1823 /* 1824 * U+0009 CHARACTER TABULATION U+000A LINE FEED 1825 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay 1826 * in the before attribute name state. 1827 */ 1828 continue; 1829 case '/': 1830 /* 1831 * U+002F SOLIDUS (/) Switch to the self-closing 1832 * start tag state. 1833 */ 1834 state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos); 1835 continue stateloop; 1836 case '>': 1837 /* 1838 * U+003E GREATER-THAN SIGN (>) Emit the current 1839 * tag token. 
1840 */ 1841 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos); 1842 if (shouldSuspend) { 1843 break stateloop; 1844 } 1845 /* 1846 * Switch to the data state. 1847 */ 1848 continue stateloop; 1849 case '\u0000': 1850 c = '\uFFFD'; 1851 // CPPONLY: MOZ_FALLTHROUGH; 1852 case '\"': 1853 case '\'': 1854 case '<': 1855 case '=': 1856 /* 1857 * U+0022 QUOTATION MARK (") U+0027 APOSTROPHE 1858 * (') U+003C LESS-THAN SIGN (<) U+003D EQUALS 1859 * SIGN (=) Parse error. 1860 */ 1861 errBadCharBeforeAttributeNameOrNull(c); 1862 /* 1863 * Treat it as per the "anything else" entry 1864 * below. 1865 */ 1866 // CPPONLY: MOZ_FALLTHROUGH; 1867 default: 1868 /* 1869 * Anything else Start a new attribute in the 1870 * current tag token. 1871 */ 1872 if (c >= 'A' && c <= 'Z') { 1873 /* 1874 * U+0041 LATIN CAPITAL LETTER A through to 1875 * U+005A LATIN CAPITAL LETTER Z Set that 1876 * attribute's name to the lowercase version 1877 * of the current input character (add 1878 * 0x0020 to the character's code point) 1879 */ 1880 c += 0x20; 1881 } 1882 // CPPONLY: attributeLine = line; 1883 /* 1884 * Set that attribute's name to the current 1885 * input character, 1886 */ 1887 clearStrBufBeforeUse(); 1888 appendStrBuf(c); 1889 /* 1890 * and its value to the empty string. 1891 */ 1892 // Will do later. 1893 /* 1894 * Switch to the attribute name state. 
1895 */ 1896 state = transition(state, Tokenizer.ATTRIBUTE_NAME, reconsume, pos); 1897 break beforeattributenameloop; 1898 // continue stateloop; 1899 } 1900 } 1901 // CPPONLY: MOZ_FALLTHROUGH; 1902 case ATTRIBUTE_NAME: 1903 attributenameloop: for (;;) { 1904 if (++pos == endPos) { 1905 break stateloop; 1906 } 1907 c = checkChar(buf, pos); 1908 /* 1909 * Consume the next input character: 1910 */ 1911 switch (c) { 1912 case '\r': 1913 silentCarriageReturn(); 1914 attributeNameComplete(); 1915 state = transition(state, Tokenizer.AFTER_ATTRIBUTE_NAME, reconsume, pos); 1916 break stateloop; 1917 case '\n': 1918 silentLineFeed(); 1919 // CPPONLY: MOZ_FALLTHROUGH; 1920 case ' ': 1921 case '\t': 1922 case '\u000C': 1923 /* 1924 * U+0009 CHARACTER TABULATION U+000A LINE FEED 1925 * (LF) U+000C FORM FEED (FF) U+0020 SPACE 1926 * Switch to the after attribute name state. 1927 */ 1928 attributeNameComplete(); 1929 state = transition(state, Tokenizer.AFTER_ATTRIBUTE_NAME, reconsume, pos); 1930 continue stateloop; 1931 case '/': 1932 /* 1933 * U+002F SOLIDUS (/) Switch to the self-closing 1934 * start tag state. 1935 */ 1936 attributeNameComplete(); 1937 addAttributeWithoutValue(); 1938 state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos); 1939 continue stateloop; 1940 case '=': 1941 /* 1942 * U+003D EQUALS SIGN (=) Switch to the before 1943 * attribute value state. 1944 */ 1945 attributeNameComplete(); 1946 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_VALUE, reconsume, pos); 1947 break attributenameloop; 1948 // continue stateloop; 1949 case '>': 1950 /* 1951 * U+003E GREATER-THAN SIGN (>) Emit the current 1952 * tag token. 1953 */ 1954 attributeNameComplete(); 1955 addAttributeWithoutValue(); 1956 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos); 1957 if (shouldSuspend) { 1958 break stateloop; 1959 } 1960 /* 1961 * Switch to the data state. 
1962 */ 1963 continue stateloop; 1964 case '\u0000': 1965 c = '\uFFFD'; 1966 // CPPONLY: MOZ_FALLTHROUGH; 1967 case '\"': 1968 case '\'': 1969 case '<': 1970 /* 1971 * U+0022 QUOTATION MARK (") U+0027 APOSTROPHE 1972 * (') U+003C LESS-THAN SIGN (<) Parse error. 1973 */ 1974 errQuoteOrLtInAttributeNameOrNull(c); 1975 /* 1976 * Treat it as per the "anything else" entry 1977 * below. 1978 */ 1979 // CPPONLY: MOZ_FALLTHROUGH; 1980 default: 1981 if (c >= 'A' && c <= 'Z') { 1982 /* 1983 * U+0041 LATIN CAPITAL LETTER A through to 1984 * U+005A LATIN CAPITAL LETTER Z Append the 1985 * lowercase version of the current input 1986 * character (add 0x0020 to the character's 1987 * code point) to the current attribute's 1988 * name. 1989 */ 1990 c += 0x20; 1991 } 1992 /* 1993 * Anything else Append the current input 1994 * character to the current attribute's name. 1995 */ 1996 appendStrBuf(c); 1997 /* 1998 * Stay in the attribute name state. 1999 */ 2000 continue; 2001 } 2002 } 2003 // CPPONLY: MOZ_FALLTHROUGH; 2004 case BEFORE_ATTRIBUTE_VALUE: 2005 beforeattributevalueloop: for (;;) { 2006 if (++pos == endPos) { 2007 break stateloop; 2008 } 2009 c = checkChar(buf, pos); 2010 /* 2011 * Consume the next input character: 2012 */ 2013 switch (c) { 2014 case '\r': 2015 silentCarriageReturn(); 2016 break stateloop; 2017 case '\n': 2018 silentLineFeed(); 2019 // CPPONLY: MOZ_FALLTHROUGH; 2020 case ' ': 2021 case '\t': 2022 case '\u000C': 2023 /* 2024 * U+0009 CHARACTER TABULATION U+000A LINE FEED 2025 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay 2026 * in the before attribute value state. 2027 */ 2028 continue; 2029 case '"': 2030 /* 2031 * U+0022 QUOTATION MARK (") Switch to the 2032 * attribute value (double-quoted) state. 
2033 */ 2034 // CPPONLY: attributeLine = line; 2035 clearStrBufBeforeUse(); 2036 state = transition(state, Tokenizer.ATTRIBUTE_VALUE_DOUBLE_QUOTED, reconsume, pos); 2037 break beforeattributevalueloop; 2038 // continue stateloop; 2039 case '&': 2040 /* 2041 * U+0026 AMPERSAND (&) Switch to the attribute 2042 * value (unquoted) state and reconsume this 2043 * input character. 2044 */ 2045 // CPPONLY: attributeLine = line; 2046 clearStrBufBeforeUse(); 2047 reconsume = true; 2048 state = transition(state, Tokenizer.ATTRIBUTE_VALUE_UNQUOTED, reconsume, pos); 2049 noteUnquotedAttributeValue(); 2050 continue stateloop; 2051 case '\'': 2052 /* 2053 * U+0027 APOSTROPHE (') Switch to the attribute 2054 * value (single-quoted) state. 2055 */ 2056 // CPPONLY: attributeLine = line; 2057 clearStrBufBeforeUse(); 2058 state = transition(state, Tokenizer.ATTRIBUTE_VALUE_SINGLE_QUOTED, reconsume, pos); 2059 continue stateloop; 2060 case '>': 2061 /* 2062 * U+003E GREATER-THAN SIGN (>) Parse error. 2063 */ 2064 errAttributeValueMissing(); 2065 /* 2066 * Emit the current tag token. 2067 */ 2068 addAttributeWithoutValue(); 2069 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos); 2070 if (shouldSuspend) { 2071 break stateloop; 2072 } 2073 /* 2074 * Switch to the data state. 2075 */ 2076 continue stateloop; 2077 case '\u0000': 2078 c = '\uFFFD'; 2079 // CPPONLY: MOZ_FALLTHROUGH; 2080 case '<': 2081 case '=': 2082 case '`': 2083 /* 2084 * U+003C LESS-THAN SIGN (<) U+003D EQUALS SIGN 2085 * (=) U+0060 GRAVE ACCENT (`) 2086 */ 2087 errLtOrEqualsOrGraveInUnquotedAttributeOrNull(c); 2088 /* 2089 * Treat it as per the "anything else" entry 2090 * below. 2091 */ 2092 // CPPONLY: MOZ_FALLTHROUGH; 2093 default: 2094 /* 2095 * Anything else Append the current input 2096 * character to the current attribute's value. 
2097 */ 2098 // CPPONLY: attributeLine = line; 2099 clearStrBufBeforeUse(); 2100 appendStrBuf(c); 2101 /* 2102 * Switch to the attribute value (unquoted) 2103 * state. 2104 */ 2105 2106 state = transition(state, Tokenizer.ATTRIBUTE_VALUE_UNQUOTED, reconsume, pos); 2107 noteUnquotedAttributeValue(); 2108 continue stateloop; 2109 } 2110 } 2111 // CPPONLY: MOZ_FALLTHROUGH; 2112 case ATTRIBUTE_VALUE_DOUBLE_QUOTED: 2113 attributevaluedoublequotedloop: for (;;) { 2114 if (reconsume) { 2115 reconsume = false; 2116 } else { 2117 if (++pos == endPos) { 2118 break stateloop; 2119 } 2120 c = checkChar(buf, pos); 2121 } 2122 /* 2123 * Consume the next input character: 2124 */ 2125 switch (c) { 2126 case '"': 2127 /* 2128 * U+0022 QUOTATION MARK (") Switch to the after 2129 * attribute value (quoted) state. 2130 */ 2131 addAttributeWithValue(); 2132 2133 state = transition(state, Tokenizer.AFTER_ATTRIBUTE_VALUE_QUOTED, reconsume, pos); 2134 break attributevaluedoublequotedloop; 2135 // continue stateloop; 2136 case '&': 2137 /* 2138 * U+0026 AMPERSAND (&) Switch to the character 2139 * reference in attribute value state, with the 2140 * additional allowed character being U+0022 2141 * QUOTATION MARK ("). 2142 */ 2143 assert charRefBufLen == 0: "charRefBufLen not reset after previous use!"; 2144 appendCharRefBuf(c); 2145 setAdditionalAndRememberAmpersandLocation('\"'); 2146 returnState = state; 2147 state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos); 2148 continue stateloop; 2149 case '\r': 2150 appendStrBufCarriageReturn(); 2151 break stateloop; 2152 case '\n': 2153 appendStrBufLineFeed(); 2154 continue; 2155 case '\u0000': 2156 c = '\uFFFD'; 2157 // CPPONLY: MOZ_FALLTHROUGH; 2158 default: 2159 /* 2160 * Anything else Append the current input 2161 * character to the current attribute's value. 2162 */ 2163 appendStrBuf(c); 2164 /* 2165 * Stay in the attribute value (double-quoted) 2166 * state. 
2167 */ 2168 continue; 2169 } 2170 } 2171 // CPPONLY: MOZ_FALLTHROUGH; 2172 case AFTER_ATTRIBUTE_VALUE_QUOTED: 2173 afterattributevaluequotedloop: for (;;) { 2174 if (++pos == endPos) { 2175 break stateloop; 2176 } 2177 c = checkChar(buf, pos); 2178 /* 2179 * Consume the next input character: 2180 */ 2181 switch (c) { 2182 case '\r': 2183 silentCarriageReturn(); 2184 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); 2185 break stateloop; 2186 case '\n': 2187 silentLineFeed(); 2188 // CPPONLY: MOZ_FALLTHROUGH; 2189 case ' ': 2190 case '\t': 2191 case '\u000C': 2192 /* 2193 * U+0009 CHARACTER TABULATION U+000A LINE FEED 2194 * (LF) U+000C FORM FEED (FF) U+0020 SPACE 2195 * Switch to the before attribute name state. 2196 */ 2197 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); 2198 continue stateloop; 2199 case '/': 2200 /* 2201 * U+002F SOLIDUS (/) Switch to the self-closing 2202 * start tag state. 2203 */ 2204 state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos); 2205 break afterattributevaluequotedloop; 2206 // continue stateloop; 2207 case '>': 2208 /* 2209 * U+003E GREATER-THAN SIGN (>) Emit the current 2210 * tag token. 2211 */ 2212 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos); 2213 if (shouldSuspend) { 2214 break stateloop; 2215 } 2216 /* 2217 * Switch to the data state. 2218 */ 2219 continue stateloop; 2220 default: 2221 /* 2222 * Anything else Parse error. 2223 */ 2224 errNoSpaceBetweenAttributes(); 2225 /* 2226 * Reconsume the character in the before 2227 * attribute name state. 
2228 */ 2229 reconsume = true; 2230 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); 2231 continue stateloop; 2232 } 2233 } 2234 // CPPONLY: MOZ_FALLTHROUGH; 2235 case SELF_CLOSING_START_TAG: 2236 if (++pos == endPos) { 2237 break stateloop; 2238 } 2239 c = checkChar(buf, pos); 2240 /* 2241 * Consume the next input character: 2242 */ 2243 switch (c) { 2244 case '>': 2245 /* 2246 * U+003E GREATER-THAN SIGN (>) Set the self-closing 2247 * flag of the current tag token. Emit the current 2248 * tag token. 2249 */ 2250 state = transition(state, emitCurrentTagToken(true, pos), reconsume, pos); 2251 if (shouldSuspend) { 2252 break stateloop; 2253 } 2254 /* 2255 * Switch to the data state. 2256 */ 2257 continue stateloop; 2258 default: 2259 /* Anything else Parse error. */ 2260 errSlashNotFollowedByGt(); 2261 /* 2262 * Reconsume the character in the before attribute 2263 * name state. 2264 */ 2265 reconsume = true; 2266 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); 2267 continue stateloop; 2268 } 2269 case ATTRIBUTE_VALUE_UNQUOTED: 2270 for (;;) { 2271 if (reconsume) { 2272 reconsume = false; 2273 } else { 2274 if (++pos == endPos) { 2275 break stateloop; 2276 } 2277 c = checkChar(buf, pos); 2278 } 2279 /* 2280 * Consume the next input character: 2281 */ 2282 switch (c) { 2283 case '\r': 2284 silentCarriageReturn(); 2285 addAttributeWithValue(); 2286 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); 2287 break stateloop; 2288 case '\n': 2289 silentLineFeed(); 2290 // CPPONLY: MOZ_FALLTHROUGH; 2291 case ' ': 2292 case '\t': 2293 case '\u000C': 2294 /* 2295 * U+0009 CHARACTER TABULATION U+000A LINE FEED 2296 * (LF) U+000C FORM FEED (FF) U+0020 SPACE 2297 * Switch to the before attribute name state. 
2298 */ 2299 addAttributeWithValue(); 2300 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); 2301 continue stateloop; 2302 case '&': 2303 /* 2304 * U+0026 AMPERSAND (&) Switch to the character 2305 * reference in attribute value state, with the 2306 * additional allowed character being U+003E 2307 * GREATER-THAN SIGN (>) 2308 */ 2309 assert charRefBufLen == 0: "charRefBufLen not reset after previous use!"; 2310 appendCharRefBuf(c); 2311 setAdditionalAndRememberAmpersandLocation('>'); 2312 returnState = state; 2313 state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos); 2314 continue stateloop; 2315 case '>': 2316 /* 2317 * U+003E GREATER-THAN SIGN (>) Emit the current 2318 * tag token. 2319 */ 2320 addAttributeWithValue(); 2321 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos); 2322 if (shouldSuspend) { 2323 break stateloop; 2324 } 2325 /* 2326 * Switch to the data state. 2327 */ 2328 continue stateloop; 2329 case '\u0000': 2330 c = '\uFFFD'; 2331 // CPPONLY: MOZ_FALLTHROUGH; 2332 case '<': 2333 case '\"': 2334 case '\'': 2335 case '=': 2336 case '`': 2337 /* 2338 * U+0022 QUOTATION MARK (") U+0027 APOSTROPHE 2339 * (') U+003C LESS-THAN SIGN (<) U+003D EQUALS 2340 * SIGN (=) U+0060 GRAVE ACCENT (`) Parse error. 2341 */ 2342 errUnquotedAttributeValOrNull(c); 2343 /* 2344 * Treat it as per the "anything else" entry 2345 * below. 2346 */ 2347 // CPPONLY: MOZ_FALLTHROUGH; 2348 default: 2349 /* 2350 * Anything else Append the current input 2351 * character to the current attribute's value. 2352 */ 2353 appendStrBuf(c); 2354 /* 2355 * Stay in the attribute value (unquoted) state. 
2356 */ 2357 continue; 2358 } 2359 } 2360 case AFTER_ATTRIBUTE_NAME: 2361 for (;;) { 2362 if (++pos == endPos) { 2363 break stateloop; 2364 } 2365 c = checkChar(buf, pos); 2366 /* 2367 * Consume the next input character: 2368 */ 2369 switch (c) { 2370 case '\r': 2371 silentCarriageReturn(); 2372 break stateloop; 2373 case '\n': 2374 silentLineFeed(); 2375 // CPPONLY: MOZ_FALLTHROUGH; 2376 case ' ': 2377 case '\t': 2378 case '\u000C': 2379 /* 2380 * U+0009 CHARACTER TABULATION U+000A LINE FEED 2381 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay 2382 * in the after attribute name state. 2383 */ 2384 continue; 2385 case '/': 2386 /* 2387 * U+002F SOLIDUS (/) Switch to the self-closing 2388 * start tag state. 2389 */ 2390 addAttributeWithoutValue(); 2391 state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos); 2392 continue stateloop; 2393 case '=': 2394 /* 2395 * U+003D EQUALS SIGN (=) Switch to the before 2396 * attribute value state. 2397 */ 2398 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_VALUE, reconsume, pos); 2399 continue stateloop; 2400 case '>': 2401 /* 2402 * U+003E GREATER-THAN SIGN (>) Emit the current 2403 * tag token. 2404 */ 2405 addAttributeWithoutValue(); 2406 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos); 2407 if (shouldSuspend) { 2408 break stateloop; 2409 } 2410 /* 2411 * Switch to the data state. 2412 */ 2413 continue stateloop; 2414 case '\u0000': 2415 c = '\uFFFD'; 2416 // CPPONLY: MOZ_FALLTHROUGH; 2417 case '\"': 2418 case '\'': 2419 case '<': 2420 errQuoteOrLtInAttributeNameOrNull(c); 2421 /* 2422 * Treat it as per the "anything else" entry 2423 * below. 2424 */ 2425 // CPPONLY: MOZ_FALLTHROUGH; 2426 default: 2427 addAttributeWithoutValue(); 2428 /* 2429 * Anything else Start a new attribute in the 2430 * current tag token. 
2431 */ 2432 if (c >= 'A' && c <= 'Z') { 2433 /* 2434 * U+0041 LATIN CAPITAL LETTER A through to 2435 * U+005A LATIN CAPITAL LETTER Z Set that 2436 * attribute's name to the lowercase version 2437 * of the current input character (add 2438 * 0x0020 to the character's code point) 2439 */ 2440 c += 0x20; 2441 } 2442 /* 2443 * Set that attribute's name to the current 2444 * input character, 2445 */ 2446 clearStrBufBeforeUse(); 2447 appendStrBuf(c); 2448 /* 2449 * and its value to the empty string. 2450 */ 2451 // Will do later. 2452 /* 2453 * Switch to the attribute name state. 2454 */ 2455 state = transition(state, Tokenizer.ATTRIBUTE_NAME, reconsume, pos); 2456 continue stateloop; 2457 } 2458 } 2459 case MARKUP_DECLARATION_OPEN: 2460 markupdeclarationopenloop: for (;;) { 2461 if (++pos == endPos) { 2462 break stateloop; 2463 } 2464 c = checkChar(buf, pos); 2465 /* 2466 * If the next two characters are both U+002D 2467 * HYPHEN-MINUS characters (-), consume those two 2468 * characters, create a comment token whose data is the 2469 * empty string, and switch to the comment start state. 2470 * 2471 * Otherwise, if the next seven characters are an ASCII 2472 * case-insensitive match for the word "DOCTYPE", then 2473 * consume those characters and switch to the DOCTYPE 2474 * state. 2475 * 2476 * Otherwise, if the insertion mode is 2477 * "in foreign content" and the current node is not an 2478 * element in the HTML namespace and the next seven 2479 * characters are an case-sensitive match for the string 2480 * "[CDATA[" (the five uppercase letters "CDATA" with a 2481 * U+005B LEFT SQUARE BRACKET character before and 2482 * after), then consume those characters and switch to 2483 * the CDATA section state. 2484 * 2485 * Otherwise, is is a parse error. Switch to the bogus 2486 * comment state. The next character that is consumed, 2487 * if any, is the first character that will be in the 2488 * comment. 
2489 */ 2490 switch (c) { 2491 case '-': 2492 clearStrBufBeforeUse(); 2493 appendStrBuf(c); 2494 state = transition(state, Tokenizer.MARKUP_DECLARATION_HYPHEN, reconsume, pos); 2495 break markupdeclarationopenloop; 2496 // continue stateloop; 2497 case 'd': 2498 case 'D': 2499 clearStrBufBeforeUse(); 2500 appendStrBuf(c); 2501 index = 0; 2502 state = transition(state, Tokenizer.MARKUP_DECLARATION_OCTYPE, reconsume, pos); 2503 continue stateloop; 2504 case '[': 2505 if (tokenHandler.cdataSectionAllowed()) { 2506 clearStrBufBeforeUse(); 2507 appendStrBuf(c); 2508 index = 0; 2509 state = transition(state, Tokenizer.CDATA_START, reconsume, pos); 2510 continue stateloop; 2511 } 2512 // CPPONLY: MOZ_FALLTHROUGH; 2513 default: 2514 errBogusComment(); 2515 clearStrBufBeforeUse(); 2516 reconsume = true; 2517 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); 2518 continue stateloop; 2519 } 2520 } 2521 // CPPONLY: MOZ_FALLTHROUGH; 2522 case MARKUP_DECLARATION_HYPHEN: 2523 markupdeclarationhyphenloop: for (;;) { 2524 if (++pos == endPos) { 2525 break stateloop; 2526 } 2527 c = checkChar(buf, pos); 2528 switch (c) { 2529 case '-': 2530 clearStrBufAfterOneHyphen(); 2531 state = transition(state, Tokenizer.COMMENT_START, reconsume, pos); 2532 break markupdeclarationhyphenloop; 2533 // continue stateloop; 2534 default: 2535 errBogusComment(); 2536 reconsume = true; 2537 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); 2538 continue stateloop; 2539 } 2540 } 2541 // CPPONLY: MOZ_FALLTHROUGH; 2542 case COMMENT_START: 2543 commentstartloop: for (;;) { 2544 if (++pos == endPos) { 2545 break stateloop; 2546 } 2547 c = checkChar(buf, pos); 2548 /* 2549 * Comment start state 2550 * 2551 * 2552 * Consume the next input character: 2553 */ 2554 switch (c) { 2555 case '-': 2556 /* 2557 * U+002D HYPHEN-MINUS (-) Switch to the comment 2558 * start dash state. 
2559 */ 2560 appendStrBuf(c); 2561 state = transition(state, Tokenizer.COMMENT_START_DASH, reconsume, pos); 2562 continue stateloop; 2563 case '>': 2564 /* 2565 * U+003E GREATER-THAN SIGN (>) Parse error. 2566 */ 2567 errPrematureEndOfComment(); 2568 /* Emit the comment token. */ 2569 emitComment(0, pos); 2570 /* 2571 * Switch to the data state. 2572 */ 2573 state = transition(state, Tokenizer.DATA, reconsume, pos); 2574 continue stateloop; 2575 case '\r': 2576 appendStrBufCarriageReturn(); 2577 state = transition(state, Tokenizer.COMMENT, reconsume, pos); 2578 break stateloop; 2579 case '\n': 2580 appendStrBufLineFeed(); 2581 state = transition(state, Tokenizer.COMMENT, reconsume, pos); 2582 break commentstartloop; 2583 case '\u0000': 2584 c = '\uFFFD'; 2585 // CPPONLY: MOZ_FALLTHROUGH; 2586 default: 2587 /* 2588 * Anything else Append the input character to 2589 * the comment token's data. 2590 */ 2591 appendStrBuf(c); 2592 /* 2593 * Switch to the comment state. 2594 */ 2595 state = transition(state, Tokenizer.COMMENT, reconsume, pos); 2596 break commentstartloop; 2597 // continue stateloop; 2598 } 2599 } 2600 // CPPONLY: MOZ_FALLTHROUGH; 2601 case COMMENT: 2602 commentloop: for (;;) { 2603 if (++pos == endPos) { 2604 break stateloop; 2605 } 2606 c = checkChar(buf, pos); 2607 /* 2608 * Comment state Consume the next input character: 2609 */ 2610 switch (c) { 2611 case '-': 2612 /* 2613 * U+002D HYPHEN-MINUS (-) Switch to the comment 2614 * end dash state 2615 */ 2616 appendStrBuf(c); 2617 state = transition(state, Tokenizer.COMMENT_END_DASH, reconsume, pos); 2618 break commentloop; 2619 // continue stateloop; 2620 case '\r': 2621 appendStrBufCarriageReturn(); 2622 break stateloop; 2623 case '\n': 2624 appendStrBufLineFeed(); 2625 continue; 2626 case '\u0000': 2627 c = '\uFFFD'; 2628 // CPPONLY: MOZ_FALLTHROUGH; 2629 default: 2630 /* 2631 * Anything else Append the input character to 2632 * the comment token's data. 
2633 */ 2634 appendStrBuf(c); 2635 /* 2636 * Stay in the comment state. 2637 */ 2638 continue; 2639 } 2640 } 2641 // CPPONLY: MOZ_FALLTHROUGH; 2642 case COMMENT_END_DASH: 2643 commentenddashloop: for (;;) { 2644 if (++pos == endPos) { 2645 break stateloop; 2646 } 2647 c = checkChar(buf, pos); 2648 /* 2649 * Comment end dash state Consume the next input 2650 * character: 2651 */ 2652 switch (c) { 2653 case '-': 2654 /* 2655 * U+002D HYPHEN-MINUS (-) Switch to the comment 2656 * end state 2657 */ 2658 appendStrBuf(c); 2659 state = transition(state, Tokenizer.COMMENT_END, reconsume, pos); 2660 break commentenddashloop; 2661 // continue stateloop; 2662 case '\r': 2663 appendStrBufCarriageReturn(); 2664 state = transition(state, Tokenizer.COMMENT, reconsume, pos); 2665 break stateloop; 2666 case '\n': 2667 appendStrBufLineFeed(); 2668 state = transition(state, Tokenizer.COMMENT, reconsume, pos); 2669 continue stateloop; 2670 case '\u0000': 2671 c = '\uFFFD'; 2672 // CPPONLY: MOZ_FALLTHROUGH; 2673 default: 2674 /* 2675 * Anything else Append a U+002D HYPHEN-MINUS 2676 * (-) character and the input character to the 2677 * comment token's data. 2678 */ 2679 appendStrBuf(c); 2680 /* 2681 * Switch to the comment state. 2682 */ 2683 state = transition(state, Tokenizer.COMMENT, reconsume, pos); 2684 continue stateloop; 2685 } 2686 } 2687 // CPPONLY: MOZ_FALLTHROUGH; 2688 case COMMENT_END: 2689 commentendloop: for (;;) { 2690 if (++pos == endPos) { 2691 break stateloop; 2692 } 2693 c = checkChar(buf, pos); 2694 /* 2695 * Comment end dash state Consume the next input 2696 * character: 2697 */ 2698 switch (c) { 2699 case '>': 2700 /* 2701 * U+003E GREATER-THAN SIGN (>) Emit the comment 2702 * token. 2703 */ 2704 emitComment(2, pos); 2705 /* 2706 * Switch to the data state. 2707 */ 2708 state = transition(state, Tokenizer.DATA, reconsume, pos); 2709 continue stateloop; 2710 case '-': 2711 /* U+002D HYPHEN-MINUS (-) Parse error. 
*/ 2712 /* 2713 * Append a U+002D HYPHEN-MINUS (-) character to 2714 * the comment token's data. 2715 */ 2716 adjustDoubleHyphenAndAppendToStrBufAndErr(c); 2717 /* 2718 * Stay in the comment end state. 2719 */ 2720 continue; 2721 case '\r': 2722 adjustDoubleHyphenAndAppendToStrBufCarriageReturn(); 2723 state = transition(state, Tokenizer.COMMENT, reconsume, pos); 2724 break stateloop; 2725 case '\n': 2726 adjustDoubleHyphenAndAppendToStrBufLineFeed(); 2727 state = transition(state, Tokenizer.COMMENT, reconsume, pos); 2728 continue stateloop; 2729 case '!': 2730 errHyphenHyphenBang(); 2731 appendStrBuf(c); 2732 state = transition(state, Tokenizer.COMMENT_END_BANG, reconsume, pos); 2733 continue stateloop; 2734 case '\u0000': 2735 c = '\uFFFD'; 2736 // CPPONLY: MOZ_FALLTHROUGH; 2737 default: 2738 /* 2739 * Append two U+002D HYPHEN-MINUS (-) characters 2740 * and the input character to the comment 2741 * token's data. 2742 */ 2743 adjustDoubleHyphenAndAppendToStrBufAndErr(c); 2744 /* 2745 * Switch to the comment state. 2746 */ 2747 state = transition(state, Tokenizer.COMMENT, reconsume, pos); 2748 continue stateloop; 2749 } 2750 } 2751 case COMMENT_END_BANG: 2752 for (;;) { 2753 if (++pos == endPos) { 2754 break stateloop; 2755 } 2756 c = checkChar(buf, pos); 2757 /* 2758 * Comment end bang state 2759 * 2760 * Consume the next input character: 2761 */ 2762 switch (c) { 2763 case '>': 2764 /* 2765 * U+003E GREATER-THAN SIGN (>) Emit the comment 2766 * token. 2767 */ 2768 emitComment(3, pos); 2769 /* 2770 * Switch to the data state. 2771 */ 2772 state = transition(state, Tokenizer.DATA, reconsume, pos); 2773 continue stateloop; 2774 case '-': 2775 /* 2776 * Append two U+002D HYPHEN-MINUS (-) characters 2777 * and a U+0021 EXCLAMATION MARK (!) character 2778 * to the comment token's data. 2779 */ 2780 appendStrBuf(c); 2781 /* 2782 * Switch to the comment end dash state. 
2783 */ 2784 state = transition(state, Tokenizer.COMMENT_END_DASH, reconsume, pos); 2785 continue stateloop; 2786 case '\r': 2787 appendStrBufCarriageReturn(); 2788 state = transition(state, Tokenizer.COMMENT, reconsume, pos); 2789 break stateloop; 2790 case '\n': 2791 appendStrBufLineFeed(); 2792 state = transition(state, Tokenizer.COMMENT, reconsume, pos); 2793 continue stateloop; 2794 case '\u0000': 2795 c = '\uFFFD'; 2796 // CPPONLY: MOZ_FALLTHROUGH; 2797 default: 2798 /* 2799 * Anything else Append two U+002D HYPHEN-MINUS 2800 * (-) characters, a U+0021 EXCLAMATION MARK (!) 2801 * character, and the input character to the 2802 * comment token's data. Switch to the comment 2803 * state. 2804 */ 2805 appendStrBuf(c); 2806 /* 2807 * Switch to the comment state. 2808 */ 2809 state = transition(state, Tokenizer.COMMENT, reconsume, pos); 2810 continue stateloop; 2811 } 2812 } 2813 case COMMENT_START_DASH: 2814 if (++pos == endPos) { 2815 break stateloop; 2816 } 2817 c = checkChar(buf, pos); 2818 /* 2819 * Comment start dash state 2820 * 2821 * Consume the next input character: 2822 */ 2823 switch (c) { 2824 case '-': 2825 /* 2826 * U+002D HYPHEN-MINUS (-) Switch to the comment end 2827 * state 2828 */ 2829 appendStrBuf(c); 2830 state = transition(state, Tokenizer.COMMENT_END, reconsume, pos); 2831 continue stateloop; 2832 case '>': 2833 errPrematureEndOfComment(); 2834 /* Emit the comment token. */ 2835 emitComment(1, pos); 2836 /* 2837 * Switch to the data state. 
2838 */ 2839 state = transition(state, Tokenizer.DATA, reconsume, pos); 2840 continue stateloop; 2841 case '\r': 2842 appendStrBufCarriageReturn(); 2843 state = transition(state, Tokenizer.COMMENT, reconsume, pos); 2844 break stateloop; 2845 case '\n': 2846 appendStrBufLineFeed(); 2847 state = transition(state, Tokenizer.COMMENT, reconsume, pos); 2848 continue stateloop; 2849 case '\u0000': 2850 c = '\uFFFD'; 2851 // CPPONLY: MOZ_FALLTHROUGH; 2852 default: 2853 /* 2854 * Append a U+002D HYPHEN-MINUS character (-) and 2855 * the current input character to the comment 2856 * token's data. 2857 */ 2858 appendStrBuf(c); 2859 /* 2860 * Switch to the comment state. 2861 */ 2862 state = transition(state, Tokenizer.COMMENT, reconsume, pos); 2863 continue stateloop; 2864 } 2865 case CDATA_START: 2866 for (;;) { 2867 if (++pos == endPos) { 2868 break stateloop; 2869 } 2870 c = checkChar(buf, pos); 2871 if (index < 6) { // CDATA_LSQB.length 2872 if (c == Tokenizer.CDATA_LSQB[index]) { 2873 appendStrBuf(c); 2874 } else { 2875 errBogusComment(); 2876 reconsume = true; 2877 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); 2878 continue stateloop; 2879 } 2880 index++; 2881 continue; 2882 } else { 2883 clearStrBufAfterUse(); 2884 cstart = pos; // start coalescing 2885 reconsume = true; 2886 state = transition(state, Tokenizer.CDATA_SECTION, reconsume, pos); 2887 break; // FALL THROUGH continue stateloop; 2888 } 2889 } 2890 // CPPONLY: MOZ_FALLTHROUGH; 2891 case CDATA_SECTION: 2892 cdatasectionloop: for (;;) { 2893 if (reconsume) { 2894 reconsume = false; 2895 } else { 2896 if (++pos == endPos) { 2897 break stateloop; 2898 } 2899 c = checkChar(buf, pos); 2900 } 2901 switch (c) { 2902 case ']': 2903 flushChars(buf, pos); 2904 state = transition(state, Tokenizer.CDATA_RSQB, reconsume, pos); 2905 break cdatasectionloop; // FALL THROUGH 2906 case '\u0000': 2907 emitReplacementCharacter(buf, pos); 2908 continue; 2909 case '\r': 2910 emitCarriageReturn(buf, pos); 2911 
break stateloop; 2912 case '\n': 2913 silentLineFeed(); 2914 // CPPONLY: MOZ_FALLTHROUGH; 2915 default: 2916 continue; 2917 } 2918 } 2919 // CPPONLY: MOZ_FALLTHROUGH; 2920 case CDATA_RSQB: 2921 cdatarsqb: for (;;) { 2922 if (++pos == endPos) { 2923 break stateloop; 2924 } 2925 c = checkChar(buf, pos); 2926 switch (c) { 2927 case ']': 2928 state = transition(state, Tokenizer.CDATA_RSQB_RSQB, reconsume, pos); 2929 break cdatarsqb; 2930 default: 2931 tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 2932 1); 2933 cstart = pos; 2934 reconsume = true; 2935 state = transition(state, Tokenizer.CDATA_SECTION, reconsume, pos); 2936 continue stateloop; 2937 } 2938 } 2939 // CPPONLY: MOZ_FALLTHROUGH; 2940 case CDATA_RSQB_RSQB: 2941 cdatarsqbrsqb: for (;;) { 2942 if (++pos == endPos) { 2943 break stateloop; 2944 } 2945 c = checkChar(buf, pos); 2946 switch (c) { 2947 case ']': 2948 // Saw a third ]. Emit one ] (logically the 2949 // first one) and stay in this state to 2950 // remember that the last two characters seen 2951 // have been ]]. 2952 tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 1); 2953 continue; 2954 case '>': 2955 cstart = pos + 1; 2956 state = transition(state, Tokenizer.DATA, reconsume, pos); 2957 continue stateloop; 2958 default: 2959 tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 2); 2960 cstart = pos; 2961 reconsume = true; 2962 state = transition(state, Tokenizer.CDATA_SECTION, reconsume, pos); 2963 continue stateloop; 2964 } 2965 } 2966 case ATTRIBUTE_VALUE_SINGLE_QUOTED: 2967 attributevaluesinglequotedloop: for (;;) { 2968 if (reconsume) { 2969 reconsume = false; 2970 } else { 2971 if (++pos == endPos) { 2972 break stateloop; 2973 } 2974 c = checkChar(buf, pos); 2975 } 2976 /* 2977 * Consume the next input character: 2978 */ 2979 switch (c) { 2980 case '\'': 2981 /* 2982 * U+0027 APOSTROPHE (') Switch to the after 2983 * attribute value (quoted) state. 
2984 */ 2985 addAttributeWithValue(); 2986 2987 state = transition(state, Tokenizer.AFTER_ATTRIBUTE_VALUE_QUOTED, reconsume, pos); 2988 continue stateloop; 2989 case '&': 2990 /* 2991 * U+0026 AMPERSAND (&) Switch to the character 2992 * reference in attribute value state, with the 2993 * + additional allowed character being U+0027 2994 * APOSTROPHE ('). 2995 */ 2996 assert charRefBufLen == 0: "charRefBufLen not reset after previous use!"; 2997 appendCharRefBuf(c); 2998 setAdditionalAndRememberAmpersandLocation('\''); 2999 returnState = state; 3000 state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos); 3001 break attributevaluesinglequotedloop; 3002 // continue stateloop; 3003 case '\r': 3004 appendStrBufCarriageReturn(); 3005 break stateloop; 3006 case '\n': 3007 appendStrBufLineFeed(); 3008 continue; 3009 case '\u0000': 3010 c = '\uFFFD'; 3011 // CPPONLY: MOZ_FALLTHROUGH; 3012 default: 3013 /* 3014 * Anything else Append the current input 3015 * character to the current attribute's value. 3016 */ 3017 appendStrBuf(c); 3018 /* 3019 * Stay in the attribute value (double-quoted) 3020 * state. 3021 */ 3022 continue; 3023 } 3024 } 3025 // CPPONLY: MOZ_FALLTHROUGH; 3026 case CONSUME_CHARACTER_REFERENCE: 3027 if (++pos == endPos) { 3028 break stateloop; 3029 } 3030 c = checkChar(buf, pos); 3031 /* 3032 * Unlike the definition is the spec, this state does not 3033 * return a value and never requires the caller to 3034 * backtrack. This state takes care of emitting characters 3035 * or appending to the current attribute value. It also 3036 * takes care of that in the case when consuming the 3037 * character reference fails. 3038 */ 3039 /* 3040 * This section defines how to consume a character 3041 * reference. This definition is used when parsing character 3042 * references in text and in attributes. 
3043 * 3044 * The behavior depends on the identity of the next 3045 * character (the one immediately after the U+0026 AMPERSAND 3046 * character): 3047 */ 3048 switch (c) { 3049 case ' ': 3050 case '\t': 3051 case '\n': 3052 case '\r': // we'll reconsume! 3053 case '\u000C': 3054 case '<': 3055 case '&': 3056 case '\u0000': 3057 emitOrAppendCharRefBuf(returnState); 3058 if ((returnState & DATA_AND_RCDATA_MASK) == 0) { 3059 cstart = pos; 3060 } 3061 reconsume = true; 3062 state = transition(state, returnState, reconsume, pos); 3063 continue stateloop; 3064 case '#': 3065 /* 3066 * U+0023 NUMBER SIGN (#) Consume the U+0023 NUMBER 3067 * SIGN. 3068 */ 3069 appendCharRefBuf('#'); 3070 state = transition(state, Tokenizer.CONSUME_NCR, reconsume, pos); 3071 continue stateloop; 3072 default: 3073 if (c == additional) { 3074 emitOrAppendCharRefBuf(returnState); 3075 reconsume = true; 3076 state = transition(state, returnState, reconsume, pos); 3077 continue stateloop; 3078 } 3079 if (c >= 'a' && c <= 'z') { 3080 firstCharKey = c - 'a' + 26; 3081 } else if (c >= 'A' && c <= 'Z') { 3082 firstCharKey = c - 'A'; 3083 } else { 3084 // No match 3085 /* 3086 * If no match can be made, then this is a parse 3087 * error. 
3088 */ 3089 errNoNamedCharacterMatch(); 3090 emitOrAppendCharRefBuf(returnState); 3091 if ((returnState & DATA_AND_RCDATA_MASK) == 0) { 3092 cstart = pos; 3093 } 3094 reconsume = true; 3095 state = transition(state, returnState, reconsume, pos); 3096 continue stateloop; 3097 } 3098 // Didn't fail yet 3099 appendCharRefBuf(c); 3100 state = transition(state, Tokenizer.CHARACTER_REFERENCE_HILO_LOOKUP, reconsume, pos); 3101 // FALL THROUGH continue stateloop; 3102 } 3103 // CPPONLY: MOZ_FALLTHROUGH; 3104 case CHARACTER_REFERENCE_HILO_LOOKUP: 3105 { 3106 if (++pos == endPos) { 3107 break stateloop; 3108 } 3109 c = checkChar(buf, pos); 3110 /* 3111 * The data structure is as follows: 3112 * 3113 * HILO_ACCEL is a two-dimensional int array whose major 3114 * index corresponds to the second character of the 3115 * character reference (code point as index) and the 3116 * minor index corresponds to the first character of the 3117 * character reference (packed so that A-Z runs from 0 3118 * to 25 and a-z runs from 26 to 51). This layout makes 3119 * it easier to use the sparseness of the data structure 3120 * to omit parts of it: The second dimension of the 3121 * table is null when no character reference starts with 3122 * the character corresponding to that row. 3123 * 3124 * The int value HILO_ACCEL (by these indeces) is zero 3125 * if there exists no character reference starting with 3126 * that two-letter prefix. Otherwise, the value is an 3127 * int that packs two shorts so that the higher short is 3128 * the index of the highest character reference name 3129 * with that prefix in NAMES and the lower short 3130 * corresponds to the index of the lowest character 3131 * reference name with that prefix. (It happens that the 3132 * first two character reference names share their 3133 * prefix so the packed int cannot be 0 by packing the 3134 * two shorts.) 
3135 * 3136 * NAMES is an array of byte arrays where each byte 3137 * array encodes the name of a character references as 3138 * ASCII. The names omit the first two letters of the 3139 * name. (Since storing the first two letters would be 3140 * redundant with the data contained in HILO_ACCEL.) The 3141 * entries are lexically sorted. 3142 * 3143 * For a given index in NAMES, the same index in VALUES 3144 * contains the corresponding expansion as an array of 3145 * two UTF-16 code units (either the character and 3146 * U+0000 or a suggogate pair). 3147 */ 3148 int hilo = 0; 3149 if (c <= 'z') { 3150 @Const @NoLength int[] row = NamedCharactersAccel.HILO_ACCEL[c]; 3151 if (row != null) { 3152 hilo = row[firstCharKey]; 3153 } 3154 } 3155 if (hilo == 0) { 3156 /* 3157 * If no match can be made, then this is a parse 3158 * error. 3159 */ 3160 errNoNamedCharacterMatch(); 3161 emitOrAppendCharRefBuf(returnState); 3162 if ((returnState & DATA_AND_RCDATA_MASK) == 0) { 3163 cstart = pos; 3164 } 3165 reconsume = true; 3166 state = transition(state, returnState, reconsume, pos); 3167 continue stateloop; 3168 } 3169 // Didn't fail yet 3170 appendCharRefBuf(c); 3171 lo = hilo & 0xFFFF; 3172 hi = hilo >> 16; 3173 entCol = -1; 3174 candidate = -1; 3175 charRefBufMark = 0; 3176 state = transition(state, Tokenizer.CHARACTER_REFERENCE_TAIL, reconsume, pos); 3177 // FALL THROUGH continue stateloop; 3178 } 3179 // CPPONLY: MOZ_FALLTHROUGH; 3180 case CHARACTER_REFERENCE_TAIL: 3181 outer: for (;;) { 3182 if (++pos == endPos) { 3183 break stateloop; 3184 } 3185 c = checkChar(buf, pos); 3186 entCol++; 3187 /* 3188 * Consume the maximum number of characters possible, 3189 * with the consumed characters matching one of the 3190 * identifiers in the first column of the named 3191 * character references table (in a case-sensitive 3192 * manner). 
3193 */ 3194 loloop: for (;;) { 3195 if (hi < lo) { 3196 break outer; 3197 } 3198 if (entCol == NamedCharacters.NAMES[lo].length()) { 3199 candidate = lo; 3200 charRefBufMark = charRefBufLen; 3201 lo++; 3202 } else if (entCol > NamedCharacters.NAMES[lo].length()) { 3203 break outer; 3204 } else if (c > NamedCharacters.NAMES[lo].charAt(entCol)) { 3205 lo++; 3206 } else { 3207 break loloop; 3208 } 3209 } 3210 3211 hiloop: for (;;) { 3212 if (hi < lo) { 3213 break outer; 3214 } 3215 if (entCol == NamedCharacters.NAMES[hi].length()) { 3216 break hiloop; 3217 } 3218 if (entCol > NamedCharacters.NAMES[hi].length()) { 3219 break outer; 3220 } else if (c < NamedCharacters.NAMES[hi].charAt(entCol)) { 3221 hi--; 3222 } else { 3223 break hiloop; 3224 } 3225 } 3226 3227 if (c == ';') { 3228 // If we see a semicolon, there cannot be a 3229 // longer match. Break the loop. However, before 3230 // breaking, take the longest match so far as the 3231 // candidate, if we are just about to complete a 3232 // match. 3233 if (entCol + 1 == NamedCharacters.NAMES[lo].length()) { 3234 candidate = lo; 3235 charRefBufMark = charRefBufLen; 3236 } 3237 break outer; 3238 } 3239 3240 if (hi < lo) { 3241 break outer; 3242 } 3243 appendCharRefBuf(c); 3244 continue; 3245 } 3246 3247 if (candidate == -1) { 3248 // reconsume deals with CR, LF or nul 3249 /* 3250 * If no match can be made, then this is a parse error. 
3251 */ 3252 errNoNamedCharacterMatch(); 3253 emitOrAppendCharRefBuf(returnState); 3254 if ((returnState & DATA_AND_RCDATA_MASK) == 0) { 3255 cstart = pos; 3256 } 3257 reconsume = true; 3258 state = transition(state, returnState, reconsume, pos); 3259 continue stateloop; 3260 } else { 3261 // c can't be CR, LF or nul if we got here 3262 @Const @CharacterName String candidateName = NamedCharacters.NAMES[candidate]; 3263 if (candidateName.length() == 0 3264 || candidateName.charAt(candidateName.length() - 1) != ';') { 3265 /* 3266 * If the last character matched is not a U+003B 3267 * SEMICOLON (;), there is a parse error. 3268 */ 3269 if ((returnState & DATA_AND_RCDATA_MASK) != 0) { 3270 /* 3271 * If the entity is being consumed as part of an 3272 * attribute, and the last character matched is 3273 * not a U+003B SEMICOLON (;), 3274 */ 3275 char ch; 3276 if (charRefBufMark == charRefBufLen) { 3277 ch = c; 3278 } else { 3279 ch = charRefBuf[charRefBufMark]; 3280 } 3281 if (ch == '=' || (ch >= '0' && ch <= '9') 3282 || (ch >= 'A' && ch <= 'Z') 3283 || (ch >= 'a' && ch <= 'z')) { 3284 /* 3285 * and the next character is either a U+003D 3286 * EQUALS SIGN character (=) or in the range 3287 * U+0030 DIGIT ZERO to U+0039 DIGIT NINE, 3288 * U+0041 LATIN CAPITAL LETTER A to U+005A 3289 * LATIN CAPITAL LETTER Z, or U+0061 LATIN 3290 * SMALL LETTER A to U+007A LATIN SMALL 3291 * LETTER Z, then, for historical reasons, 3292 * all the characters that were matched 3293 * after the U+0026 AMPERSAND (&) must be 3294 * unconsumed, and nothing is returned. 
3295 */ 3296 errNoNamedCharacterMatch(); 3297 appendCharRefBufToStrBuf(); 3298 reconsume = true; 3299 state = transition(state, returnState, reconsume, pos); 3300 continue stateloop; 3301 } 3302 } 3303 if ((returnState & DATA_AND_RCDATA_MASK) != 0) { 3304 errUnescapedAmpersandInterpretedAsCharacterReference(); 3305 } else { 3306 errNotSemicolonTerminated(); 3307 } 3308 } 3309 3310 /* 3311 * Otherwise, return a character token for the character 3312 * corresponding to the entity name (as given by the 3313 * second column of the named character references 3314 * table). 3315 */ 3316 // CPPONLY: completedNamedCharacterReference(); 3317 @Const @NoLength char[] val = NamedCharacters.VALUES[candidate]; 3318 if ( 3319 // [NOCPP[ 3320 val.length == 1 3321 // ]NOCPP] 3322 // CPPONLY: val[1] == 0 3323 ) { 3324 emitOrAppendOne(val, returnState); 3325 } else { 3326 emitOrAppendTwo(val, returnState); 3327 } 3328 // this is so complicated! 3329 if (charRefBufMark < charRefBufLen) { 3330 if ((returnState & DATA_AND_RCDATA_MASK) != 0) { 3331 appendStrBuf(charRefBuf, charRefBufMark, 3332 charRefBufLen - charRefBufMark); 3333 } else { 3334 tokenHandler.characters(charRefBuf, charRefBufMark, 3335 charRefBufLen - charRefBufMark); 3336 } 3337 } 3338 // charRefBufLen will be zeroed below! 3339 3340 // Check if we broke out early with c being the last 3341 // character that matched as opposed to being the 3342 // first one that didn't match. In the case of an 3343 // early break, the next run on text should start 3344 // *after* the current character and the current 3345 // character shouldn't be reconsumed. 3346 boolean earlyBreak = (c == ';' && charRefBufMark == charRefBufLen); 3347 charRefBufLen = 0; 3348 if ((returnState & DATA_AND_RCDATA_MASK) == 0) { 3349 cstart = earlyBreak ? 
pos + 1 : pos; 3350 } 3351 reconsume = !earlyBreak; 3352 state = transition(state, returnState, reconsume, pos); 3353 continue stateloop; 3354 /* 3355 * If the markup contains I'm ¬it; I tell you, the 3356 * entity is parsed as "not", as in, I'm ¬it; I tell 3357 * you. But if the markup was I'm ∉ I tell you, 3358 * the entity would be parsed as "notin;", resulting in 3359 * I'm ∉ I tell you. 3360 */ 3361 } 3362 case CONSUME_NCR: 3363 if (++pos == endPos) { 3364 break stateloop; 3365 } 3366 c = checkChar(buf, pos); 3367 value = 0; 3368 seenDigits = false; 3369 /* 3370 * The behavior further depends on the character after the 3371 * U+0023 NUMBER SIGN: 3372 */ 3373 switch (c) { 3374 case 'x': 3375 case 'X': 3376 3377 /* 3378 * U+0078 LATIN SMALL LETTER X U+0058 LATIN CAPITAL 3379 * LETTER X Consume the X. 3380 * 3381 * Follow the steps below, but using the range of 3382 * characters U+0030 DIGIT ZERO through to U+0039 3383 * DIGIT NINE, U+0061 LATIN SMALL LETTER A through 3384 * to U+0066 LATIN SMALL LETTER F, and U+0041 LATIN 3385 * CAPITAL LETTER A, through to U+0046 LATIN CAPITAL 3386 * LETTER F (in other words, 0-9, A-F, a-f). 3387 * 3388 * When it comes to interpreting the number, 3389 * interpret it as a hexadecimal number. 3390 */ 3391 appendCharRefBuf(c); 3392 state = transition(state, Tokenizer.HEX_NCR_LOOP, reconsume, pos); 3393 continue stateloop; 3394 default: 3395 /* 3396 * Anything else Follow the steps below, but using 3397 * the range of characters U+0030 DIGIT ZERO through 3398 * to U+0039 DIGIT NINE (i.e. just 0-9). 3399 * 3400 * When it comes to interpreting the number, 3401 * interpret it as a decimal number. 
3402 */ 3403 reconsume = true; 3404 state = transition(state, Tokenizer.DECIMAL_NRC_LOOP, reconsume, pos); 3405 // FALL THROUGH continue stateloop; 3406 } 3407 // CPPONLY: MOZ_FALLTHROUGH; 3408 case DECIMAL_NRC_LOOP: 3409 decimalloop: for (;;) { 3410 if (reconsume) { 3411 reconsume = false; 3412 } else { 3413 if (++pos == endPos) { 3414 break stateloop; 3415 } 3416 c = checkChar(buf, pos); 3417 } 3418 /* 3419 * Consume as many characters as match the range of 3420 * characters given above. 3421 */ 3422 assert value >= 0: "value must not become negative."; 3423 if (c >= '0' && c <= '9') { 3424 seenDigits = true; 3425 // Avoid overflow 3426 if (value <= 0x10FFFF) { 3427 value *= 10; 3428 value += c - '0'; 3429 } 3430 continue; 3431 } else if (c == ';') { 3432 if (seenDigits) { 3433 if ((returnState & DATA_AND_RCDATA_MASK) == 0) { 3434 cstart = pos + 1; 3435 } 3436 state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos); 3437 // FALL THROUGH continue stateloop; 3438 break decimalloop; 3439 } else { 3440 errNoDigitsInNCR(); 3441 appendCharRefBuf(';'); 3442 emitOrAppendCharRefBuf(returnState); 3443 if ((returnState & DATA_AND_RCDATA_MASK) == 0) { 3444 cstart = pos + 1; 3445 } 3446 state = transition(state, returnState, reconsume, pos); 3447 continue stateloop; 3448 } 3449 } else { 3450 /* 3451 * If no characters match the range, then don't 3452 * consume any characters (and unconsume the U+0023 3453 * NUMBER SIGN character and, if appropriate, the X 3454 * character). This is a parse error; nothing is 3455 * returned. 3456 * 3457 * Otherwise, if the next character is a U+003B 3458 * SEMICOLON, consume that too. If it isn't, there 3459 * is a parse error. 
3460 */ 3461 if (!seenDigits) { 3462 errNoDigitsInNCR(); 3463 emitOrAppendCharRefBuf(returnState); 3464 if ((returnState & DATA_AND_RCDATA_MASK) == 0) { 3465 cstart = pos; 3466 } 3467 reconsume = true; 3468 state = transition(state, returnState, reconsume, pos); 3469 continue stateloop; 3470 } else { 3471 errCharRefLacksSemicolon(); 3472 if ((returnState & DATA_AND_RCDATA_MASK) == 0) { 3473 cstart = pos; 3474 } 3475 reconsume = true; 3476 state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos); 3477 // FALL THROUGH continue stateloop; 3478 break decimalloop; 3479 } 3480 } 3481 } 3482 // CPPONLY: MOZ_FALLTHROUGH; 3483 case HANDLE_NCR_VALUE: 3484 // WARNING previous state sets reconsume 3485 // We are not going to emit the contents of charRefBuf. 3486 charRefBufLen = 0; 3487 // XXX inline this case if the method size can take it 3488 handleNcrValue(returnState); 3489 state = transition(state, returnState, reconsume, pos); 3490 continue stateloop; 3491 case HEX_NCR_LOOP: 3492 for (;;) { 3493 if (++pos == endPos) { 3494 break stateloop; 3495 } 3496 c = checkChar(buf, pos); 3497 /* 3498 * Consume as many characters as match the range of 3499 * characters given above. 
3500 */ 3501 assert value >= 0: "value must not become negative."; 3502 if (c >= '0' && c <= '9') { 3503 seenDigits = true; 3504 // Avoid overflow 3505 if (value <= 0x10FFFF) { 3506 value *= 16; 3507 value += c - '0'; 3508 } 3509 continue; 3510 } else if (c >= 'A' && c <= 'F') { 3511 seenDigits = true; 3512 // Avoid overflow 3513 if (value <= 0x10FFFF) { 3514 value *= 16; 3515 value += c - 'A' + 10; 3516 } 3517 continue; 3518 } else if (c >= 'a' && c <= 'f') { 3519 seenDigits = true; 3520 // Avoid overflow 3521 if (value <= 0x10FFFF) { 3522 value *= 16; 3523 value += c - 'a' + 10; 3524 } 3525 continue; 3526 } else if (c == ';') { 3527 if (seenDigits) { 3528 if ((returnState & DATA_AND_RCDATA_MASK) == 0) { 3529 cstart = pos + 1; 3530 } 3531 state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos); 3532 continue stateloop; 3533 } else { 3534 errNoDigitsInNCR(); 3535 appendCharRefBuf(';'); 3536 emitOrAppendCharRefBuf(returnState); 3537 if ((returnState & DATA_AND_RCDATA_MASK) == 0) { 3538 cstart = pos + 1; 3539 } 3540 state = transition(state, returnState, reconsume, pos); 3541 continue stateloop; 3542 } 3543 } else { 3544 /* 3545 * If no characters match the range, then don't 3546 * consume any characters (and unconsume the U+0023 3547 * NUMBER SIGN character and, if appropriate, the X 3548 * character). This is a parse error; nothing is 3549 * returned. 3550 * 3551 * Otherwise, if the next character is a U+003B 3552 * SEMICOLON, consume that too. If it isn't, there 3553 * is a parse error. 
3554 */ 3555 if (!seenDigits) { 3556 errNoDigitsInNCR(); 3557 emitOrAppendCharRefBuf(returnState); 3558 if ((returnState & DATA_AND_RCDATA_MASK) == 0) { 3559 cstart = pos; 3560 } 3561 reconsume = true; 3562 state = transition(state, returnState, reconsume, pos); 3563 continue stateloop; 3564 } else { 3565 errCharRefLacksSemicolon(); 3566 if ((returnState & DATA_AND_RCDATA_MASK) == 0) { 3567 cstart = pos; 3568 } 3569 reconsume = true; 3570 state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos); 3571 continue stateloop; 3572 } 3573 } 3574 } 3575 case PLAINTEXT: 3576 plaintextloop: for (;;) { 3577 if (reconsume) { 3578 reconsume = false; 3579 } else { 3580 if (++pos == endPos) { 3581 break stateloop; 3582 } 3583 c = checkChar(buf, pos); 3584 } 3585 switch (c) { 3586 case '\u0000': 3587 emitPlaintextReplacementCharacter(buf, pos); 3588 continue; 3589 case '\r': 3590 emitCarriageReturn(buf, pos); 3591 break stateloop; 3592 case '\n': 3593 silentLineFeed(); 3594 // CPPONLY: MOZ_FALLTHROUGH; 3595 default: 3596 /* 3597 * Anything else Emit the current input 3598 * character as a character token. Stay in the 3599 * RAWTEXT state. 3600 */ 3601 continue; 3602 } 3603 } 3604 case CLOSE_TAG_OPEN: 3605 if (++pos == endPos) { 3606 break stateloop; 3607 } 3608 c = checkChar(buf, pos); 3609 /* 3610 * Otherwise, if the content model flag is set to the PCDATA 3611 * state, or if the next few characters do match that tag 3612 * name, consume the next input character: 3613 */ 3614 switch (c) { 3615 case '>': 3616 /* U+003E GREATER-THAN SIGN (>) Parse error. */ 3617 errLtSlashGt(); 3618 /* 3619 * Switch to the data state. 3620 */ 3621 cstart = pos + 1; 3622 state = transition(state, Tokenizer.DATA, reconsume, pos); 3623 continue stateloop; 3624 case '\r': 3625 silentCarriageReturn(); 3626 /* Anything else Parse error. */ 3627 errGarbageAfterLtSlash(); 3628 /* 3629 * Switch to the bogus comment state. 
3630 */ 3631 clearStrBufBeforeUse(); 3632 appendStrBuf('\n'); 3633 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); 3634 break stateloop; 3635 case '\n': 3636 silentLineFeed(); 3637 /* Anything else Parse error. */ 3638 errGarbageAfterLtSlash(); 3639 /* 3640 * Switch to the bogus comment state. 3641 */ 3642 clearStrBufBeforeUse(); 3643 appendStrBuf(c); 3644 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); 3645 continue stateloop; 3646 case '\u0000': 3647 c = '\uFFFD'; 3648 // CPPONLY: MOZ_FALLTHROUGH; 3649 default: 3650 if (c >= 'A' && c <= 'Z') { 3651 c += 0x20; 3652 } 3653 if (c >= 'a' && c <= 'z') { 3654 /* 3655 * U+0061 LATIN SMALL LETTER A through to U+007A 3656 * LATIN SMALL LETTER Z Create a new end tag 3657 * token, 3658 */ 3659 endTag = true; 3660 /* 3661 * set its tag name to the input character, 3662 */ 3663 clearStrBufBeforeUse(); 3664 appendStrBuf(c); 3665 containsHyphen = false; 3666 /* 3667 * then switch to the tag name state. (Don't 3668 * emit the token yet; further details will be 3669 * filled in before it is emitted.) 3670 */ 3671 state = transition(state, Tokenizer.TAG_NAME, reconsume, pos); 3672 continue stateloop; 3673 } else { 3674 /* Anything else Parse error. */ 3675 errGarbageAfterLtSlash(); 3676 /* 3677 * Switch to the bogus comment state. 3678 */ 3679 clearStrBufBeforeUse(); 3680 appendStrBuf(c); 3681 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); 3682 continue stateloop; 3683 } 3684 } 3685 case RCDATA: 3686 rcdataloop: for (;;) { 3687 if (reconsume) { 3688 reconsume = false; 3689 } else { 3690 if (++pos == endPos) { 3691 break stateloop; 3692 } 3693 c = checkChar(buf, pos); 3694 } 3695 switch (c) { 3696 case '&': 3697 /* 3698 * U+0026 AMPERSAND (&) Switch to the character 3699 * reference in RCDATA state. 
3700 */ 3701 flushChars(buf, pos); 3702 assert charRefBufLen == 0: "charRefBufLen not reset after previous use!"; 3703 appendCharRefBuf(c); 3704 setAdditionalAndRememberAmpersandLocation('\u0000'); 3705 returnState = state; 3706 state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos); 3707 continue stateloop; 3708 case '<': 3709 /* 3710 * U+003C LESS-THAN SIGN (<) Switch to the 3711 * RCDATA less-than sign state. 3712 */ 3713 flushChars(buf, pos); 3714 3715 returnState = state; 3716 state = transition(state, Tokenizer.RAWTEXT_RCDATA_LESS_THAN_SIGN, reconsume, pos); 3717 continue stateloop; 3718 case '\u0000': 3719 emitReplacementCharacter(buf, pos); 3720 continue; 3721 case '\r': 3722 emitCarriageReturn(buf, pos); 3723 break stateloop; 3724 case '\n': 3725 silentLineFeed(); 3726 // CPPONLY: MOZ_FALLTHROUGH; 3727 default: 3728 /* 3729 * Emit the current input character as a 3730 * character token. Stay in the RCDATA state. 3731 */ 3732 continue; 3733 } 3734 } 3735 case RAWTEXT: 3736 rawtextloop: for (;;) { 3737 if (reconsume) { 3738 reconsume = false; 3739 } else { 3740 if (++pos == endPos) { 3741 break stateloop; 3742 } 3743 c = checkChar(buf, pos); 3744 } 3745 switch (c) { 3746 case '<': 3747 /* 3748 * U+003C LESS-THAN SIGN (<) Switch to the 3749 * RAWTEXT less-than sign state. 3750 */ 3751 flushChars(buf, pos); 3752 3753 returnState = state; 3754 state = transition(state, Tokenizer.RAWTEXT_RCDATA_LESS_THAN_SIGN, reconsume, pos); 3755 break rawtextloop; 3756 // FALL THRU continue stateloop; 3757 case '\u0000': 3758 emitReplacementCharacter(buf, pos); 3759 continue; 3760 case '\r': 3761 emitCarriageReturn(buf, pos); 3762 break stateloop; 3763 case '\n': 3764 silentLineFeed(); 3765 // CPPONLY: MOZ_FALLTHROUGH; 3766 default: 3767 /* 3768 * Emit the current input character as a 3769 * character token. Stay in the RAWTEXT state. 
3770 */ 3771 continue; 3772 } 3773 } 3774 // CPPONLY: MOZ_FALLTHROUGH; 3775 case RAWTEXT_RCDATA_LESS_THAN_SIGN: 3776 rawtextrcdatalessthansignloop: for (;;) { 3777 if (++pos == endPos) { 3778 break stateloop; 3779 } 3780 c = checkChar(buf, pos); 3781 switch (c) { 3782 case '/': 3783 /* 3784 * U+002F SOLIDUS (/) Set the temporary buffer 3785 * to the empty string. Switch to the script 3786 * data end tag open state. 3787 */ 3788 index = 0; 3789 clearStrBufBeforeUse(); 3790 state = transition(state, Tokenizer.NON_DATA_END_TAG_NAME, reconsume, pos); 3791 break rawtextrcdatalessthansignloop; 3792 // FALL THRU continue stateloop; 3793 default: 3794 /* 3795 * Otherwise, emit a U+003C LESS-THAN SIGN 3796 * character token 3797 */ 3798 tokenHandler.characters(Tokenizer.LT_GT, 0, 1); 3799 /* 3800 * and reconsume the current input character in 3801 * the data state. 3802 */ 3803 cstart = pos; 3804 reconsume = true; 3805 state = transition(state, returnState, reconsume, pos); 3806 continue stateloop; 3807 } 3808 } 3809 // CPPONLY: MOZ_FALLTHROUGH; 3810 case NON_DATA_END_TAG_NAME: 3811 for (;;) { 3812 if (++pos == endPos) { 3813 break stateloop; 3814 } 3815 c = checkChar(buf, pos); 3816 /* 3817 * ASSERT! when entering this state, set index to 0 and 3818 * call clearStrBufBeforeUse(); Let's implement the above 3819 * without lookahead. strBuf is the 'temporary buffer'. 
3820 */ 3821 if (endTagExpectationAsArray == null) { 3822 tokenHandler.characters(Tokenizer.LT_SOLIDUS, 3823 0, 2); 3824 cstart = pos; 3825 reconsume = true; 3826 state = transition(state, returnState, reconsume, pos); 3827 continue stateloop; 3828 } else if (index < endTagExpectationAsArray.length) { 3829 char e = endTagExpectationAsArray[index]; 3830 char folded = c; 3831 if (c >= 'A' && c <= 'Z') { 3832 folded += 0x20; 3833 } 3834 if (folded != e) { 3835 // [NOCPP[ 3836 errHtml4LtSlashInRcdata(folded); 3837 // ]NOCPP] 3838 tokenHandler.characters(Tokenizer.LT_SOLIDUS, 3839 0, 2); 3840 emitStrBuf(); 3841 cstart = pos; 3842 reconsume = true; 3843 state = transition(state, returnState, reconsume, pos); 3844 continue stateloop; 3845 } 3846 appendStrBuf(c); 3847 index++; 3848 continue; 3849 } else { 3850 endTag = true; 3851 // XXX replace contentModelElement with different 3852 // type 3853 tagName = endTagExpectation; 3854 switch (c) { 3855 case '\r': 3856 silentCarriageReturn(); 3857 clearStrBufAfterUse(); // strBuf not used 3858 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); 3859 break stateloop; 3860 case '\n': 3861 silentLineFeed(); 3862 // CPPONLY: MOZ_FALLTHROUGH; 3863 case ' ': 3864 case '\t': 3865 case '\u000C': 3866 /* 3867 * U+0009 CHARACTER TABULATION U+000A LINE 3868 * FEED (LF) U+000C FORM FEED (FF) U+0020 3869 * SPACE If the current end tag token is an 3870 * appropriate end tag token, then switch to 3871 * the before attribute name state. 3872 */ 3873 clearStrBufAfterUse(); // strBuf not used 3874 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); 3875 continue stateloop; 3876 case '/': 3877 /* 3878 * U+002F SOLIDUS (/) If the current end tag 3879 * token is an appropriate end tag token, 3880 * then switch to the self-closing start tag 3881 * state. 
3882 */ 3883 clearStrBufAfterUse(); // strBuf not used 3884 state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos); 3885 continue stateloop; 3886 case '>': 3887 /* 3888 * U+003E GREATER-THAN SIGN (>) If the 3889 * current end tag token is an appropriate 3890 * end tag token, then emit the current tag 3891 * token and switch to the data state. 3892 */ 3893 clearStrBufAfterUse(); // strBuf not used 3894 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos); 3895 if (shouldSuspend) { 3896 break stateloop; 3897 } 3898 continue stateloop; 3899 default: 3900 /* 3901 * Emit a U+003C LESS-THAN SIGN character 3902 * token, a U+002F SOLIDUS character token, 3903 * a character token for each of the 3904 * characters in the temporary buffer (in 3905 * the order they were added to the buffer), 3906 * and reconsume the current input character 3907 * in the RAWTEXT state. 3908 */ 3909 // [NOCPP[ 3910 errWarnLtSlashInRcdata(); 3911 // ]NOCPP] 3912 tokenHandler.characters( 3913 Tokenizer.LT_SOLIDUS, 0, 2); 3914 emitStrBuf(); 3915 cstart = pos; // don't drop the 3916 // character 3917 reconsume = true; 3918 state = transition(state, returnState, reconsume, pos); 3919 continue stateloop; 3920 } 3921 } 3922 } 3923 // BEGIN HOTSPOT WORKAROUND 3924 case BOGUS_COMMENT: 3925 boguscommentloop: for (;;) { 3926 if (reconsume) { 3927 reconsume = false; 3928 } else { 3929 if (++pos == endPos) { 3930 break stateloop; 3931 } 3932 c = checkChar(buf, pos); 3933 } 3934 /* 3935 * Consume every character up to and including the first 3936 * U+003E GREATER-THAN SIGN character (>) or the end of 3937 * the file (EOF), whichever comes first. Emit a comment 3938 * token whose data is the concatenation of all the 3939 * characters starting from and including the character 3940 * that caused the state machine to switch into the 3941 * bogus comment state, up to and including the 3942 * character immediately before the last consumed 3943 * character (i.e. 
up to the character just before the 3944 * U+003E or EOF character). (If the comment was started 3945 * by the end of the file (EOF), the token is empty.) 3946 * 3947 * Switch to the data state. 3948 * 3949 * If the end of the file was reached, reconsume the EOF 3950 * character. 3951 */ 3952 switch (c) { 3953 case '>': 3954 emitComment(0, pos); 3955 state = transition(state, Tokenizer.DATA, reconsume, pos); 3956 continue stateloop; 3957 case '-': 3958 appendStrBuf(c); 3959 state = transition(state, Tokenizer.BOGUS_COMMENT_HYPHEN, reconsume, pos); 3960 break boguscommentloop; 3961 case '\r': 3962 appendStrBufCarriageReturn(); 3963 break stateloop; 3964 case '\n': 3965 appendStrBufLineFeed(); 3966 continue; 3967 case '\u0000': 3968 c = '\uFFFD'; 3969 // CPPONLY: MOZ_FALLTHROUGH; 3970 default: 3971 appendStrBuf(c); 3972 continue; 3973 } 3974 } 3975 // CPPONLY: MOZ_FALLTHROUGH; 3976 case BOGUS_COMMENT_HYPHEN: 3977 boguscommenthyphenloop: for (;;) { 3978 if (++pos == endPos) { 3979 break stateloop; 3980 } 3981 c = checkChar(buf, pos); 3982 switch (c) { 3983 case '>': 3984 // [NOCPP[ 3985 maybeAppendSpaceToBogusComment(); 3986 // ]NOCPP] 3987 emitComment(0, pos); 3988 state = transition(state, Tokenizer.DATA, reconsume, pos); 3989 continue stateloop; 3990 case '-': 3991 appendSecondHyphenToBogusComment(); 3992 continue boguscommenthyphenloop; 3993 case '\r': 3994 appendStrBufCarriageReturn(); 3995 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); 3996 break stateloop; 3997 case '\n': 3998 appendStrBufLineFeed(); 3999 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); 4000 continue stateloop; 4001 case '\u0000': 4002 c = '\uFFFD'; 4003 // CPPONLY: MOZ_FALLTHROUGH; 4004 default: 4005 appendStrBuf(c); 4006 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); 4007 continue stateloop; 4008 } 4009 } 4010 case SCRIPT_DATA: 4011 scriptdataloop: for (;;) { 4012 if (reconsume) { 4013 reconsume = false; 4014 } else { 4015 if (++pos 
== endPos) { 4016 break stateloop; 4017 } 4018 c = checkChar(buf, pos); 4019 } 4020 switch (c) { 4021 case '<': 4022 /* 4023 * U+003C LESS-THAN SIGN (<) Switch to the 4024 * script data less-than sign state. 4025 */ 4026 flushChars(buf, pos); 4027 returnState = state; 4028 state = transition(state, Tokenizer.SCRIPT_DATA_LESS_THAN_SIGN, reconsume, pos); 4029 break scriptdataloop; // FALL THRU continue 4030 // stateloop; 4031 case '\u0000': 4032 emitReplacementCharacter(buf, pos); 4033 continue; 4034 case '\r': 4035 emitCarriageReturn(buf, pos); 4036 break stateloop; 4037 case '\n': 4038 silentLineFeed(); 4039 // CPPONLY: MOZ_FALLTHROUGH; 4040 default: 4041 /* 4042 * Anything else Emit the current input 4043 * character as a character token. Stay in the 4044 * script data state. 4045 */ 4046 continue; 4047 } 4048 } 4049 // CPPONLY: MOZ_FALLTHROUGH; 4050 case SCRIPT_DATA_LESS_THAN_SIGN: 4051 scriptdatalessthansignloop: for (;;) { 4052 if (++pos == endPos) { 4053 break stateloop; 4054 } 4055 c = checkChar(buf, pos); 4056 switch (c) { 4057 case '/': 4058 /* 4059 * U+002F SOLIDUS (/) Set the temporary buffer 4060 * to the empty string. Switch to the script 4061 * data end tag open state. 4062 */ 4063 index = 0; 4064 clearStrBufBeforeUse(); 4065 state = transition(state, Tokenizer.NON_DATA_END_TAG_NAME, reconsume, pos); 4066 continue stateloop; 4067 case '!': 4068 tokenHandler.characters(Tokenizer.LT_GT, 0, 1); 4069 cstart = pos; 4070 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPE_START, reconsume, pos); 4071 break scriptdatalessthansignloop; // FALL THRU 4072 // continue 4073 // stateloop; 4074 default: 4075 /* 4076 * Otherwise, emit a U+003C LESS-THAN SIGN 4077 * character token 4078 */ 4079 tokenHandler.characters(Tokenizer.LT_GT, 0, 1); 4080 /* 4081 * and reconsume the current input character in 4082 * the data state. 
4083 */ 4084 cstart = pos; 4085 reconsume = true; 4086 state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos); 4087 continue stateloop; 4088 } 4089 } 4090 // CPPONLY: MOZ_FALLTHROUGH; 4091 case SCRIPT_DATA_ESCAPE_START: 4092 scriptdataescapestartloop: for (;;) { 4093 if (++pos == endPos) { 4094 break stateloop; 4095 } 4096 c = checkChar(buf, pos); 4097 /* 4098 * Consume the next input character: 4099 */ 4100 switch (c) { 4101 case '-': 4102 /* 4103 * U+002D HYPHEN-MINUS (-) Emit a U+002D 4104 * HYPHEN-MINUS character token. Switch to the 4105 * script data escape start dash state. 4106 */ 4107 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPE_START_DASH, reconsume, pos); 4108 break scriptdataescapestartloop; // FALL THRU 4109 // continue 4110 // stateloop; 4111 default: 4112 /* 4113 * Anything else Reconsume the current input 4114 * character in the script data state. 4115 */ 4116 reconsume = true; 4117 state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos); 4118 continue stateloop; 4119 } 4120 } 4121 // CPPONLY: MOZ_FALLTHROUGH; 4122 case SCRIPT_DATA_ESCAPE_START_DASH: 4123 scriptdataescapestartdashloop: for (;;) { 4124 if (++pos == endPos) { 4125 break stateloop; 4126 } 4127 c = checkChar(buf, pos); 4128 /* 4129 * Consume the next input character: 4130 */ 4131 switch (c) { 4132 case '-': 4133 /* 4134 * U+002D HYPHEN-MINUS (-) Emit a U+002D 4135 * HYPHEN-MINUS character token. Switch to the 4136 * script data escaped dash dash state. 4137 */ 4138 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_DASH_DASH, reconsume, pos); 4139 break scriptdataescapestartdashloop; 4140 // continue stateloop; 4141 default: 4142 /* 4143 * Anything else Reconsume the current input 4144 * character in the script data state. 
4145 */ 4146 reconsume = true; 4147 state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos); 4148 continue stateloop; 4149 } 4150 } 4151 // CPPONLY: MOZ_FALLTHROUGH; 4152 case SCRIPT_DATA_ESCAPED_DASH_DASH: 4153 scriptdataescapeddashdashloop: for (;;) { 4154 if (++pos == endPos) { 4155 break stateloop; 4156 } 4157 c = checkChar(buf, pos); 4158 /* 4159 * Consume the next input character: 4160 */ 4161 switch (c) { 4162 case '-': 4163 /* 4164 * U+002D HYPHEN-MINUS (-) Emit a U+002D 4165 * HYPHEN-MINUS character token. Stay in the 4166 * script data escaped dash dash state. 4167 */ 4168 continue; 4169 case '<': 4170 /* 4171 * U+003C LESS-THAN SIGN (<) Switch to the 4172 * script data escaped less-than sign state. 4173 */ 4174 flushChars(buf, pos); 4175 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN, reconsume, pos); 4176 continue stateloop; 4177 case '>': 4178 /* 4179 * U+003E GREATER-THAN SIGN (>) Emit a U+003E 4180 * GREATER-THAN SIGN character token. Switch to 4181 * the script data state. 4182 */ 4183 state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos); 4184 continue stateloop; 4185 case '\u0000': 4186 emitReplacementCharacter(buf, pos); 4187 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); 4188 break scriptdataescapeddashdashloop; 4189 case '\r': 4190 emitCarriageReturn(buf, pos); 4191 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); 4192 break stateloop; 4193 case '\n': 4194 silentLineFeed(); 4195 // CPPONLY: MOZ_FALLTHROUGH; 4196 default: 4197 /* 4198 * Anything else Emit the current input 4199 * character as a character token. Switch to the 4200 * script data escaped state. 
4201 */ 4202 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); 4203 break scriptdataescapeddashdashloop; 4204 // continue stateloop; 4205 } 4206 } 4207 // CPPONLY: MOZ_FALLTHROUGH; 4208 case SCRIPT_DATA_ESCAPED: 4209 scriptdataescapedloop: for (;;) { 4210 if (reconsume) { 4211 reconsume = false; 4212 } else { 4213 if (++pos == endPos) { 4214 break stateloop; 4215 } 4216 c = checkChar(buf, pos); 4217 } 4218 /* 4219 * Consume the next input character: 4220 */ 4221 switch (c) { 4222 case '-': 4223 /* 4224 * U+002D HYPHEN-MINUS (-) Emit a U+002D 4225 * HYPHEN-MINUS character token. Switch to the 4226 * script data escaped dash state. 4227 */ 4228 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_DASH, reconsume, pos); 4229 break scriptdataescapedloop; // FALL THRU 4230 // continue 4231 // stateloop; 4232 case '<': 4233 /* 4234 * U+003C LESS-THAN SIGN (<) Switch to the 4235 * script data escaped less-than sign state. 4236 */ 4237 flushChars(buf, pos); 4238 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN, reconsume, pos); 4239 continue stateloop; 4240 case '\u0000': 4241 emitReplacementCharacter(buf, pos); 4242 continue; 4243 case '\r': 4244 emitCarriageReturn(buf, pos); 4245 break stateloop; 4246 case '\n': 4247 silentLineFeed(); 4248 // CPPONLY: MOZ_FALLTHROUGH; 4249 default: 4250 /* 4251 * Anything else Emit the current input 4252 * character as a character token. Stay in the 4253 * script data escaped state. 4254 */ 4255 continue; 4256 } 4257 } 4258 // CPPONLY: MOZ_FALLTHROUGH; 4259 case SCRIPT_DATA_ESCAPED_DASH: 4260 scriptdataescapeddashloop: for (;;) { 4261 if (++pos == endPos) { 4262 break stateloop; 4263 } 4264 c = checkChar(buf, pos); 4265 /* 4266 * Consume the next input character: 4267 */ 4268 switch (c) { 4269 case '-': 4270 /* 4271 * U+002D HYPHEN-MINUS (-) Emit a U+002D 4272 * HYPHEN-MINUS character token. Switch to the 4273 * script data escaped dash dash state. 
4274 */ 4275 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_DASH_DASH, reconsume, pos); 4276 continue stateloop; 4277 case '<': 4278 /* 4279 * U+003C LESS-THAN SIGN (<) Switch to the 4280 * script data escaped less-than sign state. 4281 */ 4282 flushChars(buf, pos); 4283 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN, reconsume, pos); 4284 break scriptdataescapeddashloop; 4285 // continue stateloop; 4286 case '\u0000': 4287 emitReplacementCharacter(buf, pos); 4288 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); 4289 continue stateloop; 4290 case '\r': 4291 emitCarriageReturn(buf, pos); 4292 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); 4293 break stateloop; 4294 case '\n': 4295 silentLineFeed(); 4296 // CPPONLY: MOZ_FALLTHROUGH; 4297 default: 4298 /* 4299 * Anything else Emit the current input 4300 * character as a character token. Switch to the 4301 * script data escaped state. 4302 */ 4303 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); 4304 continue stateloop; 4305 } 4306 } 4307 // CPPONLY: MOZ_FALLTHROUGH; 4308 case SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN: 4309 scriptdataescapedlessthanloop: for (;;) { 4310 if (++pos == endPos) { 4311 break stateloop; 4312 } 4313 c = checkChar(buf, pos); 4314 /* 4315 * Consume the next input character: 4316 */ 4317 switch (c) { 4318 case '/': 4319 /* 4320 * U+002F SOLIDUS (/) Set the temporary buffer 4321 * to the empty string. Switch to the script 4322 * data escaped end tag open state. 
4323 */ 4324 index = 0; 4325 clearStrBufBeforeUse(); 4326 returnState = Tokenizer.SCRIPT_DATA_ESCAPED; 4327 state = transition(state, Tokenizer.NON_DATA_END_TAG_NAME, reconsume, pos); 4328 continue stateloop; 4329 case 'S': 4330 case 's': 4331 /* 4332 * U+0041 LATIN CAPITAL LETTER A through to 4333 * U+005A LATIN CAPITAL LETTER Z Emit a U+003C 4334 * LESS-THAN SIGN character token and the 4335 * current input character as a character token. 4336 */ 4337 tokenHandler.characters(Tokenizer.LT_GT, 0, 1); 4338 cstart = pos; 4339 index = 1; 4340 /* 4341 * Set the temporary buffer to the empty string. 4342 * Append the lowercase version of the current 4343 * input character (add 0x0020 to the 4344 * character's code point) to the temporary 4345 * buffer. Switch to the script data double 4346 * escape start state. 4347 */ 4348 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPE_START, reconsume, pos); 4349 break scriptdataescapedlessthanloop; 4350 // continue stateloop; 4351 default: 4352 /* 4353 * Anything else Emit a U+003C LESS-THAN SIGN 4354 * character token and reconsume the current 4355 * input character in the script data escaped 4356 * state. 
4357 */ 4358 tokenHandler.characters(Tokenizer.LT_GT, 0, 1); 4359 cstart = pos; 4360 reconsume = true; 4361 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); 4362 continue stateloop; 4363 } 4364 } 4365 // CPPONLY: MOZ_FALLTHROUGH; 4366 case SCRIPT_DATA_DOUBLE_ESCAPE_START: 4367 scriptdatadoubleescapestartloop: for (;;) { 4368 if (++pos == endPos) { 4369 break stateloop; 4370 } 4371 c = checkChar(buf, pos); 4372 assert index > 0; 4373 if (index < 6) { // SCRIPT_ARR.length 4374 char folded = c; 4375 if (c >= 'A' && c <= 'Z') { 4376 folded += 0x20; 4377 } 4378 if (folded != Tokenizer.SCRIPT_ARR[index]) { 4379 reconsume = true; 4380 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); 4381 continue stateloop; 4382 } 4383 index++; 4384 continue; 4385 } 4386 switch (c) { 4387 case '\r': 4388 emitCarriageReturn(buf, pos); 4389 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); 4390 break stateloop; 4391 case '\n': 4392 silentLineFeed(); 4393 // CPPONLY: MOZ_FALLTHROUGH; 4394 case ' ': 4395 case '\t': 4396 case '\u000C': 4397 case '/': 4398 case '>': 4399 /* 4400 * U+0009 CHARACTER TABULATION U+000A LINE FEED 4401 * (LF) U+000C FORM FEED (FF) U+0020 SPACE 4402 * U+002F SOLIDUS (/) U+003E GREATER-THAN SIGN 4403 * (>) Emit the current input character as a 4404 * character token. If the temporary buffer is 4405 * the string "script", then switch to the 4406 * script data double escaped state. 4407 */ 4408 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); 4409 break scriptdatadoubleescapestartloop; 4410 // continue stateloop; 4411 default: 4412 /* 4413 * Anything else Reconsume the current input 4414 * character in the script data escaped state. 
4415 */ 4416 reconsume = true; 4417 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); 4418 continue stateloop; 4419 } 4420 } 4421 // CPPONLY: MOZ_FALLTHROUGH; 4422 case SCRIPT_DATA_DOUBLE_ESCAPED: 4423 scriptdatadoubleescapedloop: for (;;) { 4424 if (reconsume) { 4425 reconsume = false; 4426 } else { 4427 if (++pos == endPos) { 4428 break stateloop; 4429 } 4430 c = checkChar(buf, pos); 4431 } 4432 /* 4433 * Consume the next input character: 4434 */ 4435 switch (c) { 4436 case '-': 4437 /* 4438 * U+002D HYPHEN-MINUS (-) Emit a U+002D 4439 * HYPHEN-MINUS character token. Switch to the 4440 * script data double escaped dash state. 4441 */ 4442 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_DASH, reconsume, pos); 4443 break scriptdatadoubleescapedloop; // FALL THRU 4444 // continue 4445 // stateloop; 4446 case '<': 4447 /* 4448 * U+003C LESS-THAN SIGN (<) Emit a U+003C 4449 * LESS-THAN SIGN character token. Switch to the 4450 * script data double escaped less-than sign 4451 * state. 4452 */ 4453 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN, reconsume, pos); 4454 continue stateloop; 4455 case '\u0000': 4456 emitReplacementCharacter(buf, pos); 4457 continue; 4458 case '\r': 4459 emitCarriageReturn(buf, pos); 4460 break stateloop; 4461 case '\n': 4462 silentLineFeed(); 4463 // CPPONLY: MOZ_FALLTHROUGH; 4464 default: 4465 /* 4466 * Anything else Emit the current input 4467 * character as a character token. Stay in the 4468 * script data double escaped state. 4469 */ 4470 continue; 4471 } 4472 } 4473 // CPPONLY: MOZ_FALLTHROUGH; 4474 case SCRIPT_DATA_DOUBLE_ESCAPED_DASH: 4475 scriptdatadoubleescapeddashloop: for (;;) { 4476 if (++pos == endPos) { 4477 break stateloop; 4478 } 4479 c = checkChar(buf, pos); 4480 /* 4481 * Consume the next input character: 4482 */ 4483 switch (c) { 4484 case '-': 4485 /* 4486 * U+002D HYPHEN-MINUS (-) Emit a U+002D 4487 * HYPHEN-MINUS character token. 
Switch to the 4488 * script data double escaped dash dash state. 4489 */ 4490 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH, reconsume, pos); 4491 break scriptdatadoubleescapeddashloop; 4492 // continue stateloop; 4493 case '<': 4494 /* 4495 * U+003C LESS-THAN SIGN (<) Emit a U+003C 4496 * LESS-THAN SIGN character token. Switch to the 4497 * script data double escaped less-than sign 4498 * state. 4499 */ 4500 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN, reconsume, pos); 4501 continue stateloop; 4502 case '\u0000': 4503 emitReplacementCharacter(buf, pos); 4504 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); 4505 continue stateloop; 4506 case '\r': 4507 emitCarriageReturn(buf, pos); 4508 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); 4509 break stateloop; 4510 case '\n': 4511 silentLineFeed(); 4512 // CPPONLY: MOZ_FALLTHROUGH; 4513 default: 4514 /* 4515 * Anything else Emit the current input 4516 * character as a character token. Switch to the 4517 * script data double escaped state. 4518 */ 4519 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); 4520 continue stateloop; 4521 } 4522 } 4523 // CPPONLY: MOZ_FALLTHROUGH; 4524 case SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH: 4525 scriptdatadoubleescapeddashdashloop: for (;;) { 4526 if (++pos == endPos) { 4527 break stateloop; 4528 } 4529 c = checkChar(buf, pos); 4530 /* 4531 * Consume the next input character: 4532 */ 4533 switch (c) { 4534 case '-': 4535 /* 4536 * U+002D HYPHEN-MINUS (-) Emit a U+002D 4537 * HYPHEN-MINUS character token. Stay in the 4538 * script data double escaped dash dash state. 4539 */ 4540 continue; 4541 case '<': 4542 /* 4543 * U+003C LESS-THAN SIGN (<) Emit a U+003C 4544 * LESS-THAN SIGN character token. Switch to the 4545 * script data double escaped less-than sign 4546 * state. 
4547 */ 4548 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN, reconsume, pos); 4549 break scriptdatadoubleescapeddashdashloop; 4550 case '>': 4551 /* 4552 * U+003E GREATER-THAN SIGN (>) Emit a U+003E 4553 * GREATER-THAN SIGN character token. Switch to 4554 * the script data state. 4555 */ 4556 state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos); 4557 continue stateloop; 4558 case '\u0000': 4559 emitReplacementCharacter(buf, pos); 4560 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); 4561 continue stateloop; 4562 case '\r': 4563 emitCarriageReturn(buf, pos); 4564 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); 4565 break stateloop; 4566 case '\n': 4567 silentLineFeed(); 4568 // CPPONLY: MOZ_FALLTHROUGH; 4569 default: 4570 /* 4571 * Anything else Emit the current input 4572 * character as a character token. Switch to the 4573 * script data double escaped state. 4574 */ 4575 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); 4576 continue stateloop; 4577 } 4578 } 4579 // CPPONLY: MOZ_FALLTHROUGH; 4580 case SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN: 4581 scriptdatadoubleescapedlessthanloop: for (;;) { 4582 if (++pos == endPos) { 4583 break stateloop; 4584 } 4585 c = checkChar(buf, pos); 4586 /* 4587 * Consume the next input character: 4588 */ 4589 switch (c) { 4590 case '/': 4591 /* 4592 * U+002F SOLIDUS (/) Emit a U+002F SOLIDUS 4593 * character token. Set the temporary buffer to 4594 * the empty string. Switch to the script data 4595 * double escape end state. 4596 */ 4597 index = 0; 4598 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPE_END, reconsume, pos); 4599 break scriptdatadoubleescapedlessthanloop; 4600 default: 4601 /* 4602 * Anything else Reconsume the current input 4603 * character in the script data double escaped 4604 * state. 
4605 */ 4606 reconsume = true; 4607 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); 4608 continue stateloop; 4609 } 4610 } 4611 // CPPONLY: MOZ_FALLTHROUGH; 4612 case SCRIPT_DATA_DOUBLE_ESCAPE_END: 4613 scriptdatadoubleescapeendloop: for (;;) { 4614 if (++pos == endPos) { 4615 break stateloop; 4616 } 4617 c = checkChar(buf, pos); 4618 if (index < 6) { // SCRIPT_ARR.length 4619 char folded = c; 4620 if (c >= 'A' && c <= 'Z') { 4621 folded += 0x20; 4622 } 4623 if (folded != Tokenizer.SCRIPT_ARR[index]) { 4624 reconsume = true; 4625 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); 4626 continue stateloop; 4627 } 4628 index++; 4629 continue; 4630 } 4631 switch (c) { 4632 case '\r': 4633 emitCarriageReturn(buf, pos); 4634 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); 4635 break stateloop; 4636 case '\n': 4637 silentLineFeed(); 4638 // CPPONLY: MOZ_FALLTHROUGH; 4639 case ' ': 4640 case '\t': 4641 case '\u000C': 4642 case '/': 4643 case '>': 4644 /* 4645 * U+0009 CHARACTER TABULATION U+000A LINE FEED 4646 * (LF) U+000C FORM FEED (FF) U+0020 SPACE 4647 * U+002F SOLIDUS (/) U+003E GREATER-THAN SIGN 4648 * (>) Emit the current input character as a 4649 * character token. If the temporary buffer is 4650 * the string "script", then switch to the 4651 * script data escaped state. 4652 */ 4653 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); 4654 continue stateloop; 4655 default: 4656 /* 4657 * Reconsume the current input character in the 4658 * script data double escaped state. 
4659 */ 4660 reconsume = true; 4661 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); 4662 continue stateloop; 4663 } 4664 } 4665 case MARKUP_DECLARATION_OCTYPE: 4666 markupdeclarationdoctypeloop: for (;;) { 4667 if (++pos == endPos) { 4668 break stateloop; 4669 } 4670 c = checkChar(buf, pos); 4671 if (index < 6) { // OCTYPE.length 4672 char folded = c; 4673 if (c >= 'A' && c <= 'Z') { 4674 folded += 0x20; 4675 } 4676 if (folded == Tokenizer.OCTYPE[index]) { 4677 appendStrBuf(c); 4678 } else { 4679 errBogusComment(); 4680 reconsume = true; 4681 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); 4682 continue stateloop; 4683 } 4684 index++; 4685 continue; 4686 } else { 4687 reconsume = true; 4688 state = transition(state, Tokenizer.DOCTYPE, reconsume, pos); 4689 break markupdeclarationdoctypeloop; 4690 // continue stateloop; 4691 } 4692 } 4693 // CPPONLY: MOZ_FALLTHROUGH; 4694 case DOCTYPE: 4695 doctypeloop: for (;;) { 4696 if (reconsume) { 4697 reconsume = false; 4698 } else { 4699 if (++pos == endPos) { 4700 break stateloop; 4701 } 4702 c = checkChar(buf, pos); 4703 } 4704 initDoctypeFields(); 4705 /* 4706 * Consume the next input character: 4707 */ 4708 switch (c) { 4709 case '\r': 4710 silentCarriageReturn(); 4711 state = transition(state, Tokenizer.BEFORE_DOCTYPE_NAME, reconsume, pos); 4712 break stateloop; 4713 case '\n': 4714 silentLineFeed(); 4715 // CPPONLY: MOZ_FALLTHROUGH; 4716 case ' ': 4717 case '\t': 4718 case '\u000C': 4719 /* 4720 * U+0009 CHARACTER TABULATION U+000A LINE FEED 4721 * (LF) U+000C FORM FEED (FF) U+0020 SPACE 4722 * Switch to the before DOCTYPE name state. 4723 */ 4724 state = transition(state, Tokenizer.BEFORE_DOCTYPE_NAME, reconsume, pos); 4725 break doctypeloop; 4726 // continue stateloop; 4727 default: 4728 /* 4729 * Anything else Parse error. 4730 */ 4731 errMissingSpaceBeforeDoctypeName(); 4732 /* 4733 * Reconsume the current character in the before 4734 * DOCTYPE name state. 
4735 */ 4736 reconsume = true; 4737 state = transition(state, Tokenizer.BEFORE_DOCTYPE_NAME, reconsume, pos); 4738 break doctypeloop; 4739 // continue stateloop; 4740 } 4741 } 4742 // CPPONLY: MOZ_FALLTHROUGH; 4743 case BEFORE_DOCTYPE_NAME: 4744 beforedoctypenameloop: for (;;) { 4745 if (reconsume) { 4746 reconsume = false; 4747 } else { 4748 if (++pos == endPos) { 4749 break stateloop; 4750 } 4751 c = checkChar(buf, pos); 4752 } 4753 /* 4754 * Consume the next input character: 4755 */ 4756 switch (c) { 4757 case '\r': 4758 silentCarriageReturn(); 4759 break stateloop; 4760 case '\n': 4761 silentLineFeed(); 4762 // CPPONLY: MOZ_FALLTHROUGH; 4763 case ' ': 4764 case '\t': 4765 case '\u000C': 4766 /* 4767 * U+0009 CHARACTER TABULATION U+000A LINE FEED 4768 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay 4769 * in the before DOCTYPE name state. 4770 */ 4771 continue; 4772 case '>': 4773 /* 4774 * U+003E GREATER-THAN SIGN (>) Parse error. 4775 */ 4776 errNamelessDoctype(); 4777 /* 4778 * Create a new DOCTYPE token. Set its 4779 * force-quirks flag to on. 4780 */ 4781 forceQuirks = true; 4782 /* 4783 * Emit the token. 4784 */ 4785 emitDoctypeToken(pos); 4786 /* 4787 * Switch to the data state. 4788 */ 4789 state = transition(state, Tokenizer.DATA, reconsume, pos); 4790 continue stateloop; 4791 case '\u0000': 4792 c = '\uFFFD'; 4793 // CPPONLY: MOZ_FALLTHROUGH; 4794 default: 4795 if (c >= 'A' && c <= 'Z') { 4796 /* 4797 * U+0041 LATIN CAPITAL LETTER A through to 4798 * U+005A LATIN CAPITAL LETTER Z Create a 4799 * new DOCTYPE token. Set the token's name 4800 * to the lowercase version of the input 4801 * character (add 0x0020 to the character's 4802 * code point). 4803 */ 4804 c += 0x20; 4805 } 4806 /* Anything else Create a new DOCTYPE token. */ 4807 /* 4808 * Set the token's name name to the current 4809 * input character. 4810 */ 4811 clearStrBufBeforeUse(); 4812 appendStrBuf(c); 4813 /* 4814 * Switch to the DOCTYPE name state. 
4815 */ 4816 state = transition(state, Tokenizer.DOCTYPE_NAME, reconsume, pos); 4817 break beforedoctypenameloop; 4818 // continue stateloop; 4819 } 4820 } 4821 // CPPONLY: MOZ_FALLTHROUGH; 4822 case DOCTYPE_NAME: 4823 doctypenameloop: for (;;) { 4824 if (++pos == endPos) { 4825 break stateloop; 4826 } 4827 c = checkChar(buf, pos); 4828 /* 4829 * Consume the next input character: 4830 */ 4831 switch (c) { 4832 case '\r': 4833 silentCarriageReturn(); 4834 strBufToDoctypeName(); 4835 state = transition(state, Tokenizer.AFTER_DOCTYPE_NAME, reconsume, pos); 4836 break stateloop; 4837 case '\n': 4838 silentLineFeed(); 4839 // CPPONLY: MOZ_FALLTHROUGH; 4840 case ' ': 4841 case '\t': 4842 case '\u000C': 4843 /* 4844 * U+0009 CHARACTER TABULATION U+000A LINE FEED 4845 * (LF) U+000C FORM FEED (FF) U+0020 SPACE 4846 * Switch to the after DOCTYPE name state. 4847 */ 4848 strBufToDoctypeName(); 4849 state = transition(state, Tokenizer.AFTER_DOCTYPE_NAME, reconsume, pos); 4850 break doctypenameloop; 4851 // continue stateloop; 4852 case '>': 4853 /* 4854 * U+003E GREATER-THAN SIGN (>) Emit the current 4855 * DOCTYPE token. 4856 */ 4857 strBufToDoctypeName(); 4858 emitDoctypeToken(pos); 4859 /* 4860 * Switch to the data state. 4861 */ 4862 state = transition(state, Tokenizer.DATA, reconsume, pos); 4863 continue stateloop; 4864 case '\u0000': 4865 c = '\uFFFD'; 4866 // CPPONLY: MOZ_FALLTHROUGH; 4867 default: 4868 /* 4869 * U+0041 LATIN CAPITAL LETTER A through to 4870 * U+005A LATIN CAPITAL LETTER Z Append the 4871 * lowercase version of the input character (add 4872 * 0x0020 to the character's code point) to the 4873 * current DOCTYPE token's name. 4874 */ 4875 if (c >= 'A' && c <= 'Z') { 4876 c += 0x0020; 4877 } 4878 /* 4879 * Anything else Append the current input 4880 * character to the current DOCTYPE token's 4881 * name. 4882 */ 4883 appendStrBuf(c); 4884 /* 4885 * Stay in the DOCTYPE name state. 
4886 */ 4887 continue; 4888 } 4889 } 4890 // CPPONLY: MOZ_FALLTHROUGH; 4891 case AFTER_DOCTYPE_NAME: 4892 afterdoctypenameloop: for (;;) { 4893 if (++pos == endPos) { 4894 break stateloop; 4895 } 4896 c = checkChar(buf, pos); 4897 /* 4898 * Consume the next input character: 4899 */ 4900 switch (c) { 4901 case '\r': 4902 silentCarriageReturn(); 4903 break stateloop; 4904 case '\n': 4905 silentLineFeed(); 4906 // CPPONLY: MOZ_FALLTHROUGH; 4907 case ' ': 4908 case '\t': 4909 case '\u000C': 4910 /* 4911 * U+0009 CHARACTER TABULATION U+000A LINE FEED 4912 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay 4913 * in the after DOCTYPE name state. 4914 */ 4915 continue; 4916 case '>': 4917 /* 4918 * U+003E GREATER-THAN SIGN (>) Emit the current 4919 * DOCTYPE token. 4920 */ 4921 emitDoctypeToken(pos); 4922 /* 4923 * Switch to the data state. 4924 */ 4925 state = transition(state, Tokenizer.DATA, reconsume, pos); 4926 continue stateloop; 4927 case 'p': 4928 case 'P': 4929 index = 0; 4930 state = transition(state, Tokenizer.DOCTYPE_UBLIC, reconsume, pos); 4931 break afterdoctypenameloop; 4932 // continue stateloop; 4933 case 's': 4934 case 'S': 4935 index = 0; 4936 state = transition(state, Tokenizer.DOCTYPE_YSTEM, reconsume, pos); 4937 continue stateloop; 4938 default: 4939 /* 4940 * Otherwise, this is the parse error. 4941 */ 4942 bogusDoctype(); 4943 4944 /* 4945 * Set the DOCTYPE token's force-quirks flag to 4946 * on. 4947 */ 4948 // done by bogusDoctype(); 4949 /* 4950 * Switch to the bogus DOCTYPE state. 
4951 */ 4952 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); 4953 continue stateloop; 4954 } 4955 } 4956 // CPPONLY: MOZ_FALLTHROUGH; 4957 case DOCTYPE_UBLIC: 4958 doctypeublicloop: for (;;) { 4959 if (++pos == endPos) { 4960 break stateloop; 4961 } 4962 c = checkChar(buf, pos); 4963 /* 4964 * If the six characters starting from the current input 4965 * character are an ASCII case-insensitive match for the 4966 * word "PUBLIC", then consume those characters and 4967 * switch to the before DOCTYPE public identifier state. 4968 */ 4969 if (index < 5) { // UBLIC.length 4970 char folded = c; 4971 if (c >= 'A' && c <= 'Z') { 4972 folded += 0x20; 4973 } 4974 if (folded != Tokenizer.UBLIC[index]) { 4975 bogusDoctype(); 4976 // forceQuirks = true; 4977 reconsume = true; 4978 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); 4979 continue stateloop; 4980 } 4981 index++; 4982 continue; 4983 } else { 4984 reconsume = true; 4985 state = transition(state, Tokenizer.AFTER_DOCTYPE_PUBLIC_KEYWORD, reconsume, pos); 4986 break doctypeublicloop; 4987 // continue stateloop; 4988 } 4989 } 4990 // CPPONLY: MOZ_FALLTHROUGH; 4991 case AFTER_DOCTYPE_PUBLIC_KEYWORD: 4992 afterdoctypepublickeywordloop: for (;;) { 4993 if (reconsume) { 4994 reconsume = false; 4995 } else { 4996 if (++pos == endPos) { 4997 break stateloop; 4998 } 4999 c = checkChar(buf, pos); 5000 } 5001 /* 5002 * Consume the next input character: 5003 */ 5004 switch (c) { 5005 case '\r': 5006 silentCarriageReturn(); 5007 state = transition(state, Tokenizer.BEFORE_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos); 5008 break stateloop; 5009 case '\n': 5010 silentLineFeed(); 5011 // CPPONLY: MOZ_FALLTHROUGH; 5012 case ' ': 5013 case '\t': 5014 case '\u000C': 5015 /* 5016 * U+0009 CHARACTER TABULATION U+000A LINE FEED 5017 * (LF) U+000C FORM FEED (FF) U+0020 SPACE 5018 * Switch to the before DOCTYPE public 5019 * identifier state. 
5020 */ 5021 state = transition(state, Tokenizer.BEFORE_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos); 5022 break afterdoctypepublickeywordloop; 5023 // FALL THROUGH continue stateloop 5024 case '"': 5025 /* 5026 * U+0022 QUOTATION MARK (") Parse Error. 5027 */ 5028 errNoSpaceBetweenDoctypePublicKeywordAndQuote(); 5029 /* 5030 * Set the DOCTYPE token's public identifier to 5031 * the empty string (not missing), 5032 */ 5033 clearStrBufBeforeUse(); 5034 /* 5035 * then switch to the DOCTYPE public identifier 5036 * (double-quoted) state. 5037 */ 5038 state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos); 5039 continue stateloop; 5040 case '\'': 5041 /* 5042 * U+0027 APOSTROPHE (') Parse Error. 5043 */ 5044 errNoSpaceBetweenDoctypePublicKeywordAndQuote(); 5045 /* 5046 * Set the DOCTYPE token's public identifier to 5047 * the empty string (not missing), 5048 */ 5049 clearStrBufBeforeUse(); 5050 /* 5051 * then switch to the DOCTYPE public identifier 5052 * (single-quoted) state. 5053 */ 5054 state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED, reconsume, pos); 5055 continue stateloop; 5056 case '>': 5057 /* U+003E GREATER-THAN SIGN (>) Parse error. */ 5058 errExpectedPublicId(); 5059 /* 5060 * Set the DOCTYPE token's force-quirks flag to 5061 * on. 5062 */ 5063 forceQuirks = true; 5064 /* 5065 * Emit that DOCTYPE token. 5066 */ 5067 emitDoctypeToken(pos); 5068 /* 5069 * Switch to the data state. 5070 */ 5071 state = transition(state, Tokenizer.DATA, reconsume, pos); 5072 continue stateloop; 5073 default: 5074 bogusDoctype(); 5075 /* 5076 * Set the DOCTYPE token's force-quirks flag to 5077 * on. 5078 */ 5079 // done by bogusDoctype(); 5080 /* 5081 * Switch to the bogus DOCTYPE state. 
5082 */ 5083 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); 5084 continue stateloop; 5085 } 5086 } 5087 // CPPONLY: MOZ_FALLTHROUGH; 5088 case BEFORE_DOCTYPE_PUBLIC_IDENTIFIER: 5089 beforedoctypepublicidentifierloop: for (;;) { 5090 if (++pos == endPos) { 5091 break stateloop; 5092 } 5093 c = checkChar(buf, pos); 5094 /* 5095 * Consume the next input character: 5096 */ 5097 switch (c) { 5098 case '\r': 5099 silentCarriageReturn(); 5100 break stateloop; 5101 case '\n': 5102 silentLineFeed(); 5103 // CPPONLY: MOZ_FALLTHROUGH; 5104 case ' ': 5105 case '\t': 5106 case '\u000C': 5107 /* 5108 * U+0009 CHARACTER TABULATION U+000A LINE FEED 5109 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay 5110 * in the before DOCTYPE public identifier 5111 * state. 5112 */ 5113 continue; 5114 case '"': 5115 /* 5116 * U+0022 QUOTATION MARK (") Set the DOCTYPE 5117 * token's public identifier to the empty string 5118 * (not missing), 5119 */ 5120 clearStrBufBeforeUse(); 5121 /* 5122 * then switch to the DOCTYPE public identifier 5123 * (double-quoted) state. 5124 */ 5125 state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos); 5126 break beforedoctypepublicidentifierloop; 5127 // continue stateloop; 5128 case '\'': 5129 /* 5130 * U+0027 APOSTROPHE (') Set the DOCTYPE token's 5131 * public identifier to the empty string (not 5132 * missing), 5133 */ 5134 clearStrBufBeforeUse(); 5135 /* 5136 * then switch to the DOCTYPE public identifier 5137 * (single-quoted) state. 5138 */ 5139 state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED, reconsume, pos); 5140 continue stateloop; 5141 case '>': 5142 /* U+003E GREATER-THAN SIGN (>) Parse error. */ 5143 errExpectedPublicId(); 5144 /* 5145 * Set the DOCTYPE token's force-quirks flag to 5146 * on. 5147 */ 5148 forceQuirks = true; 5149 /* 5150 * Emit that DOCTYPE token. 5151 */ 5152 emitDoctypeToken(pos); 5153 /* 5154 * Switch to the data state. 
5155 */ 5156 state = transition(state, Tokenizer.DATA, reconsume, pos); 5157 continue stateloop; 5158 default: 5159 bogusDoctype(); 5160 /* 5161 * Set the DOCTYPE token's force-quirks flag to 5162 * on. 5163 */ 5164 // done by bogusDoctype(); 5165 /* 5166 * Switch to the bogus DOCTYPE state. 5167 */ 5168 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); 5169 continue stateloop; 5170 } 5171 } 5172 // CPPONLY: MOZ_FALLTHROUGH; 5173 case DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED: 5174 doctypepublicidentifierdoublequotedloop: for (;;) { 5175 if (++pos == endPos) { 5176 break stateloop; 5177 } 5178 c = checkChar(buf, pos); 5179 /* 5180 * Consume the next input character: 5181 */ 5182 switch (c) { 5183 case '"': 5184 /* 5185 * U+0022 QUOTATION MARK (") Switch to the after 5186 * DOCTYPE public identifier state. 5187 */ 5188 publicIdentifier = strBufToString(); 5189 state = transition(state, Tokenizer.AFTER_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos); 5190 break doctypepublicidentifierdoublequotedloop; 5191 // continue stateloop; 5192 case '>': 5193 /* 5194 * U+003E GREATER-THAN SIGN (>) Parse error. 5195 */ 5196 errGtInPublicId(); 5197 /* 5198 * Set the DOCTYPE token's force-quirks flag to 5199 * on. 5200 */ 5201 forceQuirks = true; 5202 /* 5203 * Emit that DOCTYPE token. 5204 */ 5205 publicIdentifier = strBufToString(); 5206 emitDoctypeToken(pos); 5207 /* 5208 * Switch to the data state. 5209 */ 5210 state = transition(state, Tokenizer.DATA, reconsume, pos); 5211 continue stateloop; 5212 case '\r': 5213 appendStrBufCarriageReturn(); 5214 break stateloop; 5215 case '\n': 5216 appendStrBufLineFeed(); 5217 continue; 5218 case '\u0000': 5219 c = '\uFFFD'; 5220 // CPPONLY: MOZ_FALLTHROUGH; 5221 default: 5222 /* 5223 * Anything else Append the current input 5224 * character to the current DOCTYPE token's 5225 * public identifier. 5226 */ 5227 appendStrBuf(c); 5228 /* 5229 * Stay in the DOCTYPE public identifier 5230 * (double-quoted) state. 
5231 */ 5232 continue; 5233 } 5234 } 5235 // CPPONLY: MOZ_FALLTHROUGH; 5236 case AFTER_DOCTYPE_PUBLIC_IDENTIFIER: 5237 afterdoctypepublicidentifierloop: for (;;) { 5238 if (++pos == endPos) { 5239 break stateloop; 5240 } 5241 c = checkChar(buf, pos); 5242 /* 5243 * Consume the next input character: 5244 */ 5245 switch (c) { 5246 case '\r': 5247 silentCarriageReturn(); 5248 state = transition(state, Tokenizer.BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS, reconsume, pos); 5249 break stateloop; 5250 case '\n': 5251 silentLineFeed(); 5252 // CPPONLY: MOZ_FALLTHROUGH; 5253 case ' ': 5254 case '\t': 5255 case '\u000C': 5256 /* 5257 * U+0009 CHARACTER TABULATION U+000A LINE FEED 5258 * (LF) U+000C FORM FEED (FF) U+0020 SPACE 5259 * Switch to the between DOCTYPE public and 5260 * system identifiers state. 5261 */ 5262 state = transition(state, Tokenizer.BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS, reconsume, pos); 5263 break afterdoctypepublicidentifierloop; 5264 // continue stateloop; 5265 case '>': 5266 /* 5267 * U+003E GREATER-THAN SIGN (>) Emit the current 5268 * DOCTYPE token. 5269 */ 5270 emitDoctypeToken(pos); 5271 /* 5272 * Switch to the data state. 5273 */ 5274 state = transition(state, Tokenizer.DATA, reconsume, pos); 5275 continue stateloop; 5276 case '"': 5277 /* 5278 * U+0022 QUOTATION MARK (") Parse error. 5279 */ 5280 errNoSpaceBetweenPublicAndSystemIds(); 5281 /* 5282 * Set the DOCTYPE token's system identifier to 5283 * the empty string (not missing), 5284 */ 5285 clearStrBufBeforeUse(); 5286 /* 5287 * then switch to the DOCTYPE system identifier 5288 * (double-quoted) state. 5289 */ 5290 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos); 5291 continue stateloop; 5292 case '\'': 5293 /* 5294 * U+0027 APOSTROPHE (') Parse error. 
5295 */ 5296 errNoSpaceBetweenPublicAndSystemIds(); 5297 /* 5298 * Set the DOCTYPE token's system identifier to 5299 * the empty string (not missing), 5300 */ 5301 clearStrBufBeforeUse(); 5302 /* 5303 * then switch to the DOCTYPE system identifier 5304 * (single-quoted) state. 5305 */ 5306 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos); 5307 continue stateloop; 5308 default: 5309 bogusDoctype(); 5310 /* 5311 * Set the DOCTYPE token's force-quirks flag to 5312 * on. 5313 */ 5314 // done by bogusDoctype(); 5315 /* 5316 * Switch to the bogus DOCTYPE state. 5317 */ 5318 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); 5319 continue stateloop; 5320 } 5321 } 5322 // CPPONLY: MOZ_FALLTHROUGH; 5323 case BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS: 5324 betweendoctypepublicandsystemidentifiersloop: for (;;) { 5325 if (++pos == endPos) { 5326 break stateloop; 5327 } 5328 c = checkChar(buf, pos); 5329 /* 5330 * Consume the next input character: 5331 */ 5332 switch (c) { 5333 case '\r': 5334 silentCarriageReturn(); 5335 break stateloop; 5336 case '\n': 5337 silentLineFeed(); 5338 // CPPONLY: MOZ_FALLTHROUGH; 5339 case ' ': 5340 case '\t': 5341 case '\u000C': 5342 /* 5343 * U+0009 CHARACTER TABULATION U+000A LINE FEED 5344 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay 5345 * in the between DOCTYPE public and system 5346 * identifiers state. 5347 */ 5348 continue; 5349 case '>': 5350 /* 5351 * U+003E GREATER-THAN SIGN (>) Emit the current 5352 * DOCTYPE token. 5353 */ 5354 emitDoctypeToken(pos); 5355 /* 5356 * Switch to the data state. 5357 */ 5358 state = transition(state, Tokenizer.DATA, reconsume, pos); 5359 continue stateloop; 5360 case '"': 5361 /* 5362 * U+0022 QUOTATION MARK (") Set the DOCTYPE 5363 * token's system identifier to the empty string 5364 * (not missing), 5365 */ 5366 clearStrBufBeforeUse(); 5367 /* 5368 * then switch to the DOCTYPE system identifier 5369 * (double-quoted) state. 
5370 */ 5371 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos); 5372 break betweendoctypepublicandsystemidentifiersloop; 5373 // continue stateloop; 5374 case '\'': 5375 /* 5376 * U+0027 APOSTROPHE (') Set the DOCTYPE token's 5377 * system identifier to the empty string (not 5378 * missing), 5379 */ 5380 clearStrBufBeforeUse(); 5381 /* 5382 * then switch to the DOCTYPE system identifier 5383 * (single-quoted) state. 5384 */ 5385 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos); 5386 continue stateloop; 5387 default: 5388 bogusDoctype(); 5389 /* 5390 * Set the DOCTYPE token's force-quirks flag to 5391 * on. 5392 */ 5393 // done by bogusDoctype(); 5394 /* 5395 * Switch to the bogus DOCTYPE state. 5396 */ 5397 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); 5398 continue stateloop; 5399 } 5400 } 5401 // CPPONLY: MOZ_FALLTHROUGH; 5402 case DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED: 5403 doctypesystemidentifierdoublequotedloop: for (;;) { 5404 if (++pos == endPos) { 5405 break stateloop; 5406 } 5407 c = checkChar(buf, pos); 5408 /* 5409 * Consume the next input character: 5410 */ 5411 switch (c) { 5412 case '"': 5413 /* 5414 * U+0022 QUOTATION MARK (") Switch to the after 5415 * DOCTYPE system identifier state. 5416 */ 5417 systemIdentifier = strBufToString(); 5418 state = transition(state, Tokenizer.AFTER_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos); 5419 continue stateloop; 5420 case '>': 5421 /* 5422 * U+003E GREATER-THAN SIGN (>) Parse error. 5423 */ 5424 errGtInSystemId(); 5425 /* 5426 * Set the DOCTYPE token's force-quirks flag to 5427 * on. 5428 */ 5429 forceQuirks = true; 5430 /* 5431 * Emit that DOCTYPE token. 5432 */ 5433 systemIdentifier = strBufToString(); 5434 emitDoctypeToken(pos); 5435 /* 5436 * Switch to the data state. 
5437 */ 5438 state = transition(state, Tokenizer.DATA, reconsume, pos); 5439 continue stateloop; 5440 case '\r': 5441 appendStrBufCarriageReturn(); 5442 break stateloop; 5443 case '\n': 5444 appendStrBufLineFeed(); 5445 continue; 5446 case '\u0000': 5447 c = '\uFFFD'; 5448 // CPPONLY: MOZ_FALLTHROUGH; 5449 default: 5450 /* 5451 * Anything else Append the current input 5452 * character to the current DOCTYPE token's 5453 * system identifier. 5454 */ 5455 appendStrBuf(c); 5456 /* 5457 * Stay in the DOCTYPE system identifier 5458 * (double-quoted) state. 5459 */ 5460 continue; 5461 } 5462 } 5463 case AFTER_DOCTYPE_SYSTEM_IDENTIFIER: 5464 afterdoctypesystemidentifierloop: for (;;) { 5465 if (++pos == endPos) { 5466 break stateloop; 5467 } 5468 c = checkChar(buf, pos); 5469 /* 5470 * Consume the next input character: 5471 */ 5472 switch (c) { 5473 case '\r': 5474 silentCarriageReturn(); 5475 break stateloop; 5476 case '\n': 5477 silentLineFeed(); 5478 // CPPONLY: MOZ_FALLTHROUGH; 5479 case ' ': 5480 case '\t': 5481 case '\u000C': 5482 /* 5483 * U+0009 CHARACTER TABULATION U+000A LINE FEED 5484 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay 5485 * in the after DOCTYPE system identifier state. 5486 */ 5487 continue; 5488 case '>': 5489 /* 5490 * U+003E GREATER-THAN SIGN (>) Emit the current 5491 * DOCTYPE token. 5492 */ 5493 emitDoctypeToken(pos); 5494 /* 5495 * Switch to the data state. 5496 */ 5497 state = transition(state, Tokenizer.DATA, reconsume, pos); 5498 continue stateloop; 5499 default: 5500 /* 5501 * Switch to the bogus DOCTYPE state. (This does 5502 * not set the DOCTYPE token's force-quirks flag 5503 * to on.) 
5504 */ 5505 bogusDoctypeWithoutQuirks(); 5506 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); 5507 break afterdoctypesystemidentifierloop; 5508 // continue stateloop; 5509 } 5510 } 5511 // CPPONLY: MOZ_FALLTHROUGH; 5512 case BOGUS_DOCTYPE: 5513 for (;;) { 5514 if (reconsume) { 5515 reconsume = false; 5516 } else { 5517 if (++pos == endPos) { 5518 break stateloop; 5519 } 5520 c = checkChar(buf, pos); 5521 } 5522 /* 5523 * Consume the next input character: 5524 */ 5525 switch (c) { 5526 case '>': 5527 /* 5528 * U+003E GREATER-THAN SIGN (>) Emit that 5529 * DOCTYPE token. 5530 */ 5531 emitDoctypeToken(pos); 5532 /* 5533 * Switch to the data state. 5534 */ 5535 state = transition(state, Tokenizer.DATA, reconsume, pos); 5536 continue stateloop; 5537 case '\r': 5538 silentCarriageReturn(); 5539 break stateloop; 5540 case '\n': 5541 silentLineFeed(); 5542 // CPPONLY: MOZ_FALLTHROUGH; 5543 default: 5544 /* 5545 * Anything else Stay in the bogus DOCTYPE 5546 * state. 5547 */ 5548 continue; 5549 } 5550 } 5551 case DOCTYPE_YSTEM: 5552 doctypeystemloop: for (;;) { 5553 if (++pos == endPos) { 5554 break stateloop; 5555 } 5556 c = checkChar(buf, pos); 5557 /* 5558 * Otherwise, if the six characters starting from the 5559 * current input character are an ASCII case-insensitive 5560 * match for the word "SYSTEM", then consume those 5561 * characters and switch to the before DOCTYPE system 5562 * identifier state. 
5563 */ 5564 if (index < 5) { // YSTEM.length 5565 char folded = c; 5566 if (c >= 'A' && c <= 'Z') { 5567 folded += 0x20; 5568 } 5569 if (folded != Tokenizer.YSTEM[index]) { 5570 bogusDoctype(); 5571 reconsume = true; 5572 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); 5573 continue stateloop; 5574 } 5575 index++; 5576 continue stateloop; 5577 } else { 5578 reconsume = true; 5579 state = transition(state, Tokenizer.AFTER_DOCTYPE_SYSTEM_KEYWORD, reconsume, pos); 5580 break doctypeystemloop; 5581 // continue stateloop; 5582 } 5583 } 5584 // CPPONLY: MOZ_FALLTHROUGH; 5585 case AFTER_DOCTYPE_SYSTEM_KEYWORD: 5586 afterdoctypesystemkeywordloop: for (;;) { 5587 if (reconsume) { 5588 reconsume = false; 5589 } else { 5590 if (++pos == endPos) { 5591 break stateloop; 5592 } 5593 c = checkChar(buf, pos); 5594 } 5595 /* 5596 * Consume the next input character: 5597 */ 5598 switch (c) { 5599 case '\r': 5600 silentCarriageReturn(); 5601 state = transition(state, Tokenizer.BEFORE_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos); 5602 break stateloop; 5603 case '\n': 5604 silentLineFeed(); 5605 // CPPONLY: MOZ_FALLTHROUGH; 5606 case ' ': 5607 case '\t': 5608 case '\u000C': 5609 /* 5610 * U+0009 CHARACTER TABULATION U+000A LINE FEED 5611 * (LF) U+000C FORM FEED (FF) U+0020 SPACE 5612 * Switch to the before DOCTYPE public 5613 * identifier state. 5614 */ 5615 state = transition(state, Tokenizer.BEFORE_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos); 5616 break afterdoctypesystemkeywordloop; 5617 // FALL THROUGH continue stateloop 5618 case '"': 5619 /* 5620 * U+0022 QUOTATION MARK (") Parse Error. 5621 */ 5622 errNoSpaceBetweenDoctypeSystemKeywordAndQuote(); 5623 /* 5624 * Set the DOCTYPE token's system identifier to 5625 * the empty string (not missing), 5626 */ 5627 clearStrBufBeforeUse(); 5628 /* 5629 * then switch to the DOCTYPE public identifier 5630 * (double-quoted) state. 
5631 */ 5632 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos); 5633 continue stateloop; 5634 case '\'': 5635 /* 5636 * U+0027 APOSTROPHE (') Parse Error. 5637 */ 5638 errNoSpaceBetweenDoctypeSystemKeywordAndQuote(); 5639 /* 5640 * Set the DOCTYPE token's public identifier to 5641 * the empty string (not missing), 5642 */ 5643 clearStrBufBeforeUse(); 5644 /* 5645 * then switch to the DOCTYPE public identifier 5646 * (single-quoted) state. 5647 */ 5648 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos); 5649 continue stateloop; 5650 case '>': 5651 /* U+003E GREATER-THAN SIGN (>) Parse error. */ 5652 errExpectedPublicId(); 5653 /* 5654 * Set the DOCTYPE token's force-quirks flag to 5655 * on. 5656 */ 5657 forceQuirks = true; 5658 /* 5659 * Emit that DOCTYPE token. 5660 */ 5661 emitDoctypeToken(pos); 5662 /* 5663 * Switch to the data state. 5664 */ 5665 state = transition(state, Tokenizer.DATA, reconsume, pos); 5666 continue stateloop; 5667 default: 5668 bogusDoctype(); 5669 /* 5670 * Set the DOCTYPE token's force-quirks flag to 5671 * on. 5672 */ 5673 // done by bogusDoctype(); 5674 /* 5675 * Switch to the bogus DOCTYPE state. 
5676 */ 5677 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); 5678 continue stateloop; 5679 } 5680 } 5681 // CPPONLY: MOZ_FALLTHROUGH; 5682 case BEFORE_DOCTYPE_SYSTEM_IDENTIFIER: 5683 beforedoctypesystemidentifierloop: for (;;) { 5684 if (++pos == endPos) { 5685 break stateloop; 5686 } 5687 c = checkChar(buf, pos); 5688 /* 5689 * Consume the next input character: 5690 */ 5691 switch (c) { 5692 case '\r': 5693 silentCarriageReturn(); 5694 break stateloop; 5695 case '\n': 5696 silentLineFeed(); 5697 // CPPONLY: MOZ_FALLTHROUGH; 5698 case ' ': 5699 case '\t': 5700 case '\u000C': 5701 /* 5702 * U+0009 CHARACTER TABULATION U+000A LINE FEED 5703 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay 5704 * in the before DOCTYPE system identifier 5705 * state. 5706 */ 5707 continue; 5708 case '"': 5709 /* 5710 * U+0022 QUOTATION MARK (") Set the DOCTYPE 5711 * token's system identifier to the empty string 5712 * (not missing), 5713 */ 5714 clearStrBufBeforeUse(); 5715 /* 5716 * then switch to the DOCTYPE system identifier 5717 * (double-quoted) state. 5718 */ 5719 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos); 5720 continue stateloop; 5721 case '\'': 5722 /* 5723 * U+0027 APOSTROPHE (') Set the DOCTYPE token's 5724 * system identifier to the empty string (not 5725 * missing), 5726 */ 5727 clearStrBufBeforeUse(); 5728 /* 5729 * then switch to the DOCTYPE system identifier 5730 * (single-quoted) state. 5731 */ 5732 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos); 5733 break beforedoctypesystemidentifierloop; 5734 // continue stateloop; 5735 case '>': 5736 /* U+003E GREATER-THAN SIGN (>) Parse error. */ 5737 errExpectedSystemId(); 5738 /* 5739 * Set the DOCTYPE token's force-quirks flag to 5740 * on. 5741 */ 5742 forceQuirks = true; 5743 /* 5744 * Emit that DOCTYPE token. 5745 */ 5746 emitDoctypeToken(pos); 5747 /* 5748 * Switch to the data state. 
5749 */ 5750 state = transition(state, Tokenizer.DATA, reconsume, pos); 5751 continue stateloop; 5752 default: 5753 bogusDoctype(); 5754 /* 5755 * Set the DOCTYPE token's force-quirks flag to 5756 * on. 5757 */ 5758 // done by bogusDoctype(); 5759 /* 5760 * Switch to the bogus DOCTYPE state. 5761 */ 5762 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); 5763 continue stateloop; 5764 } 5765 } 5766 // CPPONLY: MOZ_FALLTHROUGH; 5767 case DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED: 5768 for (;;) { 5769 if (++pos == endPos) { 5770 break stateloop; 5771 } 5772 c = checkChar(buf, pos); 5773 /* 5774 * Consume the next input character: 5775 */ 5776 switch (c) { 5777 case '\'': 5778 /* 5779 * U+0027 APOSTROPHE (') Switch to the after 5780 * DOCTYPE system identifier state. 5781 */ 5782 systemIdentifier = strBufToString(); 5783 state = transition(state, Tokenizer.AFTER_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos); 5784 continue stateloop; 5785 case '>': 5786 errGtInSystemId(); 5787 /* 5788 * Set the DOCTYPE token's force-quirks flag to 5789 * on. 5790 */ 5791 forceQuirks = true; 5792 /* 5793 * Emit that DOCTYPE token. 5794 */ 5795 systemIdentifier = strBufToString(); 5796 emitDoctypeToken(pos); 5797 /* 5798 * Switch to the data state. 5799 */ 5800 state = transition(state, Tokenizer.DATA, reconsume, pos); 5801 continue stateloop; 5802 case '\r': 5803 appendStrBufCarriageReturn(); 5804 break stateloop; 5805 case '\n': 5806 appendStrBufLineFeed(); 5807 continue; 5808 case '\u0000': 5809 c = '\uFFFD'; 5810 // CPPONLY: MOZ_FALLTHROUGH; 5811 default: 5812 /* 5813 * Anything else Append the current input 5814 * character to the current DOCTYPE token's 5815 * system identifier. 5816 */ 5817 appendStrBuf(c); 5818 /* 5819 * Stay in the DOCTYPE system identifier 5820 * (double-quoted) state. 
5821 */ 5822 continue; 5823 } 5824 } 5825 case DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED: 5826 for (;;) { 5827 if (++pos == endPos) { 5828 break stateloop; 5829 } 5830 c = checkChar(buf, pos); 5831 /* 5832 * Consume the next input character: 5833 */ 5834 switch (c) { 5835 case '\'': 5836 /* 5837 * U+0027 APOSTROPHE (') Switch to the after 5838 * DOCTYPE public identifier state. 5839 */ 5840 publicIdentifier = strBufToString(); 5841 state = transition(state, Tokenizer.AFTER_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos); 5842 continue stateloop; 5843 case '>': 5844 errGtInPublicId(); 5845 /* 5846 * Set the DOCTYPE token's force-quirks flag to 5847 * on. 5848 */ 5849 forceQuirks = true; 5850 /* 5851 * Emit that DOCTYPE token. 5852 */ 5853 publicIdentifier = strBufToString(); 5854 emitDoctypeToken(pos); 5855 /* 5856 * Switch to the data state. 5857 */ 5858 state = transition(state, Tokenizer.DATA, reconsume, pos); 5859 continue stateloop; 5860 case '\r': 5861 appendStrBufCarriageReturn(); 5862 break stateloop; 5863 case '\n': 5864 appendStrBufLineFeed(); 5865 continue; 5866 case '\u0000': 5867 c = '\uFFFD'; 5868 // CPPONLY: MOZ_FALLTHROUGH; 5869 default: 5870 /* 5871 * Anything else Append the current input 5872 * character to the current DOCTYPE token's 5873 * public identifier. 5874 */ 5875 appendStrBuf(c); 5876 /* 5877 * Stay in the DOCTYPE public identifier 5878 * (single-quoted) state. 
                                 */
                                continue;
                        }
                    }
                case PROCESSING_INSTRUCTION:
                    processinginstructionloop: for (;;) {
                        if (++pos == endPos) {
                            break stateloop;
                        }
                        c = checkChar(buf, pos);
                        switch (c) {
                            case '?':
                                state = transition(
                                        state,
                                        Tokenizer.PROCESSING_INSTRUCTION_QUESTION_MARK,
                                        reconsume, pos);
                                break processinginstructionloop;
                            // continue stateloop;
                            default:
                                continue;
                        }
                    }
                    // CPPONLY: MOZ_FALLTHROUGH;
                case PROCESSING_INSTRUCTION_QUESTION_MARK:
                    if (++pos == endPos) {
                        break stateloop;
                    }
                    c = checkChar(buf, pos);
                    switch (c) {
                        case '>':
                            state = transition(state, Tokenizer.DATA,
                                    reconsume, pos);
                            continue stateloop;
                        default:
                            state = transition(state,
                                    Tokenizer.PROCESSING_INSTRUCTION,
                                    reconsume, pos);
                            continue stateloop;
                    }
                    // END HOTSPOT WORKAROUND
            }
        }
        flushChars(buf, pos);
        /*
         * if (prevCR && pos != endPos) { // why is this needed? pos--; col--; }
         */
        // Save locals
        stateSave = state;
        returnStateSave = returnState;
        return pos;
    }

    // HOTSPOT WORKAROUND INSERTION POINT

    // [NOCPP[

    /**
     * Hook called by the state loop whenever it assigns a new state
     * (<code>state = transition(state, NEW_STATE, reconsume, pos)</code>).
     *
     * This implementation ignores <code>from</code>, <code>reconsume</code>
     * and <code>pos</code> and simply returns <code>to</code>. Because the
     * method is protected and non-final, a subclass can override it.
     *
     * @param from
     *            the state being left
     * @param to
     *            the state being entered
     * @param reconsume
     *            the value of the caller's reconsume flag
     * @param pos
     *            the caller's current buffer position
     * @return the state the caller should switch to; here always
     *         <code>to</code>
     * @throws SAXException
     *             never thrown by this implementation
     */
    protected int transition(int from, int to, boolean reconsume, int pos) throws SAXException {
        return to;
    }

    // ]NOCPP]

    /**
     * Resets the fields that describe the DOCTYPE token being built: clears
     * <code>strBuf</code>, sets the name to the empty string, releases and
     * nulls both identifiers and turns the force-quirks flag off.
     */
    private void initDoctypeFields() {
        // Discard the characters "DOCTYPE" accumulated as a potential bogus
        // comment into strBuf.
        clearStrBufAfterUse();
        doctypeName = "";
        if (systemIdentifier != null) {
            Portability.releaseString(systemIdentifier);
            systemIdentifier = null;
        }
        if (publicIdentifier != null) {
            Portability.releaseString(publicIdentifier);
            publicIdentifier = null;
        }
        forceQuirks = false;
    }

    /**
     * Records a carriage return (line count and <code>lastCR</code>) and
     * passes a line feed, not the carriage return itself, to
     * <code>adjustDoubleHyphenAndAppendToStrBufAndErr</code>.
     */
    @Inline private void adjustDoubleHyphenAndAppendToStrBufCarriageReturn()
            throws SAXException {
        silentCarriageReturn();
        adjustDoubleHyphenAndAppendToStrBufAndErr('\n');
    }

    /**
     * Records a line feed (line count) and passes it to
     * <code>adjustDoubleHyphenAndAppendToStrBufAndErr</code>.
     */
    @Inline private void adjustDoubleHyphenAndAppendToStrBufLineFeed()
            throws SAXException {
        silentLineFeed();
        adjustDoubleHyphenAndAppendToStrBufAndErr('\n');
    }

    /**
     * Records a line feed (line count) and appends it to <code>strBuf</code>.
     */
    @Inline private void appendStrBufLineFeed() {
        silentLineFeed();
        appendStrBuf('\n');
    }

    /**
     * Records a carriage return (line count and <code>lastCR</code>) and
     * appends a line feed, not the carriage return itself, to
     * <code>strBuf</code>.
     */
    @Inline private void appendStrBufCarriageReturn() {
        silentCarriageReturn();
        appendStrBuf('\n');
    }

    /**
     * Counts a carriage return as a line break without emitting anything:
     * increments <code>line</code> and sets <code>lastCR</code>.
     */
    @Inline protected void silentCarriageReturn() {
        ++line;
        lastCR = true;
    }

    /**
     * Counts a line feed as a line break without emitting anything:
     * increments <code>line</code>.
     */
    @Inline protected void silentLineFeed() {
        ++line;
    }

    /**
     * Handles a carriage return in character data: records it, flushes the
     * pending characters up to <code>pos</code> and emits a single line feed
     * to the token handler in its place.
     *
     * @param buf
     *            the buffer being tokenized
     * @param pos
     *            the position of the carriage return in <code>buf</code>
     * @throws SAXException
     *             if flushing or the token handler throws
     */
    private void emitCarriageReturn(@NoLength char[] buf, int pos)
            throws SAXException {
        silentCarriageReturn();
        flushChars(buf, pos);
        tokenHandler.characters(Tokenizer.LF, 0, 1);
        // NOTE(review): MAX_VALUE looks like a "no pending run" sentinel for
        // cstart; confirm against flushChars and the callers.
        cstart = Integer.MAX_VALUE;
    }

    /**
     * Flushes the pending characters up to <code>pos</code>, asks the token
     * handler for a zero-originating replacement character and restarts the
     * pending character run just after <code>pos</code>.
     *
     * @param buf
     *            the buffer being tokenized
     * @param pos
     *            the position of the character being replaced
     * @throws SAXException
     *             if flushing or the token handler throws
     */
    private void emitReplacementCharacter(@NoLength char[] buf, int pos)
            throws SAXException {
        flushChars(buf, pos);
        tokenHandler.zeroOriginatingReplacementCharacter();
        cstart = pos + 1;
    }

    /**
     * Flushes the pending characters up to <code>pos</code>, emits
     * <code>REPLACEMENT_CHARACTER</code> as ordinary character data and
     * restarts the pending character run just after <code>pos</code>.
     *
     * @param buf
     *            the buffer being tokenized
     * @param pos
     *            the position of the character being replaced
     * @throws SAXException
     *             if flushing or the token handler throws
     */
    private void emitPlaintextReplacementCharacter(@NoLength char[] buf, int pos)
            throws SAXException {
        flushChars(buf, pos);
        tokenHandler.characters(REPLACEMENT_CHARACTER, 0, 1);
        cstart = pos + 1;
    }

    /**
     * Stores <code>add</code> in the <code>additional</code> field and, in
     * the Java-only section, snapshots the current locator position into
     * <code>ampersandLocation</code>.
     *
     * @param add
     *            the value to store in <code>additional</code>
     */
    private void setAdditionalAndRememberAmpersandLocation(char add) {
        additional = add;
        // [NOCPP[
        ampersandLocation = new LocatorImpl(this);
        // ]NOCPP]
    }

    /**
     * Reports a bogus DOCTYPE and turns the force-quirks flag on.
     */
    private void bogusDoctype() throws SAXException {
        errBogusDoctype();
        forceQuirks = true;
    }

    /**
     * Reports a bogus DOCTYPE and turns the force-quirks flag off.
     */
    private void bogusDoctypeWithoutQuirks() throws SAXException {
        errBogusDoctype();
        forceQuirks = false;
    }

    /**
     * Emits, or appends to <code>strBuf</code>, the character(s) for the
     * numeric character reference whose code point has been accumulated in
     * the <code>value</code> field. Whether the result is emitted or appended
     * is decided by <code>emitOrAppendOne</code> /
     * <code>emitOrAppendTwo</code> from <code>returnState</code>.
     *
     * Handling by range, in the order tested:
     * <ul>
     * <li>0x80..0x9F: error, then the <code>WINDOWS_1252</code> table entry
     * for that value;</li>
     * <li>0xC when <code>contentSpacePolicy</code> is not ALLOW (Java only):
     * a space for ALTER_INFOSET, <code>fatal</code> for FATAL;</li>
     * <li>0x0 and surrogates (0xD800..0xDFFF): error, then the replacement
     * character;</li>
     * <li>any other value up to 0xFFFF: the character itself, after the
     * Java-only checks for CR, control characters, 0xFDD0..0xFDEF,
     * noncharacters and private use;</li>
     * <li>0x10000..0x10FFFF: a surrogate pair;</li>
     * <li>anything larger: error, then the replacement character.</li>
     * </ul>
     *
     * @param returnState
     *            the state the character reference was entered from
     * @throws SAXException
     *             if an error hook or the token handler throws
     */
    private void handleNcrValue(int returnState) throws SAXException {
        /*
         * If one or more characters match the range, then take them all and
         * interpret the string of characters as a number (either hexadecimal or
         * decimal as appropriate).
         */
        if (value <= 0xFFFF) {
            if (value >= 0x80 && value <= 0x9f) {
                /*
                 * If that number is one of the numbers in the first column of
                 * the following table, then this is a parse error.
                 */
                errNcrInC1Range();
                /*
                 * Find the row with that number in the first column, and return
                 * a character token for the Unicode character given in the
                 * second column of that row.
                 */
                @NoLength char[] val = NamedCharacters.WINDOWS_1252[value - 0x80];
                emitOrAppendOne(val, returnState);
                // [NOCPP[
            } else if (value == 0xC
                    && contentSpacePolicy != XmlViolationPolicy.ALLOW) {
                if (contentSpacePolicy == XmlViolationPolicy.ALTER_INFOSET) {
                    emitOrAppendOne(Tokenizer.SPACE, returnState);
                } else if (contentSpacePolicy == XmlViolationPolicy.FATAL) {
                    fatal("A character reference expanded to a form feed which is not legal XML 1.0 white space.");
                }
                // ]NOCPP]
            } else if (value == 0x0) {
                errNcrZero();
                emitOrAppendOne(Tokenizer.REPLACEMENT_CHARACTER, returnState);
            } else if ((value & 0xF800) == 0xD800) {
                // The mask matches exactly the surrogate range 0xD800..0xDFFF.
                errNcrSurrogate();
                emitOrAppendOne(Tokenizer.REPLACEMENT_CHARACTER, returnState);
            } else {
                /*
                 * Otherwise, return a character token for the Unicode character
                 * whose code point is that number.
                 */
                char ch = (char) value;
                // [NOCPP[
                if (value == 0x0D) {
                    errNcrCr();
                } else if ((value <= 0x0008) || (value == 0x000B)
                        || (value >= 0x000E && value <= 0x001F)) {
                    ch = errNcrControlChar(ch);
                } else if (value >= 0xFDD0 && value <= 0xFDEF) {
                    errNcrUnassigned();
                } else if ((value & 0xFFFE) == 0xFFFE) {
                    ch = errNcrNonCharacter(ch);
                } else if (value >= 0x007F && value <= 0x009F) {
                    // 0x80..0x9F was already handled above, so only 0x7F can
                    // reach this branch.
                    errNcrControlChar();
                } else {
                    maybeWarnPrivateUse(ch);
                }
                // ]NOCPP]
                bmpChar[0] = ch;
                emitOrAppendOne(bmpChar, returnState);
            }
        } else if (value <= 0x10FFFF) {
            // [NOCPP[
            maybeWarnPrivateUseAstral();
            if ((value & 0xFFFE) == 0xFFFE) {
                errAstralNonCharacter(value);
            }
            // ]NOCPP]
            // Split the astral code point into a lead and a trail surrogate.
            astralChar[0] = (char) (Tokenizer.LEAD_OFFSET + (value >> 10));
            astralChar[1] = (char) (0xDC00 + (value & 0x3FF));
            emitOrAppendTwo(astralChar, returnState);
        } else {
            errNcrOutOfRange();
            emitOrAppendOne(Tokenizer.REPLACEMENT_CHARACTER, returnState);
        }
    }

    /**
     * Handles the end of the input.
     *
     * Starting from the state saved in <code>stateSave</code> and
     * <code>returnStateSave</code>, this runs the end-of-file handling for
     * that state: it reports the applicable error through the
     * <code>err*</code> hooks and emits whatever is still pending
     * (characters, a comment or a DOCTYPE token). The character reference
     * states finish the reference, switch to the return state and go around
     * the loop again so that the return state gets its own end-of-file
     * handling. Finally <code>tokenHandler.eof()</code> is called.
     *
     * @throws SAXException
     *             if an error hook or the token handler throws
     */
    public void eof() throws SAXException {
        int state = stateSave;
        int returnState = returnStateSave;

        eofloop: for (;;) {
            switch (state) {
                case SCRIPT_DATA_LESS_THAN_SIGN:
                case SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN:
                    /*
                     * Otherwise, emit a U+003C LESS-THAN SIGN character token
                     */
                    tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
                    /*
                     * and reconsume the current input character in the data
                     * state.
                     */
                    break eofloop;
                case TAG_OPEN:
                    /*
                     * The behavior of this state depends on the content model
                     * flag.
                     */
                    /*
                     * Anything else Parse error.
                     */
                    errEofAfterLt();
                    /*
                     * Emit a U+003C LESS-THAN SIGN character token
                     */
                    tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
                    /*
                     * and reconsume the current input character in the data
                     * state.
                     */
                    break eofloop;
                case RAWTEXT_RCDATA_LESS_THAN_SIGN:
                    /*
                     * Emit a U+003C LESS-THAN SIGN character token
                     */
                    tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
                    /*
                     * and reconsume the current input character in the RCDATA
                     * state.
                     */
                    break eofloop;
                case NON_DATA_END_TAG_NAME:
                    /*
                     * Emit a U+003C LESS-THAN SIGN character token, a U+002F
                     * SOLIDUS character token,
                     */
                    tokenHandler.characters(Tokenizer.LT_SOLIDUS, 0, 2);
                    /*
                     * a character token for each of the characters in the
                     * temporary buffer (in the order they were added to the
                     * buffer),
                     */
                    emitStrBuf();
                    /*
                     * and reconsume the current input character in the RCDATA
                     * state.
                     */
                    break eofloop;
                case CLOSE_TAG_OPEN:
                    /* EOF Parse error. */
                    errEofAfterLt();
                    /*
                     * Emit a U+003C LESS-THAN SIGN character token and a U+002F
                     * SOLIDUS character token.
                     */
                    tokenHandler.characters(Tokenizer.LT_SOLIDUS, 0, 2);
                    /*
                     * Reconsume the EOF character in the data state.
                     */
                    break eofloop;
                case TAG_NAME:
                    /*
                     * EOF Parse error.
                     */
                    errEofInTagName();
                    /*
                     * Reconsume the EOF character in the data state.
                     */
                    break eofloop;
                case BEFORE_ATTRIBUTE_NAME:
                case AFTER_ATTRIBUTE_VALUE_QUOTED:
                case SELF_CLOSING_START_TAG:
                    /* EOF Parse error. */
                    errEofWithoutGt();
                    /*
                     * Reconsume the EOF character in the data state.
                     */
                    break eofloop;
                case ATTRIBUTE_NAME:
                    /*
                     * EOF Parse error.
                     */
                    errEofInAttributeName();
                    /*
                     * Reconsume the EOF character in the data state.
                     */
                    break eofloop;
                case AFTER_ATTRIBUTE_NAME:
                case BEFORE_ATTRIBUTE_VALUE:
                    /* EOF Parse error. */
                    errEofWithoutGt();
                    /*
                     * Reconsume the EOF character in the data state.
                     */
                    break eofloop;
                case ATTRIBUTE_VALUE_DOUBLE_QUOTED:
                case ATTRIBUTE_VALUE_SINGLE_QUOTED:
                case ATTRIBUTE_VALUE_UNQUOTED:
                    /* EOF Parse error. */
                    errEofInAttributeValue();
                    /*
                     * Reconsume the EOF character in the data state.
                     */
                    break eofloop;
                case BOGUS_COMMENT:
                    emitComment(0, 0);
                    break eofloop;
                case BOGUS_COMMENT_HYPHEN:
                    // [NOCPP[
                    maybeAppendSpaceToBogusComment();
                    // ]NOCPP]
                    emitComment(0, 0);
                    break eofloop;
                case MARKUP_DECLARATION_OPEN:
                    errBogusComment();
                    emitComment(0, 0);
                    break eofloop;
                case MARKUP_DECLARATION_HYPHEN:
                    errBogusComment();
                    emitComment(0, 0);
                    break eofloop;
                case MARKUP_DECLARATION_OCTYPE:
                    if (index < 6) {
                        errBogusComment();
                        emitComment(0, 0);
                    } else {
                        /* EOF Parse error. */
                        errEofInDoctype();
                        /*
                         * Create a new DOCTYPE token. Set its force-quirks flag
                         * to on.
                         */
                        doctypeName = "";
                        if (systemIdentifier != null) {
                            Portability.releaseString(systemIdentifier);
                            systemIdentifier = null;
                        }
                        if (publicIdentifier != null) {
                            Portability.releaseString(publicIdentifier);
                            publicIdentifier = null;
                        }
                        forceQuirks = true;
                        /*
                         * Emit the token.
                         */
                        emitDoctypeToken(0);
                        /*
                         * Reconsume the EOF character in the data state.
                         */
                        break eofloop;
                    }
                    break eofloop;
                case COMMENT_START:
                case COMMENT:
                    /*
                     * EOF Parse error.
                     */
                    errEofInComment();
                    /* Emit the comment token. */
                    emitComment(0, 0);
                    /*
                     * Reconsume the EOF character in the data state.
                     */
                    break eofloop;
                case COMMENT_END:
                    errEofInComment();
                    /* Emit the comment token. */
                    emitComment(2, 0);
                    /*
                     * Reconsume the EOF character in the data state.
                     */
                    break eofloop;
                case COMMENT_END_DASH:
                case COMMENT_START_DASH:
                    errEofInComment();
                    /* Emit the comment token. */
                    emitComment(1, 0);
                    /*
                     * Reconsume the EOF character in the data state.
                     */
                    break eofloop;
                case COMMENT_END_BANG:
                    errEofInComment();
                    /* Emit the comment token. */
                    emitComment(3, 0);
                    /*
                     * Reconsume the EOF character in the data state.
                     */
                    break eofloop;
                case DOCTYPE:
                case BEFORE_DOCTYPE_NAME:
                    errEofInDoctype();
                    /*
                     * Create a new DOCTYPE token. Set its force-quirks flag to
                     * on.
                     */
                    forceQuirks = true;
                    /*
                     * Emit the token.
                     */
                    emitDoctypeToken(0);
                    /*
                     * Reconsume the EOF character in the data state.
                     */
                    break eofloop;
                case DOCTYPE_NAME:
                    errEofInDoctype();
                    strBufToDoctypeName();
                    /*
                     * Set the DOCTYPE token's force-quirks flag to on.
                     */
                    forceQuirks = true;
                    /*
                     * Emit that DOCTYPE token.
                     */
                    emitDoctypeToken(0);
                    /*
                     * Reconsume the EOF character in the data state.
                     */
                    break eofloop;
                case DOCTYPE_UBLIC:
                case DOCTYPE_YSTEM:
                case AFTER_DOCTYPE_NAME:
                case AFTER_DOCTYPE_PUBLIC_KEYWORD:
                case AFTER_DOCTYPE_SYSTEM_KEYWORD:
                case BEFORE_DOCTYPE_PUBLIC_IDENTIFIER:
                    errEofInDoctype();
                    /*
                     * Set the DOCTYPE token's force-quirks flag to on.
                     */
                    forceQuirks = true;
                    /*
                     * Emit that DOCTYPE token.
                     */
                    emitDoctypeToken(0);
                    /*
                     * Reconsume the EOF character in the data state.
                     */
                    break eofloop;
                case DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED:
                case DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED:
                    /* EOF Parse error. */
                    errEofInPublicId();
                    /*
                     * Set the DOCTYPE token's force-quirks flag to on.
                     */
                    forceQuirks = true;
                    /*
                     * Emit that DOCTYPE token.
                     */
                    publicIdentifier = strBufToString();
                    emitDoctypeToken(0);
                    /*
                     * Reconsume the EOF character in the data state.
                     */
                    break eofloop;
                case AFTER_DOCTYPE_PUBLIC_IDENTIFIER:
                case BEFORE_DOCTYPE_SYSTEM_IDENTIFIER:
                case BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS:
                    errEofInDoctype();
                    /*
                     * Set the DOCTYPE token's force-quirks flag to on.
                     */
                    forceQuirks = true;
                    /*
                     * Emit that DOCTYPE token.
                     */
                    emitDoctypeToken(0);
                    /*
                     * Reconsume the EOF character in the data state.
                     */
                    break eofloop;
                case DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED:
                case DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED:
                    /* EOF Parse error. */
                    errEofInSystemId();
                    /*
                     * Set the DOCTYPE token's force-quirks flag to on.
                     */
                    forceQuirks = true;
                    /*
                     * Emit that DOCTYPE token.
                     */
                    systemIdentifier = strBufToString();
                    emitDoctypeToken(0);
                    /*
                     * Reconsume the EOF character in the data state.
                     */
                    break eofloop;
                case AFTER_DOCTYPE_SYSTEM_IDENTIFIER:
                    errEofInDoctype();
                    /*
                     * Set the DOCTYPE token's force-quirks flag to on.
                     */
                    forceQuirks = true;
                    /*
                     * Emit that DOCTYPE token.
                     */
                    emitDoctypeToken(0);
                    /*
                     * Reconsume the EOF character in the data state.
                     */
                    break eofloop;
                case BOGUS_DOCTYPE:
                    /*
                     * Emit that DOCTYPE token.
                     */
                    emitDoctypeToken(0);
                    /*
                     * Reconsume the EOF character in the data state.
                     */
                    break eofloop;
                case CONSUME_CHARACTER_REFERENCE:
                    /*
                     * Unlike the definition in the spec, this state does not
                     * return a value and never requires the caller to
                     * backtrack. This state takes care of emitting characters
                     * or appending to the current attribute value. It also
                     * takes care of that in the case when consuming the entity
                     * fails.
                     */
                    /*
                     * This section defines how to consume an entity. This
                     * definition is used when parsing entities in text and in
                     * attributes.
                     *
                     * The behavior depends on the identity of the next
                     * character (the one immediately after the U+0026 AMPERSAND
                     * character):
                     */

                    emitOrAppendCharRefBuf(returnState);
                    // Go around again so the return state gets its own EOF
                    // handling.
                    state = returnState;
                    continue;
                case CHARACTER_REFERENCE_HILO_LOOKUP:
                    errNoNamedCharacterMatch();
                    emitOrAppendCharRefBuf(returnState);
                    state = returnState;
                    continue;
                case CHARACTER_REFERENCE_TAIL:
                    // Finish the named character reference lookup as if the
                    // next input character were U+0000, narrowing the
                    // [lo, hi] window over NamedCharacters.NAMES until it is
                    // empty.
                    outer: for (;;) {
                        char c = '\u0000';
                        entCol++;
                        /*
                         * Consume the maximum number of characters possible,
                         * with the consumed characters matching one of the
                         * identifiers in the first column of the named
                         * character references table (in a case-sensitive
                         * manner).
                         */
                        hiloop: for (;;) {
                            if (hi == -1) {
                                break hiloop;
                            }
                            if (entCol == NamedCharacters.NAMES[hi].length()) {
                                break hiloop;
                            }
                            if (entCol > NamedCharacters.NAMES[hi].length()) {
                                break outer;
                            } else if (c < NamedCharacters.NAMES[hi].charAt(entCol)) {
                                hi--;
                            } else {
                                break hiloop;
                            }
                        }

                        loloop: for (;;) {
                            if (hi < lo) {
                                break outer;
                            }
                            if (entCol == NamedCharacters.NAMES[lo].length()) {
                                // A name ends exactly here: remember it as the
                                // best match so far and how much of charRefBuf
                                // it covers.
                                candidate = lo;
                                charRefBufMark = charRefBufLen;
                                lo++;
                            } else if (entCol > NamedCharacters.NAMES[lo].length()) {
                                break outer;
                            } else if (c > NamedCharacters.NAMES[lo].charAt(entCol)) {
                                lo++;
                            } else {
                                break loloop;
                            }
                        }
                        if (hi < lo) {
                            break outer;
                        }
                        continue;
                    }

                    if (candidate == -1) {
                        /*
                         * If no match can be made, then this is a parse error.
                         */
                        errNoNamedCharacterMatch();
                        emitOrAppendCharRefBuf(returnState);
                        state = returnState;
                        continue eofloop;
                    } else {
                        @Const @CharacterName String candidateName = NamedCharacters.NAMES[candidate];
                        if (candidateName.length() == 0
                                || candidateName.charAt(candidateName.length() - 1) != ';') {
                            /*
                             * If the last character matched is not a U+003B
                             * SEMICOLON (;), there is a parse error.
                             */
                            if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
                                /*
                                 * If the entity is being consumed as part of an
                                 * attribute, and the last character matched is
                                 * not a U+003B SEMICOLON (;),
                                 */
                                char ch;
                                if (charRefBufMark == charRefBufLen) {
                                    ch = '\u0000';
                                } else {
                                    ch = charRefBuf[charRefBufMark];
                                }
                                if ((ch >= '0' && ch <= '9')
                                        || (ch >= 'A' && ch <= 'Z')
                                        || (ch >= 'a' && ch <= 'z')) {
                                    /*
                                     * and the next character is in the range
                                     * U+0030 DIGIT ZERO to U+0039 DIGIT NINE,
                                     * U+0041 LATIN CAPITAL LETTER A to U+005A
                                     * LATIN CAPITAL LETTER Z, or U+0061 LATIN
                                     * SMALL LETTER A to U+007A LATIN SMALL
                                     * LETTER Z, then, for historical reasons,
                                     * all the characters that were matched
                                     * after the U+0026 AMPERSAND (&) must be
                                     * unconsumed, and nothing is returned.
                                     */
                                    errNoNamedCharacterMatch();
                                    appendCharRefBufToStrBuf();
                                    state = returnState;
                                    continue eofloop;
                                }
                            }
                            if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
                                errUnescapedAmpersandInterpretedAsCharacterReference();
                            } else {
                                errNotSemicolonTerminated();
                            }
                        }

                        /*
                         * Otherwise, return a character token for the character
                         * corresponding to the entity name (as given by the
                         * second column of the named character references
                         * table).
                         */
                        @Const @NoLength char[] val = NamedCharacters.VALUES[candidate];
                        if (
                        // [NOCPP[
                        val.length == 1
                        // ]NOCPP]
                        // CPPONLY: val[1] == 0
                        ) {
                            emitOrAppendOne(val, returnState);
                        } else {
                            emitOrAppendTwo(val, returnState);
                        }
                        // this is so complicated!
                        // Characters buffered after the matched name are not
                        // part of the reference; pass them through as text.
                        if (charRefBufMark < charRefBufLen) {
                            if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
                                appendStrBuf(charRefBuf, charRefBufMark,
                                        charRefBufLen - charRefBufMark);
                            } else {
                                tokenHandler.characters(charRefBuf, charRefBufMark,
                                        charRefBufLen - charRefBufMark);
                            }
                        }
                        charRefBufLen = 0;
                        state = returnState;
                        continue eofloop;
                        /*
                         * If the markup contains I'm &notit; I tell you, the
                         * entity is parsed as "not", as in, I'm ¬it; I tell
                         * you. But if the markup was I'm &notin; I tell you,
                         * the entity would be parsed as "notin;", resulting in
                         * I'm ∉ I tell you.
                         */
                    }
                case CONSUME_NCR:
                case DECIMAL_NRC_LOOP:
                case HEX_NCR_LOOP:
                    /*
                     * If no characters match the range, then don't consume any
                     * characters (and unconsume the U+0023 NUMBER SIGN
                     * character and, if appropriate, the X character). This is
                     * a parse error; nothing is returned.
                     *
                     * Otherwise, if the next character is a U+003B SEMICOLON,
                     * consume that too. If it isn't, there is a parse error.
                     */
                    if (!seenDigits) {
                        errNoDigitsInNCR();
                        emitOrAppendCharRefBuf(returnState);
                        state = returnState;
                        continue;
                    } else {
                        errCharRefLacksSemicolon();
                    }
                    // WARNING previous state sets reconsume
                    handleNcrValue(returnState);
                    state = returnState;
                    continue;
                case CDATA_RSQB:
                    tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 1);
                    break eofloop;
                case CDATA_RSQB_RSQB:
                    tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 2);
                    break eofloop;
                case DATA:
                default:
                    break eofloop;
            }
        }
        // case DATA:
        /*
         * EOF Emit an end-of-file token.
         */
        tokenHandler.eof();
        return;
    }

    /**
     * Emits the DOCTYPE token built so far to the token handler and then
     * drops the name and both identifiers.
     *
     * @param pos
     *            the current buffer position; the pending character run is
     *            restarted at <code>pos + 1</code>
     * @throws SAXException
     *             if the token handler throws
     */
    private void emitDoctypeToken(int pos) throws SAXException {
        cstart = pos + 1;
        tokenHandler.doctype(doctypeName, publicIdentifier, systemIdentifier,
                forceQuirks);
        // It is OK and sufficient to release these here, since
        // there's no way out of the doctype states than through paths
        // that call this method.
        doctypeName = null;
        Portability.releaseString(publicIdentifier);
        publicIdentifier = null;
        Portability.releaseString(systemIdentifier);
        systemIdentifier = null;
    }

    /**
     * Returns the character at <code>pos</code> in <code>buf</code>. The
     * state loop reads every input character through this method; being
     * protected and non-final, it can be overridden by a subclass.
     *
     * @param buf
     *            the buffer being tokenized
     * @param pos
     *            the index to read
     * @return <code>buf[pos]</code>
     * @throws SAXException
     *             never thrown by this implementation
     */
    @Inline protected char checkChar(@NoLength char[] buf, int pos)
            throws SAXException {
        return buf[pos];
    }

    /**
     * Forwards an internal encoding declaration to the
     * <code>encodingDeclarationHandler</code>, if one is set.
     *
     * @param internalCharset
     *            the declared encoding name
     * @return the handler's return value, or <code>false</code> when no
     *         handler is set
     * @throws SAXException
     *             if the handler throws
     */
    public boolean internalEncodingDeclaration(String internalCharset)
            throws SAXException {
        if (encodingDeclarationHandler != null) {
            return encodingDeclarationHandler.internalEncodingDeclaration(internalCharset);
        }
        return false;
    }

    /**
     * Outputs the first two chars of <code>val</code>: appended to
     * <code>strBuf</code> when <code>returnState</code> has any of the
     * <code>DATA_AND_RCDATA_MASK</code> bits set, otherwise emitted to the
     * token handler as character data.
     *
     * @param val
     *            the characters to output; must hold at least two chars
     * @param returnState
     *            the state the character reference was entered from
     * @throws SAXException
     *             if the token handler throws
     */
    private void emitOrAppendTwo(@Const @NoLength char[] val, int returnState)
            throws SAXException {
        if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
            appendStrBuf(val[0]);
            appendStrBuf(val[1]);
        } else {
            tokenHandler.characters(val, 0, 2);
        }
    }

    /**
     * Outputs the first char of <code>val</code>: appended to
     * <code>strBuf</code> when <code>returnState</code> has any of the
     * <code>DATA_AND_RCDATA_MASK</code> bits set, otherwise emitted to the
     * token handler as character data.
     *
     * @param val
     *            the character to output, at index 0
     * @param returnState
     *            the state the character reference was entered from
     * @throws SAXException
     *             if the token handler throws
     */
    private void emitOrAppendOne(@Const @NoLength char[] val, int returnState)
            throws SAXException {
        if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
            appendStrBuf(val[0]);
        } else {
            tokenHandler.characters(val, 0, 1);
        }
    }

    /**
     * Ends tokenization: drops the buffers, names and identifiers held by
     * this tokenizer, notifies the token handler through
     * <code>endTokenization()</code> and then drops the attributes holder.
     *
     * @throws SAXException
     *             if the token handler throws
     */
    public void end() throws SAXException {
        strBuf = null;
        doctypeName = null;
        if (systemIdentifier != null) {
            Portability.releaseString(systemIdentifier);
            systemIdentifier = null;
        }
        if (publicIdentifier != null) {
            Portability.releaseString(publicIdentifier);
            publicIdentifier = null;
        }
        tagName = null;
        nonInternedTagName.setNameForNonInterned(null
        // CPPONLY: , false
        );
        attributeName = null;
        // CPPONLY: nonInternedAttributeName.setNameForNonInterned(null);
        tokenHandler.endTokenization();
        if (attributes != null) {
            // [NOCPP[
            attributes = null;
            // ]NOCPP]
            // CPPONLY: attributes.clear(mappingLangToXmlLang);
        }
    }

    /**
     * Sets the <code>shouldSuspend</code> flag.
     */
    public void requestSuspension() {
        shouldSuspend = true;
    }

    // [NOCPP[

    /**
     * Sets the <code>confident</code> flag.
     */
    public void becomeConfident() {
        confident = true;
    }

    /**
     * Reports whether the next character is on a new line. This
     * implementation does not track that and always returns
     * <code>false</code>.
     *
     * @return <code>false</code>
     */
    public boolean isNextCharOnNewLine() {
        return false;
    }

    /**
     * Returns the <code>lastCR</code> flag, which
     * <code>silentCarriageReturn()</code> sets when a carriage return is
     * seen.
     *
     * @return the current value of <code>lastCR</code>
     */
    public boolean isPrevCR() {
        return lastCR;
    }

    /**
     * Returns the line. This implementation does not report the
     * <code>line</code> field here; it always returns <code>-1</code>.
     *
     * @return <code>-1</code>
     */
    public int getLine() {
        return -1;
    }

    /**
     * Returns the col. This implementation always returns <code>-1</code>.
     *
     * @return <code>-1</code>
     */
    public int getCol() {
        return -1;
    }

    // ]NOCPP]

    /**
     * Tells whether the saved tokenizer state is the data state.
     *
     * @return <code>true</code> if <code>stateSave</code> is
     *         <code>DATA</code>
     */
    public boolean isInDataState() {
        return (stateSave == DATA);
    }

    /**
     * Puts the tokenizer back into the data state with its working fields
     * reset: buffers cleared, character reference bookkeeping zeroed, flags
     * turned off, DOCTYPE fields reinitialized and the current tag and
     * attribute names dropped. The line number is left as it is. When
     * <code>newAttributesEachTime</code> is set, the attributes holder is
     * deleted and nulled as well.
     */
    public void resetToDataState() {
        clearStrBufAfterUse();
        charRefBufLen = 0;
        stateSave = Tokenizer.DATA;
        // line = 1; XXX line numbers
        lastCR = false;
        index = 0;
        forceQuirks = false;
        additional = '\u0000';
        entCol = -1;
        firstCharKey = -1;
        lo = 0;
        hi = 0; // will always be overwritten before use anyway
        candidate = -1;
        charRefBufMark = 0;
        value = 0;
        seenDigits = false;
        endTag = false;
        shouldSuspend = false;
        initDoctypeFields();
        containsHyphen = false;
        tagName = null;
        attributeName = null;
        if (newAttributesEachTime) {
            if (attributes != null) {
                Portability.delete(attributes);
                attributes = null;
            }
        }
    }

    /**
     * Copies the tokenization state of another tokenizer into this one.
     *
     * Buffers are copied by content (this tokenizer's <code>strBuf</code> is
     * reallocated only when too small, so it must already be non-null),
     * scalar fields are copied by value, the DOCTYPE identifiers are
     * duplicated through <code>Portability.newStringFromString</code> and
     * the attributes holder is cloned. <code>shouldSuspend</code> is not
     * copied but set to <code>false</code>, and the line number is not
     * copied.
     *
     * @param other
     *            the tokenizer to copy the state from
     * @throws SAXException
     *             declared for the calls made while copying; which of them
     *             can throw is not visible here
     */
    public void loadState(Tokenizer other) throws SAXException {
        strBufLen = other.strBufLen;
        if (strBufLen > strBuf.length) {
            strBuf = new char[strBufLen];
        }
        System.arraycopy(other.strBuf, 0, strBuf, 0, strBufLen);

        charRefBufLen = other.charRefBufLen;
        System.arraycopy(other.charRefBuf, 0, charRefBuf, 0, charRefBufLen);

        stateSave = other.stateSave;
        returnStateSave = other.returnStateSave;
        endTagExpectation = other.endTagExpectation;
        endTagExpectationAsArray = other.endTagExpectationAsArray;
        // line = 1; XXX line numbers
        lastCR = other.lastCR;
        index = other.index;
        forceQuirks = other.forceQuirks;
        additional = other.additional;
        entCol = other.entCol;
        firstCharKey = other.firstCharKey;
        lo = other.lo;
        hi = other.hi;
        candidate = other.candidate;
        charRefBufMark = other.charRefBufMark;
        value = other.value;
        seenDigits = other.seenDigits;
        endTag = other.endTag;
        shouldSuspend = false;
        doctypeName = other.doctypeName;

        Portability.releaseString(systemIdentifier);
        if (other.systemIdentifier == null) {
            systemIdentifier = null;
        } else {
            systemIdentifier = Portability.newStringFromString(other.systemIdentifier);
        }

        Portability.releaseString(publicIdentifier);
        if (other.publicIdentifier == null) {
            publicIdentifier = null;
        } else {
            publicIdentifier = Portability.newStringFromString(other.publicIdentifier);
        }

        containsHyphen = other.containsHyphen;
        if (other.tagName == null) {
            tagName = null;
        } else if (other.tagName.isInterned()) {
            tagName = other.tagName;
        } else {
            // In the C++ case, the atoms in the other tokenizer are from a
            // different tokenizer-scoped atom table. Therefore, we have to
            // obtain the corresponding atom from our own atom table.
            nonInternedTagName.setNameForNonInterned(other.tagName.getName()
            // CPPONLY: , other.tagName.isCustom()
            );
            tagName = nonInternedTagName;
        }

        // [NOCPP[
        attributeName = other.attributeName;
        // ]NOCPP]
        // CPPONLY: if (other.attributeName == null) {
        // CPPONLY:     attributeName = null;
        // CPPONLY: } else if (other.attributeName.isInterned()) {
        // CPPONLY:     attributeName = other.attributeName;
        // CPPONLY: } else {
        // CPPONLY:     // In the C++ case, the atoms in the other tokenizer are from a
        // CPPONLY:     // different tokenizer-scoped atom table. Therefore, we have to
        // CPPONLY:     // obtain the correspoding atom from our own atom table.
        // CPPONLY:     nonInternedAttributeName.setNameForNonInterned(other.attributeName.getLocal(AttributeName.HTML));
        // CPPONLY:     attributeName = nonInternedAttributeName;
        // CPPONLY: }

        Portability.delete(attributes);
        if (other.attributes == null) {
            attributes = null;
        } else {
            attributes = other.attributes.cloneAttributes();
        }
    }

    /**
     * Prepares the tokenizer for a new run: turns <code>confident</code>
     * off, drops <code>strBuf</code>, sets the line number to 1 and, in the
     * Java-only section, resets <code>metaBoundaryPassed</code>, asks the
     * token handler whether it wants comments and allocates the shared
     * attributes holder unless <code>newAttributesEachTime</code> is set.
     * It then calls <code>resetToDataState()</code>.
     *
     * @throws SAXException
     *             if the token handler throws
     */
    public void initializeWithoutStarting() throws SAXException {
        confident = false;
        strBuf = null;
        line = 1;
        // CPPONLY: attributeLine = 1;
        // [NOCPP[
        metaBoundaryPassed = false;
        wantsComments = tokenHandler.wantsComments();
        if (!newAttributesEachTime) {
            attributes = new HtmlAttributes(mappingLangToXmlLang);
        }
        // ]NOCPP]
        resetToDataState();
    }

    // The err* and maybe* methods below are reporting hooks. In this class
    // their bodies are empty (the two that return a char return their
    // argument unchanged), so they have no effect unless a subclass
    // overrides them.

    protected void errGarbageAfterLtSlash() throws SAXException {
    }

    protected void errLtSlashGt() throws SAXException {
    }

    protected void errWarnLtSlashInRcdata() throws SAXException {
    }

    protected void errHtml4LtSlashInRcdata(char folded) throws SAXException {
    }

    protected void errCharRefLacksSemicolon() throws SAXException {
    }

    protected void errNoDigitsInNCR() throws SAXException {
    }

    protected void errGtInSystemId() throws SAXException {
    }

    protected void errGtInPublicId() throws SAXException {
    }

    protected void errNamelessDoctype() throws SAXException {
    }

    protected void errConsecutiveHyphens() throws SAXException {
    }

    protected void errPrematureEndOfComment() throws SAXException {
    }

    protected void errBogusComment() throws SAXException {
    }

    protected void errUnquotedAttributeValOrNull(char c) throws SAXException {
    }

    protected void errSlashNotFollowedByGt() throws SAXException {
    }

    protected void errNoSpaceBetweenAttributes() throws SAXException {
    }

    protected void errLtOrEqualsOrGraveInUnquotedAttributeOrNull(char c)
            throws SAXException {
    }

    protected void errAttributeValueMissing() throws SAXException {
    }

    protected void errBadCharBeforeAttributeNameOrNull(char c)
            throws SAXException {
    }

    protected void errEqualsSignBeforeAttributeName() throws SAXException {
    }

    protected void errBadCharAfterLt(char c) throws SAXException {
    }

    protected void errLtGt() throws SAXException {
    }

    protected void errProcessingInstruction() throws SAXException {
    }

    protected void errUnescapedAmpersandInterpretedAsCharacterReference()
            throws SAXException {
    }

    protected void errNotSemicolonTerminated() throws SAXException {
    }

    protected void errNoNamedCharacterMatch() throws SAXException {
    }

    protected void errQuoteBeforeAttributeName(char c) throws SAXException {
    }

    protected void errQuoteOrLtInAttributeNameOrNull(char c)
            throws SAXException {
    }

    protected void errExpectedPublicId() throws SAXException {
    }

    protected void errBogusDoctype() throws SAXException {
    }

    protected void maybeWarnPrivateUseAstral() throws SAXException {
    }

    protected void maybeWarnPrivateUse(char ch) throws SAXException {
    }

    protected void maybeErrAttributesOnEndTag(HtmlAttributes attrs)
            throws SAXException {
    }

    protected void maybeErrSlashInEndTag(boolean selfClosing)
            throws SAXException {
    }

    // Returns the character the caller should use; unchanged here.
    protected char errNcrNonCharacter(char ch) throws SAXException {
        return ch;
    }

    protected void errAstralNonCharacter(int ch) throws SAXException {
    }

    protected void errNcrSurrogate() throws SAXException {
    }

    // Returns the character the caller should use; unchanged here.
    protected char errNcrControlChar(char ch) throws SAXException {
        return ch;
    }

    protected void errNcrCr() throws SAXException {
    }

    protected void errNcrInC1Range() throws SAXException {
    }

    protected void errEofInPublicId() throws SAXException {
    }

    protected void errEofInComment() throws SAXException {
    }

    protected void errEofInDoctype() throws SAXException {
    }

    protected void errEofInAttributeValue() throws SAXException {
    }

    protected void errEofInAttributeName() throws SAXException {
    }

    protected void errEofWithoutGt() throws SAXException {
    }

    protected void errEofInTagName() throws SAXException {
    }

    protected void errEofInEndTag() throws SAXException {
    }

    protected void errEofAfterLt() throws SAXException {
    }

// ----------------------------------------------------------------------
// The protected err*/note* methods directly below have empty bodies in
// this class: nothing is reported or recorded here.
// NOTE(review): presumably hooks for an error-reporting subclass to
// override -- confirm against the subclasses.
// ----------------------------------------------------------------------

    protected void errNcrOutOfRange() throws SAXException {
    }

    protected void errNcrUnassigned() throws SAXException {
    }

    protected void errDuplicateAttribute() throws SAXException {
    }

    protected void errEofInSystemId() throws SAXException {
    }

    protected void errExpectedSystemId() throws SAXException {
    }

    protected void errMissingSpaceBeforeDoctypeName() throws SAXException {
    }

    protected void errHyphenHyphenBang() throws SAXException {
    }

    protected void errNcrControlChar() throws SAXException {
    }

    protected void errNcrZero() throws SAXException {
    }

    protected void errNoSpaceBetweenDoctypeSystemKeywordAndQuote()
            throws SAXException {
    }

    protected void errNoSpaceBetweenPublicAndSystemIds() throws SAXException {
    }

    protected void errNoSpaceBetweenDoctypePublicKeywordAndQuote()
            throws SAXException {
    }

    protected void noteAttributeWithoutValue() throws SAXException {
    }

    protected void noteUnquotedAttributeValue() throws SAXException {
    }

    /**
     * Stores the given handler in the
     * <code>encodingDeclarationHandler</code> field, replacing whatever was
     * there before. No validation is performed on the argument.
     *
     * @param encodingDeclarationHandler
     *            the handler to store
     */
    public void setEncodingDeclarationHandler(
            EncodingDeclarationHandler encodingDeclarationHandler) {
        this.encodingDeclarationHandler = encodingDeclarationHandler;
    }

    /**
     * Hands <code>nonInternedTagName</code> and <code>attributes</code> to
     * <code>Portability.delete</code> and then clears both fields to
     * <code>null</code>.
     */
    void destructor() {
        Portability.delete(nonInternedTagName);
        nonInternedTagName = null;
        // CPPONLY: Portability.delete(nonInternedAttributeName);
        // CPPONLY: nonInternedAttributeName = null;
        // The translator will write refcount tracing stuff here
        Portability.delete(attributes);
        attributes = null;
    }

    // [NOCPP[

    /**
     * Sets an offset to be added to the position reported to
     * <code>TransitionHandler</code>.
     *
     * The body is empty in this class, so the argument is not stored or
     * used here.
     *
     * @param offset the offset
     */
    public void setTransitionBaseOffset(int offset) {

    }

    // ]NOCPP]

}