1 /* 2 * Copyright (c) 2005-2007 Henri Sivonen 3 * Copyright (c) 2007-2015 Mozilla Foundation 4 * Portions of comments Copyright 2004-2010 Apple Computer, Inc., Mozilla 5 * Foundation, and Opera Software ASA. 6 * 7 * Permission is hereby granted, free of charge, to any person obtaining a 8 * copy of this software and associated documentation files (the "Software"), 9 * to deal in the Software without restriction, including without limitation 10 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 11 * and/or sell copies of the Software, and to permit persons to whom the 12 * Software is furnished to do so, subject to the following conditions: 13 * 14 * The above copyright notice and this permission notice shall be included in 15 * all copies or substantial portions of the Software. 16 * 17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 22 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 23 * DEALINGS IN THE SOFTWARE. 24 */ 25 26 /* 27 * The comments following this one that use the same comment syntax as this 28 * comment are quotes from the WHATWG HTML 5 spec as of 2 June 2007 29 * amended as of June 18 2008 and May 31 2010. 30 * That document came with this statement: 31 * "© Copyright 2004-2010 Apple Computer, Inc., Mozilla Foundation, and 32 * Opera Software ASA. You are granted a license to use, reproduce and 33 * create derivative works of this document." 
34 */ 35 36 package nu.validator.htmlparser.impl; 37 38 import org.xml.sax.ErrorHandler; 39 import org.xml.sax.Locator; 40 import org.xml.sax.SAXException; 41 import org.xml.sax.SAXParseException; 42 43 import nu.validator.htmlparser.annotation.Auto; 44 import nu.validator.htmlparser.annotation.CharacterName; 45 import nu.validator.htmlparser.annotation.Const; 46 import nu.validator.htmlparser.annotation.Inline; 47 import nu.validator.htmlparser.annotation.Local; 48 import nu.validator.htmlparser.annotation.NoLength; 49 import nu.validator.htmlparser.common.EncodingDeclarationHandler; 50 import nu.validator.htmlparser.common.Interner; 51 import nu.validator.htmlparser.common.TokenHandler; 52 import nu.validator.htmlparser.common.XmlViolationPolicy; 53 54 /** 55 * An implementation of 56 * https://html.spec.whatwg.org/multipage/syntax.html#tokenization 57 * 58 * This class implements the <code>Locator</code> interface. This is not an 59 * incidental implementation detail: Users of this class are encouraged to make 60 * use of the <code>Locator</code> nature. 61 * 62 * By default, the tokenizer may report data that XML 1.0 bans. The tokenizer 63 * can be configured to treat these conditions as fatal or to coerce the infoset 64 * to something that XML 1.0 allows. 
65 * 66 * @version $Id$ 67 * @author hsivonen 68 */ 69 public class Tokenizer implements Locator { 70 71 private static final int DATA_AND_RCDATA_MASK = ~1; 72 73 public static final int DATA = 0; 74 75 public static final int RCDATA = 1; 76 77 public static final int SCRIPT_DATA = 2; 78 79 public static final int RAWTEXT = 3; 80 81 public static final int SCRIPT_DATA_ESCAPED = 4; 82 83 public static final int ATTRIBUTE_VALUE_DOUBLE_QUOTED = 5; 84 85 public static final int ATTRIBUTE_VALUE_SINGLE_QUOTED = 6; 86 87 public static final int ATTRIBUTE_VALUE_UNQUOTED = 7; 88 89 public static final int PLAINTEXT = 8; 90 91 public static final int TAG_OPEN = 9; 92 93 public static final int CLOSE_TAG_OPEN = 10; 94 95 public static final int TAG_NAME = 11; 96 97 public static final int BEFORE_ATTRIBUTE_NAME = 12; 98 99 public static final int ATTRIBUTE_NAME = 13; 100 101 public static final int AFTER_ATTRIBUTE_NAME = 14; 102 103 public static final int BEFORE_ATTRIBUTE_VALUE = 15; 104 105 public static final int AFTER_ATTRIBUTE_VALUE_QUOTED = 16; 106 107 public static final int BOGUS_COMMENT = 17; 108 109 public static final int MARKUP_DECLARATION_OPEN = 18; 110 111 public static final int DOCTYPE = 19; 112 113 public static final int BEFORE_DOCTYPE_NAME = 20; 114 115 public static final int DOCTYPE_NAME = 21; 116 117 public static final int AFTER_DOCTYPE_NAME = 22; 118 119 public static final int BEFORE_DOCTYPE_PUBLIC_IDENTIFIER = 23; 120 121 public static final int DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED = 24; 122 123 public static final int DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED = 25; 124 125 public static final int AFTER_DOCTYPE_PUBLIC_IDENTIFIER = 26; 126 127 public static final int BEFORE_DOCTYPE_SYSTEM_IDENTIFIER = 27; 128 129 public static final int DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED = 28; 130 131 public static final int DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED = 29; 132 133 public static final int AFTER_DOCTYPE_SYSTEM_IDENTIFIER = 30; 134 135 public static 
final int BOGUS_DOCTYPE = 31; 136 137 public static final int COMMENT_START = 32; 138 139 public static final int COMMENT_START_DASH = 33; 140 141 public static final int COMMENT = 34; 142 143 public static final int COMMENT_END_DASH = 35; 144 145 public static final int COMMENT_END = 36; 146 147 public static final int COMMENT_END_BANG = 37; 148 149 public static final int NON_DATA_END_TAG_NAME = 38; 150 151 public static final int MARKUP_DECLARATION_HYPHEN = 39; 152 153 public static final int MARKUP_DECLARATION_OCTYPE = 40; 154 155 public static final int DOCTYPE_UBLIC = 41; 156 157 public static final int DOCTYPE_YSTEM = 42; 158 159 public static final int AFTER_DOCTYPE_PUBLIC_KEYWORD = 43; 160 161 public static final int BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS = 44; 162 163 public static final int AFTER_DOCTYPE_SYSTEM_KEYWORD = 45; 164 165 public static final int CONSUME_CHARACTER_REFERENCE = 46; 166 167 public static final int CONSUME_NCR = 47; 168 169 public static final int CHARACTER_REFERENCE_TAIL = 48; 170 171 public static final int HEX_NCR_LOOP = 49; 172 173 public static final int DECIMAL_NRC_LOOP = 50; 174 175 public static final int HANDLE_NCR_VALUE = 51; 176 177 public static final int HANDLE_NCR_VALUE_RECONSUME = 52; 178 179 public static final int CHARACTER_REFERENCE_HILO_LOOKUP = 53; 180 181 public static final int SELF_CLOSING_START_TAG = 54; 182 183 public static final int CDATA_START = 55; 184 185 public static final int CDATA_SECTION = 56; 186 187 public static final int CDATA_RSQB = 57; 188 189 public static final int CDATA_RSQB_RSQB = 58; 190 191 public static final int SCRIPT_DATA_LESS_THAN_SIGN = 59; 192 193 public static final int SCRIPT_DATA_ESCAPE_START = 60; 194 195 public static final int SCRIPT_DATA_ESCAPE_START_DASH = 61; 196 197 public static final int SCRIPT_DATA_ESCAPED_DASH = 62; 198 199 public static final int SCRIPT_DATA_ESCAPED_DASH_DASH = 63; 200 201 public static final int BOGUS_COMMENT_HYPHEN = 64; 202 203 public 
static final int RAWTEXT_RCDATA_LESS_THAN_SIGN = 65; 204 205 public static final int SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN = 66; 206 207 public static final int SCRIPT_DATA_DOUBLE_ESCAPE_START = 67; 208 209 public static final int SCRIPT_DATA_DOUBLE_ESCAPED = 68; 210 211 public static final int SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN = 69; 212 213 public static final int SCRIPT_DATA_DOUBLE_ESCAPED_DASH = 70; 214 215 public static final int SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH = 71; 216 217 public static final int SCRIPT_DATA_DOUBLE_ESCAPE_END = 72; 218 219 public static final int PROCESSING_INSTRUCTION = 73; 220 221 public static final int PROCESSING_INSTRUCTION_QUESTION_MARK = 74; 222 223 /** 224 * Magic value for UTF-16 operations. 225 */ 226 private static final int LEAD_OFFSET = (0xD800 - (0x10000 >> 10)); 227 228 /** 229 * UTF-16 code unit array containing less than and greater than for emitting 230 * those characters on certain parse errors. 231 */ 232 private static final @NoLength char[] LT_GT = { '<', '>' }; 233 234 /** 235 * UTF-16 code unit array containing less than and solidus for emitting 236 * those characters on certain parse errors. 237 */ 238 private static final @NoLength char[] LT_SOLIDUS = { '<', '/' }; 239 240 /** 241 * UTF-16 code unit array containing ]] for emitting those characters on 242 * state transitions. 243 */ 244 private static final @NoLength char[] RSQB_RSQB = { ']', ']' }; 245 246 /** 247 * Array version of U+FFFD. 248 */ 249 private static final @NoLength char[] REPLACEMENT_CHARACTER = { '\uFFFD' }; 250 251 // [NOCPP[ 252 253 /** 254 * Array version of space. 255 */ 256 private static final @NoLength char[] SPACE = { ' ' }; 257 258 // ]NOCPP] 259 260 /** 261 * Array version of line feed. 
262 */ 263 private static final @NoLength char[] LF = { '\n' }; 264 265 /** 266 * "CDATA[" as <code>char[]</code> 267 */ 268 private static final @NoLength char[] CDATA_LSQB = { 'C', 'D', 'A', 'T', 269 'A', '[' }; 270 271 /** 272 * "octype" as <code>char[]</code> 273 */ 274 private static final @NoLength char[] OCTYPE = { 'o', 'c', 't', 'y', 'p', 275 'e' }; 276 277 /** 278 * "ublic" as <code>char[]</code> 279 */ 280 private static final @NoLength char[] UBLIC = { 'u', 'b', 'l', 'i', 'c' }; 281 282 /** 283 * "ystem" as <code>char[]</code> 284 */ 285 private static final @NoLength char[] YSTEM = { 'y', 's', 't', 'e', 'm' }; 286 287 private static final char[] TITLE_ARR = { 't', 'i', 't', 'l', 'e' }; 288 289 private static final char[] SCRIPT_ARR = { 's', 'c', 'r', 'i', 'p', 't' }; 290 291 private static final char[] STYLE_ARR = { 's', 't', 'y', 'l', 'e' }; 292 293 private static final char[] PLAINTEXT_ARR = { 'p', 'l', 'a', 'i', 'n', 't', 294 'e', 'x', 't' }; 295 296 private static final char[] XMP_ARR = { 'x', 'm', 'p' }; 297 298 private static final char[] TEXTAREA_ARR = { 't', 'e', 'x', 't', 'a', 'r', 299 'e', 'a' }; 300 301 private static final char[] IFRAME_ARR = { 'i', 'f', 'r', 'a', 'm', 'e' }; 302 303 private static final char[] NOEMBED_ARR = { 'n', 'o', 'e', 'm', 'b', 'e', 304 'd' }; 305 306 private static final char[] NOSCRIPT_ARR = { 'n', 'o', 's', 'c', 'r', 'i', 307 'p', 't' }; 308 309 private static final char[] NOFRAMES_ARR = { 'n', 'o', 'f', 'r', 'a', 'm', 310 'e', 's' }; 311 312 /** 313 * The token handler. 314 */ 315 protected final TokenHandler tokenHandler; 316 317 protected EncodingDeclarationHandler encodingDeclarationHandler; 318 319 // [NOCPP[ 320 321 /** 322 * The error handler. 323 */ 324 protected ErrorHandler errorHandler; 325 326 // ]NOCPP] 327 328 /** 329 * Whether the previous char read was CR. 
330 */ 331 protected boolean lastCR; 332 333 protected int stateSave; 334 335 private int returnStateSave; 336 337 protected int index; 338 339 private boolean forceQuirks; 340 341 private char additional; 342 343 private int entCol; 344 345 private int firstCharKey; 346 347 private int lo; 348 349 private int hi; 350 351 private int candidate; 352 353 private int charRefBufMark; 354 355 protected int value; 356 357 private boolean seenDigits; 358 359 protected int cstart; 360 361 /** 362 * The SAX public id for the resource being tokenized. (Only passed to back 363 * as part of locator data.) 364 */ 365 private String publicId; 366 367 /** 368 * The SAX system id for the resource being tokenized. (Only passed to back 369 * as part of locator data.) 370 */ 371 private String systemId; 372 373 /** 374 * Buffer for bufferable things other than those that fit the description 375 * of <code>charRefBuf</code>. 376 */ 377 private @Auto char[] strBuf; 378 379 /** 380 * Number of significant <code>char</code>s in <code>strBuf</code>. 381 */ 382 private int strBufLen; 383 384 /** 385 * Buffer for characters that might form a character reference but may 386 * end up not forming one. 387 */ 388 private final @Auto char[] charRefBuf; 389 390 /** 391 * Number of significant <code>char</code>s in <code>charRefBuf</code>. 392 */ 393 private int charRefBufLen; 394 395 /** 396 * Buffer for expanding NCRs falling into the Basic Multilingual Plane. 397 */ 398 private final @Auto char[] bmpChar; 399 400 /** 401 * Buffer for expanding astral NCRs. 402 */ 403 private final @Auto char[] astralChar; 404 405 /** 406 * The element whose end tag closes the current CDATA or RCDATA element. 407 */ 408 protected ElementName endTagExpectation = null; 409 410 private char[] endTagExpectationAsArray; // not @Auto! 
411 412 /** 413 * <code>true</code> if tokenizing an end tag 414 */ 415 protected boolean endTag; 416 417 /** 418 * <code>true</code> iff the current element/attribute name contains 419 * a hyphen. 420 */ 421 private boolean containsHyphen; 422 423 /** 424 * The current tag token name. One of 425 * 1) null, 426 * 2) non-owning reference to nonInternedTagName 427 * 3) non-owning reference to a pre-interned ElementName 428 */ 429 private ElementName tagName = null; 430 431 /** 432 * The recycled ElementName instance for the non-pre-interned cases. 433 */ 434 private ElementName nonInternedTagName = null; 435 436 /** 437 * The current attribute name. 438 */ 439 protected AttributeName attributeName = null; 440 441 // CPPONLY: private AttributeName nonInternedAttributeName = null; 442 443 // [NOCPP[ 444 445 /** 446 * Whether comment tokens are emitted. 447 */ 448 private boolean wantsComments = false; 449 450 /** 451 * <code>true</code> when HTML4-specific additional errors are requested. 452 */ 453 protected boolean html4; 454 455 /** 456 * Whether the stream is past the first 1024 bytes. 457 */ 458 private boolean metaBoundaryPassed; 459 460 // ]NOCPP] 461 462 /** 463 * The name of the current doctype token. 464 */ 465 private @Local String doctypeName; 466 467 /** 468 * The public id of the current doctype token. 469 */ 470 private String publicIdentifier; 471 472 /** 473 * The system id of the current doctype token. 474 */ 475 private String systemIdentifier; 476 477 /** 478 * The attribute holder. 479 */ 480 private HtmlAttributes attributes; 481 482 // [NOCPP[ 483 484 /** 485 * The policy for vertical tab and form feed. 486 */ 487 private XmlViolationPolicy contentSpacePolicy = XmlViolationPolicy.ALTER_INFOSET; 488 489 /** 490 * The policy for comments. 
*/
    private XmlViolationPolicy commentPolicy = XmlViolationPolicy.ALTER_INFOSET;

    /**
     * The xmlns policy. Replaced through <code>setXmlnsPolicy</code>, which
     * rejects <code>FATAL</code>.
     */
    private XmlViolationPolicy xmlnsPolicy = XmlViolationPolicy.ALTER_INFOSET;

    /**
     * The name policy. Replaced through <code>setNamePolicy</code>.
     */
    private XmlViolationPolicy namePolicy = XmlViolationPolicy.ALTER_INFOSET;

    /**
     * Flag stored by <code>setHtml4ModeCompatibleWithXhtml1Schemata</code>.
     */
    private boolean html4ModeCompatibleWithXhtml1Schemata;

    /**
     * Holds <code>AttributeName.HTML_LANG</code> when the mapping is enabled
     * and <code>AttributeName.HTML</code> when it is disabled (see
     * <code>setMappingLangToXmlLang</code>). Also handed to the
     * <code>HtmlAttributes</code> constructor by <code>emptyAttributes()</code>.
     */
    private int mappingLangToXmlLang;

    // ]NOCPP]

    /**
     * Fixed at construction. When <code>true</code>,
     * <code>emptyAttributes()</code> returns a new <code>HtmlAttributes</code>
     * on every call instead of <code>HtmlAttributes.EMPTY_ATTRIBUTES</code>.
     */
    private final boolean newAttributesEachTime;

    private boolean shouldSuspend;

    protected boolean confident;

    /**
     * The line number returned by <code>getLineNumber()</code>; overwritten by
     * <code>setLineNumber(int)</code>.
     */
    private int line;

    /*
     * The line number of the current attribute. First set to the line of the
     * attribute name and if there is a value, set to the line the value
     * started on.
     */
    // CPPONLY: private int attributeLine;

    /**
     * The interner passed to the name lookup helpers. Installed with
     * <code>setInterner</code>.
     */
    private Interner interner;

    // CPPONLY: private boolean viewingXmlSource;

    // [NOCPP[

    protected LocatorImpl ampersandLocation;

    /**
     * Creates a tokenizer with an explicit attribute-holder allocation mode.
     * This constructor sits inside a NOCPP region.
     *
     * @param tokenHandler
     *            the handler for receiving tokens
     * @param newAttributesEachTime
     *            stored as-is; when <code>true</code>,
     *            <code>emptyAttributes()</code> allocates a new
     *            <code>HtmlAttributes</code> on every call
     */
    public Tokenizer(TokenHandler tokenHandler, boolean newAttributesEachTime) {
        this.tokenHandler = tokenHandler;
        this.encodingDeclarationHandler = null;
        this.newAttributesEachTime = newAttributesEachTime;
        // &CounterClockwiseContourIntegral; is the longest valid char ref and
        // the semicolon never gets appended to the buffer.
        this.charRefBuf = new char[32];
        this.bmpChar = new char[1];
        this.astralChar = new char[2];
        this.containsHyphen = false;
        this.tagName = null;
        this.nonInternedTagName = new ElementName();
        this.attributeName = null;
        // CPPONLY: this.nonInternedAttributeName = new AttributeName();
        this.doctypeName = null;
        this.publicIdentifier = null;
        this.systemIdentifier = null;
        this.attributes = null;
    }

    // ]NOCPP]

    /**
     * The constructor.
*
     * <p>
     * Outside the CPPONLY lines, <code>newAttributesEachTime</code> is fixed to
     * <code>false</code> and <code>attributes</code> starts out
     * <code>null</code>. The CPPONLY lines add a <code>viewingXmlSource</code>
     * parameter and derive both settings from
     * <code>tokenHandler.HasBuilder()</code> instead.
     *
     * @param tokenHandler
     *            the handler for receiving tokens
     */
    public Tokenizer(TokenHandler tokenHandler
    // CPPONLY: , boolean viewingXmlSource
    ) {
        this.tokenHandler = tokenHandler;
        this.encodingDeclarationHandler = null;
        // [NOCPP[
        this.newAttributesEachTime = false;
        // ]NOCPP]
        // &CounterClockwiseContourIntegral; is the longest valid char ref and
        // the semicolon never gets appended to the buffer.
        this.charRefBuf = new char[32];
        this.bmpChar = new char[1];
        this.astralChar = new char[2];
        this.containsHyphen = false;
        this.tagName = null;
        this.nonInternedTagName = new ElementName();
        this.attributeName = null;
        // CPPONLY: this.nonInternedAttributeName = new AttributeName();
        this.doctypeName = null;
        this.publicIdentifier = null;
        this.systemIdentifier = null;
        // [NOCPP[
        this.attributes = null;
        // ]NOCPP]
        // CPPONLY: this.attributes = tokenHandler.HasBuilder() ? new HtmlAttributes(mappingLangToXmlLang) : null;
        // CPPONLY: this.newAttributesEachTime = !tokenHandler.HasBuilder();
        // CPPONLY: this.viewingXmlSource = viewingXmlSource;
    }

    /**
     * Installs the interner that is passed along to the name lookup helpers.
     *
     * @param interner
     *            the interner to use
     */
    public void setInterner(Interner interner) {
        this.interner = interner;
    }

    /**
     * Records the SAX public and system ids that
     * <code>getPublicId()</code> and <code>getSystemId()</code> report.
     *
     * @param newPublicId
     *            the public id to report
     * @param newSystemId
     *            the system id to report
     */
    public void initLocation(String newPublicId, String newSystemId) {
        this.systemId = newSystemId;
        this.publicId = newPublicId;

    }

    // CPPONLY: boolean isViewingXmlSource() {
    // CPPONLY: return viewingXmlSource;
    // CPPONLY: }

    // [NOCPP[

    /**
     * Returns the mappingLangToXmlLang.
     *
     * @return <code>true</code> iff the stored mode equals
     *         <code>AttributeName.HTML_LANG</code>
     */
    public boolean isMappingLangToXmlLang() {
        return mappingLangToXmlLang == AttributeName.HTML_LANG;
    }

    /**
     * Sets the mappingLangToXmlLang.
611 * 612 * @param mappingLangToXmlLang 613 * the mappingLangToXmlLang to set 614 */ setMappingLangToXmlLang(boolean mappingLangToXmlLang)615 public void setMappingLangToXmlLang(boolean mappingLangToXmlLang) { 616 this.mappingLangToXmlLang = mappingLangToXmlLang ? AttributeName.HTML_LANG 617 : AttributeName.HTML; 618 } 619 620 /** 621 * Sets the error handler. 622 * 623 * @see org.xml.sax.XMLReader#setErrorHandler(org.xml.sax.ErrorHandler) 624 */ setErrorHandler(ErrorHandler eh)625 public void setErrorHandler(ErrorHandler eh) { 626 this.errorHandler = eh; 627 } 628 getErrorHandler()629 public ErrorHandler getErrorHandler() { 630 return this.errorHandler; 631 } 632 633 /** 634 * Sets the commentPolicy. 635 * 636 * @param commentPolicy 637 * the commentPolicy to set 638 */ setCommentPolicy(XmlViolationPolicy commentPolicy)639 public void setCommentPolicy(XmlViolationPolicy commentPolicy) { 640 this.commentPolicy = commentPolicy; 641 } 642 643 /** 644 * Sets the contentNonXmlCharPolicy. 645 * 646 * @param contentNonXmlCharPolicy 647 * the contentNonXmlCharPolicy to set 648 */ setContentNonXmlCharPolicy( XmlViolationPolicy contentNonXmlCharPolicy)649 public void setContentNonXmlCharPolicy( 650 XmlViolationPolicy contentNonXmlCharPolicy) { 651 if (contentNonXmlCharPolicy != XmlViolationPolicy.ALLOW) { 652 throw new IllegalArgumentException( 653 "Must use ErrorReportingTokenizer to set contentNonXmlCharPolicy to non-ALLOW."); 654 } 655 } 656 657 /** 658 * Sets the contentSpacePolicy. 659 * 660 * @param contentSpacePolicy 661 * the contentSpacePolicy to set 662 */ setContentSpacePolicy(XmlViolationPolicy contentSpacePolicy)663 public void setContentSpacePolicy(XmlViolationPolicy contentSpacePolicy) { 664 this.contentSpacePolicy = contentSpacePolicy; 665 } 666 667 /** 668 * Sets the xmlnsPolicy. 
669 * 670 * @param xmlnsPolicy 671 * the xmlnsPolicy to set 672 */ setXmlnsPolicy(XmlViolationPolicy xmlnsPolicy)673 public void setXmlnsPolicy(XmlViolationPolicy xmlnsPolicy) { 674 if (xmlnsPolicy == XmlViolationPolicy.FATAL) { 675 throw new IllegalArgumentException("Can't use FATAL here."); 676 } 677 this.xmlnsPolicy = xmlnsPolicy; 678 } 679 setNamePolicy(XmlViolationPolicy namePolicy)680 public void setNamePolicy(XmlViolationPolicy namePolicy) { 681 this.namePolicy = namePolicy; 682 } 683 684 /** 685 * Sets the html4ModeCompatibleWithXhtml1Schemata. 686 * 687 * @param html4ModeCompatibleWithXhtml1Schemata 688 * the html4ModeCompatibleWithXhtml1Schemata to set 689 */ setHtml4ModeCompatibleWithXhtml1Schemata( boolean html4ModeCompatibleWithXhtml1Schemata)690 public void setHtml4ModeCompatibleWithXhtml1Schemata( 691 boolean html4ModeCompatibleWithXhtml1Schemata) { 692 this.html4ModeCompatibleWithXhtml1Schemata = html4ModeCompatibleWithXhtml1Schemata; 693 } 694 695 // ]NOCPP] 696 697 // For the token handler to call 698 699 /** 700 * Sets the tokenizer state and the associated element name. This should 701 * only ever used to put the tokenizer into one of the states that have 702 * a special end tag expectation. 703 * 704 * @param specialTokenizerState 705 * the tokenizer state to set 706 */ setState(int specialTokenizerState)707 public void setState(int specialTokenizerState) { 708 this.stateSave = specialTokenizerState; 709 this.endTagExpectation = null; 710 this.endTagExpectationAsArray = null; 711 } 712 713 // [NOCPP[ 714 715 /** 716 * Sets the tokenizer state and the associated element name. This should 717 * only ever used to put the tokenizer into one of the states that have 718 * a special end tag expectation. For use from the tokenizer test harness. 
*
     * <p>
     * When the state is <code>Tokenizer.DATA</code> only the state is stored
     * and the end tag expectation is left untouched. Otherwise the name is
     * resolved with <code>ElementName.elementNameByBuffer</code> and must
     * resolve to a non-null element name (checked by assertion).
     *
     * @param specialTokenizerState
     *            the tokenizer state to set
     * @param endTagExpectation
     *            the expected end tag for transitioning back to normal
     */
    public void setStateAndEndTagExpectation(int specialTokenizerState,
            @Local String endTagExpectation) {
        this.stateSave = specialTokenizerState;
        if (specialTokenizerState == Tokenizer.DATA) {
            return;
        }
        @Auto char[] asArray = Portability.newCharArrayFromLocal(endTagExpectation);
        this.endTagExpectation = ElementName.elementNameByBuffer(asArray,
                asArray.length, interner);
        assert this.endTagExpectation != null;
        endTagExpectationToArray();
    }

    // ]NOCPP]

    /**
     * Sets the tokenizer state and the associated element name. This should
     * only ever be used to put the tokenizer into one of the states that have
     * a special end tag expectation.
     *
     * @param specialTokenizerState
     *            the tokenizer state to set
     * @param endTagExpectation
     *            the expected end tag for transitioning back to normal
     */
    public void setStateAndEndTagExpectation(int specialTokenizerState,
            ElementName endTagExpectation) {
        this.stateSave = specialTokenizerState;
        this.endTagExpectation = endTagExpectation;
        endTagExpectationToArray();
    }

    /**
     * Points <code>endTagExpectationAsArray</code> at the constant name array
     * that matches the group of <code>endTagExpectation</code>. A group
     * without a matching array trips an assertion and leaves the field
     * unchanged.
     */
    private void endTagExpectationToArray() {
        switch (endTagExpectation.getGroup()) {
            case TreeBuilder.TITLE:
                endTagExpectationAsArray = TITLE_ARR;
                return;
            case TreeBuilder.SCRIPT:
                endTagExpectationAsArray = SCRIPT_ARR;
                return;
            case TreeBuilder.STYLE:
                endTagExpectationAsArray = STYLE_ARR;
                return;
            case TreeBuilder.PLAINTEXT:
                endTagExpectationAsArray = PLAINTEXT_ARR;
                return;
            case TreeBuilder.XMP:
                endTagExpectationAsArray = XMP_ARR;
                return;
            case TreeBuilder.TEXTAREA:
                endTagExpectationAsArray = TEXTAREA_ARR;
                return;
            case TreeBuilder.IFRAME:
                endTagExpectationAsArray = IFRAME_ARR;
                return;
            case TreeBuilder.NOEMBED:
                endTagExpectationAsArray = NOEMBED_ARR;
                return;
            case TreeBuilder.NOSCRIPT:
                endTagExpectationAsArray = NOSCRIPT_ARR;
                return;
            case TreeBuilder.NOFRAMES:
                endTagExpectationAsArray = NOFRAMES_ARR;
                return;
            default:
                assert false: "Bad end tag expectation.";
                return;
        }
    }

    /**
     * For C++ use only.
     *
     * @param line
     *            the line number that <code>getLineNumber()</code> reports
     *            from now on
     */
    public void setLineNumber(int line) {
        // CPPONLY: this.attributeLine = line; // XXX is this needed?
        this.line = line;
    }

    // start Locator impl

    /**
     * @return the stored line number
     * @see org.xml.sax.Locator#getLineNumber()
     */
    @Inline public int getLineNumber() {
        return line;
    }

    // [NOCPP[

    /**
     * @return always <code>-1</code> in this class
     * @see org.xml.sax.Locator#getColumnNumber()
     */
    @Inline public int getColumnNumber() {
        return -1;
    }

    /**
     * @return the public id given to <code>initLocation</code>
     * @see org.xml.sax.Locator#getPublicId()
     */
    public String getPublicId() {
        return publicId;
    }

    /**
     * @return the system id given to <code>initLocation</code>
     * @see org.xml.sax.Locator#getSystemId()
     */
    public String getSystemId() {
        return systemId;
    }

    // end Locator impl

    // end public API

    /**
     * Sets the <code>metaBoundaryPassed</code> flag.
     */
    public void notifyAboutMetaBoundary() {
        metaBoundaryPassed = true;
    }

    /**
     * Sets the <code>html4</code> flag.
     */
    void turnOnAdditionalHtml4Errors() {
        html4 = true;
    }

    // ]NOCPP]

    /**
     * Returns an attribute holder with no attributes.
     *
     * @return a new <code>HtmlAttributes</code> when
     *         <code>newAttributesEachTime</code> is set, otherwise
     *         <code>HtmlAttributes.EMPTY_ATTRIBUTES</code>; only the latter
     *         return lies outside the NOCPP regions
     */
    HtmlAttributes emptyAttributes() {
        // [NOCPP[
        if (newAttributesEachTime) {
            return new HtmlAttributes(mappingLangToXmlLang);
        } else {
            // ]NOCPP]
            return HtmlAttributes.EMPTY_ATTRIBUTES;
            // [NOCPP[
        }
        // ]NOCPP]
    }

    /**
     * Appends one code unit to <code>charRefBuf</code>. No bounds check is
     * done on the Java side.
     *
     * @param c
     *            the UTF-16 code unit to append
     */
    @Inline private void appendCharRefBuf(char c) {
        // CPPONLY: assert charRefBufLen < charRefBuf.length:
        // CPPONLY: "RELEASE: Attempted to overrun charRefBuf!";
        charRefBuf[charRefBufLen++] = c;
    }

    /**
     * Disposes of the pending contents of <code>charRefBuf</code>. For any
     * return state other than <code>DATA</code> (0) or <code>RCDATA</code> (1)
     * the characters are moved to <code>strBuf</code>; otherwise they are
     * sent to the token handler as characters, if there are any.
     * <code>charRefBufLen</code> is zero afterwards.
     *
     * @param returnState
     *            the state the character reference was encountered in
     * @throws SAXException
     *             if the token handler threw
     */
    private void emitOrAppendCharRefBuf(int returnState) throws SAXException {
        if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
            appendCharRefBufToStrBuf();
        } else {
            if (charRefBufLen > 0) {
                tokenHandler.characters(charRefBuf, 0, charRefBufLen);
                charRefBufLen = 0;
            }
        }
    }

    /**
     * Marks <code>strBuf</code> as empty once its contents have been consumed.
     */
    @Inline private void clearStrBufAfterUse() {
        strBufLen = 0;
    }

    /**
     * Asserts that <code>strBuf</code> is already empty, then resets its
     * length anyway.
     */
    @Inline private void clearStrBufBeforeUse() {
        assert strBufLen == 0: "strBufLen not reset after previous use!";
        strBufLen = 0; // no-op in the absence of bugs
    }

    /**
     * Empties <code>strBuf</code>, asserting that it holds exactly one hyphen.
     */
    @Inline private void clearStrBufAfterOneHyphen() {
        assert strBufLen == 1: "strBufLen length not one!";
        assert strBuf[0] == '-': "strBuf does not start with a hyphen!";
        strBufLen = 0;
    }

    /**
     * Appends to the buffer.
     *
     * @param c
     *            the UTF-16 code unit to append
     */
    @Inline private void appendStrBuf(char c) {
        // CPPONLY: assert strBufLen < strBuf.length: "Previous buffer length insufficient.";
        // CPPONLY: if (strBufLen == strBuf.length) {
        // CPPONLY: if (!EnsureBufferSpace(1)) {
        // CPPONLY: assert false: "RELEASE: Unable to recover from buffer reallocation failure";
        // CPPONLY: } // TODO: Add telemetry when outer if fires but inner does not
        // CPPONLY: }
        strBuf[strBufLen++] = c;
    }

    /**
     * The buffer as a String. Currently only used for error reporting.
     *
     * <p>
     * C++ memory note: The return value must be released.
*
     * <p>
     * The buffer is cleared as a side effect.
     *
     * @return the buffer as a string
     */
    protected String strBufToString() {
        String str = Portability.newStringFromBuffer(strBuf, 0, strBufLen
            // CPPONLY: , tokenHandler, !newAttributesEachTime && attributeName == AttributeName.CLASS
        );
        clearStrBufAfterUse();
        return str;
    }

    /**
     * Stores the buffer as a local name in <code>doctypeName</code> and clears
     * the buffer. The stored name is released in emitDoctypeToken().
     */
    private void strBufToDoctypeName() {
        doctypeName = Portability.newLocalNameFromBuffer(strBuf, strBufLen, interner);
        clearStrBufAfterUse();
    }

    /**
     * Emits the buffer as character tokens and clears it. Does nothing when
     * the buffer is empty.
     *
     * @throws SAXException
     *             if the token handler threw
     */
    private void emitStrBuf() throws SAXException {
        if (strBufLen > 0) {
            tokenHandler.characters(strBuf, 0, strBufLen);
            clearStrBufAfterUse();
        }
    }

    /**
     * Appends a second consecutive hyphen to the comment buffer, subject to
     * the comment policy: <code>ALTER_INFOSET</code> first appends a space and
     * then falls through to <code>ALLOW</code>, which warns and appends the
     * hyphen; <code>FATAL</code> calls <code>fatal</code> and appends nothing.
     * Only the <code>appendStrBuf('-')</code> call lies outside the NOCPP
     * regions.
     *
     * @throws SAXException
     *             if the error handler threw or the policy is
     *             <code>FATAL</code>
     */
    @Inline private void appendSecondHyphenToBogusComment() throws SAXException {
        // [NOCPP[
        switch (commentPolicy) {
            case ALTER_INFOSET:
                appendStrBuf(' ');
                // CPPONLY: MOZ_FALLTHROUGH;
            case ALLOW:
                warn("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment.");
                // ]NOCPP]
                appendStrBuf('-');
                // [NOCPP[
                break;
            case FATAL:
                fatal("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment.");
                break;
        }
        // ]NOCPP]
    }

    // [NOCPP[

    /**
     * Handles a trailing hyphen in a comment according to the comment policy:
     * <code>ALTER_INFOSET</code> appends a space and then warns,
     * <code>ALLOW</code> only warns, <code>FATAL</code> calls
     * <code>fatal</code>.
     *
     * @throws SAXException
     *             if the error handler threw or the policy is
     *             <code>FATAL</code>
     */
    private void maybeAppendSpaceToBogusComment() throws SAXException {
        switch (commentPolicy) {
            case ALTER_INFOSET:
                appendStrBuf(' ');
                // CPPONLY: MOZ_FALLTHROUGH;
            case ALLOW:
                warn("The document is not mappable to XML 1.0 due to a trailing hyphen in a comment.");
                break;
            case FATAL:
                fatal("The document is not mappable to XML 1.0 due to a trailing hyphen in a comment.");
                break;
        }
    }

    // ]NOCPP]

    /**
     * Reports consecutive hyphens and appends <code>c</code> to the buffer,
     * subject to the comment policy. Under <code>ALTER_INFOSET</code> the last
     * buffered code unit is dropped and replaced by a space followed by a
     * hyphen before falling through to <code>ALLOW</code>, which warns and
     * appends <code>c</code>. Under <code>FATAL</code> nothing is appended.
     * Only <code>errConsecutiveHyphens()</code> and
     * <code>appendStrBuf(c)</code> lie outside the NOCPP regions.
     *
     * @param c
     *            the UTF-16 code unit to append
     * @throws SAXException
     *             if the error handler threw or the policy is
     *             <code>FATAL</code>
     */
    @Inline private void adjustDoubleHyphenAndAppendToStrBufAndErr(char c)
            throws SAXException {
        errConsecutiveHyphens();
        // [NOCPP[
        switch (commentPolicy) {
            case ALTER_INFOSET:
                strBufLen--;
                // WARNING!!! This expands the worst case of the buffer length
                // given the length of input!
                appendStrBuf(' ');
                appendStrBuf('-');
                // CPPONLY: MOZ_FALLTHROUGH;
            case ALLOW:
                warn("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment.");
                // ]NOCPP]
                appendStrBuf(c);
                // [NOCPP[
                break;
            case FATAL:
                fatal("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment.");
                break;
        }
        // ]NOCPP]
    }

    /**
     * Appends a run of code units to <code>strBuf</code>. No capacity check is
     * done on the Java side.
     *
     * @param buffer
     *            the source array
     * @param offset
     *            index of the first code unit to copy
     * @param length
     *            number of code units to copy
     */
    private void appendStrBuf(@NoLength char[] buffer, int offset, int length) {
        int newLen = strBufLen + length;
        // CPPONLY: assert newLen <= strBuf.length: "Previous buffer length insufficient.";
        // CPPONLY: if (strBuf.length < newLen) {
        // CPPONLY: if (!EnsureBufferSpace(length)) {
        // CPPONLY: assert false: "RELEASE: Unable to recover from buffer reallocation failure";
        // CPPONLY: } // TODO: Add telemetry when outer if fires but inner does not
        // CPPONLY: }
        System.arraycopy(buffer, offset, strBuf, strBufLen, length);
        strBufLen = newLen;
    }

    /**
     * Append the contents of the char reference buffer to the main one and
     * empty the char reference buffer.
     */
    @Inline private void appendCharRefBufToStrBuf() {
        appendStrBuf(charRefBuf, 0, charRefBufLen);
        charRefBufLen = 0;
    }

    /**
     * Emits the current comment token.
*
     * <p>
     * The comment is handed to the token handler only when
     * <code>wantsComments</code> is set; that check lies inside a NOCPP
     * region. The buffer is cleared either way.
     *
     * @param provisionalHyphens
     *            number of code units at the end of <code>strBuf</code> to
     *            leave out of the emitted comment
     * @param pos
     *            the current position; <code>cstart</code> becomes
     *            <code>pos + 1</code>
     *
     * @throws SAXException
     *             if the token handler threw
     */
    private void emitComment(int provisionalHyphens, int pos)
            throws SAXException {
        // [NOCPP[
        if (wantsComments) {
            // ]NOCPP]
            tokenHandler.comment(strBuf, 0, strBufLen
                    - provisionalHyphens);
            // [NOCPP[
        }
        // ]NOCPP]
        clearStrBufAfterUse();
        cstart = pos + 1;
    }

    /**
     * Flushes coalesced character tokens: the range from <code>cstart</code>
     * up to, but not including, <code>pos</code> is sent to the token handler
     * when it is non-empty. <code>cstart</code> is then set to
     * <code>Integer.MAX_VALUE</code>.
     *
     * @param buf
     *            the array holding the characters
     * @param pos
     *            the index just past the last character to flush
     *
     * @throws SAXException
     *             if the token handler threw
     */
    protected void flushChars(@NoLength char[] buf, int pos)
            throws SAXException {
        if (pos > cstart) {
            tokenHandler.characters(buf, cstart, pos - cstart);
        }
        cstart = Integer.MAX_VALUE;
    }

    /**
     * Reports a condition that would make the infoset incompatible with XML
     * 1.0 as fatal. The error handler, if any, is notified first; the
     * exception is then thrown unconditionally.
     *
     * @param message
     *            the message
     * @throws SAXException
     *             if the error handler threw
     * @throws SAXParseException
     *             always, unless the error handler threw first
     */
    public void fatal(String message) throws SAXException {
        SAXParseException spe = new SAXParseException(message, this);
        if (errorHandler != null) {
            errorHandler.fatalError(spe);
        }
        throw spe;
    }

    /**
     * Reports a Parse Error.
1089 * 1090 * @param message 1091 * the message 1092 * @throws SAXException 1093 */ err(String message)1094 public void err(String message) throws SAXException { 1095 if (errorHandler == null) { 1096 return; 1097 } 1098 SAXParseException spe = new SAXParseException(message, this); 1099 errorHandler.error(spe); 1100 } 1101 errTreeBuilder(String message)1102 public void errTreeBuilder(String message) throws SAXException { 1103 ErrorHandler eh = null; 1104 if (tokenHandler instanceof TreeBuilder<?>) { 1105 TreeBuilder<?> treeBuilder = (TreeBuilder<?>) tokenHandler; 1106 eh = treeBuilder.getErrorHandler(); 1107 } 1108 if (eh == null) { 1109 eh = errorHandler; 1110 } 1111 if (eh == null) { 1112 return; 1113 } 1114 SAXParseException spe = new SAXParseException(message, this); 1115 eh.error(spe); 1116 } 1117 1118 /** 1119 * Reports a warning 1120 * 1121 * @param message 1122 * the message 1123 * @throws SAXException 1124 */ warn(String message)1125 public void warn(String message) throws SAXException { 1126 if (errorHandler == null) { 1127 return; 1128 } 1129 SAXParseException spe = new SAXParseException(message, this); 1130 errorHandler.warning(spe); 1131 } 1132 strBufToElementNameString()1133 private void strBufToElementNameString() { 1134 if (containsHyphen) { 1135 // We've got a custom element or annotation-xml. 
1136 @Local String annotationName = ElementName.ANNOTATION_XML.getName(); 1137 if (Portability.localEqualsBuffer(annotationName, strBuf, strBufLen)) { 1138 tagName = ElementName.ANNOTATION_XML; 1139 } else { 1140 nonInternedTagName.setNameForNonInterned(Portability.newLocalNameFromBuffer(strBuf, strBufLen, 1141 interner) 1142 // CPPONLY: , true 1143 ); 1144 tagName = nonInternedTagName; 1145 } 1146 } else { 1147 tagName = ElementName.elementNameByBuffer(strBuf, strBufLen, interner); 1148 if (tagName == null) { 1149 nonInternedTagName.setNameForNonInterned(Portability.newLocalNameFromBuffer(strBuf, strBufLen, 1150 interner) 1151 // CPPONLY: , false 1152 ); 1153 tagName = nonInternedTagName; 1154 } 1155 } 1156 containsHyphen = false; 1157 clearStrBufAfterUse(); 1158 } 1159 emitCurrentTagToken(boolean selfClosing, int pos)1160 private int emitCurrentTagToken(boolean selfClosing, int pos) 1161 throws SAXException { 1162 cstart = pos + 1; 1163 maybeErrSlashInEndTag(selfClosing); 1164 stateSave = Tokenizer.DATA; 1165 HtmlAttributes attrs = (attributes == null ? HtmlAttributes.EMPTY_ATTRIBUTES 1166 : attributes); 1167 if (endTag) { 1168 /* 1169 * When an end tag token is emitted, the content model flag must be 1170 * switched to the PCDATA state. 
1171 */ 1172 maybeErrAttributesOnEndTag(attrs); 1173 // CPPONLY: if (!viewingXmlSource) { 1174 tokenHandler.endTag(tagName); 1175 // CPPONLY: } 1176 // CPPONLY: if (newAttributesEachTime) { 1177 // CPPONLY: Portability.delete(attributes); 1178 // CPPONLY: attributes = null; 1179 // CPPONLY: } 1180 } else { 1181 // CPPONLY: if (viewingXmlSource) { 1182 // CPPONLY: assert newAttributesEachTime; 1183 // CPPONLY: Portability.delete(attributes); 1184 // CPPONLY: attributes = null; 1185 // CPPONLY: } else { 1186 tokenHandler.startTag(tagName, attrs, selfClosing); 1187 // CPPONLY: } 1188 } 1189 tagName = null; 1190 if (newAttributesEachTime) { 1191 attributes = null; 1192 } else { 1193 attributes.clear(mappingLangToXmlLang); 1194 } 1195 /* 1196 * The token handler may have called setStateAndEndTagExpectation 1197 * and changed stateSave since the start of this method. 1198 */ 1199 return stateSave; 1200 } 1201 attributeNameComplete()1202 private void attributeNameComplete() throws SAXException { 1203 attributeName = AttributeName.nameByBuffer(strBuf, strBufLen, interner); 1204 if (attributeName == null) { 1205 // [NOCPP[ 1206 attributeName = AttributeName.createAttributeName( 1207 Portability.newLocalNameFromBuffer(strBuf, strBufLen, 1208 interner), 1209 namePolicy != XmlViolationPolicy.ALLOW); 1210 // ]NOCPP] 1211 // CPPONLY: nonInternedAttributeName.setNameForNonInterned(Portability.newLocalNameFromBuffer(strBuf, strBufLen, interner)); 1212 // CPPONLY: attributeName = nonInternedAttributeName; 1213 } 1214 clearStrBufAfterUse(); 1215 1216 if (attributes == null) { 1217 attributes = new HtmlAttributes(mappingLangToXmlLang); 1218 } 1219 1220 /* 1221 * When the user agent leaves the attribute name state (and before 1222 * emitting the tag token, if appropriate), the complete attribute's 1223 * name must be compared to the other attributes on the same token; if 1224 * there is already an attribute on the token with the exact same name, 1225 * then this is a parse error and 
the new attribute must be dropped, 1226 * along with the value that gets associated with it (if any). 1227 */ 1228 if (attributes.contains(attributeName)) { 1229 errDuplicateAttribute(); 1230 attributeName = null; 1231 } 1232 } 1233 addAttributeWithoutValue()1234 private void addAttributeWithoutValue() throws SAXException { 1235 noteAttributeWithoutValue(); 1236 1237 // [NOCPP[ 1238 if (metaBoundaryPassed && AttributeName.CHARSET == attributeName 1239 && ElementName.META == tagName) { 1240 err("A \u201Ccharset\u201D attribute on a \u201Cmeta\u201D element found after the first 512 bytes."); 1241 } 1242 // ]NOCPP] 1243 if (attributeName != null) { 1244 // [NOCPP[ 1245 if (html4) { 1246 if (attributeName.isBoolean()) { 1247 if (html4ModeCompatibleWithXhtml1Schemata) { 1248 attributes.addAttribute(attributeName, 1249 attributeName.getLocal(AttributeName.HTML), 1250 xmlnsPolicy); 1251 } else { 1252 attributes.addAttribute(attributeName, "", xmlnsPolicy); 1253 } 1254 } else { 1255 if (AttributeName.BORDER != attributeName) { 1256 err("Attribute value omitted for a non-boolean attribute. (HTML4-only error.)"); 1257 attributes.addAttribute(attributeName, "", xmlnsPolicy); 1258 } 1259 } 1260 } else { 1261 if (AttributeName.SRC == attributeName 1262 || AttributeName.HREF == attributeName) { 1263 warn("Attribute \u201C" 1264 + attributeName.getLocal(AttributeName.HTML) 1265 + "\u201D without an explicit value seen. 
The attribute may be dropped by IE7."); 1266 } 1267 // ]NOCPP] 1268 attributes.addAttribute(attributeName, 1269 Portability.newEmptyString() 1270 // [NOCPP[ 1271 , xmlnsPolicy 1272 // ]NOCPP] 1273 // CPPONLY: , attributeLine 1274 ); 1275 // [NOCPP[ 1276 } 1277 // ]NOCPP] 1278 attributeName = null; 1279 } else { 1280 clearStrBufAfterUse(); 1281 } 1282 } 1283 addAttributeWithValue()1284 private void addAttributeWithValue() throws SAXException { 1285 // [NOCPP[ 1286 if (metaBoundaryPassed && ElementName.META == tagName 1287 && AttributeName.CHARSET == attributeName) { 1288 err("A \u201Ccharset\u201D attribute on a \u201Cmeta\u201D element found after the first 512 bytes."); 1289 } 1290 // ]NOCPP] 1291 if (attributeName != null) { 1292 String val = strBufToString(); // Ownership transferred to 1293 // HtmlAttributes 1294 // CPPONLY: if (mViewSource) { 1295 // CPPONLY: mViewSource.MaybeLinkifyAttributeValue(attributeName, val); 1296 // CPPONLY: } 1297 // [NOCPP[ 1298 if (!endTag && html4 && html4ModeCompatibleWithXhtml1Schemata 1299 && attributeName.isCaseFolded()) { 1300 val = newAsciiLowerCaseStringFromString(val); 1301 } 1302 // ]NOCPP] 1303 attributes.addAttribute(attributeName, val 1304 // [NOCPP[ 1305 , xmlnsPolicy 1306 // ]NOCPP] 1307 // CPPONLY: , attributeLine 1308 ); 1309 attributeName = null; 1310 } else { 1311 // We have a duplicate attribute. Explicitly discard its value. 
1312 clearStrBufAfterUse(); 1313 } 1314 } 1315 1316 // [NOCPP[ 1317 newAsciiLowerCaseStringFromString(String str)1318 private static String newAsciiLowerCaseStringFromString(String str) { 1319 if (str == null) { 1320 return null; 1321 } 1322 char[] buf = new char[str.length()]; 1323 for (int i = 0; i < str.length(); i++) { 1324 char c = str.charAt(i); 1325 if (c >= 'A' && c <= 'Z') { 1326 c += 0x20; 1327 } 1328 buf[i] = c; 1329 } 1330 return new String(buf); 1331 } 1332 startErrorReporting()1333 protected void startErrorReporting() throws SAXException { 1334 1335 } 1336 1337 // ]NOCPP] 1338 start()1339 public void start() throws SAXException { 1340 initializeWithoutStarting(); 1341 tokenHandler.startTokenization(this); 1342 // [NOCPP[ 1343 startErrorReporting(); 1344 // ]NOCPP] 1345 } 1346 tokenizeBuffer(UTF16Buffer buffer)1347 public boolean tokenizeBuffer(UTF16Buffer buffer) throws SAXException { 1348 int state = stateSave; 1349 int returnState = returnStateSave; 1350 char c = '\u0000'; 1351 shouldSuspend = false; 1352 lastCR = false; 1353 1354 int start = buffer.getStart(); 1355 int end = buffer.getEnd(); 1356 1357 // In C++, the caller of tokenizeBuffer needs to do this explicitly. 1358 // [NOCPP[ 1359 ensureBufferSpace(end - start); 1360 // ]NOCPP] 1361 1362 /** 1363 * The index of the last <code>char</code> read from <code>buf</code>. 1364 */ 1365 int pos = start - 1; 1366 1367 /** 1368 * The index of the first <code>char</code> in <code>buf</code> that is 1369 * part of a coalesced run of character tokens or 1370 * <code>Integer.MAX_VALUE</code> if there is not a current run being 1371 * coalesced. 
1372 */ 1373 switch (state) { 1374 case DATA: 1375 case RCDATA: 1376 case SCRIPT_DATA: 1377 case PLAINTEXT: 1378 case RAWTEXT: 1379 case CDATA_SECTION: 1380 case SCRIPT_DATA_ESCAPED: 1381 case SCRIPT_DATA_ESCAPE_START: 1382 case SCRIPT_DATA_ESCAPE_START_DASH: 1383 case SCRIPT_DATA_ESCAPED_DASH: 1384 case SCRIPT_DATA_ESCAPED_DASH_DASH: 1385 case SCRIPT_DATA_DOUBLE_ESCAPE_START: 1386 case SCRIPT_DATA_DOUBLE_ESCAPED: 1387 case SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN: 1388 case SCRIPT_DATA_DOUBLE_ESCAPED_DASH: 1389 case SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH: 1390 case SCRIPT_DATA_DOUBLE_ESCAPE_END: 1391 cstart = start; 1392 break; 1393 default: 1394 cstart = Integer.MAX_VALUE; 1395 break; 1396 } 1397 1398 /** 1399 * The number of <code>char</code>s in <code>buf</code> that have 1400 * meaning. (The rest of the array is garbage and should not be 1401 * examined.) 1402 */ 1403 // CPPONLY: if (mViewSource) { 1404 // CPPONLY: mViewSource.SetBuffer(buffer); 1405 // CPPONLY: pos = stateLoop(state, c, pos, buffer.getBuffer(), false, returnState, buffer.getEnd()); 1406 // CPPONLY: mViewSource.DropBuffer((pos == buffer.getEnd()) ? pos : pos + 1); 1407 // CPPONLY: } else { 1408 // CPPONLY: pos = stateLoop(state, c, pos, buffer.getBuffer(), false, returnState, buffer.getEnd()); 1409 // CPPONLY: } 1410 // [NOCPP[ 1411 pos = stateLoop(state, c, pos, buffer.getBuffer(), false, returnState, 1412 end); 1413 // ]NOCPP] 1414 if (pos == end) { 1415 // exiting due to end of buffer 1416 buffer.setStart(pos); 1417 } else { 1418 buffer.setStart(pos + 1); 1419 } 1420 return lastCR; 1421 } 1422 1423 // [NOCPP[ ensureBufferSpace(int inputLength)1424 private void ensureBufferSpace(int inputLength) throws SAXException { 1425 // Add 2 to account for emissions of LT_GT, LT_SOLIDUS and RSQB_RSQB. 1426 // Adding to the general worst case instead of only the 1427 // TreeBuilder-exposed worst case to avoid re-introducing a bug when 1428 // unifying the tokenizer and tree builder buffers in the future. 
1429 int worstCase = strBufLen + inputLength + charRefBufLen + 2; 1430 tokenHandler.ensureBufferSpace(worstCase); 1431 if (commentPolicy == XmlViolationPolicy.ALTER_INFOSET) { 1432 // When altering infoset, if the comment contents are consecutive 1433 // hyphens, each hyphen generates a space, too. These buffer 1434 // contents never get emitted as characters() to the tokenHandler, 1435 // which is why this calculation happens after the call to 1436 // ensureBufferSpace on tokenHandler. 1437 worstCase *= 2; 1438 } 1439 if (strBuf == null) { 1440 // Add an arbitrary small value to avoid immediate reallocation 1441 // once there are a few characters in the buffer. 1442 strBuf = new char[worstCase + 128]; 1443 } else if (worstCase > strBuf.length) { 1444 // HotSpot reportedly allocates memory with 8-byte accuracy, so 1445 // there's no point in trying to do math here to avoid slop. 1446 // Maybe we should add some small constant to worstCase here 1447 // but not doing that without profiling. In C++ with jemalloc, 1448 // the corresponding method should do math to round up here 1449 // to avoid slop. 
1450 char[] newBuf = new char[worstCase]; 1451 System.arraycopy(strBuf, 0, newBuf, 0, strBufLen); 1452 strBuf = newBuf; 1453 } 1454 } 1455 // ]NOCPP] 1456 stateLoop(int state, char c, int pos, @NoLength char[] buf, boolean reconsume, int returnState, int endPos)1457 @SuppressWarnings("unused") private int stateLoop(int state, char c, 1458 int pos, @NoLength char[] buf, boolean reconsume, int returnState, 1459 int endPos) throws SAXException { 1460 /* 1461 * Idioms used in this code: 1462 * 1463 * 1464 * Consuming the next input character 1465 * 1466 * To consume the next input character, the code does this: if (++pos == 1467 * endPos) { break stateloop; } c = checkChar(buf, pos); 1468 * 1469 * 1470 * Staying in a state 1471 * 1472 * When there's a state that the tokenizer may stay in over multiple 1473 * input characters, the state has a wrapper |for(;;)| loop and staying 1474 * in the state continues the loop. 1475 * 1476 * 1477 * Switching to another state 1478 * 1479 * To switch to another state, the code sets the state variable to the 1480 * magic number of the new state. Then it either continues stateloop or 1481 * breaks out of the state's own wrapper loop if the target state is 1482 * right after the current state in source order. (This is a partial 1483 * workaround for Java's lack of goto.) 1484 * 1485 * 1486 * Reconsume support 1487 * 1488 * The spec sometimes says that an input character is reconsumed in 1489 * another state. If a state can ever be entered so that an input 1490 * character can be reconsumed in it, the state's code starts with an 1491 * |if (reconsume)| that sets reconsume to false and skips over the 1492 * normal code for consuming a new character. 1493 * 1494 * To reconsume the current character in another state, the code sets 1495 * |reconsume| to true and then switches to the other state. 1496 * 1497 * 1498 * Emitting character tokens 1499 * 1500 * This method emits character tokens lazily. 
Whenever a new range of 1501 * character tokens starts, the field cstart must be set to the start 1502 * index of the range. The flushChars() method must be called at the end 1503 * of a range to flush it. 1504 * 1505 * 1506 * U+0000 handling 1507 * 1508 * The various states have to handle the replacement of U+0000 with 1509 * U+FFFD. However, if U+0000 would be reconsumed in another state, the 1510 * replacement doesn't need to happen, because it's handled by the 1511 * reconsuming state. 1512 * 1513 * 1514 * LF handling 1515 * 1516 * Every state needs to increment the line number upon LF unless the LF 1517 * gets reconsumed by another state which increments the line number. 1518 * 1519 * 1520 * CR handling 1521 * 1522 * Every state needs to handle CR unless the CR gets reconsumed and is 1523 * handled by the reconsuming state. The CR needs to be handled as if it 1524 * were and LF, the lastCR field must be set to true and then this 1525 * method must return. The IO driver will then swallow the next 1526 * character if it is an LF to coalesce CRLF. 1527 */ 1528 stateloop: for (;;) { 1529 switch (state) { 1530 case DATA: 1531 dataloop: for (;;) { 1532 if (reconsume) { 1533 reconsume = false; 1534 } else { 1535 if (++pos == endPos) { 1536 break stateloop; 1537 } 1538 c = checkChar(buf, pos); 1539 } 1540 switch (c) { 1541 case '&': 1542 /* 1543 * U+0026 AMPERSAND (&) Switch to the character 1544 * reference in data state. 1545 */ 1546 flushChars(buf, pos); 1547 assert charRefBufLen == 0: "charRefBufLen not reset after previous use!"; 1548 appendCharRefBuf(c); 1549 setAdditionalAndRememberAmpersandLocation('\u0000'); 1550 returnState = state; 1551 state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos); 1552 continue stateloop; 1553 case '<': 1554 /* 1555 * U+003C LESS-THAN SIGN (<) Switch to the tag 1556 * open state. 
1557 */ 1558 flushChars(buf, pos); 1559 1560 state = transition(state, Tokenizer.TAG_OPEN, reconsume, pos); 1561 break dataloop; // FALL THROUGH continue 1562 // stateloop; 1563 case '\u0000': 1564 emitReplacementCharacter(buf, pos); 1565 continue; 1566 case '\r': 1567 emitCarriageReturn(buf, pos); 1568 break stateloop; 1569 case '\n': 1570 silentLineFeed(); 1571 // CPPONLY: MOZ_FALLTHROUGH; 1572 default: 1573 /* 1574 * Anything else Emit the input character as a 1575 * character token. 1576 * 1577 * Stay in the data state. 1578 */ 1579 continue; 1580 } 1581 } 1582 // CPPONLY: MOZ_FALLTHROUGH; 1583 case TAG_OPEN: 1584 tagopenloop: for (;;) { 1585 /* 1586 * The behavior of this state depends on the content 1587 * model flag. 1588 */ 1589 if (++pos == endPos) { 1590 break stateloop; 1591 } 1592 c = checkChar(buf, pos); 1593 /* 1594 * If the content model flag is set to the PCDATA state 1595 * Consume the next input character: 1596 */ 1597 if (c >= 'A' && c <= 'Z') { 1598 /* 1599 * U+0041 LATIN CAPITAL LETTER A through to U+005A 1600 * LATIN CAPITAL LETTER Z Create a new start tag 1601 * token, 1602 */ 1603 endTag = false; 1604 /* 1605 * set its tag name to the lowercase version of the 1606 * input character (add 0x0020 to the character's 1607 * code point), 1608 */ 1609 clearStrBufBeforeUse(); 1610 appendStrBuf((char) (c + 0x20)); 1611 containsHyphen = false; 1612 /* then switch to the tag name state. */ 1613 state = transition(state, Tokenizer.TAG_NAME, reconsume, pos); 1614 /* 1615 * (Don't emit the token yet; further details will 1616 * be filled in before it is emitted.) 
1617 */ 1618 break tagopenloop; 1619 // continue stateloop; 1620 } else if (c >= 'a' && c <= 'z') { 1621 /* 1622 * U+0061 LATIN SMALL LETTER A through to U+007A 1623 * LATIN SMALL LETTER Z Create a new start tag 1624 * token, 1625 */ 1626 endTag = false; 1627 /* 1628 * set its tag name to the input character, 1629 */ 1630 clearStrBufBeforeUse(); 1631 appendStrBuf(c); 1632 containsHyphen = false; 1633 /* then switch to the tag name state. */ 1634 state = transition(state, Tokenizer.TAG_NAME, reconsume, pos); 1635 /* 1636 * (Don't emit the token yet; further details will 1637 * be filled in before it is emitted.) 1638 */ 1639 break tagopenloop; 1640 // continue stateloop; 1641 } 1642 switch (c) { 1643 case '!': 1644 /* 1645 * U+0021 EXCLAMATION MARK (!) Switch to the 1646 * markup declaration open state. 1647 */ 1648 state = transition(state, Tokenizer.MARKUP_DECLARATION_OPEN, reconsume, pos); 1649 continue stateloop; 1650 case '/': 1651 /* 1652 * U+002F SOLIDUS (/) Switch to the close tag 1653 * open state. 1654 */ 1655 state = transition(state, Tokenizer.CLOSE_TAG_OPEN, reconsume, pos); 1656 continue stateloop; 1657 case '?': 1658 // CPPONLY: if (viewingXmlSource) { 1659 // CPPONLY: state = transition(state, 1660 // CPPONLY: Tokenizer.PROCESSING_INSTRUCTION, 1661 // CPPONLY: reconsume, 1662 // CPPONLY: pos); 1663 // CPPONLY: continue stateloop; 1664 // CPPONLY: } 1665 /* 1666 * U+003F QUESTION MARK (?) Parse error. 1667 */ 1668 errProcessingInstruction(); 1669 /* 1670 * Switch to the bogus comment state. 1671 */ 1672 clearStrBufBeforeUse(); 1673 appendStrBuf(c); 1674 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); 1675 continue stateloop; 1676 case '>': 1677 /* 1678 * U+003E GREATER-THAN SIGN (>) Parse error. 1679 */ 1680 errLtGt(); 1681 /* 1682 * Emit a U+003C LESS-THAN SIGN character token 1683 * and a U+003E GREATER-THAN SIGN character 1684 * token. 1685 */ 1686 tokenHandler.characters(Tokenizer.LT_GT, 0, 2); 1687 /* Switch to the data state. 
*/ 1688 cstart = pos + 1; 1689 state = transition(state, Tokenizer.DATA, reconsume, pos); 1690 continue stateloop; 1691 default: 1692 /* 1693 * Anything else Parse error. 1694 */ 1695 errBadCharAfterLt(c); 1696 /* 1697 * Emit a U+003C LESS-THAN SIGN character token 1698 */ 1699 tokenHandler.characters(Tokenizer.LT_GT, 0, 1); 1700 /* 1701 * and reconsume the current input character in 1702 * the data state. 1703 */ 1704 cstart = pos; 1705 reconsume = true; 1706 state = transition(state, Tokenizer.DATA, reconsume, pos); 1707 continue stateloop; 1708 } 1709 } 1710 // CPPONLY: MOZ_FALLTHROUGH; 1711 case TAG_NAME: 1712 tagnameloop: for (;;) { 1713 if (++pos == endPos) { 1714 break stateloop; 1715 } 1716 c = checkChar(buf, pos); 1717 /* 1718 * Consume the next input character: 1719 */ 1720 switch (c) { 1721 case '\r': 1722 silentCarriageReturn(); 1723 strBufToElementNameString(); 1724 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); 1725 break stateloop; 1726 case '\n': 1727 silentLineFeed(); 1728 // CPPONLY: MOZ_FALLTHROUGH; 1729 case ' ': 1730 case '\t': 1731 case '\u000C': 1732 /* 1733 * U+0009 CHARACTER TABULATION U+000A LINE FEED 1734 * (LF) U+000C FORM FEED (FF) U+0020 SPACE 1735 * Switch to the before attribute name state. 1736 */ 1737 strBufToElementNameString(); 1738 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); 1739 break tagnameloop; 1740 // continue stateloop; 1741 case '/': 1742 /* 1743 * U+002F SOLIDUS (/) Switch to the self-closing 1744 * start tag state. 1745 */ 1746 strBufToElementNameString(); 1747 state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos); 1748 continue stateloop; 1749 case '>': 1750 /* 1751 * U+003E GREATER-THAN SIGN (>) Emit the current 1752 * tag token. 
1753 */ 1754 strBufToElementNameString(); 1755 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos); 1756 if (shouldSuspend) { 1757 break stateloop; 1758 } 1759 /* 1760 * Switch to the data state. 1761 */ 1762 continue stateloop; 1763 case '\u0000': 1764 c = '\uFFFD'; 1765 // CPPONLY: MOZ_FALLTHROUGH; 1766 default: 1767 if (c >= 'A' && c <= 'Z') { 1768 /* 1769 * U+0041 LATIN CAPITAL LETTER A through to 1770 * U+005A LATIN CAPITAL LETTER Z Append the 1771 * lowercase version of the current input 1772 * character (add 0x0020 to the character's 1773 * code point) to the current tag token's 1774 * tag name. 1775 */ 1776 c += 0x20; 1777 } else if (c == '-') { 1778 containsHyphen = true; 1779 } 1780 /* 1781 * Anything else Append the current input 1782 * character to the current tag token's tag 1783 * name. 1784 */ 1785 appendStrBuf(c); 1786 /* 1787 * Stay in the tag name state. 1788 */ 1789 continue; 1790 } 1791 } 1792 // CPPONLY: MOZ_FALLTHROUGH; 1793 case BEFORE_ATTRIBUTE_NAME: 1794 beforeattributenameloop: for (;;) { 1795 if (reconsume) { 1796 reconsume = false; 1797 } else { 1798 if (++pos == endPos) { 1799 break stateloop; 1800 } 1801 c = checkChar(buf, pos); 1802 } 1803 /* 1804 * Consume the next input character: 1805 */ 1806 switch (c) { 1807 case '\r': 1808 silentCarriageReturn(); 1809 break stateloop; 1810 case '\n': 1811 silentLineFeed(); 1812 // CPPONLY: MOZ_FALLTHROUGH; 1813 case ' ': 1814 case '\t': 1815 case '\u000C': 1816 /* 1817 * U+0009 CHARACTER TABULATION U+000A LINE FEED 1818 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay 1819 * in the before attribute name state. 1820 */ 1821 continue; 1822 case '/': 1823 /* 1824 * U+002F SOLIDUS (/) Switch to the self-closing 1825 * start tag state. 1826 */ 1827 state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos); 1828 continue stateloop; 1829 case '>': 1830 /* 1831 * U+003E GREATER-THAN SIGN (>) Emit the current 1832 * tag token. 
1833 */ 1834 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos); 1835 if (shouldSuspend) { 1836 break stateloop; 1837 } 1838 /* 1839 * Switch to the data state. 1840 */ 1841 continue stateloop; 1842 case '\u0000': 1843 c = '\uFFFD'; 1844 // CPPONLY: MOZ_FALLTHROUGH; 1845 case '\"': 1846 case '\'': 1847 case '<': 1848 case '=': 1849 /* 1850 * U+0022 QUOTATION MARK (") U+0027 APOSTROPHE 1851 * (') U+003C LESS-THAN SIGN (<) U+003D EQUALS 1852 * SIGN (=) Parse error. 1853 */ 1854 errBadCharBeforeAttributeNameOrNull(c); 1855 /* 1856 * Treat it as per the "anything else" entry 1857 * below. 1858 */ 1859 // CPPONLY: MOZ_FALLTHROUGH; 1860 default: 1861 /* 1862 * Anything else Start a new attribute in the 1863 * current tag token. 1864 */ 1865 if (c >= 'A' && c <= 'Z') { 1866 /* 1867 * U+0041 LATIN CAPITAL LETTER A through to 1868 * U+005A LATIN CAPITAL LETTER Z Set that 1869 * attribute's name to the lowercase version 1870 * of the current input character (add 1871 * 0x0020 to the character's code point) 1872 */ 1873 c += 0x20; 1874 } 1875 // CPPONLY: attributeLine = line; 1876 /* 1877 * Set that attribute's name to the current 1878 * input character, 1879 */ 1880 clearStrBufBeforeUse(); 1881 appendStrBuf(c); 1882 /* 1883 * and its value to the empty string. 1884 */ 1885 // Will do later. 1886 /* 1887 * Switch to the attribute name state. 
1888 */ 1889 state = transition(state, Tokenizer.ATTRIBUTE_NAME, reconsume, pos); 1890 break beforeattributenameloop; 1891 // continue stateloop; 1892 } 1893 } 1894 // CPPONLY: MOZ_FALLTHROUGH; 1895 case ATTRIBUTE_NAME: 1896 attributenameloop: for (;;) { 1897 if (++pos == endPos) { 1898 break stateloop; 1899 } 1900 c = checkChar(buf, pos); 1901 /* 1902 * Consume the next input character: 1903 */ 1904 switch (c) { 1905 case '\r': 1906 silentCarriageReturn(); 1907 attributeNameComplete(); 1908 state = transition(state, Tokenizer.AFTER_ATTRIBUTE_NAME, reconsume, pos); 1909 break stateloop; 1910 case '\n': 1911 silentLineFeed(); 1912 // CPPONLY: MOZ_FALLTHROUGH; 1913 case ' ': 1914 case '\t': 1915 case '\u000C': 1916 /* 1917 * U+0009 CHARACTER TABULATION U+000A LINE FEED 1918 * (LF) U+000C FORM FEED (FF) U+0020 SPACE 1919 * Switch to the after attribute name state. 1920 */ 1921 attributeNameComplete(); 1922 state = transition(state, Tokenizer.AFTER_ATTRIBUTE_NAME, reconsume, pos); 1923 continue stateloop; 1924 case '/': 1925 /* 1926 * U+002F SOLIDUS (/) Switch to the self-closing 1927 * start tag state. 1928 */ 1929 attributeNameComplete(); 1930 addAttributeWithoutValue(); 1931 state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos); 1932 continue stateloop; 1933 case '=': 1934 /* 1935 * U+003D EQUALS SIGN (=) Switch to the before 1936 * attribute value state. 1937 */ 1938 attributeNameComplete(); 1939 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_VALUE, reconsume, pos); 1940 break attributenameloop; 1941 // continue stateloop; 1942 case '>': 1943 /* 1944 * U+003E GREATER-THAN SIGN (>) Emit the current 1945 * tag token. 1946 */ 1947 attributeNameComplete(); 1948 addAttributeWithoutValue(); 1949 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos); 1950 if (shouldSuspend) { 1951 break stateloop; 1952 } 1953 /* 1954 * Switch to the data state. 
1955 */ 1956 continue stateloop; 1957 case '\u0000': 1958 c = '\uFFFD'; 1959 // CPPONLY: MOZ_FALLTHROUGH; 1960 case '\"': 1961 case '\'': 1962 case '<': 1963 /* 1964 * U+0022 QUOTATION MARK (") U+0027 APOSTROPHE 1965 * (') U+003C LESS-THAN SIGN (<) Parse error. 1966 */ 1967 errQuoteOrLtInAttributeNameOrNull(c); 1968 /* 1969 * Treat it as per the "anything else" entry 1970 * below. 1971 */ 1972 // CPPONLY: MOZ_FALLTHROUGH; 1973 default: 1974 if (c >= 'A' && c <= 'Z') { 1975 /* 1976 * U+0041 LATIN CAPITAL LETTER A through to 1977 * U+005A LATIN CAPITAL LETTER Z Append the 1978 * lowercase version of the current input 1979 * character (add 0x0020 to the character's 1980 * code point) to the current attribute's 1981 * name. 1982 */ 1983 c += 0x20; 1984 } 1985 /* 1986 * Anything else Append the current input 1987 * character to the current attribute's name. 1988 */ 1989 appendStrBuf(c); 1990 /* 1991 * Stay in the attribute name state. 1992 */ 1993 continue; 1994 } 1995 } 1996 // CPPONLY: MOZ_FALLTHROUGH; 1997 case BEFORE_ATTRIBUTE_VALUE: 1998 beforeattributevalueloop: for (;;) { 1999 if (++pos == endPos) { 2000 break stateloop; 2001 } 2002 c = checkChar(buf, pos); 2003 /* 2004 * Consume the next input character: 2005 */ 2006 switch (c) { 2007 case '\r': 2008 silentCarriageReturn(); 2009 break stateloop; 2010 case '\n': 2011 silentLineFeed(); 2012 // CPPONLY: MOZ_FALLTHROUGH; 2013 case ' ': 2014 case '\t': 2015 case '\u000C': 2016 /* 2017 * U+0009 CHARACTER TABULATION U+000A LINE FEED 2018 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay 2019 * in the before attribute value state. 2020 */ 2021 continue; 2022 case '"': 2023 /* 2024 * U+0022 QUOTATION MARK (") Switch to the 2025 * attribute value (double-quoted) state. 
2026 */ 2027 // CPPONLY: attributeLine = line; 2028 clearStrBufBeforeUse(); 2029 state = transition(state, Tokenizer.ATTRIBUTE_VALUE_DOUBLE_QUOTED, reconsume, pos); 2030 break beforeattributevalueloop; 2031 // continue stateloop; 2032 case '&': 2033 /* 2034 * U+0026 AMPERSAND (&) Switch to the attribute 2035 * value (unquoted) state and reconsume this 2036 * input character. 2037 */ 2038 // CPPONLY: attributeLine = line; 2039 clearStrBufBeforeUse(); 2040 reconsume = true; 2041 state = transition(state, Tokenizer.ATTRIBUTE_VALUE_UNQUOTED, reconsume, pos); 2042 noteUnquotedAttributeValue(); 2043 continue stateloop; 2044 case '\'': 2045 /* 2046 * U+0027 APOSTROPHE (') Switch to the attribute 2047 * value (single-quoted) state. 2048 */ 2049 // CPPONLY: attributeLine = line; 2050 clearStrBufBeforeUse(); 2051 state = transition(state, Tokenizer.ATTRIBUTE_VALUE_SINGLE_QUOTED, reconsume, pos); 2052 continue stateloop; 2053 case '>': 2054 /* 2055 * U+003E GREATER-THAN SIGN (>) Parse error. 2056 */ 2057 errAttributeValueMissing(); 2058 /* 2059 * Emit the current tag token. 2060 */ 2061 addAttributeWithoutValue(); 2062 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos); 2063 if (shouldSuspend) { 2064 break stateloop; 2065 } 2066 /* 2067 * Switch to the data state. 2068 */ 2069 continue stateloop; 2070 case '\u0000': 2071 c = '\uFFFD'; 2072 // CPPONLY: MOZ_FALLTHROUGH; 2073 case '<': 2074 case '=': 2075 case '`': 2076 /* 2077 * U+003C LESS-THAN SIGN (<) U+003D EQUALS SIGN 2078 * (=) U+0060 GRAVE ACCENT (`) 2079 */ 2080 errLtOrEqualsOrGraveInUnquotedAttributeOrNull(c); 2081 /* 2082 * Treat it as per the "anything else" entry 2083 * below. 2084 */ 2085 // CPPONLY: MOZ_FALLTHROUGH; 2086 default: 2087 // [NOCPP[ 2088 errHtml4NonNameInUnquotedAttribute(c); 2089 // ]NOCPP] 2090 /* 2091 * Anything else Append the current input 2092 * character to the current attribute's value. 
2093 */ 2094 // CPPONLY: attributeLine = line; 2095 clearStrBufBeforeUse(); 2096 appendStrBuf(c); 2097 /* 2098 * Switch to the attribute value (unquoted) 2099 * state. 2100 */ 2101 2102 state = transition(state, Tokenizer.ATTRIBUTE_VALUE_UNQUOTED, reconsume, pos); 2103 noteUnquotedAttributeValue(); 2104 continue stateloop; 2105 } 2106 } 2107 // CPPONLY: MOZ_FALLTHROUGH; 2108 case ATTRIBUTE_VALUE_DOUBLE_QUOTED: 2109 attributevaluedoublequotedloop: for (;;) { 2110 if (reconsume) { 2111 reconsume = false; 2112 } else { 2113 if (++pos == endPos) { 2114 break stateloop; 2115 } 2116 c = checkChar(buf, pos); 2117 } 2118 /* 2119 * Consume the next input character: 2120 */ 2121 switch (c) { 2122 case '"': 2123 /* 2124 * U+0022 QUOTATION MARK (") Switch to the after 2125 * attribute value (quoted) state. 2126 */ 2127 addAttributeWithValue(); 2128 2129 state = transition(state, Tokenizer.AFTER_ATTRIBUTE_VALUE_QUOTED, reconsume, pos); 2130 break attributevaluedoublequotedloop; 2131 // continue stateloop; 2132 case '&': 2133 /* 2134 * U+0026 AMPERSAND (&) Switch to the character 2135 * reference in attribute value state, with the 2136 * additional allowed character being U+0022 2137 * QUOTATION MARK ("). 2138 */ 2139 assert charRefBufLen == 0: "charRefBufLen not reset after previous use!"; 2140 appendCharRefBuf(c); 2141 setAdditionalAndRememberAmpersandLocation('\"'); 2142 returnState = state; 2143 state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos); 2144 continue stateloop; 2145 case '\r': 2146 appendStrBufCarriageReturn(); 2147 break stateloop; 2148 case '\n': 2149 appendStrBufLineFeed(); 2150 continue; 2151 case '\u0000': 2152 c = '\uFFFD'; 2153 // CPPONLY: MOZ_FALLTHROUGH; 2154 default: 2155 /* 2156 * Anything else Append the current input 2157 * character to the current attribute's value. 2158 */ 2159 appendStrBuf(c); 2160 /* 2161 * Stay in the attribute value (double-quoted) 2162 * state. 
2163 */ 2164 continue; 2165 } 2166 } 2167 // CPPONLY: MOZ_FALLTHROUGH; 2168 case AFTER_ATTRIBUTE_VALUE_QUOTED: 2169 afterattributevaluequotedloop: for (;;) { 2170 if (++pos == endPos) { 2171 break stateloop; 2172 } 2173 c = checkChar(buf, pos); 2174 /* 2175 * Consume the next input character: 2176 */ 2177 switch (c) { 2178 case '\r': 2179 silentCarriageReturn(); 2180 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); 2181 break stateloop; 2182 case '\n': 2183 silentLineFeed(); 2184 // CPPONLY: MOZ_FALLTHROUGH; 2185 case ' ': 2186 case '\t': 2187 case '\u000C': 2188 /* 2189 * U+0009 CHARACTER TABULATION U+000A LINE FEED 2190 * (LF) U+000C FORM FEED (FF) U+0020 SPACE 2191 * Switch to the before attribute name state. 2192 */ 2193 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); 2194 continue stateloop; 2195 case '/': 2196 /* 2197 * U+002F SOLIDUS (/) Switch to the self-closing 2198 * start tag state. 2199 */ 2200 state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos); 2201 break afterattributevaluequotedloop; 2202 // continue stateloop; 2203 case '>': 2204 /* 2205 * U+003E GREATER-THAN SIGN (>) Emit the current 2206 * tag token. 2207 */ 2208 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos); 2209 if (shouldSuspend) { 2210 break stateloop; 2211 } 2212 /* 2213 * Switch to the data state. 2214 */ 2215 continue stateloop; 2216 default: 2217 /* 2218 * Anything else Parse error. 2219 */ 2220 errNoSpaceBetweenAttributes(); 2221 /* 2222 * Reconsume the character in the before 2223 * attribute name state. 
2224 */ 2225 reconsume = true; 2226 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); 2227 continue stateloop; 2228 } 2229 } 2230 // CPPONLY: MOZ_FALLTHROUGH; 2231 case SELF_CLOSING_START_TAG: 2232 if (++pos == endPos) { 2233 break stateloop; 2234 } 2235 c = checkChar(buf, pos); 2236 /* 2237 * Consume the next input character: 2238 */ 2239 switch (c) { 2240 case '>': 2241 /* 2242 * U+003E GREATER-THAN SIGN (>) Set the self-closing 2243 * flag of the current tag token. Emit the current 2244 * tag token. 2245 */ 2246 // [NOCPP[ 2247 errHtml4XmlVoidSyntax(); 2248 // ]NOCPP] 2249 state = transition(state, emitCurrentTagToken(true, pos), reconsume, pos); 2250 if (shouldSuspend) { 2251 break stateloop; 2252 } 2253 /* 2254 * Switch to the data state. 2255 */ 2256 continue stateloop; 2257 default: 2258 /* Anything else Parse error. */ 2259 errSlashNotFollowedByGt(); 2260 /* 2261 * Reconsume the character in the before attribute 2262 * name state. 2263 */ 2264 reconsume = true; 2265 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); 2266 continue stateloop; 2267 } 2268 case ATTRIBUTE_VALUE_UNQUOTED: 2269 for (;;) { 2270 if (reconsume) { 2271 reconsume = false; 2272 } else { 2273 if (++pos == endPos) { 2274 break stateloop; 2275 } 2276 c = checkChar(buf, pos); 2277 } 2278 /* 2279 * Consume the next input character: 2280 */ 2281 switch (c) { 2282 case '\r': 2283 silentCarriageReturn(); 2284 addAttributeWithValue(); 2285 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); 2286 break stateloop; 2287 case '\n': 2288 silentLineFeed(); 2289 // CPPONLY: MOZ_FALLTHROUGH; 2290 case ' ': 2291 case '\t': 2292 case '\u000C': 2293 /* 2294 * U+0009 CHARACTER TABULATION U+000A LINE FEED 2295 * (LF) U+000C FORM FEED (FF) U+0020 SPACE 2296 * Switch to the before attribute name state. 
2297 */ 2298 addAttributeWithValue(); 2299 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); 2300 continue stateloop; 2301 case '&': 2302 /* 2303 * U+0026 AMPERSAND (&) Switch to the character 2304 * reference in attribute value state, with the 2305 * additional allowed character being U+003E 2306 * GREATER-THAN SIGN (>) 2307 */ 2308 assert charRefBufLen == 0: "charRefBufLen not reset after previous use!"; 2309 appendCharRefBuf(c); 2310 setAdditionalAndRememberAmpersandLocation('>'); 2311 returnState = state; 2312 state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos); 2313 continue stateloop; 2314 case '>': 2315 /* 2316 * U+003E GREATER-THAN SIGN (>) Emit the current 2317 * tag token. 2318 */ 2319 addAttributeWithValue(); 2320 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos); 2321 if (shouldSuspend) { 2322 break stateloop; 2323 } 2324 /* 2325 * Switch to the data state. 2326 */ 2327 continue stateloop; 2328 case '\u0000': 2329 c = '\uFFFD'; 2330 // CPPONLY: MOZ_FALLTHROUGH; 2331 case '<': 2332 case '\"': 2333 case '\'': 2334 case '=': 2335 case '`': 2336 /* 2337 * U+0022 QUOTATION MARK (") U+0027 APOSTROPHE 2338 * (') U+003C LESS-THAN SIGN (<) U+003D EQUALS 2339 * SIGN (=) U+0060 GRAVE ACCENT (`) Parse error. 2340 */ 2341 errUnquotedAttributeValOrNull(c); 2342 /* 2343 * Treat it as per the "anything else" entry 2344 * below. 2345 */ 2346 // CPPONLY: MOZ_FALLTHROUGH; 2347 default: 2348 // [NOCPP] 2349 errHtml4NonNameInUnquotedAttribute(c); 2350 // ]NOCPP] 2351 /* 2352 * Anything else Append the current input 2353 * character to the current attribute's value. 2354 */ 2355 appendStrBuf(c); 2356 /* 2357 * Stay in the attribute value (unquoted) state. 
2358 */ 2359 continue; 2360 } 2361 } 2362 case AFTER_ATTRIBUTE_NAME: 2363 for (;;) { 2364 if (++pos == endPos) { 2365 break stateloop; 2366 } 2367 c = checkChar(buf, pos); 2368 /* 2369 * Consume the next input character: 2370 */ 2371 switch (c) { 2372 case '\r': 2373 silentCarriageReturn(); 2374 break stateloop; 2375 case '\n': 2376 silentLineFeed(); 2377 // CPPONLY: MOZ_FALLTHROUGH; 2378 case ' ': 2379 case '\t': 2380 case '\u000C': 2381 /* 2382 * U+0009 CHARACTER TABULATION U+000A LINE FEED 2383 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay 2384 * in the after attribute name state. 2385 */ 2386 continue; 2387 case '/': 2388 /* 2389 * U+002F SOLIDUS (/) Switch to the self-closing 2390 * start tag state. 2391 */ 2392 addAttributeWithoutValue(); 2393 state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos); 2394 continue stateloop; 2395 case '=': 2396 /* 2397 * U+003D EQUALS SIGN (=) Switch to the before 2398 * attribute value state. 2399 */ 2400 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_VALUE, reconsume, pos); 2401 continue stateloop; 2402 case '>': 2403 /* 2404 * U+003E GREATER-THAN SIGN (>) Emit the current 2405 * tag token. 2406 */ 2407 addAttributeWithoutValue(); 2408 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos); 2409 if (shouldSuspend) { 2410 break stateloop; 2411 } 2412 /* 2413 * Switch to the data state. 2414 */ 2415 continue stateloop; 2416 case '\u0000': 2417 c = '\uFFFD'; 2418 // CPPONLY: MOZ_FALLTHROUGH; 2419 case '\"': 2420 case '\'': 2421 case '<': 2422 errQuoteOrLtInAttributeNameOrNull(c); 2423 /* 2424 * Treat it as per the "anything else" entry 2425 * below. 2426 */ 2427 // CPPONLY: MOZ_FALLTHROUGH; 2428 default: 2429 addAttributeWithoutValue(); 2430 /* 2431 * Anything else Start a new attribute in the 2432 * current tag token. 
2433 */ 2434 if (c >= 'A' && c <= 'Z') { 2435 /* 2436 * U+0041 LATIN CAPITAL LETTER A through to 2437 * U+005A LATIN CAPITAL LETTER Z Set that 2438 * attribute's name to the lowercase version 2439 * of the current input character (add 2440 * 0x0020 to the character's code point) 2441 */ 2442 c += 0x20; 2443 } 2444 /* 2445 * Set that attribute's name to the current 2446 * input character, 2447 */ 2448 clearStrBufBeforeUse(); 2449 appendStrBuf(c); 2450 /* 2451 * and its value to the empty string. 2452 */ 2453 // Will do later. 2454 /* 2455 * Switch to the attribute name state. 2456 */ 2457 state = transition(state, Tokenizer.ATTRIBUTE_NAME, reconsume, pos); 2458 continue stateloop; 2459 } 2460 } 2461 case MARKUP_DECLARATION_OPEN: 2462 markupdeclarationopenloop: for (;;) { 2463 if (++pos == endPos) { 2464 break stateloop; 2465 } 2466 c = checkChar(buf, pos); 2467 /* 2468 * If the next two characters are both U+002D 2469 * HYPHEN-MINUS characters (-), consume those two 2470 * characters, create a comment token whose data is the 2471 * empty string, and switch to the comment start state. 2472 * 2473 * Otherwise, if the next seven characters are an ASCII 2474 * case-insensitive match for the word "DOCTYPE", then 2475 * consume those characters and switch to the DOCTYPE 2476 * state. 2477 * 2478 * Otherwise, if the insertion mode is 2479 * "in foreign content" and the current node is not an 2480 * element in the HTML namespace and the next seven 2481 * characters are an case-sensitive match for the string 2482 * "[CDATA[" (the five uppercase letters "CDATA" with a 2483 * U+005B LEFT SQUARE BRACKET character before and 2484 * after), then consume those characters and switch to 2485 * the CDATA section state. 2486 * 2487 * Otherwise, is is a parse error. Switch to the bogus 2488 * comment state. The next character that is consumed, 2489 * if any, is the first character that will be in the 2490 * comment. 
2491 */ 2492 switch (c) { 2493 case '-': 2494 clearStrBufBeforeUse(); 2495 appendStrBuf(c); 2496 state = transition(state, Tokenizer.MARKUP_DECLARATION_HYPHEN, reconsume, pos); 2497 break markupdeclarationopenloop; 2498 // continue stateloop; 2499 case 'd': 2500 case 'D': 2501 clearStrBufBeforeUse(); 2502 appendStrBuf(c); 2503 index = 0; 2504 state = transition(state, Tokenizer.MARKUP_DECLARATION_OCTYPE, reconsume, pos); 2505 continue stateloop; 2506 case '[': 2507 if (tokenHandler.cdataSectionAllowed()) { 2508 clearStrBufBeforeUse(); 2509 appendStrBuf(c); 2510 index = 0; 2511 state = transition(state, Tokenizer.CDATA_START, reconsume, pos); 2512 continue stateloop; 2513 } 2514 // CPPONLY: MOZ_FALLTHROUGH; 2515 default: 2516 errBogusComment(); 2517 clearStrBufBeforeUse(); 2518 reconsume = true; 2519 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); 2520 continue stateloop; 2521 } 2522 } 2523 // CPPONLY: MOZ_FALLTHROUGH; 2524 case MARKUP_DECLARATION_HYPHEN: 2525 markupdeclarationhyphenloop: for (;;) { 2526 if (++pos == endPos) { 2527 break stateloop; 2528 } 2529 c = checkChar(buf, pos); 2530 switch (c) { 2531 case '\u0000': 2532 break stateloop; 2533 case '-': 2534 clearStrBufAfterOneHyphen(); 2535 state = transition(state, Tokenizer.COMMENT_START, reconsume, pos); 2536 break markupdeclarationhyphenloop; 2537 // continue stateloop; 2538 default: 2539 errBogusComment(); 2540 reconsume = true; 2541 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); 2542 continue stateloop; 2543 } 2544 } 2545 // CPPONLY: MOZ_FALLTHROUGH; 2546 case COMMENT_START: 2547 commentstartloop: for (;;) { 2548 if (++pos == endPos) { 2549 break stateloop; 2550 } 2551 c = checkChar(buf, pos); 2552 /* 2553 * Comment start state 2554 * 2555 * 2556 * Consume the next input character: 2557 */ 2558 switch (c) { 2559 case '-': 2560 /* 2561 * U+002D HYPHEN-MINUS (-) Switch to the comment 2562 * start dash state. 
2563 */ 2564 appendStrBuf(c); 2565 state = transition(state, Tokenizer.COMMENT_START_DASH, reconsume, pos); 2566 continue stateloop; 2567 case '>': 2568 /* 2569 * U+003E GREATER-THAN SIGN (>) Parse error. 2570 */ 2571 errPrematureEndOfComment(); 2572 /* Emit the comment token. */ 2573 emitComment(0, pos); 2574 /* 2575 * Switch to the data state. 2576 */ 2577 state = transition(state, Tokenizer.DATA, reconsume, pos); 2578 continue stateloop; 2579 case '\r': 2580 appendStrBufCarriageReturn(); 2581 state = transition(state, Tokenizer.COMMENT, reconsume, pos); 2582 break stateloop; 2583 case '\n': 2584 appendStrBufLineFeed(); 2585 state = transition(state, Tokenizer.COMMENT, reconsume, pos); 2586 break commentstartloop; 2587 case '\u0000': 2588 c = '\uFFFD'; 2589 // CPPONLY: MOZ_FALLTHROUGH; 2590 default: 2591 /* 2592 * Anything else Append the input character to 2593 * the comment token's data. 2594 */ 2595 appendStrBuf(c); 2596 /* 2597 * Switch to the comment state. 2598 */ 2599 state = transition(state, Tokenizer.COMMENT, reconsume, pos); 2600 break commentstartloop; 2601 // continue stateloop; 2602 } 2603 } 2604 // CPPONLY: MOZ_FALLTHROUGH; 2605 case COMMENT: 2606 commentloop: for (;;) { 2607 if (++pos == endPos) { 2608 break stateloop; 2609 } 2610 c = checkChar(buf, pos); 2611 /* 2612 * Comment state Consume the next input character: 2613 */ 2614 switch (c) { 2615 case '-': 2616 /* 2617 * U+002D HYPHEN-MINUS (-) Switch to the comment 2618 * end dash state 2619 */ 2620 appendStrBuf(c); 2621 state = transition(state, Tokenizer.COMMENT_END_DASH, reconsume, pos); 2622 break commentloop; 2623 // continue stateloop; 2624 case '\r': 2625 appendStrBufCarriageReturn(); 2626 break stateloop; 2627 case '\n': 2628 appendStrBufLineFeed(); 2629 continue; 2630 case '\u0000': 2631 c = '\uFFFD'; 2632 // CPPONLY: MOZ_FALLTHROUGH; 2633 default: 2634 /* 2635 * Anything else Append the input character to 2636 * the comment token's data. 
2637 */ 2638 appendStrBuf(c); 2639 /* 2640 * Stay in the comment state. 2641 */ 2642 continue; 2643 } 2644 } 2645 // CPPONLY: MOZ_FALLTHROUGH; 2646 case COMMENT_END_DASH: 2647 commentenddashloop: for (;;) { 2648 if (++pos == endPos) { 2649 break stateloop; 2650 } 2651 c = checkChar(buf, pos); 2652 /* 2653 * Comment end dash state Consume the next input 2654 * character: 2655 */ 2656 switch (c) { 2657 case '-': 2658 /* 2659 * U+002D HYPHEN-MINUS (-) Switch to the comment 2660 * end state 2661 */ 2662 appendStrBuf(c); 2663 state = transition(state, Tokenizer.COMMENT_END, reconsume, pos); 2664 break commentenddashloop; 2665 // continue stateloop; 2666 case '\r': 2667 appendStrBufCarriageReturn(); 2668 state = transition(state, Tokenizer.COMMENT, reconsume, pos); 2669 break stateloop; 2670 case '\n': 2671 appendStrBufLineFeed(); 2672 state = transition(state, Tokenizer.COMMENT, reconsume, pos); 2673 continue stateloop; 2674 case '\u0000': 2675 c = '\uFFFD'; 2676 // CPPONLY: MOZ_FALLTHROUGH; 2677 default: 2678 /* 2679 * Anything else Append a U+002D HYPHEN-MINUS 2680 * (-) character and the input character to the 2681 * comment token's data. 2682 */ 2683 appendStrBuf(c); 2684 /* 2685 * Switch to the comment state. 2686 */ 2687 state = transition(state, Tokenizer.COMMENT, reconsume, pos); 2688 continue stateloop; 2689 } 2690 } 2691 // CPPONLY: MOZ_FALLTHROUGH; 2692 case COMMENT_END: 2693 commentendloop: for (;;) { 2694 if (++pos == endPos) { 2695 break stateloop; 2696 } 2697 c = checkChar(buf, pos); 2698 /* 2699 * Comment end dash state Consume the next input 2700 * character: 2701 */ 2702 switch (c) { 2703 case '>': 2704 /* 2705 * U+003E GREATER-THAN SIGN (>) Emit the comment 2706 * token. 2707 */ 2708 emitComment(2, pos); 2709 /* 2710 * Switch to the data state. 2711 */ 2712 state = transition(state, Tokenizer.DATA, reconsume, pos); 2713 continue stateloop; 2714 case '-': 2715 /* U+002D HYPHEN-MINUS (-) Parse error. 
*/ 2716 /* 2717 * Append a U+002D HYPHEN-MINUS (-) character to 2718 * the comment token's data. 2719 */ 2720 adjustDoubleHyphenAndAppendToStrBufAndErr(c); 2721 /* 2722 * Stay in the comment end state. 2723 */ 2724 continue; 2725 case '\r': 2726 adjustDoubleHyphenAndAppendToStrBufCarriageReturn(); 2727 state = transition(state, Tokenizer.COMMENT, reconsume, pos); 2728 break stateloop; 2729 case '\n': 2730 adjustDoubleHyphenAndAppendToStrBufLineFeed(); 2731 state = transition(state, Tokenizer.COMMENT, reconsume, pos); 2732 continue stateloop; 2733 case '!': 2734 errHyphenHyphenBang(); 2735 appendStrBuf(c); 2736 state = transition(state, Tokenizer.COMMENT_END_BANG, reconsume, pos); 2737 continue stateloop; 2738 case '\u0000': 2739 c = '\uFFFD'; 2740 // CPPONLY: MOZ_FALLTHROUGH; 2741 default: 2742 /* 2743 * Append two U+002D HYPHEN-MINUS (-) characters 2744 * and the input character to the comment 2745 * token's data. 2746 */ 2747 adjustDoubleHyphenAndAppendToStrBufAndErr(c); 2748 /* 2749 * Switch to the comment state. 2750 */ 2751 state = transition(state, Tokenizer.COMMENT, reconsume, pos); 2752 continue stateloop; 2753 } 2754 } 2755 case COMMENT_END_BANG: 2756 for (;;) { 2757 if (++pos == endPos) { 2758 break stateloop; 2759 } 2760 c = checkChar(buf, pos); 2761 /* 2762 * Comment end bang state 2763 * 2764 * Consume the next input character: 2765 */ 2766 switch (c) { 2767 case '>': 2768 /* 2769 * U+003E GREATER-THAN SIGN (>) Emit the comment 2770 * token. 2771 */ 2772 emitComment(3, pos); 2773 /* 2774 * Switch to the data state. 2775 */ 2776 state = transition(state, Tokenizer.DATA, reconsume, pos); 2777 continue stateloop; 2778 case '-': 2779 /* 2780 * Append two U+002D HYPHEN-MINUS (-) characters 2781 * and a U+0021 EXCLAMATION MARK (!) character 2782 * to the comment token's data. 2783 */ 2784 appendStrBuf(c); 2785 /* 2786 * Switch to the comment end dash state. 
2787 */ 2788 state = transition(state, Tokenizer.COMMENT_END_DASH, reconsume, pos); 2789 continue stateloop; 2790 case '\r': 2791 appendStrBufCarriageReturn(); 2792 break stateloop; 2793 case '\n': 2794 appendStrBufLineFeed(); 2795 continue; 2796 case '\u0000': 2797 c = '\uFFFD'; 2798 // CPPONLY: MOZ_FALLTHROUGH; 2799 default: 2800 /* 2801 * Anything else Append two U+002D HYPHEN-MINUS 2802 * (-) characters, a U+0021 EXCLAMATION MARK (!) 2803 * character, and the input character to the 2804 * comment token's data. Switch to the comment 2805 * state. 2806 */ 2807 appendStrBuf(c); 2808 /* 2809 * Switch to the comment state. 2810 */ 2811 state = transition(state, Tokenizer.COMMENT, reconsume, pos); 2812 continue stateloop; 2813 } 2814 } 2815 case COMMENT_START_DASH: 2816 if (++pos == endPos) { 2817 break stateloop; 2818 } 2819 c = checkChar(buf, pos); 2820 /* 2821 * Comment start dash state 2822 * 2823 * Consume the next input character: 2824 */ 2825 switch (c) { 2826 case '-': 2827 /* 2828 * U+002D HYPHEN-MINUS (-) Switch to the comment end 2829 * state 2830 */ 2831 appendStrBuf(c); 2832 state = transition(state, Tokenizer.COMMENT_END, reconsume, pos); 2833 continue stateloop; 2834 case '>': 2835 errPrematureEndOfComment(); 2836 /* Emit the comment token. */ 2837 emitComment(1, pos); 2838 /* 2839 * Switch to the data state. 2840 */ 2841 state = transition(state, Tokenizer.DATA, reconsume, pos); 2842 continue stateloop; 2843 case '\r': 2844 appendStrBufCarriageReturn(); 2845 state = transition(state, Tokenizer.COMMENT, reconsume, pos); 2846 break stateloop; 2847 case '\n': 2848 appendStrBufLineFeed(); 2849 state = transition(state, Tokenizer.COMMENT, reconsume, pos); 2850 continue stateloop; 2851 case '\u0000': 2852 c = '\uFFFD'; 2853 // CPPONLY: MOZ_FALLTHROUGH; 2854 default: 2855 /* 2856 * Append a U+002D HYPHEN-MINUS character (-) and 2857 * the current input character to the comment 2858 * token's data. 
2859 */ 2860 appendStrBuf(c); 2861 /* 2862 * Switch to the comment state. 2863 */ 2864 state = transition(state, Tokenizer.COMMENT, reconsume, pos); 2865 continue stateloop; 2866 } 2867 case CDATA_START: 2868 for (;;) { 2869 if (++pos == endPos) { 2870 break stateloop; 2871 } 2872 c = checkChar(buf, pos); 2873 if (index < 6) { // CDATA_LSQB.length 2874 if (c == Tokenizer.CDATA_LSQB[index]) { 2875 appendStrBuf(c); 2876 } else { 2877 errBogusComment(); 2878 reconsume = true; 2879 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); 2880 continue stateloop; 2881 } 2882 index++; 2883 continue; 2884 } else { 2885 clearStrBufAfterUse(); 2886 cstart = pos; // start coalescing 2887 reconsume = true; 2888 state = transition(state, Tokenizer.CDATA_SECTION, reconsume, pos); 2889 break; // FALL THROUGH continue stateloop; 2890 } 2891 } 2892 // CPPONLY: MOZ_FALLTHROUGH; 2893 case CDATA_SECTION: 2894 cdatasectionloop: for (;;) { 2895 if (reconsume) { 2896 reconsume = false; 2897 } else { 2898 if (++pos == endPos) { 2899 break stateloop; 2900 } 2901 c = checkChar(buf, pos); 2902 } 2903 switch (c) { 2904 case ']': 2905 flushChars(buf, pos); 2906 state = transition(state, Tokenizer.CDATA_RSQB, reconsume, pos); 2907 break cdatasectionloop; // FALL THROUGH 2908 case '\u0000': 2909 emitReplacementCharacter(buf, pos); 2910 continue; 2911 case '\r': 2912 emitCarriageReturn(buf, pos); 2913 break stateloop; 2914 case '\n': 2915 silentLineFeed(); 2916 // CPPONLY: MOZ_FALLTHROUGH; 2917 default: 2918 continue; 2919 } 2920 } 2921 // CPPONLY: MOZ_FALLTHROUGH; 2922 case CDATA_RSQB: 2923 cdatarsqb: for (;;) { 2924 if (++pos == endPos) { 2925 break stateloop; 2926 } 2927 c = checkChar(buf, pos); 2928 switch (c) { 2929 case ']': 2930 state = transition(state, Tokenizer.CDATA_RSQB_RSQB, reconsume, pos); 2931 break cdatarsqb; 2932 default: 2933 tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 2934 1); 2935 cstart = pos; 2936 reconsume = true; 2937 state = transition(state, 
Tokenizer.CDATA_SECTION, reconsume, pos); 2938 continue stateloop; 2939 } 2940 } 2941 // CPPONLY: MOZ_FALLTHROUGH; 2942 case CDATA_RSQB_RSQB: 2943 cdatarsqbrsqb: for (;;) { 2944 if (++pos == endPos) { 2945 break stateloop; 2946 } 2947 c = checkChar(buf, pos); 2948 switch (c) { 2949 case ']': 2950 // Saw a third ]. Emit one ] (logically the 2951 // first one) and stay in this state to 2952 // remember that the last two characters seen 2953 // have been ]]. 2954 tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 1); 2955 continue; 2956 case '>': 2957 cstart = pos + 1; 2958 state = transition(state, Tokenizer.DATA, reconsume, pos); 2959 continue stateloop; 2960 default: 2961 tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 2); 2962 cstart = pos; 2963 reconsume = true; 2964 state = transition(state, Tokenizer.CDATA_SECTION, reconsume, pos); 2965 continue stateloop; 2966 } 2967 } 2968 case ATTRIBUTE_VALUE_SINGLE_QUOTED: 2969 attributevaluesinglequotedloop: for (;;) { 2970 if (reconsume) { 2971 reconsume = false; 2972 } else { 2973 if (++pos == endPos) { 2974 break stateloop; 2975 } 2976 c = checkChar(buf, pos); 2977 } 2978 /* 2979 * Consume the next input character: 2980 */ 2981 switch (c) { 2982 case '\'': 2983 /* 2984 * U+0027 APOSTROPHE (') Switch to the after 2985 * attribute value (quoted) state. 2986 */ 2987 addAttributeWithValue(); 2988 2989 state = transition(state, Tokenizer.AFTER_ATTRIBUTE_VALUE_QUOTED, reconsume, pos); 2990 continue stateloop; 2991 case '&': 2992 /* 2993 * U+0026 AMPERSAND (&) Switch to the character 2994 * reference in attribute value state, with the 2995 * + additional allowed character being U+0027 2996 * APOSTROPHE ('). 
2997 */ 2998 assert charRefBufLen == 0: "charRefBufLen not reset after previous use!"; 2999 appendCharRefBuf(c); 3000 setAdditionalAndRememberAmpersandLocation('\''); 3001 returnState = state; 3002 state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos); 3003 break attributevaluesinglequotedloop; 3004 // continue stateloop; 3005 case '\r': 3006 appendStrBufCarriageReturn(); 3007 break stateloop; 3008 case '\n': 3009 appendStrBufLineFeed(); 3010 continue; 3011 case '\u0000': 3012 c = '\uFFFD'; 3013 // CPPONLY: MOZ_FALLTHROUGH; 3014 default: 3015 /* 3016 * Anything else Append the current input 3017 * character to the current attribute's value. 3018 */ 3019 appendStrBuf(c); 3020 /* 3021 * Stay in the attribute value (double-quoted) 3022 * state. 3023 */ 3024 continue; 3025 } 3026 } 3027 // CPPONLY: MOZ_FALLTHROUGH; 3028 case CONSUME_CHARACTER_REFERENCE: 3029 if (++pos == endPos) { 3030 break stateloop; 3031 } 3032 c = checkChar(buf, pos); 3033 if (c == '\u0000') { 3034 break stateloop; 3035 } 3036 /* 3037 * Unlike the definition is the spec, this state does not 3038 * return a value and never requires the caller to 3039 * backtrack. This state takes care of emitting characters 3040 * or appending to the current attribute value. It also 3041 * takes care of that in the case when consuming the 3042 * character reference fails. 3043 */ 3044 /* 3045 * This section defines how to consume a character 3046 * reference. This definition is used when parsing character 3047 * references in text and in attributes. 3048 * 3049 * The behavior depends on the identity of the next 3050 * character (the one immediately after the U+0026 AMPERSAND 3051 * character): 3052 */ 3053 switch (c) { 3054 case ' ': 3055 case '\t': 3056 case '\n': 3057 case '\r': // we'll reconsume! 
3058 case '\u000C': 3059 case '<': 3060 case '&': 3061 emitOrAppendCharRefBuf(returnState); 3062 if ((returnState & DATA_AND_RCDATA_MASK) == 0) { 3063 cstart = pos; 3064 } 3065 reconsume = true; 3066 state = transition(state, returnState, reconsume, pos); 3067 continue stateloop; 3068 case '#': 3069 /* 3070 * U+0023 NUMBER SIGN (#) Consume the U+0023 NUMBER 3071 * SIGN. 3072 */ 3073 appendCharRefBuf('#'); 3074 state = transition(state, Tokenizer.CONSUME_NCR, reconsume, pos); 3075 continue stateloop; 3076 default: 3077 if (c == additional) { 3078 emitOrAppendCharRefBuf(returnState); 3079 reconsume = true; 3080 state = transition(state, returnState, reconsume, pos); 3081 continue stateloop; 3082 } 3083 if (c >= 'a' && c <= 'z') { 3084 firstCharKey = c - 'a' + 26; 3085 } else if (c >= 'A' && c <= 'Z') { 3086 firstCharKey = c - 'A'; 3087 } else { 3088 // No match 3089 /* 3090 * If no match can be made, then this is a parse 3091 * error. 3092 */ 3093 errNoNamedCharacterMatch(); 3094 emitOrAppendCharRefBuf(returnState); 3095 if ((returnState & DATA_AND_RCDATA_MASK) == 0) { 3096 cstart = pos; 3097 } 3098 reconsume = true; 3099 state = transition(state, returnState, reconsume, pos); 3100 continue stateloop; 3101 } 3102 // Didn't fail yet 3103 appendCharRefBuf(c); 3104 state = transition(state, Tokenizer.CHARACTER_REFERENCE_HILO_LOOKUP, reconsume, pos); 3105 // FALL THROUGH continue stateloop; 3106 } 3107 // CPPONLY: MOZ_FALLTHROUGH; 3108 case CHARACTER_REFERENCE_HILO_LOOKUP: 3109 { 3110 if (++pos == endPos) { 3111 break stateloop; 3112 } 3113 c = checkChar(buf, pos); 3114 if (c == '\u0000') { 3115 break stateloop; 3116 } 3117 /* 3118 * The data structure is as follows: 3119 * 3120 * HILO_ACCEL is a two-dimensional int array whose major 3121 * index corresponds to the second character of the 3122 * character reference (code point as index) and the 3123 * minor index corresponds to the first character of the 3124 * character reference (packed so that A-Z runs from 0 3125 * 
to 25 and a-z runs from 26 to 51). This layout makes 3126 * it easier to use the sparseness of the data structure 3127 * to omit parts of it: The second dimension of the 3128 * table is null when no character reference starts with 3129 * the character corresponding to that row. 3130 * 3131 * The int value HILO_ACCEL (by these indeces) is zero 3132 * if there exists no character reference starting with 3133 * that two-letter prefix. Otherwise, the value is an 3134 * int that packs two shorts so that the higher short is 3135 * the index of the highest character reference name 3136 * with that prefix in NAMES and the lower short 3137 * corresponds to the index of the lowest character 3138 * reference name with that prefix. (It happens that the 3139 * first two character reference names share their 3140 * prefix so the packed int cannot be 0 by packing the 3141 * two shorts.) 3142 * 3143 * NAMES is an array of byte arrays where each byte 3144 * array encodes the name of a character references as 3145 * ASCII. The names omit the first two letters of the 3146 * name. (Since storing the first two letters would be 3147 * redundant with the data contained in HILO_ACCEL.) The 3148 * entries are lexically sorted. 3149 * 3150 * For a given index in NAMES, the same index in VALUES 3151 * contains the corresponding expansion as an array of 3152 * two UTF-16 code units (either the character and 3153 * U+0000 or a suggogate pair). 3154 */ 3155 int hilo = 0; 3156 if (c <= 'z') { 3157 @Const @NoLength int[] row = NamedCharactersAccel.HILO_ACCEL[c]; 3158 if (row != null) { 3159 hilo = row[firstCharKey]; 3160 } 3161 } 3162 if (hilo == 0) { 3163 /* 3164 * If no match can be made, then this is a parse 3165 * error. 
3166 */ 3167 errNoNamedCharacterMatch(); 3168 emitOrAppendCharRefBuf(returnState); 3169 if ((returnState & DATA_AND_RCDATA_MASK) == 0) { 3170 cstart = pos; 3171 } 3172 reconsume = true; 3173 state = transition(state, returnState, reconsume, pos); 3174 continue stateloop; 3175 } 3176 // Didn't fail yet 3177 appendCharRefBuf(c); 3178 lo = hilo & 0xFFFF; 3179 hi = hilo >> 16; 3180 entCol = -1; 3181 candidate = -1; 3182 charRefBufMark = 0; 3183 state = transition(state, Tokenizer.CHARACTER_REFERENCE_TAIL, reconsume, pos); 3184 // FALL THROUGH continue stateloop; 3185 } 3186 // CPPONLY: MOZ_FALLTHROUGH; 3187 case CHARACTER_REFERENCE_TAIL: 3188 outer: for (;;) { 3189 if (++pos == endPos) { 3190 break stateloop; 3191 } 3192 c = checkChar(buf, pos); 3193 if (c == '\u0000') { 3194 break stateloop; 3195 } 3196 entCol++; 3197 /* 3198 * Consume the maximum number of characters possible, 3199 * with the consumed characters matching one of the 3200 * identifiers in the first column of the named 3201 * character references table (in a case-sensitive 3202 * manner). 3203 */ 3204 loloop: for (;;) { 3205 if (hi < lo) { 3206 break outer; 3207 } 3208 if (entCol == NamedCharacters.NAMES[lo].length()) { 3209 candidate = lo; 3210 charRefBufMark = charRefBufLen; 3211 lo++; 3212 } else if (entCol > NamedCharacters.NAMES[lo].length()) { 3213 break outer; 3214 } else if (c > NamedCharacters.NAMES[lo].charAt(entCol)) { 3215 lo++; 3216 } else { 3217 break loloop; 3218 } 3219 } 3220 3221 hiloop: for (;;) { 3222 if (hi < lo) { 3223 break outer; 3224 } 3225 if (entCol == NamedCharacters.NAMES[hi].length()) { 3226 break hiloop; 3227 } 3228 if (entCol > NamedCharacters.NAMES[hi].length()) { 3229 break outer; 3230 } else if (c < NamedCharacters.NAMES[hi].charAt(entCol)) { 3231 hi--; 3232 } else { 3233 break hiloop; 3234 } 3235 } 3236 3237 if (c == ';') { 3238 // If we see a semicolon, there cannot be a 3239 // longer match. Break the loop. 
However, before 3240 // breaking, take the longest match so far as the 3241 // candidate, if we are just about to complete a 3242 // match. 3243 if (entCol + 1 == NamedCharacters.NAMES[lo].length()) { 3244 candidate = lo; 3245 charRefBufMark = charRefBufLen; 3246 } 3247 break outer; 3248 } 3249 3250 if (hi < lo) { 3251 break outer; 3252 } 3253 appendCharRefBuf(c); 3254 continue; 3255 } 3256 3257 if (candidate == -1) { 3258 // reconsume deals with CR, LF or nul 3259 /* 3260 * If no match can be made, then this is a parse error. 3261 */ 3262 errNoNamedCharacterMatch(); 3263 emitOrAppendCharRefBuf(returnState); 3264 if ((returnState & DATA_AND_RCDATA_MASK) == 0) { 3265 cstart = pos; 3266 } 3267 reconsume = true; 3268 state = transition(state, returnState, reconsume, pos); 3269 continue stateloop; 3270 } else { 3271 // c can't be CR, LF or nul if we got here 3272 @Const @CharacterName String candidateName = NamedCharacters.NAMES[candidate]; 3273 if (candidateName.length() == 0 3274 || candidateName.charAt(candidateName.length() - 1) != ';') { 3275 /* 3276 * If the last character matched is not a U+003B 3277 * SEMICOLON (;), there is a parse error. 
3278 */ 3279 if ((returnState & DATA_AND_RCDATA_MASK) != 0) { 3280 /* 3281 * If the entity is being consumed as part of an 3282 * attribute, and the last character matched is 3283 * not a U+003B SEMICOLON (;), 3284 */ 3285 char ch; 3286 if (charRefBufMark == charRefBufLen) { 3287 ch = c; 3288 } else { 3289 ch = charRefBuf[charRefBufMark]; 3290 } 3291 if (ch == '=' || (ch >= '0' && ch <= '9') 3292 || (ch >= 'A' && ch <= 'Z') 3293 || (ch >= 'a' && ch <= 'z')) { 3294 /* 3295 * and the next character is either a U+003D 3296 * EQUALS SIGN character (=) or in the range 3297 * U+0030 DIGIT ZERO to U+0039 DIGIT NINE, 3298 * U+0041 LATIN CAPITAL LETTER A to U+005A 3299 * LATIN CAPITAL LETTER Z, or U+0061 LATIN 3300 * SMALL LETTER A to U+007A LATIN SMALL 3301 * LETTER Z, then, for historical reasons, 3302 * all the characters that were matched 3303 * after the U+0026 AMPERSAND (&) must be 3304 * unconsumed, and nothing is returned. 3305 */ 3306 errNoNamedCharacterMatch(); 3307 appendCharRefBufToStrBuf(); 3308 reconsume = true; 3309 state = transition(state, returnState, reconsume, pos); 3310 continue stateloop; 3311 } 3312 } 3313 if ((returnState & DATA_AND_RCDATA_MASK) != 0) { 3314 errUnescapedAmpersandInterpretedAsCharacterReference(); 3315 } else { 3316 errNotSemicolonTerminated(); 3317 } 3318 } 3319 3320 /* 3321 * Otherwise, return a character token for the character 3322 * corresponding to the entity name (as given by the 3323 * second column of the named character references 3324 * table). 3325 */ 3326 // CPPONLY: completedNamedCharacterReference(); 3327 @Const @NoLength char[] val = NamedCharacters.VALUES[candidate]; 3328 if ( 3329 // [NOCPP[ 3330 val.length == 1 3331 // ]NOCPP] 3332 // CPPONLY: val[1] == 0 3333 ) { 3334 emitOrAppendOne(val, returnState); 3335 } else { 3336 emitOrAppendTwo(val, returnState); 3337 } 3338 // this is so complicated! 
3339 if (charRefBufMark < charRefBufLen) { 3340 if ((returnState & DATA_AND_RCDATA_MASK) != 0) { 3341 appendStrBuf(charRefBuf, charRefBufMark, 3342 charRefBufLen - charRefBufMark); 3343 } else { 3344 tokenHandler.characters(charRefBuf, charRefBufMark, 3345 charRefBufLen - charRefBufMark); 3346 } 3347 } 3348 // charRefBufLen will be zeroed below! 3349 3350 // Check if we broke out early with c being the last 3351 // character that matched as opposed to being the 3352 // first one that didn't match. In the case of an 3353 // early break, the next run on text should start 3354 // *after* the current character and the current 3355 // character shouldn't be reconsumed. 3356 boolean earlyBreak = (c == ';' && charRefBufMark == charRefBufLen); 3357 charRefBufLen = 0; 3358 if ((returnState & DATA_AND_RCDATA_MASK) == 0) { 3359 cstart = earlyBreak ? pos + 1 : pos; 3360 } 3361 reconsume = !earlyBreak; 3362 state = transition(state, returnState, reconsume, pos); 3363 continue stateloop; 3364 /* 3365 * If the markup contains I'm ¬it; I tell you, the 3366 * entity is parsed as "not", as in, I'm ¬it; I tell 3367 * you. But if the markup was I'm ∉ I tell you, 3368 * the entity would be parsed as "notin;", resulting in 3369 * I'm ∉ I tell you. 3370 */ 3371 } 3372 case CONSUME_NCR: 3373 if (++pos == endPos) { 3374 break stateloop; 3375 } 3376 c = checkChar(buf, pos); 3377 value = 0; 3378 seenDigits = false; 3379 /* 3380 * The behavior further depends on the character after the 3381 * U+0023 NUMBER SIGN: 3382 */ 3383 switch (c) { 3384 case 'x': 3385 case 'X': 3386 3387 /* 3388 * U+0078 LATIN SMALL LETTER X U+0058 LATIN CAPITAL 3389 * LETTER X Consume the X. 3390 * 3391 * Follow the steps below, but using the range of 3392 * characters U+0030 DIGIT ZERO through to U+0039 3393 * DIGIT NINE, U+0061 LATIN SMALL LETTER A through 3394 * to U+0066 LATIN SMALL LETTER F, and U+0041 LATIN 3395 * CAPITAL LETTER A, through to U+0046 LATIN CAPITAL 3396 * LETTER F (in other words, 0-9, A-F, a-f). 
3397 * 3398 * When it comes to interpreting the number, 3399 * interpret it as a hexadecimal number. 3400 */ 3401 appendCharRefBuf(c); 3402 state = transition(state, Tokenizer.HEX_NCR_LOOP, reconsume, pos); 3403 continue stateloop; 3404 default: 3405 /* 3406 * Anything else Follow the steps below, but using 3407 * the range of characters U+0030 DIGIT ZERO through 3408 * to U+0039 DIGIT NINE (i.e. just 0-9). 3409 * 3410 * When it comes to interpreting the number, 3411 * interpret it as a decimal number. 3412 */ 3413 reconsume = true; 3414 state = transition(state, Tokenizer.DECIMAL_NRC_LOOP, reconsume, pos); 3415 // FALL THROUGH continue stateloop; 3416 } 3417 // CPPONLY: MOZ_FALLTHROUGH; 3418 case DECIMAL_NRC_LOOP: 3419 decimalloop: for (;;) { 3420 if (reconsume) { 3421 reconsume = false; 3422 } else { 3423 if (++pos == endPos) { 3424 break stateloop; 3425 } 3426 c = checkChar(buf, pos); 3427 } 3428 /* 3429 * Consume as many characters as match the range of 3430 * characters given above. 
3431 */ 3432 assert value >= 0: "value must not become negative."; 3433 if (c >= '0' && c <= '9') { 3434 seenDigits = true; 3435 // Avoid overflow 3436 if (value <= 0x10FFFF) { 3437 value *= 10; 3438 value += c - '0'; 3439 } 3440 continue; 3441 } else if (c == ';') { 3442 if (seenDigits) { 3443 if ((returnState & DATA_AND_RCDATA_MASK) == 0) { 3444 cstart = pos + 1; 3445 } 3446 state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos); 3447 // FALL THROUGH continue stateloop; 3448 break decimalloop; 3449 } else { 3450 errNoDigitsInNCR(); 3451 appendCharRefBuf(';'); 3452 emitOrAppendCharRefBuf(returnState); 3453 if ((returnState & DATA_AND_RCDATA_MASK) == 0) { 3454 cstart = pos + 1; 3455 } 3456 state = transition(state, returnState, reconsume, pos); 3457 continue stateloop; 3458 } 3459 } else { 3460 /* 3461 * If no characters match the range, then don't 3462 * consume any characters (and unconsume the U+0023 3463 * NUMBER SIGN character and, if appropriate, the X 3464 * character). This is a parse error; nothing is 3465 * returned. 3466 * 3467 * Otherwise, if the next character is a U+003B 3468 * SEMICOLON, consume that too. If it isn't, there 3469 * is a parse error. 3470 */ 3471 if (!seenDigits) { 3472 errNoDigitsInNCR(); 3473 emitOrAppendCharRefBuf(returnState); 3474 if ((returnState & DATA_AND_RCDATA_MASK) == 0) { 3475 cstart = pos; 3476 } 3477 reconsume = true; 3478 state = transition(state, returnState, reconsume, pos); 3479 continue stateloop; 3480 } else { 3481 errCharRefLacksSemicolon(); 3482 if ((returnState & DATA_AND_RCDATA_MASK) == 0) { 3483 cstart = pos; 3484 } 3485 reconsume = true; 3486 state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos); 3487 // FALL THROUGH continue stateloop; 3488 break decimalloop; 3489 } 3490 } 3491 } 3492 // CPPONLY: MOZ_FALLTHROUGH; 3493 case HANDLE_NCR_VALUE: 3494 // WARNING previous state sets reconsume 3495 // We are not going to emit the contents of charRefBuf. 
3496 charRefBufLen = 0; 3497 // XXX inline this case if the method size can take it 3498 handleNcrValue(returnState); 3499 state = transition(state, returnState, reconsume, pos); 3500 continue stateloop; 3501 case HEX_NCR_LOOP: 3502 for (;;) { 3503 if (++pos == endPos) { 3504 break stateloop; 3505 } 3506 c = checkChar(buf, pos); 3507 /* 3508 * Consume as many characters as match the range of 3509 * characters given above. 3510 */ 3511 assert value >= 0: "value must not become negative."; 3512 if (c >= '0' && c <= '9') { 3513 seenDigits = true; 3514 // Avoid overflow 3515 if (value <= 0x10FFFF) { 3516 value *= 16; 3517 value += c - '0'; 3518 } 3519 continue; 3520 } else if (c >= 'A' && c <= 'F') { 3521 seenDigits = true; 3522 // Avoid overflow 3523 if (value <= 0x10FFFF) { 3524 value *= 16; 3525 value += c - 'A' + 10; 3526 } 3527 continue; 3528 } else if (c >= 'a' && c <= 'f') { 3529 seenDigits = true; 3530 // Avoid overflow 3531 if (value <= 0x10FFFF) { 3532 value *= 16; 3533 value += c - 'a' + 10; 3534 } 3535 continue; 3536 } else if (c == ';') { 3537 if (seenDigits) { 3538 if ((returnState & DATA_AND_RCDATA_MASK) == 0) { 3539 cstart = pos + 1; 3540 } 3541 state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos); 3542 continue stateloop; 3543 } else { 3544 errNoDigitsInNCR(); 3545 appendCharRefBuf(';'); 3546 emitOrAppendCharRefBuf(returnState); 3547 if ((returnState & DATA_AND_RCDATA_MASK) == 0) { 3548 cstart = pos + 1; 3549 } 3550 state = transition(state, returnState, reconsume, pos); 3551 continue stateloop; 3552 } 3553 } else { 3554 /* 3555 * If no characters match the range, then don't 3556 * consume any characters (and unconsume the U+0023 3557 * NUMBER SIGN character and, if appropriate, the X 3558 * character). This is a parse error; nothing is 3559 * returned. 3560 * 3561 * Otherwise, if the next character is a U+003B 3562 * SEMICOLON, consume that too. If it isn't, there 3563 * is a parse error. 
3564 */ 3565 if (!seenDigits) { 3566 errNoDigitsInNCR(); 3567 emitOrAppendCharRefBuf(returnState); 3568 if ((returnState & DATA_AND_RCDATA_MASK) == 0) { 3569 cstart = pos; 3570 } 3571 reconsume = true; 3572 state = transition(state, returnState, reconsume, pos); 3573 continue stateloop; 3574 } else { 3575 errCharRefLacksSemicolon(); 3576 if ((returnState & DATA_AND_RCDATA_MASK) == 0) { 3577 cstart = pos; 3578 } 3579 reconsume = true; 3580 state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos); 3581 continue stateloop; 3582 } 3583 } 3584 } 3585 case PLAINTEXT: 3586 plaintextloop: for (;;) { 3587 if (reconsume) { 3588 reconsume = false; 3589 } else { 3590 if (++pos == endPos) { 3591 break stateloop; 3592 } 3593 c = checkChar(buf, pos); 3594 } 3595 switch (c) { 3596 case '\u0000': 3597 emitPlaintextReplacementCharacter(buf, pos); 3598 continue; 3599 case '\r': 3600 emitCarriageReturn(buf, pos); 3601 break stateloop; 3602 case '\n': 3603 silentLineFeed(); 3604 // CPPONLY: MOZ_FALLTHROUGH; 3605 default: 3606 /* 3607 * Anything else Emit the current input 3608 * character as a character token. Stay in the 3609 * RAWTEXT state. 3610 */ 3611 continue; 3612 } 3613 } 3614 case CLOSE_TAG_OPEN: 3615 if (++pos == endPos) { 3616 break stateloop; 3617 } 3618 c = checkChar(buf, pos); 3619 /* 3620 * Otherwise, if the content model flag is set to the PCDATA 3621 * state, or if the next few characters do match that tag 3622 * name, consume the next input character: 3623 */ 3624 switch (c) { 3625 case '>': 3626 /* U+003E GREATER-THAN SIGN (>) Parse error. */ 3627 errLtSlashGt(); 3628 /* 3629 * Switch to the data state. 3630 */ 3631 cstart = pos + 1; 3632 state = transition(state, Tokenizer.DATA, reconsume, pos); 3633 continue stateloop; 3634 case '\r': 3635 silentCarriageReturn(); 3636 /* Anything else Parse error. */ 3637 errGarbageAfterLtSlash(); 3638 /* 3639 * Switch to the bogus comment state. 
3640 */ 3641 clearStrBufBeforeUse(); 3642 appendStrBuf('\n'); 3643 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); 3644 break stateloop; 3645 case '\n': 3646 silentLineFeed(); 3647 /* Anything else Parse error. */ 3648 errGarbageAfterLtSlash(); 3649 /* 3650 * Switch to the bogus comment state. 3651 */ 3652 clearStrBufBeforeUse(); 3653 appendStrBuf(c); 3654 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); 3655 continue stateloop; 3656 case '\u0000': 3657 c = '\uFFFD'; 3658 // CPPONLY: MOZ_FALLTHROUGH; 3659 default: 3660 if (c >= 'A' && c <= 'Z') { 3661 c += 0x20; 3662 } 3663 if (c >= 'a' && c <= 'z') { 3664 /* 3665 * U+0061 LATIN SMALL LETTER A through to U+007A 3666 * LATIN SMALL LETTER Z Create a new end tag 3667 * token, 3668 */ 3669 endTag = true; 3670 /* 3671 * set its tag name to the input character, 3672 */ 3673 clearStrBufBeforeUse(); 3674 appendStrBuf(c); 3675 containsHyphen = false; 3676 /* 3677 * then switch to the tag name state. (Don't 3678 * emit the token yet; further details will be 3679 * filled in before it is emitted.) 3680 */ 3681 state = transition(state, Tokenizer.TAG_NAME, reconsume, pos); 3682 continue stateloop; 3683 } else { 3684 /* Anything else Parse error. */ 3685 errGarbageAfterLtSlash(); 3686 /* 3687 * Switch to the bogus comment state. 3688 */ 3689 clearStrBufBeforeUse(); 3690 appendStrBuf(c); 3691 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); 3692 continue stateloop; 3693 } 3694 } 3695 case RCDATA: 3696 rcdataloop: for (;;) { 3697 if (reconsume) { 3698 reconsume = false; 3699 } else { 3700 if (++pos == endPos) { 3701 break stateloop; 3702 } 3703 c = checkChar(buf, pos); 3704 } 3705 switch (c) { 3706 case '&': 3707 /* 3708 * U+0026 AMPERSAND (&) Switch to the character 3709 * reference in RCDATA state. 
3710 */ 3711 flushChars(buf, pos); 3712 assert charRefBufLen == 0: "charRefBufLen not reset after previous use!"; 3713 appendCharRefBuf(c); 3714 setAdditionalAndRememberAmpersandLocation('\u0000'); 3715 returnState = state; 3716 state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos); 3717 continue stateloop; 3718 case '<': 3719 /* 3720 * U+003C LESS-THAN SIGN (<) Switch to the 3721 * RCDATA less-than sign state. 3722 */ 3723 flushChars(buf, pos); 3724 3725 returnState = state; 3726 state = transition(state, Tokenizer.RAWTEXT_RCDATA_LESS_THAN_SIGN, reconsume, pos); 3727 continue stateloop; 3728 case '\u0000': 3729 emitReplacementCharacter(buf, pos); 3730 continue; 3731 case '\r': 3732 emitCarriageReturn(buf, pos); 3733 break stateloop; 3734 case '\n': 3735 silentLineFeed(); 3736 // CPPONLY: MOZ_FALLTHROUGH; 3737 default: 3738 /* 3739 * Emit the current input character as a 3740 * character token. Stay in the RCDATA state. 3741 */ 3742 continue; 3743 } 3744 } 3745 case RAWTEXT: 3746 rawtextloop: for (;;) { 3747 if (reconsume) { 3748 reconsume = false; 3749 } else { 3750 if (++pos == endPos) { 3751 break stateloop; 3752 } 3753 c = checkChar(buf, pos); 3754 } 3755 switch (c) { 3756 case '<': 3757 /* 3758 * U+003C LESS-THAN SIGN (<) Switch to the 3759 * RAWTEXT less-than sign state. 3760 */ 3761 flushChars(buf, pos); 3762 3763 returnState = state; 3764 state = transition(state, Tokenizer.RAWTEXT_RCDATA_LESS_THAN_SIGN, reconsume, pos); 3765 break rawtextloop; 3766 // FALL THRU continue stateloop; 3767 case '\u0000': 3768 emitReplacementCharacter(buf, pos); 3769 continue; 3770 case '\r': 3771 emitCarriageReturn(buf, pos); 3772 break stateloop; 3773 case '\n': 3774 silentLineFeed(); 3775 // CPPONLY: MOZ_FALLTHROUGH; 3776 default: 3777 /* 3778 * Emit the current input character as a 3779 * character token. Stay in the RAWTEXT state. 
3780 */ 3781 continue; 3782 } 3783 } 3784 // CPPONLY: MOZ_FALLTHROUGH; 3785 case RAWTEXT_RCDATA_LESS_THAN_SIGN: 3786 rawtextrcdatalessthansignloop: for (;;) { 3787 if (++pos == endPos) { 3788 break stateloop; 3789 } 3790 c = checkChar(buf, pos); 3791 switch (c) { 3792 case '/': 3793 /* 3794 * U+002F SOLIDUS (/) Set the temporary buffer 3795 * to the empty string. Switch to the script 3796 * data end tag open state. 3797 */ 3798 index = 0; 3799 clearStrBufBeforeUse(); 3800 state = transition(state, Tokenizer.NON_DATA_END_TAG_NAME, reconsume, pos); 3801 break rawtextrcdatalessthansignloop; 3802 // FALL THRU continue stateloop; 3803 default: 3804 /* 3805 * Otherwise, emit a U+003C LESS-THAN SIGN 3806 * character token 3807 */ 3808 tokenHandler.characters(Tokenizer.LT_GT, 0, 1); 3809 /* 3810 * and reconsume the current input character in 3811 * the data state. 3812 */ 3813 cstart = pos; 3814 reconsume = true; 3815 state = transition(state, returnState, reconsume, pos); 3816 continue stateloop; 3817 } 3818 } 3819 // CPPONLY: MOZ_FALLTHROUGH; 3820 case NON_DATA_END_TAG_NAME: 3821 for (;;) { 3822 if (++pos == endPos) { 3823 break stateloop; 3824 } 3825 c = checkChar(buf, pos); 3826 /* 3827 * ASSERT! when entering this state, set index to 0 and 3828 * call clearStrBufBeforeUse(); Let's implement the above 3829 * without lookahead. strBuf is the 'temporary buffer'. 
3830 */ 3831 if (endTagExpectationAsArray == null) { 3832 tokenHandler.characters(Tokenizer.LT_SOLIDUS, 3833 0, 2); 3834 cstart = pos; 3835 reconsume = true; 3836 state = transition(state, returnState, reconsume, pos); 3837 continue stateloop; 3838 } else if (index < endTagExpectationAsArray.length) { 3839 char e = endTagExpectationAsArray[index]; 3840 char folded = c; 3841 if (c >= 'A' && c <= 'Z') { 3842 folded += 0x20; 3843 } 3844 if (folded != e) { 3845 // [NOCPP[ 3846 errHtml4LtSlashInRcdata(folded); 3847 // ]NOCPP] 3848 tokenHandler.characters(Tokenizer.LT_SOLIDUS, 3849 0, 2); 3850 emitStrBuf(); 3851 cstart = pos; 3852 reconsume = true; 3853 state = transition(state, returnState, reconsume, pos); 3854 continue stateloop; 3855 } 3856 appendStrBuf(c); 3857 index++; 3858 continue; 3859 } else { 3860 endTag = true; 3861 // XXX replace contentModelElement with different 3862 // type 3863 tagName = endTagExpectation; 3864 switch (c) { 3865 case '\r': 3866 silentCarriageReturn(); 3867 clearStrBufAfterUse(); // strBuf not used 3868 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); 3869 break stateloop; 3870 case '\n': 3871 silentLineFeed(); 3872 // CPPONLY: MOZ_FALLTHROUGH; 3873 case ' ': 3874 case '\t': 3875 case '\u000C': 3876 /* 3877 * U+0009 CHARACTER TABULATION U+000A LINE 3878 * FEED (LF) U+000C FORM FEED (FF) U+0020 3879 * SPACE If the current end tag token is an 3880 * appropriate end tag token, then switch to 3881 * the before attribute name state. 3882 */ 3883 clearStrBufAfterUse(); // strBuf not used 3884 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); 3885 continue stateloop; 3886 case '/': 3887 /* 3888 * U+002F SOLIDUS (/) If the current end tag 3889 * token is an appropriate end tag token, 3890 * then switch to the self-closing start tag 3891 * state. 
3892 */ 3893 clearStrBufAfterUse(); // strBuf not used 3894 state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos); 3895 continue stateloop; 3896 case '>': 3897 /* 3898 * U+003E GREATER-THAN SIGN (>) If the 3899 * current end tag token is an appropriate 3900 * end tag token, then emit the current tag 3901 * token and switch to the data state. 3902 */ 3903 clearStrBufAfterUse(); // strBuf not used 3904 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos); 3905 if (shouldSuspend) { 3906 break stateloop; 3907 } 3908 continue stateloop; 3909 default: 3910 /* 3911 * Emit a U+003C LESS-THAN SIGN character 3912 * token, a U+002F SOLIDUS character token, 3913 * a character token for each of the 3914 * characters in the temporary buffer (in 3915 * the order they were added to the buffer), 3916 * and reconsume the current input character 3917 * in the RAWTEXT state. 3918 */ 3919 // [NOCPP[ 3920 errWarnLtSlashInRcdata(); 3921 // ]NOCPP] 3922 tokenHandler.characters( 3923 Tokenizer.LT_SOLIDUS, 0, 2); 3924 emitStrBuf(); 3925 cstart = pos; // don't drop the 3926 // character 3927 reconsume = true; 3928 state = transition(state, returnState, reconsume, pos); 3929 continue stateloop; 3930 } 3931 } 3932 } 3933 // BEGIN HOTSPOT WORKAROUND 3934 case BOGUS_COMMENT: 3935 boguscommentloop: for (;;) { 3936 if (reconsume) { 3937 reconsume = false; 3938 } else { 3939 if (++pos == endPos) { 3940 break stateloop; 3941 } 3942 c = checkChar(buf, pos); 3943 } 3944 /* 3945 * Consume every character up to and including the first 3946 * U+003E GREATER-THAN SIGN character (>) or the end of 3947 * the file (EOF), whichever comes first. Emit a comment 3948 * token whose data is the concatenation of all the 3949 * characters starting from and including the character 3950 * that caused the state machine to switch into the 3951 * bogus comment state, up to and including the 3952 * character immediately before the last consumed 3953 * character (i.e. 
up to the character just before the 3954 * U+003E or EOF character). (If the comment was started 3955 * by the end of the file (EOF), the token is empty.) 3956 * 3957 * Switch to the data state. 3958 * 3959 * If the end of the file was reached, reconsume the EOF 3960 * character. 3961 */ 3962 switch (c) { 3963 case '>': 3964 emitComment(0, pos); 3965 state = transition(state, Tokenizer.DATA, reconsume, pos); 3966 continue stateloop; 3967 case '-': 3968 appendStrBuf(c); 3969 state = transition(state, Tokenizer.BOGUS_COMMENT_HYPHEN, reconsume, pos); 3970 break boguscommentloop; 3971 case '\r': 3972 appendStrBufCarriageReturn(); 3973 break stateloop; 3974 case '\n': 3975 appendStrBufLineFeed(); 3976 continue; 3977 case '\u0000': 3978 c = '\uFFFD'; 3979 // CPPONLY: MOZ_FALLTHROUGH; 3980 default: 3981 appendStrBuf(c); 3982 continue; 3983 } 3984 } 3985 // CPPONLY: MOZ_FALLTHROUGH; 3986 case BOGUS_COMMENT_HYPHEN: 3987 boguscommenthyphenloop: for (;;) { 3988 if (++pos == endPos) { 3989 break stateloop; 3990 } 3991 c = checkChar(buf, pos); 3992 switch (c) { 3993 case '>': 3994 // [NOCPP[ 3995 maybeAppendSpaceToBogusComment(); 3996 // ]NOCPP] 3997 emitComment(0, pos); 3998 state = transition(state, Tokenizer.DATA, reconsume, pos); 3999 continue stateloop; 4000 case '-': 4001 appendSecondHyphenToBogusComment(); 4002 continue boguscommenthyphenloop; 4003 case '\r': 4004 appendStrBufCarriageReturn(); 4005 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); 4006 break stateloop; 4007 case '\n': 4008 appendStrBufLineFeed(); 4009 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); 4010 continue stateloop; 4011 case '\u0000': 4012 c = '\uFFFD'; 4013 // CPPONLY: MOZ_FALLTHROUGH; 4014 default: 4015 appendStrBuf(c); 4016 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); 4017 continue stateloop; 4018 } 4019 } 4020 case SCRIPT_DATA: 4021 scriptdataloop: for (;;) { 4022 if (reconsume) { 4023 reconsume = false; 4024 } else { 4025 if (++pos 
== endPos) { 4026 break stateloop; 4027 } 4028 c = checkChar(buf, pos); 4029 } 4030 switch (c) { 4031 case '<': 4032 /* 4033 * U+003C LESS-THAN SIGN (<) Switch to the 4034 * script data less-than sign state. 4035 */ 4036 flushChars(buf, pos); 4037 returnState = state; 4038 state = transition(state, Tokenizer.SCRIPT_DATA_LESS_THAN_SIGN, reconsume, pos); 4039 break scriptdataloop; // FALL THRU continue 4040 // stateloop; 4041 case '\u0000': 4042 emitReplacementCharacter(buf, pos); 4043 continue; 4044 case '\r': 4045 emitCarriageReturn(buf, pos); 4046 break stateloop; 4047 case '\n': 4048 silentLineFeed(); 4049 // CPPONLY: MOZ_FALLTHROUGH; 4050 default: 4051 /* 4052 * Anything else Emit the current input 4053 * character as a character token. Stay in the 4054 * script data state. 4055 */ 4056 continue; 4057 } 4058 } 4059 // CPPONLY: MOZ_FALLTHROUGH; 4060 case SCRIPT_DATA_LESS_THAN_SIGN: 4061 scriptdatalessthansignloop: for (;;) { 4062 if (++pos == endPos) { 4063 break stateloop; 4064 } 4065 c = checkChar(buf, pos); 4066 switch (c) { 4067 case '/': 4068 /* 4069 * U+002F SOLIDUS (/) Set the temporary buffer 4070 * to the empty string. Switch to the script 4071 * data end tag open state. 4072 */ 4073 index = 0; 4074 clearStrBufBeforeUse(); 4075 state = transition(state, Tokenizer.NON_DATA_END_TAG_NAME, reconsume, pos); 4076 continue stateloop; 4077 case '!': 4078 tokenHandler.characters(Tokenizer.LT_GT, 0, 1); 4079 cstart = pos; 4080 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPE_START, reconsume, pos); 4081 break scriptdatalessthansignloop; // FALL THRU 4082 // continue 4083 // stateloop; 4084 default: 4085 /* 4086 * Otherwise, emit a U+003C LESS-THAN SIGN 4087 * character token 4088 */ 4089 tokenHandler.characters(Tokenizer.LT_GT, 0, 1); 4090 /* 4091 * and reconsume the current input character in 4092 * the data state. 
4093 */ 4094 cstart = pos; 4095 reconsume = true; 4096 state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos); 4097 continue stateloop; 4098 } 4099 } 4100 // CPPONLY: MOZ_FALLTHROUGH; 4101 case SCRIPT_DATA_ESCAPE_START: 4102 scriptdataescapestartloop: for (;;) { 4103 if (++pos == endPos) { 4104 break stateloop; 4105 } 4106 c = checkChar(buf, pos); 4107 /* 4108 * Consume the next input character: 4109 */ 4110 switch (c) { 4111 case '-': 4112 /* 4113 * U+002D HYPHEN-MINUS (-) Emit a U+002D 4114 * HYPHEN-MINUS character token. Switch to the 4115 * script data escape start dash state. 4116 */ 4117 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPE_START_DASH, reconsume, pos); 4118 break scriptdataescapestartloop; // FALL THRU 4119 // continue 4120 // stateloop; 4121 default: 4122 /* 4123 * Anything else Reconsume the current input 4124 * character in the script data state. 4125 */ 4126 reconsume = true; 4127 state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos); 4128 continue stateloop; 4129 } 4130 } 4131 // CPPONLY: MOZ_FALLTHROUGH; 4132 case SCRIPT_DATA_ESCAPE_START_DASH: 4133 scriptdataescapestartdashloop: for (;;) { 4134 if (++pos == endPos) { 4135 break stateloop; 4136 } 4137 c = checkChar(buf, pos); 4138 /* 4139 * Consume the next input character: 4140 */ 4141 switch (c) { 4142 case '-': 4143 /* 4144 * U+002D HYPHEN-MINUS (-) Emit a U+002D 4145 * HYPHEN-MINUS character token. Switch to the 4146 * script data escaped dash dash state. 4147 */ 4148 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_DASH_DASH, reconsume, pos); 4149 break scriptdataescapestartdashloop; 4150 // continue stateloop; 4151 default: 4152 /* 4153 * Anything else Reconsume the current input 4154 * character in the script data state. 
4155 */ 4156 reconsume = true; 4157 state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos); 4158 continue stateloop; 4159 } 4160 } 4161 // CPPONLY: MOZ_FALLTHROUGH; 4162 case SCRIPT_DATA_ESCAPED_DASH_DASH: 4163 scriptdataescapeddashdashloop: for (;;) { 4164 if (++pos == endPos) { 4165 break stateloop; 4166 } 4167 c = checkChar(buf, pos); 4168 /* 4169 * Consume the next input character: 4170 */ 4171 switch (c) { 4172 case '-': 4173 /* 4174 * U+002D HYPHEN-MINUS (-) Emit a U+002D 4175 * HYPHEN-MINUS character token. Stay in the 4176 * script data escaped dash dash state. 4177 */ 4178 continue; 4179 case '<': 4180 /* 4181 * U+003C LESS-THAN SIGN (<) Switch to the 4182 * script data escaped less-than sign state. 4183 */ 4184 flushChars(buf, pos); 4185 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN, reconsume, pos); 4186 continue stateloop; 4187 case '>': 4188 /* 4189 * U+003E GREATER-THAN SIGN (>) Emit a U+003E 4190 * GREATER-THAN SIGN character token. Switch to 4191 * the script data state. 4192 */ 4193 state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos); 4194 continue stateloop; 4195 case '\u0000': 4196 emitReplacementCharacter(buf, pos); 4197 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); 4198 break scriptdataescapeddashdashloop; 4199 case '\r': 4200 emitCarriageReturn(buf, pos); 4201 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); 4202 break stateloop; 4203 case '\n': 4204 silentLineFeed(); 4205 // CPPONLY: MOZ_FALLTHROUGH; 4206 default: 4207 /* 4208 * Anything else Emit the current input 4209 * character as a character token. Switch to the 4210 * script data escaped state. 
4211 */ 4212 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); 4213 break scriptdataescapeddashdashloop; 4214 // continue stateloop; 4215 } 4216 } 4217 // CPPONLY: MOZ_FALLTHROUGH; 4218 case SCRIPT_DATA_ESCAPED: 4219 scriptdataescapedloop: for (;;) { 4220 if (reconsume) { 4221 reconsume = false; 4222 } else { 4223 if (++pos == endPos) { 4224 break stateloop; 4225 } 4226 c = checkChar(buf, pos); 4227 } 4228 /* 4229 * Consume the next input character: 4230 */ 4231 switch (c) { 4232 case '-': 4233 /* 4234 * U+002D HYPHEN-MINUS (-) Emit a U+002D 4235 * HYPHEN-MINUS character token. Switch to the 4236 * script data escaped dash state. 4237 */ 4238 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_DASH, reconsume, pos); 4239 break scriptdataescapedloop; // FALL THRU 4240 // continue 4241 // stateloop; 4242 case '<': 4243 /* 4244 * U+003C LESS-THAN SIGN (<) Switch to the 4245 * script data escaped less-than sign state. 4246 */ 4247 flushChars(buf, pos); 4248 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN, reconsume, pos); 4249 continue stateloop; 4250 case '\u0000': 4251 emitReplacementCharacter(buf, pos); 4252 continue; 4253 case '\r': 4254 emitCarriageReturn(buf, pos); 4255 break stateloop; 4256 case '\n': 4257 silentLineFeed(); 4258 // CPPONLY: MOZ_FALLTHROUGH; 4259 default: 4260 /* 4261 * Anything else Emit the current input 4262 * character as a character token. Stay in the 4263 * script data escaped state. 4264 */ 4265 continue; 4266 } 4267 } 4268 // CPPONLY: MOZ_FALLTHROUGH; 4269 case SCRIPT_DATA_ESCAPED_DASH: 4270 scriptdataescapeddashloop: for (;;) { 4271 if (++pos == endPos) { 4272 break stateloop; 4273 } 4274 c = checkChar(buf, pos); 4275 /* 4276 * Consume the next input character: 4277 */ 4278 switch (c) { 4279 case '-': 4280 /* 4281 * U+002D HYPHEN-MINUS (-) Emit a U+002D 4282 * HYPHEN-MINUS character token. Switch to the 4283 * script data escaped dash dash state. 
4284 */ 4285 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_DASH_DASH, reconsume, pos); 4286 continue stateloop; 4287 case '<': 4288 /* 4289 * U+003C LESS-THAN SIGN (<) Switch to the 4290 * script data escaped less-than sign state. 4291 */ 4292 flushChars(buf, pos); 4293 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN, reconsume, pos); 4294 break scriptdataescapeddashloop; 4295 // continue stateloop; 4296 case '\u0000': 4297 emitReplacementCharacter(buf, pos); 4298 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); 4299 continue stateloop; 4300 case '\r': 4301 emitCarriageReturn(buf, pos); 4302 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); 4303 break stateloop; 4304 case '\n': 4305 silentLineFeed(); 4306 // CPPONLY: MOZ_FALLTHROUGH; 4307 default: 4308 /* 4309 * Anything else Emit the current input 4310 * character as a character token. Switch to the 4311 * script data escaped state. 4312 */ 4313 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); 4314 continue stateloop; 4315 } 4316 } 4317 // CPPONLY: MOZ_FALLTHROUGH; 4318 case SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN: 4319 scriptdataescapedlessthanloop: for (;;) { 4320 if (++pos == endPos) { 4321 break stateloop; 4322 } 4323 c = checkChar(buf, pos); 4324 /* 4325 * Consume the next input character: 4326 */ 4327 switch (c) { 4328 case '/': 4329 /* 4330 * U+002F SOLIDUS (/) Set the temporary buffer 4331 * to the empty string. Switch to the script 4332 * data escaped end tag open state. 
4333 */ 4334 index = 0; 4335 clearStrBufBeforeUse(); 4336 returnState = Tokenizer.SCRIPT_DATA_ESCAPED; 4337 state = transition(state, Tokenizer.NON_DATA_END_TAG_NAME, reconsume, pos); 4338 continue stateloop; 4339 case 'S': 4340 case 's': 4341 /* 4342 * U+0041 LATIN CAPITAL LETTER A through to 4343 * U+005A LATIN CAPITAL LETTER Z Emit a U+003C 4344 * LESS-THAN SIGN character token and the 4345 * current input character as a character token. 4346 */ 4347 tokenHandler.characters(Tokenizer.LT_GT, 0, 1); 4348 cstart = pos; 4349 index = 1; 4350 /* 4351 * Set the temporary buffer to the empty string. 4352 * Append the lowercase version of the current 4353 * input character (add 0x0020 to the 4354 * character's code point) to the temporary 4355 * buffer. Switch to the script data double 4356 * escape start state. 4357 */ 4358 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPE_START, reconsume, pos); 4359 break scriptdataescapedlessthanloop; 4360 // continue stateloop; 4361 default: 4362 /* 4363 * Anything else Emit a U+003C LESS-THAN SIGN 4364 * character token and reconsume the current 4365 * input character in the script data escaped 4366 * state. 
4367 */ 4368 tokenHandler.characters(Tokenizer.LT_GT, 0, 1); 4369 cstart = pos; 4370 reconsume = true; 4371 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); 4372 continue stateloop; 4373 } 4374 } 4375 // CPPONLY: MOZ_FALLTHROUGH; 4376 case SCRIPT_DATA_DOUBLE_ESCAPE_START: 4377 scriptdatadoubleescapestartloop: for (;;) { 4378 if (++pos == endPos) { 4379 break stateloop; 4380 } 4381 c = checkChar(buf, pos); 4382 assert index > 0; 4383 if (index < 6) { // SCRIPT_ARR.length 4384 char folded = c; 4385 if (c >= 'A' && c <= 'Z') { 4386 folded += 0x20; 4387 } 4388 if (folded != Tokenizer.SCRIPT_ARR[index]) { 4389 reconsume = true; 4390 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); 4391 continue stateloop; 4392 } 4393 index++; 4394 continue; 4395 } 4396 switch (c) { 4397 case '\r': 4398 emitCarriageReturn(buf, pos); 4399 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); 4400 break stateloop; 4401 case '\n': 4402 silentLineFeed(); 4403 // CPPONLY: MOZ_FALLTHROUGH; 4404 case ' ': 4405 case '\t': 4406 case '\u000C': 4407 case '/': 4408 case '>': 4409 /* 4410 * U+0009 CHARACTER TABULATION U+000A LINE FEED 4411 * (LF) U+000C FORM FEED (FF) U+0020 SPACE 4412 * U+002F SOLIDUS (/) U+003E GREATER-THAN SIGN 4413 * (>) Emit the current input character as a 4414 * character token. If the temporary buffer is 4415 * the string "script", then switch to the 4416 * script data double escaped state. 4417 */ 4418 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); 4419 break scriptdatadoubleescapestartloop; 4420 // continue stateloop; 4421 default: 4422 /* 4423 * Anything else Reconsume the current input 4424 * character in the script data escaped state. 
4425 */ 4426 reconsume = true; 4427 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); 4428 continue stateloop; 4429 } 4430 } 4431 // CPPONLY: MOZ_FALLTHROUGH; 4432 case SCRIPT_DATA_DOUBLE_ESCAPED: 4433 scriptdatadoubleescapedloop: for (;;) { 4434 if (reconsume) { 4435 reconsume = false; 4436 } else { 4437 if (++pos == endPos) { 4438 break stateloop; 4439 } 4440 c = checkChar(buf, pos); 4441 } 4442 /* 4443 * Consume the next input character: 4444 */ 4445 switch (c) { 4446 case '-': 4447 /* 4448 * U+002D HYPHEN-MINUS (-) Emit a U+002D 4449 * HYPHEN-MINUS character token. Switch to the 4450 * script data double escaped dash state. 4451 */ 4452 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_DASH, reconsume, pos); 4453 break scriptdatadoubleescapedloop; // FALL THRU 4454 // continue 4455 // stateloop; 4456 case '<': 4457 /* 4458 * U+003C LESS-THAN SIGN (<) Emit a U+003C 4459 * LESS-THAN SIGN character token. Switch to the 4460 * script data double escaped less-than sign 4461 * state. 4462 */ 4463 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN, reconsume, pos); 4464 continue stateloop; 4465 case '\u0000': 4466 emitReplacementCharacter(buf, pos); 4467 continue; 4468 case '\r': 4469 emitCarriageReturn(buf, pos); 4470 break stateloop; 4471 case '\n': 4472 silentLineFeed(); 4473 // CPPONLY: MOZ_FALLTHROUGH; 4474 default: 4475 /* 4476 * Anything else Emit the current input 4477 * character as a character token. Stay in the 4478 * script data double escaped state. 4479 */ 4480 continue; 4481 } 4482 } 4483 // CPPONLY: MOZ_FALLTHROUGH; 4484 case SCRIPT_DATA_DOUBLE_ESCAPED_DASH: 4485 scriptdatadoubleescapeddashloop: for (;;) { 4486 if (++pos == endPos) { 4487 break stateloop; 4488 } 4489 c = checkChar(buf, pos); 4490 /* 4491 * Consume the next input character: 4492 */ 4493 switch (c) { 4494 case '-': 4495 /* 4496 * U+002D HYPHEN-MINUS (-) Emit a U+002D 4497 * HYPHEN-MINUS character token. 
Switch to the 4498 * script data double escaped dash dash state. 4499 */ 4500 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH, reconsume, pos); 4501 break scriptdatadoubleescapeddashloop; 4502 // continue stateloop; 4503 case '<': 4504 /* 4505 * U+003C LESS-THAN SIGN (<) Emit a U+003C 4506 * LESS-THAN SIGN character token. Switch to the 4507 * script data double escaped less-than sign 4508 * state. 4509 */ 4510 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN, reconsume, pos); 4511 continue stateloop; 4512 case '\u0000': 4513 emitReplacementCharacter(buf, pos); 4514 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); 4515 continue stateloop; 4516 case '\r': 4517 emitCarriageReturn(buf, pos); 4518 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); 4519 break stateloop; 4520 case '\n': 4521 silentLineFeed(); 4522 // CPPONLY: MOZ_FALLTHROUGH; 4523 default: 4524 /* 4525 * Anything else Emit the current input 4526 * character as a character token. Switch to the 4527 * script data double escaped state. 4528 */ 4529 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); 4530 continue stateloop; 4531 } 4532 } 4533 // CPPONLY: MOZ_FALLTHROUGH; 4534 case SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH: 4535 scriptdatadoubleescapeddashdashloop: for (;;) { 4536 if (++pos == endPos) { 4537 break stateloop; 4538 } 4539 c = checkChar(buf, pos); 4540 /* 4541 * Consume the next input character: 4542 */ 4543 switch (c) { 4544 case '-': 4545 /* 4546 * U+002D HYPHEN-MINUS (-) Emit a U+002D 4547 * HYPHEN-MINUS character token. Stay in the 4548 * script data double escaped dash dash state. 4549 */ 4550 continue; 4551 case '<': 4552 /* 4553 * U+003C LESS-THAN SIGN (<) Emit a U+003C 4554 * LESS-THAN SIGN character token. Switch to the 4555 * script data double escaped less-than sign 4556 * state. 
4557 */ 4558 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN, reconsume, pos); 4559 break scriptdatadoubleescapeddashdashloop; 4560 case '>': 4561 /* 4562 * U+003E GREATER-THAN SIGN (>) Emit a U+003E 4563 * GREATER-THAN SIGN character token. Switch to 4564 * the script data state. 4565 */ 4566 state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos); 4567 continue stateloop; 4568 case '\u0000': 4569 emitReplacementCharacter(buf, pos); 4570 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); 4571 continue stateloop; 4572 case '\r': 4573 emitCarriageReturn(buf, pos); 4574 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); 4575 break stateloop; 4576 case '\n': 4577 silentLineFeed(); 4578 // CPPONLY: MOZ_FALLTHROUGH; 4579 default: 4580 /* 4581 * Anything else Emit the current input 4582 * character as a character token. Switch to the 4583 * script data double escaped state. 4584 */ 4585 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); 4586 continue stateloop; 4587 } 4588 } 4589 // CPPONLY: MOZ_FALLTHROUGH; 4590 case SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN: 4591 scriptdatadoubleescapedlessthanloop: for (;;) { 4592 if (++pos == endPos) { 4593 break stateloop; 4594 } 4595 c = checkChar(buf, pos); 4596 /* 4597 * Consume the next input character: 4598 */ 4599 switch (c) { 4600 case '/': 4601 /* 4602 * U+002F SOLIDUS (/) Emit a U+002F SOLIDUS 4603 * character token. Set the temporary buffer to 4604 * the empty string. Switch to the script data 4605 * double escape end state. 4606 */ 4607 index = 0; 4608 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPE_END, reconsume, pos); 4609 break scriptdatadoubleescapedlessthanloop; 4610 default: 4611 /* 4612 * Anything else Reconsume the current input 4613 * character in the script data double escaped 4614 * state. 
4615 */ 4616 reconsume = true; 4617 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); 4618 continue stateloop; 4619 } 4620 } 4621 // CPPONLY: MOZ_FALLTHROUGH; 4622 case SCRIPT_DATA_DOUBLE_ESCAPE_END: 4623 scriptdatadoubleescapeendloop: for (;;) { 4624 if (++pos == endPos) { 4625 break stateloop; 4626 } 4627 c = checkChar(buf, pos); 4628 if (index < 6) { // SCRIPT_ARR.length 4629 char folded = c; 4630 if (c >= 'A' && c <= 'Z') { 4631 folded += 0x20; 4632 } 4633 if (folded != Tokenizer.SCRIPT_ARR[index]) { 4634 reconsume = true; 4635 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); 4636 continue stateloop; 4637 } 4638 index++; 4639 continue; 4640 } 4641 switch (c) { 4642 case '\r': 4643 emitCarriageReturn(buf, pos); 4644 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); 4645 break stateloop; 4646 case '\n': 4647 silentLineFeed(); 4648 // CPPONLY: MOZ_FALLTHROUGH; 4649 case ' ': 4650 case '\t': 4651 case '\u000C': 4652 case '/': 4653 case '>': 4654 /* 4655 * U+0009 CHARACTER TABULATION U+000A LINE FEED 4656 * (LF) U+000C FORM FEED (FF) U+0020 SPACE 4657 * U+002F SOLIDUS (/) U+003E GREATER-THAN SIGN 4658 * (>) Emit the current input character as a 4659 * character token. If the temporary buffer is 4660 * the string "script", then switch to the 4661 * script data escaped state. 4662 */ 4663 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); 4664 continue stateloop; 4665 default: 4666 /* 4667 * Reconsume the current input character in the 4668 * script data double escaped state. 
4669 */ 4670 reconsume = true; 4671 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); 4672 continue stateloop; 4673 } 4674 } 4675 case MARKUP_DECLARATION_OCTYPE: 4676 markupdeclarationdoctypeloop: for (;;) { 4677 if (++pos == endPos) { 4678 break stateloop; 4679 } 4680 c = checkChar(buf, pos); 4681 if (index < 6) { // OCTYPE.length 4682 char folded = c; 4683 if (c >= 'A' && c <= 'Z') { 4684 folded += 0x20; 4685 } 4686 if (folded == Tokenizer.OCTYPE[index]) { 4687 appendStrBuf(c); 4688 } else { 4689 errBogusComment(); 4690 reconsume = true; 4691 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); 4692 continue stateloop; 4693 } 4694 index++; 4695 continue; 4696 } else { 4697 reconsume = true; 4698 state = transition(state, Tokenizer.DOCTYPE, reconsume, pos); 4699 break markupdeclarationdoctypeloop; 4700 // continue stateloop; 4701 } 4702 } 4703 // CPPONLY: MOZ_FALLTHROUGH; 4704 case DOCTYPE: 4705 doctypeloop: for (;;) { 4706 if (reconsume) { 4707 reconsume = false; 4708 } else { 4709 if (++pos == endPos) { 4710 break stateloop; 4711 } 4712 c = checkChar(buf, pos); 4713 } 4714 initDoctypeFields(); 4715 /* 4716 * Consume the next input character: 4717 */ 4718 switch (c) { 4719 case '\r': 4720 silentCarriageReturn(); 4721 state = transition(state, Tokenizer.BEFORE_DOCTYPE_NAME, reconsume, pos); 4722 break stateloop; 4723 case '\n': 4724 silentLineFeed(); 4725 // CPPONLY: MOZ_FALLTHROUGH; 4726 case ' ': 4727 case '\t': 4728 case '\u000C': 4729 /* 4730 * U+0009 CHARACTER TABULATION U+000A LINE FEED 4731 * (LF) U+000C FORM FEED (FF) U+0020 SPACE 4732 * Switch to the before DOCTYPE name state. 4733 */ 4734 state = transition(state, Tokenizer.BEFORE_DOCTYPE_NAME, reconsume, pos); 4735 break doctypeloop; 4736 // continue stateloop; 4737 default: 4738 /* 4739 * Anything else Parse error. 4740 */ 4741 errMissingSpaceBeforeDoctypeName(); 4742 /* 4743 * Reconsume the current character in the before 4744 * DOCTYPE name state. 
4745 */ 4746 reconsume = true; 4747 state = transition(state, Tokenizer.BEFORE_DOCTYPE_NAME, reconsume, pos); 4748 break doctypeloop; 4749 // continue stateloop; 4750 } 4751 } 4752 // CPPONLY: MOZ_FALLTHROUGH; 4753 case BEFORE_DOCTYPE_NAME: 4754 beforedoctypenameloop: for (;;) { 4755 if (reconsume) { 4756 reconsume = false; 4757 } else { 4758 if (++pos == endPos) { 4759 break stateloop; 4760 } 4761 c = checkChar(buf, pos); 4762 } 4763 /* 4764 * Consume the next input character: 4765 */ 4766 switch (c) { 4767 case '\r': 4768 silentCarriageReturn(); 4769 break stateloop; 4770 case '\n': 4771 silentLineFeed(); 4772 // CPPONLY: MOZ_FALLTHROUGH; 4773 case ' ': 4774 case '\t': 4775 case '\u000C': 4776 /* 4777 * U+0009 CHARACTER TABULATION U+000A LINE FEED 4778 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay 4779 * in the before DOCTYPE name state. 4780 */ 4781 continue; 4782 case '>': 4783 /* 4784 * U+003E GREATER-THAN SIGN (>) Parse error. 4785 */ 4786 errNamelessDoctype(); 4787 /* 4788 * Create a new DOCTYPE token. Set its 4789 * force-quirks flag to on. 4790 */ 4791 forceQuirks = true; 4792 /* 4793 * Emit the token. 4794 */ 4795 emitDoctypeToken(pos); 4796 /* 4797 * Switch to the data state. 4798 */ 4799 state = transition(state, Tokenizer.DATA, reconsume, pos); 4800 continue stateloop; 4801 case '\u0000': 4802 c = '\uFFFD'; 4803 // CPPONLY: MOZ_FALLTHROUGH; 4804 default: 4805 if (c >= 'A' && c <= 'Z') { 4806 /* 4807 * U+0041 LATIN CAPITAL LETTER A through to 4808 * U+005A LATIN CAPITAL LETTER Z Create a 4809 * new DOCTYPE token. Set the token's name 4810 * to the lowercase version of the input 4811 * character (add 0x0020 to the character's 4812 * code point). 4813 */ 4814 c += 0x20; 4815 } 4816 /* Anything else Create a new DOCTYPE token. */ 4817 /* 4818 * Set the token's name name to the current 4819 * input character. 4820 */ 4821 clearStrBufBeforeUse(); 4822 appendStrBuf(c); 4823 /* 4824 * Switch to the DOCTYPE name state. 
4825 */ 4826 state = transition(state, Tokenizer.DOCTYPE_NAME, reconsume, pos); 4827 break beforedoctypenameloop; 4828 // continue stateloop; 4829 } 4830 } 4831 // CPPONLY: MOZ_FALLTHROUGH; 4832 case DOCTYPE_NAME: 4833 doctypenameloop: for (;;) { 4834 if (++pos == endPos) { 4835 break stateloop; 4836 } 4837 c = checkChar(buf, pos); 4838 /* 4839 * Consume the next input character: 4840 */ 4841 switch (c) { 4842 case '\r': 4843 silentCarriageReturn(); 4844 strBufToDoctypeName(); 4845 state = transition(state, Tokenizer.AFTER_DOCTYPE_NAME, reconsume, pos); 4846 break stateloop; 4847 case '\n': 4848 silentLineFeed(); 4849 // CPPONLY: MOZ_FALLTHROUGH; 4850 case ' ': 4851 case '\t': 4852 case '\u000C': 4853 /* 4854 * U+0009 CHARACTER TABULATION U+000A LINE FEED 4855 * (LF) U+000C FORM FEED (FF) U+0020 SPACE 4856 * Switch to the after DOCTYPE name state. 4857 */ 4858 strBufToDoctypeName(); 4859 state = transition(state, Tokenizer.AFTER_DOCTYPE_NAME, reconsume, pos); 4860 break doctypenameloop; 4861 // continue stateloop; 4862 case '>': 4863 /* 4864 * U+003E GREATER-THAN SIGN (>) Emit the current 4865 * DOCTYPE token. 4866 */ 4867 strBufToDoctypeName(); 4868 emitDoctypeToken(pos); 4869 /* 4870 * Switch to the data state. 4871 */ 4872 state = transition(state, Tokenizer.DATA, reconsume, pos); 4873 continue stateloop; 4874 case '\u0000': 4875 c = '\uFFFD'; 4876 // CPPONLY: MOZ_FALLTHROUGH; 4877 default: 4878 /* 4879 * U+0041 LATIN CAPITAL LETTER A through to 4880 * U+005A LATIN CAPITAL LETTER Z Append the 4881 * lowercase version of the input character (add 4882 * 0x0020 to the character's code point) to the 4883 * current DOCTYPE token's name. 4884 */ 4885 if (c >= 'A' && c <= 'Z') { 4886 c += 0x0020; 4887 } 4888 /* 4889 * Anything else Append the current input 4890 * character to the current DOCTYPE token's 4891 * name. 4892 */ 4893 appendStrBuf(c); 4894 /* 4895 * Stay in the DOCTYPE name state. 
4896 */ 4897 continue; 4898 } 4899 } 4900 // CPPONLY: MOZ_FALLTHROUGH; 4901 case AFTER_DOCTYPE_NAME: 4902 afterdoctypenameloop: for (;;) { 4903 if (++pos == endPos) { 4904 break stateloop; 4905 } 4906 c = checkChar(buf, pos); 4907 /* 4908 * Consume the next input character: 4909 */ 4910 switch (c) { 4911 case '\r': 4912 silentCarriageReturn(); 4913 break stateloop; 4914 case '\n': 4915 silentLineFeed(); 4916 // CPPONLY: MOZ_FALLTHROUGH; 4917 case ' ': 4918 case '\t': 4919 case '\u000C': 4920 /* 4921 * U+0009 CHARACTER TABULATION U+000A LINE FEED 4922 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay 4923 * in the after DOCTYPE name state. 4924 */ 4925 continue; 4926 case '>': 4927 /* 4928 * U+003E GREATER-THAN SIGN (>) Emit the current 4929 * DOCTYPE token. 4930 */ 4931 emitDoctypeToken(pos); 4932 /* 4933 * Switch to the data state. 4934 */ 4935 state = transition(state, Tokenizer.DATA, reconsume, pos); 4936 continue stateloop; 4937 case 'p': 4938 case 'P': 4939 index = 0; 4940 state = transition(state, Tokenizer.DOCTYPE_UBLIC, reconsume, pos); 4941 break afterdoctypenameloop; 4942 // continue stateloop; 4943 case 's': 4944 case 'S': 4945 index = 0; 4946 state = transition(state, Tokenizer.DOCTYPE_YSTEM, reconsume, pos); 4947 continue stateloop; 4948 default: 4949 /* 4950 * Otherwise, this is the parse error. 4951 */ 4952 bogusDoctype(); 4953 4954 /* 4955 * Set the DOCTYPE token's force-quirks flag to 4956 * on. 4957 */ 4958 // done by bogusDoctype(); 4959 /* 4960 * Switch to the bogus DOCTYPE state. 
4961 */ 4962 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); 4963 continue stateloop; 4964 } 4965 } 4966 // CPPONLY: MOZ_FALLTHROUGH; 4967 case DOCTYPE_UBLIC: 4968 doctypeublicloop: for (;;) { 4969 if (++pos == endPos) { 4970 break stateloop; 4971 } 4972 c = checkChar(buf, pos); 4973 /* 4974 * If the six characters starting from the current input 4975 * character are an ASCII case-insensitive match for the 4976 * word "PUBLIC", then consume those characters and 4977 * switch to the before DOCTYPE public identifier state. 4978 */ 4979 if (index < 5) { // UBLIC.length 4980 char folded = c; 4981 if (c >= 'A' && c <= 'Z') { 4982 folded += 0x20; 4983 } 4984 if (folded != Tokenizer.UBLIC[index]) { 4985 bogusDoctype(); 4986 // forceQuirks = true; 4987 reconsume = true; 4988 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); 4989 continue stateloop; 4990 } 4991 index++; 4992 continue; 4993 } else { 4994 reconsume = true; 4995 state = transition(state, Tokenizer.AFTER_DOCTYPE_PUBLIC_KEYWORD, reconsume, pos); 4996 break doctypeublicloop; 4997 // continue stateloop; 4998 } 4999 } 5000 // CPPONLY: MOZ_FALLTHROUGH; 5001 case AFTER_DOCTYPE_PUBLIC_KEYWORD: 5002 afterdoctypepublickeywordloop: for (;;) { 5003 if (reconsume) { 5004 reconsume = false; 5005 } else { 5006 if (++pos == endPos) { 5007 break stateloop; 5008 } 5009 c = checkChar(buf, pos); 5010 } 5011 /* 5012 * Consume the next input character: 5013 */ 5014 switch (c) { 5015 case '\r': 5016 silentCarriageReturn(); 5017 state = transition(state, Tokenizer.BEFORE_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos); 5018 break stateloop; 5019 case '\n': 5020 silentLineFeed(); 5021 // CPPONLY: MOZ_FALLTHROUGH; 5022 case ' ': 5023 case '\t': 5024 case '\u000C': 5025 /* 5026 * U+0009 CHARACTER TABULATION U+000A LINE FEED 5027 * (LF) U+000C FORM FEED (FF) U+0020 SPACE 5028 * Switch to the before DOCTYPE public 5029 * identifier state. 
5030 */ 5031 state = transition(state, Tokenizer.BEFORE_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos); 5032 break afterdoctypepublickeywordloop; 5033 // FALL THROUGH continue stateloop 5034 case '"': 5035 /* 5036 * U+0022 QUOTATION MARK (") Parse Error. 5037 */ 5038 errNoSpaceBetweenDoctypePublicKeywordAndQuote(); 5039 /* 5040 * Set the DOCTYPE token's public identifier to 5041 * the empty string (not missing), 5042 */ 5043 clearStrBufBeforeUse(); 5044 /* 5045 * then switch to the DOCTYPE public identifier 5046 * (double-quoted) state. 5047 */ 5048 state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos); 5049 continue stateloop; 5050 case '\'': 5051 /* 5052 * U+0027 APOSTROPHE (') Parse Error. 5053 */ 5054 errNoSpaceBetweenDoctypePublicKeywordAndQuote(); 5055 /* 5056 * Set the DOCTYPE token's public identifier to 5057 * the empty string (not missing), 5058 */ 5059 clearStrBufBeforeUse(); 5060 /* 5061 * then switch to the DOCTYPE public identifier 5062 * (single-quoted) state. 5063 */ 5064 state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED, reconsume, pos); 5065 continue stateloop; 5066 case '>': 5067 /* U+003E GREATER-THAN SIGN (>) Parse error. */ 5068 errExpectedPublicId(); 5069 /* 5070 * Set the DOCTYPE token's force-quirks flag to 5071 * on. 5072 */ 5073 forceQuirks = true; 5074 /* 5075 * Emit that DOCTYPE token. 5076 */ 5077 emitDoctypeToken(pos); 5078 /* 5079 * Switch to the data state. 5080 */ 5081 state = transition(state, Tokenizer.DATA, reconsume, pos); 5082 continue stateloop; 5083 default: 5084 bogusDoctype(); 5085 /* 5086 * Set the DOCTYPE token's force-quirks flag to 5087 * on. 5088 */ 5089 // done by bogusDoctype(); 5090 /* 5091 * Switch to the bogus DOCTYPE state. 
5092 */ 5093 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); 5094 continue stateloop; 5095 } 5096 } 5097 // CPPONLY: MOZ_FALLTHROUGH; 5098 case BEFORE_DOCTYPE_PUBLIC_IDENTIFIER: 5099 beforedoctypepublicidentifierloop: for (;;) { 5100 if (++pos == endPos) { 5101 break stateloop; 5102 } 5103 c = checkChar(buf, pos); 5104 /* 5105 * Consume the next input character: 5106 */ 5107 switch (c) { 5108 case '\r': 5109 silentCarriageReturn(); 5110 break stateloop; 5111 case '\n': 5112 silentLineFeed(); 5113 // CPPONLY: MOZ_FALLTHROUGH; 5114 case ' ': 5115 case '\t': 5116 case '\u000C': 5117 /* 5118 * U+0009 CHARACTER TABULATION U+000A LINE FEED 5119 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay 5120 * in the before DOCTYPE public identifier 5121 * state. 5122 */ 5123 continue; 5124 case '"': 5125 /* 5126 * U+0022 QUOTATION MARK (") Set the DOCTYPE 5127 * token's public identifier to the empty string 5128 * (not missing), 5129 */ 5130 clearStrBufBeforeUse(); 5131 /* 5132 * then switch to the DOCTYPE public identifier 5133 * (double-quoted) state. 5134 */ 5135 state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos); 5136 break beforedoctypepublicidentifierloop; 5137 // continue stateloop; 5138 case '\'': 5139 /* 5140 * U+0027 APOSTROPHE (') Set the DOCTYPE token's 5141 * public identifier to the empty string (not 5142 * missing), 5143 */ 5144 clearStrBufBeforeUse(); 5145 /* 5146 * then switch to the DOCTYPE public identifier 5147 * (single-quoted) state. 5148 */ 5149 state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED, reconsume, pos); 5150 continue stateloop; 5151 case '>': 5152 /* U+003E GREATER-THAN SIGN (>) Parse error. */ 5153 errExpectedPublicId(); 5154 /* 5155 * Set the DOCTYPE token's force-quirks flag to 5156 * on. 5157 */ 5158 forceQuirks = true; 5159 /* 5160 * Emit that DOCTYPE token. 5161 */ 5162 emitDoctypeToken(pos); 5163 /* 5164 * Switch to the data state. 
5165 */ 5166 state = transition(state, Tokenizer.DATA, reconsume, pos); 5167 continue stateloop; 5168 default: 5169 bogusDoctype(); 5170 /* 5171 * Set the DOCTYPE token's force-quirks flag to 5172 * on. 5173 */ 5174 // done by bogusDoctype(); 5175 /* 5176 * Switch to the bogus DOCTYPE state. 5177 */ 5178 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); 5179 continue stateloop; 5180 } 5181 } 5182 // CPPONLY: MOZ_FALLTHROUGH; 5183 case DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED: 5184 doctypepublicidentifierdoublequotedloop: for (;;) { 5185 if (++pos == endPos) { 5186 break stateloop; 5187 } 5188 c = checkChar(buf, pos); 5189 /* 5190 * Consume the next input character: 5191 */ 5192 switch (c) { 5193 case '"': 5194 /* 5195 * U+0022 QUOTATION MARK (") Switch to the after 5196 * DOCTYPE public identifier state. 5197 */ 5198 publicIdentifier = strBufToString(); 5199 state = transition(state, Tokenizer.AFTER_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos); 5200 break doctypepublicidentifierdoublequotedloop; 5201 // continue stateloop; 5202 case '>': 5203 /* 5204 * U+003E GREATER-THAN SIGN (>) Parse error. 5205 */ 5206 errGtInPublicId(); 5207 /* 5208 * Set the DOCTYPE token's force-quirks flag to 5209 * on. 5210 */ 5211 forceQuirks = true; 5212 /* 5213 * Emit that DOCTYPE token. 5214 */ 5215 publicIdentifier = strBufToString(); 5216 emitDoctypeToken(pos); 5217 /* 5218 * Switch to the data state. 5219 */ 5220 state = transition(state, Tokenizer.DATA, reconsume, pos); 5221 continue stateloop; 5222 case '\r': 5223 appendStrBufCarriageReturn(); 5224 break stateloop; 5225 case '\n': 5226 appendStrBufLineFeed(); 5227 continue; 5228 case '\u0000': 5229 c = '\uFFFD'; 5230 // CPPONLY: MOZ_FALLTHROUGH; 5231 default: 5232 /* 5233 * Anything else Append the current input 5234 * character to the current DOCTYPE token's 5235 * public identifier. 5236 */ 5237 appendStrBuf(c); 5238 /* 5239 * Stay in the DOCTYPE public identifier 5240 * (double-quoted) state. 
5241 */ 5242 continue; 5243 } 5244 } 5245 // CPPONLY: MOZ_FALLTHROUGH; 5246 case AFTER_DOCTYPE_PUBLIC_IDENTIFIER: 5247 afterdoctypepublicidentifierloop: for (;;) { 5248 if (++pos == endPos) { 5249 break stateloop; 5250 } 5251 c = checkChar(buf, pos); 5252 /* 5253 * Consume the next input character: 5254 */ 5255 switch (c) { 5256 case '\r': 5257 silentCarriageReturn(); 5258 state = transition(state, Tokenizer.BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS, reconsume, pos); 5259 break stateloop; 5260 case '\n': 5261 silentLineFeed(); 5262 // CPPONLY: MOZ_FALLTHROUGH; 5263 case ' ': 5264 case '\t': 5265 case '\u000C': 5266 /* 5267 * U+0009 CHARACTER TABULATION U+000A LINE FEED 5268 * (LF) U+000C FORM FEED (FF) U+0020 SPACE 5269 * Switch to the between DOCTYPE public and 5270 * system identifiers state. 5271 */ 5272 state = transition(state, Tokenizer.BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS, reconsume, pos); 5273 break afterdoctypepublicidentifierloop; 5274 // continue stateloop; 5275 case '>': 5276 /* 5277 * U+003E GREATER-THAN SIGN (>) Emit the current 5278 * DOCTYPE token. 5279 */ 5280 emitDoctypeToken(pos); 5281 /* 5282 * Switch to the data state. 5283 */ 5284 state = transition(state, Tokenizer.DATA, reconsume, pos); 5285 continue stateloop; 5286 case '"': 5287 /* 5288 * U+0022 QUOTATION MARK (") Parse error. 5289 */ 5290 errNoSpaceBetweenPublicAndSystemIds(); 5291 /* 5292 * Set the DOCTYPE token's system identifier to 5293 * the empty string (not missing), 5294 */ 5295 clearStrBufBeforeUse(); 5296 /* 5297 * then switch to the DOCTYPE system identifier 5298 * (double-quoted) state. 5299 */ 5300 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos); 5301 continue stateloop; 5302 case '\'': 5303 /* 5304 * U+0027 APOSTROPHE (') Parse error. 
5305 */ 5306 errNoSpaceBetweenPublicAndSystemIds(); 5307 /* 5308 * Set the DOCTYPE token's system identifier to 5309 * the empty string (not missing), 5310 */ 5311 clearStrBufBeforeUse(); 5312 /* 5313 * then switch to the DOCTYPE system identifier 5314 * (single-quoted) state. 5315 */ 5316 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos); 5317 continue stateloop; 5318 default: 5319 bogusDoctype(); 5320 /* 5321 * Set the DOCTYPE token's force-quirks flag to 5322 * on. 5323 */ 5324 // done by bogusDoctype(); 5325 /* 5326 * Switch to the bogus DOCTYPE state. 5327 */ 5328 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); 5329 continue stateloop; 5330 } 5331 } 5332 // CPPONLY: MOZ_FALLTHROUGH; 5333 case BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS: 5334 betweendoctypepublicandsystemidentifiersloop: for (;;) { 5335 if (++pos == endPos) { 5336 break stateloop; 5337 } 5338 c = checkChar(buf, pos); 5339 /* 5340 * Consume the next input character: 5341 */ 5342 switch (c) { 5343 case '\r': 5344 silentCarriageReturn(); 5345 break stateloop; 5346 case '\n': 5347 silentLineFeed(); 5348 // CPPONLY: MOZ_FALLTHROUGH; 5349 case ' ': 5350 case '\t': 5351 case '\u000C': 5352 /* 5353 * U+0009 CHARACTER TABULATION U+000A LINE FEED 5354 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay 5355 * in the between DOCTYPE public and system 5356 * identifiers state. 5357 */ 5358 continue; 5359 case '>': 5360 /* 5361 * U+003E GREATER-THAN SIGN (>) Emit the current 5362 * DOCTYPE token. 5363 */ 5364 emitDoctypeToken(pos); 5365 /* 5366 * Switch to the data state. 5367 */ 5368 state = transition(state, Tokenizer.DATA, reconsume, pos); 5369 continue stateloop; 5370 case '"': 5371 /* 5372 * U+0022 QUOTATION MARK (") Set the DOCTYPE 5373 * token's system identifier to the empty string 5374 * (not missing), 5375 */ 5376 clearStrBufBeforeUse(); 5377 /* 5378 * then switch to the DOCTYPE system identifier 5379 * (double-quoted) state. 
5380 */ 5381 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos); 5382 break betweendoctypepublicandsystemidentifiersloop; 5383 // continue stateloop; 5384 case '\'': 5385 /* 5386 * U+0027 APOSTROPHE (') Set the DOCTYPE token's 5387 * system identifier to the empty string (not 5388 * missing), 5389 */ 5390 clearStrBufBeforeUse(); 5391 /* 5392 * then switch to the DOCTYPE system identifier 5393 * (single-quoted) state. 5394 */ 5395 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos); 5396 continue stateloop; 5397 default: 5398 bogusDoctype(); 5399 /* 5400 * Set the DOCTYPE token's force-quirks flag to 5401 * on. 5402 */ 5403 // done by bogusDoctype(); 5404 /* 5405 * Switch to the bogus DOCTYPE state. 5406 */ 5407 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); 5408 continue stateloop; 5409 } 5410 } 5411 // CPPONLY: MOZ_FALLTHROUGH; 5412 case DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED: 5413 doctypesystemidentifierdoublequotedloop: for (;;) { 5414 if (++pos == endPos) { 5415 break stateloop; 5416 } 5417 c = checkChar(buf, pos); 5418 /* 5419 * Consume the next input character: 5420 */ 5421 switch (c) { 5422 case '"': 5423 /* 5424 * U+0022 QUOTATION MARK (") Switch to the after 5425 * DOCTYPE system identifier state. 5426 */ 5427 systemIdentifier = strBufToString(); 5428 state = transition(state, Tokenizer.AFTER_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos); 5429 continue stateloop; 5430 case '>': 5431 /* 5432 * U+003E GREATER-THAN SIGN (>) Parse error. 5433 */ 5434 errGtInSystemId(); 5435 /* 5436 * Set the DOCTYPE token's force-quirks flag to 5437 * on. 5438 */ 5439 forceQuirks = true; 5440 /* 5441 * Emit that DOCTYPE token. 5442 */ 5443 systemIdentifier = strBufToString(); 5444 emitDoctypeToken(pos); 5445 /* 5446 * Switch to the data state. 
5447 */ 5448 state = transition(state, Tokenizer.DATA, reconsume, pos); 5449 continue stateloop; 5450 case '\r': 5451 appendStrBufCarriageReturn(); 5452 break stateloop; 5453 case '\n': 5454 appendStrBufLineFeed(); 5455 continue; 5456 case '\u0000': 5457 c = '\uFFFD'; 5458 // CPPONLY: MOZ_FALLTHROUGH; 5459 default: 5460 /* 5461 * Anything else Append the current input 5462 * character to the current DOCTYPE token's 5463 * system identifier. 5464 */ 5465 appendStrBuf(c); 5466 /* 5467 * Stay in the DOCTYPE system identifier 5468 * (double-quoted) state. 5469 */ 5470 continue; 5471 } 5472 } 5473 case AFTER_DOCTYPE_SYSTEM_IDENTIFIER: 5474 afterdoctypesystemidentifierloop: for (;;) { 5475 if (++pos == endPos) { 5476 break stateloop; 5477 } 5478 c = checkChar(buf, pos); 5479 /* 5480 * Consume the next input character: 5481 */ 5482 switch (c) { 5483 case '\r': 5484 silentCarriageReturn(); 5485 break stateloop; 5486 case '\n': 5487 silentLineFeed(); 5488 // CPPONLY: MOZ_FALLTHROUGH; 5489 case ' ': 5490 case '\t': 5491 case '\u000C': 5492 /* 5493 * U+0009 CHARACTER TABULATION U+000A LINE FEED 5494 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay 5495 * in the after DOCTYPE system identifier state. 5496 */ 5497 continue; 5498 case '>': 5499 /* 5500 * U+003E GREATER-THAN SIGN (>) Emit the current 5501 * DOCTYPE token. 5502 */ 5503 emitDoctypeToken(pos); 5504 /* 5505 * Switch to the data state. 5506 */ 5507 state = transition(state, Tokenizer.DATA, reconsume, pos); 5508 continue stateloop; 5509 default: 5510 /* 5511 * Switch to the bogus DOCTYPE state. (This does 5512 * not set the DOCTYPE token's force-quirks flag 5513 * to on.) 
5514 */ 5515 bogusDoctypeWithoutQuirks(); 5516 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); 5517 break afterdoctypesystemidentifierloop; 5518 // continue stateloop; 5519 } 5520 } 5521 // CPPONLY: MOZ_FALLTHROUGH; 5522 case BOGUS_DOCTYPE: 5523 for (;;) { 5524 if (reconsume) { 5525 reconsume = false; 5526 } else { 5527 if (++pos == endPos) { 5528 break stateloop; 5529 } 5530 c = checkChar(buf, pos); 5531 } 5532 /* 5533 * Consume the next input character: 5534 */ 5535 switch (c) { 5536 case '>': 5537 /* 5538 * U+003E GREATER-THAN SIGN (>) Emit that 5539 * DOCTYPE token. 5540 */ 5541 emitDoctypeToken(pos); 5542 /* 5543 * Switch to the data state. 5544 */ 5545 state = transition(state, Tokenizer.DATA, reconsume, pos); 5546 continue stateloop; 5547 case '\r': 5548 silentCarriageReturn(); 5549 break stateloop; 5550 case '\n': 5551 silentLineFeed(); 5552 // CPPONLY: MOZ_FALLTHROUGH; 5553 default: 5554 /* 5555 * Anything else Stay in the bogus DOCTYPE 5556 * state. 5557 */ 5558 continue; 5559 } 5560 } 5561 case DOCTYPE_YSTEM: 5562 doctypeystemloop: for (;;) { 5563 if (++pos == endPos) { 5564 break stateloop; 5565 } 5566 c = checkChar(buf, pos); 5567 /* 5568 * Otherwise, if the six characters starting from the 5569 * current input character are an ASCII case-insensitive 5570 * match for the word "SYSTEM", then consume those 5571 * characters and switch to the before DOCTYPE system 5572 * identifier state. 
5573 */ 5574 if (index < 5) { // YSTEM.length 5575 char folded = c; 5576 if (c >= 'A' && c <= 'Z') { 5577 folded += 0x20; 5578 } 5579 if (folded != Tokenizer.YSTEM[index]) { 5580 bogusDoctype(); 5581 reconsume = true; 5582 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); 5583 continue stateloop; 5584 } 5585 index++; 5586 continue stateloop; 5587 } else { 5588 reconsume = true; 5589 state = transition(state, Tokenizer.AFTER_DOCTYPE_SYSTEM_KEYWORD, reconsume, pos); 5590 break doctypeystemloop; 5591 // continue stateloop; 5592 } 5593 } 5594 // CPPONLY: MOZ_FALLTHROUGH; 5595 case AFTER_DOCTYPE_SYSTEM_KEYWORD: 5596 afterdoctypesystemkeywordloop: for (;;) { 5597 if (reconsume) { 5598 reconsume = false; 5599 } else { 5600 if (++pos == endPos) { 5601 break stateloop; 5602 } 5603 c = checkChar(buf, pos); 5604 } 5605 /* 5606 * Consume the next input character: 5607 */ 5608 switch (c) { 5609 case '\r': 5610 silentCarriageReturn(); 5611 state = transition(state, Tokenizer.BEFORE_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos); 5612 break stateloop; 5613 case '\n': 5614 silentLineFeed(); 5615 // CPPONLY: MOZ_FALLTHROUGH; 5616 case ' ': 5617 case '\t': 5618 case '\u000C': 5619 /* 5620 * U+0009 CHARACTER TABULATION U+000A LINE FEED 5621 * (LF) U+000C FORM FEED (FF) U+0020 SPACE 5622 * Switch to the before DOCTYPE public 5623 * identifier state. 5624 */ 5625 state = transition(state, Tokenizer.BEFORE_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos); 5626 break afterdoctypesystemkeywordloop; 5627 // FALL THROUGH continue stateloop 5628 case '"': 5629 /* 5630 * U+0022 QUOTATION MARK (") Parse Error. 5631 */ 5632 errNoSpaceBetweenDoctypeSystemKeywordAndQuote(); 5633 /* 5634 * Set the DOCTYPE token's system identifier to 5635 * the empty string (not missing), 5636 */ 5637 clearStrBufBeforeUse(); 5638 /* 5639 * then switch to the DOCTYPE public identifier 5640 * (double-quoted) state. 
5641 */ 5642 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos); 5643 continue stateloop; 5644 case '\'': 5645 /* 5646 * U+0027 APOSTROPHE (') Parse Error. 5647 */ 5648 errNoSpaceBetweenDoctypeSystemKeywordAndQuote(); 5649 /* 5650 * Set the DOCTYPE token's public identifier to 5651 * the empty string (not missing), 5652 */ 5653 clearStrBufBeforeUse(); 5654 /* 5655 * then switch to the DOCTYPE public identifier 5656 * (single-quoted) state. 5657 */ 5658 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos); 5659 continue stateloop; 5660 case '>': 5661 /* U+003E GREATER-THAN SIGN (>) Parse error. */ 5662 errExpectedPublicId(); 5663 /* 5664 * Set the DOCTYPE token's force-quirks flag to 5665 * on. 5666 */ 5667 forceQuirks = true; 5668 /* 5669 * Emit that DOCTYPE token. 5670 */ 5671 emitDoctypeToken(pos); 5672 /* 5673 * Switch to the data state. 5674 */ 5675 state = transition(state, Tokenizer.DATA, reconsume, pos); 5676 continue stateloop; 5677 default: 5678 bogusDoctype(); 5679 /* 5680 * Set the DOCTYPE token's force-quirks flag to 5681 * on. 5682 */ 5683 // done by bogusDoctype(); 5684 /* 5685 * Switch to the bogus DOCTYPE state. 
5686 */ 5687 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); 5688 continue stateloop; 5689 } 5690 } 5691 // CPPONLY: MOZ_FALLTHROUGH; 5692 case BEFORE_DOCTYPE_SYSTEM_IDENTIFIER: 5693 beforedoctypesystemidentifierloop: for (;;) { 5694 if (++pos == endPos) { 5695 break stateloop; 5696 } 5697 c = checkChar(buf, pos); 5698 /* 5699 * Consume the next input character: 5700 */ 5701 switch (c) { 5702 case '\r': 5703 silentCarriageReturn(); 5704 break stateloop; 5705 case '\n': 5706 silentLineFeed(); 5707 // CPPONLY: MOZ_FALLTHROUGH; 5708 case ' ': 5709 case '\t': 5710 case '\u000C': 5711 /* 5712 * U+0009 CHARACTER TABULATION U+000A LINE FEED 5713 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay 5714 * in the before DOCTYPE system identifier 5715 * state. 5716 */ 5717 continue; 5718 case '"': 5719 /* 5720 * U+0022 QUOTATION MARK (") Set the DOCTYPE 5721 * token's system identifier to the empty string 5722 * (not missing), 5723 */ 5724 clearStrBufBeforeUse(); 5725 /* 5726 * then switch to the DOCTYPE system identifier 5727 * (double-quoted) state. 5728 */ 5729 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos); 5730 continue stateloop; 5731 case '\'': 5732 /* 5733 * U+0027 APOSTROPHE (') Set the DOCTYPE token's 5734 * system identifier to the empty string (not 5735 * missing), 5736 */ 5737 clearStrBufBeforeUse(); 5738 /* 5739 * then switch to the DOCTYPE system identifier 5740 * (single-quoted) state. 5741 */ 5742 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos); 5743 break beforedoctypesystemidentifierloop; 5744 // continue stateloop; 5745 case '>': 5746 /* U+003E GREATER-THAN SIGN (>) Parse error. */ 5747 errExpectedSystemId(); 5748 /* 5749 * Set the DOCTYPE token's force-quirks flag to 5750 * on. 5751 */ 5752 forceQuirks = true; 5753 /* 5754 * Emit that DOCTYPE token. 5755 */ 5756 emitDoctypeToken(pos); 5757 /* 5758 * Switch to the data state. 
5759 */ 5760 state = transition(state, Tokenizer.DATA, reconsume, pos); 5761 continue stateloop; 5762 default: 5763 bogusDoctype(); 5764 /* 5765 * Set the DOCTYPE token's force-quirks flag to 5766 * on. 5767 */ 5768 // done by bogusDoctype(); 5769 /* 5770 * Switch to the bogus DOCTYPE state. 5771 */ 5772 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); 5773 continue stateloop; 5774 } 5775 } 5776 // CPPONLY: MOZ_FALLTHROUGH; 5777 case DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED: 5778 for (;;) { 5779 if (++pos == endPos) { 5780 break stateloop; 5781 } 5782 c = checkChar(buf, pos); 5783 /* 5784 * Consume the next input character: 5785 */ 5786 switch (c) { 5787 case '\'': 5788 /* 5789 * U+0027 APOSTROPHE (') Switch to the after 5790 * DOCTYPE system identifier state. 5791 */ 5792 systemIdentifier = strBufToString(); 5793 state = transition(state, Tokenizer.AFTER_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos); 5794 continue stateloop; 5795 case '>': 5796 errGtInSystemId(); 5797 /* 5798 * Set the DOCTYPE token's force-quirks flag to 5799 * on. 5800 */ 5801 forceQuirks = true; 5802 /* 5803 * Emit that DOCTYPE token. 5804 */ 5805 systemIdentifier = strBufToString(); 5806 emitDoctypeToken(pos); 5807 /* 5808 * Switch to the data state. 5809 */ 5810 state = transition(state, Tokenizer.DATA, reconsume, pos); 5811 continue stateloop; 5812 case '\r': 5813 appendStrBufCarriageReturn(); 5814 break stateloop; 5815 case '\n': 5816 appendStrBufLineFeed(); 5817 continue; 5818 case '\u0000': 5819 c = '\uFFFD'; 5820 // CPPONLY: MOZ_FALLTHROUGH; 5821 default: 5822 /* 5823 * Anything else Append the current input 5824 * character to the current DOCTYPE token's 5825 * system identifier. 5826 */ 5827 appendStrBuf(c); 5828 /* 5829 * Stay in the DOCTYPE system identifier 5830 * (double-quoted) state. 
5831 */ 5832 continue; 5833 } 5834 } 5835 case DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED: 5836 for (;;) { 5837 if (++pos == endPos) { 5838 break stateloop; 5839 } 5840 c = checkChar(buf, pos); 5841 /* 5842 * Consume the next input character: 5843 */ 5844 switch (c) { 5845 case '\'': 5846 /* 5847 * U+0027 APOSTROPHE (') Switch to the after 5848 * DOCTYPE public identifier state. 5849 */ 5850 publicIdentifier = strBufToString(); 5851 state = transition(state, Tokenizer.AFTER_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos); 5852 continue stateloop; 5853 case '>': 5854 errGtInPublicId(); 5855 /* 5856 * Set the DOCTYPE token's force-quirks flag to 5857 * on. 5858 */ 5859 forceQuirks = true; 5860 /* 5861 * Emit that DOCTYPE token. 5862 */ 5863 publicIdentifier = strBufToString(); 5864 emitDoctypeToken(pos); 5865 /* 5866 * Switch to the data state. 5867 */ 5868 state = transition(state, Tokenizer.DATA, reconsume, pos); 5869 continue stateloop; 5870 case '\r': 5871 appendStrBufCarriageReturn(); 5872 break stateloop; 5873 case '\n': 5874 appendStrBufLineFeed(); 5875 continue; 5876 case '\u0000': 5877 c = '\uFFFD'; 5878 // CPPONLY: MOZ_FALLTHROUGH; 5879 default: 5880 /* 5881 * Anything else Append the current input 5882 * character to the current DOCTYPE token's 5883 * public identifier. 5884 */ 5885 appendStrBuf(c); 5886 /* 5887 * Stay in the DOCTYPE public identifier 5888 * (single-quoted) state. 
// HOTSPOT WORKAROUND INSERTION POINT

// [NOCPP[

/**
 * Returns the state the tokenizer should move to.
 *
 * This implementation ignores {@code from}, {@code reconsume} and
 * {@code pos} and simply hands back {@code to}.
 * NOTE(review): being protected, this looks like a hook that subclasses
 * can override to observe state transitions -- confirm against subclasses.
 *
 * @param from the state being left
 * @param to the state being entered
 * @param reconsume the caller's current reconsume flag
 * @param pos the caller's current buffer position
 * @return {@code to}, unchanged
 * @throws SAXException never thrown by this implementation
 */
protected int transition(int from, int to, boolean reconsume, int pos) throws SAXException {
    return to;
}

// ]NOCPP]

/**
 * Resets the per-DOCTYPE fields: clears the string buffer, sets the doctype
 * name to the empty string, releases and nulls both identifiers, and turns
 * the force-quirks flag off.
 */
private void initDoctypeFields() {
    // Discard the characters "DOCTYPE" accumulated as a potential bogus
    // comment into strBuf.
    clearStrBufAfterUse();
    doctypeName = "";
    if (systemIdentifier != null) {
        Portability.releaseString(systemIdentifier);
        systemIdentifier = null;
    }
    if (publicIdentifier != null) {
        Portability.releaseString(publicIdentifier);
        publicIdentifier = null;
    }
    forceQuirks = false;
}

/**
 * Records a carriage return (see {@link #silentCarriageReturn()}) and then
 * passes a line feed to {@code adjustDoubleHyphenAndAppendToStrBufAndErr}.
 * Note that '\n', not '\r', is what gets appended.
 *
 * @throws SAXException propagated from the append/err helper
 */
@Inline private void adjustDoubleHyphenAndAppendToStrBufCarriageReturn()
        throws SAXException {
    silentCarriageReturn();
    adjustDoubleHyphenAndAppendToStrBufAndErr('\n');
}

/**
 * Records a line feed (see {@link #silentLineFeed()}) and then passes a
 * line feed to {@code adjustDoubleHyphenAndAppendToStrBufAndErr}.
 *
 * @throws SAXException propagated from the append/err helper
 */
@Inline private void adjustDoubleHyphenAndAppendToStrBufLineFeed()
        throws SAXException {
    silentLineFeed();
    adjustDoubleHyphenAndAppendToStrBufAndErr('\n');
}

/**
 * Records a line feed and appends '\n' to the string buffer.
 */
@Inline private void appendStrBufLineFeed() {
    silentLineFeed();
    appendStrBuf('\n');
}

/**
 * Records a carriage return and appends '\n' (not '\r') to the string
 * buffer.
 */
@Inline private void appendStrBufCarriageReturn() {
    silentCarriageReturn();
    appendStrBuf('\n');
}

/**
 * Bookkeeping for a carriage return: advances the line counter and sets
 * {@code lastCR} so later code can tell that the previous character was a
 * CR. Emits nothing.
 */
@Inline protected void silentCarriageReturn() {
    ++line;
    lastCR = true;
}

/**
 * Bookkeeping for a line feed: advances the line counter. Emits nothing
 * and leaves {@code lastCR} untouched.
 */
@Inline protected void silentLineFeed() {
    ++line;
}

/**
 * Handles a carriage return in character data: records it, flushes the
 * pending characters via {@code flushChars(buf, pos)}, and sends the first
 * character of {@code Tokenizer.LF} to the token handler.
 *
 * @param buf the input buffer being tokenized
 * @param pos the position passed on to {@code flushChars}
 * @throws SAXException propagated from flushing or from the token handler
 */
private void emitCarriageReturn(@NoLength char[] buf, int pos)
        throws SAXException {
    silentCarriageReturn();
    flushChars(buf, pos);
    tokenHandler.characters(Tokenizer.LF, 0, 1);
    // NOTE(review): MAX_VALUE presumably acts as a "no pending run"
    // sentinel for cstart -- confirm against flushChars().
    cstart = Integer.MAX_VALUE;
}

/**
 * Flushes pending characters, asks the token handler for a
 * zero-originating replacement character, and restarts the pending
 * character run just after {@code pos}.
 *
 * @param buf the input buffer being tokenized
 * @param pos the position of the character being replaced
 * @throws SAXException propagated from flushing or from the token handler
 */
private void emitReplacementCharacter(@NoLength char[] buf, int pos)
        throws SAXException {
    flushChars(buf, pos);
    tokenHandler.zeroOriginatingReplacementCharacter();
    cstart = pos + 1;
}
/**
 * Flushes pending characters, sends the first character of
 * {@code REPLACEMENT_CHARACTER} to the token handler as ordinary character
 * data, and restarts the pending character run just after {@code pos}.
 *
 * @param buf the input buffer being tokenized
 * @param pos the position of the character being replaced
 * @throws SAXException propagated from flushing or from the token handler
 */
private void emitPlaintextReplacementCharacter(@NoLength char[] buf, int pos)
        throws SAXException {
    flushChars(buf, pos);
    tokenHandler.characters(REPLACEMENT_CHARACTER, 0, 1);
    cstart = pos + 1;
}

/**
 * Stores {@code add} in the {@code additional} field and, in the Java-only
 * build, snapshots the current location into {@code ampersandLocation}.
 *
 * @param add the value to store in {@code additional}
 */
private void setAdditionalAndRememberAmpersandLocation(char add) {
    additional = add;
    // [NOCPP[
    ampersandLocation = new LocatorImpl(this);
    // ]NOCPP]
}

/**
 * Reports a bogus DOCTYPE and turns the force-quirks flag on.
 *
 * @throws SAXException propagated from {@code errBogusDoctype()}
 */
private void bogusDoctype() throws SAXException {
    errBogusDoctype();
    forceQuirks = true;
}

/**
 * Reports a bogus DOCTYPE and turns the force-quirks flag off.
 *
 * @throws SAXException propagated from {@code errBogusDoctype()}
 */
private void bogusDoctypeWithoutQuirks() throws SAXException {
    errBogusDoctype();
    forceQuirks = false;
}

/**
 * Turns the numeric character reference accumulated in the {@code value}
 * field into output. Depending on {@code returnState}, the resulting
 * character(s) are either appended to the string buffer or emitted to the
 * token handler (see {@code emitOrAppendOne} / {@code emitOrAppendTwo}).
 *
 * Values up to 0xFFFF yield one UTF-16 code unit, values up to 0x10FFFF
 * yield a surrogate pair, and anything larger yields
 * {@code Tokenizer.REPLACEMENT_CHARACTER} after an out-of-range error.
 *
 * @param returnState the state the character reference was entered from
 * @throws SAXException propagated from the error hooks or the token handler
 */
private void handleNcrValue(int returnState) throws SAXException {
    /*
     * If one or more characters match the range, then take them all and
     * interpret the string of characters as a number (either hexadecimal or
     * decimal as appropriate).
     */
    if (value <= 0xFFFF) {
        if (value >= 0x80 && value <= 0x9f) {
            /*
             * If that number is one of the numbers in the first column of
             * the following table, then this is a parse error.
             */
            errNcrInC1Range();
            /*
             * Find the row with that number in the first column, and return
             * a character token for the Unicode character given in the
             * second column of that row.
             */
            @NoLength char[] val = NamedCharacters.WINDOWS_1252[value - 0x80];
            emitOrAppendOne(val, returnState);
            // [NOCPP[
        } else if (value == 0xC
                && contentSpacePolicy != XmlViolationPolicy.ALLOW) {
            // Form feed under a non-ALLOW policy: either substitute
            // Tokenizer.SPACE or raise a fatal error.
            if (contentSpacePolicy == XmlViolationPolicy.ALTER_INFOSET) {
                emitOrAppendOne(Tokenizer.SPACE, returnState);
            } else if (contentSpacePolicy == XmlViolationPolicy.FATAL) {
                fatal("A character reference expanded to a form feed which is not legal XML 1.0 white space.");
            }
            // ]NOCPP]
        } else if (value == 0x0) {
            errNcrZero();
            emitOrAppendOne(Tokenizer.REPLACEMENT_CHARACTER, returnState);
        } else if ((value & 0xF800) == 0xD800) {
            // The mask matches exactly 0xD800..0xDFFF, i.e. the UTF-16
            // surrogate code units.
            errNcrSurrogate();
            emitOrAppendOne(Tokenizer.REPLACEMENT_CHARACTER, returnState);
        } else {
            /*
             * Otherwise, return a character token for the Unicode character
             * whose code point is that number.
             */
            char ch = (char) value;
            // [NOCPP[
            if (value == 0x0D) {
                errNcrCr();
            } else if ((value <= 0x0008) || (value == 0x000B)
                    || (value >= 0x000E && value <= 0x001F)) {
                // The hook may substitute a different character.
                ch = errNcrControlChar(ch);
            } else if (value >= 0xFDD0 && value <= 0xFDEF) {
                errNcrUnassigned();
            } else if ((value & 0xFFFE) == 0xFFFE) {
                // Only 0xFFFE and 0xFFFF can match here, since value is at
                // most 0xFFFF in this branch.
                ch = errNcrNonCharacter(ch);
            } else if (value >= 0x007F && value <= 0x009F) {
                // 0x80..0x9F was already consumed by the first branch
                // above, so in practice only 0x7F reaches this arm.
                errNcrControlChar();
            } else {
                maybeWarnPrivateUse(ch);
            }
            // ]NOCPP]
            bmpChar[0] = ch;
            emitOrAppendOne(bmpChar, returnState);
        }
    } else if (value <= 0x10FFFF) {
        // [NOCPP[
        maybeWarnPrivateUseAstral();
        if ((value & 0xFFFE) == 0xFFFE) {
            errAstralNonCharacter(value);
        }
        // ]NOCPP]
        // Split the code point into two UTF-16 code units: the high bits
        // go into the first unit, the low ten bits into the second.
        // NOTE(review): LEAD_OFFSET is presumably 0xD800 - (0x10000 >> 10)
        // -- confirm at its declaration.
        astralChar[0] = (char) (Tokenizer.LEAD_OFFSET + (value >> 10));
        astralChar[1] = (char) (0xDC00 + (value & 0x3FF));
        emitOrAppendTwo(astralChar, returnState);
    } else {
        errNcrOutOfRange();
        emitOrAppendOne(Tokenizer.REPLACEMENT_CHARACTER, returnState);
    }
}
6185 */ 6186 break eofloop; 6187 case TAG_NAME: 6188 /* 6189 * EOF Parse error. 6190 */ 6191 errEofInTagName(); 6192 /* 6193 * Reconsume the EOF character in the data state. 6194 */ 6195 break eofloop; 6196 case BEFORE_ATTRIBUTE_NAME: 6197 case AFTER_ATTRIBUTE_VALUE_QUOTED: 6198 case SELF_CLOSING_START_TAG: 6199 /* EOF Parse error. */ 6200 errEofWithoutGt(); 6201 /* 6202 * Reconsume the EOF character in the data state. 6203 */ 6204 break eofloop; 6205 case ATTRIBUTE_NAME: 6206 /* 6207 * EOF Parse error. 6208 */ 6209 errEofInAttributeName(); 6210 /* 6211 * Reconsume the EOF character in the data state. 6212 */ 6213 break eofloop; 6214 case AFTER_ATTRIBUTE_NAME: 6215 case BEFORE_ATTRIBUTE_VALUE: 6216 /* EOF Parse error. */ 6217 errEofWithoutGt(); 6218 /* 6219 * Reconsume the EOF character in the data state. 6220 */ 6221 break eofloop; 6222 case ATTRIBUTE_VALUE_DOUBLE_QUOTED: 6223 case ATTRIBUTE_VALUE_SINGLE_QUOTED: 6224 case ATTRIBUTE_VALUE_UNQUOTED: 6225 /* EOF Parse error. */ 6226 errEofInAttributeValue(); 6227 /* 6228 * Reconsume the EOF character in the data state. 6229 */ 6230 break eofloop; 6231 case BOGUS_COMMENT: 6232 emitComment(0, 0); 6233 break eofloop; 6234 case BOGUS_COMMENT_HYPHEN: 6235 // [NOCPP[ 6236 maybeAppendSpaceToBogusComment(); 6237 // ]NOCPP] 6238 emitComment(0, 0); 6239 break eofloop; 6240 case MARKUP_DECLARATION_OPEN: 6241 errBogusComment(); 6242 emitComment(0, 0); 6243 break eofloop; 6244 case MARKUP_DECLARATION_HYPHEN: 6245 errBogusComment(); 6246 emitComment(0, 0); 6247 break eofloop; 6248 case MARKUP_DECLARATION_OCTYPE: 6249 if (index < 6) { 6250 errBogusComment(); 6251 emitComment(0, 0); 6252 } else { 6253 /* EOF Parse error. */ 6254 errEofInDoctype(); 6255 /* 6256 * Create a new DOCTYPE token. Set its force-quirks flag 6257 * to on. 
6258 */ 6259 doctypeName = ""; 6260 if (systemIdentifier != null) { 6261 Portability.releaseString(systemIdentifier); 6262 systemIdentifier = null; 6263 } 6264 if (publicIdentifier != null) { 6265 Portability.releaseString(publicIdentifier); 6266 publicIdentifier = null; 6267 } 6268 forceQuirks = true; 6269 /* 6270 * Emit the token. 6271 */ 6272 emitDoctypeToken(0); 6273 /* 6274 * Reconsume the EOF character in the data state. 6275 */ 6276 break eofloop; 6277 } 6278 break eofloop; 6279 case COMMENT_START: 6280 case COMMENT: 6281 /* 6282 * EOF Parse error. 6283 */ 6284 errEofInComment(); 6285 /* Emit the comment token. */ 6286 emitComment(0, 0); 6287 /* 6288 * Reconsume the EOF character in the data state. 6289 */ 6290 break eofloop; 6291 case COMMENT_END: 6292 errEofInComment(); 6293 /* Emit the comment token. */ 6294 emitComment(2, 0); 6295 /* 6296 * Reconsume the EOF character in the data state. 6297 */ 6298 break eofloop; 6299 case COMMENT_END_DASH: 6300 case COMMENT_START_DASH: 6301 errEofInComment(); 6302 /* Emit the comment token. */ 6303 emitComment(1, 0); 6304 /* 6305 * Reconsume the EOF character in the data state. 6306 */ 6307 break eofloop; 6308 case COMMENT_END_BANG: 6309 errEofInComment(); 6310 /* Emit the comment token. */ 6311 emitComment(3, 0); 6312 /* 6313 * Reconsume the EOF character in the data state. 6314 */ 6315 break eofloop; 6316 case DOCTYPE: 6317 case BEFORE_DOCTYPE_NAME: 6318 errEofInDoctype(); 6319 /* 6320 * Create a new DOCTYPE token. Set its force-quirks flag to 6321 * on. 6322 */ 6323 forceQuirks = true; 6324 /* 6325 * Emit the token. 6326 */ 6327 emitDoctypeToken(0); 6328 /* 6329 * Reconsume the EOF character in the data state. 6330 */ 6331 break eofloop; 6332 case DOCTYPE_NAME: 6333 errEofInDoctype(); 6334 strBufToDoctypeName(); 6335 /* 6336 * Set the DOCTYPE token's force-quirks flag to on. 6337 */ 6338 forceQuirks = true; 6339 /* 6340 * Emit that DOCTYPE token. 
6341 */ 6342 emitDoctypeToken(0); 6343 /* 6344 * Reconsume the EOF character in the data state. 6345 */ 6346 break eofloop; 6347 case DOCTYPE_UBLIC: 6348 case DOCTYPE_YSTEM: 6349 case AFTER_DOCTYPE_NAME: 6350 case AFTER_DOCTYPE_PUBLIC_KEYWORD: 6351 case AFTER_DOCTYPE_SYSTEM_KEYWORD: 6352 case BEFORE_DOCTYPE_PUBLIC_IDENTIFIER: 6353 errEofInDoctype(); 6354 /* 6355 * Set the DOCTYPE token's force-quirks flag to on. 6356 */ 6357 forceQuirks = true; 6358 /* 6359 * Emit that DOCTYPE token. 6360 */ 6361 emitDoctypeToken(0); 6362 /* 6363 * Reconsume the EOF character in the data state. 6364 */ 6365 break eofloop; 6366 case DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED: 6367 case DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED: 6368 /* EOF Parse error. */ 6369 errEofInPublicId(); 6370 /* 6371 * Set the DOCTYPE token's force-quirks flag to on. 6372 */ 6373 forceQuirks = true; 6374 /* 6375 * Emit that DOCTYPE token. 6376 */ 6377 publicIdentifier = strBufToString(); 6378 emitDoctypeToken(0); 6379 /* 6380 * Reconsume the EOF character in the data state. 6381 */ 6382 break eofloop; 6383 case AFTER_DOCTYPE_PUBLIC_IDENTIFIER: 6384 case BEFORE_DOCTYPE_SYSTEM_IDENTIFIER: 6385 case BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS: 6386 errEofInDoctype(); 6387 /* 6388 * Set the DOCTYPE token's force-quirks flag to on. 6389 */ 6390 forceQuirks = true; 6391 /* 6392 * Emit that DOCTYPE token. 6393 */ 6394 emitDoctypeToken(0); 6395 /* 6396 * Reconsume the EOF character in the data state. 6397 */ 6398 break eofloop; 6399 case DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED: 6400 case DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED: 6401 /* EOF Parse error. */ 6402 errEofInSystemId(); 6403 /* 6404 * Set the DOCTYPE token's force-quirks flag to on. 6405 */ 6406 forceQuirks = true; 6407 /* 6408 * Emit that DOCTYPE token. 6409 */ 6410 systemIdentifier = strBufToString(); 6411 emitDoctypeToken(0); 6412 /* 6413 * Reconsume the EOF character in the data state. 
6414 */ 6415 break eofloop; 6416 case AFTER_DOCTYPE_SYSTEM_IDENTIFIER: 6417 errEofInDoctype(); 6418 /* 6419 * Set the DOCTYPE token's force-quirks flag to on. 6420 */ 6421 forceQuirks = true; 6422 /* 6423 * Emit that DOCTYPE token. 6424 */ 6425 emitDoctypeToken(0); 6426 /* 6427 * Reconsume the EOF character in the data state. 6428 */ 6429 break eofloop; 6430 case BOGUS_DOCTYPE: 6431 /* 6432 * Emit that DOCTYPE token. 6433 */ 6434 emitDoctypeToken(0); 6435 /* 6436 * Reconsume the EOF character in the data state. 6437 */ 6438 break eofloop; 6439 case CONSUME_CHARACTER_REFERENCE: 6440 /* 6441 * Unlike the definition is the spec, this state does not 6442 * return a value and never requires the caller to 6443 * backtrack. This state takes care of emitting characters 6444 * or appending to the current attribute value. It also 6445 * takes care of that in the case when consuming the entity 6446 * fails. 6447 */ 6448 /* 6449 * This section defines how to consume an entity. This 6450 * definition is used when parsing entities in text and in 6451 * attributes. 6452 * 6453 * The behavior depends on the identity of the next 6454 * character (the one immediately after the U+0026 AMPERSAND 6455 * character): 6456 */ 6457 6458 emitOrAppendCharRefBuf(returnState); 6459 state = returnState; 6460 continue; 6461 case CHARACTER_REFERENCE_HILO_LOOKUP: 6462 errNoNamedCharacterMatch(); 6463 emitOrAppendCharRefBuf(returnState); 6464 state = returnState; 6465 continue; 6466 case CHARACTER_REFERENCE_TAIL: 6467 outer: for (;;) { 6468 char c = '\u0000'; 6469 entCol++; 6470 /* 6471 * Consume the maximum number of characters possible, 6472 * with the consumed characters matching one of the 6473 * identifiers in the first column of the named 6474 * character references table (in a case-sensitive 6475 * manner). 
6476 */ 6477 hiloop: for (;;) { 6478 if (hi == -1) { 6479 break hiloop; 6480 } 6481 if (entCol == NamedCharacters.NAMES[hi].length()) { 6482 break hiloop; 6483 } 6484 if (entCol > NamedCharacters.NAMES[hi].length()) { 6485 break outer; 6486 } else if (c < NamedCharacters.NAMES[hi].charAt(entCol)) { 6487 hi--; 6488 } else { 6489 break hiloop; 6490 } 6491 } 6492 6493 loloop: for (;;) { 6494 if (hi < lo) { 6495 break outer; 6496 } 6497 if (entCol == NamedCharacters.NAMES[lo].length()) { 6498 candidate = lo; 6499 charRefBufMark = charRefBufLen; 6500 lo++; 6501 } else if (entCol > NamedCharacters.NAMES[lo].length()) { 6502 break outer; 6503 } else if (c > NamedCharacters.NAMES[lo].charAt(entCol)) { 6504 lo++; 6505 } else { 6506 break loloop; 6507 } 6508 } 6509 if (hi < lo) { 6510 break outer; 6511 } 6512 continue; 6513 } 6514 6515 if (candidate == -1) { 6516 /* 6517 * If no match can be made, then this is a parse error. 6518 */ 6519 errNoNamedCharacterMatch(); 6520 emitOrAppendCharRefBuf(returnState); 6521 state = returnState; 6522 continue eofloop; 6523 } else { 6524 @Const @CharacterName String candidateName = NamedCharacters.NAMES[candidate]; 6525 if (candidateName.length() == 0 6526 || candidateName.charAt(candidateName.length() - 1) != ';') { 6527 /* 6528 * If the last character matched is not a U+003B 6529 * SEMICOLON (;), there is a parse error. 
6530 */ 6531 if ((returnState & DATA_AND_RCDATA_MASK) != 0) { 6532 /* 6533 * If the entity is being consumed as part of an 6534 * attribute, and the last character matched is 6535 * not a U+003B SEMICOLON (;), 6536 */ 6537 char ch; 6538 if (charRefBufMark == charRefBufLen) { 6539 ch = '\u0000'; 6540 } else { 6541 ch = charRefBuf[charRefBufMark]; 6542 } 6543 if ((ch >= '0' && ch <= '9') 6544 || (ch >= 'A' && ch <= 'Z') 6545 || (ch >= 'a' && ch <= 'z')) { 6546 /* 6547 * and the next character is in the range 6548 * U+0030 DIGIT ZERO to U+0039 DIGIT NINE, 6549 * U+0041 LATIN CAPITAL LETTER A to U+005A 6550 * LATIN CAPITAL LETTER Z, or U+0061 LATIN 6551 * SMALL LETTER A to U+007A LATIN SMALL 6552 * LETTER Z, then, for historical reasons, 6553 * all the characters that were matched 6554 * after the U+0026 AMPERSAND (&) must be 6555 * unconsumed, and nothing is returned. 6556 */ 6557 errNoNamedCharacterMatch(); 6558 appendCharRefBufToStrBuf(); 6559 state = returnState; 6560 continue eofloop; 6561 } 6562 } 6563 if ((returnState & DATA_AND_RCDATA_MASK) != 0) { 6564 errUnescapedAmpersandInterpretedAsCharacterReference(); 6565 } else { 6566 errNotSemicolonTerminated(); 6567 } 6568 } 6569 6570 /* 6571 * Otherwise, return a character token for the character 6572 * corresponding to the entity name (as given by the 6573 * second column of the named character references 6574 * table). 6575 */ 6576 @Const @NoLength char[] val = NamedCharacters.VALUES[candidate]; 6577 if ( 6578 // [NOCPP[ 6579 val.length == 1 6580 // ]NOCPP] 6581 // CPPONLY: val[1] == 0 6582 ) { 6583 emitOrAppendOne(val, returnState); 6584 } else { 6585 emitOrAppendTwo(val, returnState); 6586 } 6587 // this is so complicated! 
6588 if (charRefBufMark < charRefBufLen) { 6589 if ((returnState & DATA_AND_RCDATA_MASK) != 0) { 6590 appendStrBuf(charRefBuf, charRefBufMark, 6591 charRefBufLen - charRefBufMark); 6592 } else { 6593 tokenHandler.characters(charRefBuf, charRefBufMark, 6594 charRefBufLen - charRefBufMark); 6595 } 6596 } 6597 charRefBufLen = 0; 6598 state = returnState; 6599 continue eofloop; 6600 /* 6601 * If the markup contains I'm ¬it; I tell you, the 6602 * entity is parsed as "not", as in, I'm ¬it; I tell 6603 * you. But if the markup was I'm ∉ I tell you, 6604 * the entity would be parsed as "notin;", resulting in 6605 * I'm ∉ I tell you. 6606 */ 6607 } 6608 case CONSUME_NCR: 6609 case DECIMAL_NRC_LOOP: 6610 case HEX_NCR_LOOP: 6611 /* 6612 * If no characters match the range, then don't consume any 6613 * characters (and unconsume the U+0023 NUMBER SIGN 6614 * character and, if appropriate, the X character). This is 6615 * a parse error; nothing is returned. 6616 * 6617 * Otherwise, if the next character is a U+003B SEMICOLON, 6618 * consume that too. If it isn't, there is a parse error. 6619 */ 6620 if (!seenDigits) { 6621 errNoDigitsInNCR(); 6622 emitOrAppendCharRefBuf(returnState); 6623 state = returnState; 6624 continue; 6625 } else { 6626 errCharRefLacksSemicolon(); 6627 } 6628 // WARNING previous state sets reconsume 6629 handleNcrValue(returnState); 6630 state = returnState; 6631 continue; 6632 case CDATA_RSQB: 6633 tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 1); 6634 break eofloop; 6635 case CDATA_RSQB_RSQB: 6636 tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 2); 6637 break eofloop; 6638 case DATA: 6639 default: 6640 break eofloop; 6641 } 6642 } 6643 // case DATA: 6644 /* 6645 * EOF Emit an end-of-file token. 
/**
 * Delivers the accumulated DOCTYPE (name, public identifier, system
 * identifier, force-quirks flag) to the token handler, restarts the
 * pending character run just after {@code pos}, and then releases and
 * nulls the three DOCTYPE string fields.
 *
 * @param pos the buffer position of the character that ended the DOCTYPE;
 *            callers in the EOF handler pass 0
 * @throws SAXException propagated from the token handler
 */
private void emitDoctypeToken(int pos) throws SAXException {
    cstart = pos + 1;
    tokenHandler.doctype(doctypeName, publicIdentifier, systemIdentifier,
            forceQuirks);
    // It is OK and sufficient to release these here, since
    // there's no way out of the doctype states than through paths
    // that call this method.
    doctypeName = null;
    Portability.releaseString(publicIdentifier);
    publicIdentifier = null;
    Portability.releaseString(systemIdentifier);
    systemIdentifier = null;
}

/**
 * Reads the character at {@code pos}. This implementation performs no
 * checking and just returns {@code buf[pos]}.
 * NOTE(review): the protected visibility and the {@code throws} clause
 * suggest subclasses override this to validate characters -- confirm.
 *
 * @param buf the input buffer
 * @param pos the index to read
 * @return {@code buf[pos]}
 * @throws SAXException never thrown by this implementation
 */
@Inline protected char checkChar(@NoLength char[] buf, int pos)
        throws SAXException {
    return buf[pos];
}

/**
 * Forwards an internal encoding declaration to the registered
 * {@code encodingDeclarationHandler}.
 *
 * @param internalCharset the declared charset name
 * @return the handler's result, or {@code false} when no handler is set
 * @throws SAXException propagated from the handler
 */
public boolean internalEncodingDeclaration(String internalCharset)
        throws SAXException {
    if (encodingDeclarationHandler != null) {
        return encodingDeclarationHandler.internalEncodingDeclaration(internalCharset);
    }
    return false;
}

/**
 * Outputs the first two code units of {@code val}: appended to the string
 * buffer when {@code returnState & DATA_AND_RCDATA_MASK} is non-zero,
 * otherwise emitted to the token handler as character data.
 *
 * @param val array holding at least two code units
 * @param returnState the state the character reference was entered from
 * @throws SAXException propagated from the token handler
 */
private void emitOrAppendTwo(@Const @NoLength char[] val, int returnState)
        throws SAXException {
    if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
        appendStrBuf(val[0]);
        appendStrBuf(val[1]);
    } else {
        tokenHandler.characters(val, 0, 2);
    }
}

/**
 * Outputs the first code unit of {@code val}: appended to the string
 * buffer when {@code returnState & DATA_AND_RCDATA_MASK} is non-zero,
 * otherwise emitted to the token handler as character data.
 *
 * @param val array holding at least one code unit
 * @param returnState the state the character reference was entered from
 * @throws SAXException propagated from the token handler
 */
private void emitOrAppendOne(@Const @NoLength char[] val, int returnState)
        throws SAXException {
    if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
        appendStrBuf(val[0]);
    } else {
        tokenHandler.characters(val, 0, 1);
    }
}

/**
 * Tears down tokenizer state: drops the string buffer, DOCTYPE fields, tag
 * and attribute names, notifies the token handler via
 * {@code endTokenization()}, and finally discards the attribute holder.
 *
 * @throws SAXException propagated from the token handler
 */
public void end() throws SAXException {
    strBuf = null;
    doctypeName = null;
    if (systemIdentifier != null) {
        Portability.releaseString(systemIdentifier);
        systemIdentifier = null;
    }
    if (publicIdentifier != null) {
        Portability.releaseString(publicIdentifier);
        publicIdentifier = null;
    }
    tagName = null;
    nonInternedTagName.setNameForNonInterned(null
    // CPPONLY: , false
    );
    attributeName = null;
    // CPPONLY: nonInternedAttributeName.setNameForNonInterned(null);
    // The handler is notified before the attribute holder is dropped.
    tokenHandler.endTokenization();
    if (attributes != null) {
        // [NOCPP[
        attributes = null;
        // ]NOCPP]
        // CPPONLY: attributes.clear(mappingLangToXmlLang);
    }
}

/**
 * Sets the {@code shouldSuspend} flag.
 */
public void requestSuspension() {
    shouldSuspend = true;
}

// [NOCPP[

/**
 * Sets the {@code confident} flag.
 */
public void becomeConfident() {
    confident = true;
}

/**
 * Reports whether the next character starts a new line.
 *
 * This implementation does not track that and always returns
 * {@code false}.
 *
 * @return always {@code false} here
 */
public boolean isNextCharOnNewLine() {
    return false;
}

/**
 * Reports whether the previously processed character was a carriage
 * return.
 *
 * @return the current value of {@code lastCR}
 */
public boolean isPrevCR() {
    return lastCR;
}

/**
 * Returns the line.
 *
 * This implementation does not expose a line number and always returns
 * {@code -1}.
 *
 * @return always {@code -1} here
 */
public int getLine() {
    return -1;
}
/**
 * Returns the col.
 *
 * This implementation does not expose a column number and always returns
 * {@code -1}.
 *
 * @return always {@code -1} here
 */
public int getCol() {
    return -1;
}

// ]NOCPP]

/**
 * Reports whether the saved tokenizer state is the data state.
 *
 * @return {@code true} when {@code stateSave == DATA}
 */
public boolean isInDataState() {
    return (stateSave == DATA);
}

/**
 * Puts the tokenizer back into the data state and clears all per-token
 * scratch fields (buffers, character-reference bookkeeping, DOCTYPE
 * fields, tag/attribute names). The line counter is deliberately left
 * alone. When {@code newAttributesEachTime} is set, any existing attribute
 * holder is deleted and nulled.
 */
public void resetToDataState() {
    clearStrBufAfterUse();
    charRefBufLen = 0;
    stateSave = Tokenizer.DATA;
    // line = 1; XXX line numbers
    lastCR = false;
    index = 0;
    forceQuirks = false;
    additional = '\u0000';
    entCol = -1;
    firstCharKey = -1;
    lo = 0;
    hi = 0; // will always be overwritten before use anyway
    candidate = -1;
    charRefBufMark = 0;
    value = 0;
    seenDigits = false;
    endTag = false;
    shouldSuspend = false;
    initDoctypeFields();
    containsHyphen = false;
    tagName = null;
    attributeName = null;
    if (newAttributesEachTime) {
        if (attributes != null) {
            Portability.delete(attributes);
            attributes = null;
        }
    }
}

/**
 * Copies the tokenizer state of {@code other} into this instance:
 * buffers, saved states, character-reference bookkeeping, DOCTYPE fields,
 * tag/attribute names and attributes. {@code shouldSuspend} is not copied;
 * it is reset to {@code false}. The line counter is not copied.
 *
 * NOTE(review): {@code strBuf.length} is read without a null check, while
 * {@link #initializeWithoutStarting()} sets {@code strBuf} to null. This
 * assumes the buffer is allocated somewhere before this method runs --
 * confirm. Likewise the {@code charRefBuf} copy assumes this instance's
 * array is at least {@code other.charRefBufLen} long.
 *
 * @param other the tokenizer whose state is copied
 * @throws SAXException declared by the signature; presumably raised by the
 *         attribute cloning call -- confirm
 */
public void loadState(Tokenizer other) throws SAXException {
    strBufLen = other.strBufLen;
    // Grow our buffer only when the incoming content does not fit.
    if (strBufLen > strBuf.length) {
        strBuf = new char[strBufLen];
    }
    System.arraycopy(other.strBuf, 0, strBuf, 0, strBufLen);

    charRefBufLen = other.charRefBufLen;
    System.arraycopy(other.charRefBuf, 0, charRefBuf, 0, charRefBufLen);

    stateSave = other.stateSave;
    returnStateSave = other.returnStateSave;
    endTagExpectation = other.endTagExpectation;
    endTagExpectationAsArray = other.endTagExpectationAsArray;
    // line = 1; XXX line numbers
    lastCR = other.lastCR;
    index = other.index;
    forceQuirks = other.forceQuirks;
    additional = other.additional;
    entCol = other.entCol;
    firstCharKey = other.firstCharKey;
    lo = other.lo;
    hi = other.hi;
    candidate = other.candidate;
    charRefBufMark = other.charRefBufMark;
    value = other.value;
    seenDigits = other.seenDigits;
    endTag = other.endTag;
    shouldSuspend = false;

    if (other.doctypeName == null) {
        doctypeName = null;
    } else {
        doctypeName = Portability.newLocalFromLocal(other.doctypeName,
                interner);
    }

    // Release our own identifier before taking a copy of the other's.
    Portability.releaseString(systemIdentifier);
    if (other.systemIdentifier == null) {
        systemIdentifier = null;
    } else {
        systemIdentifier = Portability.newStringFromString(other.systemIdentifier);
    }

    Portability.releaseString(publicIdentifier);
    if (other.publicIdentifier == null) {
        publicIdentifier = null;
    } else {
        publicIdentifier = Portability.newStringFromString(other.publicIdentifier);
    }

    containsHyphen = other.containsHyphen;
    if (other.tagName == null) {
        tagName = null;
    } else if (other.tagName.isInterned()) {
        tagName = other.tagName;
    } else {
        // In the C++ case, the atoms in the other tokenizer are from a
        // different tokenizer-scoped atom table. Therefore, we have to
        // obtain the corresponding atom from our own atom table.
        nonInternedTagName.setNameForNonInterned(Portability.newLocalFromLocal(other.tagName.getName(), interner)
        // CPPONLY: , other.tagName.isCustom()
        );
        tagName = nonInternedTagName;
    }

    // [NOCPP[
    attributeName = other.attributeName;
    // ]NOCPP]
    // CPPONLY: if (other.attributeName == null) {
    // CPPONLY:     attributeName = null;
    // CPPONLY: } else if (other.attributeName.isInterned()) {
    // CPPONLY:     attributeName = other.attributeName;
    // CPPONLY: } else {
    // CPPONLY:     // In the C++ case, the atoms in the other tokenizer are from a
    // CPPONLY:     // different tokenizer-scoped atom table. Therefore, we have to
    // CPPONLY:     // obtain the correspoding atom from our own atom table.
    // CPPONLY:     nonInternedAttributeName.setNameForNonInterned(Portability.newLocalFromLocal(other.attributeName.getLocal(AttributeName.HTML), interner));
    // CPPONLY:     attributeName = nonInternedAttributeName;
    // CPPONLY: }

    Portability.delete(attributes);
    if (other.attributes == null) {
        attributes = null;
    } else {
        attributes = other.attributes.cloneAttributes(interner);
    }
}

/**
 * Prepares the tokenizer for a new run without starting tokenization:
 * clears the confidence flag, drops the string buffer, resets the line
 * counter to 1, refreshes the Java-only settings (including asking the
 * token handler whether it wants comments), and finishes with
 * {@link #resetToDataState()}.
 *
 * When {@code newAttributesEachTime} is off, a single attribute holder is
 * allocated here.
 *
 * @throws SAXException declared by the signature
 */
public void initializeWithoutStarting() throws SAXException {
    confident = false;
    strBuf = null;
    line = 1;
    // CPPONLY: attributeLine = 1;
    // [NOCPP[
    html4 = false;
    metaBoundaryPassed = false;
    wantsComments = tokenHandler.wantsComments();
    if (!newAttributesEachTime) {
        attributes = new HtmlAttributes(mappingLangToXmlLang);
    }
    // ]NOCPP]
    resetToDataState();
}

// The err* methods below are error-reporting hooks with empty bodies in
// this class. NOTE(review): they are protected, so presumably a subclass
// overrides them to produce actual diagnostics -- confirm.

protected void errGarbageAfterLtSlash() throws SAXException {
}

protected void errLtSlashGt() throws SAXException {
}

protected void errWarnLtSlashInRcdata() throws SAXException {
}

protected void errHtml4LtSlashInRcdata(char folded) throws SAXException {
}

// Called by the EOF handler when a numeric character reference had digits
// but the input ended before a semicolon.
protected void errCharRefLacksSemicolon() throws SAXException {
}

// Called by the EOF handler when a numeric character reference had no
// digits at all.
protected void errNoDigitsInNCR() throws SAXException {
}

// Called when '>' is seen inside a quoted DOCTYPE system identifier.
protected void errGtInSystemId() throws SAXException {
}

// Called when '>' is seen inside a quoted DOCTYPE public identifier.
protected void errGtInPublicId() throws SAXException {
}

protected void errNamelessDoctype() throws SAXException {
}

protected void errConsecutiveHyphens() throws SAXException {
}

protected void errPrematureEndOfComment() throws SAXException {
}
errBogusComment()6943 protected void errBogusComment() throws SAXException { 6944 } 6945 errUnquotedAttributeValOrNull(char c)6946 protected void errUnquotedAttributeValOrNull(char c) throws SAXException { 6947 } 6948 errSlashNotFollowedByGt()6949 protected void errSlashNotFollowedByGt() throws SAXException { 6950 } 6951 errHtml4XmlVoidSyntax()6952 protected void errHtml4XmlVoidSyntax() throws SAXException { 6953 } 6954 errNoSpaceBetweenAttributes()6955 protected void errNoSpaceBetweenAttributes() throws SAXException { 6956 } 6957 errHtml4NonNameInUnquotedAttribute(char c)6958 protected void errHtml4NonNameInUnquotedAttribute(char c) 6959 throws SAXException { 6960 } 6961 errLtOrEqualsOrGraveInUnquotedAttributeOrNull(char c)6962 protected void errLtOrEqualsOrGraveInUnquotedAttributeOrNull(char c) 6963 throws SAXException { 6964 } 6965 errAttributeValueMissing()6966 protected void errAttributeValueMissing() throws SAXException { 6967 } 6968 errBadCharBeforeAttributeNameOrNull(char c)6969 protected void errBadCharBeforeAttributeNameOrNull(char c) 6970 throws SAXException { 6971 } 6972 errEqualsSignBeforeAttributeName()6973 protected void errEqualsSignBeforeAttributeName() throws SAXException { 6974 } 6975 errBadCharAfterLt(char c)6976 protected void errBadCharAfterLt(char c) throws SAXException { 6977 } 6978 errLtGt()6979 protected void errLtGt() throws SAXException { 6980 } 6981 errProcessingInstruction()6982 protected void errProcessingInstruction() throws SAXException { 6983 } 6984 errUnescapedAmpersandInterpretedAsCharacterReference()6985 protected void errUnescapedAmpersandInterpretedAsCharacterReference() 6986 throws SAXException { 6987 } 6988 errNotSemicolonTerminated()6989 protected void errNotSemicolonTerminated() throws SAXException { 6990 } 6991 errNoNamedCharacterMatch()6992 protected void errNoNamedCharacterMatch() throws SAXException { 6993 } 6994 errQuoteBeforeAttributeName(char c)6995 protected void errQuoteBeforeAttributeName(char c) throws 
SAXException { 6996 } 6997 errQuoteOrLtInAttributeNameOrNull(char c)6998 protected void errQuoteOrLtInAttributeNameOrNull(char c) 6999 throws SAXException { 7000 } 7001 errExpectedPublicId()7002 protected void errExpectedPublicId() throws SAXException { 7003 } 7004 errBogusDoctype()7005 protected void errBogusDoctype() throws SAXException { 7006 } 7007 maybeWarnPrivateUseAstral()7008 protected void maybeWarnPrivateUseAstral() throws SAXException { 7009 } 7010 maybeWarnPrivateUse(char ch)7011 protected void maybeWarnPrivateUse(char ch) throws SAXException { 7012 } 7013 maybeErrAttributesOnEndTag(HtmlAttributes attrs)7014 protected void maybeErrAttributesOnEndTag(HtmlAttributes attrs) 7015 throws SAXException { 7016 } 7017 maybeErrSlashInEndTag(boolean selfClosing)7018 protected void maybeErrSlashInEndTag(boolean selfClosing) 7019 throws SAXException { 7020 } 7021 errNcrNonCharacter(char ch)7022 protected char errNcrNonCharacter(char ch) throws SAXException { 7023 return ch; 7024 } 7025 errAstralNonCharacter(int ch)7026 protected void errAstralNonCharacter(int ch) throws SAXException { 7027 } 7028 errNcrSurrogate()7029 protected void errNcrSurrogate() throws SAXException { 7030 } 7031 errNcrControlChar(char ch)7032 protected char errNcrControlChar(char ch) throws SAXException { 7033 return ch; 7034 } 7035 errNcrCr()7036 protected void errNcrCr() throws SAXException { 7037 } 7038 errNcrInC1Range()7039 protected void errNcrInC1Range() throws SAXException { 7040 } 7041 errEofInPublicId()7042 protected void errEofInPublicId() throws SAXException { 7043 } 7044 errEofInComment()7045 protected void errEofInComment() throws SAXException { 7046 } 7047 errEofInDoctype()7048 protected void errEofInDoctype() throws SAXException { 7049 } 7050 errEofInAttributeValue()7051 protected void errEofInAttributeValue() throws SAXException { 7052 } 7053 errEofInAttributeName()7054 protected void errEofInAttributeName() throws SAXException { 7055 } 7056 errEofWithoutGt()7057 protected void 
errEofWithoutGt() throws SAXException { 7058 } 7059 errEofInTagName()7060 protected void errEofInTagName() throws SAXException { 7061 } 7062 errEofInEndTag()7063 protected void errEofInEndTag() throws SAXException { 7064 } 7065 errEofAfterLt()7066 protected void errEofAfterLt() throws SAXException { 7067 } 7068 errNcrOutOfRange()7069 protected void errNcrOutOfRange() throws SAXException { 7070 } 7071 errNcrUnassigned()7072 protected void errNcrUnassigned() throws SAXException { 7073 } 7074 errDuplicateAttribute()7075 protected void errDuplicateAttribute() throws SAXException { 7076 } 7077 errEofInSystemId()7078 protected void errEofInSystemId() throws SAXException { 7079 } 7080 errExpectedSystemId()7081 protected void errExpectedSystemId() throws SAXException { 7082 } 7083 errMissingSpaceBeforeDoctypeName()7084 protected void errMissingSpaceBeforeDoctypeName() throws SAXException { 7085 } 7086 errHyphenHyphenBang()7087 protected void errHyphenHyphenBang() throws SAXException { 7088 } 7089 errNcrControlChar()7090 protected void errNcrControlChar() throws SAXException { 7091 } 7092 errNcrZero()7093 protected void errNcrZero() throws SAXException { 7094 } 7095 errNoSpaceBetweenDoctypeSystemKeywordAndQuote()7096 protected void errNoSpaceBetweenDoctypeSystemKeywordAndQuote() 7097 throws SAXException { 7098 } 7099 errNoSpaceBetweenPublicAndSystemIds()7100 protected void errNoSpaceBetweenPublicAndSystemIds() throws SAXException { 7101 } 7102 errNoSpaceBetweenDoctypePublicKeywordAndQuote()7103 protected void errNoSpaceBetweenDoctypePublicKeywordAndQuote() 7104 throws SAXException { 7105 } 7106 noteAttributeWithoutValue()7107 protected void noteAttributeWithoutValue() throws SAXException { 7108 } 7109 noteUnquotedAttributeValue()7110 protected void noteUnquotedAttributeValue() throws SAXException { 7111 } 7112 7113 /** 7114 * Sets the encodingDeclarationHandler. 
7115 * 7116 * @param encodingDeclarationHandler 7117 * the encodingDeclarationHandler to set 7118 */ setEncodingDeclarationHandler( EncodingDeclarationHandler encodingDeclarationHandler)7119 public void setEncodingDeclarationHandler( 7120 EncodingDeclarationHandler encodingDeclarationHandler) { 7121 this.encodingDeclarationHandler = encodingDeclarationHandler; 7122 } 7123 destructor()7124 void destructor() { 7125 Portability.delete(nonInternedTagName); 7126 // CPPONLY: Portability.delete(nonInternedAttributeName); 7127 nonInternedTagName = null; 7128 // The translator will write refcount tracing stuff here 7129 Portability.delete(attributes); 7130 attributes = null; 7131 } 7132 7133 // [NOCPP[ 7134 7135 /** 7136 * Sets an offset to be added to the position reported to 7137 * <code>TransitionHandler</code>. 7138 * 7139 * @param offset the offset 7140 */ setTransitionBaseOffset(int offset)7141 public void setTransitionBaseOffset(int offset) { 7142 7143 } 7144 7145 // ]NOCPP] 7146 7147 } 7148