/*
 * Copyright (c) 2005-2007 Henri Sivonen
 * Copyright (c) 2007-2015 Mozilla Foundation
 * Portions of comments Copyright 2004-2010 Apple Computer, Inc., Mozilla
 * Foundation, and Opera Software ASA.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

/*
 * The comments following this one that use the same comment syntax as this
 * comment are quotes from the WHATWG HTML 5 spec as of 2 June 2007
 * amended as of June 18 2008 and May 31 2010.
 * That document came with this statement:
 * "© Copyright 2004-2010 Apple Computer, Inc., Mozilla Foundation, and
 * Opera Software ASA. You are granted a license to use, reproduce and
 * create derivative works of this document."
 */

package nu.validator.htmlparser.impl;

import org.xml.sax.ErrorHandler;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;

import nu.validator.htmlparser.annotation.Auto;
import nu.validator.htmlparser.annotation.CharacterName;
import nu.validator.htmlparser.annotation.Const;
import nu.validator.htmlparser.annotation.Inline;
import nu.validator.htmlparser.annotation.Local;
import nu.validator.htmlparser.annotation.NoLength;
import nu.validator.htmlparser.common.EncodingDeclarationHandler;
import nu.validator.htmlparser.common.Interner;
import nu.validator.htmlparser.common.TokenHandler;
import nu.validator.htmlparser.common.XmlViolationPolicy;

/**
 * An implementation of
 * https://html.spec.whatwg.org/multipage/syntax.html#tokenization
 *
 * This class implements the <code>Locator</code> interface. This is not an
 * incidental implementation detail: Users of this class are encouraged to make
 * use of the <code>Locator</code> nature.
 *
 * By default, the tokenizer may report data that XML 1.0 bans. The tokenizer
 * can be configured to treat these conditions as fatal or to coerce the infoset
 * to something that XML 1.0 allows.
65 * 66 * @version $Id$ 67 * @author hsivonen 68 */ 69 public class Tokenizer implements Locator { 70 71 private static final int DATA_AND_RCDATA_MASK = ~1; 72 73 public static final int DATA = 0; 74 75 public static final int RCDATA = 1; 76 77 public static final int SCRIPT_DATA = 2; 78 79 public static final int RAWTEXT = 3; 80 81 public static final int SCRIPT_DATA_ESCAPED = 4; 82 83 public static final int ATTRIBUTE_VALUE_DOUBLE_QUOTED = 5; 84 85 public static final int ATTRIBUTE_VALUE_SINGLE_QUOTED = 6; 86 87 public static final int ATTRIBUTE_VALUE_UNQUOTED = 7; 88 89 public static final int PLAINTEXT = 8; 90 91 public static final int TAG_OPEN = 9; 92 93 public static final int CLOSE_TAG_OPEN = 10; 94 95 public static final int TAG_NAME = 11; 96 97 public static final int BEFORE_ATTRIBUTE_NAME = 12; 98 99 public static final int ATTRIBUTE_NAME = 13; 100 101 public static final int AFTER_ATTRIBUTE_NAME = 14; 102 103 public static final int BEFORE_ATTRIBUTE_VALUE = 15; 104 105 public static final int AFTER_ATTRIBUTE_VALUE_QUOTED = 16; 106 107 public static final int BOGUS_COMMENT = 17; 108 109 public static final int MARKUP_DECLARATION_OPEN = 18; 110 111 public static final int DOCTYPE = 19; 112 113 public static final int BEFORE_DOCTYPE_NAME = 20; 114 115 public static final int DOCTYPE_NAME = 21; 116 117 public static final int AFTER_DOCTYPE_NAME = 22; 118 119 public static final int BEFORE_DOCTYPE_PUBLIC_IDENTIFIER = 23; 120 121 public static final int DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED = 24; 122 123 public static final int DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED = 25; 124 125 public static final int AFTER_DOCTYPE_PUBLIC_IDENTIFIER = 26; 126 127 public static final int BEFORE_DOCTYPE_SYSTEM_IDENTIFIER = 27; 128 129 public static final int DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED = 28; 130 131 public static final int DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED = 29; 132 133 public static final int AFTER_DOCTYPE_SYSTEM_IDENTIFIER = 30; 134 135 public static 
final int BOGUS_DOCTYPE = 31; 136 137 public static final int COMMENT_START = 32; 138 139 public static final int COMMENT_START_DASH = 33; 140 141 public static final int COMMENT = 34; 142 143 public static final int COMMENT_END_DASH = 35; 144 145 public static final int COMMENT_END = 36; 146 147 public static final int COMMENT_END_BANG = 37; 148 149 public static final int NON_DATA_END_TAG_NAME = 38; 150 151 public static final int MARKUP_DECLARATION_HYPHEN = 39; 152 153 public static final int MARKUP_DECLARATION_OCTYPE = 40; 154 155 public static final int DOCTYPE_UBLIC = 41; 156 157 public static final int DOCTYPE_YSTEM = 42; 158 159 public static final int AFTER_DOCTYPE_PUBLIC_KEYWORD = 43; 160 161 public static final int BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS = 44; 162 163 public static final int AFTER_DOCTYPE_SYSTEM_KEYWORD = 45; 164 165 public static final int CONSUME_CHARACTER_REFERENCE = 46; 166 167 public static final int CONSUME_NCR = 47; 168 169 public static final int CHARACTER_REFERENCE_TAIL = 48; 170 171 public static final int HEX_NCR_LOOP = 49; 172 173 public static final int DECIMAL_NRC_LOOP = 50; 174 175 public static final int HANDLE_NCR_VALUE = 51; 176 177 public static final int HANDLE_NCR_VALUE_RECONSUME = 52; 178 179 public static final int CHARACTER_REFERENCE_HILO_LOOKUP = 53; 180 181 public static final int SELF_CLOSING_START_TAG = 54; 182 183 public static final int CDATA_START = 55; 184 185 public static final int CDATA_SECTION = 56; 186 187 public static final int CDATA_RSQB = 57; 188 189 public static final int CDATA_RSQB_RSQB = 58; 190 191 public static final int SCRIPT_DATA_LESS_THAN_SIGN = 59; 192 193 public static final int SCRIPT_DATA_ESCAPE_START = 60; 194 195 public static final int SCRIPT_DATA_ESCAPE_START_DASH = 61; 196 197 public static final int SCRIPT_DATA_ESCAPED_DASH = 62; 198 199 public static final int SCRIPT_DATA_ESCAPED_DASH_DASH = 63; 200 201 public static final int BOGUS_COMMENT_HYPHEN = 64; 202 203 public 
static final int RAWTEXT_RCDATA_LESS_THAN_SIGN = 65; 204 205 public static final int SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN = 66; 206 207 public static final int SCRIPT_DATA_DOUBLE_ESCAPE_START = 67; 208 209 public static final int SCRIPT_DATA_DOUBLE_ESCAPED = 68; 210 211 public static final int SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN = 69; 212 213 public static final int SCRIPT_DATA_DOUBLE_ESCAPED_DASH = 70; 214 215 public static final int SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH = 71; 216 217 public static final int SCRIPT_DATA_DOUBLE_ESCAPE_END = 72; 218 219 public static final int PROCESSING_INSTRUCTION = 73; 220 221 public static final int PROCESSING_INSTRUCTION_QUESTION_MARK = 74; 222 223 /** 224 * Magic value for UTF-16 operations. 225 */ 226 private static final int LEAD_OFFSET = (0xD800 - (0x10000 >> 10)); 227 228 /** 229 * UTF-16 code unit array containing less than and greater than for emitting 230 * those characters on certain parse errors. 231 */ 232 private static final @NoLength char[] LT_GT = { '<', '>' }; 233 234 /** 235 * UTF-16 code unit array containing less than and solidus for emitting 236 * those characters on certain parse errors. 237 */ 238 private static final @NoLength char[] LT_SOLIDUS = { '<', '/' }; 239 240 /** 241 * UTF-16 code unit array containing ]] for emitting those characters on 242 * state transitions. 243 */ 244 private static final @NoLength char[] RSQB_RSQB = { ']', ']' }; 245 246 /** 247 * Array version of U+FFFD. 248 */ 249 private static final @NoLength char[] REPLACEMENT_CHARACTER = { '\uFFFD' }; 250 251 // [NOCPP[ 252 253 /** 254 * Array version of space. 255 */ 256 private static final @NoLength char[] SPACE = { ' ' }; 257 258 // ]NOCPP] 259 260 /** 261 * Array version of line feed. 
262 */ 263 private static final @NoLength char[] LF = { '\n' }; 264 265 /** 266 * "CDATA[" as <code>char[]</code> 267 */ 268 private static final @NoLength char[] CDATA_LSQB = { 'C', 'D', 'A', 'T', 269 'A', '[' }; 270 271 /** 272 * "octype" as <code>char[]</code> 273 */ 274 private static final @NoLength char[] OCTYPE = { 'o', 'c', 't', 'y', 'p', 275 'e' }; 276 277 /** 278 * "ublic" as <code>char[]</code> 279 */ 280 private static final @NoLength char[] UBLIC = { 'u', 'b', 'l', 'i', 'c' }; 281 282 /** 283 * "ystem" as <code>char[]</code> 284 */ 285 private static final @NoLength char[] YSTEM = { 'y', 's', 't', 'e', 'm' }; 286 287 private static final char[] TITLE_ARR = { 't', 'i', 't', 'l', 'e' }; 288 289 private static final char[] SCRIPT_ARR = { 's', 'c', 'r', 'i', 'p', 't' }; 290 291 private static final char[] STYLE_ARR = { 's', 't', 'y', 'l', 'e' }; 292 293 private static final char[] PLAINTEXT_ARR = { 'p', 'l', 'a', 'i', 'n', 't', 294 'e', 'x', 't' }; 295 296 private static final char[] XMP_ARR = { 'x', 'm', 'p' }; 297 298 private static final char[] TEXTAREA_ARR = { 't', 'e', 'x', 't', 'a', 'r', 299 'e', 'a' }; 300 301 private static final char[] IFRAME_ARR = { 'i', 'f', 'r', 'a', 'm', 'e' }; 302 303 private static final char[] NOEMBED_ARR = { 'n', 'o', 'e', 'm', 'b', 'e', 304 'd' }; 305 306 private static final char[] NOSCRIPT_ARR = { 'n', 'o', 's', 'c', 'r', 'i', 307 'p', 't' }; 308 309 private static final char[] NOFRAMES_ARR = { 'n', 'o', 'f', 'r', 'a', 'm', 310 'e', 's' }; 311 312 /** 313 * The token handler. 314 */ 315 protected final TokenHandler tokenHandler; 316 317 protected EncodingDeclarationHandler encodingDeclarationHandler; 318 319 // [NOCPP[ 320 321 /** 322 * The error handler. 323 */ 324 protected ErrorHandler errorHandler; 325 326 // ]NOCPP] 327 328 /** 329 * Whether the previous char read was CR. 
330 */ 331 protected boolean lastCR; 332 333 protected int stateSave; 334 335 private int returnStateSave; 336 337 protected int index; 338 339 private boolean forceQuirks; 340 341 private char additional; 342 343 private int entCol; 344 345 private int firstCharKey; 346 347 private int lo; 348 349 private int hi; 350 351 private int candidate; 352 353 private int charRefBufMark; 354 355 protected int value; 356 357 private boolean seenDigits; 358 359 protected int cstart; 360 361 /** 362 * The SAX public id for the resource being tokenized. (Only passed to back 363 * as part of locator data.) 364 */ 365 private String publicId; 366 367 /** 368 * The SAX system id for the resource being tokenized. (Only passed to back 369 * as part of locator data.) 370 */ 371 private String systemId; 372 373 /** 374 * Buffer for bufferable things other than those that fit the description 375 * of <code>charRefBuf</code>. 376 */ 377 private @Auto char[] strBuf; 378 379 /** 380 * Number of significant <code>char</code>s in <code>strBuf</code>. 381 */ 382 private int strBufLen; 383 384 /** 385 * Buffer for characters that might form a character reference but may 386 * end up not forming one. 387 */ 388 private final @Auto char[] charRefBuf; 389 390 /** 391 * Number of significant <code>char</code>s in <code>charRefBuf</code>. 392 */ 393 private int charRefBufLen; 394 395 /** 396 * Buffer for expanding NCRs falling into the Basic Multilingual Plane. 397 */ 398 private final @Auto char[] bmpChar; 399 400 /** 401 * Buffer for expanding astral NCRs. 402 */ 403 private final @Auto char[] astralChar; 404 405 /** 406 * The element whose end tag closes the current CDATA or RCDATA element. 407 */ 408 protected ElementName endTagExpectation = null; 409 410 private char[] endTagExpectationAsArray; // not @Auto! 411 412 /** 413 * <code>true</code> if tokenizing an end tag 414 */ 415 protected boolean endTag; 416 417 /** 418 * The current tag token name. 
419 */ 420 private ElementName tagName = null; 421 422 /** 423 * The current attribute name. 424 */ 425 protected AttributeName attributeName = null; 426 427 // [NOCPP[ 428 429 /** 430 * Whether comment tokens are emitted. 431 */ 432 private boolean wantsComments = false; 433 434 /** 435 * <code>true</code> when HTML4-specific additional errors are requested. 436 */ 437 protected boolean html4; 438 439 /** 440 * Whether the stream is past the first 1024 bytes. 441 */ 442 private boolean metaBoundaryPassed; 443 444 // ]NOCPP] 445 446 /** 447 * The name of the current doctype token. 448 */ 449 private @Local String doctypeName; 450 451 /** 452 * The public id of the current doctype token. 453 */ 454 private String publicIdentifier; 455 456 /** 457 * The system id of the current doctype token. 458 */ 459 private String systemIdentifier; 460 461 /** 462 * The attribute holder. 463 */ 464 private HtmlAttributes attributes; 465 466 // [NOCPP[ 467 468 /** 469 * The policy for vertical tab and form feed. 470 */ 471 private XmlViolationPolicy contentSpacePolicy = XmlViolationPolicy.ALTER_INFOSET; 472 473 /** 474 * The policy for comments. 475 */ 476 private XmlViolationPolicy commentPolicy = XmlViolationPolicy.ALTER_INFOSET; 477 478 private XmlViolationPolicy xmlnsPolicy = XmlViolationPolicy.ALTER_INFOSET; 479 480 private XmlViolationPolicy namePolicy = XmlViolationPolicy.ALTER_INFOSET; 481 482 private boolean html4ModeCompatibleWithXhtml1Schemata; 483 484 private int mappingLangToXmlLang; 485 486 // ]NOCPP] 487 488 private final boolean newAttributesEachTime; 489 490 private boolean shouldSuspend; 491 492 protected boolean confident; 493 494 private int line; 495 496 /* 497 * The line number of the current attribute. First set to the line of the 498 * attribute name and if there is a value, set to the line the value 499 * started on. 
500 */ 501 // CPPONLY: private int attributeLine; 502 503 private Interner interner; 504 505 // CPPONLY: private boolean viewingXmlSource; 506 507 // [NOCPP[ 508 509 protected LocatorImpl ampersandLocation; 510 Tokenizer(TokenHandler tokenHandler, boolean newAttributesEachTime)511 public Tokenizer(TokenHandler tokenHandler, boolean newAttributesEachTime) { 512 this.tokenHandler = tokenHandler; 513 this.encodingDeclarationHandler = null; 514 this.newAttributesEachTime = newAttributesEachTime; 515 // ∳ is the longest valid char ref and 516 // the semicolon never gets appended to the buffer. 517 this.charRefBuf = new char[32]; 518 this.bmpChar = new char[1]; 519 this.astralChar = new char[2]; 520 this.tagName = null; 521 this.attributeName = null; 522 this.doctypeName = null; 523 this.publicIdentifier = null; 524 this.systemIdentifier = null; 525 this.attributes = null; 526 } 527 528 // ]NOCPP] 529 530 /** 531 * The constructor. 532 * 533 * @param tokenHandler 534 * the handler for receiving tokens 535 */ Tokenizer(TokenHandler tokenHandler )536 public Tokenizer(TokenHandler tokenHandler 537 // CPPONLY: , boolean viewingXmlSource 538 ) { 539 this.tokenHandler = tokenHandler; 540 this.encodingDeclarationHandler = null; 541 // [NOCPP[ 542 this.newAttributesEachTime = false; 543 // ]NOCPP] 544 // ∳ is the longest valid char ref and 545 // the semicolon never gets appended to the buffer. 546 this.charRefBuf = new char[32]; 547 this.bmpChar = new char[1]; 548 this.astralChar = new char[2]; 549 this.tagName = null; 550 this.attributeName = null; 551 this.doctypeName = null; 552 this.publicIdentifier = null; 553 this.systemIdentifier = null; 554 // [NOCPP[ 555 this.attributes = null; 556 // ]NOCPP] 557 // CPPONLY: this.attributes = tokenHandler.HasBuilder() ? 
new HtmlAttributes(mappingLangToXmlLang) : null; 558 // CPPONLY: this.newAttributesEachTime = !tokenHandler.HasBuilder(); 559 // CPPONLY: this.viewingXmlSource = viewingXmlSource; 560 } 561 setInterner(Interner interner)562 public void setInterner(Interner interner) { 563 this.interner = interner; 564 } 565 initLocation(String newPublicId, String newSystemId)566 public void initLocation(String newPublicId, String newSystemId) { 567 this.systemId = newSystemId; 568 this.publicId = newPublicId; 569 570 } 571 572 // CPPONLY: boolean isViewingXmlSource() { 573 // CPPONLY: return viewingXmlSource; 574 // CPPONLY: } 575 576 // [NOCPP[ 577 578 /** 579 * Returns the mappingLangToXmlLang. 580 * 581 * @return the mappingLangToXmlLang 582 */ isMappingLangToXmlLang()583 public boolean isMappingLangToXmlLang() { 584 return mappingLangToXmlLang == AttributeName.HTML_LANG; 585 } 586 587 /** 588 * Sets the mappingLangToXmlLang. 589 * 590 * @param mappingLangToXmlLang 591 * the mappingLangToXmlLang to set 592 */ setMappingLangToXmlLang(boolean mappingLangToXmlLang)593 public void setMappingLangToXmlLang(boolean mappingLangToXmlLang) { 594 this.mappingLangToXmlLang = mappingLangToXmlLang ? AttributeName.HTML_LANG 595 : AttributeName.HTML; 596 } 597 598 /** 599 * Sets the error handler. 600 * 601 * @see org.xml.sax.XMLReader#setErrorHandler(org.xml.sax.ErrorHandler) 602 */ setErrorHandler(ErrorHandler eh)603 public void setErrorHandler(ErrorHandler eh) { 604 this.errorHandler = eh; 605 } 606 getErrorHandler()607 public ErrorHandler getErrorHandler() { 608 return this.errorHandler; 609 } 610 611 /** 612 * Sets the commentPolicy. 613 * 614 * @param commentPolicy 615 * the commentPolicy to set 616 */ setCommentPolicy(XmlViolationPolicy commentPolicy)617 public void setCommentPolicy(XmlViolationPolicy commentPolicy) { 618 this.commentPolicy = commentPolicy; 619 } 620 621 /** 622 * Sets the contentNonXmlCharPolicy. 
623 * 624 * @param contentNonXmlCharPolicy 625 * the contentNonXmlCharPolicy to set 626 */ setContentNonXmlCharPolicy( XmlViolationPolicy contentNonXmlCharPolicy)627 public void setContentNonXmlCharPolicy( 628 XmlViolationPolicy contentNonXmlCharPolicy) { 629 if (contentNonXmlCharPolicy != XmlViolationPolicy.ALLOW) { 630 throw new IllegalArgumentException( 631 "Must use ErrorReportingTokenizer to set contentNonXmlCharPolicy to non-ALLOW."); 632 } 633 } 634 635 /** 636 * Sets the contentSpacePolicy. 637 * 638 * @param contentSpacePolicy 639 * the contentSpacePolicy to set 640 */ setContentSpacePolicy(XmlViolationPolicy contentSpacePolicy)641 public void setContentSpacePolicy(XmlViolationPolicy contentSpacePolicy) { 642 this.contentSpacePolicy = contentSpacePolicy; 643 } 644 645 /** 646 * Sets the xmlnsPolicy. 647 * 648 * @param xmlnsPolicy 649 * the xmlnsPolicy to set 650 */ setXmlnsPolicy(XmlViolationPolicy xmlnsPolicy)651 public void setXmlnsPolicy(XmlViolationPolicy xmlnsPolicy) { 652 if (xmlnsPolicy == XmlViolationPolicy.FATAL) { 653 throw new IllegalArgumentException("Can't use FATAL here."); 654 } 655 this.xmlnsPolicy = xmlnsPolicy; 656 } 657 setNamePolicy(XmlViolationPolicy namePolicy)658 public void setNamePolicy(XmlViolationPolicy namePolicy) { 659 this.namePolicy = namePolicy; 660 } 661 662 /** 663 * Sets the html4ModeCompatibleWithXhtml1Schemata. 664 * 665 * @param html4ModeCompatibleWithXhtml1Schemata 666 * the html4ModeCompatibleWithXhtml1Schemata to set 667 */ setHtml4ModeCompatibleWithXhtml1Schemata( boolean html4ModeCompatibleWithXhtml1Schemata)668 public void setHtml4ModeCompatibleWithXhtml1Schemata( 669 boolean html4ModeCompatibleWithXhtml1Schemata) { 670 this.html4ModeCompatibleWithXhtml1Schemata = html4ModeCompatibleWithXhtml1Schemata; 671 } 672 673 // ]NOCPP] 674 675 // For the token handler to call 676 /** 677 * Sets the tokenizer state and the associated element name. 
This should 678 * only ever used to put the tokenizer into one of the states that have 679 * a special end tag expectation. 680 * 681 * @param specialTokenizerState 682 * the tokenizer state to set 683 * @param endTagExpectation 684 * the expected end tag for transitioning back to normal 685 */ setStateAndEndTagExpectation(int specialTokenizerState, @Local String endTagExpectation)686 public void setStateAndEndTagExpectation(int specialTokenizerState, 687 @Local String endTagExpectation) { 688 this.stateSave = specialTokenizerState; 689 if (specialTokenizerState == Tokenizer.DATA) { 690 return; 691 } 692 @Auto char[] asArray = Portability.newCharArrayFromLocal(endTagExpectation); 693 this.endTagExpectation = ElementName.elementNameByBuffer(asArray, 0, 694 asArray.length, interner); 695 endTagExpectationToArray(); 696 } 697 698 /** 699 * Sets the tokenizer state and the associated element name. This should 700 * only ever used to put the tokenizer into one of the states that have 701 * a special end tag expectation. 
702 * 703 * @param specialTokenizerState 704 * the tokenizer state to set 705 * @param endTagExpectation 706 * the expected end tag for transitioning back to normal 707 */ setStateAndEndTagExpectation(int specialTokenizerState, ElementName endTagExpectation)708 public void setStateAndEndTagExpectation(int specialTokenizerState, 709 ElementName endTagExpectation) { 710 this.stateSave = specialTokenizerState; 711 this.endTagExpectation = endTagExpectation; 712 endTagExpectationToArray(); 713 } 714 endTagExpectationToArray()715 private void endTagExpectationToArray() { 716 switch (endTagExpectation.getGroup()) { 717 case TreeBuilder.TITLE: 718 endTagExpectationAsArray = TITLE_ARR; 719 return; 720 case TreeBuilder.SCRIPT: 721 endTagExpectationAsArray = SCRIPT_ARR; 722 return; 723 case TreeBuilder.STYLE: 724 endTagExpectationAsArray = STYLE_ARR; 725 return; 726 case TreeBuilder.PLAINTEXT: 727 endTagExpectationAsArray = PLAINTEXT_ARR; 728 return; 729 case TreeBuilder.XMP: 730 endTagExpectationAsArray = XMP_ARR; 731 return; 732 case TreeBuilder.TEXTAREA: 733 endTagExpectationAsArray = TEXTAREA_ARR; 734 return; 735 case TreeBuilder.IFRAME: 736 endTagExpectationAsArray = IFRAME_ARR; 737 return; 738 case TreeBuilder.NOEMBED: 739 endTagExpectationAsArray = NOEMBED_ARR; 740 return; 741 case TreeBuilder.NOSCRIPT: 742 endTagExpectationAsArray = NOSCRIPT_ARR; 743 return; 744 case TreeBuilder.NOFRAMES: 745 endTagExpectationAsArray = NOFRAMES_ARR; 746 return; 747 default: 748 assert false: "Bad end tag expectation."; 749 return; 750 } 751 } 752 753 /** 754 * For C++ use only. 755 */ setLineNumber(int line)756 public void setLineNumber(int line) { 757 // CPPONLY: this.attributeLine = line; // XXX is this needed? 
758 this.line = line; 759 } 760 761 // start Locator impl 762 763 /** 764 * @see org.xml.sax.Locator#getLineNumber() 765 */ getLineNumber()766 @Inline public int getLineNumber() { 767 return line; 768 } 769 770 // [NOCPP[ 771 772 /** 773 * @see org.xml.sax.Locator#getColumnNumber() 774 */ getColumnNumber()775 @Inline public int getColumnNumber() { 776 return -1; 777 } 778 779 /** 780 * @see org.xml.sax.Locator#getPublicId() 781 */ getPublicId()782 public String getPublicId() { 783 return publicId; 784 } 785 786 /** 787 * @see org.xml.sax.Locator#getSystemId() 788 */ getSystemId()789 public String getSystemId() { 790 return systemId; 791 } 792 793 // end Locator impl 794 795 // end public API 796 notifyAboutMetaBoundary()797 public void notifyAboutMetaBoundary() { 798 metaBoundaryPassed = true; 799 } 800 turnOnAdditionalHtml4Errors()801 void turnOnAdditionalHtml4Errors() { 802 html4 = true; 803 } 804 805 // ]NOCPP] 806 emptyAttributes()807 HtmlAttributes emptyAttributes() { 808 // [NOCPP[ 809 if (newAttributesEachTime) { 810 return new HtmlAttributes(mappingLangToXmlLang); 811 } else { 812 // ]NOCPP] 813 return HtmlAttributes.EMPTY_ATTRIBUTES; 814 // [NOCPP[ 815 } 816 // ]NOCPP] 817 } 818 appendCharRefBuf(char c)819 @Inline private void appendCharRefBuf(char c) { 820 // CPPONLY: assert charRefBufLen < charRefBuf.length: 821 // CPPONLY: "RELEASE: Attempted to overrun charRefBuf!"; 822 charRefBuf[charRefBufLen++] = c; 823 } 824 emitOrAppendCharRefBuf(int returnState)825 private void emitOrAppendCharRefBuf(int returnState) throws SAXException { 826 if ((returnState & DATA_AND_RCDATA_MASK) != 0) { 827 appendCharRefBufToStrBuf(); 828 } else { 829 if (charRefBufLen > 0) { 830 tokenHandler.characters(charRefBuf, 0, charRefBufLen); 831 charRefBufLen = 0; 832 } 833 } 834 } 835 clearStrBufAfterUse()836 @Inline private void clearStrBufAfterUse() { 837 strBufLen = 0; 838 } 839 clearStrBufBeforeUse()840 @Inline private void clearStrBufBeforeUse() { 841 assert strBufLen == 0: 
"strBufLen not reset after previous use!"; 842 strBufLen = 0; // no-op in the absence of bugs 843 } 844 clearStrBufAfterOneHyphen()845 @Inline private void clearStrBufAfterOneHyphen() { 846 assert strBufLen == 1: "strBufLen length not one!"; 847 assert strBuf[0] == '-': "strBuf does not start with a hyphen!"; 848 strBufLen = 0; 849 } 850 851 /** 852 * Appends to the buffer. 853 * 854 * @param c 855 * the UTF-16 code unit to append 856 */ appendStrBuf(char c)857 @Inline private void appendStrBuf(char c) { 858 // CPPONLY: assert strBufLen < strBuf.length: "Previous buffer length insufficient."; 859 // CPPONLY: if (strBufLen == strBuf.length) { 860 // CPPONLY: if (!EnsureBufferSpace(1)) { 861 // CPPONLY: assert false: "RELEASE: Unable to recover from buffer reallocation failure"; 862 // CPPONLY: } // TODO: Add telemetry when outer if fires but inner does not 863 // CPPONLY: } 864 strBuf[strBufLen++] = c; 865 } 866 867 /** 868 * The buffer as a String. Currently only used for error reporting. 869 * 870 * <p> 871 * C++ memory note: The return value must be released. 872 * 873 * @return the buffer as a string 874 */ strBufToString()875 protected String strBufToString() { 876 String str = Portability.newStringFromBuffer(strBuf, 0, strBufLen 877 // CPPONLY: , tokenHandler 878 ); 879 clearStrBufAfterUse(); 880 return str; 881 } 882 883 /** 884 * Returns the buffer as a local name. The return value is released in 885 * emitDoctypeToken(). 886 * 887 * @return the buffer as local name 888 */ strBufToDoctypeName()889 private void strBufToDoctypeName() { 890 doctypeName = Portability.newLocalNameFromBuffer(strBuf, 0, strBufLen, 891 interner); 892 clearStrBufAfterUse(); 893 } 894 895 /** 896 * Emits the buffer as character tokens. 
897 * 898 * @throws SAXException 899 * if the token handler threw 900 */ emitStrBuf()901 private void emitStrBuf() throws SAXException { 902 if (strBufLen > 0) { 903 tokenHandler.characters(strBuf, 0, strBufLen); 904 clearStrBufAfterUse(); 905 } 906 } 907 appendSecondHyphenToBogusComment()908 @Inline private void appendSecondHyphenToBogusComment() throws SAXException { 909 // [NOCPP[ 910 switch (commentPolicy) { 911 case ALTER_INFOSET: 912 appendStrBuf(' '); 913 // FALLTHROUGH 914 case ALLOW: 915 warn("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment."); 916 // ]NOCPP] 917 appendStrBuf('-'); 918 // [NOCPP[ 919 break; 920 case FATAL: 921 fatal("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment."); 922 break; 923 } 924 // ]NOCPP] 925 } 926 927 // [NOCPP[ maybeAppendSpaceToBogusComment()928 private void maybeAppendSpaceToBogusComment() throws SAXException { 929 switch (commentPolicy) { 930 case ALTER_INFOSET: 931 appendStrBuf(' '); 932 // FALLTHROUGH 933 case ALLOW: 934 warn("The document is not mappable to XML 1.0 due to a trailing hyphen in a comment."); 935 break; 936 case FATAL: 937 fatal("The document is not mappable to XML 1.0 due to a trailing hyphen in a comment."); 938 break; 939 } 940 } 941 942 // ]NOCPP] 943 adjustDoubleHyphenAndAppendToStrBufAndErr(char c)944 @Inline private void adjustDoubleHyphenAndAppendToStrBufAndErr(char c) 945 throws SAXException { 946 errConsecutiveHyphens(); 947 // [NOCPP[ 948 switch (commentPolicy) { 949 case ALTER_INFOSET: 950 strBufLen--; 951 // WARNING!!! This expands the worst case of the buffer length 952 // given the length of input! 
953 appendStrBuf(' '); 954 appendStrBuf('-'); 955 // FALLTHROUGH 956 case ALLOW: 957 warn("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment."); 958 // ]NOCPP] 959 appendStrBuf(c); 960 // [NOCPP[ 961 break; 962 case FATAL: 963 fatal("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment."); 964 break; 965 } 966 // ]NOCPP] 967 } 968 appendStrBuf(@oLength char[] buffer, int offset, int length)969 private void appendStrBuf(@NoLength char[] buffer, int offset, int length) { 970 int newLen = strBufLen + length; 971 // CPPONLY: assert newLen <= strBuf.length: "Previous buffer length insufficient."; 972 // CPPONLY: if (strBuf.length < newLen) { 973 // CPPONLY: if (!EnsureBufferSpace(length)) { 974 // CPPONLY: assert false: "RELEASE: Unable to recover from buffer reallocation failure"; 975 // CPPONLY: } // TODO: Add telemetry when outer if fires but inner does not 976 // CPPONLY: } 977 System.arraycopy(buffer, offset, strBuf, strBufLen, length); 978 strBufLen = newLen; 979 } 980 981 /** 982 * Append the contents of the char reference buffer to the main one. 983 */ appendCharRefBufToStrBuf()984 @Inline private void appendCharRefBufToStrBuf() { 985 appendStrBuf(charRefBuf, 0, charRefBufLen); 986 charRefBufLen = 0; 987 } 988 989 /** 990 * Emits the current comment token. 991 * 992 * @param pos 993 * TODO 994 * 995 * @throws SAXException 996 */ emitComment(int provisionalHyphens, int pos)997 private void emitComment(int provisionalHyphens, int pos) 998 throws SAXException { 999 // [NOCPP[ 1000 if (wantsComments) { 1001 // ]NOCPP] 1002 tokenHandler.comment(strBuf, 0, strBufLen 1003 - provisionalHyphens); 1004 // [NOCPP[ 1005 } 1006 // ]NOCPP] 1007 clearStrBufAfterUse(); 1008 cstart = pos + 1; 1009 } 1010 1011 /** 1012 * Flushes coalesced character tokens. 
1013 * 1014 * @param buf 1015 * TODO 1016 * @param pos 1017 * TODO 1018 * 1019 * @throws SAXException 1020 */ flushChars(@oLength char[] buf, int pos)1021 protected void flushChars(@NoLength char[] buf, int pos) 1022 throws SAXException { 1023 if (pos > cstart) { 1024 tokenHandler.characters(buf, cstart, pos - cstart); 1025 } 1026 cstart = Integer.MAX_VALUE; 1027 } 1028 1029 /** 1030 * Reports an condition that would make the infoset incompatible with XML 1031 * 1.0 as fatal. 1032 * 1033 * @param message 1034 * the message 1035 * @throws SAXException 1036 * @throws SAXParseException 1037 */ fatal(String message)1038 public void fatal(String message) throws SAXException { 1039 SAXParseException spe = new SAXParseException(message, this); 1040 if (errorHandler != null) { 1041 errorHandler.fatalError(spe); 1042 } 1043 throw spe; 1044 } 1045 1046 /** 1047 * Reports a Parse Error. 1048 * 1049 * @param message 1050 * the message 1051 * @throws SAXException 1052 */ err(String message)1053 public void err(String message) throws SAXException { 1054 if (errorHandler == null) { 1055 return; 1056 } 1057 SAXParseException spe = new SAXParseException(message, this); 1058 errorHandler.error(spe); 1059 } 1060 errTreeBuilder(String message)1061 public void errTreeBuilder(String message) throws SAXException { 1062 ErrorHandler eh = null; 1063 if (tokenHandler instanceof TreeBuilder<?>) { 1064 TreeBuilder<?> treeBuilder = (TreeBuilder<?>) tokenHandler; 1065 eh = treeBuilder.getErrorHandler(); 1066 } 1067 if (eh == null) { 1068 eh = errorHandler; 1069 } 1070 if (eh == null) { 1071 return; 1072 } 1073 SAXParseException spe = new SAXParseException(message, this); 1074 eh.error(spe); 1075 } 1076 1077 /** 1078 * Reports a warning 1079 * 1080 * @param message 1081 * the message 1082 * @throws SAXException 1083 */ warn(String message)1084 public void warn(String message) throws SAXException { 1085 if (errorHandler == null) { 1086 return; 1087 } 1088 SAXParseException spe = new 
SAXParseException(message, this); 1089 errorHandler.warning(spe); 1090 } 1091 strBufToElementNameString()1092 private void strBufToElementNameString() { 1093 tagName = ElementName.elementNameByBuffer(strBuf, 0, strBufLen, 1094 interner); 1095 clearStrBufAfterUse(); 1096 } 1097 emitCurrentTagToken(boolean selfClosing, int pos)1098 private int emitCurrentTagToken(boolean selfClosing, int pos) 1099 throws SAXException { 1100 cstart = pos + 1; 1101 maybeErrSlashInEndTag(selfClosing); 1102 stateSave = Tokenizer.DATA; 1103 HtmlAttributes attrs = (attributes == null ? HtmlAttributes.EMPTY_ATTRIBUTES 1104 : attributes); 1105 if (endTag) { 1106 /* 1107 * When an end tag token is emitted, the content model flag must be 1108 * switched to the PCDATA state. 1109 */ 1110 maybeErrAttributesOnEndTag(attrs); 1111 // CPPONLY: if (!viewingXmlSource) { 1112 tokenHandler.endTag(tagName); 1113 // CPPONLY: } 1114 // CPPONLY: if (newAttributesEachTime) { 1115 // CPPONLY: Portability.delete(attributes); 1116 // CPPONLY: attributes = null; 1117 // CPPONLY: } 1118 } else { 1119 // CPPONLY: if (viewingXmlSource) { 1120 // CPPONLY: assert newAttributesEachTime; 1121 // CPPONLY: Portability.delete(attributes); 1122 // CPPONLY: attributes = null; 1123 // CPPONLY: } else { 1124 tokenHandler.startTag(tagName, attrs, selfClosing); 1125 // CPPONLY: } 1126 } 1127 tagName.release(); 1128 tagName = null; 1129 if (newAttributesEachTime) { 1130 attributes = null; 1131 } else { 1132 attributes.clear(mappingLangToXmlLang); 1133 } 1134 /* 1135 * The token handler may have called setStateAndEndTagExpectation 1136 * and changed stateSave since the start of this method. 
1137 */ 1138 return stateSave; 1139 } 1140 attributeNameComplete()1141 private void attributeNameComplete() throws SAXException { 1142 attributeName = AttributeName.nameByBuffer(strBuf, 0, strBufLen 1143 // [NOCPP[ 1144 , namePolicy != XmlViolationPolicy.ALLOW 1145 // ]NOCPP] 1146 , interner); 1147 clearStrBufAfterUse(); 1148 1149 if (attributes == null) { 1150 attributes = new HtmlAttributes(mappingLangToXmlLang); 1151 } 1152 1153 /* 1154 * When the user agent leaves the attribute name state (and before 1155 * emitting the tag token, if appropriate), the complete attribute's 1156 * name must be compared to the other attributes on the same token; if 1157 * there is already an attribute on the token with the exact same name, 1158 * then this is a parse error and the new attribute must be dropped, 1159 * along with the value that gets associated with it (if any). 1160 */ 1161 if (attributes.contains(attributeName)) { 1162 errDuplicateAttribute(); 1163 attributeName.release(); 1164 attributeName = null; 1165 } 1166 } 1167 addAttributeWithoutValue()1168 private void addAttributeWithoutValue() throws SAXException { 1169 noteAttributeWithoutValue(); 1170 1171 // [NOCPP[ 1172 if (metaBoundaryPassed && AttributeName.CHARSET == attributeName 1173 && ElementName.META == tagName) { 1174 err("A \u201Ccharset\u201D attribute on a \u201Cmeta\u201D element found after the first 512 bytes."); 1175 } 1176 // ]NOCPP] 1177 if (attributeName != null) { 1178 // [NOCPP[ 1179 if (html4) { 1180 if (attributeName.isBoolean()) { 1181 if (html4ModeCompatibleWithXhtml1Schemata) { 1182 attributes.addAttribute(attributeName, 1183 attributeName.getLocal(AttributeName.HTML), 1184 xmlnsPolicy); 1185 } else { 1186 attributes.addAttribute(attributeName, "", xmlnsPolicy); 1187 } 1188 } else { 1189 if (AttributeName.BORDER != attributeName) { 1190 err("Attribute value omitted for a non-boolean attribute. 
(HTML4-only error.)"); 1191 attributes.addAttribute(attributeName, "", xmlnsPolicy); 1192 } 1193 } 1194 } else { 1195 if (AttributeName.SRC == attributeName 1196 || AttributeName.HREF == attributeName) { 1197 warn("Attribute \u201C" 1198 + attributeName.getLocal(AttributeName.HTML) 1199 + "\u201D without an explicit value seen. The attribute may be dropped by IE7."); 1200 } 1201 // ]NOCPP] 1202 attributes.addAttribute(attributeName, 1203 Portability.newEmptyString() 1204 // [NOCPP[ 1205 , xmlnsPolicy 1206 // ]NOCPP] 1207 // CPPONLY: , attributeLine 1208 ); 1209 // [NOCPP[ 1210 } 1211 // ]NOCPP] 1212 attributeName = null; // attributeName has been adopted by the 1213 // |attributes| object 1214 } else { 1215 clearStrBufAfterUse(); 1216 } 1217 } 1218 addAttributeWithValue()1219 private void addAttributeWithValue() throws SAXException { 1220 // [NOCPP[ 1221 if (metaBoundaryPassed && ElementName.META == tagName 1222 && AttributeName.CHARSET == attributeName) { 1223 err("A \u201Ccharset\u201D attribute on a \u201Cmeta\u201D element found after the first 512 bytes."); 1224 } 1225 // ]NOCPP] 1226 if (attributeName != null) { 1227 String val = strBufToString(); // Ownership transferred to 1228 // HtmlAttributes 1229 // CPPONLY: if (mViewSource) { 1230 // CPPONLY: mViewSource.MaybeLinkifyAttributeValue(attributeName, val); 1231 // CPPONLY: } 1232 // [NOCPP[ 1233 if (!endTag && html4 && html4ModeCompatibleWithXhtml1Schemata 1234 && attributeName.isCaseFolded()) { 1235 val = newAsciiLowerCaseStringFromString(val); 1236 } 1237 // ]NOCPP] 1238 attributes.addAttribute(attributeName, val 1239 // [NOCPP[ 1240 , xmlnsPolicy 1241 // ]NOCPP] 1242 // CPPONLY: , attributeLine 1243 ); 1244 attributeName = null; // attributeName has been adopted by the 1245 // |attributes| object 1246 } else { 1247 // We have a duplicate attribute. Explicitly discard its value. 
1248 clearStrBufAfterUse(); 1249 } 1250 } 1251 1252 // [NOCPP[ 1253 newAsciiLowerCaseStringFromString(String str)1254 private static String newAsciiLowerCaseStringFromString(String str) { 1255 if (str == null) { 1256 return null; 1257 } 1258 char[] buf = new char[str.length()]; 1259 for (int i = 0; i < str.length(); i++) { 1260 char c = str.charAt(i); 1261 if (c >= 'A' && c <= 'Z') { 1262 c += 0x20; 1263 } 1264 buf[i] = c; 1265 } 1266 return new String(buf); 1267 } 1268 startErrorReporting()1269 protected void startErrorReporting() throws SAXException { 1270 1271 } 1272 1273 // ]NOCPP] 1274 start()1275 public void start() throws SAXException { 1276 initializeWithoutStarting(); 1277 tokenHandler.startTokenization(this); 1278 // [NOCPP[ 1279 startErrorReporting(); 1280 // ]NOCPP] 1281 } 1282 tokenizeBuffer(UTF16Buffer buffer)1283 public boolean tokenizeBuffer(UTF16Buffer buffer) throws SAXException { 1284 int state = stateSave; 1285 int returnState = returnStateSave; 1286 char c = '\u0000'; 1287 shouldSuspend = false; 1288 lastCR = false; 1289 1290 int start = buffer.getStart(); 1291 int end = buffer.getEnd(); 1292 1293 // In C++, the caller of tokenizeBuffer needs to do this explicitly. 1294 // [NOCPP[ 1295 ensureBufferSpace(end - start); 1296 // ]NOCPP] 1297 1298 /** 1299 * The index of the last <code>char</code> read from <code>buf</code>. 1300 */ 1301 int pos = start - 1; 1302 1303 /** 1304 * The index of the first <code>char</code> in <code>buf</code> that is 1305 * part of a coalesced run of character tokens or 1306 * <code>Integer.MAX_VALUE</code> if there is not a current run being 1307 * coalesced. 
1308 */ 1309 switch (state) { 1310 case DATA: 1311 case RCDATA: 1312 case SCRIPT_DATA: 1313 case PLAINTEXT: 1314 case RAWTEXT: 1315 case CDATA_SECTION: 1316 case SCRIPT_DATA_ESCAPED: 1317 case SCRIPT_DATA_ESCAPE_START: 1318 case SCRIPT_DATA_ESCAPE_START_DASH: 1319 case SCRIPT_DATA_ESCAPED_DASH: 1320 case SCRIPT_DATA_ESCAPED_DASH_DASH: 1321 case SCRIPT_DATA_DOUBLE_ESCAPE_START: 1322 case SCRIPT_DATA_DOUBLE_ESCAPED: 1323 case SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN: 1324 case SCRIPT_DATA_DOUBLE_ESCAPED_DASH: 1325 case SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH: 1326 case SCRIPT_DATA_DOUBLE_ESCAPE_END: 1327 cstart = start; 1328 break; 1329 default: 1330 cstart = Integer.MAX_VALUE; 1331 break; 1332 } 1333 1334 /** 1335 * The number of <code>char</code>s in <code>buf</code> that have 1336 * meaning. (The rest of the array is garbage and should not be 1337 * examined.) 1338 */ 1339 // CPPONLY: if (mViewSource) { 1340 // CPPONLY: mViewSource.SetBuffer(buffer); 1341 // CPPONLY: pos = stateLoop(state, c, pos, buffer.getBuffer(), false, returnState, buffer.getEnd()); 1342 // CPPONLY: mViewSource.DropBuffer((pos == buffer.getEnd()) ? pos : pos + 1); 1343 // CPPONLY: } else { 1344 // CPPONLY: pos = stateLoop(state, c, pos, buffer.getBuffer(), false, returnState, buffer.getEnd()); 1345 // CPPONLY: } 1346 // [NOCPP[ 1347 pos = stateLoop(state, c, pos, buffer.getBuffer(), false, returnState, 1348 end); 1349 // ]NOCPP] 1350 if (pos == end) { 1351 // exiting due to end of buffer 1352 buffer.setStart(pos); 1353 } else { 1354 buffer.setStart(pos + 1); 1355 } 1356 return lastCR; 1357 } 1358 1359 // [NOCPP[ ensureBufferSpace(int inputLength)1360 private void ensureBufferSpace(int inputLength) throws SAXException { 1361 // Add 2 to account for emissions of LT_GT, LT_SOLIDUS and RSQB_RSQB. 1362 // Adding to the general worst case instead of only the 1363 // TreeBuilder-exposed worst case to avoid re-introducing a bug when 1364 // unifying the tokenizer and tree builder buffers in the future. 
1365 int worstCase = strBufLen + inputLength + charRefBufLen + 2; 1366 tokenHandler.ensureBufferSpace(worstCase); 1367 if (commentPolicy == XmlViolationPolicy.ALTER_INFOSET) { 1368 // When altering infoset, if the comment contents are consecutive 1369 // hyphens, each hyphen generates a space, too. These buffer 1370 // contents never get emitted as characters() to the tokenHandler, 1371 // which is why this calculation happens after the call to 1372 // ensureBufferSpace on tokenHandler. 1373 worstCase *= 2; 1374 } 1375 if (strBuf == null) { 1376 // Add an arbitrary small value to avoid immediate reallocation 1377 // once there are a few characters in the buffer. 1378 strBuf = new char[worstCase + 128]; 1379 } else if (worstCase > strBuf.length) { 1380 // HotSpot reportedly allocates memory with 8-byte accuracy, so 1381 // there's no point in trying to do math here to avoid slop. 1382 // Maybe we should add some small constant to worstCase here 1383 // but not doing that without profiling. In C++ with jemalloc, 1384 // the corresponding method should do math to round up here 1385 // to avoid slop. 
1386 char[] newBuf = new char[worstCase]; 1387 System.arraycopy(strBuf, 0, newBuf, 0, strBufLen); 1388 strBuf = newBuf; 1389 } 1390 } 1391 // ]NOCPP] 1392 stateLoop(int state, char c, int pos, @NoLength char[] buf, boolean reconsume, int returnState, int endPos)1393 @SuppressWarnings("unused") private int stateLoop(int state, char c, 1394 int pos, @NoLength char[] buf, boolean reconsume, int returnState, 1395 int endPos) throws SAXException { 1396 /* 1397 * Idioms used in this code: 1398 * 1399 * 1400 * Consuming the next input character 1401 * 1402 * To consume the next input character, the code does this: if (++pos == 1403 * endPos) { break stateloop; } c = checkChar(buf, pos); 1404 * 1405 * 1406 * Staying in a state 1407 * 1408 * When there's a state that the tokenizer may stay in over multiple 1409 * input characters, the state has a wrapper |for(;;)| loop and staying 1410 * in the state continues the loop. 1411 * 1412 * 1413 * Switching to another state 1414 * 1415 * To switch to another state, the code sets the state variable to the 1416 * magic number of the new state. Then it either continues stateloop or 1417 * breaks out of the state's own wrapper loop if the target state is 1418 * right after the current state in source order. (This is a partial 1419 * workaround for Java's lack of goto.) 1420 * 1421 * 1422 * Reconsume support 1423 * 1424 * The spec sometimes says that an input character is reconsumed in 1425 * another state. If a state can ever be entered so that an input 1426 * character can be reconsumed in it, the state's code starts with an 1427 * |if (reconsume)| that sets reconsume to false and skips over the 1428 * normal code for consuming a new character. 1429 * 1430 * To reconsume the current character in another state, the code sets 1431 * |reconsume| to true and then switches to the other state. 1432 * 1433 * 1434 * Emitting character tokens 1435 * 1436 * This method emits character tokens lazily. 
Whenever a new range of 1437 * character tokens starts, the field cstart must be set to the start 1438 * index of the range. The flushChars() method must be called at the end 1439 * of a range to flush it. 1440 * 1441 * 1442 * U+0000 handling 1443 * 1444 * The various states have to handle the replacement of U+0000 with 1445 * U+FFFD. However, if U+0000 would be reconsumed in another state, the 1446 * replacement doesn't need to happen, because it's handled by the 1447 * reconsuming state. 1448 * 1449 * 1450 * LF handling 1451 * 1452 * Every state needs to increment the line number upon LF unless the LF 1453 * gets reconsumed by another state which increments the line number. 1454 * 1455 * 1456 * CR handling 1457 * 1458 * Every state needs to handle CR unless the CR gets reconsumed and is 1459 * handled by the reconsuming state. The CR needs to be handled as if it 1460 * were and LF, the lastCR field must be set to true and then this 1461 * method must return. The IO driver will then swallow the next 1462 * character if it is an LF to coalesce CRLF. 1463 */ 1464 stateloop: for (;;) { 1465 switch (state) { 1466 case DATA: 1467 dataloop: for (;;) { 1468 if (reconsume) { 1469 reconsume = false; 1470 } else { 1471 if (++pos == endPos) { 1472 break stateloop; 1473 } 1474 c = checkChar(buf, pos); 1475 } 1476 switch (c) { 1477 case '&': 1478 /* 1479 * U+0026 AMPERSAND (&) Switch to the character 1480 * reference in data state. 1481 */ 1482 flushChars(buf, pos); 1483 assert charRefBufLen == 0: "charRefBufLen not reset after previous use!"; 1484 appendCharRefBuf(c); 1485 setAdditionalAndRememberAmpersandLocation('\u0000'); 1486 returnState = state; 1487 state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos); 1488 continue stateloop; 1489 case '<': 1490 /* 1491 * U+003C LESS-THAN SIGN (<) Switch to the tag 1492 * open state. 
1493 */ 1494 flushChars(buf, pos); 1495 1496 state = transition(state, Tokenizer.TAG_OPEN, reconsume, pos); 1497 break dataloop; // FALL THROUGH continue 1498 // stateloop; 1499 case '\u0000': 1500 emitReplacementCharacter(buf, pos); 1501 continue; 1502 case '\r': 1503 emitCarriageReturn(buf, pos); 1504 break stateloop; 1505 case '\n': 1506 silentLineFeed(); 1507 default: 1508 /* 1509 * Anything else Emit the input character as a 1510 * character token. 1511 * 1512 * Stay in the data state. 1513 */ 1514 continue; 1515 } 1516 } 1517 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER 1518 case TAG_OPEN: 1519 tagopenloop: for (;;) { 1520 /* 1521 * The behavior of this state depends on the content 1522 * model flag. 1523 */ 1524 if (++pos == endPos) { 1525 break stateloop; 1526 } 1527 c = checkChar(buf, pos); 1528 /* 1529 * If the content model flag is set to the PCDATA state 1530 * Consume the next input character: 1531 */ 1532 if (c >= 'A' && c <= 'Z') { 1533 /* 1534 * U+0041 LATIN CAPITAL LETTER A through to U+005A 1535 * LATIN CAPITAL LETTER Z Create a new start tag 1536 * token, 1537 */ 1538 endTag = false; 1539 /* 1540 * set its tag name to the lowercase version of the 1541 * input character (add 0x0020 to the character's 1542 * code point), 1543 */ 1544 clearStrBufBeforeUse(); 1545 appendStrBuf((char) (c + 0x20)); 1546 /* then switch to the tag name state. */ 1547 state = transition(state, Tokenizer.TAG_NAME, reconsume, pos); 1548 /* 1549 * (Don't emit the token yet; further details will 1550 * be filled in before it is emitted.) 1551 */ 1552 break tagopenloop; 1553 // continue stateloop; 1554 } else if (c >= 'a' && c <= 'z') { 1555 /* 1556 * U+0061 LATIN SMALL LETTER A through to U+007A 1557 * LATIN SMALL LETTER Z Create a new start tag 1558 * token, 1559 */ 1560 endTag = false; 1561 /* 1562 * set its tag name to the input character, 1563 */ 1564 clearStrBufBeforeUse(); 1565 appendStrBuf(c); 1566 /* then switch to the tag name state. 
*/ 1567 state = transition(state, Tokenizer.TAG_NAME, reconsume, pos); 1568 /* 1569 * (Don't emit the token yet; further details will 1570 * be filled in before it is emitted.) 1571 */ 1572 break tagopenloop; 1573 // continue stateloop; 1574 } 1575 switch (c) { 1576 case '!': 1577 /* 1578 * U+0021 EXCLAMATION MARK (!) Switch to the 1579 * markup declaration open state. 1580 */ 1581 state = transition(state, Tokenizer.MARKUP_DECLARATION_OPEN, reconsume, pos); 1582 continue stateloop; 1583 case '/': 1584 /* 1585 * U+002F SOLIDUS (/) Switch to the close tag 1586 * open state. 1587 */ 1588 state = transition(state, Tokenizer.CLOSE_TAG_OPEN, reconsume, pos); 1589 continue stateloop; 1590 case '?': 1591 // CPPONLY: if (viewingXmlSource) { 1592 // CPPONLY: state = transition(state, 1593 // CPPONLY: Tokenizer.PROCESSING_INSTRUCTION, 1594 // CPPONLY: reconsume, 1595 // CPPONLY: pos); 1596 // CPPONLY: continue stateloop; 1597 // CPPONLY: } 1598 /* 1599 * U+003F QUESTION MARK (?) Parse error. 1600 */ 1601 errProcessingInstruction(); 1602 /* 1603 * Switch to the bogus comment state. 1604 */ 1605 clearStrBufBeforeUse(); 1606 appendStrBuf(c); 1607 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); 1608 continue stateloop; 1609 case '>': 1610 /* 1611 * U+003E GREATER-THAN SIGN (>) Parse error. 1612 */ 1613 errLtGt(); 1614 /* 1615 * Emit a U+003C LESS-THAN SIGN character token 1616 * and a U+003E GREATER-THAN SIGN character 1617 * token. 1618 */ 1619 tokenHandler.characters(Tokenizer.LT_GT, 0, 2); 1620 /* Switch to the data state. */ 1621 cstart = pos + 1; 1622 state = transition(state, Tokenizer.DATA, reconsume, pos); 1623 continue stateloop; 1624 default: 1625 /* 1626 * Anything else Parse error. 1627 */ 1628 errBadCharAfterLt(c); 1629 /* 1630 * Emit a U+003C LESS-THAN SIGN character token 1631 */ 1632 tokenHandler.characters(Tokenizer.LT_GT, 0, 1); 1633 /* 1634 * and reconsume the current input character in 1635 * the data state. 
1636 */ 1637 cstart = pos; 1638 reconsume = true; 1639 state = transition(state, Tokenizer.DATA, reconsume, pos); 1640 continue stateloop; 1641 } 1642 } 1643 // FALL THROUGH DON'T REORDER 1644 case TAG_NAME: 1645 tagnameloop: for (;;) { 1646 if (++pos == endPos) { 1647 break stateloop; 1648 } 1649 c = checkChar(buf, pos); 1650 /* 1651 * Consume the next input character: 1652 */ 1653 switch (c) { 1654 case '\r': 1655 silentCarriageReturn(); 1656 strBufToElementNameString(); 1657 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); 1658 break stateloop; 1659 case '\n': 1660 silentLineFeed(); 1661 case ' ': 1662 case '\t': 1663 case '\u000C': 1664 /* 1665 * U+0009 CHARACTER TABULATION U+000A LINE FEED 1666 * (LF) U+000C FORM FEED (FF) U+0020 SPACE 1667 * Switch to the before attribute name state. 1668 */ 1669 strBufToElementNameString(); 1670 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); 1671 break tagnameloop; 1672 // continue stateloop; 1673 case '/': 1674 /* 1675 * U+002F SOLIDUS (/) Switch to the self-closing 1676 * start tag state. 1677 */ 1678 strBufToElementNameString(); 1679 state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos); 1680 continue stateloop; 1681 case '>': 1682 /* 1683 * U+003E GREATER-THAN SIGN (>) Emit the current 1684 * tag token. 1685 */ 1686 strBufToElementNameString(); 1687 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos); 1688 if (shouldSuspend) { 1689 break stateloop; 1690 } 1691 /* 1692 * Switch to the data state. 1693 */ 1694 continue stateloop; 1695 case '\u0000': 1696 c = '\uFFFD'; 1697 // fall thru 1698 default: 1699 if (c >= 'A' && c <= 'Z') { 1700 /* 1701 * U+0041 LATIN CAPITAL LETTER A through to 1702 * U+005A LATIN CAPITAL LETTER Z Append the 1703 * lowercase version of the current input 1704 * character (add 0x0020 to the character's 1705 * code point) to the current tag token's 1706 * tag name. 
1707 */ 1708 c += 0x20; 1709 } 1710 /* 1711 * Anything else Append the current input 1712 * character to the current tag token's tag 1713 * name. 1714 */ 1715 appendStrBuf(c); 1716 /* 1717 * Stay in the tag name state. 1718 */ 1719 continue; 1720 } 1721 } 1722 // FALLTHRU DON'T REORDER 1723 case BEFORE_ATTRIBUTE_NAME: 1724 beforeattributenameloop: for (;;) { 1725 if (reconsume) { 1726 reconsume = false; 1727 } else { 1728 if (++pos == endPos) { 1729 break stateloop; 1730 } 1731 c = checkChar(buf, pos); 1732 } 1733 /* 1734 * Consume the next input character: 1735 */ 1736 switch (c) { 1737 case '\r': 1738 silentCarriageReturn(); 1739 break stateloop; 1740 case '\n': 1741 silentLineFeed(); 1742 // fall thru 1743 case ' ': 1744 case '\t': 1745 case '\u000C': 1746 /* 1747 * U+0009 CHARACTER TABULATION U+000A LINE FEED 1748 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay 1749 * in the before attribute name state. 1750 */ 1751 continue; 1752 case '/': 1753 /* 1754 * U+002F SOLIDUS (/) Switch to the self-closing 1755 * start tag state. 1756 */ 1757 state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos); 1758 continue stateloop; 1759 case '>': 1760 /* 1761 * U+003E GREATER-THAN SIGN (>) Emit the current 1762 * tag token. 1763 */ 1764 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos); 1765 if (shouldSuspend) { 1766 break stateloop; 1767 } 1768 /* 1769 * Switch to the data state. 1770 */ 1771 continue stateloop; 1772 case '\u0000': 1773 c = '\uFFFD'; 1774 // fall thru 1775 case '\"': 1776 case '\'': 1777 case '<': 1778 case '=': 1779 /* 1780 * U+0022 QUOTATION MARK (") U+0027 APOSTROPHE 1781 * (') U+003C LESS-THAN SIGN (<) U+003D EQUALS 1782 * SIGN (=) Parse error. 1783 */ 1784 errBadCharBeforeAttributeNameOrNull(c); 1785 /* 1786 * Treat it as per the "anything else" entry 1787 * below. 1788 */ 1789 default: 1790 /* 1791 * Anything else Start a new attribute in the 1792 * current tag token. 
1793 */ 1794 if (c >= 'A' && c <= 'Z') { 1795 /* 1796 * U+0041 LATIN CAPITAL LETTER A through to 1797 * U+005A LATIN CAPITAL LETTER Z Set that 1798 * attribute's name to the lowercase version 1799 * of the current input character (add 1800 * 0x0020 to the character's code point) 1801 */ 1802 c += 0x20; 1803 } 1804 // CPPONLY: attributeLine = line; 1805 /* 1806 * Set that attribute's name to the current 1807 * input character, 1808 */ 1809 clearStrBufBeforeUse(); 1810 appendStrBuf(c); 1811 /* 1812 * and its value to the empty string. 1813 */ 1814 // Will do later. 1815 /* 1816 * Switch to the attribute name state. 1817 */ 1818 state = transition(state, Tokenizer.ATTRIBUTE_NAME, reconsume, pos); 1819 break beforeattributenameloop; 1820 // continue stateloop; 1821 } 1822 } 1823 // FALLTHRU DON'T REORDER 1824 case ATTRIBUTE_NAME: 1825 attributenameloop: for (;;) { 1826 if (++pos == endPos) { 1827 break stateloop; 1828 } 1829 c = checkChar(buf, pos); 1830 /* 1831 * Consume the next input character: 1832 */ 1833 switch (c) { 1834 case '\r': 1835 silentCarriageReturn(); 1836 attributeNameComplete(); 1837 state = transition(state, Tokenizer.AFTER_ATTRIBUTE_NAME, reconsume, pos); 1838 break stateloop; 1839 case '\n': 1840 silentLineFeed(); 1841 // fall thru 1842 case ' ': 1843 case '\t': 1844 case '\u000C': 1845 /* 1846 * U+0009 CHARACTER TABULATION U+000A LINE FEED 1847 * (LF) U+000C FORM FEED (FF) U+0020 SPACE 1848 * Switch to the after attribute name state. 1849 */ 1850 attributeNameComplete(); 1851 state = transition(state, Tokenizer.AFTER_ATTRIBUTE_NAME, reconsume, pos); 1852 continue stateloop; 1853 case '/': 1854 /* 1855 * U+002F SOLIDUS (/) Switch to the self-closing 1856 * start tag state. 
1857 */ 1858 attributeNameComplete(); 1859 addAttributeWithoutValue(); 1860 state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos); 1861 continue stateloop; 1862 case '=': 1863 /* 1864 * U+003D EQUALS SIGN (=) Switch to the before 1865 * attribute value state. 1866 */ 1867 attributeNameComplete(); 1868 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_VALUE, reconsume, pos); 1869 break attributenameloop; 1870 // continue stateloop; 1871 case '>': 1872 /* 1873 * U+003E GREATER-THAN SIGN (>) Emit the current 1874 * tag token. 1875 */ 1876 attributeNameComplete(); 1877 addAttributeWithoutValue(); 1878 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos); 1879 if (shouldSuspend) { 1880 break stateloop; 1881 } 1882 /* 1883 * Switch to the data state. 1884 */ 1885 continue stateloop; 1886 case '\u0000': 1887 c = '\uFFFD'; 1888 // fall thru 1889 case '\"': 1890 case '\'': 1891 case '<': 1892 /* 1893 * U+0022 QUOTATION MARK (") U+0027 APOSTROPHE 1894 * (') U+003C LESS-THAN SIGN (<) Parse error. 1895 */ 1896 errQuoteOrLtInAttributeNameOrNull(c); 1897 /* 1898 * Treat it as per the "anything else" entry 1899 * below. 1900 */ 1901 default: 1902 if (c >= 'A' && c <= 'Z') { 1903 /* 1904 * U+0041 LATIN CAPITAL LETTER A through to 1905 * U+005A LATIN CAPITAL LETTER Z Append the 1906 * lowercase version of the current input 1907 * character (add 0x0020 to the character's 1908 * code point) to the current attribute's 1909 * name. 1910 */ 1911 c += 0x20; 1912 } 1913 /* 1914 * Anything else Append the current input 1915 * character to the current attribute's name. 1916 */ 1917 appendStrBuf(c); 1918 /* 1919 * Stay in the attribute name state. 
1920 */ 1921 continue; 1922 } 1923 } 1924 // FALLTHRU DON'T REORDER 1925 case BEFORE_ATTRIBUTE_VALUE: 1926 beforeattributevalueloop: for (;;) { 1927 if (++pos == endPos) { 1928 break stateloop; 1929 } 1930 c = checkChar(buf, pos); 1931 /* 1932 * Consume the next input character: 1933 */ 1934 switch (c) { 1935 case '\r': 1936 silentCarriageReturn(); 1937 break stateloop; 1938 case '\n': 1939 silentLineFeed(); 1940 // fall thru 1941 case ' ': 1942 case '\t': 1943 case '\u000C': 1944 /* 1945 * U+0009 CHARACTER TABULATION U+000A LINE FEED 1946 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay 1947 * in the before attribute value state. 1948 */ 1949 continue; 1950 case '"': 1951 /* 1952 * U+0022 QUOTATION MARK (") Switch to the 1953 * attribute value (double-quoted) state. 1954 */ 1955 // CPPONLY: attributeLine = line; 1956 clearStrBufBeforeUse(); 1957 state = transition(state, Tokenizer.ATTRIBUTE_VALUE_DOUBLE_QUOTED, reconsume, pos); 1958 break beforeattributevalueloop; 1959 // continue stateloop; 1960 case '&': 1961 /* 1962 * U+0026 AMPERSAND (&) Switch to the attribute 1963 * value (unquoted) state and reconsume this 1964 * input character. 1965 */ 1966 // CPPONLY: attributeLine = line; 1967 clearStrBufBeforeUse(); 1968 reconsume = true; 1969 state = transition(state, Tokenizer.ATTRIBUTE_VALUE_UNQUOTED, reconsume, pos); 1970 noteUnquotedAttributeValue(); 1971 continue stateloop; 1972 case '\'': 1973 /* 1974 * U+0027 APOSTROPHE (') Switch to the attribute 1975 * value (single-quoted) state. 1976 */ 1977 // CPPONLY: attributeLine = line; 1978 clearStrBufBeforeUse(); 1979 state = transition(state, Tokenizer.ATTRIBUTE_VALUE_SINGLE_QUOTED, reconsume, pos); 1980 continue stateloop; 1981 case '>': 1982 /* 1983 * U+003E GREATER-THAN SIGN (>) Parse error. 1984 */ 1985 errAttributeValueMissing(); 1986 /* 1987 * Emit the current tag token. 
1988 */ 1989 addAttributeWithoutValue(); 1990 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos); 1991 if (shouldSuspend) { 1992 break stateloop; 1993 } 1994 /* 1995 * Switch to the data state. 1996 */ 1997 continue stateloop; 1998 case '\u0000': 1999 c = '\uFFFD'; 2000 // fall thru 2001 case '<': 2002 case '=': 2003 case '`': 2004 /* 2005 * U+003C LESS-THAN SIGN (<) U+003D EQUALS SIGN 2006 * (=) U+0060 GRAVE ACCENT (`) 2007 */ 2008 errLtOrEqualsOrGraveInUnquotedAttributeOrNull(c); 2009 /* 2010 * Treat it as per the "anything else" entry 2011 * below. 2012 */ 2013 default: 2014 // [NOCPP[ 2015 errHtml4NonNameInUnquotedAttribute(c); 2016 // ]NOCPP] 2017 /* 2018 * Anything else Append the current input 2019 * character to the current attribute's value. 2020 */ 2021 // CPPONLY: attributeLine = line; 2022 clearStrBufBeforeUse(); 2023 appendStrBuf(c); 2024 /* 2025 * Switch to the attribute value (unquoted) 2026 * state. 2027 */ 2028 2029 state = transition(state, Tokenizer.ATTRIBUTE_VALUE_UNQUOTED, reconsume, pos); 2030 noteUnquotedAttributeValue(); 2031 continue stateloop; 2032 } 2033 } 2034 // FALLTHRU DON'T REORDER 2035 case ATTRIBUTE_VALUE_DOUBLE_QUOTED: 2036 attributevaluedoublequotedloop: for (;;) { 2037 if (reconsume) { 2038 reconsume = false; 2039 } else { 2040 if (++pos == endPos) { 2041 break stateloop; 2042 } 2043 c = checkChar(buf, pos); 2044 } 2045 /* 2046 * Consume the next input character: 2047 */ 2048 switch (c) { 2049 case '"': 2050 /* 2051 * U+0022 QUOTATION MARK (") Switch to the after 2052 * attribute value (quoted) state. 2053 */ 2054 addAttributeWithValue(); 2055 2056 state = transition(state, Tokenizer.AFTER_ATTRIBUTE_VALUE_QUOTED, reconsume, pos); 2057 break attributevaluedoublequotedloop; 2058 // continue stateloop; 2059 case '&': 2060 /* 2061 * U+0026 AMPERSAND (&) Switch to the character 2062 * reference in attribute value state, with the 2063 * additional allowed character being U+0022 2064 * QUOTATION MARK ("). 
2065 */ 2066 assert charRefBufLen == 0: "charRefBufLen not reset after previous use!"; 2067 appendCharRefBuf(c); 2068 setAdditionalAndRememberAmpersandLocation('\"'); 2069 returnState = state; 2070 state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos); 2071 continue stateloop; 2072 case '\r': 2073 appendStrBufCarriageReturn(); 2074 break stateloop; 2075 case '\n': 2076 appendStrBufLineFeed(); 2077 continue; 2078 case '\u0000': 2079 c = '\uFFFD'; 2080 // fall thru 2081 default: 2082 /* 2083 * Anything else Append the current input 2084 * character to the current attribute's value. 2085 */ 2086 appendStrBuf(c); 2087 /* 2088 * Stay in the attribute value (double-quoted) 2089 * state. 2090 */ 2091 continue; 2092 } 2093 } 2094 // FALLTHRU DON'T REORDER 2095 case AFTER_ATTRIBUTE_VALUE_QUOTED: 2096 afterattributevaluequotedloop: for (;;) { 2097 if (++pos == endPos) { 2098 break stateloop; 2099 } 2100 c = checkChar(buf, pos); 2101 /* 2102 * Consume the next input character: 2103 */ 2104 switch (c) { 2105 case '\r': 2106 silentCarriageReturn(); 2107 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); 2108 break stateloop; 2109 case '\n': 2110 silentLineFeed(); 2111 // fall thru 2112 case ' ': 2113 case '\t': 2114 case '\u000C': 2115 /* 2116 * U+0009 CHARACTER TABULATION U+000A LINE FEED 2117 * (LF) U+000C FORM FEED (FF) U+0020 SPACE 2118 * Switch to the before attribute name state. 2119 */ 2120 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); 2121 continue stateloop; 2122 case '/': 2123 /* 2124 * U+002F SOLIDUS (/) Switch to the self-closing 2125 * start tag state. 2126 */ 2127 state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos); 2128 break afterattributevaluequotedloop; 2129 // continue stateloop; 2130 case '>': 2131 /* 2132 * U+003E GREATER-THAN SIGN (>) Emit the current 2133 * tag token. 
2134 */ 2135 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos); 2136 if (shouldSuspend) { 2137 break stateloop; 2138 } 2139 /* 2140 * Switch to the data state. 2141 */ 2142 continue stateloop; 2143 default: 2144 /* 2145 * Anything else Parse error. 2146 */ 2147 errNoSpaceBetweenAttributes(); 2148 /* 2149 * Reconsume the character in the before 2150 * attribute name state. 2151 */ 2152 reconsume = true; 2153 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); 2154 continue stateloop; 2155 } 2156 } 2157 // FALLTHRU DON'T REORDER 2158 case SELF_CLOSING_START_TAG: 2159 if (++pos == endPos) { 2160 break stateloop; 2161 } 2162 c = checkChar(buf, pos); 2163 /* 2164 * Consume the next input character: 2165 */ 2166 switch (c) { 2167 case '>': 2168 /* 2169 * U+003E GREATER-THAN SIGN (>) Set the self-closing 2170 * flag of the current tag token. Emit the current 2171 * tag token. 2172 */ 2173 // [NOCPP[ 2174 errHtml4XmlVoidSyntax(); 2175 // ]NOCPP] 2176 state = transition(state, emitCurrentTagToken(true, pos), reconsume, pos); 2177 if (shouldSuspend) { 2178 break stateloop; 2179 } 2180 /* 2181 * Switch to the data state. 2182 */ 2183 continue stateloop; 2184 default: 2185 /* Anything else Parse error. */ 2186 errSlashNotFollowedByGt(); 2187 /* 2188 * Reconsume the character in the before attribute 2189 * name state. 
2190 */ 2191 reconsume = true; 2192 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); 2193 continue stateloop; 2194 } 2195 // XXX reorder point 2196 case ATTRIBUTE_VALUE_UNQUOTED: 2197 for (;;) { 2198 if (reconsume) { 2199 reconsume = false; 2200 } else { 2201 if (++pos == endPos) { 2202 break stateloop; 2203 } 2204 c = checkChar(buf, pos); 2205 } 2206 /* 2207 * Consume the next input character: 2208 */ 2209 switch (c) { 2210 case '\r': 2211 silentCarriageReturn(); 2212 addAttributeWithValue(); 2213 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); 2214 break stateloop; 2215 case '\n': 2216 silentLineFeed(); 2217 // fall thru 2218 case ' ': 2219 case '\t': 2220 case '\u000C': 2221 /* 2222 * U+0009 CHARACTER TABULATION U+000A LINE FEED 2223 * (LF) U+000C FORM FEED (FF) U+0020 SPACE 2224 * Switch to the before attribute name state. 2225 */ 2226 addAttributeWithValue(); 2227 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); 2228 continue stateloop; 2229 case '&': 2230 /* 2231 * U+0026 AMPERSAND (&) Switch to the character 2232 * reference in attribute value state, with the 2233 * additional allowed character being U+003E 2234 * GREATER-THAN SIGN (>) 2235 */ 2236 assert charRefBufLen == 0: "charRefBufLen not reset after previous use!"; 2237 appendCharRefBuf(c); 2238 setAdditionalAndRememberAmpersandLocation('>'); 2239 returnState = state; 2240 state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos); 2241 continue stateloop; 2242 case '>': 2243 /* 2244 * U+003E GREATER-THAN SIGN (>) Emit the current 2245 * tag token. 2246 */ 2247 addAttributeWithValue(); 2248 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos); 2249 if (shouldSuspend) { 2250 break stateloop; 2251 } 2252 /* 2253 * Switch to the data state. 
2254 */ 2255 continue stateloop; 2256 case '\u0000': 2257 c = '\uFFFD'; 2258 // fall thru 2259 case '<': 2260 case '\"': 2261 case '\'': 2262 case '=': 2263 case '`': 2264 /* 2265 * U+0022 QUOTATION MARK (") U+0027 APOSTROPHE 2266 * (') U+003C LESS-THAN SIGN (<) U+003D EQUALS 2267 * SIGN (=) U+0060 GRAVE ACCENT (`) Parse error. 2268 */ 2269 errUnquotedAttributeValOrNull(c); 2270 /* 2271 * Treat it as per the "anything else" entry 2272 * below. 2273 */ 2274 // fall through 2275 default: 2276 // [NOCPP] 2277 errHtml4NonNameInUnquotedAttribute(c); 2278 // ]NOCPP] 2279 /* 2280 * Anything else Append the current input 2281 * character to the current attribute's value. 2282 */ 2283 appendStrBuf(c); 2284 /* 2285 * Stay in the attribute value (unquoted) state. 2286 */ 2287 continue; 2288 } 2289 } 2290 // XXX reorder point 2291 case AFTER_ATTRIBUTE_NAME: 2292 for (;;) { 2293 if (++pos == endPos) { 2294 break stateloop; 2295 } 2296 c = checkChar(buf, pos); 2297 /* 2298 * Consume the next input character: 2299 */ 2300 switch (c) { 2301 case '\r': 2302 silentCarriageReturn(); 2303 break stateloop; 2304 case '\n': 2305 silentLineFeed(); 2306 // fall thru 2307 case ' ': 2308 case '\t': 2309 case '\u000C': 2310 /* 2311 * U+0009 CHARACTER TABULATION U+000A LINE FEED 2312 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay 2313 * in the after attribute name state. 2314 */ 2315 continue; 2316 case '/': 2317 /* 2318 * U+002F SOLIDUS (/) Switch to the self-closing 2319 * start tag state. 2320 */ 2321 addAttributeWithoutValue(); 2322 state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos); 2323 continue stateloop; 2324 case '=': 2325 /* 2326 * U+003D EQUALS SIGN (=) Switch to the before 2327 * attribute value state. 2328 */ 2329 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_VALUE, reconsume, pos); 2330 continue stateloop; 2331 case '>': 2332 /* 2333 * U+003E GREATER-THAN SIGN (>) Emit the current 2334 * tag token. 
2335 */ 2336 addAttributeWithoutValue(); 2337 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos); 2338 if (shouldSuspend) { 2339 break stateloop; 2340 } 2341 /* 2342 * Switch to the data state. 2343 */ 2344 continue stateloop; 2345 case '\u0000': 2346 c = '\uFFFD'; 2347 // fall thru 2348 case '\"': 2349 case '\'': 2350 case '<': 2351 errQuoteOrLtInAttributeNameOrNull(c); 2352 /* 2353 * Treat it as per the "anything else" entry 2354 * below. 2355 */ 2356 default: 2357 addAttributeWithoutValue(); 2358 /* 2359 * Anything else Start a new attribute in the 2360 * current tag token. 2361 */ 2362 if (c >= 'A' && c <= 'Z') { 2363 /* 2364 * U+0041 LATIN CAPITAL LETTER A through to 2365 * U+005A LATIN CAPITAL LETTER Z Set that 2366 * attribute's name to the lowercase version 2367 * of the current input character (add 2368 * 0x0020 to the character's code point) 2369 */ 2370 c += 0x20; 2371 } 2372 /* 2373 * Set that attribute's name to the current 2374 * input character, 2375 */ 2376 clearStrBufBeforeUse(); 2377 appendStrBuf(c); 2378 /* 2379 * and its value to the empty string. 2380 */ 2381 // Will do later. 2382 /* 2383 * Switch to the attribute name state. 2384 */ 2385 state = transition(state, Tokenizer.ATTRIBUTE_NAME, reconsume, pos); 2386 continue stateloop; 2387 } 2388 } 2389 // XXX reorder point 2390 case MARKUP_DECLARATION_OPEN: 2391 markupdeclarationopenloop: for (;;) { 2392 if (++pos == endPos) { 2393 break stateloop; 2394 } 2395 c = checkChar(buf, pos); 2396 /* 2397 * If the next two characters are both U+002D 2398 * HYPHEN-MINUS characters (-), consume those two 2399 * characters, create a comment token whose data is the 2400 * empty string, and switch to the comment start state. 2401 * 2402 * Otherwise, if the next seven characters are an ASCII 2403 * case-insensitive match for the word "DOCTYPE", then 2404 * consume those characters and switch to the DOCTYPE 2405 * state. 
2406 * 2407 * Otherwise, if the insertion mode is 2408 * "in foreign content" and the current node is not an 2409 * element in the HTML namespace and the next seven 2410 * characters are an case-sensitive match for the string 2411 * "[CDATA[" (the five uppercase letters "CDATA" with a 2412 * U+005B LEFT SQUARE BRACKET character before and 2413 * after), then consume those characters and switch to 2414 * the CDATA section state. 2415 * 2416 * Otherwise, is is a parse error. Switch to the bogus 2417 * comment state. The next character that is consumed, 2418 * if any, is the first character that will be in the 2419 * comment. 2420 */ 2421 switch (c) { 2422 case '-': 2423 clearStrBufBeforeUse(); 2424 appendStrBuf(c); 2425 state = transition(state, Tokenizer.MARKUP_DECLARATION_HYPHEN, reconsume, pos); 2426 break markupdeclarationopenloop; 2427 // continue stateloop; 2428 case 'd': 2429 case 'D': 2430 clearStrBufBeforeUse(); 2431 appendStrBuf(c); 2432 index = 0; 2433 state = transition(state, Tokenizer.MARKUP_DECLARATION_OCTYPE, reconsume, pos); 2434 continue stateloop; 2435 case '[': 2436 if (tokenHandler.cdataSectionAllowed()) { 2437 clearStrBufBeforeUse(); 2438 appendStrBuf(c); 2439 index = 0; 2440 state = transition(state, Tokenizer.CDATA_START, reconsume, pos); 2441 continue stateloop; 2442 } 2443 // else fall through 2444 default: 2445 errBogusComment(); 2446 clearStrBufBeforeUse(); 2447 reconsume = true; 2448 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); 2449 continue stateloop; 2450 } 2451 } 2452 // FALLTHRU DON'T REORDER 2453 case MARKUP_DECLARATION_HYPHEN: 2454 markupdeclarationhyphenloop: for (;;) { 2455 if (++pos == endPos) { 2456 break stateloop; 2457 } 2458 c = checkChar(buf, pos); 2459 switch (c) { 2460 case '\u0000': 2461 break stateloop; 2462 case '-': 2463 clearStrBufAfterOneHyphen(); 2464 state = transition(state, Tokenizer.COMMENT_START, reconsume, pos); 2465 break markupdeclarationhyphenloop; 2466 // continue stateloop; 2467 
default: 2468 errBogusComment(); 2469 reconsume = true; 2470 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); 2471 continue stateloop; 2472 } 2473 } 2474 // FALLTHRU DON'T REORDER 2475 case COMMENT_START: 2476 commentstartloop: for (;;) { 2477 if (++pos == endPos) { 2478 break stateloop; 2479 } 2480 c = checkChar(buf, pos); 2481 /* 2482 * Comment start state 2483 * 2484 * 2485 * Consume the next input character: 2486 */ 2487 switch (c) { 2488 case '-': 2489 /* 2490 * U+002D HYPHEN-MINUS (-) Switch to the comment 2491 * start dash state. 2492 */ 2493 appendStrBuf(c); 2494 state = transition(state, Tokenizer.COMMENT_START_DASH, reconsume, pos); 2495 continue stateloop; 2496 case '>': 2497 /* 2498 * U+003E GREATER-THAN SIGN (>) Parse error. 2499 */ 2500 errPrematureEndOfComment(); 2501 /* Emit the comment token. */ 2502 emitComment(0, pos); 2503 /* 2504 * Switch to the data state. 2505 */ 2506 state = transition(state, Tokenizer.DATA, reconsume, pos); 2507 continue stateloop; 2508 case '\r': 2509 appendStrBufCarriageReturn(); 2510 state = transition(state, Tokenizer.COMMENT, reconsume, pos); 2511 break stateloop; 2512 case '\n': 2513 appendStrBufLineFeed(); 2514 state = transition(state, Tokenizer.COMMENT, reconsume, pos); 2515 break commentstartloop; 2516 case '\u0000': 2517 c = '\uFFFD'; 2518 // fall thru 2519 default: 2520 /* 2521 * Anything else Append the input character to 2522 * the comment token's data. 2523 */ 2524 appendStrBuf(c); 2525 /* 2526 * Switch to the comment state. 
2527 */ 2528 state = transition(state, Tokenizer.COMMENT, reconsume, pos); 2529 break commentstartloop; 2530 // continue stateloop; 2531 } 2532 } 2533 // FALLTHRU DON'T REORDER 2534 case COMMENT: 2535 commentloop: for (;;) { 2536 if (++pos == endPos) { 2537 break stateloop; 2538 } 2539 c = checkChar(buf, pos); 2540 /* 2541 * Comment state Consume the next input character: 2542 */ 2543 switch (c) { 2544 case '-': 2545 /* 2546 * U+002D HYPHEN-MINUS (-) Switch to the comment 2547 * end dash state 2548 */ 2549 appendStrBuf(c); 2550 state = transition(state, Tokenizer.COMMENT_END_DASH, reconsume, pos); 2551 break commentloop; 2552 // continue stateloop; 2553 case '\r': 2554 appendStrBufCarriageReturn(); 2555 break stateloop; 2556 case '\n': 2557 appendStrBufLineFeed(); 2558 continue; 2559 case '\u0000': 2560 c = '\uFFFD'; 2561 // fall thru 2562 default: 2563 /* 2564 * Anything else Append the input character to 2565 * the comment token's data. 2566 */ 2567 appendStrBuf(c); 2568 /* 2569 * Stay in the comment state. 
2570 */ 2571 continue; 2572 } 2573 } 2574 // FALLTHRU DON'T REORDER 2575 case COMMENT_END_DASH: 2576 commentenddashloop: for (;;) { 2577 if (++pos == endPos) { 2578 break stateloop; 2579 } 2580 c = checkChar(buf, pos); 2581 /* 2582 * Comment end dash state Consume the next input 2583 * character: 2584 */ 2585 switch (c) { 2586 case '-': 2587 /* 2588 * U+002D HYPHEN-MINUS (-) Switch to the comment 2589 * end state 2590 */ 2591 appendStrBuf(c); 2592 state = transition(state, Tokenizer.COMMENT_END, reconsume, pos); 2593 break commentenddashloop; 2594 // continue stateloop; 2595 case '\r': 2596 appendStrBufCarriageReturn(); 2597 state = transition(state, Tokenizer.COMMENT, reconsume, pos); 2598 break stateloop; 2599 case '\n': 2600 appendStrBufLineFeed(); 2601 state = transition(state, Tokenizer.COMMENT, reconsume, pos); 2602 continue stateloop; 2603 case '\u0000': 2604 c = '\uFFFD'; 2605 // fall thru 2606 default: 2607 /* 2608 * Anything else Append a U+002D HYPHEN-MINUS 2609 * (-) character and the input character to the 2610 * comment token's data. 2611 */ 2612 appendStrBuf(c); 2613 /* 2614 * Switch to the comment state. 2615 */ 2616 state = transition(state, Tokenizer.COMMENT, reconsume, pos); 2617 continue stateloop; 2618 } 2619 } 2620 // FALLTHRU DON'T REORDER 2621 case COMMENT_END: 2622 commentendloop: for (;;) { 2623 if (++pos == endPos) { 2624 break stateloop; 2625 } 2626 c = checkChar(buf, pos); 2627 /* 2628 * Comment end dash state Consume the next input 2629 * character: 2630 */ 2631 switch (c) { 2632 case '>': 2633 /* 2634 * U+003E GREATER-THAN SIGN (>) Emit the comment 2635 * token. 2636 */ 2637 emitComment(2, pos); 2638 /* 2639 * Switch to the data state. 2640 */ 2641 state = transition(state, Tokenizer.DATA, reconsume, pos); 2642 continue stateloop; 2643 case '-': 2644 /* U+002D HYPHEN-MINUS (-) Parse error. */ 2645 /* 2646 * Append a U+002D HYPHEN-MINUS (-) character to 2647 * the comment token's data. 
2648 */ 2649 adjustDoubleHyphenAndAppendToStrBufAndErr(c); 2650 /* 2651 * Stay in the comment end state. 2652 */ 2653 continue; 2654 case '\r': 2655 adjustDoubleHyphenAndAppendToStrBufCarriageReturn(); 2656 state = transition(state, Tokenizer.COMMENT, reconsume, pos); 2657 break stateloop; 2658 case '\n': 2659 adjustDoubleHyphenAndAppendToStrBufLineFeed(); 2660 state = transition(state, Tokenizer.COMMENT, reconsume, pos); 2661 continue stateloop; 2662 case '!': 2663 errHyphenHyphenBang(); 2664 appendStrBuf(c); 2665 state = transition(state, Tokenizer.COMMENT_END_BANG, reconsume, pos); 2666 continue stateloop; 2667 case '\u0000': 2668 c = '\uFFFD'; 2669 // fall thru 2670 default: 2671 /* 2672 * Append two U+002D HYPHEN-MINUS (-) characters 2673 * and the input character to the comment 2674 * token's data. 2675 */ 2676 adjustDoubleHyphenAndAppendToStrBufAndErr(c); 2677 /* 2678 * Switch to the comment state. 2679 */ 2680 state = transition(state, Tokenizer.COMMENT, reconsume, pos); 2681 continue stateloop; 2682 } 2683 } 2684 // XXX reorder point 2685 case COMMENT_END_BANG: 2686 for (;;) { 2687 if (++pos == endPos) { 2688 break stateloop; 2689 } 2690 c = checkChar(buf, pos); 2691 /* 2692 * Comment end bang state 2693 * 2694 * Consume the next input character: 2695 */ 2696 switch (c) { 2697 case '>': 2698 /* 2699 * U+003E GREATER-THAN SIGN (>) Emit the comment 2700 * token. 2701 */ 2702 emitComment(3, pos); 2703 /* 2704 * Switch to the data state. 2705 */ 2706 state = transition(state, Tokenizer.DATA, reconsume, pos); 2707 continue stateloop; 2708 case '-': 2709 /* 2710 * Append two U+002D HYPHEN-MINUS (-) characters 2711 * and a U+0021 EXCLAMATION MARK (!) character 2712 * to the comment token's data. 2713 */ 2714 appendStrBuf(c); 2715 /* 2716 * Switch to the comment end dash state. 
2717 */ 2718 state = transition(state, Tokenizer.COMMENT_END_DASH, reconsume, pos); 2719 continue stateloop; 2720 case '\r': 2721 appendStrBufCarriageReturn(); 2722 break stateloop; 2723 case '\n': 2724 appendStrBufLineFeed(); 2725 continue; 2726 case '\u0000': 2727 c = '\uFFFD'; 2728 // fall thru 2729 default: 2730 /* 2731 * Anything else Append two U+002D HYPHEN-MINUS 2732 * (-) characters, a U+0021 EXCLAMATION MARK (!) 2733 * character, and the input character to the 2734 * comment token's data. Switch to the comment 2735 * state. 2736 */ 2737 appendStrBuf(c); 2738 /* 2739 * Switch to the comment state. 2740 */ 2741 state = transition(state, Tokenizer.COMMENT, reconsume, pos); 2742 continue stateloop; 2743 } 2744 } 2745 // XXX reorder point 2746 case COMMENT_START_DASH: 2747 if (++pos == endPos) { 2748 break stateloop; 2749 } 2750 c = checkChar(buf, pos); 2751 /* 2752 * Comment start dash state 2753 * 2754 * Consume the next input character: 2755 */ 2756 switch (c) { 2757 case '-': 2758 /* 2759 * U+002D HYPHEN-MINUS (-) Switch to the comment end 2760 * state 2761 */ 2762 appendStrBuf(c); 2763 state = transition(state, Tokenizer.COMMENT_END, reconsume, pos); 2764 continue stateloop; 2765 case '>': 2766 errPrematureEndOfComment(); 2767 /* Emit the comment token. */ 2768 emitComment(1, pos); 2769 /* 2770 * Switch to the data state. 2771 */ 2772 state = transition(state, Tokenizer.DATA, reconsume, pos); 2773 continue stateloop; 2774 case '\r': 2775 appendStrBufCarriageReturn(); 2776 state = transition(state, Tokenizer.COMMENT, reconsume, pos); 2777 break stateloop; 2778 case '\n': 2779 appendStrBufLineFeed(); 2780 state = transition(state, Tokenizer.COMMENT, reconsume, pos); 2781 continue stateloop; 2782 case '\u0000': 2783 c = '\uFFFD'; 2784 // fall thru 2785 default: 2786 /* 2787 * Append a U+002D HYPHEN-MINUS character (-) and 2788 * the current input character to the comment 2789 * token's data. 
2790 */ 2791 appendStrBuf(c); 2792 /* 2793 * Switch to the comment state. 2794 */ 2795 state = transition(state, Tokenizer.COMMENT, reconsume, pos); 2796 continue stateloop; 2797 } 2798 // XXX reorder point 2799 case CDATA_START: 2800 for (;;) { 2801 if (++pos == endPos) { 2802 break stateloop; 2803 } 2804 c = checkChar(buf, pos); 2805 if (index < 6) { // CDATA_LSQB.length 2806 if (c == Tokenizer.CDATA_LSQB[index]) { 2807 appendStrBuf(c); 2808 } else { 2809 errBogusComment(); 2810 reconsume = true; 2811 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); 2812 continue stateloop; 2813 } 2814 index++; 2815 continue; 2816 } else { 2817 clearStrBufAfterUse(); 2818 cstart = pos; // start coalescing 2819 reconsume = true; 2820 state = transition(state, Tokenizer.CDATA_SECTION, reconsume, pos); 2821 break; // FALL THROUGH continue stateloop; 2822 } 2823 } 2824 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER 2825 case CDATA_SECTION: 2826 cdatasectionloop: for (;;) { 2827 if (reconsume) { 2828 reconsume = false; 2829 } else { 2830 if (++pos == endPos) { 2831 break stateloop; 2832 } 2833 c = checkChar(buf, pos); 2834 } 2835 switch (c) { 2836 case ']': 2837 flushChars(buf, pos); 2838 state = transition(state, Tokenizer.CDATA_RSQB, reconsume, pos); 2839 break cdatasectionloop; // FALL THROUGH 2840 case '\u0000': 2841 emitReplacementCharacter(buf, pos); 2842 continue; 2843 case '\r': 2844 emitCarriageReturn(buf, pos); 2845 break stateloop; 2846 case '\n': 2847 silentLineFeed(); 2848 // fall thru 2849 default: 2850 continue; 2851 } 2852 } 2853 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER 2854 case CDATA_RSQB: 2855 cdatarsqb: for (;;) { 2856 if (++pos == endPos) { 2857 break stateloop; 2858 } 2859 c = checkChar(buf, pos); 2860 switch (c) { 2861 case ']': 2862 state = transition(state, Tokenizer.CDATA_RSQB_RSQB, reconsume, pos); 2863 break cdatarsqb; 2864 default: 2865 tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 2866 1); 2867 cstart = pos; 2868 
reconsume = true; 2869 state = transition(state, Tokenizer.CDATA_SECTION, reconsume, pos); 2870 continue stateloop; 2871 } 2872 } 2873 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER 2874 case CDATA_RSQB_RSQB: 2875 cdatarsqbrsqb: for (;;) { 2876 if (++pos == endPos) { 2877 break stateloop; 2878 } 2879 c = checkChar(buf, pos); 2880 switch (c) { 2881 case ']': 2882 // Saw a third ]. Emit one ] (logically the 2883 // first one) and stay in this state to 2884 // remember that the last two characters seen 2885 // have been ]]. 2886 tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 1); 2887 continue; 2888 case '>': 2889 cstart = pos + 1; 2890 state = transition(state, Tokenizer.DATA, reconsume, pos); 2891 continue stateloop; 2892 default: 2893 tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 2); 2894 cstart = pos; 2895 reconsume = true; 2896 state = transition(state, Tokenizer.CDATA_SECTION, reconsume, pos); 2897 continue stateloop; 2898 } 2899 } 2900 // XXX reorder point 2901 case ATTRIBUTE_VALUE_SINGLE_QUOTED: 2902 attributevaluesinglequotedloop: for (;;) { 2903 if (reconsume) { 2904 reconsume = false; 2905 } else { 2906 if (++pos == endPos) { 2907 break stateloop; 2908 } 2909 c = checkChar(buf, pos); 2910 } 2911 /* 2912 * Consume the next input character: 2913 */ 2914 switch (c) { 2915 case '\'': 2916 /* 2917 * U+0027 APOSTROPHE (') Switch to the after 2918 * attribute value (quoted) state. 2919 */ 2920 addAttributeWithValue(); 2921 2922 state = transition(state, Tokenizer.AFTER_ATTRIBUTE_VALUE_QUOTED, reconsume, pos); 2923 continue stateloop; 2924 case '&': 2925 /* 2926 * U+0026 AMPERSAND (&) Switch to the character 2927 * reference in attribute value state, with the 2928 * + additional allowed character being U+0027 2929 * APOSTROPHE ('). 
2930 */ 2931 assert charRefBufLen == 0: "charRefBufLen not reset after previous use!"; 2932 appendCharRefBuf(c); 2933 setAdditionalAndRememberAmpersandLocation('\''); 2934 returnState = state; 2935 state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos); 2936 break attributevaluesinglequotedloop; 2937 // continue stateloop; 2938 case '\r': 2939 appendStrBufCarriageReturn(); 2940 break stateloop; 2941 case '\n': 2942 appendStrBufLineFeed(); 2943 continue; 2944 case '\u0000': 2945 c = '\uFFFD'; 2946 // fall thru 2947 default: 2948 /* 2949 * Anything else Append the current input 2950 * character to the current attribute's value. 2951 */ 2952 appendStrBuf(c); 2953 /* 2954 * Stay in the attribute value (single-quoted) 2955 * state. 2956 */ 2957 continue; 2958 } 2959 } 2960 // FALLTHRU DON'T REORDER 2961 case CONSUME_CHARACTER_REFERENCE: 2962 if (++pos == endPos) { 2963 break stateloop; 2964 } 2965 c = checkChar(buf, pos); 2966 if (c == '\u0000') { 2967 break stateloop; 2968 } 2969 /* 2970 * Unlike the definition in the spec, this state does not 2971 * return a value and never requires the caller to 2972 * backtrack. This state takes care of emitting characters 2973 * or appending to the current attribute value. It also 2974 * takes care of that in the case when consuming the 2975 * character reference fails. 2976 */ 2977 /* 2978 * This section defines how to consume a character 2979 * reference. This definition is used when parsing character 2980 * references in text and in attributes. 2981 * 2982 * The behavior depends on the identity of the next 2983 * character (the one immediately after the U+0026 AMPERSAND 2984 * character): 2985 */ 2986 switch (c) { 2987 case ' ': 2988 case '\t': 2989 case '\n': 2990 case '\r': // we'll reconsume! 
2991 case '\u000C': 2992 case '<': 2993 case '&': 2994 emitOrAppendCharRefBuf(returnState); 2995 if ((returnState & DATA_AND_RCDATA_MASK) == 0) { 2996 cstart = pos; 2997 } 2998 reconsume = true; 2999 state = transition(state, returnState, reconsume, pos); 3000 continue stateloop; 3001 case '#': 3002 /* 3003 * U+0023 NUMBER SIGN (#) Consume the U+0023 NUMBER 3004 * SIGN. 3005 */ 3006 appendCharRefBuf('#'); 3007 state = transition(state, Tokenizer.CONSUME_NCR, reconsume, pos); 3008 continue stateloop; 3009 default: 3010 if (c == additional) { 3011 emitOrAppendCharRefBuf(returnState); 3012 reconsume = true; 3013 state = transition(state, returnState, reconsume, pos); 3014 continue stateloop; 3015 } 3016 if (c >= 'a' && c <= 'z') { 3017 firstCharKey = c - 'a' + 26; 3018 } else if (c >= 'A' && c <= 'Z') { 3019 firstCharKey = c - 'A'; 3020 } else { 3021 // No match 3022 /* 3023 * If no match can be made, then this is a parse 3024 * error. 3025 */ 3026 errNoNamedCharacterMatch(); 3027 emitOrAppendCharRefBuf(returnState); 3028 if ((returnState & DATA_AND_RCDATA_MASK) == 0) { 3029 cstart = pos; 3030 } 3031 reconsume = true; 3032 state = transition(state, returnState, reconsume, pos); 3033 continue stateloop; 3034 } 3035 // Didn't fail yet 3036 appendCharRefBuf(c); 3037 state = transition(state, Tokenizer.CHARACTER_REFERENCE_HILO_LOOKUP, reconsume, pos); 3038 // FALL THROUGH continue stateloop; 3039 } 3040 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER 3041 case CHARACTER_REFERENCE_HILO_LOOKUP: 3042 { 3043 if (++pos == endPos) { 3044 break stateloop; 3045 } 3046 c = checkChar(buf, pos); 3047 if (c == '\u0000') { 3048 break stateloop; 3049 } 3050 /* 3051 * The data structure is as follows: 3052 * 3053 * HILO_ACCEL is a two-dimensional int array whose major 3054 * index corresponds to the second character of the 3055 * character reference (code point as index) and the 3056 * minor index corresponds to the first character of the 3057 * character reference (packed so that 
A-Z runs from 0 3058 * to 25 and a-z runs from 26 to 51). This layout makes 3059 * it easier to use the sparseness of the data structure 3060 * to omit parts of it: The second dimension of the 3061 * table is null when no character reference starts with 3062 * the character corresponding to that row. 3063 * 3064 * The int value HILO_ACCEL (by these indices) is zero 3065 * if there exists no character reference starting with 3066 * that two-letter prefix. Otherwise, the value is an 3067 * int that packs two shorts so that the higher short is 3068 * the index of the highest character reference name 3069 * with that prefix in NAMES and the lower short 3070 * corresponds to the index of the lowest character 3071 * reference name with that prefix. (It happens that the 3072 * first two character reference names share their 3073 * prefix so the packed int cannot be 0 by packing the 3074 * two shorts.) 3075 * 3076 * NAMES is an array of byte arrays where each byte 3077 * array encodes the name of a character reference as 3078 * ASCII. The names omit the first two letters of the 3079 * name. (Since storing the first two letters would be 3080 * redundant with the data contained in HILO_ACCEL.) The 3081 * entries are lexically sorted. 3082 * 3083 * For a given index in NAMES, the same index in VALUES 3084 * contains the corresponding expansion as an array of 3085 * two UTF-16 code units (either the character and 3086 * U+0000 or a surrogate pair). 3087 */ 3088 int hilo = 0; 3089 if (c <= 'z') { 3090 @Const @NoLength int[] row = NamedCharactersAccel.HILO_ACCEL[c]; 3091 if (row != null) { 3092 hilo = row[firstCharKey]; 3093 } 3094 } 3095 if (hilo == 0) { 3096 /* 3097 * If no match can be made, then this is a parse 3098 * error. 
3099 */ 3100 errNoNamedCharacterMatch(); 3101 emitOrAppendCharRefBuf(returnState); 3102 if ((returnState & DATA_AND_RCDATA_MASK) == 0) { 3103 cstart = pos; 3104 } 3105 reconsume = true; 3106 state = transition(state, returnState, reconsume, pos); 3107 continue stateloop; 3108 } 3109 // Didn't fail yet 3110 appendCharRefBuf(c); 3111 lo = hilo & 0xFFFF; 3112 hi = hilo >> 16; 3113 entCol = -1; 3114 candidate = -1; 3115 charRefBufMark = 0; 3116 state = transition(state, Tokenizer.CHARACTER_REFERENCE_TAIL, reconsume, pos); 3117 // FALL THROUGH continue stateloop; 3118 } 3119 case CHARACTER_REFERENCE_TAIL: 3120 outer: for (;;) { 3121 if (++pos == endPos) { 3122 break stateloop; 3123 } 3124 c = checkChar(buf, pos); 3125 if (c == '\u0000') { 3126 break stateloop; 3127 } 3128 entCol++; 3129 /* 3130 * Consume the maximum number of characters possible, 3131 * with the consumed characters matching one of the 3132 * identifiers in the first column of the named 3133 * character references table (in a case-sensitive 3134 * manner). 3135 */ 3136 loloop: for (;;) { 3137 if (hi < lo) { 3138 break outer; 3139 } 3140 if (entCol == NamedCharacters.NAMES[lo].length()) { 3141 candidate = lo; 3142 charRefBufMark = charRefBufLen; 3143 lo++; 3144 } else if (entCol > NamedCharacters.NAMES[lo].length()) { 3145 break outer; 3146 } else if (c > NamedCharacters.NAMES[lo].charAt(entCol)) { 3147 lo++; 3148 } else { 3149 break loloop; 3150 } 3151 } 3152 3153 hiloop: for (;;) { 3154 if (hi < lo) { 3155 break outer; 3156 } 3157 if (entCol == NamedCharacters.NAMES[hi].length()) { 3158 break hiloop; 3159 } 3160 if (entCol > NamedCharacters.NAMES[hi].length()) { 3161 break outer; 3162 } else if (c < NamedCharacters.NAMES[hi].charAt(entCol)) { 3163 hi--; 3164 } else { 3165 break hiloop; 3166 } 3167 } 3168 3169 if (c == ';') { 3170 // If we see a semicolon, there cannot be a 3171 // longer match. Break the loop. 
However, before 3172 // breaking, take the longest match so far as the 3173 // candidate, if we are just about to complete a 3174 // match. 3175 if (entCol + 1 == NamedCharacters.NAMES[lo].length()) { 3176 candidate = lo; 3177 charRefBufMark = charRefBufLen; 3178 } 3179 break outer; 3180 } 3181 3182 if (hi < lo) { 3183 break outer; 3184 } 3185 appendCharRefBuf(c); 3186 continue; 3187 } 3188 3189 if (candidate == -1) { 3190 // reconsume deals with CR, LF or nul 3191 /* 3192 * If no match can be made, then this is a parse error. 3193 */ 3194 errNoNamedCharacterMatch(); 3195 emitOrAppendCharRefBuf(returnState); 3196 if ((returnState & DATA_AND_RCDATA_MASK) == 0) { 3197 cstart = pos; 3198 } 3199 reconsume = true; 3200 state = transition(state, returnState, reconsume, pos); 3201 continue stateloop; 3202 } else { 3203 // c can't be CR, LF or nul if we got here 3204 @Const @CharacterName String candidateName = NamedCharacters.NAMES[candidate]; 3205 if (candidateName.length() == 0 3206 || candidateName.charAt(candidateName.length() - 1) != ';') { 3207 /* 3208 * If the last character matched is not a U+003B 3209 * SEMICOLON (;), there is a parse error. 
3210 */ 3211 if ((returnState & DATA_AND_RCDATA_MASK) != 0) { 3212 /* 3213 * If the entity is being consumed as part of an 3214 * attribute, and the last character matched is 3215 * not a U+003B SEMICOLON (;), 3216 */ 3217 char ch; 3218 if (charRefBufMark == charRefBufLen) { 3219 ch = c; 3220 } else { 3221 ch = charRefBuf[charRefBufMark]; 3222 } 3223 if (ch == '=' || (ch >= '0' && ch <= '9') 3224 || (ch >= 'A' && ch <= 'Z') 3225 || (ch >= 'a' && ch <= 'z')) { 3226 /* 3227 * and the next character is either a U+003D 3228 * EQUALS SIGN character (=) or in the range 3229 * U+0030 DIGIT ZERO to U+0039 DIGIT NINE, 3230 * U+0041 LATIN CAPITAL LETTER A to U+005A 3231 * LATIN CAPITAL LETTER Z, or U+0061 LATIN 3232 * SMALL LETTER A to U+007A LATIN SMALL 3233 * LETTER Z, then, for historical reasons, 3234 * all the characters that were matched 3235 * after the U+0026 AMPERSAND (&) must be 3236 * unconsumed, and nothing is returned. 3237 */ 3238 errNoNamedCharacterMatch(); 3239 appendCharRefBufToStrBuf(); 3240 reconsume = true; 3241 state = transition(state, returnState, reconsume, pos); 3242 continue stateloop; 3243 } 3244 } 3245 if ((returnState & DATA_AND_RCDATA_MASK) != 0) { 3246 errUnescapedAmpersandInterpretedAsCharacterReference(); 3247 } else { 3248 errNotSemicolonTerminated(); 3249 } 3250 } 3251 3252 /* 3253 * Otherwise, return a character token for the character 3254 * corresponding to the entity name (as given by the 3255 * second column of the named character references 3256 * table). 3257 */ 3258 // CPPONLY: completedNamedCharacterReference(); 3259 @Const @NoLength char[] val = NamedCharacters.VALUES[candidate]; 3260 if ( 3261 // [NOCPP[ 3262 val.length == 1 3263 // ]NOCPP] 3264 // CPPONLY: val[1] == 0 3265 ) { 3266 emitOrAppendOne(val, returnState); 3267 } else { 3268 emitOrAppendTwo(val, returnState); 3269 } 3270 // this is so complicated! 
3271 if (charRefBufMark < charRefBufLen) { 3272 if ((returnState & DATA_AND_RCDATA_MASK) != 0) { 3273 appendStrBuf(charRefBuf, charRefBufMark, 3274 charRefBufLen - charRefBufMark); 3275 } else { 3276 tokenHandler.characters(charRefBuf, charRefBufMark, 3277 charRefBufLen - charRefBufMark); 3278 } 3279 } 3280 // charRefBufLen will be zeroed below! 3281 3282 // Check if we broke out early with c being the last 3283 // character that matched as opposed to being the 3284 // first one that didn't match. In the case of an 3285 // early break, the next run on text should start 3286 // *after* the current character and the current 3287 // character shouldn't be reconsumed. 3288 boolean earlyBreak = (c == ';' && charRefBufMark == charRefBufLen); 3289 charRefBufLen = 0; 3290 if ((returnState & DATA_AND_RCDATA_MASK) == 0) { 3291 cstart = earlyBreak ? pos + 1 : pos; 3292 } 3293 reconsume = !earlyBreak; 3294 state = transition(state, returnState, reconsume, pos); 3295 continue stateloop; 3296 /* 3297 * If the markup contains I'm ¬it; I tell you, the 3298 * entity is parsed as "not", as in, I'm ¬it; I tell 3299 * you. But if the markup was I'm ∉ I tell you, 3300 * the entity would be parsed as "notin;", resulting in 3301 * I'm ∉ I tell you. 3302 */ 3303 } 3304 // XXX reorder point 3305 case CONSUME_NCR: 3306 if (++pos == endPos) { 3307 break stateloop; 3308 } 3309 c = checkChar(buf, pos); 3310 value = 0; 3311 seenDigits = false; 3312 /* 3313 * The behavior further depends on the character after the 3314 * U+0023 NUMBER SIGN: 3315 */ 3316 switch (c) { 3317 case 'x': 3318 case 'X': 3319 3320 /* 3321 * U+0078 LATIN SMALL LETTER X U+0058 LATIN CAPITAL 3322 * LETTER X Consume the X. 
3323 * 3324 * Follow the steps below, but using the range of 3325 * characters U+0030 DIGIT ZERO through to U+0039 3326 * DIGIT NINE, U+0061 LATIN SMALL LETTER A through 3327 * to U+0066 LATIN SMALL LETTER F, and U+0041 LATIN 3328 * CAPITAL LETTER A, through to U+0046 LATIN CAPITAL 3329 * LETTER F (in other words, 0-9, A-F, a-f). 3330 * 3331 * When it comes to interpreting the number, 3332 * interpret it as a hexadecimal number. 3333 */ 3334 appendCharRefBuf(c); 3335 state = transition(state, Tokenizer.HEX_NCR_LOOP, reconsume, pos); 3336 continue stateloop; 3337 default: 3338 /* 3339 * Anything else Follow the steps below, but using 3340 * the range of characters U+0030 DIGIT ZERO through 3341 * to U+0039 DIGIT NINE (i.e. just 0-9). 3342 * 3343 * When it comes to interpreting the number, 3344 * interpret it as a decimal number. 3345 */ 3346 reconsume = true; 3347 state = transition(state, Tokenizer.DECIMAL_NRC_LOOP, reconsume, pos); 3348 // FALL THROUGH continue stateloop; 3349 } 3350 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER 3351 case DECIMAL_NRC_LOOP: 3352 decimalloop: for (;;) { 3353 if (reconsume) { 3354 reconsume = false; 3355 } else { 3356 if (++pos == endPos) { 3357 break stateloop; 3358 } 3359 c = checkChar(buf, pos); 3360 } 3361 /* 3362 * Consume as many characters as match the range of 3363 * characters given above. 
3364 */ 3365 assert value >= 0: "value must not become negative."; 3366 if (c >= '0' && c <= '9') { 3367 seenDigits = true; 3368 // Avoid overflow 3369 if (value <= 0x10FFFF) { 3370 value *= 10; 3371 value += c - '0'; 3372 } 3373 continue; 3374 } else if (c == ';') { 3375 if (seenDigits) { 3376 if ((returnState & DATA_AND_RCDATA_MASK) == 0) { 3377 cstart = pos + 1; 3378 } 3379 state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos); 3380 // FALL THROUGH continue stateloop; 3381 break decimalloop; 3382 } else { 3383 errNoDigitsInNCR(); 3384 appendCharRefBuf(';'); 3385 emitOrAppendCharRefBuf(returnState); 3386 if ((returnState & DATA_AND_RCDATA_MASK) == 0) { 3387 cstart = pos + 1; 3388 } 3389 state = transition(state, returnState, reconsume, pos); 3390 continue stateloop; 3391 } 3392 } else { 3393 /* 3394 * If no characters match the range, then don't 3395 * consume any characters (and unconsume the U+0023 3396 * NUMBER SIGN character and, if appropriate, the X 3397 * character). This is a parse error; nothing is 3398 * returned. 3399 * 3400 * Otherwise, if the next character is a U+003B 3401 * SEMICOLON, consume that too. If it isn't, there 3402 * is a parse error. 3403 */ 3404 if (!seenDigits) { 3405 errNoDigitsInNCR(); 3406 emitOrAppendCharRefBuf(returnState); 3407 if ((returnState & DATA_AND_RCDATA_MASK) == 0) { 3408 cstart = pos; 3409 } 3410 reconsume = true; 3411 state = transition(state, returnState, reconsume, pos); 3412 continue stateloop; 3413 } else { 3414 errCharRefLacksSemicolon(); 3415 if ((returnState & DATA_AND_RCDATA_MASK) == 0) { 3416 cstart = pos; 3417 } 3418 reconsume = true; 3419 state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos); 3420 // FALL THROUGH continue stateloop; 3421 break decimalloop; 3422 } 3423 } 3424 } 3425 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER 3426 case HANDLE_NCR_VALUE: 3427 // WARNING previous state sets reconsume 3428 // We are not going to emit the contents of charRefBuf. 
3429 charRefBufLen = 0; 3430 // XXX inline this case if the method size can take it 3431 handleNcrValue(returnState); 3432 state = transition(state, returnState, reconsume, pos); 3433 continue stateloop; 3434 // XXX reorder point 3435 case HEX_NCR_LOOP: 3436 for (;;) { 3437 if (++pos == endPos) { 3438 break stateloop; 3439 } 3440 c = checkChar(buf, pos); 3441 /* 3442 * Consume as many characters as match the range of 3443 * characters given above. 3444 */ 3445 assert value >= 0: "value must not become negative."; 3446 if (c >= '0' && c <= '9') { 3447 seenDigits = true; 3448 // Avoid overflow 3449 if (value <= 0x10FFFF) { 3450 value *= 16; 3451 value += c - '0'; 3452 } 3453 continue; 3454 } else if (c >= 'A' && c <= 'F') { 3455 seenDigits = true; 3456 // Avoid overflow 3457 if (value <= 0x10FFFF) { 3458 value *= 16; 3459 value += c - 'A' + 10; 3460 } 3461 continue; 3462 } else if (c >= 'a' && c <= 'f') { 3463 seenDigits = true; 3464 // Avoid overflow 3465 if (value <= 0x10FFFF) { 3466 value *= 16; 3467 value += c - 'a' + 10; 3468 } 3469 continue; 3470 } else if (c == ';') { 3471 if (seenDigits) { 3472 if ((returnState & DATA_AND_RCDATA_MASK) == 0) { 3473 cstart = pos + 1; 3474 } 3475 state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos); 3476 continue stateloop; 3477 } else { 3478 errNoDigitsInNCR(); 3479 appendCharRefBuf(';'); 3480 emitOrAppendCharRefBuf(returnState); 3481 if ((returnState & DATA_AND_RCDATA_MASK) == 0) { 3482 cstart = pos + 1; 3483 } 3484 state = transition(state, returnState, reconsume, pos); 3485 continue stateloop; 3486 } 3487 } else { 3488 /* 3489 * If no characters match the range, then don't 3490 * consume any characters (and unconsume the U+0023 3491 * NUMBER SIGN character and, if appropriate, the X 3492 * character). This is a parse error; nothing is 3493 * returned. 3494 * 3495 * Otherwise, if the next character is a U+003B 3496 * SEMICOLON, consume that too. If it isn't, there 3497 * is a parse error. 
3498 */ 3499 if (!seenDigits) { 3500 errNoDigitsInNCR(); 3501 emitOrAppendCharRefBuf(returnState); 3502 if ((returnState & DATA_AND_RCDATA_MASK) == 0) { 3503 cstart = pos; 3504 } 3505 reconsume = true; 3506 state = transition(state, returnState, reconsume, pos); 3507 continue stateloop; 3508 } else { 3509 errCharRefLacksSemicolon(); 3510 if ((returnState & DATA_AND_RCDATA_MASK) == 0) { 3511 cstart = pos; 3512 } 3513 reconsume = true; 3514 state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos); 3515 continue stateloop; 3516 } 3517 } 3518 } 3519 // XXX reorder point 3520 case PLAINTEXT: 3521 plaintextloop: for (;;) { 3522 if (reconsume) { 3523 reconsume = false; 3524 } else { 3525 if (++pos == endPos) { 3526 break stateloop; 3527 } 3528 c = checkChar(buf, pos); 3529 } 3530 switch (c) { 3531 case '\u0000': 3532 emitPlaintextReplacementCharacter(buf, pos); 3533 continue; 3534 case '\r': 3535 emitCarriageReturn(buf, pos); 3536 break stateloop; 3537 case '\n': 3538 silentLineFeed(); 3539 default: 3540 /* 3541 * Anything else Emit the current input 3542 * character as a character token. Stay in the 3543 * RAWTEXT state. 3544 */ 3545 continue; 3546 } 3547 } 3548 // XXX reorder point 3549 case CLOSE_TAG_OPEN: 3550 if (++pos == endPos) { 3551 break stateloop; 3552 } 3553 c = checkChar(buf, pos); 3554 /* 3555 * Otherwise, if the content model flag is set to the PCDATA 3556 * state, or if the next few characters do match that tag 3557 * name, consume the next input character: 3558 */ 3559 switch (c) { 3560 case '>': 3561 /* U+003E GREATER-THAN SIGN (>) Parse error. */ 3562 errLtSlashGt(); 3563 /* 3564 * Switch to the data state. 3565 */ 3566 cstart = pos + 1; 3567 state = transition(state, Tokenizer.DATA, reconsume, pos); 3568 continue stateloop; 3569 case '\r': 3570 silentCarriageReturn(); 3571 /* Anything else Parse error. */ 3572 errGarbageAfterLtSlash(); 3573 /* 3574 * Switch to the bogus comment state. 
3575 */ 3576 clearStrBufBeforeUse(); 3577 appendStrBuf('\n'); 3578 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); 3579 break stateloop; 3580 case '\n': 3581 silentLineFeed(); 3582 /* Anything else Parse error. */ 3583 errGarbageAfterLtSlash(); 3584 /* 3585 * Switch to the bogus comment state. 3586 */ 3587 clearStrBufBeforeUse(); 3588 appendStrBuf(c); 3589 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); 3590 continue stateloop; 3591 case '\u0000': 3592 c = '\uFFFD'; 3593 // fall thru 3594 default: 3595 if (c >= 'A' && c <= 'Z') { 3596 c += 0x20; 3597 } 3598 if (c >= 'a' && c <= 'z') { 3599 /* 3600 * U+0061 LATIN SMALL LETTER A through to U+007A 3601 * LATIN SMALL LETTER Z Create a new end tag 3602 * token, 3603 */ 3604 endTag = true; 3605 /* 3606 * set its tag name to the input character, 3607 */ 3608 clearStrBufBeforeUse(); 3609 appendStrBuf(c); 3610 /* 3611 * then switch to the tag name state. (Don't 3612 * emit the token yet; further details will be 3613 * filled in before it is emitted.) 3614 */ 3615 state = transition(state, Tokenizer.TAG_NAME, reconsume, pos); 3616 continue stateloop; 3617 } else { 3618 /* Anything else Parse error. */ 3619 errGarbageAfterLtSlash(); 3620 /* 3621 * Switch to the bogus comment state. 3622 */ 3623 clearStrBufBeforeUse(); 3624 appendStrBuf(c); 3625 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); 3626 continue stateloop; 3627 } 3628 } 3629 // XXX reorder point 3630 case RCDATA: 3631 rcdataloop: for (;;) { 3632 if (reconsume) { 3633 reconsume = false; 3634 } else { 3635 if (++pos == endPos) { 3636 break stateloop; 3637 } 3638 c = checkChar(buf, pos); 3639 } 3640 switch (c) { 3641 case '&': 3642 /* 3643 * U+0026 AMPERSAND (&) Switch to the character 3644 * reference in RCDATA state. 
3645 */ 3646 flushChars(buf, pos); 3647 assert charRefBufLen == 0: "charRefBufLen not reset after previous use!"; 3648 appendCharRefBuf(c); 3649 setAdditionalAndRememberAmpersandLocation('\u0000'); 3650 returnState = state; 3651 state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos); 3652 continue stateloop; 3653 case '<': 3654 /* 3655 * U+003C LESS-THAN SIGN (<) Switch to the 3656 * RCDATA less-than sign state. 3657 */ 3658 flushChars(buf, pos); 3659 3660 returnState = state; 3661 state = transition(state, Tokenizer.RAWTEXT_RCDATA_LESS_THAN_SIGN, reconsume, pos); 3662 continue stateloop; 3663 case '\u0000': 3664 emitReplacementCharacter(buf, pos); 3665 continue; 3666 case '\r': 3667 emitCarriageReturn(buf, pos); 3668 break stateloop; 3669 case '\n': 3670 silentLineFeed(); 3671 default: 3672 /* 3673 * Emit the current input character as a 3674 * character token. Stay in the RCDATA state. 3675 */ 3676 continue; 3677 } 3678 } 3679 // XXX reorder point 3680 case RAWTEXT: 3681 rawtextloop: for (;;) { 3682 if (reconsume) { 3683 reconsume = false; 3684 } else { 3685 if (++pos == endPos) { 3686 break stateloop; 3687 } 3688 c = checkChar(buf, pos); 3689 } 3690 switch (c) { 3691 case '<': 3692 /* 3693 * U+003C LESS-THAN SIGN (<) Switch to the 3694 * RAWTEXT less-than sign state. 3695 */ 3696 flushChars(buf, pos); 3697 3698 returnState = state; 3699 state = transition(state, Tokenizer.RAWTEXT_RCDATA_LESS_THAN_SIGN, reconsume, pos); 3700 break rawtextloop; 3701 // FALL THRU continue stateloop; 3702 case '\u0000': 3703 emitReplacementCharacter(buf, pos); 3704 continue; 3705 case '\r': 3706 emitCarriageReturn(buf, pos); 3707 break stateloop; 3708 case '\n': 3709 silentLineFeed(); 3710 default: 3711 /* 3712 * Emit the current input character as a 3713 * character token. Stay in the RAWTEXT state. 
3714 */ 3715 continue; 3716 } 3717 } 3718 // XXX fallthru don't reorder 3719 case RAWTEXT_RCDATA_LESS_THAN_SIGN: 3720 rawtextrcdatalessthansignloop: for (;;) { 3721 if (++pos == endPos) { 3722 break stateloop; 3723 } 3724 c = checkChar(buf, pos); 3725 switch (c) { 3726 case '/': 3727 /* 3728 * U+002F SOLIDUS (/) Set the temporary buffer 3729 * to the empty string. Switch to the script 3730 * data end tag open state. 3731 */ 3732 index = 0; 3733 clearStrBufBeforeUse(); 3734 state = transition(state, Tokenizer.NON_DATA_END_TAG_NAME, reconsume, pos); 3735 break rawtextrcdatalessthansignloop; 3736 // FALL THRU continue stateloop; 3737 default: 3738 /* 3739 * Otherwise, emit a U+003C LESS-THAN SIGN 3740 * character token 3741 */ 3742 tokenHandler.characters(Tokenizer.LT_GT, 0, 1); 3743 /* 3744 * and reconsume the current input character in 3745 * the data state. 3746 */ 3747 cstart = pos; 3748 reconsume = true; 3749 state = transition(state, returnState, reconsume, pos); 3750 continue stateloop; 3751 } 3752 } 3753 // XXX fall thru. don't reorder. 3754 case NON_DATA_END_TAG_NAME: 3755 for (;;) { 3756 if (++pos == endPos) { 3757 break stateloop; 3758 } 3759 c = checkChar(buf, pos); 3760 /* 3761 * ASSERT! when entering this state, set index to 0 and 3762 * call clearStrBufBeforeUse() assert (contentModelElement != 3763 * null); Let's implement the above without lookahead. 3764 * strBuf is the 'temporary buffer'. 
3765 */ 3766 if (index < endTagExpectationAsArray.length) { 3767 char e = endTagExpectationAsArray[index]; 3768 char folded = c; 3769 if (c >= 'A' && c <= 'Z') { 3770 folded += 0x20; 3771 } 3772 if (folded != e) { 3773 // [NOCPP[ 3774 errHtml4LtSlashInRcdata(folded); 3775 // ]NOCPP] 3776 tokenHandler.characters(Tokenizer.LT_SOLIDUS, 3777 0, 2); 3778 emitStrBuf(); 3779 cstart = pos; 3780 reconsume = true; 3781 state = transition(state, returnState, reconsume, pos); 3782 continue stateloop; 3783 } 3784 appendStrBuf(c); 3785 index++; 3786 continue; 3787 } else { 3788 endTag = true; 3789 // XXX replace contentModelElement with different 3790 // type 3791 tagName = endTagExpectation; 3792 switch (c) { 3793 case '\r': 3794 silentCarriageReturn(); 3795 clearStrBufAfterUse(); // strBuf not used 3796 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); 3797 break stateloop; 3798 case '\n': 3799 silentLineFeed(); 3800 // fall thru 3801 case ' ': 3802 case '\t': 3803 case '\u000C': 3804 /* 3805 * U+0009 CHARACTER TABULATION U+000A LINE 3806 * FEED (LF) U+000C FORM FEED (FF) U+0020 3807 * SPACE If the current end tag token is an 3808 * appropriate end tag token, then switch to 3809 * the before attribute name state. 3810 */ 3811 clearStrBufAfterUse(); // strBuf not used 3812 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); 3813 continue stateloop; 3814 case '/': 3815 /* 3816 * U+002F SOLIDUS (/) If the current end tag 3817 * token is an appropriate end tag token, 3818 * then switch to the self-closing start tag 3819 * state. 3820 */ 3821 clearStrBufAfterUse(); // strBuf not used 3822 state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos); 3823 continue stateloop; 3824 case '>': 3825 /* 3826 * U+003E GREATER-THAN SIGN (>) If the 3827 * current end tag token is an appropriate 3828 * end tag token, then emit the current tag 3829 * token and switch to the data state. 
3830 */ 3831 clearStrBufAfterUse(); // strBuf not used 3832 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos); 3833 if (shouldSuspend) { 3834 break stateloop; 3835 } 3836 continue stateloop; 3837 default: 3838 /* 3839 * Emit a U+003C LESS-THAN SIGN character 3840 * token, a U+002F SOLIDUS character token, 3841 * a character token for each of the 3842 * characters in the temporary buffer (in 3843 * the order they were added to the buffer), 3844 * and reconsume the current input character 3845 * in the RAWTEXT state. 3846 */ 3847 // [NOCPP[ 3848 errWarnLtSlashInRcdata(); 3849 // ]NOCPP] 3850 tokenHandler.characters( 3851 Tokenizer.LT_SOLIDUS, 0, 2); 3852 emitStrBuf(); 3853 if (c == '\u0000') { 3854 emitReplacementCharacter(buf, pos); 3855 } else { 3856 cstart = pos; // don't drop the 3857 // character 3858 } 3859 state = transition(state, returnState, reconsume, pos); 3860 continue stateloop; 3861 } 3862 } 3863 } 3864 // XXX reorder point 3865 // BEGIN HOTSPOT WORKAROUND 3866 case BOGUS_COMMENT: 3867 boguscommentloop: for (;;) { 3868 if (reconsume) { 3869 reconsume = false; 3870 } else { 3871 if (++pos == endPos) { 3872 break stateloop; 3873 } 3874 c = checkChar(buf, pos); 3875 } 3876 /* 3877 * Consume every character up to and including the first 3878 * U+003E GREATER-THAN SIGN character (>) or the end of 3879 * the file (EOF), whichever comes first. Emit a comment 3880 * token whose data is the concatenation of all the 3881 * characters starting from and including the character 3882 * that caused the state machine to switch into the 3883 * bogus comment state, up to and including the 3884 * character immediately before the last consumed 3885 * character (i.e. up to the character just before the 3886 * U+003E or EOF character). (If the comment was started 3887 * by the end of the file (EOF), the token is empty.) 3888 * 3889 * Switch to the data state. 3890 * 3891 * If the end of the file was reached, reconsume the EOF 3892 * character. 
3893 */ 3894 switch (c) { 3895 case '>': 3896 emitComment(0, pos); 3897 state = transition(state, Tokenizer.DATA, reconsume, pos); 3898 continue stateloop; 3899 case '-': 3900 appendStrBuf(c); 3901 state = transition(state, Tokenizer.BOGUS_COMMENT_HYPHEN, reconsume, pos); 3902 break boguscommentloop; 3903 case '\r': 3904 appendStrBufCarriageReturn(); 3905 break stateloop; 3906 case '\n': 3907 appendStrBufLineFeed(); 3908 continue; 3909 case '\u0000': 3910 c = '\uFFFD'; 3911 // fall thru 3912 default: 3913 appendStrBuf(c); 3914 continue; 3915 } 3916 } 3917 // FALLTHRU DON'T REORDER 3918 case BOGUS_COMMENT_HYPHEN: 3919 boguscommenthyphenloop: for (;;) { 3920 if (++pos == endPos) { 3921 break stateloop; 3922 } 3923 c = checkChar(buf, pos); 3924 switch (c) { 3925 case '>': 3926 // [NOCPP[ 3927 maybeAppendSpaceToBogusComment(); 3928 // ]NOCPP] 3929 emitComment(0, pos); 3930 state = transition(state, Tokenizer.DATA, reconsume, pos); 3931 continue stateloop; 3932 case '-': 3933 appendSecondHyphenToBogusComment(); 3934 continue boguscommenthyphenloop; 3935 case '\r': 3936 appendStrBufCarriageReturn(); 3937 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); 3938 break stateloop; 3939 case '\n': 3940 appendStrBufLineFeed(); 3941 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); 3942 continue stateloop; 3943 case '\u0000': 3944 c = '\uFFFD'; 3945 // fall thru 3946 default: 3947 appendStrBuf(c); 3948 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); 3949 continue stateloop; 3950 } 3951 } 3952 // XXX reorder point 3953 case SCRIPT_DATA: 3954 scriptdataloop: for (;;) { 3955 if (reconsume) { 3956 reconsume = false; 3957 } else { 3958 if (++pos == endPos) { 3959 break stateloop; 3960 } 3961 c = checkChar(buf, pos); 3962 } 3963 switch (c) { 3964 case '<': 3965 /* 3966 * U+003C LESS-THAN SIGN (<) Switch to the 3967 * script data less-than sign state. 
3968 */ 3969 flushChars(buf, pos); 3970 returnState = state; 3971 state = transition(state, Tokenizer.SCRIPT_DATA_LESS_THAN_SIGN, reconsume, pos); 3972 break scriptdataloop; // FALL THRU continue 3973 // stateloop; 3974 case '\u0000': 3975 emitReplacementCharacter(buf, pos); 3976 continue; 3977 case '\r': 3978 emitCarriageReturn(buf, pos); 3979 break stateloop; 3980 case '\n': 3981 silentLineFeed(); 3982 default: 3983 /* 3984 * Anything else Emit the current input 3985 * character as a character token. Stay in the 3986 * script data state. 3987 */ 3988 continue; 3989 } 3990 } 3991 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER 3992 case SCRIPT_DATA_LESS_THAN_SIGN: 3993 scriptdatalessthansignloop: for (;;) { 3994 if (++pos == endPos) { 3995 break stateloop; 3996 } 3997 c = checkChar(buf, pos); 3998 switch (c) { 3999 case '/': 4000 /* 4001 * U+002F SOLIDUS (/) Set the temporary buffer 4002 * to the empty string. Switch to the script 4003 * data end tag open state. 4004 */ 4005 index = 0; 4006 clearStrBufBeforeUse(); 4007 state = transition(state, Tokenizer.NON_DATA_END_TAG_NAME, reconsume, pos); 4008 continue stateloop; 4009 case '!': 4010 tokenHandler.characters(Tokenizer.LT_GT, 0, 1); 4011 cstart = pos; 4012 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPE_START, reconsume, pos); 4013 break scriptdatalessthansignloop; // FALL THRU 4014 // continue 4015 // stateloop; 4016 default: 4017 /* 4018 * Otherwise, emit a U+003C LESS-THAN SIGN 4019 * character token 4020 */ 4021 tokenHandler.characters(Tokenizer.LT_GT, 0, 1); 4022 /* 4023 * and reconsume the current input character in 4024 * the data state. 
4025 */ 4026 cstart = pos; 4027 reconsume = true; 4028 state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos); 4029 continue stateloop; 4030 } 4031 } 4032 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER 4033 case SCRIPT_DATA_ESCAPE_START: 4034 scriptdataescapestartloop: for (;;) { 4035 if (++pos == endPos) { 4036 break stateloop; 4037 } 4038 c = checkChar(buf, pos); 4039 /* 4040 * Consume the next input character: 4041 */ 4042 switch (c) { 4043 case '-': 4044 /* 4045 * U+002D HYPHEN-MINUS (-) Emit a U+002D 4046 * HYPHEN-MINUS character token. Switch to the 4047 * script data escape start dash state. 4048 */ 4049 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPE_START_DASH, reconsume, pos); 4050 break scriptdataescapestartloop; // FALL THRU 4051 // continue 4052 // stateloop; 4053 default: 4054 /* 4055 * Anything else Reconsume the current input 4056 * character in the script data state. 4057 */ 4058 reconsume = true; 4059 state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos); 4060 continue stateloop; 4061 } 4062 } 4063 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER 4064 case SCRIPT_DATA_ESCAPE_START_DASH: 4065 scriptdataescapestartdashloop: for (;;) { 4066 if (++pos == endPos) { 4067 break stateloop; 4068 } 4069 c = checkChar(buf, pos); 4070 /* 4071 * Consume the next input character: 4072 */ 4073 switch (c) { 4074 case '-': 4075 /* 4076 * U+002D HYPHEN-MINUS (-) Emit a U+002D 4077 * HYPHEN-MINUS character token. Switch to the 4078 * script data escaped dash dash state. 4079 */ 4080 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_DASH_DASH, reconsume, pos); 4081 break scriptdataescapestartdashloop; 4082 // continue stateloop; 4083 default: 4084 /* 4085 * Anything else Reconsume the current input 4086 * character in the script data state. 
4087 */ 4088 reconsume = true; 4089 state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos); 4090 continue stateloop; 4091 } 4092 } 4093 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER 4094 case SCRIPT_DATA_ESCAPED_DASH_DASH: 4095 scriptdataescapeddashdashloop: for (;;) { 4096 if (++pos == endPos) { 4097 break stateloop; 4098 } 4099 c = checkChar(buf, pos); 4100 /* 4101 * Consume the next input character: 4102 */ 4103 switch (c) { 4104 case '-': 4105 /* 4106 * U+002D HYPHEN-MINUS (-) Emit a U+002D 4107 * HYPHEN-MINUS character token. Stay in the 4108 * script data escaped dash dash state. 4109 */ 4110 continue; 4111 case '<': 4112 /* 4113 * U+003C LESS-THAN SIGN (<) Switch to the 4114 * script data escaped less-than sign state. 4115 */ 4116 flushChars(buf, pos); 4117 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN, reconsume, pos); 4118 continue stateloop; 4119 case '>': 4120 /* 4121 * U+003E GREATER-THAN SIGN (>) Emit a U+003E 4122 * GREATER-THAN SIGN character token. Switch to 4123 * the script data state. 4124 */ 4125 state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos); 4126 continue stateloop; 4127 case '\u0000': 4128 emitReplacementCharacter(buf, pos); 4129 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); 4130 break scriptdataescapeddashdashloop; 4131 case '\r': 4132 emitCarriageReturn(buf, pos); 4133 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); 4134 break stateloop; 4135 case '\n': 4136 silentLineFeed(); 4137 default: 4138 /* 4139 * Anything else Emit the current input 4140 * character as a character token. Switch to the 4141 * script data escaped state. 
4142 */ 4143 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); 4144 break scriptdataescapeddashdashloop; 4145 // continue stateloop; 4146 } 4147 } 4148 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER 4149 case SCRIPT_DATA_ESCAPED: 4150 scriptdataescapedloop: for (;;) { 4151 if (reconsume) { 4152 reconsume = false; 4153 } else { 4154 if (++pos == endPos) { 4155 break stateloop; 4156 } 4157 c = checkChar(buf, pos); 4158 } 4159 /* 4160 * Consume the next input character: 4161 */ 4162 switch (c) { 4163 case '-': 4164 /* 4165 * U+002D HYPHEN-MINUS (-) Emit a U+002D 4166 * HYPHEN-MINUS character token. Switch to the 4167 * script data escaped dash state. 4168 */ 4169 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_DASH, reconsume, pos); 4170 break scriptdataescapedloop; // FALL THRU 4171 // continue 4172 // stateloop; 4173 case '<': 4174 /* 4175 * U+003C LESS-THAN SIGN (<) Switch to the 4176 * script data escaped less-than sign state. 4177 */ 4178 flushChars(buf, pos); 4179 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN, reconsume, pos); 4180 continue stateloop; 4181 case '\u0000': 4182 emitReplacementCharacter(buf, pos); 4183 continue; 4184 case '\r': 4185 emitCarriageReturn(buf, pos); 4186 break stateloop; 4187 case '\n': 4188 silentLineFeed(); 4189 default: 4190 /* 4191 * Anything else Emit the current input 4192 * character as a character token. Stay in the 4193 * script data escaped state. 4194 */ 4195 continue; 4196 } 4197 } 4198 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER 4199 case SCRIPT_DATA_ESCAPED_DASH: 4200 scriptdataescapeddashloop: for (;;) { 4201 if (++pos == endPos) { 4202 break stateloop; 4203 } 4204 c = checkChar(buf, pos); 4205 /* 4206 * Consume the next input character: 4207 */ 4208 switch (c) { 4209 case '-': 4210 /* 4211 * U+002D HYPHEN-MINUS (-) Emit a U+002D 4212 * HYPHEN-MINUS character token. Switch to the 4213 * script data escaped dash dash state. 
4214 */ 4215 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_DASH_DASH, reconsume, pos); 4216 continue stateloop; 4217 case '<': 4218 /* 4219 * U+003C LESS-THAN SIGN (<) Switch to the 4220 * script data escaped less-than sign state. 4221 */ 4222 flushChars(buf, pos); 4223 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN, reconsume, pos); 4224 break scriptdataescapeddashloop; 4225 // continue stateloop; 4226 case '\u0000': 4227 emitReplacementCharacter(buf, pos); 4228 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); 4229 continue stateloop; 4230 case '\r': 4231 emitCarriageReturn(buf, pos); 4232 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); 4233 break stateloop; 4234 case '\n': 4235 silentLineFeed(); 4236 default: 4237 /* 4238 * Anything else Emit the current input 4239 * character as a character token. Switch to the 4240 * script data escaped state. 4241 */ 4242 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); 4243 continue stateloop; 4244 } 4245 } 4246 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER 4247 case SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN: 4248 scriptdataescapedlessthanloop: for (;;) { 4249 if (++pos == endPos) { 4250 break stateloop; 4251 } 4252 c = checkChar(buf, pos); 4253 /* 4254 * Consume the next input character: 4255 */ 4256 switch (c) { 4257 case '/': 4258 /* 4259 * U+002F SOLIDUS (/) Set the temporary buffer 4260 * to the empty string. Switch to the script 4261 * data escaped end tag open state. 4262 */ 4263 index = 0; 4264 clearStrBufBeforeUse(); 4265 returnState = Tokenizer.SCRIPT_DATA_ESCAPED; 4266 state = transition(state, Tokenizer.NON_DATA_END_TAG_NAME, reconsume, pos); 4267 continue stateloop; 4268 case 'S': 4269 case 's': 4270 /* 4271 * U+0041 LATIN CAPITAL LETTER A through to 4272 * U+005A LATIN CAPITAL LETTER Z Emit a U+003C 4273 * LESS-THAN SIGN character token and the 4274 * current input character as a character token. 
4275 */ 4276 tokenHandler.characters(Tokenizer.LT_GT, 0, 1); 4277 cstart = pos; 4278 index = 1; 4279 /* 4280 * Set the temporary buffer to the empty string. 4281 * Append the lowercase version of the current 4282 * input character (add 0x0020 to the 4283 * character's code point) to the temporary 4284 * buffer. Switch to the script data double 4285 * escape start state. 4286 */ 4287 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPE_START, reconsume, pos); 4288 break scriptdataescapedlessthanloop; 4289 // continue stateloop; 4290 default: 4291 /* 4292 * Anything else Emit a U+003C LESS-THAN SIGN 4293 * character token and reconsume the current 4294 * input character in the script data escaped 4295 * state. 4296 */ 4297 tokenHandler.characters(Tokenizer.LT_GT, 0, 1); 4298 cstart = pos; 4299 reconsume = true; 4300 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); 4301 continue stateloop; 4302 } 4303 } 4304 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER 4305 case SCRIPT_DATA_DOUBLE_ESCAPE_START: 4306 scriptdatadoubleescapestartloop: for (;;) { 4307 if (++pos == endPos) { 4308 break stateloop; 4309 } 4310 c = checkChar(buf, pos); 4311 assert index > 0; 4312 if (index < 6) { // SCRIPT_ARR.length 4313 char folded = c; 4314 if (c >= 'A' && c <= 'Z') { 4315 folded += 0x20; 4316 } 4317 if (folded != Tokenizer.SCRIPT_ARR[index]) { 4318 reconsume = true; 4319 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); 4320 continue stateloop; 4321 } 4322 index++; 4323 continue; 4324 } 4325 switch (c) { 4326 case '\r': 4327 emitCarriageReturn(buf, pos); 4328 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); 4329 break stateloop; 4330 case '\n': 4331 silentLineFeed(); 4332 case ' ': 4333 case '\t': 4334 case '\u000C': 4335 case '/': 4336 case '>': 4337 /* 4338 * U+0009 CHARACTER TABULATION U+000A LINE FEED 4339 * (LF) U+000C FORM FEED (FF) U+0020 SPACE 4340 * U+002F SOLIDUS (/) U+003E 
GREATER-THAN SIGN 4341 * (>) Emit the current input character as a 4342 * character token. If the temporary buffer is 4343 * the string "script", then switch to the 4344 * script data double escaped state. 4345 */ 4346 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); 4347 break scriptdatadoubleescapestartloop; 4348 // continue stateloop; 4349 default: 4350 /* 4351 * Anything else Reconsume the current input 4352 * character in the script data escaped state. 4353 */ 4354 reconsume = true; 4355 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); 4356 continue stateloop; 4357 } 4358 } 4359 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER 4360 case SCRIPT_DATA_DOUBLE_ESCAPED: 4361 scriptdatadoubleescapedloop: for (;;) { 4362 if (reconsume) { 4363 reconsume = false; 4364 } else { 4365 if (++pos == endPos) { 4366 break stateloop; 4367 } 4368 c = checkChar(buf, pos); 4369 } 4370 /* 4371 * Consume the next input character: 4372 */ 4373 switch (c) { 4374 case '-': 4375 /* 4376 * U+002D HYPHEN-MINUS (-) Emit a U+002D 4377 * HYPHEN-MINUS character token. Switch to the 4378 * script data double escaped dash state. 4379 */ 4380 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_DASH, reconsume, pos); 4381 break scriptdatadoubleescapedloop; // FALL THRU 4382 // continue 4383 // stateloop; 4384 case '<': 4385 /* 4386 * U+003C LESS-THAN SIGN (<) Emit a U+003C 4387 * LESS-THAN SIGN character token. Switch to the 4388 * script data double escaped less-than sign 4389 * state. 4390 */ 4391 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN, reconsume, pos); 4392 continue stateloop; 4393 case '\u0000': 4394 emitReplacementCharacter(buf, pos); 4395 continue; 4396 case '\r': 4397 emitCarriageReturn(buf, pos); 4398 break stateloop; 4399 case '\n': 4400 silentLineFeed(); 4401 default: 4402 /* 4403 * Anything else Emit the current input 4404 * character as a character token. 
Stay in the 4405 * script data double escaped state. 4406 */ 4407 continue; 4408 } 4409 } 4410 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER 4411 case SCRIPT_DATA_DOUBLE_ESCAPED_DASH: 4412 scriptdatadoubleescapeddashloop: for (;;) { 4413 if (++pos == endPos) { 4414 break stateloop; 4415 } 4416 c = checkChar(buf, pos); 4417 /* 4418 * Consume the next input character: 4419 */ 4420 switch (c) { 4421 case '-': 4422 /* 4423 * U+002D HYPHEN-MINUS (-) Emit a U+002D 4424 * HYPHEN-MINUS character token. Switch to the 4425 * script data double escaped dash dash state. 4426 */ 4427 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH, reconsume, pos); 4428 break scriptdatadoubleescapeddashloop; 4429 // continue stateloop; 4430 case '<': 4431 /* 4432 * U+003C LESS-THAN SIGN (<) Emit a U+003C 4433 * LESS-THAN SIGN character token. Switch to the 4434 * script data double escaped less-than sign 4435 * state. 4436 */ 4437 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN, reconsume, pos); 4438 continue stateloop; 4439 case '\u0000': 4440 emitReplacementCharacter(buf, pos); 4441 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); 4442 continue stateloop; 4443 case '\r': 4444 emitCarriageReturn(buf, pos); 4445 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); 4446 break stateloop; 4447 case '\n': 4448 silentLineFeed(); 4449 default: 4450 /* 4451 * Anything else Emit the current input 4452 * character as a character token. Switch to the 4453 * script data double escaped state. 
4454 */ 4455 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); 4456 continue stateloop; 4457 } 4458 } 4459 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER 4460 case SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH: 4461 scriptdatadoubleescapeddashdashloop: for (;;) { 4462 if (++pos == endPos) { 4463 break stateloop; 4464 } 4465 c = checkChar(buf, pos); 4466 /* 4467 * Consume the next input character: 4468 */ 4469 switch (c) { 4470 case '-': 4471 /* 4472 * U+002D HYPHEN-MINUS (-) Emit a U+002D 4473 * HYPHEN-MINUS character token. Stay in the 4474 * script data double escaped dash dash state. 4475 */ 4476 continue; 4477 case '<': 4478 /* 4479 * U+003C LESS-THAN SIGN (<) Emit a U+003C 4480 * LESS-THAN SIGN character token. Switch to the 4481 * script data double escaped less-than sign 4482 * state. 4483 */ 4484 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN, reconsume, pos); 4485 break scriptdatadoubleescapeddashdashloop; 4486 case '>': 4487 /* 4488 * U+003E GREATER-THAN SIGN (>) Emit a U+003E 4489 * GREATER-THAN SIGN character token. Switch to 4490 * the script data state. 4491 */ 4492 state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos); 4493 continue stateloop; 4494 case '\u0000': 4495 emitReplacementCharacter(buf, pos); 4496 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); 4497 continue stateloop; 4498 case '\r': 4499 emitCarriageReturn(buf, pos); 4500 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); 4501 break stateloop; 4502 case '\n': 4503 silentLineFeed(); 4504 default: 4505 /* 4506 * Anything else Emit the current input 4507 * character as a character token. Switch to the 4508 * script data double escaped state. 
4509 */ 4510 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); 4511 continue stateloop; 4512 } 4513 } 4514 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER 4515 case SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN: 4516 scriptdatadoubleescapedlessthanloop: for (;;) { 4517 if (++pos == endPos) { 4518 break stateloop; 4519 } 4520 c = checkChar(buf, pos); 4521 /* 4522 * Consume the next input character: 4523 */ 4524 switch (c) { 4525 case '/': 4526 /* 4527 * U+002F SOLIDUS (/) Emit a U+002F SOLIDUS 4528 * character token. Set the temporary buffer to 4529 * the empty string. Switch to the script data 4530 * double escape end state. 4531 */ 4532 index = 0; 4533 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPE_END, reconsume, pos); 4534 break scriptdatadoubleescapedlessthanloop; 4535 default: 4536 /* 4537 * Anything else Reconsume the current input 4538 * character in the script data double escaped 4539 * state. 4540 */ 4541 reconsume = true; 4542 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); 4543 continue stateloop; 4544 } 4545 } 4546 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER 4547 case SCRIPT_DATA_DOUBLE_ESCAPE_END: 4548 scriptdatadoubleescapeendloop: for (;;) { 4549 if (++pos == endPos) { 4550 break stateloop; 4551 } 4552 c = checkChar(buf, pos); 4553 if (index < 6) { // SCRIPT_ARR.length 4554 char folded = c; 4555 if (c >= 'A' && c <= 'Z') { 4556 folded += 0x20; 4557 } 4558 if (folded != Tokenizer.SCRIPT_ARR[index]) { 4559 reconsume = true; 4560 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); 4561 continue stateloop; 4562 } 4563 index++; 4564 continue; 4565 } 4566 switch (c) { 4567 case '\r': 4568 emitCarriageReturn(buf, pos); 4569 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); 4570 break stateloop; 4571 case '\n': 4572 silentLineFeed(); 4573 case ' ': 4574 case '\t': 4575 case '\u000C': 4576 case '/': 4577 case '>': 4578 /* 
4579 * U+0009 CHARACTER TABULATION U+000A LINE FEED 4580 * (LF) U+000C FORM FEED (FF) U+0020 SPACE 4581 * U+002F SOLIDUS (/) U+003E GREATER-THAN SIGN 4582 * (>) Emit the current input character as a 4583 * character token. If the temporary buffer is 4584 * the string "script", then switch to the 4585 * script data escaped state. 4586 */ 4587 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); 4588 continue stateloop; 4589 default: 4590 /* 4591 * Reconsume the current input character in the 4592 * script data double escaped state. 4593 */ 4594 reconsume = true; 4595 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); 4596 continue stateloop; 4597 } 4598 } 4599 // XXX reorder point 4600 case MARKUP_DECLARATION_OCTYPE: 4601 markupdeclarationdoctypeloop: for (;;) { 4602 if (++pos == endPos) { 4603 break stateloop; 4604 } 4605 c = checkChar(buf, pos); 4606 if (index < 6) { // OCTYPE.length 4607 char folded = c; 4608 if (c >= 'A' && c <= 'Z') { 4609 folded += 0x20; 4610 } 4611 if (folded == Tokenizer.OCTYPE[index]) { 4612 appendStrBuf(c); 4613 } else { 4614 errBogusComment(); 4615 reconsume = true; 4616 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); 4617 continue stateloop; 4618 } 4619 index++; 4620 continue; 4621 } else { 4622 reconsume = true; 4623 state = transition(state, Tokenizer.DOCTYPE, reconsume, pos); 4624 break markupdeclarationdoctypeloop; 4625 // continue stateloop; 4626 } 4627 } 4628 // FALLTHRU DON'T REORDER 4629 case DOCTYPE: 4630 doctypeloop: for (;;) { 4631 if (reconsume) { 4632 reconsume = false; 4633 } else { 4634 if (++pos == endPos) { 4635 break stateloop; 4636 } 4637 c = checkChar(buf, pos); 4638 } 4639 initDoctypeFields(); 4640 /* 4641 * Consume the next input character: 4642 */ 4643 switch (c) { 4644 case '\r': 4645 silentCarriageReturn(); 4646 state = transition(state, Tokenizer.BEFORE_DOCTYPE_NAME, reconsume, pos); 4647 break stateloop; 4648 case '\n': 4649 
silentLineFeed(); 4650 // fall thru 4651 case ' ': 4652 case '\t': 4653 case '\u000C': 4654 /* 4655 * U+0009 CHARACTER TABULATION U+000A LINE FEED 4656 * (LF) U+000C FORM FEED (FF) U+0020 SPACE 4657 * Switch to the before DOCTYPE name state. 4658 */ 4659 state = transition(state, Tokenizer.BEFORE_DOCTYPE_NAME, reconsume, pos); 4660 break doctypeloop; 4661 // continue stateloop; 4662 default: 4663 /* 4664 * Anything else Parse error. 4665 */ 4666 errMissingSpaceBeforeDoctypeName(); 4667 /* 4668 * Reconsume the current character in the before 4669 * DOCTYPE name state. 4670 */ 4671 reconsume = true; 4672 state = transition(state, Tokenizer.BEFORE_DOCTYPE_NAME, reconsume, pos); 4673 break doctypeloop; 4674 // continue stateloop; 4675 } 4676 } 4677 // FALLTHRU DON'T REORDER 4678 case BEFORE_DOCTYPE_NAME: 4679 beforedoctypenameloop: for (;;) { 4680 if (reconsume) { 4681 reconsume = false; 4682 } else { 4683 if (++pos == endPos) { 4684 break stateloop; 4685 } 4686 c = checkChar(buf, pos); 4687 } 4688 /* 4689 * Consume the next input character: 4690 */ 4691 switch (c) { 4692 case '\r': 4693 silentCarriageReturn(); 4694 break stateloop; 4695 case '\n': 4696 silentLineFeed(); 4697 // fall thru 4698 case ' ': 4699 case '\t': 4700 case '\u000C': 4701 /* 4702 * U+0009 CHARACTER TABULATION U+000A LINE FEED 4703 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay 4704 * in the before DOCTYPE name state. 4705 */ 4706 continue; 4707 case '>': 4708 /* 4709 * U+003E GREATER-THAN SIGN (>) Parse error. 4710 */ 4711 errNamelessDoctype(); 4712 /* 4713 * Create a new DOCTYPE token. Set its 4714 * force-quirks flag to on. 4715 */ 4716 forceQuirks = true; 4717 /* 4718 * Emit the token. 4719 */ 4720 emitDoctypeToken(pos); 4721 /* 4722 * Switch to the data state. 
4723 */ 4724 state = transition(state, Tokenizer.DATA, reconsume, pos); 4725 continue stateloop; 4726 case '\u0000': 4727 c = '\uFFFD'; 4728 // fall thru 4729 default: 4730 if (c >= 'A' && c <= 'Z') { 4731 /* 4732 * U+0041 LATIN CAPITAL LETTER A through to 4733 * U+005A LATIN CAPITAL LETTER Z Create a 4734 * new DOCTYPE token. Set the token's name 4735 * to the lowercase version of the input 4736 * character (add 0x0020 to the character's 4737 * code point). 4738 */ 4739 c += 0x20; 4740 } 4741 /* Anything else Create a new DOCTYPE token. */ 4742 /* 4743 * Set the token's name name to the current 4744 * input character. 4745 */ 4746 clearStrBufBeforeUse(); 4747 appendStrBuf(c); 4748 /* 4749 * Switch to the DOCTYPE name state. 4750 */ 4751 state = transition(state, Tokenizer.DOCTYPE_NAME, reconsume, pos); 4752 break beforedoctypenameloop; 4753 // continue stateloop; 4754 } 4755 } 4756 // FALLTHRU DON'T REORDER 4757 case DOCTYPE_NAME: 4758 doctypenameloop: for (;;) { 4759 if (++pos == endPos) { 4760 break stateloop; 4761 } 4762 c = checkChar(buf, pos); 4763 /* 4764 * Consume the next input character: 4765 */ 4766 switch (c) { 4767 case '\r': 4768 silentCarriageReturn(); 4769 strBufToDoctypeName(); 4770 state = transition(state, Tokenizer.AFTER_DOCTYPE_NAME, reconsume, pos); 4771 break stateloop; 4772 case '\n': 4773 silentLineFeed(); 4774 // fall thru 4775 case ' ': 4776 case '\t': 4777 case '\u000C': 4778 /* 4779 * U+0009 CHARACTER TABULATION U+000A LINE FEED 4780 * (LF) U+000C FORM FEED (FF) U+0020 SPACE 4781 * Switch to the after DOCTYPE name state. 4782 */ 4783 strBufToDoctypeName(); 4784 state = transition(state, Tokenizer.AFTER_DOCTYPE_NAME, reconsume, pos); 4785 break doctypenameloop; 4786 // continue stateloop; 4787 case '>': 4788 /* 4789 * U+003E GREATER-THAN SIGN (>) Emit the current 4790 * DOCTYPE token. 4791 */ 4792 strBufToDoctypeName(); 4793 emitDoctypeToken(pos); 4794 /* 4795 * Switch to the data state. 
4796 */ 4797 state = transition(state, Tokenizer.DATA, reconsume, pos); 4798 continue stateloop; 4799 case '\u0000': 4800 c = '\uFFFD'; 4801 // fall thru 4802 default: 4803 /* 4804 * U+0041 LATIN CAPITAL LETTER A through to 4805 * U+005A LATIN CAPITAL LETTER Z Append the 4806 * lowercase version of the input character (add 4807 * 0x0020 to the character's code point) to the 4808 * current DOCTYPE token's name. 4809 */ 4810 if (c >= 'A' && c <= 'Z') { 4811 c += 0x0020; 4812 } 4813 /* 4814 * Anything else Append the current input 4815 * character to the current DOCTYPE token's 4816 * name. 4817 */ 4818 appendStrBuf(c); 4819 /* 4820 * Stay in the DOCTYPE name state. 4821 */ 4822 continue; 4823 } 4824 } 4825 // FALLTHRU DON'T REORDER 4826 case AFTER_DOCTYPE_NAME: 4827 afterdoctypenameloop: for (;;) { 4828 if (++pos == endPos) { 4829 break stateloop; 4830 } 4831 c = checkChar(buf, pos); 4832 /* 4833 * Consume the next input character: 4834 */ 4835 switch (c) { 4836 case '\r': 4837 silentCarriageReturn(); 4838 break stateloop; 4839 case '\n': 4840 silentLineFeed(); 4841 // fall thru 4842 case ' ': 4843 case '\t': 4844 case '\u000C': 4845 /* 4846 * U+0009 CHARACTER TABULATION U+000A LINE FEED 4847 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay 4848 * in the after DOCTYPE name state. 4849 */ 4850 continue; 4851 case '>': 4852 /* 4853 * U+003E GREATER-THAN SIGN (>) Emit the current 4854 * DOCTYPE token. 4855 */ 4856 emitDoctypeToken(pos); 4857 /* 4858 * Switch to the data state. 4859 */ 4860 state = transition(state, Tokenizer.DATA, reconsume, pos); 4861 continue stateloop; 4862 case 'p': 4863 case 'P': 4864 index = 0; 4865 state = transition(state, Tokenizer.DOCTYPE_UBLIC, reconsume, pos); 4866 break afterdoctypenameloop; 4867 // continue stateloop; 4868 case 's': 4869 case 'S': 4870 index = 0; 4871 state = transition(state, Tokenizer.DOCTYPE_YSTEM, reconsume, pos); 4872 continue stateloop; 4873 default: 4874 /* 4875 * Otherwise, this is the parse error. 
4876 */ 4877 bogusDoctype(); 4878 4879 /* 4880 * Set the DOCTYPE token's force-quirks flag to 4881 * on. 4882 */ 4883 // done by bogusDoctype(); 4884 /* 4885 * Switch to the bogus DOCTYPE state. 4886 */ 4887 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); 4888 continue stateloop; 4889 } 4890 } 4891 // FALLTHRU DON'T REORDER 4892 case DOCTYPE_UBLIC: 4893 doctypeublicloop: for (;;) { 4894 if (++pos == endPos) { 4895 break stateloop; 4896 } 4897 c = checkChar(buf, pos); 4898 /* 4899 * If the six characters starting from the current input 4900 * character are an ASCII case-insensitive match for the 4901 * word "PUBLIC", then consume those characters and 4902 * switch to the before DOCTYPE public identifier state. 4903 */ 4904 if (index < 5) { // UBLIC.length 4905 char folded = c; 4906 if (c >= 'A' && c <= 'Z') { 4907 folded += 0x20; 4908 } 4909 if (folded != Tokenizer.UBLIC[index]) { 4910 bogusDoctype(); 4911 // forceQuirks = true; 4912 reconsume = true; 4913 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); 4914 continue stateloop; 4915 } 4916 index++; 4917 continue; 4918 } else { 4919 reconsume = true; 4920 state = transition(state, Tokenizer.AFTER_DOCTYPE_PUBLIC_KEYWORD, reconsume, pos); 4921 break doctypeublicloop; 4922 // continue stateloop; 4923 } 4924 } 4925 // FALLTHRU DON'T REORDER 4926 case AFTER_DOCTYPE_PUBLIC_KEYWORD: 4927 afterdoctypepublickeywordloop: for (;;) { 4928 if (reconsume) { 4929 reconsume = false; 4930 } else { 4931 if (++pos == endPos) { 4932 break stateloop; 4933 } 4934 c = checkChar(buf, pos); 4935 } 4936 /* 4937 * Consume the next input character: 4938 */ 4939 switch (c) { 4940 case '\r': 4941 silentCarriageReturn(); 4942 state = transition(state, Tokenizer.BEFORE_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos); 4943 break stateloop; 4944 case '\n': 4945 silentLineFeed(); 4946 // fall thru 4947 case ' ': 4948 case '\t': 4949 case '\u000C': 4950 /* 4951 * U+0009 CHARACTER TABULATION U+000A LINE FEED 4952 * 
(LF) U+000C FORM FEED (FF) U+0020 SPACE 4953 * Switch to the before DOCTYPE public 4954 * identifier state. 4955 */ 4956 state = transition(state, Tokenizer.BEFORE_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos); 4957 break afterdoctypepublickeywordloop; 4958 // FALL THROUGH continue stateloop 4959 case '"': 4960 /* 4961 * U+0022 QUOTATION MARK (") Parse Error. 4962 */ 4963 errNoSpaceBetweenDoctypePublicKeywordAndQuote(); 4964 /* 4965 * Set the DOCTYPE token's public identifier to 4966 * the empty string (not missing), 4967 */ 4968 clearStrBufBeforeUse(); 4969 /* 4970 * then switch to the DOCTYPE public identifier 4971 * (double-quoted) state. 4972 */ 4973 state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos); 4974 continue stateloop; 4975 case '\'': 4976 /* 4977 * U+0027 APOSTROPHE (') Parse Error. 4978 */ 4979 errNoSpaceBetweenDoctypePublicKeywordAndQuote(); 4980 /* 4981 * Set the DOCTYPE token's public identifier to 4982 * the empty string (not missing), 4983 */ 4984 clearStrBufBeforeUse(); 4985 /* 4986 * then switch to the DOCTYPE public identifier 4987 * (single-quoted) state. 4988 */ 4989 state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED, reconsume, pos); 4990 continue stateloop; 4991 case '>': 4992 /* U+003E GREATER-THAN SIGN (>) Parse error. */ 4993 errExpectedPublicId(); 4994 /* 4995 * Set the DOCTYPE token's force-quirks flag to 4996 * on. 4997 */ 4998 forceQuirks = true; 4999 /* 5000 * Emit that DOCTYPE token. 5001 */ 5002 emitDoctypeToken(pos); 5003 /* 5004 * Switch to the data state. 5005 */ 5006 state = transition(state, Tokenizer.DATA, reconsume, pos); 5007 continue stateloop; 5008 default: 5009 bogusDoctype(); 5010 /* 5011 * Set the DOCTYPE token's force-quirks flag to 5012 * on. 5013 */ 5014 // done by bogusDoctype(); 5015 /* 5016 * Switch to the bogus DOCTYPE state. 
5017 */ 5018 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); 5019 continue stateloop; 5020 } 5021 } 5022 // FALLTHRU DON'T REORDER 5023 case BEFORE_DOCTYPE_PUBLIC_IDENTIFIER: 5024 beforedoctypepublicidentifierloop: for (;;) { 5025 if (++pos == endPos) { 5026 break stateloop; 5027 } 5028 c = checkChar(buf, pos); 5029 /* 5030 * Consume the next input character: 5031 */ 5032 switch (c) { 5033 case '\r': 5034 silentCarriageReturn(); 5035 break stateloop; 5036 case '\n': 5037 silentLineFeed(); 5038 // fall thru 5039 case ' ': 5040 case '\t': 5041 case '\u000C': 5042 /* 5043 * U+0009 CHARACTER TABULATION U+000A LINE FEED 5044 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay 5045 * in the before DOCTYPE public identifier 5046 * state. 5047 */ 5048 continue; 5049 case '"': 5050 /* 5051 * U+0022 QUOTATION MARK (") Set the DOCTYPE 5052 * token's public identifier to the empty string 5053 * (not missing), 5054 */ 5055 clearStrBufBeforeUse(); 5056 /* 5057 * then switch to the DOCTYPE public identifier 5058 * (double-quoted) state. 5059 */ 5060 state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos); 5061 break beforedoctypepublicidentifierloop; 5062 // continue stateloop; 5063 case '\'': 5064 /* 5065 * U+0027 APOSTROPHE (') Set the DOCTYPE token's 5066 * public identifier to the empty string (not 5067 * missing), 5068 */ 5069 clearStrBufBeforeUse(); 5070 /* 5071 * then switch to the DOCTYPE public identifier 5072 * (single-quoted) state. 5073 */ 5074 state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED, reconsume, pos); 5075 continue stateloop; 5076 case '>': 5077 /* U+003E GREATER-THAN SIGN (>) Parse error. */ 5078 errExpectedPublicId(); 5079 /* 5080 * Set the DOCTYPE token's force-quirks flag to 5081 * on. 5082 */ 5083 forceQuirks = true; 5084 /* 5085 * Emit that DOCTYPE token. 5086 */ 5087 emitDoctypeToken(pos); 5088 /* 5089 * Switch to the data state. 
5090 */ 5091 state = transition(state, Tokenizer.DATA, reconsume, pos); 5092 continue stateloop; 5093 default: 5094 bogusDoctype(); 5095 /* 5096 * Set the DOCTYPE token's force-quirks flag to 5097 * on. 5098 */ 5099 // done by bogusDoctype(); 5100 /* 5101 * Switch to the bogus DOCTYPE state. 5102 */ 5103 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); 5104 continue stateloop; 5105 } 5106 } 5107 // FALLTHRU DON'T REORDER 5108 case DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED: 5109 doctypepublicidentifierdoublequotedloop: for (;;) { 5110 if (++pos == endPos) { 5111 break stateloop; 5112 } 5113 c = checkChar(buf, pos); 5114 /* 5115 * Consume the next input character: 5116 */ 5117 switch (c) { 5118 case '"': 5119 /* 5120 * U+0022 QUOTATION MARK (") Switch to the after 5121 * DOCTYPE public identifier state. 5122 */ 5123 publicIdentifier = strBufToString(); 5124 state = transition(state, Tokenizer.AFTER_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos); 5125 break doctypepublicidentifierdoublequotedloop; 5126 // continue stateloop; 5127 case '>': 5128 /* 5129 * U+003E GREATER-THAN SIGN (>) Parse error. 5130 */ 5131 errGtInPublicId(); 5132 /* 5133 * Set the DOCTYPE token's force-quirks flag to 5134 * on. 5135 */ 5136 forceQuirks = true; 5137 /* 5138 * Emit that DOCTYPE token. 5139 */ 5140 publicIdentifier = strBufToString(); 5141 emitDoctypeToken(pos); 5142 /* 5143 * Switch to the data state. 5144 */ 5145 state = transition(state, Tokenizer.DATA, reconsume, pos); 5146 continue stateloop; 5147 case '\r': 5148 appendStrBufCarriageReturn(); 5149 break stateloop; 5150 case '\n': 5151 appendStrBufLineFeed(); 5152 continue; 5153 case '\u0000': 5154 c = '\uFFFD'; 5155 // fall thru 5156 default: 5157 /* 5158 * Anything else Append the current input 5159 * character to the current DOCTYPE token's 5160 * public identifier. 5161 */ 5162 appendStrBuf(c); 5163 /* 5164 * Stay in the DOCTYPE public identifier 5165 * (double-quoted) state. 
5166 */ 5167 continue; 5168 } 5169 } 5170 // FALLTHRU DON'T REORDER 5171 case AFTER_DOCTYPE_PUBLIC_IDENTIFIER: 5172 afterdoctypepublicidentifierloop: for (;;) { 5173 if (++pos == endPos) { 5174 break stateloop; 5175 } 5176 c = checkChar(buf, pos); 5177 /* 5178 * Consume the next input character: 5179 */ 5180 switch (c) { 5181 case '\r': 5182 silentCarriageReturn(); 5183 state = transition(state, Tokenizer.BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS, reconsume, pos); 5184 break stateloop; 5185 case '\n': 5186 silentLineFeed(); 5187 // fall thru 5188 case ' ': 5189 case '\t': 5190 case '\u000C': 5191 /* 5192 * U+0009 CHARACTER TABULATION U+000A LINE FEED 5193 * (LF) U+000C FORM FEED (FF) U+0020 SPACE 5194 * Switch to the between DOCTYPE public and 5195 * system identifiers state. 5196 */ 5197 state = transition(state, Tokenizer.BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS, reconsume, pos); 5198 break afterdoctypepublicidentifierloop; 5199 // continue stateloop; 5200 case '>': 5201 /* 5202 * U+003E GREATER-THAN SIGN (>) Emit the current 5203 * DOCTYPE token. 5204 */ 5205 emitDoctypeToken(pos); 5206 /* 5207 * Switch to the data state. 5208 */ 5209 state = transition(state, Tokenizer.DATA, reconsume, pos); 5210 continue stateloop; 5211 case '"': 5212 /* 5213 * U+0022 QUOTATION MARK (") Parse error. 5214 */ 5215 errNoSpaceBetweenPublicAndSystemIds(); 5216 /* 5217 * Set the DOCTYPE token's system identifier to 5218 * the empty string (not missing), 5219 */ 5220 clearStrBufBeforeUse(); 5221 /* 5222 * then switch to the DOCTYPE system identifier 5223 * (double-quoted) state. 5224 */ 5225 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos); 5226 continue stateloop; 5227 case '\'': 5228 /* 5229 * U+0027 APOSTROPHE (') Parse error. 
5230 */ 5231 errNoSpaceBetweenPublicAndSystemIds(); 5232 /* 5233 * Set the DOCTYPE token's system identifier to 5234 * the empty string (not missing), 5235 */ 5236 clearStrBufBeforeUse(); 5237 /* 5238 * then switch to the DOCTYPE system identifier 5239 * (single-quoted) state. 5240 */ 5241 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos); 5242 continue stateloop; 5243 default: 5244 bogusDoctype(); 5245 /* 5246 * Set the DOCTYPE token's force-quirks flag to 5247 * on. 5248 */ 5249 // done by bogusDoctype(); 5250 /* 5251 * Switch to the bogus DOCTYPE state. 5252 */ 5253 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); 5254 continue stateloop; 5255 } 5256 } 5257 // FALLTHRU DON'T REORDER 5258 case BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS: 5259 betweendoctypepublicandsystemidentifiersloop: for (;;) { 5260 if (++pos == endPos) { 5261 break stateloop; 5262 } 5263 c = checkChar(buf, pos); 5264 /* 5265 * Consume the next input character: 5266 */ 5267 switch (c) { 5268 case '\r': 5269 silentCarriageReturn(); 5270 break stateloop; 5271 case '\n': 5272 silentLineFeed(); 5273 // fall thru 5274 case ' ': 5275 case '\t': 5276 case '\u000C': 5277 /* 5278 * U+0009 CHARACTER TABULATION U+000A LINE FEED 5279 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay 5280 * in the between DOCTYPE public and system 5281 * identifiers state. 5282 */ 5283 continue; 5284 case '>': 5285 /* 5286 * U+003E GREATER-THAN SIGN (>) Emit the current 5287 * DOCTYPE token. 5288 */ 5289 emitDoctypeToken(pos); 5290 /* 5291 * Switch to the data state. 5292 */ 5293 state = transition(state, Tokenizer.DATA, reconsume, pos); 5294 continue stateloop; 5295 case '"': 5296 /* 5297 * U+0022 QUOTATION MARK (") Set the DOCTYPE 5298 * token's system identifier to the empty string 5299 * (not missing), 5300 */ 5301 clearStrBufBeforeUse(); 5302 /* 5303 * then switch to the DOCTYPE system identifier 5304 * (double-quoted) state. 
5305 */ 5306 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos); 5307 break betweendoctypepublicandsystemidentifiersloop; 5308 // continue stateloop; 5309 case '\'': 5310 /* 5311 * U+0027 APOSTROPHE (') Set the DOCTYPE token's 5312 * system identifier to the empty string (not 5313 * missing), 5314 */ 5315 clearStrBufBeforeUse(); 5316 /* 5317 * then switch to the DOCTYPE system identifier 5318 * (single-quoted) state. 5319 */ 5320 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos); 5321 continue stateloop; 5322 default: 5323 bogusDoctype(); 5324 /* 5325 * Set the DOCTYPE token's force-quirks flag to 5326 * on. 5327 */ 5328 // done by bogusDoctype(); 5329 /* 5330 * Switch to the bogus DOCTYPE state. 5331 */ 5332 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); 5333 continue stateloop; 5334 } 5335 } 5336 // FALLTHRU DON'T REORDER 5337 case DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED: 5338 doctypesystemidentifierdoublequotedloop: for (;;) { 5339 if (++pos == endPos) { 5340 break stateloop; 5341 } 5342 c = checkChar(buf, pos); 5343 /* 5344 * Consume the next input character: 5345 */ 5346 switch (c) { 5347 case '"': 5348 /* 5349 * U+0022 QUOTATION MARK (") Switch to the after 5350 * DOCTYPE system identifier state. 5351 */ 5352 systemIdentifier = strBufToString(); 5353 state = transition(state, Tokenizer.AFTER_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos); 5354 continue stateloop; 5355 case '>': 5356 /* 5357 * U+003E GREATER-THAN SIGN (>) Parse error. 5358 */ 5359 errGtInSystemId(); 5360 /* 5361 * Set the DOCTYPE token's force-quirks flag to 5362 * on. 5363 */ 5364 forceQuirks = true; 5365 /* 5366 * Emit that DOCTYPE token. 5367 */ 5368 systemIdentifier = strBufToString(); 5369 emitDoctypeToken(pos); 5370 /* 5371 * Switch to the data state. 
5372 */ 5373 state = transition(state, Tokenizer.DATA, reconsume, pos); 5374 continue stateloop; 5375 case '\r': 5376 appendStrBufCarriageReturn(); 5377 break stateloop; 5378 case '\n': 5379 appendStrBufLineFeed(); 5380 continue; 5381 case '\u0000': 5382 c = '\uFFFD'; 5383 // fall thru 5384 default: 5385 /* 5386 * Anything else Append the current input 5387 * character to the current DOCTYPE token's 5388 * system identifier. 5389 */ 5390 appendStrBuf(c); 5391 /* 5392 * Stay in the DOCTYPE system identifier 5393 * (double-quoted) state. 5394 */ 5395 continue; 5396 } 5397 } 5398 // FALLTHRU DON'T REORDER 5399 case AFTER_DOCTYPE_SYSTEM_IDENTIFIER: 5400 afterdoctypesystemidentifierloop: for (;;) { 5401 if (++pos == endPos) { 5402 break stateloop; 5403 } 5404 c = checkChar(buf, pos); 5405 /* 5406 * Consume the next input character: 5407 */ 5408 switch (c) { 5409 case '\r': 5410 silentCarriageReturn(); 5411 break stateloop; 5412 case '\n': 5413 silentLineFeed(); 5414 // fall thru 5415 case ' ': 5416 case '\t': 5417 case '\u000C': 5418 /* 5419 * U+0009 CHARACTER TABULATION U+000A LINE FEED 5420 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay 5421 * in the after DOCTYPE system identifier state. 5422 */ 5423 continue; 5424 case '>': 5425 /* 5426 * U+003E GREATER-THAN SIGN (>) Emit the current 5427 * DOCTYPE token. 5428 */ 5429 emitDoctypeToken(pos); 5430 /* 5431 * Switch to the data state. 5432 */ 5433 state = transition(state, Tokenizer.DATA, reconsume, pos); 5434 continue stateloop; 5435 default: 5436 /* 5437 * Switch to the bogus DOCTYPE state. (This does 5438 * not set the DOCTYPE token's force-quirks flag 5439 * to on.) 
5440 */ 5441 bogusDoctypeWithoutQuirks(); 5442 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); 5443 break afterdoctypesystemidentifierloop; 5444 // continue stateloop; 5445 } 5446 } 5447 // FALLTHRU DON'T REORDER 5448 case BOGUS_DOCTYPE: 5449 for (;;) { 5450 if (reconsume) { 5451 reconsume = false; 5452 } else { 5453 if (++pos == endPos) { 5454 break stateloop; 5455 } 5456 c = checkChar(buf, pos); 5457 } 5458 /* 5459 * Consume the next input character: 5460 */ 5461 switch (c) { 5462 case '>': 5463 /* 5464 * U+003E GREATER-THAN SIGN (>) Emit that 5465 * DOCTYPE token. 5466 */ 5467 emitDoctypeToken(pos); 5468 /* 5469 * Switch to the data state. 5470 */ 5471 state = transition(state, Tokenizer.DATA, reconsume, pos); 5472 continue stateloop; 5473 case '\r': 5474 silentCarriageReturn(); 5475 break stateloop; 5476 case '\n': 5477 silentLineFeed(); 5478 // fall thru 5479 default: 5480 /* 5481 * Anything else Stay in the bogus DOCTYPE 5482 * state. 5483 */ 5484 continue; 5485 } 5486 } 5487 // XXX reorder point 5488 case DOCTYPE_YSTEM: 5489 doctypeystemloop: for (;;) { 5490 if (++pos == endPos) { 5491 break stateloop; 5492 } 5493 c = checkChar(buf, pos); 5494 /* 5495 * Otherwise, if the six characters starting from the 5496 * current input character are an ASCII case-insensitive 5497 * match for the word "SYSTEM", then consume those 5498 * characters and switch to the before DOCTYPE system 5499 * identifier state. 
5500 */ 5501 if (index < 5) { // YSTEM.length 5502 char folded = c; 5503 if (c >= 'A' && c <= 'Z') { 5504 folded += 0x20; 5505 } 5506 if (folded != Tokenizer.YSTEM[index]) { 5507 bogusDoctype(); 5508 reconsume = true; 5509 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); 5510 continue stateloop; 5511 } 5512 index++; 5513 continue stateloop; 5514 } else { 5515 reconsume = true; 5516 state = transition(state, Tokenizer.AFTER_DOCTYPE_SYSTEM_KEYWORD, reconsume, pos); 5517 break doctypeystemloop; 5518 // continue stateloop; 5519 } 5520 } 5521 // FALLTHRU DON'T REORDER 5522 case AFTER_DOCTYPE_SYSTEM_KEYWORD: 5523 afterdoctypesystemkeywordloop: for (;;) { 5524 if (reconsume) { 5525 reconsume = false; 5526 } else { 5527 if (++pos == endPos) { 5528 break stateloop; 5529 } 5530 c = checkChar(buf, pos); 5531 } 5532 /* 5533 * Consume the next input character: 5534 */ 5535 switch (c) { 5536 case '\r': 5537 silentCarriageReturn(); 5538 state = transition(state, Tokenizer.BEFORE_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos); 5539 break stateloop; 5540 case '\n': 5541 silentLineFeed(); 5542 // fall thru 5543 case ' ': 5544 case '\t': 5545 case '\u000C': 5546 /* 5547 * U+0009 CHARACTER TABULATION U+000A LINE FEED 5548 * (LF) U+000C FORM FEED (FF) U+0020 SPACE 5549 * Switch to the before DOCTYPE public 5550 * identifier state. 5551 */ 5552 state = transition(state, Tokenizer.BEFORE_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos); 5553 break afterdoctypesystemkeywordloop; 5554 // FALL THROUGH continue stateloop 5555 case '"': 5556 /* 5557 * U+0022 QUOTATION MARK (") Parse Error. 5558 */ 5559 errNoSpaceBetweenDoctypeSystemKeywordAndQuote(); 5560 /* 5561 * Set the DOCTYPE token's system identifier to 5562 * the empty string (not missing), 5563 */ 5564 clearStrBufBeforeUse(); 5565 /* 5566 * then switch to the DOCTYPE public identifier 5567 * (double-quoted) state. 
5568 */ 5569 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos); 5570 continue stateloop; 5571 case '\'': 5572 /* 5573 * U+0027 APOSTROPHE (') Parse Error. 5574 */ 5575 errNoSpaceBetweenDoctypeSystemKeywordAndQuote(); 5576 /* 5577 * Set the DOCTYPE token's public identifier to 5578 * the empty string (not missing), 5579 */ 5580 clearStrBufBeforeUse(); 5581 /* 5582 * then switch to the DOCTYPE public identifier 5583 * (single-quoted) state. 5584 */ 5585 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos); 5586 continue stateloop; 5587 case '>': 5588 /* U+003E GREATER-THAN SIGN (>) Parse error. */ 5589 errExpectedPublicId(); 5590 /* 5591 * Set the DOCTYPE token's force-quirks flag to 5592 * on. 5593 */ 5594 forceQuirks = true; 5595 /* 5596 * Emit that DOCTYPE token. 5597 */ 5598 emitDoctypeToken(pos); 5599 /* 5600 * Switch to the data state. 5601 */ 5602 state = transition(state, Tokenizer.DATA, reconsume, pos); 5603 continue stateloop; 5604 default: 5605 bogusDoctype(); 5606 /* 5607 * Set the DOCTYPE token's force-quirks flag to 5608 * on. 5609 */ 5610 // done by bogusDoctype(); 5611 /* 5612 * Switch to the bogus DOCTYPE state. 5613 */ 5614 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); 5615 continue stateloop; 5616 } 5617 } 5618 // FALLTHRU DON'T REORDER 5619 case BEFORE_DOCTYPE_SYSTEM_IDENTIFIER: 5620 beforedoctypesystemidentifierloop: for (;;) { 5621 if (++pos == endPos) { 5622 break stateloop; 5623 } 5624 c = checkChar(buf, pos); 5625 /* 5626 * Consume the next input character: 5627 */ 5628 switch (c) { 5629 case '\r': 5630 silentCarriageReturn(); 5631 break stateloop; 5632 case '\n': 5633 silentLineFeed(); 5634 // fall thru 5635 case ' ': 5636 case '\t': 5637 case '\u000C': 5638 /* 5639 * U+0009 CHARACTER TABULATION U+000A LINE FEED 5640 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay 5641 * in the before DOCTYPE system identifier 5642 * state. 
5643 */ 5644 continue; 5645 case '"': 5646 /* 5647 * U+0022 QUOTATION MARK (") Set the DOCTYPE 5648 * token's system identifier to the empty string 5649 * (not missing), 5650 */ 5651 clearStrBufBeforeUse(); 5652 /* 5653 * then switch to the DOCTYPE system identifier 5654 * (double-quoted) state. 5655 */ 5656 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos); 5657 continue stateloop; 5658 case '\'': 5659 /* 5660 * U+0027 APOSTROPHE (') Set the DOCTYPE token's 5661 * system identifier to the empty string (not 5662 * missing), 5663 */ 5664 clearStrBufBeforeUse(); 5665 /* 5666 * then switch to the DOCTYPE system identifier 5667 * (single-quoted) state. 5668 */ 5669 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos); 5670 break beforedoctypesystemidentifierloop; 5671 // continue stateloop; 5672 case '>': 5673 /* U+003E GREATER-THAN SIGN (>) Parse error. */ 5674 errExpectedSystemId(); 5675 /* 5676 * Set the DOCTYPE token's force-quirks flag to 5677 * on. 5678 */ 5679 forceQuirks = true; 5680 /* 5681 * Emit that DOCTYPE token. 5682 */ 5683 emitDoctypeToken(pos); 5684 /* 5685 * Switch to the data state. 5686 */ 5687 state = transition(state, Tokenizer.DATA, reconsume, pos); 5688 continue stateloop; 5689 default: 5690 bogusDoctype(); 5691 /* 5692 * Set the DOCTYPE token's force-quirks flag to 5693 * on. 5694 */ 5695 // done by bogusDoctype(); 5696 /* 5697 * Switch to the bogus DOCTYPE state. 5698 */ 5699 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); 5700 continue stateloop; 5701 } 5702 } 5703 // FALLTHRU DON'T REORDER 5704 case DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED: 5705 for (;;) { 5706 if (++pos == endPos) { 5707 break stateloop; 5708 } 5709 c = checkChar(buf, pos); 5710 /* 5711 * Consume the next input character: 5712 */ 5713 switch (c) { 5714 case '\'': 5715 /* 5716 * U+0027 APOSTROPHE (') Switch to the after 5717 * DOCTYPE system identifier state. 
5718 */ 5719 systemIdentifier = strBufToString(); 5720 state = transition(state, Tokenizer.AFTER_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos); 5721 continue stateloop; 5722 case '>': 5723 errGtInSystemId(); 5724 /* 5725 * Set the DOCTYPE token's force-quirks flag to 5726 * on. 5727 */ 5728 forceQuirks = true; 5729 /* 5730 * Emit that DOCTYPE token. 5731 */ 5732 systemIdentifier = strBufToString(); 5733 emitDoctypeToken(pos); 5734 /* 5735 * Switch to the data state. 5736 */ 5737 state = transition(state, Tokenizer.DATA, reconsume, pos); 5738 continue stateloop; 5739 case '\r': 5740 appendStrBufCarriageReturn(); 5741 break stateloop; 5742 case '\n': 5743 appendStrBufLineFeed(); 5744 continue; 5745 case '\u0000': 5746 c = '\uFFFD'; 5747 // fall thru 5748 default: 5749 /* 5750 * Anything else Append the current input 5751 * character to the current DOCTYPE token's 5752 * system identifier. 5753 */ 5754 appendStrBuf(c); 5755 /* 5756 * Stay in the DOCTYPE system identifier 5757 * (double-quoted) state. 5758 */ 5759 continue; 5760 } 5761 } 5762 // XXX reorder point 5763 case DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED: 5764 for (;;) { 5765 if (++pos == endPos) { 5766 break stateloop; 5767 } 5768 c = checkChar(buf, pos); 5769 /* 5770 * Consume the next input character: 5771 */ 5772 switch (c) { 5773 case '\'': 5774 /* 5775 * U+0027 APOSTROPHE (') Switch to the after 5776 * DOCTYPE public identifier state. 5777 */ 5778 publicIdentifier = strBufToString(); 5779 state = transition(state, Tokenizer.AFTER_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos); 5780 continue stateloop; 5781 case '>': 5782 errGtInPublicId(); 5783 /* 5784 * Set the DOCTYPE token's force-quirks flag to 5785 * on. 5786 */ 5787 forceQuirks = true; 5788 /* 5789 * Emit that DOCTYPE token. 5790 */ 5791 publicIdentifier = strBufToString(); 5792 emitDoctypeToken(pos); 5793 /* 5794 * Switch to the data state. 
5795 */ 5796 state = transition(state, Tokenizer.DATA, reconsume, pos); 5797 continue stateloop; 5798 case '\r': 5799 appendStrBufCarriageReturn(); 5800 break stateloop; 5801 case '\n': 5802 appendStrBufLineFeed(); 5803 continue; 5804 case '\u0000': 5805 c = '\uFFFD'; 5806 // fall thru 5807 default: 5808 /* 5809 * Anything else Append the current input 5810 * character to the current DOCTYPE token's 5811 * public identifier. 5812 */ 5813 appendStrBuf(c); 5814 /* 5815 * Stay in the DOCTYPE public identifier 5816 * (single-quoted) state. 5817 */ 5818 continue; 5819 } 5820 } 5821 // XXX reorder point 5822 case PROCESSING_INSTRUCTION: 5823 processinginstructionloop: for (;;) { 5824 if (++pos == endPos) { 5825 break stateloop; 5826 } 5827 c = checkChar(buf, pos); 5828 switch (c) { 5829 case '?': 5830 state = transition( 5831 state, 5832 Tokenizer.PROCESSING_INSTRUCTION_QUESTION_MARK, 5833 reconsume, pos); 5834 break processinginstructionloop; 5835 // continue stateloop; 5836 default: 5837 continue; 5838 } 5839 } 5840 case PROCESSING_INSTRUCTION_QUESTION_MARK: 5841 if (++pos == endPos) { 5842 break stateloop; 5843 } 5844 c = checkChar(buf, pos); 5845 switch (c) { 5846 case '>': 5847 state = transition(state, Tokenizer.DATA, 5848 reconsume, pos); 5849 continue stateloop; 5850 default: 5851 state = transition(state, 5852 Tokenizer.PROCESSING_INSTRUCTION, 5853 reconsume, pos); 5854 continue stateloop; 5855 } 5856 // END HOTSPOT WORKAROUND 5857 } 5858 } 5859 flushChars(buf, pos); 5860 /* 5861 * if (prevCR && pos != endPos) { // why is this needed? 
 pos--; col--; }
         */
        // Save locals
        stateSave = state;
        returnStateSave = returnState;
        return pos;
    }

    // HOTSPOT WORKAROUND INSERTION POINT

    // [NOCPP[

    /**
     * Called by the state loop whenever it switches states; the value
     * returned here is what the loop stores as its new state.
     *
     * This base implementation ignores <code>from</code>,
     * <code>reconsume</code> and <code>pos</code> and returns
     * <code>to</code> unchanged. The method is <code>protected</code>, so a
     * subclass can observe or alter transitions.
     *
     * @param from
     *            the state being left
     * @param to
     *            the state being entered
     * @param reconsume
     *            the state loop's reconsume flag at the time of the call
     * @param pos
     *            the current buffer position
     * @return the state to continue in (always <code>to</code> here)
     * @throws SAXException
     *             never thrown by this implementation
     */
    protected int transition(int from, int to, boolean reconsume, int pos) throws SAXException {
        return to;
    }

    // ]NOCPP]

    /**
     * Resets the fields that describe the DOCTYPE token under construction:
     * clears <code>strBuf</code>, sets the name to the empty string, releases
     * and nulls both identifiers and turns the force-quirks flag off.
     */
    private void initDoctypeFields() {
        // Discard the characters "DOCTYPE" accumulated as a potential bogus
        // comment into strBuf.
        clearStrBufAfterUse();
        doctypeName = "";
        if (systemIdentifier != null) {
            Portability.releaseString(systemIdentifier);
            systemIdentifier = null;
        }
        if (publicIdentifier != null) {
            Portability.releaseString(publicIdentifier);
            publicIdentifier = null;
        }
        forceQuirks = false;
    }

    /**
     * Does the carriage-return bookkeeping of
     * {@link #silentCarriageReturn()} and then hands a line feed (not a
     * carriage return) to
     * <code>adjustDoubleHyphenAndAppendToStrBufAndErr</code>.
     *
     * @throws SAXException
     *             if the delegated call throws
     */
    @Inline private void adjustDoubleHyphenAndAppendToStrBufCarriageReturn()
            throws SAXException {
        silentCarriageReturn();
        adjustDoubleHyphenAndAppendToStrBufAndErr('\n');
    }

    /**
     * Does the line-feed bookkeeping of {@link #silentLineFeed()} and then
     * hands a line feed to
     * <code>adjustDoubleHyphenAndAppendToStrBufAndErr</code>.
     *
     * @throws SAXException
     *             if the delegated call throws
     */
    @Inline private void adjustDoubleHyphenAndAppendToStrBufLineFeed()
            throws SAXException {
        silentLineFeed();
        adjustDoubleHyphenAndAppendToStrBufAndErr('\n');
    }

    /**
     * Counts a line break via {@link #silentLineFeed()} and appends a line
     * feed to <code>strBuf</code>.
     */
    @Inline private void appendStrBufLineFeed() {
        silentLineFeed();
        appendStrBuf('\n');
    }

    /**
     * Counts a line break via {@link #silentCarriageReturn()} and appends a
     * line feed (not a carriage return) to <code>strBuf</code>.
     */
    @Inline private void appendStrBufCarriageReturn() {
        silentCarriageReturn();
        appendStrBuf('\n');
    }

    /**
     * Bookkeeping for a consumed carriage return: increments
     * <code>line</code> and sets <code>lastCR</code>. Emits nothing.
     */
    @Inline protected void silentCarriageReturn() {
        ++line;
        lastCR = true;
    }

    /**
     * Bookkeeping for a consumed line feed: increments <code>line</code>.
     * Emits nothing.
     */
    @Inline protected void silentLineFeed() {
        ++line;
    }

    /**
     * Handles a carriage return in character data: does the
     * {@link #silentCarriageReturn()} bookkeeping, flushes pending
     * characters via <code>flushChars</code>, and reports a single line
     * feed to the token handler.
     *
     * @param buf
     *            the input buffer
     * @param pos
     *            the position passed on to <code>flushChars</code>
     * @throws SAXException
     *             if flushing or the token handler throws
     */
    private void emitCarriageReturn(@NoLength char[] buf, int pos)
            throws SAXException {
        silentCarriageReturn();
        flushChars(buf, pos);
        tokenHandler.characters(Tokenizer.LF, 0, 1);
        // NOTE(review): MAX_VALUE looks like a "no pending run start yet"
        // sentinel -- confirm against flushChars and the state loop.
        cstart = Integer.MAX_VALUE;
    }

    /**
     * Flushes pending characters and asks the token handler for a
     * zero-originating replacement character; the next character run then
     * starts right after <code>pos</code>.
     *
     * @param buf
     *            the input buffer
     * @param pos
     *            the position of the character being replaced
     * @throws SAXException
     *             if flushing or the token handler throws
     */
    private void emitReplacementCharacter(@NoLength char[] buf, int pos)
            throws SAXException {
        flushChars(buf, pos);
        tokenHandler.zeroOriginatingReplacementCharacter();
        cstart = pos + 1;
    }

    /**
     * Flushes pending characters and reports
     * <code>REPLACEMENT_CHARACTER</code> to the token handler as ordinary
     * character data; the next character run then starts right after
     * <code>pos</code>.
     *
     * @param buf
     *            the input buffer
     * @param pos
     *            the position of the character being replaced
     * @throws SAXException
     *             if flushing or the token handler throws
     */
    private void emitPlaintextReplacementCharacter(@NoLength char[] buf, int pos)
            throws SAXException {
        flushChars(buf, pos);
        tokenHandler.characters(REPLACEMENT_CHARACTER, 0, 1);
        cstart = pos + 1;
    }

    /**
     * Stores <code>add</code> in <code>additional</code> and, in the Java
     * (NOCPP) build, snapshots the current locator into
     * <code>ampersandLocation</code>.
     *
     * @param add
     *            the value to store in <code>additional</code>
     */
    private void setAdditionalAndRememberAmpersandLocation(char add) {
        additional = add;
        // [NOCPP[
        ampersandLocation = new LocatorImpl(this);
        // ]NOCPP]
    }

    /**
     * Reports a bogus DOCTYPE via {@link #errBogusDoctype()} and turns the
     * force-quirks flag on.
     *
     * @throws SAXException
     *             if the error hook throws
     */
    private void bogusDoctype() throws SAXException {
        errBogusDoctype();
        forceQuirks = true;
    }

    /**
     * Reports a bogus DOCTYPE via {@link #errBogusDoctype()} and turns the
     * force-quirks flag off.
     *
     * @throws SAXException
     *             if the error hook throws
     */
    private void bogusDoctypeWithoutQuirks() throws SAXException {
        errBogusDoctype();
        forceQuirks = false;
    }

    /**
     * Turns the accumulated numeric character reference in
     * <code>value</code> into one or two UTF-16 code units and passes them
     * to {@link #emitOrAppendOne} / {@link #emitOrAppendTwo}, calling the
     * matching error hook for problematic values.
     *
     * @param returnState
     *            forwarded to the emit-or-append helpers, which use it to
     *            decide between appending to <code>strBuf</code> and
     *            emitting to the token handler
     * @throws SAXException
     *             if an error hook, <code>fatal</code> or the token handler
     *             throws
     */
    private void handleNcrValue(int returnState) throws SAXException {
        /*
         * If one or more characters match the range, then take them all and
         * interpret the string of characters as a number (either hexadecimal or
         * decimal as appropriate).
         */
        if (value <= 0xFFFF) {
            if (value >= 0x80 && value <= 0x9f) {
                /*
                 * If that number is one of the numbers in the first column of
                 * the following table, then this is a parse error.
                 */
                errNcrInC1Range();
                /*
                 * Find the row with that number in the first column, and return
                 * a character token for the Unicode character given in the
                 * second column of that row.
                 */
                @NoLength char[] val = NamedCharacters.WINDOWS_1252[value - 0x80];
                emitOrAppendOne(val, returnState);
                // [NOCPP[
            } else if (value == 0xC
                    && contentSpacePolicy != XmlViolationPolicy.ALLOW) {
                // Form feed while the content-space policy is not ALLOW:
                // substitute a space or fail, depending on the policy.
                if (contentSpacePolicy == XmlViolationPolicy.ALTER_INFOSET) {
                    emitOrAppendOne(Tokenizer.SPACE, returnState);
                } else if (contentSpacePolicy == XmlViolationPolicy.FATAL) {
                    fatal("A character reference expanded to a form feed which is not legal XML 1.0 white space.");
                }
                // ]NOCPP]
            } else if (value == 0x0) {
                errNcrZero();
                emitOrAppendOne(Tokenizer.REPLACEMENT_CHARACTER, returnState);
            } else if ((value & 0xF800) == 0xD800) {
                // The mask matches exactly U+D800..U+DFFF (surrogates).
                errNcrSurrogate();
                emitOrAppendOne(Tokenizer.REPLACEMENT_CHARACTER, returnState);
            } else {
                /*
                 * Otherwise, return a character token for the Unicode character
                 * whose code point is that number.
                 */
                char ch = (char) value;
                // [NOCPP[
                if (value == 0x0D) {
                    errNcrCr();
                } else if ((value <= 0x0008) || (value == 0x000B)
                        || (value >= 0x000E && value <= 0x001F)) {
                    // The hook may substitute a different character.
                    ch = errNcrControlChar(ch);
                } else if (value >= 0xFDD0 && value <= 0xFDEF) {
                    errNcrUnassigned();
                } else if ((value & 0xFFFE) == 0xFFFE) {
                    // U+FFFE / U+FFFF; the hook may substitute a character.
                    ch = errNcrNonCharacter(ch);
                } else if (value >= 0x007F && value <= 0x009F) {
                    // 0x80..0x9F was already handled above, so only 0x7F
                    // can reach this branch.
                    errNcrControlChar();
                } else {
                    maybeWarnPrivateUse(ch);
                }
                // ]NOCPP]
                bmpChar[0] = ch;
                emitOrAppendOne(bmpChar, returnState);
            }
        } else if (value <= 0x10FFFF) {
            // [NOCPP[
            maybeWarnPrivateUseAstral();
            if ((value & 0xFFFE) == 0xFFFE) {
                errAstralNonCharacter(value);
            }
            // ]NOCPP]
            // Split the supplementary code point into a UTF-16 surrogate pair.
            astralChar[0] = (char) (Tokenizer.LEAD_OFFSET + (value >> 10));
            astralChar[1] = (char) (0xDC00 + (value & 0x3FF));
            emitOrAppendTwo(astralChar, returnState);
        } else {
            errNcrOutOfRange();
            emitOrAppendOne(Tokenizer.REPLACEMENT_CHARACTER, returnState);
        }
    }

    /**
     * Signals end of input. Starting from the saved state, finishes
     * whatever construct is pending (reporting the matching EOF error and
     * emitting any partial token or buffered characters), then calls
     * <code>tokenHandler.eof()</code>.
     *
     * Character-reference states do not terminate the loop: they resolve
     * the pending reference, switch to the saved return state and run the
     * switch again so that state gets its own EOF handling.
     *
     * @throws SAXException
     *             if an error hook or the token handler throws
     */
    public void eof() throws SAXException {
        int state = stateSave;
        int returnState = returnStateSave;

        eofloop: for (;;) {
            switch (state) {
                case SCRIPT_DATA_LESS_THAN_SIGN:
                case SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN:
                    /*
                     * Otherwise, emit a U+003C LESS-THAN SIGN character token
                     */
                    tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
                    /*
                     * and reconsume the current input character in the data
                     * state.
                     */
                    break eofloop;
                case TAG_OPEN:
                    /*
                     * The behavior of this state depends on the content model
                     * flag.
                     */
                    /*
                     * Anything else Parse error.
                     */
                    errEofAfterLt();
                    /*
                     * Emit a U+003C LESS-THAN SIGN character token
                     */
                    tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
                    /*
                     * and reconsume the current input character in the data
                     * state.
                     */
                    break eofloop;
                case RAWTEXT_RCDATA_LESS_THAN_SIGN:
                    /*
                     * Emit a U+003C LESS-THAN SIGN character token
                     */
                    tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
                    /*
                     * and reconsume the current input character in the RCDATA
                     * state.
                     */
                    break eofloop;
                case NON_DATA_END_TAG_NAME:
                    /*
                     * Emit a U+003C LESS-THAN SIGN character token, a U+002F
                     * SOLIDUS character token,
                     */
                    tokenHandler.characters(Tokenizer.LT_SOLIDUS, 0, 2);
                    /*
                     * a character token for each of the characters in the
                     * temporary buffer (in the order they were added to the
                     * buffer),
                     */
                    emitStrBuf();
                    /*
                     * and reconsume the current input character in the RCDATA
                     * state.
                     */
                    break eofloop;
                case CLOSE_TAG_OPEN:
                    /* EOF Parse error. */
                    errEofAfterLt();
                    /*
                     * Emit a U+003C LESS-THAN SIGN character token and a U+002F
                     * SOLIDUS character token.
                     */
                    tokenHandler.characters(Tokenizer.LT_SOLIDUS, 0, 2);
                    /*
                     * Reconsume the EOF character in the data state.
                     */
                    break eofloop;
                case TAG_NAME:
                    /*
                     * EOF Parse error.
                     */
                    errEofInTagName();
                    /*
                     * Reconsume the EOF character in the data state.
                     */
                    break eofloop;
                case BEFORE_ATTRIBUTE_NAME:
                case AFTER_ATTRIBUTE_VALUE_QUOTED:
                case SELF_CLOSING_START_TAG:
                    /* EOF Parse error. */
                    errEofWithoutGt();
                    /*
                     * Reconsume the EOF character in the data state.
                     */
                    break eofloop;
                case ATTRIBUTE_NAME:
                    /*
                     * EOF Parse error.
                     */
                    errEofInAttributeName();
                    /*
                     * Reconsume the EOF character in the data state.
                     */
                    break eofloop;
                case AFTER_ATTRIBUTE_NAME:
                case BEFORE_ATTRIBUTE_VALUE:
                    /* EOF Parse error. */
                    errEofWithoutGt();
                    /*
                     * Reconsume the EOF character in the data state.
                     */
                    break eofloop;
                case ATTRIBUTE_VALUE_DOUBLE_QUOTED:
                case ATTRIBUTE_VALUE_SINGLE_QUOTED:
                case ATTRIBUTE_VALUE_UNQUOTED:
                    /* EOF Parse error. */
                    errEofInAttributeValue();
                    /*
                     * Reconsume the EOF character in the data state.
                     */
                    break eofloop;
                case BOGUS_COMMENT:
                    emitComment(0, 0);
                    break eofloop;
                case BOGUS_COMMENT_HYPHEN:
                    // [NOCPP[
                    maybeAppendSpaceToBogusComment();
                    // ]NOCPP]
                    emitComment(0, 0);
                    break eofloop;
                case MARKUP_DECLARATION_OPEN:
                    errBogusComment();
                    emitComment(0, 0);
                    break eofloop;
                case MARKUP_DECLARATION_HYPHEN:
                    errBogusComment();
                    emitComment(0, 0);
                    break eofloop;
                case MARKUP_DECLARATION_OCTYPE:
                    // NOTE(review): index < 6 presumably means the
                    // "octype" keyword was not fully matched yet -- confirm
                    // against the state loop.
                    if (index < 6) {
                        errBogusComment();
                        emitComment(0, 0);
                    } else {
                        /* EOF Parse error. */
                        errEofInDoctype();
                        /*
                         * Create a new DOCTYPE token. Set its force-quirks flag
                         * to on.
                         */
                        doctypeName = "";
                        if (systemIdentifier != null) {
                            Portability.releaseString(systemIdentifier);
                            systemIdentifier = null;
                        }
                        if (publicIdentifier != null) {
                            Portability.releaseString(publicIdentifier);
                            publicIdentifier = null;
                        }
                        forceQuirks = true;
                        /*
                         * Emit the token.
                         */
                        emitDoctypeToken(0);
                        /*
                         * Reconsume the EOF character in the data state.
                         */
                        break eofloop;
                    }
                    break eofloop;
                case COMMENT_START:
                case COMMENT:
                    /*
                     * EOF Parse error.
                     */
                    errEofInComment();
                    /* Emit the comment token. */
                    emitComment(0, 0);
                    /*
                     * Reconsume the EOF character in the data state.
                     */
                    break eofloop;
                case COMMENT_END:
                    errEofInComment();
                    /* Emit the comment token. */
                    emitComment(2, 0);
                    /*
                     * Reconsume the EOF character in the data state.
                     */
                    break eofloop;
                case COMMENT_END_DASH:
                case COMMENT_START_DASH:
                    errEofInComment();
                    /* Emit the comment token. */
                    emitComment(1, 0);
                    /*
                     * Reconsume the EOF character in the data state.
                     */
                    break eofloop;
                case COMMENT_END_BANG:
                    errEofInComment();
                    /* Emit the comment token. */
                    emitComment(3, 0);
                    /*
                     * Reconsume the EOF character in the data state.
                     */
                    break eofloop;
                case DOCTYPE:
                case BEFORE_DOCTYPE_NAME:
                    errEofInDoctype();
                    /*
                     * Create a new DOCTYPE token. Set its force-quirks flag to
                     * on.
                     */
                    forceQuirks = true;
                    /*
                     * Emit the token.
                     */
                    emitDoctypeToken(0);
                    /*
                     * Reconsume the EOF character in the data state.
                     */
                    break eofloop;
                case DOCTYPE_NAME:
                    errEofInDoctype();
                    strBufToDoctypeName();
                    /*
                     * Set the DOCTYPE token's force-quirks flag to on.
                     */
                    forceQuirks = true;
                    /*
                     * Emit that DOCTYPE token.
                     */
                    emitDoctypeToken(0);
                    /*
                     * Reconsume the EOF character in the data state.
                     */
                    break eofloop;
                case DOCTYPE_UBLIC:
                case DOCTYPE_YSTEM:
                case AFTER_DOCTYPE_NAME:
                case AFTER_DOCTYPE_PUBLIC_KEYWORD:
                case AFTER_DOCTYPE_SYSTEM_KEYWORD:
                case BEFORE_DOCTYPE_PUBLIC_IDENTIFIER:
                    errEofInDoctype();
                    /*
                     * Set the DOCTYPE token's force-quirks flag to on.
                     */
                    forceQuirks = true;
                    /*
                     * Emit that DOCTYPE token.
                     */
                    emitDoctypeToken(0);
                    /*
                     * Reconsume the EOF character in the data state.
                     */
                    break eofloop;
                case DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED:
                case DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED:
                    /* EOF Parse error. */
                    errEofInPublicId();
                    /*
                     * Set the DOCTYPE token's force-quirks flag to on.
                     */
                    forceQuirks = true;
                    /*
                     * Emit that DOCTYPE token.
                     */
                    publicIdentifier = strBufToString();
                    emitDoctypeToken(0);
                    /*
                     * Reconsume the EOF character in the data state.
                     */
                    break eofloop;
                case AFTER_DOCTYPE_PUBLIC_IDENTIFIER:
                case BEFORE_DOCTYPE_SYSTEM_IDENTIFIER:
                case BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS:
                    errEofInDoctype();
                    /*
                     * Set the DOCTYPE token's force-quirks flag to on.
                     */
                    forceQuirks = true;
                    /*
                     * Emit that DOCTYPE token.
                     */
                    emitDoctypeToken(0);
                    /*
                     * Reconsume the EOF character in the data state.
                     */
                    break eofloop;
                case DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED:
                case DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED:
                    /* EOF Parse error. */
                    errEofInSystemId();
                    /*
                     * Set the DOCTYPE token's force-quirks flag to on.
                     */
                    forceQuirks = true;
                    /*
                     * Emit that DOCTYPE token.
                     */
                    systemIdentifier = strBufToString();
                    emitDoctypeToken(0);
                    /*
                     * Reconsume the EOF character in the data state.
                     */
                    break eofloop;
                case AFTER_DOCTYPE_SYSTEM_IDENTIFIER:
                    errEofInDoctype();
                    /*
                     * Set the DOCTYPE token's force-quirks flag to on.
                     */
                    forceQuirks = true;
                    /*
                     * Emit that DOCTYPE token.
                     */
                    emitDoctypeToken(0);
                    /*
                     * Reconsume the EOF character in the data state.
                     */
                    break eofloop;
                case BOGUS_DOCTYPE:
                    /*
                     * Emit that DOCTYPE token.
                     */
                    emitDoctypeToken(0);
                    /*
                     * Reconsume the EOF character in the data state.
                     */
                    break eofloop;
                case CONSUME_CHARACTER_REFERENCE:
                    /*
                     * Unlike the definition in the spec, this state does not
                     * return a value and never requires the caller to
                     * backtrack. This state takes care of emitting characters
                     * or appending to the current attribute value. It also
                     * takes care of that in the case when consuming the entity
                     * fails.
                     */
                    /*
                     * This section defines how to consume an entity. This
                     * definition is used when parsing entities in text and in
                     * attributes.
                     *
                     * The behavior depends on the identity of the next
                     * character (the one immediately after the U+0026 AMPERSAND
                     * character):
                     */

                    emitOrAppendCharRefBuf(returnState);
                    // Run the switch again so the return state gets its own
                    // EOF handling.
                    state = returnState;
                    continue;
                case CHARACTER_REFERENCE_HILO_LOOKUP:
                    errNoNamedCharacterMatch();
                    emitOrAppendCharRefBuf(returnState);
                    state = returnState;
                    continue;
                case CHARACTER_REFERENCE_TAIL:
                    outer: for (;;) {
                        // No further input exists at EOF; c stays U+0000 for
                        // every comparison below.
                        char c = '\u0000';
                        entCol++;
                        /*
                         * Consume the maximum number of characters possible,
                         * with the consumed characters matching one of the
                         * identifiers in the first column of the named
                         * character references table (in a case-sensitive
                         * manner).
                         */
                        hiloop: for (;;) {
                            if (hi == -1) {
                                break hiloop;
                            }
                            if (entCol == NamedCharacters.NAMES[hi].length()) {
                                break hiloop;
                            }
                            if (entCol > NamedCharacters.NAMES[hi].length()) {
                                break outer;
                            } else if (c < NamedCharacters.NAMES[hi].charAt(entCol)) {
                                hi--;
                            } else {
                                break hiloop;
                            }
                        }

                        loloop: for (;;) {
                            if (hi < lo) {
                                break outer;
                            }
                            if (entCol == NamedCharacters.NAMES[lo].length()) {
                                // NAMES[lo] is fully matched: remember it as
                                // the best candidate so far and mark how much
                                // of charRefBuf it covers.
                                candidate = lo;
                                charRefBufMark = charRefBufLen;
                                lo++;
                            } else if (entCol > NamedCharacters.NAMES[lo].length()) {
                                break outer;
                            } else if (c > NamedCharacters.NAMES[lo].charAt(entCol)) {
                                lo++;
                            } else {
                                break loloop;
                            }
                        }
                        if (hi < lo) {
                            break outer;
                        }
                        continue;
                    }

                    if (candidate == -1) {
                        /*
                         * If no match can be made, then this is a parse error.
                         */
                        errNoNamedCharacterMatch();
                        emitOrAppendCharRefBuf(returnState);
                        state = returnState;
                        continue eofloop;
                    } else {
                        @Const @CharacterName String candidateName = NamedCharacters.NAMES[candidate];
                        if (candidateName.length() == 0
                                || candidateName.charAt(candidateName.length() - 1) != ';') {
                            /*
                             * If the last character matched is not a U+003B
                             * SEMICOLON (;), there is a parse error.
                             */
                            if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
                                /*
                                 * If the entity is being consumed as part of an
                                 * attribute, and the last character matched is
                                 * not a U+003B SEMICOLON (;),
                                 */
                                char ch;
                                if (charRefBufMark == charRefBufLen) {
                                    ch = '\u0000';
                                } else {
                                    ch = charRefBuf[charRefBufMark];
                                }
                                if ((ch >= '0' && ch <= '9')
                                        || (ch >= 'A' && ch <= 'Z')
                                        || (ch >= 'a' && ch <= 'z')) {
                                    /*
                                     * and the next character is in the range
                                     * U+0030 DIGIT ZERO to U+0039 DIGIT NINE,
                                     * U+0041 LATIN CAPITAL LETTER A to U+005A
                                     * LATIN CAPITAL LETTER Z, or U+0061 LATIN
                                     * SMALL LETTER A to U+007A LATIN SMALL
                                     * LETTER Z, then, for historical reasons,
                                     * all the characters that were matched
                                     * after the U+0026 AMPERSAND (&) must be
                                     * unconsumed, and nothing is returned.
                                     */
                                    errNoNamedCharacterMatch();
                                    appendCharRefBufToStrBuf();
                                    state = returnState;
                                    continue eofloop;
                                }
                            }
                            if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
                                errUnescapedAmpersandInterpretedAsCharacterReference();
                            } else {
                                errNotSemicolonTerminated();
                            }
                        }

                        /*
                         * Otherwise, return a character token for the character
                         * corresponding to the entity name (as given by the
                         * second column of the named character references
                         * table).
                         */
                        @Const @NoLength char[] val = NamedCharacters.VALUES[candidate];
                        if (
                        // [NOCPP[
                        val.length == 1
                        // ]NOCPP]
                        // CPPONLY: val[1] == 0
                        ) {
                            emitOrAppendOne(val, returnState);
                        } else {
                            emitOrAppendTwo(val, returnState);
                        }
                        // this is so complicated!
                        // Characters buffered after the matched name are not
                        // part of the reference; pass them through literally.
                        if (charRefBufMark < charRefBufLen) {
                            if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
                                appendStrBuf(charRefBuf, charRefBufMark,
                                        charRefBufLen - charRefBufMark);
                            } else {
                                tokenHandler.characters(charRefBuf, charRefBufMark,
                                        charRefBufLen - charRefBufMark);
                            }
                        }
                        charRefBufLen = 0;
                        state = returnState;
                        continue eofloop;
                        /*
                         * If the markup contains I'm &notit; I tell you, the
                         * entity is parsed as "not", as in, I'm ¬it; I tell
                         * you. But if the markup was I'm &notin; I tell you,
                         * the entity would be parsed as "notin;", resulting in
                         * I'm ∉ I tell you.
                         */
                    }
                case CONSUME_NCR:
                case DECIMAL_NRC_LOOP:
                case HEX_NCR_LOOP:
                    /*
                     * If no characters match the range, then don't consume any
                     * characters (and unconsume the U+0023 NUMBER SIGN
                     * character and, if appropriate, the X character). This is
                     * a parse error; nothing is returned.
                     *
                     * Otherwise, if the next character is a U+003B SEMICOLON,
                     * consume that too. If it isn't, there is a parse error.
                     */
                    if (!seenDigits) {
                        errNoDigitsInNCR();
                        emitOrAppendCharRefBuf(returnState);
                        state = returnState;
                        continue;
                    } else {
                        // Digits were seen, but EOF arrived before a ';'.
                        errCharRefLacksSemicolon();
                    }
                    // WARNING previous state sets reconsume
                    handleNcrValue(returnState);
                    state = returnState;
                    continue;
                case CDATA_RSQB:
                    // One pending ']' : emit only the first char of "]]".
                    tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 1);
                    break eofloop;
                case CDATA_RSQB_RSQB:
                    tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 2);
                    break eofloop;
                case DATA:
                default:
                    break eofloop;
            }
        }
        // case DATA:
        /*
         * EOF Emit an end-of-file token.
         */
        tokenHandler.eof();
        return;
    }

    /**
     * Reports the DOCTYPE token assembled in <code>doctypeName</code>,
     * <code>publicIdentifier</code>, <code>systemIdentifier</code> and
     * <code>forceQuirks</code> to the token handler, then releases the
     * identifier strings and nulls all three string fields.
     *
     * @param pos
     *            the current buffer position; the next character run starts
     *            at <code>pos + 1</code>
     * @throws SAXException
     *             if the token handler throws
     */
    private void emitDoctypeToken(int pos) throws SAXException {
        cstart = pos + 1;
        tokenHandler.doctype(doctypeName, publicIdentifier, systemIdentifier,
                forceQuirks);
        // It is OK and sufficient to release these here, since
        // there's no way out of the doctype states than through paths
        // that call this method.
        doctypeName = null;
        Portability.releaseString(publicIdentifier);
        publicIdentifier = null;
        Portability.releaseString(systemIdentifier);
        systemIdentifier = null;
    }

    /**
     * Reads the character at <code>pos</code>. This base implementation is
     * a plain array read; the method is <code>protected</code>, so a
     * subclass can add per-character checks.
     *
     * @param buf
     *            the input buffer
     * @param pos
     *            the index to read
     * @return <code>buf[pos]</code>
     * @throws SAXException
     *             never thrown by this implementation
     */
    @Inline protected char checkChar(@NoLength char[] buf, int pos)
            throws SAXException {
        return buf[pos];
    }

    /**
     * Forwards an internal encoding declaration to the configured
     * <code>encodingDeclarationHandler</code>.
     *
     * @param internalCharset
     *            the declared charset name, passed through unchanged
     * @return the handler's result, or <code>false</code> when no handler
     *         is set
     * @throws SAXException
     *             if the handler throws
     */
    public boolean internalEncodingDeclaration(String internalCharset)
            throws SAXException {
        if (encodingDeclarationHandler != null) {
            return encodingDeclarationHandler.internalEncodingDeclaration(internalCharset);
        }
        return false;
    }

    /**
     * Outputs the first two code units of <code>val</code>: appended to
     * <code>strBuf</code> when <code>returnState</code> has a bit of
     * <code>DATA_AND_RCDATA_MASK</code> set, otherwise reported to the token
     * handler as characters.
     *
     * @param val
     *            the array holding the two code units
     * @param returnState
     *            the return state used to pick the destination
     * @throws SAXException
     *             if the token handler throws
     */
    private void emitOrAppendTwo(@Const @NoLength char[] val, int returnState)
            throws SAXException {
        if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
            appendStrBuf(val[0]);
            appendStrBuf(val[1]);
        } else {
            tokenHandler.characters(val, 0, 2);
        }
    }

    /**
     * Outputs the first code unit of <code>val</code>: appended to
     * <code>strBuf</code> when <code>returnState</code> has a bit of
     * <code>DATA_AND_RCDATA_MASK</code> set, otherwise reported to the token
     * handler as a character.
     *
     * @param val
     *            the array holding the code unit
     * @param returnState
     *            the return state used to pick the destination
     * @throws SAXException
     *             if the token handler throws
     */
    private void emitOrAppendOne(@Const @NoLength char[] val, int returnState)
            throws SAXException {
        if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
            appendStrBuf(val[0]);
        } else {
            tokenHandler.characters(val, 0, 1);
        }
    }

    /**
     * Ends tokenization: drops the string buffer and pending DOCTYPE, tag
     * and attribute names, then notifies the token handler via
     * <code>endTokenization()</code> and finally clears the attribute
     * holder.
     *
     * @throws SAXException
     *             if the token handler throws
     */
    public void end() throws SAXException {
        strBuf = null;
        doctypeName = null;
        if (systemIdentifier != null) {
            Portability.releaseString(systemIdentifier);
            systemIdentifier = null;
        }
        if (publicIdentifier != null) {
            Portability.releaseString(publicIdentifier);
            publicIdentifier = null;
        }
        if (tagName != null) {
            tagName.release();
            tagName = null;
        }
        if (attributeName != null) {
            attributeName.release();
            attributeName = null;
        }
        tokenHandler.endTokenization();
        if (attributes != null) {
            // [NOCPP[
            attributes = null;
            // ]NOCPP]
            // CPPONLY: attributes.clear(mappingLangToXmlLang);
        }
    }

    /**
     * Sets the <code>shouldSuspend</code> flag.
     */
    public void requestSuspension() {
        shouldSuspend = true;
    }

    // [NOCPP[

    /**
     * Sets the <code>confident</code> flag.
     */
    public void becomeConfident() {
        confident = true;
    }

    /**
     * Returns the nextCharOnNewLine. This base implementation does not
     * track it and always answers <code>false</code>.
     *
     * @return <code>false</code>
     */
    public boolean isNextCharOnNewLine() {
        return false;
    }

    /**
     * Tells whether the <code>lastCR</code> flag is set.
     *
     * @return the current value of <code>lastCR</code>
     */
    public boolean isPrevCR() {
        return lastCR;
    }

    /**
     * Returns the line. This base implementation always answers
     * <code>-1</code>; it does not expose the internal <code>line</code>
     * counter.
     *
     * @return <code>-1</code>
     */
    public int getLine() {
        return -1;
    }

    /**
     * Returns the col. This base implementation always answers
     * <code>-1</code>.
     *
     * @return <code>-1</code>
     */
    public int getCol() {
        return -1;
    }

    // ]NOCPP]

    /**
     * Tells whether the saved tokenizer state is the data state.
     *
     * @return <code>true</code> if <code>stateSave</code> is
     *         <code>DATA</code>
     */
    public boolean isInDataState() {
        return (stateSave == DATA);
    }

    /**
     * Puts the tokenizer back into the data state and resets the working
     * fields used for character references, DOCTYPEs, tags and attributes.
     * The line counter is deliberately left alone (see the XXX note below).
     */
    public void resetToDataState() {
        clearStrBufAfterUse();
        charRefBufLen = 0;
        stateSave = Tokenizer.DATA;
        // line = 1; XXX line numbers
        lastCR = false;
        index = 0;
        forceQuirks = false;
        additional = '\u0000';
        entCol = -1;
        firstCharKey = -1;
        lo = 0;
        hi = 0; // will always be overwritten before use anyway
        candidate = -1;
        charRefBufMark = 0;
        value = 0;
        seenDigits = false;
        endTag = false;
        shouldSuspend = false;
        initDoctypeFields();
        if (tagName != null) {
            tagName.release();
            tagName = null;
        }
        if (attributeName != null) {
            attributeName.release();
            attributeName = null;
        }
        if (newAttributesEachTime) {
            if (attributes != null) {
                Portability.delete(attributes);
                attributes = null;
            }
        }
    }

    /**
     * Copies the tokenizer state of <code>other</code> into this instance.
     * Buffers are copied by content, and names, identifiers and attributes
     * are cloned rather than shared. <code>shouldSuspend</code> is reset to
     * <code>false</code> instead of being copied, and the line counter is
     * not copied.
     *
     * @param other
     *            the tokenizer to copy from
     * @throws SAXException
     *             if cloning the attributes throws
     */
    public void loadState(Tokenizer other) throws SAXException {
        strBufLen = other.strBufLen;
        // Grow the local buffer only when the other tokenizer's content
        // would not fit.
        if (strBufLen > strBuf.length) {
            strBuf = new char[strBufLen];
        }
        System.arraycopy(other.strBuf, 0, strBuf, 0, strBufLen);

        // NOTE(review): no resize here, so this relies on both charRefBuf
        // arrays having the same capacity -- confirm at the allocation site.
        charRefBufLen = other.charRefBufLen;
        System.arraycopy(other.charRefBuf, 0, charRefBuf, 0, charRefBufLen);

        stateSave = other.stateSave;
        returnStateSave = other.returnStateSave;
        endTagExpectation = other.endTagExpectation;
        endTagExpectationAsArray = other.endTagExpectationAsArray;
        // line = 1; XXX line numbers
        lastCR = other.lastCR;
        index = other.index;
        forceQuirks = other.forceQuirks;
        additional = other.additional;
        entCol = other.entCol;
        firstCharKey = other.firstCharKey;
        lo = other.lo;
        hi = other.hi;
        candidate = other.candidate;
        charRefBufMark = other.charRefBufMark;
        value = other.value;
        seenDigits = other.seenDigits;
        endTag = other.endTag;
        shouldSuspend = false;

        if (other.doctypeName == null) {
            doctypeName = null;
        } else {
            doctypeName = Portability.newLocalFromLocal(other.doctypeName,
                    interner);
        }

        // Release this tokenizer's own identifier before replacing it.
        Portability.releaseString(systemIdentifier);
        if (other.systemIdentifier == null) {
            systemIdentifier = null;
        } else {
            systemIdentifier = Portability.newStringFromString(other.systemIdentifier);
        }

        Portability.releaseString(publicIdentifier);
        if (other.publicIdentifier == null) {
            publicIdentifier = null;
        } else {
            publicIdentifier = Portability.newStringFromString(other.publicIdentifier);
        }

        if (tagName != null) {
            tagName.release();
        }
        if (other.tagName == null) {
            tagName = null;
        } else {
            tagName = other.tagName.cloneElementName(interner);
        }

        if (attributeName != null) {
            attributeName.release();
        }
        if (other.attributeName == null) {
            attributeName = null;
        } else {
            attributeName = other.attributeName.cloneAttributeName(interner);
        }

        Portability.delete(attributes);
        if (other.attributes == null) {
            attributes = null;
        } else {
            attributes = other.attributes.cloneAttributes(interner);
        }
    }

    /**
     * Prepares the tokenizer for a new run without notifying the token
     * handler of a start: clears the confidence flag and string buffer,
     * resets the line counter to 1 and finishes with
     * {@link #resetToDataState()}.
     *
     * @throws SAXException
     *             if querying the token handler throws
     */
    public void initializeWithoutStarting() throws SAXException {
        confident = false;
        strBuf = null;
        line = 1;
        // CPPONLY: attributeLine = 1;
        // [NOCPP[
        html4 = false;
        metaBoundaryPassed = false;
        wantsComments = tokenHandler.wantsComments();
        if (!newAttributesEachTime) {
            attributes = new HtmlAttributes(mappingLangToXmlLang);
        }
        // ]NOCPP]
        resetToDataState();
    }

    /**
     * Error hook with an empty body in this class.
     *
     * @throws SAXException
     *             never thrown by this implementation
     */
    protected void errGarbageAfterLtSlash() throws SAXException {
    }

errLtSlashGt()6836 protected void errLtSlashGt() throws SAXException { 6837 } 6838 errWarnLtSlashInRcdata()6839 protected void errWarnLtSlashInRcdata() throws SAXException { 6840 } 6841 errHtml4LtSlashInRcdata(char folded)6842 protected void errHtml4LtSlashInRcdata(char folded) throws SAXException { 6843 } 6844 errCharRefLacksSemicolon()6845 protected void errCharRefLacksSemicolon() throws SAXException { 6846 } 6847 errNoDigitsInNCR()6848 protected void errNoDigitsInNCR() throws SAXException { 6849 } 6850 errGtInSystemId()6851 protected void errGtInSystemId() throws SAXException { 6852 } 6853 errGtInPublicId()6854 protected void errGtInPublicId() throws SAXException { 6855 } 6856 errNamelessDoctype()6857 protected void errNamelessDoctype() throws SAXException { 6858 } 6859 errConsecutiveHyphens()6860 protected void errConsecutiveHyphens() throws SAXException { 6861 } 6862 errPrematureEndOfComment()6863 protected void errPrematureEndOfComment() throws SAXException { 6864 } 6865 errBogusComment()6866 protected void errBogusComment() throws SAXException { 6867 } 6868 errUnquotedAttributeValOrNull(char c)6869 protected void errUnquotedAttributeValOrNull(char c) throws SAXException { 6870 } 6871 errSlashNotFollowedByGt()6872 protected void errSlashNotFollowedByGt() throws SAXException { 6873 } 6874 errHtml4XmlVoidSyntax()6875 protected void errHtml4XmlVoidSyntax() throws SAXException { 6876 } 6877 errNoSpaceBetweenAttributes()6878 protected void errNoSpaceBetweenAttributes() throws SAXException { 6879 } 6880 errHtml4NonNameInUnquotedAttribute(char c)6881 protected void errHtml4NonNameInUnquotedAttribute(char c) 6882 throws SAXException { 6883 } 6884 errLtOrEqualsOrGraveInUnquotedAttributeOrNull(char c)6885 protected void errLtOrEqualsOrGraveInUnquotedAttributeOrNull(char c) 6886 throws SAXException { 6887 } 6888 errAttributeValueMissing()6889 protected void errAttributeValueMissing() throws SAXException { 6890 } 6891 errBadCharBeforeAttributeNameOrNull(char c)6892 
protected void errBadCharBeforeAttributeNameOrNull(char c) 6893 throws SAXException { 6894 } 6895 errEqualsSignBeforeAttributeName()6896 protected void errEqualsSignBeforeAttributeName() throws SAXException { 6897 } 6898 errBadCharAfterLt(char c)6899 protected void errBadCharAfterLt(char c) throws SAXException { 6900 } 6901 errLtGt()6902 protected void errLtGt() throws SAXException { 6903 } 6904 errProcessingInstruction()6905 protected void errProcessingInstruction() throws SAXException { 6906 } 6907 errUnescapedAmpersandInterpretedAsCharacterReference()6908 protected void errUnescapedAmpersandInterpretedAsCharacterReference() 6909 throws SAXException { 6910 } 6911 errNotSemicolonTerminated()6912 protected void errNotSemicolonTerminated() throws SAXException { 6913 } 6914 errNoNamedCharacterMatch()6915 protected void errNoNamedCharacterMatch() throws SAXException { 6916 } 6917 errQuoteBeforeAttributeName(char c)6918 protected void errQuoteBeforeAttributeName(char c) throws SAXException { 6919 } 6920 errQuoteOrLtInAttributeNameOrNull(char c)6921 protected void errQuoteOrLtInAttributeNameOrNull(char c) 6922 throws SAXException { 6923 } 6924 errExpectedPublicId()6925 protected void errExpectedPublicId() throws SAXException { 6926 } 6927 errBogusDoctype()6928 protected void errBogusDoctype() throws SAXException { 6929 } 6930 maybeWarnPrivateUseAstral()6931 protected void maybeWarnPrivateUseAstral() throws SAXException { 6932 } 6933 maybeWarnPrivateUse(char ch)6934 protected void maybeWarnPrivateUse(char ch) throws SAXException { 6935 } 6936 maybeErrAttributesOnEndTag(HtmlAttributes attrs)6937 protected void maybeErrAttributesOnEndTag(HtmlAttributes attrs) 6938 throws SAXException { 6939 } 6940 maybeErrSlashInEndTag(boolean selfClosing)6941 protected void maybeErrSlashInEndTag(boolean selfClosing) 6942 throws SAXException { 6943 } 6944 errNcrNonCharacter(char ch)6945 protected char errNcrNonCharacter(char ch) throws SAXException { 6946 return ch; 6947 } 6948 
errAstralNonCharacter(int ch)6949 protected void errAstralNonCharacter(int ch) throws SAXException { 6950 } 6951 errNcrSurrogate()6952 protected void errNcrSurrogate() throws SAXException { 6953 } 6954 errNcrControlChar(char ch)6955 protected char errNcrControlChar(char ch) throws SAXException { 6956 return ch; 6957 } 6958 errNcrCr()6959 protected void errNcrCr() throws SAXException { 6960 } 6961 errNcrInC1Range()6962 protected void errNcrInC1Range() throws SAXException { 6963 } 6964 errEofInPublicId()6965 protected void errEofInPublicId() throws SAXException { 6966 } 6967 errEofInComment()6968 protected void errEofInComment() throws SAXException { 6969 } 6970 errEofInDoctype()6971 protected void errEofInDoctype() throws SAXException { 6972 } 6973 errEofInAttributeValue()6974 protected void errEofInAttributeValue() throws SAXException { 6975 } 6976 errEofInAttributeName()6977 protected void errEofInAttributeName() throws SAXException { 6978 } 6979 errEofWithoutGt()6980 protected void errEofWithoutGt() throws SAXException { 6981 } 6982 errEofInTagName()6983 protected void errEofInTagName() throws SAXException { 6984 } 6985 errEofInEndTag()6986 protected void errEofInEndTag() throws SAXException { 6987 } 6988 errEofAfterLt()6989 protected void errEofAfterLt() throws SAXException { 6990 } 6991 errNcrOutOfRange()6992 protected void errNcrOutOfRange() throws SAXException { 6993 } 6994 errNcrUnassigned()6995 protected void errNcrUnassigned() throws SAXException { 6996 } 6997 errDuplicateAttribute()6998 protected void errDuplicateAttribute() throws SAXException { 6999 } 7000 errEofInSystemId()7001 protected void errEofInSystemId() throws SAXException { 7002 } 7003 errExpectedSystemId()7004 protected void errExpectedSystemId() throws SAXException { 7005 } 7006 errMissingSpaceBeforeDoctypeName()7007 protected void errMissingSpaceBeforeDoctypeName() throws SAXException { 7008 } 7009 errHyphenHyphenBang()7010 protected void errHyphenHyphenBang() throws SAXException { 7011 } 
7012 errNcrControlChar()7013 protected void errNcrControlChar() throws SAXException { 7014 } 7015 errNcrZero()7016 protected void errNcrZero() throws SAXException { 7017 } 7018 errNoSpaceBetweenDoctypeSystemKeywordAndQuote()7019 protected void errNoSpaceBetweenDoctypeSystemKeywordAndQuote() 7020 throws SAXException { 7021 } 7022 errNoSpaceBetweenPublicAndSystemIds()7023 protected void errNoSpaceBetweenPublicAndSystemIds() throws SAXException { 7024 } 7025 errNoSpaceBetweenDoctypePublicKeywordAndQuote()7026 protected void errNoSpaceBetweenDoctypePublicKeywordAndQuote() 7027 throws SAXException { 7028 } 7029 noteAttributeWithoutValue()7030 protected void noteAttributeWithoutValue() throws SAXException { 7031 } 7032 noteUnquotedAttributeValue()7033 protected void noteUnquotedAttributeValue() throws SAXException { 7034 } 7035 7036 /** 7037 * Sets the encodingDeclarationHandler. 7038 * 7039 * @param encodingDeclarationHandler 7040 * the encodingDeclarationHandler to set 7041 */ setEncodingDeclarationHandler( EncodingDeclarationHandler encodingDeclarationHandler)7042 public void setEncodingDeclarationHandler( 7043 EncodingDeclarationHandler encodingDeclarationHandler) { 7044 this.encodingDeclarationHandler = encodingDeclarationHandler; 7045 } 7046 destructor()7047 void destructor() { 7048 // The translator will write refcount tracing stuff here 7049 Portability.delete(attributes); 7050 attributes = null; 7051 } 7052 7053 // [NOCPP[ 7054 7055 /** 7056 * Sets an offset to be added to the position reported to 7057 * <code>TransitionHandler</code>. 7058 * 7059 * @param offset the offset 7060 */ setTransitionBaseOffset(int offset)7061 public void setTransitionBaseOffset(int offset) { 7062 7063 } 7064 7065 // ]NOCPP] 7066 7067 } 7068