1 /*
2  * Copyright (c) 2005-2007 Henri Sivonen
3  * Copyright (c) 2007-2015 Mozilla Foundation
4  * Portions of comments Copyright 2004-2010 Apple Computer, Inc., Mozilla
5  * Foundation, and Opera Software ASA.
6  *
7  * Permission is hereby granted, free of charge, to any person obtaining a
8  * copy of this software and associated documentation files (the "Software"),
9  * to deal in the Software without restriction, including without limitation
10  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
11  * and/or sell copies of the Software, and to permit persons to whom the
12  * Software is furnished to do so, subject to the following conditions:
13  *
14  * The above copyright notice and this permission notice shall be included in
15  * all copies or substantial portions of the Software.
16  *
17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
23  * DEALINGS IN THE SOFTWARE.
24  */
25 
26 /*
27  * The comments following this one that use the same comment syntax as this
28  * comment are quotes from the WHATWG HTML 5 spec as of 2 June 2007
29  * amended as of June 18 2008 and May 31 2010.
30  * That document came with this statement:
31  * "© Copyright 2004-2010 Apple Computer, Inc., Mozilla Foundation, and
32  * Opera Software ASA. You are granted a license to use, reproduce and
33  * create derivative works of this document."
34  */
35 
36 package nu.validator.htmlparser.impl;
37 
38 import org.xml.sax.ErrorHandler;
39 import org.xml.sax.Locator;
40 import org.xml.sax.SAXException;
41 import org.xml.sax.SAXParseException;
42 
43 import nu.validator.htmlparser.annotation.Auto;
44 import nu.validator.htmlparser.annotation.CharacterName;
45 import nu.validator.htmlparser.annotation.Const;
46 import nu.validator.htmlparser.annotation.Inline;
47 import nu.validator.htmlparser.annotation.Local;
48 import nu.validator.htmlparser.annotation.NoLength;
49 import nu.validator.htmlparser.common.EncodingDeclarationHandler;
50 import nu.validator.htmlparser.common.Interner;
51 import nu.validator.htmlparser.common.TokenHandler;
52 import nu.validator.htmlparser.common.XmlViolationPolicy;
53 
54 /**
55  * An implementation of
56  * https://html.spec.whatwg.org/multipage/syntax.html#tokenization
57  *
58  * This class implements the <code>Locator</code> interface. This is not an
59  * incidental implementation detail: Users of this class are encouraged to make
60  * use of the <code>Locator</code> nature.
61  *
62  * By default, the tokenizer may report data that XML 1.0 bans. The tokenizer
63  * can be configured to treat these conditions as fatal or to coerce the infoset
64  * to something that XML 1.0 allows.
65  *
66  * @version $Id$
67  * @author hsivonen
68  */
69 public class Tokenizer implements Locator {
70 
    /**
     * Bit mask with every bit set except the lowest one. Because
     * <code>DATA</code> is 0 and <code>RCDATA</code> is 1,
     * <code>(state &amp; DATA_AND_RCDATA_MASK)</code> is zero exactly when
     * <code>state</code> is one of those two states and non-zero otherwise.
     */
    private static final int DATA_AND_RCDATA_MASK = ~1;
72 
73     public static final int DATA = 0;
74 
75     public static final int RCDATA = 1;
76 
77     public static final int SCRIPT_DATA = 2;
78 
79     public static final int RAWTEXT = 3;
80 
81     public static final int SCRIPT_DATA_ESCAPED = 4;
82 
83     public static final int ATTRIBUTE_VALUE_DOUBLE_QUOTED = 5;
84 
85     public static final int ATTRIBUTE_VALUE_SINGLE_QUOTED = 6;
86 
87     public static final int ATTRIBUTE_VALUE_UNQUOTED = 7;
88 
89     public static final int PLAINTEXT = 8;
90 
91     public static final int TAG_OPEN = 9;
92 
93     public static final int CLOSE_TAG_OPEN = 10;
94 
95     public static final int TAG_NAME = 11;
96 
97     public static final int BEFORE_ATTRIBUTE_NAME = 12;
98 
99     public static final int ATTRIBUTE_NAME = 13;
100 
101     public static final int AFTER_ATTRIBUTE_NAME = 14;
102 
103     public static final int BEFORE_ATTRIBUTE_VALUE = 15;
104 
105     public static final int AFTER_ATTRIBUTE_VALUE_QUOTED = 16;
106 
107     public static final int BOGUS_COMMENT = 17;
108 
109     public static final int MARKUP_DECLARATION_OPEN = 18;
110 
111     public static final int DOCTYPE = 19;
112 
113     public static final int BEFORE_DOCTYPE_NAME = 20;
114 
115     public static final int DOCTYPE_NAME = 21;
116 
117     public static final int AFTER_DOCTYPE_NAME = 22;
118 
119     public static final int BEFORE_DOCTYPE_PUBLIC_IDENTIFIER = 23;
120 
121     public static final int DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED = 24;
122 
123     public static final int DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED = 25;
124 
125     public static final int AFTER_DOCTYPE_PUBLIC_IDENTIFIER = 26;
126 
127     public static final int BEFORE_DOCTYPE_SYSTEM_IDENTIFIER = 27;
128 
129     public static final int DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED = 28;
130 
131     public static final int DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED = 29;
132 
133     public static final int AFTER_DOCTYPE_SYSTEM_IDENTIFIER = 30;
134 
135     public static final int BOGUS_DOCTYPE = 31;
136 
137     public static final int COMMENT_START = 32;
138 
139     public static final int COMMENT_START_DASH = 33;
140 
141     public static final int COMMENT = 34;
142 
143     public static final int COMMENT_END_DASH = 35;
144 
145     public static final int COMMENT_END = 36;
146 
147     public static final int COMMENT_END_BANG = 37;
148 
149     public static final int NON_DATA_END_TAG_NAME = 38;
150 
151     public static final int MARKUP_DECLARATION_HYPHEN = 39;
152 
153     public static final int MARKUP_DECLARATION_OCTYPE = 40;
154 
155     public static final int DOCTYPE_UBLIC = 41;
156 
157     public static final int DOCTYPE_YSTEM = 42;
158 
159     public static final int AFTER_DOCTYPE_PUBLIC_KEYWORD = 43;
160 
161     public static final int BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS = 44;
162 
163     public static final int AFTER_DOCTYPE_SYSTEM_KEYWORD = 45;
164 
165     public static final int CONSUME_CHARACTER_REFERENCE = 46;
166 
167     public static final int CONSUME_NCR = 47;
168 
169     public static final int CHARACTER_REFERENCE_TAIL = 48;
170 
171     public static final int HEX_NCR_LOOP = 49;
172 
173     public static final int DECIMAL_NRC_LOOP = 50;
174 
175     public static final int HANDLE_NCR_VALUE = 51;
176 
177     public static final int HANDLE_NCR_VALUE_RECONSUME = 52;
178 
179     public static final int CHARACTER_REFERENCE_HILO_LOOKUP = 53;
180 
181     public static final int SELF_CLOSING_START_TAG = 54;
182 
183     public static final int CDATA_START = 55;
184 
185     public static final int CDATA_SECTION = 56;
186 
187     public static final int CDATA_RSQB = 57;
188 
189     public static final int CDATA_RSQB_RSQB = 58;
190 
191     public static final int SCRIPT_DATA_LESS_THAN_SIGN = 59;
192 
193     public static final int SCRIPT_DATA_ESCAPE_START = 60;
194 
195     public static final int SCRIPT_DATA_ESCAPE_START_DASH = 61;
196 
197     public static final int SCRIPT_DATA_ESCAPED_DASH = 62;
198 
199     public static final int SCRIPT_DATA_ESCAPED_DASH_DASH = 63;
200 
201     public static final int BOGUS_COMMENT_HYPHEN = 64;
202 
203     public static final int RAWTEXT_RCDATA_LESS_THAN_SIGN = 65;
204 
205     public static final int SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN = 66;
206 
207     public static final int SCRIPT_DATA_DOUBLE_ESCAPE_START = 67;
208 
209     public static final int SCRIPT_DATA_DOUBLE_ESCAPED = 68;
210 
211     public static final int SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN = 69;
212 
213     public static final int SCRIPT_DATA_DOUBLE_ESCAPED_DASH = 70;
214 
215     public static final int SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH = 71;
216 
217     public static final int SCRIPT_DATA_DOUBLE_ESCAPE_END = 72;
218 
219     public static final int PROCESSING_INSTRUCTION = 73;
220 
221     public static final int PROCESSING_INSTRUCTION_QUESTION_MARK = 74;
222 
223     /**
224      * Magic value for UTF-16 operations.
225      */
226     private static final int LEAD_OFFSET = (0xD800 - (0x10000 >> 10));
227 
228     /**
229      * UTF-16 code unit array containing less than and greater than for emitting
230      * those characters on certain parse errors.
231      */
232     private static final @NoLength char[] LT_GT = { '<', '>' };
233 
234     /**
235      * UTF-16 code unit array containing less than and solidus for emitting
236      * those characters on certain parse errors.
237      */
238     private static final @NoLength char[] LT_SOLIDUS = { '<', '/' };
239 
240     /**
241      * UTF-16 code unit array containing ]] for emitting those characters on
242      * state transitions.
243      */
244     private static final @NoLength char[] RSQB_RSQB = { ']', ']' };
245 
246     /**
247      * Array version of U+FFFD.
248      */
249     private static final @NoLength char[] REPLACEMENT_CHARACTER = { '\uFFFD' };
250 
251     // [NOCPP[
252 
253     /**
254      * Array version of space.
255      */
256     private static final @NoLength char[] SPACE = { ' ' };
257 
258     // ]NOCPP]
259 
260     /**
261      * Array version of line feed.
262      */
263     private static final @NoLength char[] LF = { '\n' };
264 
265     /**
266      * "CDATA[" as <code>char[]</code>
267      */
268     private static final @NoLength char[] CDATA_LSQB = { 'C', 'D', 'A', 'T',
269             'A', '[' };
270 
271     /**
272      * "octype" as <code>char[]</code>
273      */
274     private static final @NoLength char[] OCTYPE = { 'o', 'c', 't', 'y', 'p',
275             'e' };
276 
277     /**
278      * "ublic" as <code>char[]</code>
279      */
280     private static final @NoLength char[] UBLIC = { 'u', 'b', 'l', 'i', 'c' };
281 
282     /**
283      * "ystem" as <code>char[]</code>
284      */
285     private static final @NoLength char[] YSTEM = { 'y', 's', 't', 'e', 'm' };
286 
287     private static final char[] TITLE_ARR = { 't', 'i', 't', 'l', 'e' };
288 
289     private static final char[] SCRIPT_ARR = { 's', 'c', 'r', 'i', 'p', 't' };
290 
291     private static final char[] STYLE_ARR = { 's', 't', 'y', 'l', 'e' };
292 
293     private static final char[] PLAINTEXT_ARR = { 'p', 'l', 'a', 'i', 'n', 't',
294             'e', 'x', 't' };
295 
296     private static final char[] XMP_ARR = { 'x', 'm', 'p' };
297 
298     private static final char[] TEXTAREA_ARR = { 't', 'e', 'x', 't', 'a', 'r',
299             'e', 'a' };
300 
301     private static final char[] IFRAME_ARR = { 'i', 'f', 'r', 'a', 'm', 'e' };
302 
303     private static final char[] NOEMBED_ARR = { 'n', 'o', 'e', 'm', 'b', 'e',
304             'd' };
305 
306     private static final char[] NOSCRIPT_ARR = { 'n', 'o', 's', 'c', 'r', 'i',
307             'p', 't' };
308 
309     private static final char[] NOFRAMES_ARR = { 'n', 'o', 'f', 'r', 'a', 'm',
310             'e', 's' };
311 
312     /**
313      * The token handler.
314      */
315     protected final TokenHandler tokenHandler;
316 
317     protected EncodingDeclarationHandler encodingDeclarationHandler;
318 
319     // [NOCPP[
320 
321     /**
322      * The error handler.
323      */
324     protected ErrorHandler errorHandler;
325 
326     // ]NOCPP]
327 
328     /**
329      * Whether the previous char read was CR.
330      */
331     protected boolean lastCR;
332 
333     protected int stateSave;
334 
335     private int returnStateSave;
336 
337     protected int index;
338 
339     private boolean forceQuirks;
340 
341     private char additional;
342 
343     private int entCol;
344 
345     private int firstCharKey;
346 
347     private int lo;
348 
349     private int hi;
350 
351     private int candidate;
352 
353     private int charRefBufMark;
354 
355     protected int value;
356 
357     private boolean seenDigits;
358 
359     protected int cstart;
360 
361     /**
     * The SAX public id for the resource being tokenized. (Only passed back
     * as part of locator data.)
364      */
365     private String publicId;
366 
367     /**
     * The SAX system id for the resource being tokenized. (Only passed back
     * as part of locator data.)
370      */
371     private String systemId;
372 
373     /**
374      * Buffer for bufferable things other than those that fit the description
375      * of <code>charRefBuf</code>.
376      */
377     private @Auto char[] strBuf;
378 
379     /**
380      * Number of significant <code>char</code>s in <code>strBuf</code>.
381      */
382     private int strBufLen;
383 
384     /**
385      * Buffer for characters that might form a character reference but may
386      * end up not forming one.
387      */
388     private final @Auto char[] charRefBuf;
389 
390     /**
391      * Number of significant <code>char</code>s in <code>charRefBuf</code>.
392      */
393     private int charRefBufLen;
394 
395     /**
396      * Buffer for expanding NCRs falling into the Basic Multilingual Plane.
397      */
398     private final @Auto char[] bmpChar;
399 
400     /**
401      * Buffer for expanding astral NCRs.
402      */
403     private final @Auto char[] astralChar;
404 
405     /**
406      * The element whose end tag closes the current CDATA or RCDATA element.
407      */
408     protected ElementName endTagExpectation = null;
409 
410     private char[] endTagExpectationAsArray; // not @Auto!
411 
412     /**
413      * <code>true</code> if tokenizing an end tag
414      */
415     protected boolean endTag;
416 
417     /**
418      * The current tag token name.
419      */
420     private ElementName tagName = null;
421 
422     /**
423      * The current attribute name.
424      */
425     protected AttributeName attributeName = null;
426 
427     // [NOCPP[
428 
429     /**
430      * Whether comment tokens are emitted.
431      */
432     private boolean wantsComments = false;
433 
434     /**
435      * <code>true</code> when HTML4-specific additional errors are requested.
436      */
437     protected boolean html4;
438 
439     /**
440      * Whether the stream is past the first 1024 bytes.
441      */
442     private boolean metaBoundaryPassed;
443 
444     // ]NOCPP]
445 
446     /**
447      * The name of the current doctype token.
448      */
449     private @Local String doctypeName;
450 
451     /**
452      * The public id of the current doctype token.
453      */
454     private String publicIdentifier;
455 
456     /**
457      * The system id of the current doctype token.
458      */
459     private String systemIdentifier;
460 
461     /**
462      * The attribute holder.
463      */
464     private HtmlAttributes attributes;
465 
466     // [NOCPP[
467 
468     /**
469      * The policy for vertical tab and form feed.
470      */
471     private XmlViolationPolicy contentSpacePolicy = XmlViolationPolicy.ALTER_INFOSET;
472 
473     /**
474      * The policy for comments.
475      */
476     private XmlViolationPolicy commentPolicy = XmlViolationPolicy.ALTER_INFOSET;
477 
478     private XmlViolationPolicy xmlnsPolicy = XmlViolationPolicy.ALTER_INFOSET;
479 
480     private XmlViolationPolicy namePolicy = XmlViolationPolicy.ALTER_INFOSET;
481 
482     private boolean html4ModeCompatibleWithXhtml1Schemata;
483 
484     private int mappingLangToXmlLang;
485 
486     // ]NOCPP]
487 
488     private final boolean newAttributesEachTime;
489 
490     private boolean shouldSuspend;
491 
492     protected boolean confident;
493 
494     private int line;
495 
496     /*
497      * The line number of the current attribute. First set to the line of the
498      * attribute name and if there is a value, set to the line the value
499      * started on.
500      */
501     // CPPONLY: private int attributeLine;
502 
503     private Interner interner;
504 
505     // CPPONLY: private boolean viewingXmlSource;
506 
507     // [NOCPP[
508 
509     protected LocatorImpl ampersandLocation;
510 
Tokenizer(TokenHandler tokenHandler, boolean newAttributesEachTime)511     public Tokenizer(TokenHandler tokenHandler, boolean newAttributesEachTime) {
512         this.tokenHandler = tokenHandler;
513         this.encodingDeclarationHandler = null;
514         this.newAttributesEachTime = newAttributesEachTime;
515         // &CounterClockwiseContourIntegral; is the longest valid char ref and
516         // the semicolon never gets appended to the buffer.
517         this.charRefBuf = new char[32];
518         this.bmpChar = new char[1];
519         this.astralChar = new char[2];
520         this.tagName = null;
521         this.attributeName = null;
522         this.doctypeName = null;
523         this.publicIdentifier = null;
524         this.systemIdentifier = null;
525         this.attributes = null;
526     }
527 
528     // ]NOCPP]
529 
    /**
     * The constructor. Stores the token handler, allocates the fixed-size
     * character reference buffers and clears all per-token state. In the
     * Java code path <code>newAttributesEachTime</code> is set to
     * <code>false</code> and <code>attributes</code> starts out as
     * <code>null</code>; the <code>CPPONLY</code> lines show the values used
     * by the C++ code path instead.
     *
     * @param tokenHandler
     *            the handler for receiving tokens
     */
    public Tokenizer(TokenHandler tokenHandler
    // CPPONLY: , boolean viewingXmlSource
    ) {
        this.tokenHandler = tokenHandler;
        this.encodingDeclarationHandler = null;
        // [NOCPP[
        this.newAttributesEachTime = false;
        // ]NOCPP]
        // &CounterClockwiseContourIntegral; is the longest valid char ref and
        // the semicolon never gets appended to the buffer.
        this.charRefBuf = new char[32];
        this.bmpChar = new char[1];
        this.astralChar = new char[2];
        this.tagName = null;
        this.attributeName = null;
        this.doctypeName = null;
        this.publicIdentifier = null;
        this.systemIdentifier = null;
        // [NOCPP[
        this.attributes = null;
        // ]NOCPP]
        // CPPONLY: this.attributes = tokenHandler.HasBuilder() ? new HtmlAttributes(mappingLangToXmlLang) : null;
        // CPPONLY: this.newAttributesEachTime = !tokenHandler.HasBuilder();
        // CPPONLY: this.viewingXmlSource = viewingXmlSource;
    }
561 
    /**
     * Sets the interner that this tokenizer hands to name-creation helpers
     * (for example <code>ElementName.elementNameByBuffer</code> and
     * <code>Portability.newLocalNameFromBuffer</code>).
     *
     * @param interner
     *            the interner to use
     */
    public void setInterner(Interner interner) {
        this.interner = interner;
    }
565 
initLocation(String newPublicId, String newSystemId)566     public void initLocation(String newPublicId, String newSystemId) {
567         this.systemId = newSystemId;
568         this.publicId = newPublicId;
569 
570     }
571 
572     // CPPONLY: boolean isViewingXmlSource() {
573     // CPPONLY: return viewingXmlSource;
574     // CPPONLY: }
575 
576     // [NOCPP[
577 
578     /**
579      * Returns the mappingLangToXmlLang.
580      *
581      * @return the mappingLangToXmlLang
582      */
isMappingLangToXmlLang()583     public boolean isMappingLangToXmlLang() {
584         return mappingLangToXmlLang == AttributeName.HTML_LANG;
585     }
586 
587     /**
588      * Sets the mappingLangToXmlLang.
589      *
590      * @param mappingLangToXmlLang
591      *            the mappingLangToXmlLang to set
592      */
setMappingLangToXmlLang(boolean mappingLangToXmlLang)593     public void setMappingLangToXmlLang(boolean mappingLangToXmlLang) {
594         this.mappingLangToXmlLang = mappingLangToXmlLang ? AttributeName.HTML_LANG
595                 : AttributeName.HTML;
596     }
597 
598     /**
599      * Sets the error handler.
600      *
601      * @see org.xml.sax.XMLReader#setErrorHandler(org.xml.sax.ErrorHandler)
602      */
setErrorHandler(ErrorHandler eh)603     public void setErrorHandler(ErrorHandler eh) {
604         this.errorHandler = eh;
605     }
606 
getErrorHandler()607     public ErrorHandler getErrorHandler() {
608         return this.errorHandler;
609     }
610 
    /**
     * Sets the commentPolicy, i.e. the policy that the comment-hyphen helpers
     * of this class (such as <code>maybeAppendSpaceToBogusComment()</code>)
     * switch on when comment content is not mappable to XML 1.0.
     *
     * @param commentPolicy
     *            the commentPolicy to set
     */
    public void setCommentPolicy(XmlViolationPolicy commentPolicy) {
        this.commentPolicy = commentPolicy;
    }
620 
621     /**
622      * Sets the contentNonXmlCharPolicy.
623      *
624      * @param contentNonXmlCharPolicy
625      *            the contentNonXmlCharPolicy to set
626      */
setContentNonXmlCharPolicy( XmlViolationPolicy contentNonXmlCharPolicy)627     public void setContentNonXmlCharPolicy(
628             XmlViolationPolicy contentNonXmlCharPolicy) {
629         if (contentNonXmlCharPolicy != XmlViolationPolicy.ALLOW) {
630             throw new IllegalArgumentException(
631                     "Must use ErrorReportingTokenizer to set contentNonXmlCharPolicy to non-ALLOW.");
632         }
633     }
634 
    /**
     * Sets the contentSpacePolicy (the policy for vertical tab and form
     * feed). The value is stored as given.
     *
     * @param contentSpacePolicy
     *            the contentSpacePolicy to set
     */
    public void setContentSpacePolicy(XmlViolationPolicy contentSpacePolicy) {
        this.contentSpacePolicy = contentSpacePolicy;
    }
644 
645     /**
646      * Sets the xmlnsPolicy.
647      *
648      * @param xmlnsPolicy
649      *            the xmlnsPolicy to set
650      */
setXmlnsPolicy(XmlViolationPolicy xmlnsPolicy)651     public void setXmlnsPolicy(XmlViolationPolicy xmlnsPolicy) {
652         if (xmlnsPolicy == XmlViolationPolicy.FATAL) {
653             throw new IllegalArgumentException("Can't use FATAL here.");
654         }
655         this.xmlnsPolicy = xmlnsPolicy;
656     }
657 
    /**
     * Sets the namePolicy. The value is stored as given; no validation is
     * performed.
     *
     * @param namePolicy
     *            the namePolicy to set
     */
    public void setNamePolicy(XmlViolationPolicy namePolicy) {
        this.namePolicy = namePolicy;
    }
661 
    /**
     * Sets the html4ModeCompatibleWithXhtml1Schemata flag. The value is
     * stored as given.
     *
     * @param html4ModeCompatibleWithXhtml1Schemata
     *            the html4ModeCompatibleWithXhtml1Schemata to set
     */
    public void setHtml4ModeCompatibleWithXhtml1Schemata(
            boolean html4ModeCompatibleWithXhtml1Schemata) {
        this.html4ModeCompatibleWithXhtml1Schemata = html4ModeCompatibleWithXhtml1Schemata;
    }
672 
673     // ]NOCPP]
674 
675     // For the token handler to call
    /**
     * Sets the tokenizer state and the associated element name. This should
     * only ever be used to put the tokenizer into one of the states that have
     * a special end tag expectation.
     *
     * <p>
     * When the state is <code>DATA</code>, only the state is stored and the
     * previously recorded end tag expectation is left untouched. Otherwise
     * the name is resolved to an <code>ElementName</code> using the current
     * interner and the matching <code>char[]</code> form is cached.
     *
     * @param specialTokenizerState
     *            the tokenizer state to set
     * @param endTagExpectation
     *            the expected end tag for transitioning back to normal
     */
    public void setStateAndEndTagExpectation(int specialTokenizerState,
            @Local String endTagExpectation) {
        this.stateSave = specialTokenizerState;
        if (specialTokenizerState == Tokenizer.DATA) {
            // Nothing to expect in the data state; skip the name lookup.
            return;
        }
        @Auto char[] asArray = Portability.newCharArrayFromLocal(endTagExpectation);
        this.endTagExpectation = ElementName.elementNameByBuffer(asArray, 0,
                asArray.length, interner);
        endTagExpectationToArray();
    }
697 
698     /**
699      * Sets the tokenizer state and the associated element name. This should
700      * only ever used to put the tokenizer into one of the states that have
701      * a special end tag expectation.
702      *
703      * @param specialTokenizerState
704      *            the tokenizer state to set
705      * @param endTagExpectation
706      *            the expected end tag for transitioning back to normal
707      */
setStateAndEndTagExpectation(int specialTokenizerState, ElementName endTagExpectation)708     public void setStateAndEndTagExpectation(int specialTokenizerState,
709             ElementName endTagExpectation) {
710         this.stateSave = specialTokenizerState;
711         this.endTagExpectation = endTagExpectation;
712         endTagExpectationToArray();
713     }
714 
endTagExpectationToArray()715     private void endTagExpectationToArray() {
716         switch (endTagExpectation.getGroup()) {
717             case TreeBuilder.TITLE:
718                 endTagExpectationAsArray = TITLE_ARR;
719                 return;
720             case TreeBuilder.SCRIPT:
721                 endTagExpectationAsArray = SCRIPT_ARR;
722                 return;
723             case TreeBuilder.STYLE:
724                 endTagExpectationAsArray = STYLE_ARR;
725                 return;
726             case TreeBuilder.PLAINTEXT:
727                 endTagExpectationAsArray = PLAINTEXT_ARR;
728                 return;
729             case TreeBuilder.XMP:
730                 endTagExpectationAsArray = XMP_ARR;
731                 return;
732             case TreeBuilder.TEXTAREA:
733                 endTagExpectationAsArray = TEXTAREA_ARR;
734                 return;
735             case TreeBuilder.IFRAME:
736                 endTagExpectationAsArray = IFRAME_ARR;
737                 return;
738             case TreeBuilder.NOEMBED:
739                 endTagExpectationAsArray = NOEMBED_ARR;
740                 return;
741             case TreeBuilder.NOSCRIPT:
742                 endTagExpectationAsArray = NOSCRIPT_ARR;
743                 return;
744             case TreeBuilder.NOFRAMES:
745                 endTagExpectationAsArray = NOFRAMES_ARR;
746                 return;
747             default:
748                 assert false: "Bad end tag expectation.";
749                 return;
750         }
751     }
752 
    /**
     * Overwrites the line number that <code>getLineNumber()</code> reports.
     * For C++ use only.
     *
     * @param line
     *            the line number to store
     */
    public void setLineNumber(int line) {
        // CPPONLY: this.attributeLine = line; // XXX is this needed?
        this.line = line;
    }
760 
761     // start Locator impl
762 
    /**
     * Returns the current value of the <code>line</code> field.
     *
     * @return the line number currently stored in this tokenizer
     * @see org.xml.sax.Locator#getLineNumber()
     */
    @Inline public int getLineNumber() {
        return line;
    }
769 
770     // [NOCPP[
771 
    /**
     * This class does not report a column; the method always returns -1,
     * which the SAX <code>Locator</code> contract defines as "not
     * available".
     *
     * @return always -1
     * @see org.xml.sax.Locator#getColumnNumber()
     */
    @Inline public int getColumnNumber() {
        return -1;
    }
778 
779     /**
780      * @see org.xml.sax.Locator#getPublicId()
781      */
getPublicId()782     public String getPublicId() {
783         return publicId;
784     }
785 
786     /**
787      * @see org.xml.sax.Locator#getSystemId()
788      */
getSystemId()789     public String getSystemId() {
790         return systemId;
791     }
792 
793     // end Locator impl
794 
795     // end public API
796 
notifyAboutMetaBoundary()797     public void notifyAboutMetaBoundary() {
798         metaBoundaryPassed = true;
799     }
800 
turnOnAdditionalHtml4Errors()801     void turnOnAdditionalHtml4Errors() {
802         html4 = true;
803     }
804 
805     // ]NOCPP]
806 
    /**
     * Returns an attribute holder with no attributes. In the Java code path
     * this is a newly allocated <code>HtmlAttributes</code> (created with the
     * current <code>mappingLangToXmlLang</code> mode) when
     * <code>newAttributesEachTime</code> is set, and the shared
     * <code>HtmlAttributes.EMPTY_ATTRIBUTES</code> instance otherwise. Only
     * the shared-instance return lies outside the <code>NOCPP</code>
     * regions.
     *
     * @return an empty attribute holder
     */
    HtmlAttributes emptyAttributes() {
        // [NOCPP[
        if (newAttributesEachTime) {
            return new HtmlAttributes(mappingLangToXmlLang);
        } else {
            // ]NOCPP]
            return HtmlAttributes.EMPTY_ATTRIBUTES;
            // [NOCPP[
        }
        // ]NOCPP]
    }
818 
    /**
     * Appends one code unit to <code>charRefBuf</code> and advances
     * <code>charRefBufLen</code>. The Java code path performs no explicit
     * capacity check of its own here.
     *
     * @param c
     *            the UTF-16 code unit to append
     */
    @Inline private void appendCharRefBuf(char c) {
        // CPPONLY: assert charRefBufLen < charRefBuf.length:
        // CPPONLY:     "RELEASE: Attempted to overrun charRefBuf!";
        charRefBuf[charRefBufLen++] = c;
    }
824 
emitOrAppendCharRefBuf(int returnState)825     private void emitOrAppendCharRefBuf(int returnState) throws SAXException {
826         if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
827             appendCharRefBufToStrBuf();
828         } else {
829             if (charRefBufLen > 0) {
830                 tokenHandler.characters(charRefBuf, 0, charRefBufLen);
831                 charRefBufLen = 0;
832             }
833         }
834     }
835 
    /**
     * Marks <code>strBuf</code> as empty by resetting <code>strBufLen</code>
     * to zero. The array itself is not modified.
     */
    @Inline private void clearStrBufAfterUse() {
        strBufLen = 0;
    }
839 
    /**
     * Prepares <code>strBuf</code> for a new use. Asserts that the previous
     * user already reset <code>strBufLen</code>, then resets it again
     * defensively.
     */
    @Inline private void clearStrBufBeforeUse() {
        assert strBufLen == 0: "strBufLen not reset after previous use!";
        strBufLen = 0; // no-op in the absence of bugs
    }
844 
    /**
     * Empties <code>strBuf</code> in a situation where it is expected to
     * hold exactly one hyphen; both expectations are checked with
     * assertions before <code>strBufLen</code> is reset to zero.
     */
    @Inline private void clearStrBufAfterOneHyphen() {
        assert strBufLen == 1: "strBufLen length not one!";
        assert strBuf[0] == '-': "strBuf does not start with a hyphen!";
        strBufLen = 0;
    }
850 
    /**
     * Appends one code unit to <code>strBuf</code> and advances
     * <code>strBufLen</code>. The Java code path does not grow the buffer
     * here, so <code>strBuf</code> must already have room for the code unit;
     * the <code>CPPONLY</code> lines show the capacity handling of the C++
     * code path.
     *
     * @param c
     *            the UTF-16 code unit to append
     */
    @Inline private void appendStrBuf(char c) {
        // CPPONLY: assert strBufLen < strBuf.length: "Previous buffer length insufficient.";
        // CPPONLY: if (strBufLen == strBuf.length) {
        // CPPONLY:     if (!EnsureBufferSpace(1)) {
        // CPPONLY:         assert false: "RELEASE: Unable to recover from buffer reallocation failure";
        // CPPONLY:     } // TODO: Add telemetry when outer if fires but inner does not
        // CPPONLY: }
        strBuf[strBufLen++] = c;
    }
866 
    /**
     * The buffer as a String. Currently only used for error reporting.
     * As a side effect, the buffer is emptied (<code>strBufLen</code> is
     * reset to zero) after the string has been created.
     *
     * <p>
     * C++ memory note: The return value must be released.
     *
     * @return the buffer as a string
     */
    protected String strBufToString() {
        String str = Portability.newStringFromBuffer(strBuf, 0, strBufLen
            // CPPONLY: , tokenHandler
        );
        clearStrBufAfterUse();
        return str;
    }
882 
    /**
     * Converts the buffer to a local name using the current interner, stores
     * it in <code>doctypeName</code> and empties the buffer. Nothing is
     * returned; the stored name is released in emitDoctypeToken().
     */
    private void strBufToDoctypeName() {
        doctypeName = Portability.newLocalNameFromBuffer(strBuf, 0, strBufLen,
                interner);
        clearStrBufAfterUse();
    }
894 
895     /**
896      * Emits the buffer as character tokens.
897      *
898      * @throws SAXException
899      *             if the token handler threw
900      */
emitStrBuf()901     private void emitStrBuf() throws SAXException {
902         if (strBufLen > 0) {
903             tokenHandler.characters(strBuf, 0, strBufLen);
904             clearStrBufAfterUse();
905         }
906     }
907 
    /**
     * Appends a hyphen to <code>strBuf</code>. In the Java code path this is
     * governed by <code>commentPolicy</code>:
     * <code>ALTER_INFOSET</code> first appends a space, then behaves like
     * <code>ALLOW</code>; <code>ALLOW</code> issues a warning and appends the
     * hyphen; <code>FATAL</code> calls <code>fatal</code> and appends
     * nothing. Only the hyphen append lies outside the <code>NOCPP</code>
     * regions.
     *
     * @throws SAXException
     *             if reporting the warning or the fatal error threw
     */
    @Inline private void appendSecondHyphenToBogusComment() throws SAXException {
        // [NOCPP[
        switch (commentPolicy) {
            case ALTER_INFOSET:
                appendStrBuf(' ');
                // FALLTHROUGH
            case ALLOW:
                warn("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment.");
                // ]NOCPP]
                appendStrBuf('-');
                // [NOCPP[
                break;
            case FATAL:
                fatal("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment.");
                break;
        }
        // ]NOCPP]
    }
926 
927     // [NOCPP[
maybeAppendSpaceToBogusComment()928     private void maybeAppendSpaceToBogusComment() throws SAXException {
929         switch (commentPolicy) {
930             case ALTER_INFOSET:
931                 appendStrBuf(' ');
932                 // FALLTHROUGH
933             case ALLOW:
934                 warn("The document is not mappable to XML 1.0 due to a trailing hyphen in a comment.");
935                 break;
936             case FATAL:
937                 fatal("The document is not mappable to XML 1.0 due to a trailing hyphen in a comment.");
938                 break;
939         }
940     }
941 
942     // ]NOCPP]
943 
    /**
     * Reports consecutive hyphens via <code>errConsecutiveHyphens()</code>
     * and appends <code>c</code> to the buffer, applying
     * <code>commentPolicy</code> in the code delimited by the NOCPP markers.
     * The <code>errConsecutiveHyphens()</code> and
     * <code>appendStrBuf(c)</code> calls sit outside those markers.
     *
     * @param c
     *            the character to append
     * @throws SAXException
     *             from the error reporting methods
     */
    @Inline private void adjustDoubleHyphenAndAppendToStrBufAndErr(char c)
            throws SAXException {
        errConsecutiveHyphens();
        // [NOCPP[
        switch (commentPolicy) {
            case ALTER_INFOSET:
                // Drop the last buffered char and put a space followed by a
                // hyphen in its place (presumably the dropped char is the
                // previous hyphen - confirm against the callers).
                strBufLen--;
                // WARNING!!! This expands the worst case of the buffer length
                // given the length of input!
                appendStrBuf(' ');
                appendStrBuf('-');
                // FALLTHROUGH
            case ALLOW:
                warn("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment.");
                // ]NOCPP]
                appendStrBuf(c);
                // [NOCPP[
                break;
            case FATAL:
                // fatal() always throws, so c is not appended in this case.
                fatal("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment.");
                break;
        }
        // ]NOCPP]
    }
968 
    /**
     * Appends a range of chars to the end of <code>strBuf</code> and advances
     * <code>strBufLen</code> accordingly. The Java code path does not grow
     * <code>strBuf</code> here; the only growth logic is in the CPPONLY lines.
     *
     * @param buffer
     *            the array to copy from
     * @param offset
     *            the index in <code>buffer</code> of the first char to copy
     * @param length
     *            the number of chars to copy
     */
    private void appendStrBuf(@NoLength char[] buffer, int offset, int length) {
        // newLen is also referenced by the CPPONLY lines below.
        int newLen = strBufLen + length;
        // CPPONLY: assert newLen <= strBuf.length: "Previous buffer length insufficient.";
        // CPPONLY: if (strBuf.length < newLen) {
        // CPPONLY:     if (!EnsureBufferSpace(length)) {
        // CPPONLY:         assert false: "RELEASE: Unable to recover from buffer reallocation failure";
        // CPPONLY:     } // TODO: Add telemetry when outer if fires but inner does not
        // CPPONLY: }
        // Copy behind the chars that are already buffered.
        System.arraycopy(buffer, offset, strBuf, strBufLen, length);
        strBufLen = newLen;
    }
980 
981     /**
982      * Append the contents of the char reference buffer to the main one.
983      */
appendCharRefBufToStrBuf()984     @Inline private void appendCharRefBufToStrBuf() {
985         appendStrBuf(charRefBuf, 0, charRefBufLen);
986         charRefBufLen = 0;
987     }
988 
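    /**
     * Emits the current comment token.
     *
     * @param provisionalHyphens
     *            the number of chars at the end of the buffer that are left
     *            out of the emitted comment
     * @param pos
     *            the position in the input buffer; <code>cstart</code> is set
     *            to the position right after it
     *
     * @throws SAXException
     *             if the token handler threw
     */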
    private void emitComment(int provisionalHyphens, int pos)
            throws SAXException {
        // The wantsComments check is inside the NOCPP markers; the call to
        // tokenHandler.comment() itself is outside of them.
        // [NOCPP[
        if (wantsComments) {
            // ]NOCPP]
            // The last provisionalHyphens chars of the buffer are excluded
            // from the reported comment.
            tokenHandler.comment(strBuf, 0, strBufLen
                    - provisionalHyphens);
            // [NOCPP[
        }
        // ]NOCPP]
        // The buffer is released whether or not the comment was reported.
        clearStrBufAfterUse();
        cstart = pos + 1;
    }
1010 
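    /**
     * Flushes coalesced character tokens.
     *
     * @param buf
     *            the buffer holding the characters to flush
     * @param pos
     *            the index in <code>buf</code> at which the run of characters
     *            ends (exclusive); nothing is emitted unless it is greater
     *            than <code>cstart</code>
     *
     * @throws SAXException
     *             if the token handler threw
     */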
flushChars(@oLength char[] buf, int pos)1021     protected void flushChars(@NoLength char[] buf, int pos)
1022             throws SAXException {
1023         if (pos > cstart) {
1024             tokenHandler.characters(buf, cstart, pos - cstart);
1025         }
1026         cstart = Integer.MAX_VALUE;
1027     }
1028 
1029     /**
     * Reports a condition that would make the infoset incompatible with XML
1031      * 1.0 as fatal.
1032      *
1033      * @param message
1034      *            the message
1035      * @throws SAXException
1036      * @throws SAXParseException
1037      */
fatal(String message)1038     public void fatal(String message) throws SAXException {
1039         SAXParseException spe = new SAXParseException(message, this);
1040         if (errorHandler != null) {
1041             errorHandler.fatalError(spe);
1042         }
1043         throw spe;
1044     }
1045 
1046     /**
1047      * Reports a Parse Error.
1048      *
1049      * @param message
1050      *            the message
1051      * @throws SAXException
1052      */
err(String message)1053     public void err(String message) throws SAXException {
1054         if (errorHandler == null) {
1055             return;
1056         }
1057         SAXParseException spe = new SAXParseException(message, this);
1058         errorHandler.error(spe);
1059     }
1060 
errTreeBuilder(String message)1061     public void errTreeBuilder(String message) throws SAXException {
1062         ErrorHandler eh = null;
1063         if (tokenHandler instanceof TreeBuilder<?>) {
1064             TreeBuilder<?> treeBuilder = (TreeBuilder<?>) tokenHandler;
1065             eh = treeBuilder.getErrorHandler();
1066         }
1067         if (eh == null) {
1068             eh = errorHandler;
1069         }
1070         if (eh == null) {
1071             return;
1072         }
1073         SAXParseException spe = new SAXParseException(message, this);
1074         eh.error(spe);
1075     }
1076 
1077     /**
1078      * Reports a warning
1079      *
1080      * @param message
1081      *            the message
1082      * @throws SAXException
1083      */
warn(String message)1084     public void warn(String message) throws SAXException {
1085         if (errorHandler == null) {
1086             return;
1087         }
1088         SAXParseException spe = new SAXParseException(message, this);
1089         errorHandler.warning(spe);
1090     }
1091 
strBufToElementNameString()1092     private void strBufToElementNameString() {
1093         tagName = ElementName.elementNameByBuffer(strBuf, 0, strBufLen,
1094                 interner);
1095         clearStrBufAfterUse();
1096     }
1097 
    /**
     * Emits the tag token that has been built up (<code>tagName</code> plus
     * <code>attributes</code>) to the token handler, as an end tag when
     * <code>endTag</code> is set and as a start tag otherwise, then resets
     * the per-tag fields.
     *
     * @param selfClosing
     *            passed to <code>maybeErrSlashInEndTag()</code> and, for
     *            start tags, on to <code>tokenHandler.startTag()</code>
     * @param pos
     *            position in the input buffer; <code>cstart</code> is set to
     *            <code>pos + 1</code>
     * @return the value of <code>stateSave</code> after the token handler
     *         has been called
     * @throws SAXException
     *             if error reporting or the token handler throws
     */
    private int emitCurrentTagToken(boolean selfClosing, int pos)
            throws SAXException {
        cstart = pos + 1;
        maybeErrSlashInEndTag(selfClosing);
        // Default follow-up state; see the comment before the return.
        stateSave = Tokenizer.DATA;
        // A tag without attributes is reported with the shared empty
        // attributes object.
        HtmlAttributes attrs = (attributes == null ? HtmlAttributes.EMPTY_ATTRIBUTES
                : attributes);
        if (endTag) {
            /*
             * When an end tag token is emitted, the content model flag must be
             * switched to the PCDATA state.
             */
            maybeErrAttributesOnEndTag(attrs);
            // CPPONLY: if (!viewingXmlSource) {
            tokenHandler.endTag(tagName);
            // CPPONLY: }
            // CPPONLY: if (newAttributesEachTime) {
            // CPPONLY:   Portability.delete(attributes);
            // CPPONLY:   attributes = null;
            // CPPONLY: }
        } else {
            // CPPONLY: if (viewingXmlSource) {
            // CPPONLY:   assert newAttributesEachTime;
            // CPPONLY:   Portability.delete(attributes);
            // CPPONLY:   attributes = null;
            // CPPONLY: } else {
            tokenHandler.startTag(tagName, attrs, selfClosing);
            // CPPONLY: }
        }
        tagName.release();
        tagName = null;
        if (newAttributesEachTime) {
            attributes = null;
        } else {
            // NOTE(review): attributes is dereferenced here although the
            // null check above allows for it being null; presumably it is
            // never null when newAttributesEachTime is false - confirm.
            attributes.clear(mappingLangToXmlLang);
        }
        /*
         * The token handler may have called setStateAndEndTagExpectation
         * and changed stateSave since the start of this method.
         */
        return stateSave;
    }
1140 
    /**
     * Turns the buffer contents into <code>attributeName</code>, makes sure
     * an <code>attributes</code> object exists and drops the name again
     * (leaving <code>attributeName</code> null) when the attributes already
     * contain it.
     *
     * @throws SAXException
     *             from <code>errDuplicateAttribute()</code>
     */
    private void attributeNameComplete() throws SAXException {
        // The extra argument between the NOCPP markers is true unless
        // namePolicy is ALLOW.
        attributeName = AttributeName.nameByBuffer(strBuf, 0, strBufLen
        // [NOCPP[
                , namePolicy != XmlViolationPolicy.ALLOW
                // ]NOCPP]
                , interner);
        clearStrBufAfterUse();

        // Create the attribute holder on demand.
        if (attributes == null) {
            attributes = new HtmlAttributes(mappingLangToXmlLang);
        }

        /*
         * When the user agent leaves the attribute name state (and before
         * emitting the tag token, if appropriate), the complete attribute's
         * name must be compared to the other attributes on the same token; if
         * there is already an attribute on the token with the exact same name,
         * then this is a parse error and the new attribute must be dropped,
         * along with the value that gets associated with it (if any).
         */
        if (attributes.contains(attributeName)) {
            errDuplicateAttribute();
            attributeName.release();
            // A null attributeName marks the attribute as dropped for the
            // code that adds the attribute later.
            attributeName = null;
        }
    }
1167 
    /**
     * Adds the current attribute to <code>attributes</code> with a default
     * value, because no value was given for it. When
     * <code>attributeName</code> is null (which
     * <code>attributeNameComplete()</code> sets for duplicates), nothing is
     * added and only the buffer is released.
     *
     * @throws SAXException
     *             from the error reporting methods
     */
    private void addAttributeWithoutValue() throws SAXException {
        noteAttributeWithoutValue();

        // [NOCPP[
        if (metaBoundaryPassed && AttributeName.CHARSET == attributeName
                && ElementName.META == tagName) {
            err("A \u201Ccharset\u201D attribute on a \u201Cmeta\u201D element found after the first 512 bytes.");
        }
        // ]NOCPP]
        if (attributeName != null) {
            // [NOCPP[
            if (html4) {
                if (attributeName.isBoolean()) {
                    // Boolean attribute: the value is either the attribute's
                    // own local name or the empty string, depending on the
                    // XHTML 1 schema compatibility flag.
                    if (html4ModeCompatibleWithXhtml1Schemata) {
                        attributes.addAttribute(attributeName,
                                attributeName.getLocal(AttributeName.HTML),
                                xmlnsPolicy);
                    } else {
                        attributes.addAttribute(attributeName, "", xmlnsPolicy);
                    }
                } else {
                    // NOTE(review): when the attribute is BORDER, nothing is
                    // reported and nothing is added in this branch - confirm
                    // that dropping it is intended.
                    if (AttributeName.BORDER != attributeName) {
                        err("Attribute value omitted for a non-boolean attribute. (HTML4-only error.)");
                        attributes.addAttribute(attributeName, "", xmlnsPolicy);
                    }
                }
            } else {
                if (AttributeName.SRC == attributeName
                        || AttributeName.HREF == attributeName) {
                    warn("Attribute \u201C"
                            + attributeName.getLocal(AttributeName.HTML)
                            + "\u201D without an explicit value seen. The attribute may be dropped by IE7.");
                }
                // ]NOCPP]
                // This call is the only statement of the method body that
                // sits outside the NOCPP markers besides the bookkeeping
                // around it; its argument list differs per marker.
                attributes.addAttribute(attributeName,
                        Portability.newEmptyString()
                        // [NOCPP[
                        , xmlnsPolicy
                // ]NOCPP]
                // CPPONLY: , attributeLine
                );
                // [NOCPP[
            }
            // ]NOCPP]
            attributeName = null; // attributeName has been adopted by the
            // |attributes| object
        } else {
            // No attribute to add; just release the buffer.
            clearStrBufAfterUse();
        }
    }
1218 
    /**
     * Adds the current attribute to <code>attributes</code>, using the
     * buffer contents as its value. When <code>attributeName</code> is null
     * (a duplicate attribute), the buffered value is discarded instead.
     *
     * @throws SAXException
     *             from the error reporting methods
     */
    private void addAttributeWithValue() throws SAXException {
        // [NOCPP[
        if (metaBoundaryPassed && ElementName.META == tagName
                && AttributeName.CHARSET == attributeName) {
            err("A \u201Ccharset\u201D attribute on a \u201Cmeta\u201D element found after the first 512 bytes.");
        }
        // ]NOCPP]
        if (attributeName != null) {
            String val = strBufToString(); // Ownership transferred to
            // HtmlAttributes
            // CPPONLY: if (mViewSource) {
            // CPPONLY:   mViewSource.MaybeLinkifyAttributeValue(attributeName, val);
            // CPPONLY: }
            // [NOCPP[
            // In the HTML4 XHTML 1 schema compatibility mode, values of
            // attributes flagged as case-folded are ASCII-lowercased on
            // start tags.
            if (!endTag && html4 && html4ModeCompatibleWithXhtml1Schemata
                    && attributeName.isCaseFolded()) {
                val = newAsciiLowerCaseStringFromString(val);
            }
            // ]NOCPP]
            attributes.addAttribute(attributeName, val
            // [NOCPP[
                    , xmlnsPolicy
            // ]NOCPP]
            // CPPONLY: , attributeLine
            );
            attributeName = null; // attributeName has been adopted by the
            // |attributes| object
        } else {
            // We have a duplicate attribute. Explicitly discard its value.
            clearStrBufAfterUse();
        }
    }
1251 
1252     // [NOCPP[
1253 
newAsciiLowerCaseStringFromString(String str)1254     private static String newAsciiLowerCaseStringFromString(String str) {
1255         if (str == null) {
1256             return null;
1257         }
1258         char[] buf = new char[str.length()];
1259         for (int i = 0; i < str.length(); i++) {
1260             char c = str.charAt(i);
1261             if (c >= 'A' && c <= 'Z') {
1262                 c += 0x20;
1263             }
1264             buf[i] = c;
1265         }
1266         return new String(buf);
1267     }
1268 
    /**
     * Hook called from <code>start()</code> after the token handler has been
     * told that tokenization starts. This implementation does nothing; the
     * method is protected, so a subclass can override it.
     *
     * @throws SAXException
     *             never thrown by this implementation
     */
    protected void startErrorReporting() throws SAXException {

    }
1272 
1273     // ]NOCPP]
1274 
    /**
     * Starts tokenization: initializes the tokenizer, notifies the token
     * handler and, in the code between the NOCPP markers, calls the
     * <code>startErrorReporting()</code> hook.
     *
     * @throws SAXException
     *             if initialization, the token handler or the hook throws
     */
    public void start() throws SAXException {
        initializeWithoutStarting();
        tokenHandler.startTokenization(this);
        // [NOCPP[
        startErrorReporting();
        // ]NOCPP]
    }
1282 
    /**
     * Tokenizes the unconsumed part of <code>buffer</code>, resuming from the
     * state saved in <code>stateSave</code> and <code>returnStateSave</code>,
     * and then moves the start of the buffer past what the state loop
     * processed.
     *
     * @param buffer
     *            the buffer to tokenize, from <code>getStart()</code> up to
     *            but excluding <code>getEnd()</code>
     * @return the value of <code>lastCR</code> after the state loop returned
     * @throws SAXException
     *             if buffer space setup, the state loop or anything it calls
     *             throws
     */
    public boolean tokenizeBuffer(UTF16Buffer buffer) throws SAXException {
        int state = stateSave;
        int returnState = returnStateSave;
        char c = '\u0000';
        shouldSuspend = false;
        lastCR = false;

        int start = buffer.getStart();
        int end = buffer.getEnd();

        // In C++, the caller of tokenizeBuffer needs to do this explicitly.
        // [NOCPP[
        ensureBufferSpace(end - start);
        // ]NOCPP]

        /**
         * The index of the last <code>char</code> read from <code>buf</code>.
         * It starts one before the first unconsumed char because the state
         * loop increments it before reading.
         */
        int pos = start - 1;

        /**
         * <code>cstart</code> is the index of the first <code>char</code> in
         * <code>buf</code> that is part of a coalesced run of character
         * tokens or <code>Integer.MAX_VALUE</code> if there is not a current
         * run being coalesced. For the states listed in this switch a run is
         * considered to start at the beginning of the buffer; for every other
         * state no run is in progress.
         */
        switch (state) {
            case DATA:
            case RCDATA:
            case SCRIPT_DATA:
            case PLAINTEXT:
            case RAWTEXT:
            case CDATA_SECTION:
            case SCRIPT_DATA_ESCAPED:
            case SCRIPT_DATA_ESCAPE_START:
            case SCRIPT_DATA_ESCAPE_START_DASH:
            case SCRIPT_DATA_ESCAPED_DASH:
            case SCRIPT_DATA_ESCAPED_DASH_DASH:
            case SCRIPT_DATA_DOUBLE_ESCAPE_START:
            case SCRIPT_DATA_DOUBLE_ESCAPED:
            case SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN:
            case SCRIPT_DATA_DOUBLE_ESCAPED_DASH:
            case SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH:
            case SCRIPT_DATA_DOUBLE_ESCAPE_END:
                cstart = start;
                break;
            default:
                cstart = Integer.MAX_VALUE;
                break;
        }

        /**
         * The number of <code>char</code>s in <code>buf</code> that have
         * meaning. (The rest of the array is garbage and should not be
         * examined.)
         *
         * NOTE(review): this comment is not attached to a declaration; it
         * presumably describes the <code>end</code> value passed to the
         * state loop below - confirm.
         */
        // CPPONLY: if (mViewSource) {
        // CPPONLY:   mViewSource.SetBuffer(buffer);
        // CPPONLY:   pos = stateLoop(state, c, pos, buffer.getBuffer(), false, returnState, buffer.getEnd());
        // CPPONLY:   mViewSource.DropBuffer((pos == buffer.getEnd()) ? pos : pos + 1);
        // CPPONLY: } else {
        // CPPONLY:   pos = stateLoop(state, c, pos, buffer.getBuffer(), false, returnState, buffer.getEnd());
        // CPPONLY: }
        // [NOCPP[
        pos = stateLoop(state, c, pos, buffer.getBuffer(), false, returnState,
                end);
        // ]NOCPP]
        if (pos == end) {
            // exiting due to end of buffer
            buffer.setStart(pos);
        } else {
            // The state loop returned before the end of the buffer; the char
            // at pos has been handled, so the next call starts after it.
            buffer.setStart(pos + 1);
        }
        return lastCR;
    }
1358 
1359     // [NOCPP[
ensureBufferSpace(int inputLength)1360     private void ensureBufferSpace(int inputLength) throws SAXException {
1361         // Add 2 to account for emissions of LT_GT, LT_SOLIDUS and RSQB_RSQB.
1362         // Adding to the general worst case instead of only the
1363         // TreeBuilder-exposed worst case to avoid re-introducing a bug when
1364         // unifying the tokenizer and tree builder buffers in the future.
1365         int worstCase = strBufLen + inputLength + charRefBufLen + 2;
1366         tokenHandler.ensureBufferSpace(worstCase);
1367         if (commentPolicy == XmlViolationPolicy.ALTER_INFOSET) {
1368             // When altering infoset, if the comment contents are consecutive
1369             // hyphens, each hyphen generates a space, too. These buffer
1370             // contents never get emitted as characters() to the tokenHandler,
1371             // which is why this calculation happens after the call to
1372             // ensureBufferSpace on tokenHandler.
1373             worstCase *= 2;
1374         }
1375         if (strBuf == null) {
1376             // Add an arbitrary small value to avoid immediate reallocation
1377             // once there are a few characters in the buffer.
1378             strBuf = new char[worstCase + 128];
1379         } else if (worstCase > strBuf.length) {
1380             // HotSpot reportedly allocates memory with 8-byte accuracy, so
1381             // there's no point in trying to do math here to avoid slop.
1382             // Maybe we should add some small constant to worstCase here
1383             // but not doing that without profiling. In C++ with jemalloc,
1384             // the corresponding method should do math to round up here
1385             // to avoid slop.
1386             char[] newBuf = new char[worstCase];
1387             System.arraycopy(strBuf, 0, newBuf, 0, strBufLen);
1388             strBuf = newBuf;
1389         }
1390     }
1391     // ]NOCPP]
1392 
stateLoop(int state, char c, int pos, @NoLength char[] buf, boolean reconsume, int returnState, int endPos)1393     @SuppressWarnings("unused") private int stateLoop(int state, char c,
1394             int pos, @NoLength char[] buf, boolean reconsume, int returnState,
1395             int endPos) throws SAXException {
1396         /*
1397          * Idioms used in this code:
1398          *
1399          *
1400          * Consuming the next input character
1401          *
1402          * To consume the next input character, the code does this: if (++pos ==
1403          * endPos) { break stateloop; } c = checkChar(buf, pos);
1404          *
1405          *
1406          * Staying in a state
1407          *
1408          * When there's a state that the tokenizer may stay in over multiple
1409          * input characters, the state has a wrapper |for(;;)| loop and staying
1410          * in the state continues the loop.
1411          *
1412          *
1413          * Switching to another state
1414          *
1415          * To switch to another state, the code sets the state variable to the
1416          * magic number of the new state. Then it either continues stateloop or
1417          * breaks out of the state's own wrapper loop if the target state is
1418          * right after the current state in source order. (This is a partial
1419          * workaround for Java's lack of goto.)
1420          *
1421          *
1422          * Reconsume support
1423          *
1424          * The spec sometimes says that an input character is reconsumed in
1425          * another state. If a state can ever be entered so that an input
1426          * character can be reconsumed in it, the state's code starts with an
1427          * |if (reconsume)| that sets reconsume to false and skips over the
1428          * normal code for consuming a new character.
1429          *
1430          * To reconsume the current character in another state, the code sets
1431          * |reconsume| to true and then switches to the other state.
1432          *
1433          *
1434          * Emitting character tokens
1435          *
1436          * This method emits character tokens lazily. Whenever a new range of
1437          * character tokens starts, the field cstart must be set to the start
1438          * index of the range. The flushChars() method must be called at the end
1439          * of a range to flush it.
1440          *
1441          *
1442          * U+0000 handling
1443          *
1444          * The various states have to handle the replacement of U+0000 with
1445          * U+FFFD. However, if U+0000 would be reconsumed in another state, the
1446          * replacement doesn't need to happen, because it's handled by the
1447          * reconsuming state.
1448          *
1449          *
1450          * LF handling
1451          *
1452          * Every state needs to increment the line number upon LF unless the LF
1453          * gets reconsumed by another state which increments the line number.
1454          *
1455          *
1456          * CR handling
1457          *
1458          * Every state needs to handle CR unless the CR gets reconsumed and is
1459          * handled by the reconsuming state. The CR needs to be handled as if it
         * were an LF, the lastCR field must be set to true and then this
1461          * method must return. The IO driver will then swallow the next
1462          * character if it is an LF to coalesce CRLF.
1463          */
1464         stateloop: for (;;) {
1465             switch (state) {
1466                 case DATA:
1467                     dataloop: for (;;) {
1468                         if (reconsume) {
1469                             reconsume = false;
1470                         } else {
1471                             if (++pos == endPos) {
1472                                 break stateloop;
1473                             }
1474                             c = checkChar(buf, pos);
1475                         }
1476                         switch (c) {
1477                             case '&':
1478                                 /*
1479                                  * U+0026 AMPERSAND (&) Switch to the character
1480                                  * reference in data state.
1481                                  */
1482                                 flushChars(buf, pos);
1483                                 assert charRefBufLen == 0: "charRefBufLen not reset after previous use!";
1484                                 appendCharRefBuf(c);
1485                                 setAdditionalAndRememberAmpersandLocation('\u0000');
1486                                 returnState = state;
1487                                 state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos);
1488                                 continue stateloop;
1489                             case '<':
1490                                 /*
1491                                  * U+003C LESS-THAN SIGN (<) Switch to the tag
1492                                  * open state.
1493                                  */
1494                                 flushChars(buf, pos);
1495 
1496                                 state = transition(state, Tokenizer.TAG_OPEN, reconsume, pos);
1497                                 break dataloop; // FALL THROUGH continue
1498                             // stateloop;
1499                             case '\u0000':
1500                                 emitReplacementCharacter(buf, pos);
1501                                 continue;
1502                             case '\r':
1503                                 emitCarriageReturn(buf, pos);
1504                                 break stateloop;
1505                             case '\n':
1506                                 silentLineFeed();
1507                             default:
1508                                 /*
1509                                  * Anything else Emit the input character as a
1510                                  * character token.
1511                                  *
1512                                  * Stay in the data state.
1513                                  */
1514                                 continue;
1515                         }
1516                     }
1517                     // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
1518                 case TAG_OPEN:
1519                     tagopenloop: for (;;) {
1520                         /*
1521                          * The behavior of this state depends on the content
1522                          * model flag.
1523                          */
1524                         if (++pos == endPos) {
1525                             break stateloop;
1526                         }
1527                         c = checkChar(buf, pos);
1528                         /*
1529                          * If the content model flag is set to the PCDATA state
1530                          * Consume the next input character:
1531                          */
1532                         if (c >= 'A' && c <= 'Z') {
1533                             /*
1534                              * U+0041 LATIN CAPITAL LETTER A through to U+005A
1535                              * LATIN CAPITAL LETTER Z Create a new start tag
1536                              * token,
1537                              */
1538                             endTag = false;
1539                             /*
1540                              * set its tag name to the lowercase version of the
1541                              * input character (add 0x0020 to the character's
1542                              * code point),
1543                              */
1544                             clearStrBufBeforeUse();
1545                             appendStrBuf((char) (c + 0x20));
1546                             /* then switch to the tag name state. */
1547                             state = transition(state, Tokenizer.TAG_NAME, reconsume, pos);
1548                             /*
1549                              * (Don't emit the token yet; further details will
1550                              * be filled in before it is emitted.)
1551                              */
1552                             break tagopenloop;
1553                             // continue stateloop;
1554                         } else if (c >= 'a' && c <= 'z') {
1555                             /*
1556                              * U+0061 LATIN SMALL LETTER A through to U+007A
1557                              * LATIN SMALL LETTER Z Create a new start tag
1558                              * token,
1559                              */
1560                             endTag = false;
1561                             /*
1562                              * set its tag name to the input character,
1563                              */
1564                             clearStrBufBeforeUse();
1565                             appendStrBuf(c);
1566                             /* then switch to the tag name state. */
1567                             state = transition(state, Tokenizer.TAG_NAME, reconsume, pos);
1568                             /*
1569                              * (Don't emit the token yet; further details will
1570                              * be filled in before it is emitted.)
1571                              */
1572                             break tagopenloop;
1573                             // continue stateloop;
1574                         }
1575                         switch (c) {
1576                             case '!':
1577                                 /*
1578                                  * U+0021 EXCLAMATION MARK (!) Switch to the
1579                                  * markup declaration open state.
1580                                  */
1581                                 state = transition(state, Tokenizer.MARKUP_DECLARATION_OPEN, reconsume, pos);
1582                                 continue stateloop;
1583                             case '/':
1584                                 /*
1585                                  * U+002F SOLIDUS (/) Switch to the close tag
1586                                  * open state.
1587                                  */
1588                                 state = transition(state, Tokenizer.CLOSE_TAG_OPEN, reconsume, pos);
1589                                 continue stateloop;
1590                             case '?':
1591                                 // CPPONLY: if (viewingXmlSource) {
1592                                 // CPPONLY: state = transition(state,
1593                                 // CPPONLY: Tokenizer.PROCESSING_INSTRUCTION,
1594                                 // CPPONLY: reconsume,
1595                                 // CPPONLY: pos);
1596                                 // CPPONLY: continue stateloop;
1597                                 // CPPONLY: }
1598                                 /*
1599                                  * U+003F QUESTION MARK (?) Parse error.
1600                                  */
1601                                 errProcessingInstruction();
1602                                 /*
1603                                  * Switch to the bogus comment state.
1604                                  */
1605                                 clearStrBufBeforeUse();
1606                                 appendStrBuf(c);
1607                                 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
1608                                 continue stateloop;
1609                             case '>':
1610                                 /*
1611                                  * U+003E GREATER-THAN SIGN (>) Parse error.
1612                                  */
1613                                 errLtGt();
1614                                 /*
1615                                  * Emit a U+003C LESS-THAN SIGN character token
1616                                  * and a U+003E GREATER-THAN SIGN character
1617                                  * token.
1618                                  */
1619                                 tokenHandler.characters(Tokenizer.LT_GT, 0, 2);
1620                                 /* Switch to the data state. */
1621                                 cstart = pos + 1;
1622                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
1623                                 continue stateloop;
1624                             default:
1625                                 /*
1626                                  * Anything else Parse error.
1627                                  */
1628                                 errBadCharAfterLt(c);
1629                                 /*
1630                                  * Emit a U+003C LESS-THAN SIGN character token
1631                                  */
1632                                 tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
1633                                 /*
1634                                  * and reconsume the current input character in
1635                                  * the data state.
1636                                  */
1637                                 cstart = pos;
1638                                 reconsume = true;
1639                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
1640                                 continue stateloop;
1641                         }
1642                     }
1643                     // FALL THROUGH DON'T REORDER
1644                 case TAG_NAME:
1645                     tagnameloop: for (;;) {
1646                         if (++pos == endPos) {
1647                             break stateloop;
1648                         }
1649                         c = checkChar(buf, pos);
1650                         /*
1651                          * Consume the next input character:
1652                          */
1653                         switch (c) {
1654                             case '\r':
1655                                 silentCarriageReturn();
1656                                 strBufToElementNameString();
1657                                 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
1658                                 break stateloop;
1659                             case '\n':
1660                                 silentLineFeed();
1661                             case ' ':
1662                             case '\t':
1663                             case '\u000C':
1664                                 /*
1665                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED
1666                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE
1667                                  * Switch to the before attribute name state.
1668                                  */
1669                                 strBufToElementNameString();
1670                                 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
1671                                 break tagnameloop;
1672                             // continue stateloop;
1673                             case '/':
1674                                 /*
1675                                  * U+002F SOLIDUS (/) Switch to the self-closing
1676                                  * start tag state.
1677                                  */
1678                                 strBufToElementNameString();
1679                                 state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);
1680                                 continue stateloop;
1681                             case '>':
1682                                 /*
1683                                  * U+003E GREATER-THAN SIGN (>) Emit the current
1684                                  * tag token.
1685                                  */
1686                                 strBufToElementNameString();
1687                                 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
1688                                 if (shouldSuspend) {
1689                                     break stateloop;
1690                                 }
1691                                 /*
1692                                  * Switch to the data state.
1693                                  */
1694                                 continue stateloop;
1695                             case '\u0000':
1696                                 c = '\uFFFD';
1697                                 // fall thru
1698                             default:
1699                                 if (c >= 'A' && c <= 'Z') {
1700                                     /*
1701                                      * U+0041 LATIN CAPITAL LETTER A through to
1702                                      * U+005A LATIN CAPITAL LETTER Z Append the
1703                                      * lowercase version of the current input
1704                                      * character (add 0x0020 to the character's
1705                                      * code point) to the current tag token's
1706                                      * tag name.
1707                                      */
1708                                     c += 0x20;
1709                                 }
1710                                 /*
1711                                  * Anything else Append the current input
1712                                  * character to the current tag token's tag
1713                                  * name.
1714                                  */
1715                                 appendStrBuf(c);
1716                                 /*
1717                                  * Stay in the tag name state.
1718                                  */
1719                                 continue;
1720                         }
1721                     }
1722                     // FALLTHRU DON'T REORDER
1723                 case BEFORE_ATTRIBUTE_NAME:
1724                     beforeattributenameloop: for (;;) {
1725                         if (reconsume) {
1726                             reconsume = false;
1727                         } else {
1728                             if (++pos == endPos) {
1729                                 break stateloop;
1730                             }
1731                             c = checkChar(buf, pos);
1732                         }
1733                         /*
1734                          * Consume the next input character:
1735                          */
1736                         switch (c) {
1737                             case '\r':
1738                                 silentCarriageReturn();
1739                                 break stateloop;
1740                             case '\n':
1741                                 silentLineFeed();
1742                                 // fall thru
1743                             case ' ':
1744                             case '\t':
1745                             case '\u000C':
1746                                 /*
1747                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED
1748                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
1749                                  * in the before attribute name state.
1750                                  */
1751                                 continue;
1752                             case '/':
1753                                 /*
1754                                  * U+002F SOLIDUS (/) Switch to the self-closing
1755                                  * start tag state.
1756                                  */
1757                                 state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);
1758                                 continue stateloop;
1759                             case '>':
1760                                 /*
1761                                  * U+003E GREATER-THAN SIGN (>) Emit the current
1762                                  * tag token.
1763                                  */
1764                                 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
1765                                 if (shouldSuspend) {
1766                                     break stateloop;
1767                                 }
1768                                 /*
1769                                  * Switch to the data state.
1770                                  */
1771                                 continue stateloop;
1772                             case '\u0000':
1773                                 c = '\uFFFD';
1774                                 // fall thru
1775                             case '\"':
1776                             case '\'':
1777                             case '<':
1778                             case '=':
1779                                 /*
1780                                  * U+0022 QUOTATION MARK (") U+0027 APOSTROPHE
1781                                  * (') U+003C LESS-THAN SIGN (<) U+003D EQUALS
1782                                  * SIGN (=) Parse error.
1783                                  */
1784                                 errBadCharBeforeAttributeNameOrNull(c);
1785                                 /*
1786                                  * Treat it as per the "anything else" entry
1787                                  * below.
1788                                  */
1789                             default:
1790                                 /*
1791                                  * Anything else Start a new attribute in the
1792                                  * current tag token.
1793                                  */
1794                                 if (c >= 'A' && c <= 'Z') {
1795                                     /*
1796                                      * U+0041 LATIN CAPITAL LETTER A through to
1797                                      * U+005A LATIN CAPITAL LETTER Z Set that
1798                                      * attribute's name to the lowercase version
1799                                      * of the current input character (add
1800                                      * 0x0020 to the character's code point)
1801                                      */
1802                                     c += 0x20;
1803                                 }
1804                                 // CPPONLY: attributeLine = line;
1805                                 /*
1806                                  * Set that attribute's name to the current
1807                                  * input character,
1808                                  */
1809                                 clearStrBufBeforeUse();
1810                                 appendStrBuf(c);
1811                                 /*
1812                                  * and its value to the empty string.
1813                                  */
1814                                 // Will do later.
1815                                 /*
1816                                  * Switch to the attribute name state.
1817                                  */
1818                                 state = transition(state, Tokenizer.ATTRIBUTE_NAME, reconsume, pos);
1819                                 break beforeattributenameloop;
1820                             // continue stateloop;
1821                         }
1822                     }
1823                     // FALLTHRU DON'T REORDER
1824                 case ATTRIBUTE_NAME:
1825                     attributenameloop: for (;;) {
1826                         if (++pos == endPos) {
1827                             break stateloop;
1828                         }
1829                         c = checkChar(buf, pos);
1830                         /*
1831                          * Consume the next input character:
1832                          */
1833                         switch (c) {
1834                             case '\r':
1835                                 silentCarriageReturn();
1836                                 attributeNameComplete();
1837                                 state = transition(state, Tokenizer.AFTER_ATTRIBUTE_NAME, reconsume, pos);
1838                                 break stateloop;
1839                             case '\n':
1840                                 silentLineFeed();
1841                                 // fall thru
1842                             case ' ':
1843                             case '\t':
1844                             case '\u000C':
1845                                 /*
1846                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED
1847                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE
1848                                  * Switch to the after attribute name state.
1849                                  */
1850                                 attributeNameComplete();
1851                                 state = transition(state, Tokenizer.AFTER_ATTRIBUTE_NAME, reconsume, pos);
1852                                 continue stateloop;
1853                             case '/':
1854                                 /*
1855                                  * U+002F SOLIDUS (/) Switch to the self-closing
1856                                  * start tag state.
1857                                  */
1858                                 attributeNameComplete();
1859                                 addAttributeWithoutValue();
1860                                 state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);
1861                                 continue stateloop;
1862                             case '=':
1863                                 /*
1864                                  * U+003D EQUALS SIGN (=) Switch to the before
1865                                  * attribute value state.
1866                                  */
1867                                 attributeNameComplete();
1868                                 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_VALUE, reconsume, pos);
1869                                 break attributenameloop;
1870                             // continue stateloop;
1871                             case '>':
1872                                 /*
1873                                  * U+003E GREATER-THAN SIGN (>) Emit the current
1874                                  * tag token.
1875                                  */
1876                                 attributeNameComplete();
1877                                 addAttributeWithoutValue();
1878                                 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
1879                                 if (shouldSuspend) {
1880                                     break stateloop;
1881                                 }
1882                                 /*
1883                                  * Switch to the data state.
1884                                  */
1885                                 continue stateloop;
1886                             case '\u0000':
1887                                 c = '\uFFFD';
1888                                 // fall thru
1889                             case '\"':
1890                             case '\'':
1891                             case '<':
1892                                 /*
1893                                  * U+0022 QUOTATION MARK (") U+0027 APOSTROPHE
1894                                  * (') U+003C LESS-THAN SIGN (<) Parse error.
1895                                  */
1896                                 errQuoteOrLtInAttributeNameOrNull(c);
1897                                 /*
1898                                  * Treat it as per the "anything else" entry
1899                                  * below.
1900                                  */
1901                             default:
1902                                 if (c >= 'A' && c <= 'Z') {
1903                                     /*
1904                                      * U+0041 LATIN CAPITAL LETTER A through to
1905                                      * U+005A LATIN CAPITAL LETTER Z Append the
1906                                      * lowercase version of the current input
1907                                      * character (add 0x0020 to the character's
1908                                      * code point) to the current attribute's
1909                                      * name.
1910                                      */
1911                                     c += 0x20;
1912                                 }
1913                                 /*
1914                                  * Anything else Append the current input
1915                                  * character to the current attribute's name.
1916                                  */
1917                                 appendStrBuf(c);
1918                                 /*
1919                                  * Stay in the attribute name state.
1920                                  */
1921                                 continue;
1922                         }
1923                     }
1924                     // FALLTHRU DON'T REORDER
1925                 case BEFORE_ATTRIBUTE_VALUE:
1926                     beforeattributevalueloop: for (;;) {
1927                         if (++pos == endPos) {
1928                             break stateloop;
1929                         }
1930                         c = checkChar(buf, pos);
1931                         /*
1932                          * Consume the next input character:
1933                          */
1934                         switch (c) {
1935                             case '\r':
1936                                 silentCarriageReturn();
1937                                 break stateloop;
1938                             case '\n':
1939                                 silentLineFeed();
1940                                 // fall thru
1941                             case ' ':
1942                             case '\t':
1943                             case '\u000C':
1944                                 /*
1945                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED
1946                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
1947                                  * in the before attribute value state.
1948                                  */
1949                                 continue;
1950                             case '"':
1951                                 /*
1952                                  * U+0022 QUOTATION MARK (") Switch to the
1953                                  * attribute value (double-quoted) state.
1954                                  */
1955                                 // CPPONLY: attributeLine = line;
1956                                 clearStrBufBeforeUse();
1957                                 state = transition(state, Tokenizer.ATTRIBUTE_VALUE_DOUBLE_QUOTED, reconsume, pos);
1958                                 break beforeattributevalueloop;
1959                             // continue stateloop;
1960                             case '&':
1961                                 /*
1962                                  * U+0026 AMPERSAND (&) Switch to the attribute
1963                                  * value (unquoted) state and reconsume this
1964                                  * input character.
1965                                  */
1966                                 // CPPONLY: attributeLine = line;
1967                                 clearStrBufBeforeUse();
1968                                 reconsume = true;
1969                                 state = transition(state, Tokenizer.ATTRIBUTE_VALUE_UNQUOTED, reconsume, pos);
1970                                 noteUnquotedAttributeValue();
1971                                 continue stateloop;
1972                             case '\'':
1973                                 /*
1974                                  * U+0027 APOSTROPHE (') Switch to the attribute
1975                                  * value (single-quoted) state.
1976                                  */
1977                                 // CPPONLY: attributeLine = line;
1978                                 clearStrBufBeforeUse();
1979                                 state = transition(state, Tokenizer.ATTRIBUTE_VALUE_SINGLE_QUOTED, reconsume, pos);
1980                                 continue stateloop;
1981                             case '>':
1982                                 /*
1983                                  * U+003E GREATER-THAN SIGN (>) Parse error.
1984                                  */
1985                                 errAttributeValueMissing();
1986                                 /*
1987                                  * Emit the current tag token.
1988                                  */
1989                                 addAttributeWithoutValue();
1990                                 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
1991                                 if (shouldSuspend) {
1992                                     break stateloop;
1993                                 }
1994                                 /*
1995                                  * Switch to the data state.
1996                                  */
1997                                 continue stateloop;
1998                             case '\u0000':
1999                                 c = '\uFFFD';
2000                                 // fall thru
2001                             case '<':
2002                             case '=':
2003                             case '`':
2004                                 /*
2005                                  * U+003C LESS-THAN SIGN (<) U+003D EQUALS SIGN
2006                                  * (=) U+0060 GRAVE ACCENT (`)
2007                                  */
2008                                 errLtOrEqualsOrGraveInUnquotedAttributeOrNull(c);
2009                                 /*
2010                                  * Treat it as per the "anything else" entry
2011                                  * below.
2012                                  */
2013                             default:
2014                                 // [NOCPP[
2015                                 errHtml4NonNameInUnquotedAttribute(c);
2016                                 // ]NOCPP]
2017                                 /*
2018                                  * Anything else Append the current input
2019                                  * character to the current attribute's value.
2020                                  */
2021                                 // CPPONLY: attributeLine = line;
2022                                 clearStrBufBeforeUse();
2023                                 appendStrBuf(c);
2024                                 /*
2025                                  * Switch to the attribute value (unquoted)
2026                                  * state.
2027                                  */
2028 
2029                                 state = transition(state, Tokenizer.ATTRIBUTE_VALUE_UNQUOTED, reconsume, pos);
2030                                 noteUnquotedAttributeValue();
2031                                 continue stateloop;
2032                         }
2033                     }
2034                     // FALLTHRU DON'T REORDER
2035                 case ATTRIBUTE_VALUE_DOUBLE_QUOTED:
2036                     attributevaluedoublequotedloop: for (;;) {
2037                         if (reconsume) {
2038                             reconsume = false;
2039                         } else {
2040                             if (++pos == endPos) {
2041                                 break stateloop;
2042                             }
2043                             c = checkChar(buf, pos);
2044                         }
2045                         /*
2046                          * Consume the next input character:
2047                          */
2048                         switch (c) {
2049                             case '"':
2050                                 /*
2051                                  * U+0022 QUOTATION MARK (") Switch to the after
2052                                  * attribute value (quoted) state.
2053                                  */
2054                                 addAttributeWithValue();
2055 
2056                                 state = transition(state, Tokenizer.AFTER_ATTRIBUTE_VALUE_QUOTED, reconsume, pos);
2057                                 break attributevaluedoublequotedloop;
2058                             // continue stateloop;
2059                             case '&':
2060                                 /*
2061                                  * U+0026 AMPERSAND (&) Switch to the character
2062                                  * reference in attribute value state, with the
2063                                  * additional allowed character being U+0022
2064                                  * QUOTATION MARK (").
2065                                  */
2066                                 assert charRefBufLen == 0: "charRefBufLen not reset after previous use!";
2067                                 appendCharRefBuf(c);
2068                                 setAdditionalAndRememberAmpersandLocation('\"');
2069                                 returnState = state;
2070                                 state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos);
2071                                 continue stateloop;
2072                             case '\r':
2073                                 appendStrBufCarriageReturn();
2074                                 break stateloop;
2075                             case '\n':
2076                                 appendStrBufLineFeed();
2077                                 continue;
2078                             case '\u0000':
2079                                 c = '\uFFFD';
2080                                 // fall thru
2081                             default:
2082                                 /*
2083                                  * Anything else Append the current input
2084                                  * character to the current attribute's value.
2085                                  */
2086                                 appendStrBuf(c);
2087                                 /*
2088                                  * Stay in the attribute value (double-quoted)
2089                                  * state.
2090                                  */
2091                                 continue;
2092                         }
2093                     }
2094                     // FALLTHRU DON'T REORDER
2095                 case AFTER_ATTRIBUTE_VALUE_QUOTED:
2096                     afterattributevaluequotedloop: for (;;) {
2097                         if (++pos == endPos) {
2098                             break stateloop;
2099                         }
2100                         c = checkChar(buf, pos);
2101                         /*
2102                          * Consume the next input character:
2103                          */
2104                         switch (c) {
2105                             case '\r':
2106                                 silentCarriageReturn();
2107                                 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
2108                                 break stateloop;
2109                             case '\n':
2110                                 silentLineFeed();
2111                                 // fall thru
2112                             case ' ':
2113                             case '\t':
2114                             case '\u000C':
2115                                 /*
2116                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED
2117                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE
2118                                  * Switch to the before attribute name state.
2119                                  */
2120                                 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
2121                                 continue stateloop;
2122                             case '/':
2123                                 /*
2124                                  * U+002F SOLIDUS (/) Switch to the self-closing
2125                                  * start tag state.
2126                                  */
2127                                 state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);
2128                                 break afterattributevaluequotedloop;
2129                             // continue stateloop;
2130                             case '>':
2131                                 /*
2132                                  * U+003E GREATER-THAN SIGN (>) Emit the current
2133                                  * tag token.
2134                                  */
2135                                 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
2136                                 if (shouldSuspend) {
2137                                     break stateloop;
2138                                 }
2139                                 /*
2140                                  * Switch to the data state.
2141                                  */
2142                                 continue stateloop;
2143                             default:
2144                                 /*
2145                                  * Anything else Parse error.
2146                                  */
2147                                 errNoSpaceBetweenAttributes();
2148                                 /*
2149                                  * Reconsume the character in the before
2150                                  * attribute name state.
2151                                  */
2152                                 reconsume = true;
2153                                 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
2154                                 continue stateloop;
2155                         }
2156                     }
2157                     // FALLTHRU DON'T REORDER
2158                 case SELF_CLOSING_START_TAG:
2159                     if (++pos == endPos) {
2160                         break stateloop;
2161                     }
2162                     c = checkChar(buf, pos);
2163                     /*
2164                      * Consume the next input character:
2165                      */
2166                     switch (c) {
2167                         case '>':
2168                             /*
2169                              * U+003E GREATER-THAN SIGN (>) Set the self-closing
2170                              * flag of the current tag token. Emit the current
2171                              * tag token.
2172                              */
2173                             // [NOCPP[
2174                             errHtml4XmlVoidSyntax();
2175                             // ]NOCPP]
2176                             state = transition(state, emitCurrentTagToken(true, pos), reconsume, pos);
2177                             if (shouldSuspend) {
2178                                 break stateloop;
2179                             }
2180                             /*
2181                              * Switch to the data state.
2182                              */
2183                             continue stateloop;
2184                         default:
2185                             /* Anything else Parse error. */
2186                             errSlashNotFollowedByGt();
2187                             /*
2188                              * Reconsume the character in the before attribute
2189                              * name state.
2190                              */
2191                             reconsume = true;
2192                             state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
2193                             continue stateloop;
2194                     }
2195                     // XXX reorder point
2196                 case ATTRIBUTE_VALUE_UNQUOTED:
2197                     for (;;) {
2198                         if (reconsume) {
2199                             reconsume = false;
2200                         } else {
2201                             if (++pos == endPos) {
2202                                 break stateloop;
2203                             }
2204                             c = checkChar(buf, pos);
2205                         }
2206                         /*
2207                          * Consume the next input character:
2208                          */
2209                         switch (c) {
2210                             case '\r':
2211                                 silentCarriageReturn();
2212                                 addAttributeWithValue();
2213                                 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
2214                                 break stateloop;
2215                             case '\n':
2216                                 silentLineFeed();
2217                                 // fall thru
2218                             case ' ':
2219                             case '\t':
2220                             case '\u000C':
2221                                 /*
2222                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED
2223                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE
2224                                  * Switch to the before attribute name state.
2225                                  */
2226                                 addAttributeWithValue();
2227                                 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
2228                                 continue stateloop;
2229                             case '&':
2230                                 /*
2231                                  * U+0026 AMPERSAND (&) Switch to the character
2232                                  * reference in attribute value state, with the
2233                                  * additional allowed character being U+003E
2234                                  * GREATER-THAN SIGN (>)
2235                                  */
2236                                 assert charRefBufLen == 0: "charRefBufLen not reset after previous use!";
2237                                 appendCharRefBuf(c);
2238                                 setAdditionalAndRememberAmpersandLocation('>');
2239                                 returnState = state;
2240                                 state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos);
2241                                 continue stateloop;
2242                             case '>':
2243                                 /*
2244                                  * U+003E GREATER-THAN SIGN (>) Emit the current
2245                                  * tag token.
2246                                  */
2247                                 addAttributeWithValue();
2248                                 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
2249                                 if (shouldSuspend) {
2250                                     break stateloop;
2251                                 }
2252                                 /*
2253                                  * Switch to the data state.
2254                                  */
2255                                 continue stateloop;
2256                             case '\u0000':
2257                                 c = '\uFFFD';
2258                                 // fall thru
2259                             case '<':
2260                             case '\"':
2261                             case '\'':
2262                             case '=':
2263                             case '`':
2264                                 /*
2265                                  * U+0022 QUOTATION MARK (") U+0027 APOSTROPHE
2266                                  * (') U+003C LESS-THAN SIGN (<) U+003D EQUALS
2267                                  * SIGN (=) U+0060 GRAVE ACCENT (`) Parse error.
2268                                  */
2269                                 errUnquotedAttributeValOrNull(c);
2270                                 /*
2271                                  * Treat it as per the "anything else" entry
2272                                  * below.
2273                                  */
2274                                 // fall through
2275                             default:
                                // [NOCPP[
2277                                 errHtml4NonNameInUnquotedAttribute(c);
2278                                 // ]NOCPP]
2279                                 /*
2280                                  * Anything else Append the current input
2281                                  * character to the current attribute's value.
2282                                  */
2283                                 appendStrBuf(c);
2284                                 /*
2285                                  * Stay in the attribute value (unquoted) state.
2286                                  */
2287                                 continue;
2288                         }
2289                     }
2290                     // XXX reorder point
2291                 case AFTER_ATTRIBUTE_NAME:
2292                     for (;;) {
2293                         if (++pos == endPos) {
2294                             break stateloop;
2295                         }
2296                         c = checkChar(buf, pos);
2297                         /*
2298                          * Consume the next input character:
2299                          */
2300                         switch (c) {
2301                             case '\r':
2302                                 silentCarriageReturn();
2303                                 break stateloop;
2304                             case '\n':
2305                                 silentLineFeed();
2306                                 // fall thru
2307                             case ' ':
2308                             case '\t':
2309                             case '\u000C':
2310                                 /*
2311                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED
2312                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
2313                                  * in the after attribute name state.
2314                                  */
2315                                 continue;
2316                             case '/':
2317                                 /*
2318                                  * U+002F SOLIDUS (/) Switch to the self-closing
2319                                  * start tag state.
2320                                  */
2321                                 addAttributeWithoutValue();
2322                                 state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);
2323                                 continue stateloop;
2324                             case '=':
2325                                 /*
2326                                  * U+003D EQUALS SIGN (=) Switch to the before
2327                                  * attribute value state.
2328                                  */
2329                                 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_VALUE, reconsume, pos);
2330                                 continue stateloop;
2331                             case '>':
2332                                 /*
2333                                  * U+003E GREATER-THAN SIGN (>) Emit the current
2334                                  * tag token.
2335                                  */
2336                                 addAttributeWithoutValue();
2337                                 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
2338                                 if (shouldSuspend) {
2339                                     break stateloop;
2340                                 }
2341                                 /*
2342                                  * Switch to the data state.
2343                                  */
2344                                 continue stateloop;
2345                             case '\u0000':
2346                                 c = '\uFFFD';
2347                                 // fall thru
2348                             case '\"':
2349                             case '\'':
2350                             case '<':
2351                                 errQuoteOrLtInAttributeNameOrNull(c);
2352                                 /*
2353                                  * Treat it as per the "anything else" entry
2354                                  * below.
2355                                  */
2356                             default:
2357                                 addAttributeWithoutValue();
2358                                 /*
2359                                  * Anything else Start a new attribute in the
2360                                  * current tag token.
2361                                  */
2362                                 if (c >= 'A' && c <= 'Z') {
2363                                     /*
2364                                      * U+0041 LATIN CAPITAL LETTER A through to
2365                                      * U+005A LATIN CAPITAL LETTER Z Set that
2366                                      * attribute's name to the lowercase version
2367                                      * of the current input character (add
2368                                      * 0x0020 to the character's code point)
2369                                      */
2370                                     c += 0x20;
2371                                 }
2372                                 /*
2373                                  * Set that attribute's name to the current
2374                                  * input character,
2375                                  */
2376                                 clearStrBufBeforeUse();
2377                                 appendStrBuf(c);
2378                                 /*
2379                                  * and its value to the empty string.
2380                                  */
2381                                 // Will do later.
2382                                 /*
2383                                  * Switch to the attribute name state.
2384                                  */
2385                                 state = transition(state, Tokenizer.ATTRIBUTE_NAME, reconsume, pos);
2386                                 continue stateloop;
2387                         }
2388                     }
2389                     // XXX reorder point
2390                 case MARKUP_DECLARATION_OPEN:
2391                     markupdeclarationopenloop: for (;;) {
2392                         if (++pos == endPos) {
2393                             break stateloop;
2394                         }
2395                         c = checkChar(buf, pos);
2396                         /*
2397                          * If the next two characters are both U+002D
2398                          * HYPHEN-MINUS characters (-), consume those two
2399                          * characters, create a comment token whose data is the
2400                          * empty string, and switch to the comment start state.
2401                          *
2402                          * Otherwise, if the next seven characters are an ASCII
2403                          * case-insensitive match for the word "DOCTYPE", then
2404                          * consume those characters and switch to the DOCTYPE
2405                          * state.
2406                          *
2407                          * Otherwise, if the insertion mode is
2408                          * "in foreign content" and the current node is not an
2409                          * element in the HTML namespace and the next seven
                         * characters are a case-sensitive match for the string
2411                          * "[CDATA[" (the five uppercase letters "CDATA" with a
2412                          * U+005B LEFT SQUARE BRACKET character before and
2413                          * after), then consume those characters and switch to
2414                          * the CDATA section state.
2415                          *
                         * Otherwise, it is a parse error. Switch to the bogus
2417                          * comment state. The next character that is consumed,
2418                          * if any, is the first character that will be in the
2419                          * comment.
2420                          */
2421                         switch (c) {
2422                             case '-':
2423                                 clearStrBufBeforeUse();
2424                                 appendStrBuf(c);
2425                                 state = transition(state, Tokenizer.MARKUP_DECLARATION_HYPHEN, reconsume, pos);
2426                                 break markupdeclarationopenloop;
2427                             // continue stateloop;
2428                             case 'd':
2429                             case 'D':
2430                                 clearStrBufBeforeUse();
2431                                 appendStrBuf(c);
2432                                 index = 0;
2433                                 state = transition(state, Tokenizer.MARKUP_DECLARATION_OCTYPE, reconsume, pos);
2434                                 continue stateloop;
2435                             case '[':
2436                                 if (tokenHandler.cdataSectionAllowed()) {
2437                                     clearStrBufBeforeUse();
2438                                     appendStrBuf(c);
2439                                     index = 0;
2440                                     state = transition(state, Tokenizer.CDATA_START, reconsume, pos);
2441                                     continue stateloop;
2442                                 }
2443                                 // else fall through
2444                             default:
2445                                 errBogusComment();
2446                                 clearStrBufBeforeUse();
2447                                 reconsume = true;
2448                                 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
2449                                 continue stateloop;
2450                         }
2451                     }
2452                     // FALLTHRU DON'T REORDER
2453                 case MARKUP_DECLARATION_HYPHEN:
2454                     markupdeclarationhyphenloop: for (;;) {
2455                         if (++pos == endPos) {
2456                             break stateloop;
2457                         }
2458                         c = checkChar(buf, pos);
2459                         switch (c) {
2460                             case '\u0000':
2461                                 break stateloop;
2462                             case '-':
2463                                 clearStrBufAfterOneHyphen();
2464                                 state = transition(state, Tokenizer.COMMENT_START, reconsume, pos);
2465                                 break markupdeclarationhyphenloop;
2466                             // continue stateloop;
2467                             default:
2468                                 errBogusComment();
2469                                 reconsume = true;
2470                                 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
2471                                 continue stateloop;
2472                         }
2473                     }
2474                     // FALLTHRU DON'T REORDER
2475                 case COMMENT_START:
2476                     commentstartloop: for (;;) {
2477                         if (++pos == endPos) {
2478                             break stateloop;
2479                         }
2480                         c = checkChar(buf, pos);
2481                         /*
2482                          * Comment start state
2483                          *
2484                          *
2485                          * Consume the next input character:
2486                          */
2487                         switch (c) {
2488                             case '-':
2489                                 /*
2490                                  * U+002D HYPHEN-MINUS (-) Switch to the comment
2491                                  * start dash state.
2492                                  */
2493                                 appendStrBuf(c);
2494                                 state = transition(state, Tokenizer.COMMENT_START_DASH, reconsume, pos);
2495                                 continue stateloop;
2496                             case '>':
2497                                 /*
2498                                  * U+003E GREATER-THAN SIGN (>) Parse error.
2499                                  */
2500                                 errPrematureEndOfComment();
2501                                 /* Emit the comment token. */
2502                                 emitComment(0, pos);
2503                                 /*
2504                                  * Switch to the data state.
2505                                  */
2506                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
2507                                 continue stateloop;
2508                             case '\r':
2509                                 appendStrBufCarriageReturn();
2510                                 state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2511                                 break stateloop;
2512                             case '\n':
2513                                 appendStrBufLineFeed();
2514                                 state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2515                                 break commentstartloop;
2516                             case '\u0000':
2517                                 c = '\uFFFD';
2518                                 // fall thru
2519                             default:
2520                                 /*
2521                                  * Anything else Append the input character to
2522                                  * the comment token's data.
2523                                  */
2524                                 appendStrBuf(c);
2525                                 /*
2526                                  * Switch to the comment state.
2527                                  */
2528                                 state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2529                                 break commentstartloop;
2530                             // continue stateloop;
2531                         }
2532                     }
2533                     // FALLTHRU DON'T REORDER
2534                 case COMMENT:
2535                     commentloop: for (;;) {
2536                         if (++pos == endPos) {
2537                             break stateloop;
2538                         }
2539                         c = checkChar(buf, pos);
2540                         /*
2541                          * Comment state Consume the next input character:
2542                          */
2543                         switch (c) {
2544                             case '-':
2545                                 /*
2546                                  * U+002D HYPHEN-MINUS (-) Switch to the comment
2547                                  * end dash state
2548                                  */
2549                                 appendStrBuf(c);
2550                                 state = transition(state, Tokenizer.COMMENT_END_DASH, reconsume, pos);
2551                                 break commentloop;
2552                             // continue stateloop;
2553                             case '\r':
2554                                 appendStrBufCarriageReturn();
2555                                 break stateloop;
2556                             case '\n':
2557                                 appendStrBufLineFeed();
2558                                 continue;
2559                             case '\u0000':
2560                                 c = '\uFFFD';
2561                                 // fall thru
2562                             default:
2563                                 /*
2564                                  * Anything else Append the input character to
2565                                  * the comment token's data.
2566                                  */
2567                                 appendStrBuf(c);
2568                                 /*
2569                                  * Stay in the comment state.
2570                                  */
2571                                 continue;
2572                         }
2573                     }
2574                     // FALLTHRU DON'T REORDER
2575                 case COMMENT_END_DASH:
2576                     commentenddashloop: for (;;) {
2577                         if (++pos == endPos) {
2578                             break stateloop;
2579                         }
2580                         c = checkChar(buf, pos);
2581                         /*
2582                          * Comment end dash state Consume the next input
2583                          * character:
2584                          */
2585                         switch (c) {
2586                             case '-':
2587                                 /*
2588                                  * U+002D HYPHEN-MINUS (-) Switch to the comment
2589                                  * end state
2590                                  */
2591                                 appendStrBuf(c);
2592                                 state = transition(state, Tokenizer.COMMENT_END, reconsume, pos);
2593                                 break commentenddashloop;
2594                             // continue stateloop;
2595                             case '\r':
2596                                 appendStrBufCarriageReturn();
2597                                 state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2598                                 break stateloop;
2599                             case '\n':
2600                                 appendStrBufLineFeed();
2601                                 state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2602                                 continue stateloop;
2603                             case '\u0000':
2604                                 c = '\uFFFD';
2605                                 // fall thru
2606                             default:
2607                                 /*
2608                                  * Anything else Append a U+002D HYPHEN-MINUS
2609                                  * (-) character and the input character to the
2610                                  * comment token's data.
2611                                  */
2612                                 appendStrBuf(c);
2613                                 /*
2614                                  * Switch to the comment state.
2615                                  */
2616                                 state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2617                                 continue stateloop;
2618                         }
2619                     }
2620                     // FALLTHRU DON'T REORDER
2621                 case COMMENT_END:
2622                     commentendloop: for (;;) {
2623                         if (++pos == endPos) {
2624                             break stateloop;
2625                         }
2626                         c = checkChar(buf, pos);
2627                         /*
                         * Comment end state Consume the next input
2629                          * character:
2630                          */
2631                         switch (c) {
2632                             case '>':
2633                                 /*
2634                                  * U+003E GREATER-THAN SIGN (>) Emit the comment
2635                                  * token.
2636                                  */
2637                                 emitComment(2, pos);
2638                                 /*
2639                                  * Switch to the data state.
2640                                  */
2641                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
2642                                 continue stateloop;
2643                             case '-':
2644                                 /* U+002D HYPHEN-MINUS (-) Parse error. */
2645                                 /*
2646                                  * Append a U+002D HYPHEN-MINUS (-) character to
2647                                  * the comment token's data.
2648                                  */
2649                                 adjustDoubleHyphenAndAppendToStrBufAndErr(c);
2650                                 /*
2651                                  * Stay in the comment end state.
2652                                  */
2653                                 continue;
2654                             case '\r':
2655                                 adjustDoubleHyphenAndAppendToStrBufCarriageReturn();
2656                                 state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2657                                 break stateloop;
2658                             case '\n':
2659                                 adjustDoubleHyphenAndAppendToStrBufLineFeed();
2660                                 state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2661                                 continue stateloop;
2662                             case '!':
2663                                 errHyphenHyphenBang();
2664                                 appendStrBuf(c);
2665                                 state = transition(state, Tokenizer.COMMENT_END_BANG, reconsume, pos);
2666                                 continue stateloop;
2667                             case '\u0000':
2668                                 c = '\uFFFD';
2669                                 // fall thru
2670                             default:
2671                                 /*
2672                                  * Append two U+002D HYPHEN-MINUS (-) characters
2673                                  * and the input character to the comment
2674                                  * token's data.
2675                                  */
2676                                 adjustDoubleHyphenAndAppendToStrBufAndErr(c);
2677                                 /*
2678                                  * Switch to the comment state.
2679                                  */
2680                                 state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2681                                 continue stateloop;
2682                         }
2683                     }
2684                     // XXX reorder point
2685                 case COMMENT_END_BANG:
2686                     for (;;) {
2687                         if (++pos == endPos) {
2688                             break stateloop;
2689                         }
2690                         c = checkChar(buf, pos);
2691                         /*
2692                          * Comment end bang state
2693                          *
2694                          * Consume the next input character:
2695                          */
2696                         switch (c) {
2697                             case '>':
2698                                 /*
2699                                  * U+003E GREATER-THAN SIGN (>) Emit the comment
2700                                  * token.
2701                                  */
2702                                 emitComment(3, pos);
2703                                 /*
2704                                  * Switch to the data state.
2705                                  */
2706                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
2707                                 continue stateloop;
2708                             case '-':
2709                                 /*
2710                                  * Append two U+002D HYPHEN-MINUS (-) characters
2711                                  * and a U+0021 EXCLAMATION MARK (!) character
2712                                  * to the comment token's data.
2713                                  */
2714                                 appendStrBuf(c);
2715                                 /*
2716                                  * Switch to the comment end dash state.
2717                                  */
2718                                 state = transition(state, Tokenizer.COMMENT_END_DASH, reconsume, pos);
2719                                 continue stateloop;
2720                             case '\r':
2721                                 appendStrBufCarriageReturn();
2722                                 break stateloop;
2723                             case '\n':
2724                                 appendStrBufLineFeed();
2725                                 continue;
2726                             case '\u0000':
2727                                 c = '\uFFFD';
2728                                 // fall thru
2729                             default:
2730                                 /*
2731                                  * Anything else Append two U+002D HYPHEN-MINUS
2732                                  * (-) characters, a U+0021 EXCLAMATION MARK (!)
2733                                  * character, and the input character to the
2734                                  * comment token's data. Switch to the comment
2735                                  * state.
2736                                  */
2737                                 appendStrBuf(c);
2738                                 /*
2739                                  * Switch to the comment state.
2740                                  */
2741                                 state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2742                                 continue stateloop;
2743                         }
2744                     }
2745                     // XXX reorder point
2746                 case COMMENT_START_DASH:
2747                     if (++pos == endPos) {
2748                         break stateloop;
2749                     }
2750                     c = checkChar(buf, pos);
2751                     /*
2752                      * Comment start dash state
2753                      *
2754                      * Consume the next input character:
2755                      */
2756                     switch (c) {
2757                         case '-':
2758                             /*
2759                              * U+002D HYPHEN-MINUS (-) Switch to the comment end
2760                              * state
2761                              */
2762                             appendStrBuf(c);
2763                             state = transition(state, Tokenizer.COMMENT_END, reconsume, pos);
2764                             continue stateloop;
2765                         case '>':
2766                             errPrematureEndOfComment();
2767                             /* Emit the comment token. */
2768                             emitComment(1, pos);
2769                             /*
2770                              * Switch to the data state.
2771                              */
2772                             state = transition(state, Tokenizer.DATA, reconsume, pos);
2773                             continue stateloop;
2774                         case '\r':
2775                             appendStrBufCarriageReturn();
2776                             state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2777                             break stateloop;
2778                         case '\n':
2779                             appendStrBufLineFeed();
2780                             state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2781                             continue stateloop;
2782                         case '\u0000':
2783                             c = '\uFFFD';
2784                             // fall thru
2785                         default:
2786                             /*
2787                              * Append a U+002D HYPHEN-MINUS character (-) and
2788                              * the current input character to the comment
2789                              * token's data.
2790                              */
2791                             appendStrBuf(c);
2792                             /*
2793                              * Switch to the comment state.
2794                              */
2795                             state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2796                             continue stateloop;
2797                     }
2798                     // XXX reorder point
2799                 case CDATA_START:
2800                     for (;;) {
2801                         if (++pos == endPos) {
2802                             break stateloop;
2803                         }
2804                         c = checkChar(buf, pos);
2805                         if (index < 6) { // CDATA_LSQB.length
2806                             if (c == Tokenizer.CDATA_LSQB[index]) {
2807                                 appendStrBuf(c);
2808                             } else {
2809                                 errBogusComment();
2810                                 reconsume = true;
2811                                 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
2812                                 continue stateloop;
2813                             }
2814                             index++;
2815                             continue;
2816                         } else {
2817                             clearStrBufAfterUse();
2818                             cstart = pos; // start coalescing
2819                             reconsume = true;
2820                             state = transition(state, Tokenizer.CDATA_SECTION, reconsume, pos);
2821                             break; // FALL THROUGH continue stateloop;
2822                         }
2823                     }
2824                     // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
2825                 case CDATA_SECTION:
2826                     cdatasectionloop: for (;;) {
2827                         if (reconsume) {
2828                             reconsume = false;
2829                         } else {
2830                             if (++pos == endPos) {
2831                                 break stateloop;
2832                             }
2833                             c = checkChar(buf, pos);
2834                         }
2835                         switch (c) {
2836                             case ']':
2837                                 flushChars(buf, pos);
2838                                 state = transition(state, Tokenizer.CDATA_RSQB, reconsume, pos);
2839                                 break cdatasectionloop; // FALL THROUGH
2840                             case '\u0000':
2841                                 emitReplacementCharacter(buf, pos);
2842                                 continue;
2843                             case '\r':
2844                                 emitCarriageReturn(buf, pos);
2845                                 break stateloop;
2846                             case '\n':
2847                                 silentLineFeed();
2848                                 // fall thru
2849                             default:
2850                                 continue;
2851                         }
2852                     }
2853                     // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
2854                 case CDATA_RSQB:
2855                     cdatarsqb: for (;;) {
2856                         if (++pos == endPos) {
2857                             break stateloop;
2858                         }
2859                         c = checkChar(buf, pos);
2860                         switch (c) {
2861                             case ']':
2862                                 state = transition(state, Tokenizer.CDATA_RSQB_RSQB, reconsume, pos);
2863                                 break cdatarsqb;
2864                             default:
2865                                 tokenHandler.characters(Tokenizer.RSQB_RSQB, 0,
2866                                         1);
2867                                 cstart = pos;
2868                                 reconsume = true;
2869                                 state = transition(state, Tokenizer.CDATA_SECTION, reconsume, pos);
2870                                 continue stateloop;
2871                         }
2872                     }
2873                     // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
2874                 case CDATA_RSQB_RSQB:
2875                     cdatarsqbrsqb: for (;;) {
2876                         if (++pos == endPos) {
2877                             break stateloop;
2878                         }
2879                         c = checkChar(buf, pos);
2880                         switch (c) {
2881                             case ']':
2882                                 // Saw a third ]. Emit one ] (logically the
2883                                 // first one) and stay in this state to
2884                                 // remember that the last two characters seen
2885                                 // have been ]].
2886                                 tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 1);
2887                                 continue;
2888                             case '>':
2889                                 cstart = pos + 1;
2890                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
2891                                 continue stateloop;
2892                             default:
2893                                 tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 2);
2894                                 cstart = pos;
2895                                 reconsume = true;
2896                                 state = transition(state, Tokenizer.CDATA_SECTION, reconsume, pos);
2897                                 continue stateloop;
2898                         }
2899                     }
2900                     // XXX reorder point
2901                 case ATTRIBUTE_VALUE_SINGLE_QUOTED:
2902                     attributevaluesinglequotedloop: for (;;) {
2903                         if (reconsume) {
2904                             reconsume = false;
2905                         } else {
2906                             if (++pos == endPos) {
2907                                 break stateloop;
2908                             }
2909                             c = checkChar(buf, pos);
2910                         }
2911                         /*
2912                          * Consume the next input character:
2913                          */
2914                         switch (c) {
2915                             case '\'':
2916                                 /*
2917                                  * U+0027 APOSTROPHE (') Switch to the after
2918                                  * attribute value (quoted) state.
2919                                  */
2920                                 addAttributeWithValue();
2921 
2922                                 state = transition(state, Tokenizer.AFTER_ATTRIBUTE_VALUE_QUOTED, reconsume, pos);
2923                                 continue stateloop;
2924                             case '&':
2925                                 /*
2926                                  * U+0026 AMPERSAND (&) Switch to the character
2927                                  * reference in attribute value state, with the
2928                                  * additional allowed character being U+0027
2929                                  * APOSTROPHE (').
2930                                  */
2931                                 assert charRefBufLen == 0: "charRefBufLen not reset after previous use!";
2932                                 appendCharRefBuf(c);
2933                                 setAdditionalAndRememberAmpersandLocation('\'');
2934                                 returnState = state;
2935                                 state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos);
2936                                 break attributevaluesinglequotedloop;
2937                             // continue stateloop;
2938                             case '\r':
2939                                 appendStrBufCarriageReturn();
2940                                 break stateloop;
2941                             case '\n':
2942                                 appendStrBufLineFeed();
2943                                 continue;
2944                             case '\u0000':
2945                                 c = '\uFFFD';
2946                                 // fall thru
2947                             default:
2948                                 /*
2949                                  * Anything else Append the current input
2950                                  * character to the current attribute's value.
2951                                  */
2952                                 appendStrBuf(c);
2953                                 /*
2954                                  * Stay in the attribute value (single-quoted)
2955                                  * state.
2956                                  */
2957                                 continue;
2958                         }
2959                     }
2960                     // FALLTHRU DON'T REORDER
2961                 case CONSUME_CHARACTER_REFERENCE:
2962                     if (++pos == endPos) {
2963                         break stateloop;
2964                     }
2965                     c = checkChar(buf, pos);
2966                     if (c == '\u0000') {
2967                         break stateloop;
2968                     }
2969                     /*
2970                      * Unlike the definition in the spec, this state does not
2971                      * return a value and never requires the caller to
2972                      * backtrack. This state takes care of emitting characters
2973                      * or appending to the current attribute value. It also
2974                      * takes care of that in the case when consuming the
2975                      * character reference fails.
2976                      */
2977                     /*
2978                      * This section defines how to consume a character
2979                      * reference. This definition is used when parsing character
2980                      * references in text and in attributes.
2981                      *
2982                      * The behavior depends on the identity of the next
2983                      * character (the one immediately after the U+0026 AMPERSAND
2984                      * character):
2985                      */
2986                     switch (c) {
2987                         case ' ':
2988                         case '\t':
2989                         case '\n':
2990                         case '\r': // we'll reconsume!
2991                         case '\u000C':
2992                         case '<':
2993                         case '&':
2994                             emitOrAppendCharRefBuf(returnState);
2995                             if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
2996                                 cstart = pos;
2997                             }
2998                             reconsume = true;
2999                             state = transition(state, returnState, reconsume, pos);
3000                             continue stateloop;
3001                         case '#':
3002                             /*
3003                              * U+0023 NUMBER SIGN (#) Consume the U+0023 NUMBER
3004                              * SIGN.
3005                              */
3006                             appendCharRefBuf('#');
3007                             state = transition(state, Tokenizer.CONSUME_NCR, reconsume, pos);
3008                             continue stateloop;
3009                         default:
3010                             if (c == additional) {
3011                                 emitOrAppendCharRefBuf(returnState);
3012                                 reconsume = true;
3013                                 state = transition(state, returnState, reconsume, pos);
3014                                 continue stateloop;
3015                             }
3016                             if (c >= 'a' && c <= 'z') {
3017                                 firstCharKey = c - 'a' + 26;
3018                             } else if (c >= 'A' && c <= 'Z') {
3019                                 firstCharKey = c - 'A';
3020                             } else {
3021                                 // No match
3022                                 /*
3023                                  * If no match can be made, then this is a parse
3024                                  * error.
3025                                  */
3026                                 errNoNamedCharacterMatch();
3027                                 emitOrAppendCharRefBuf(returnState);
3028                                 if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
3029                                     cstart = pos;
3030                                 }
3031                                 reconsume = true;
3032                                 state = transition(state, returnState, reconsume, pos);
3033                                 continue stateloop;
3034                             }
3035                             // Didn't fail yet
3036                             appendCharRefBuf(c);
3037                             state = transition(state, Tokenizer.CHARACTER_REFERENCE_HILO_LOOKUP, reconsume, pos);
3038                             // FALL THROUGH continue stateloop;
3039                     }
3040                     // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
3041                 case CHARACTER_REFERENCE_HILO_LOOKUP:
3042                     {
3043                         if (++pos == endPos) {
3044                             break stateloop;
3045                         }
3046                         c = checkChar(buf, pos);
3047                         if (c == '\u0000') {
3048                             break stateloop;
3049                         }
3050                         /*
3051                          * The data structure is as follows:
3052                          *
3053                          * HILO_ACCEL is a two-dimensional int array whose major
3054                          * index corresponds to the second character of the
3055                          * character reference (code point as index) and the
3056                          * minor index corresponds to the first character of the
3057                          * character reference (packed so that A-Z runs from 0
3058                          * to 25 and a-z runs from 26 to 51). This layout makes
3059                          * it easier to use the sparseness of the data structure
3060                          * to omit parts of it: The second dimension of the
3061                          * table is null when no character reference starts with
3062                          * the character corresponding to that row.
3063                          *
3064                          * The int value HILO_ACCEL (by these indices) is zero
3065                          * if there exists no character reference starting with
3066                          * that two-letter prefix. Otherwise, the value is an
3067                          * int that packs two shorts so that the higher short is
3068                          * the index of the highest character reference name
3069                          * with that prefix in NAMES and the lower short
3070                          * corresponds to the index of the lowest character
3071                          * reference name with that prefix. (It happens that the
3072                          * first two character reference names share their
3073                          * prefix so the packed int cannot be 0 by packing the
3074                          * two shorts.)
3075                          *
3076                          * NAMES is an array of byte arrays where each byte
3077                          * array encodes the name of a character reference as
3078                          * ASCII. The names omit the first two letters of the
3079                          * name. (Since storing the first two letters would be
3080                          * redundant with the data contained in HILO_ACCEL.) The
3081                          * entries are lexically sorted.
3082                          *
3083                          * For a given index in NAMES, the same index in VALUES
3084                          * contains the corresponding expansion as an array of
3085                          * two UTF-16 code units (either the character and
3086                          * U+0000 or a surrogate pair).
3087                          */
3088                         int hilo = 0;
3089                         if (c <= 'z') {
3090                             @Const @NoLength int[] row = NamedCharactersAccel.HILO_ACCEL[c];
3091                             if (row != null) {
3092                                 hilo = row[firstCharKey];
3093                             }
3094                         }
3095                         if (hilo == 0) {
3096                             /*
3097                              * If no match can be made, then this is a parse
3098                              * error.
3099                              */
3100                             errNoNamedCharacterMatch();
3101                             emitOrAppendCharRefBuf(returnState);
3102                             if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
3103                                 cstart = pos;
3104                             }
3105                             reconsume = true;
3106                             state = transition(state, returnState, reconsume, pos);
3107                             continue stateloop;
3108                         }
3109                         // Didn't fail yet
3110                         appendCharRefBuf(c);
3111                         lo = hilo & 0xFFFF;
3112                         hi = hilo >> 16;
3113                         entCol = -1;
3114                         candidate = -1;
3115                         charRefBufMark = 0;
3116                         state = transition(state, Tokenizer.CHARACTER_REFERENCE_TAIL, reconsume, pos);
3117                         // FALL THROUGH continue stateloop;
3118                     }
3119                 case CHARACTER_REFERENCE_TAIL:
3120                     outer: for (;;) {
3121                         if (++pos == endPos) {
3122                             break stateloop;
3123                         }
3124                         c = checkChar(buf, pos);
3125                         if (c == '\u0000') {
3126                             break stateloop;
3127                         }
3128                         entCol++;
3129                         /*
3130                          * Consume the maximum number of characters possible,
3131                          * with the consumed characters matching one of the
3132                          * identifiers in the first column of the named
3133                          * character references table (in a case-sensitive
3134                          * manner).
3135                          */
3136                         loloop: for (;;) {
3137                             if (hi < lo) {
3138                                 break outer;
3139                             }
3140                             if (entCol == NamedCharacters.NAMES[lo].length()) {
3141                                 candidate = lo;
3142                                 charRefBufMark = charRefBufLen;
3143                                 lo++;
3144                             } else if (entCol > NamedCharacters.NAMES[lo].length()) {
3145                                 break outer;
3146                             } else if (c > NamedCharacters.NAMES[lo].charAt(entCol)) {
3147                                 lo++;
3148                             } else {
3149                                 break loloop;
3150                             }
3151                         }
3152 
3153                         hiloop: for (;;) {
3154                             if (hi < lo) {
3155                                 break outer;
3156                             }
3157                             if (entCol == NamedCharacters.NAMES[hi].length()) {
3158                                 break hiloop;
3159                             }
3160                             if (entCol > NamedCharacters.NAMES[hi].length()) {
3161                                 break outer;
3162                             } else if (c < NamedCharacters.NAMES[hi].charAt(entCol)) {
3163                                 hi--;
3164                             } else {
3165                                 break hiloop;
3166                             }
3167                         }
3168 
3169                         if (c == ';') {
3170                             // If we see a semicolon, there cannot be a
3171                             // longer match. Break the loop. However, before
3172                             // breaking, take the longest match so far as the
3173                             // candidate, if we are just about to complete a
3174                             // match.
3175                             if (entCol + 1 == NamedCharacters.NAMES[lo].length()) {
3176                                 candidate = lo;
3177                                 charRefBufMark = charRefBufLen;
3178                             }
3179                             break outer;
3180                         }
3181 
3182                         if (hi < lo) {
3183                             break outer;
3184                         }
3185                         appendCharRefBuf(c);
3186                         continue;
3187                     }
3188 
3189                     if (candidate == -1) {
3190                         // reconsume deals with CR, LF or nul
3191                         /*
3192                          * If no match can be made, then this is a parse error.
3193                          */
3194                         errNoNamedCharacterMatch();
3195                         emitOrAppendCharRefBuf(returnState);
3196                         if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
3197                             cstart = pos;
3198                         }
3199                         reconsume = true;
3200                         state = transition(state, returnState, reconsume, pos);
3201                         continue stateloop;
3202                     } else {
3203                         // c can't be CR, LF or nul if we got here
3204                         @Const @CharacterName String candidateName = NamedCharacters.NAMES[candidate];
3205                         if (candidateName.length() == 0
3206                                 || candidateName.charAt(candidateName.length() - 1) != ';') {
3207                             /*
3208                              * If the last character matched is not a U+003B
3209                              * SEMICOLON (;), there is a parse error.
3210                              */
3211                             if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
3212                                 /*
3213                                  * If the entity is being consumed as part of an
3214                                  * attribute, and the last character matched is
3215                                  * not a U+003B SEMICOLON (;),
3216                                  */
3217                                 char ch;
3218                                 if (charRefBufMark == charRefBufLen) {
3219                                     ch = c;
3220                                 } else {
3221                                     ch = charRefBuf[charRefBufMark];
3222                                 }
3223                                 if (ch == '=' || (ch >= '0' && ch <= '9')
3224                                         || (ch >= 'A' && ch <= 'Z')
3225                                         || (ch >= 'a' && ch <= 'z')) {
3226                                     /*
3227                                      * and the next character is either a U+003D
3228                                      * EQUALS SIGN character (=) or in the range
3229                                      * U+0030 DIGIT ZERO to U+0039 DIGIT NINE,
3230                                      * U+0041 LATIN CAPITAL LETTER A to U+005A
3231                                      * LATIN CAPITAL LETTER Z, or U+0061 LATIN
3232                                      * SMALL LETTER A to U+007A LATIN SMALL
3233                                      * LETTER Z, then, for historical reasons,
3234                                      * all the characters that were matched
3235                                      * after the U+0026 AMPERSAND (&) must be
3236                                      * unconsumed, and nothing is returned.
3237                                      */
3238                                     errNoNamedCharacterMatch();
3239                                     appendCharRefBufToStrBuf();
3240                                     reconsume = true;
3241                                     state = transition(state, returnState, reconsume, pos);
3242                                     continue stateloop;
3243                                 }
3244                             }
3245                             if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
3246                                 errUnescapedAmpersandInterpretedAsCharacterReference();
3247                             } else {
3248                                 errNotSemicolonTerminated();
3249                             }
3250                         }
3251 
3252                         /*
3253                          * Otherwise, return a character token for the character
3254                          * corresponding to the entity name (as given by the
3255                          * second column of the named character references
3256                          * table).
3257                          */
3258                         // CPPONLY: completedNamedCharacterReference();
3259                         @Const @NoLength char[] val = NamedCharacters.VALUES[candidate];
3260                         if (
3261                         // [NOCPP[
3262                         val.length == 1
3263                         // ]NOCPP]
3264                         // CPPONLY: val[1] == 0
3265                         ) {
3266                             emitOrAppendOne(val, returnState);
3267                         } else {
3268                             emitOrAppendTwo(val, returnState);
3269                         }
3270                         // this is so complicated!
3271                         if (charRefBufMark < charRefBufLen) {
3272                             if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
3273                                 appendStrBuf(charRefBuf, charRefBufMark,
3274                                         charRefBufLen - charRefBufMark);
3275                             } else {
3276                                 tokenHandler.characters(charRefBuf, charRefBufMark,
3277                                         charRefBufLen - charRefBufMark);
3278                             }
3279                         }
3280                         // charRefBufLen will be zeroed below!
3281 
3282                         // Check if we broke out early with c being the last
3283                         // character that matched as opposed to being the
3284                         // first one that didn't match. In the case of an
3285                         // early break, the next run on text should start
3286                         // *after* the current character and the current
3287                         // character shouldn't be reconsumed.
3288                         boolean earlyBreak = (c == ';' && charRefBufMark == charRefBufLen);
3289                         charRefBufLen = 0;
3290                         if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
3291                             cstart = earlyBreak ? pos + 1 : pos;
3292                         }
3293                         reconsume = !earlyBreak;
3294                         state = transition(state, returnState, reconsume, pos);
3295                         continue stateloop;
3296                         /*
3297                          * If the markup contains I'm &notit; I tell you, the
3298                          * entity is parsed as "not", as in, I'm ¬it; I tell
3299                          * you. But if the markup was I'm &notin; I tell you,
3300                          * the entity would be parsed as "notin;", resulting in
3301                          * I'm ∉ I tell you.
3302                          */
3303                     }
3304                     // XXX reorder point
3305                 case CONSUME_NCR:
3306                     if (++pos == endPos) {
3307                         break stateloop;
3308                     }
3309                     c = checkChar(buf, pos);
3310                     value = 0;
3311                     seenDigits = false;
3312                     /*
3313                      * The behavior further depends on the character after the
3314                      * U+0023 NUMBER SIGN:
3315                      */
3316                     switch (c) {
3317                         case 'x':
3318                         case 'X':
3319 
3320                             /*
3321                              * U+0078 LATIN SMALL LETTER X U+0058 LATIN CAPITAL
3322                              * LETTER X Consume the X.
3323                              *
3324                              * Follow the steps below, but using the range of
3325                              * characters U+0030 DIGIT ZERO through to U+0039
3326                              * DIGIT NINE, U+0061 LATIN SMALL LETTER A through
3327                              * to U+0066 LATIN SMALL LETTER F, and U+0041 LATIN
3328                              * CAPITAL LETTER A, through to U+0046 LATIN CAPITAL
3329                              * LETTER F (in other words, 0-9, A-F, a-f).
3330                              *
3331                              * When it comes to interpreting the number,
3332                              * interpret it as a hexadecimal number.
3333                              */
3334                             appendCharRefBuf(c);
3335                             state = transition(state, Tokenizer.HEX_NCR_LOOP, reconsume, pos);
3336                             continue stateloop;
3337                         default:
3338                             /*
3339                              * Anything else Follow the steps below, but using
3340                              * the range of characters U+0030 DIGIT ZERO through
3341                              * to U+0039 DIGIT NINE (i.e. just 0-9).
3342                              *
3343                              * When it comes to interpreting the number,
3344                              * interpret it as a decimal number.
3345                              */
3346                             reconsume = true;
3347                             state = transition(state, Tokenizer.DECIMAL_NRC_LOOP, reconsume, pos);
3348                             // FALL THROUGH continue stateloop;
3349                     }
3350                     // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
3351                 case DECIMAL_NRC_LOOP:
3352                     decimalloop: for (;;) {
3353                         if (reconsume) {
3354                             reconsume = false;
3355                         } else {
3356                             if (++pos == endPos) {
3357                                 break stateloop;
3358                             }
3359                             c = checkChar(buf, pos);
3360                         }
3361                         /*
3362                          * Consume as many characters as match the range of
3363                          * characters given above.
3364                          */
3365                         assert value >= 0: "value must not become negative.";
3366                         if (c >= '0' && c <= '9') {
3367                             seenDigits = true;
3368                             // Avoid overflow
3369                             if (value <= 0x10FFFF) {
3370                                 value *= 10;
3371                                 value += c - '0';
3372                             }
3373                             continue;
3374                         } else if (c == ';') {
3375                             if (seenDigits) {
3376                                 if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
3377                                     cstart = pos + 1;
3378                                 }
3379                                 state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos);
3380                                 // FALL THROUGH continue stateloop;
3381                                 break decimalloop;
3382                             } else {
3383                                 errNoDigitsInNCR();
3384                                 appendCharRefBuf(';');
3385                                 emitOrAppendCharRefBuf(returnState);
3386                                 if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
3387                                     cstart = pos + 1;
3388                                 }
3389                                 state = transition(state, returnState, reconsume, pos);
3390                                 continue stateloop;
3391                             }
3392                         } else {
3393                             /*
3394                              * If no characters match the range, then don't
3395                              * consume any characters (and unconsume the U+0023
3396                              * NUMBER SIGN character and, if appropriate, the X
3397                              * character). This is a parse error; nothing is
3398                              * returned.
3399                              *
3400                              * Otherwise, if the next character is a U+003B
3401                              * SEMICOLON, consume that too. If it isn't, there
3402                              * is a parse error.
3403                              */
3404                             if (!seenDigits) {
3405                                 errNoDigitsInNCR();
3406                                 emitOrAppendCharRefBuf(returnState);
3407                                 if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
3408                                     cstart = pos;
3409                                 }
3410                                 reconsume = true;
3411                                 state = transition(state, returnState, reconsume, pos);
3412                                 continue stateloop;
3413                             } else {
3414                                 errCharRefLacksSemicolon();
3415                                 if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
3416                                     cstart = pos;
3417                                 }
3418                                 reconsume = true;
3419                                 state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos);
3420                                 // FALL THROUGH continue stateloop;
3421                                 break decimalloop;
3422                             }
3423                         }
3424                     }
3425                     // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
3426                 case HANDLE_NCR_VALUE:
3427                     // WARNING previous state sets reconsume
3428                     // We are not going to emit the contents of charRefBuf.
3429                     charRefBufLen = 0;
3430                     // XXX inline this case if the method size can take it
3431                     handleNcrValue(returnState);
3432                     state = transition(state, returnState, reconsume, pos);
3433                     continue stateloop;
3434                     // XXX reorder point
3435                 case HEX_NCR_LOOP:
3436                     for (;;) {
3437                         if (++pos == endPos) {
3438                             break stateloop;
3439                         }
3440                         c = checkChar(buf, pos);
3441                         /*
3442                          * Consume as many characters as match the range of
3443                          * characters given above.
3444                          */
3445                         assert value >= 0: "value must not become negative.";
3446                         if (c >= '0' && c <= '9') {
3447                             seenDigits = true;
3448                             // Avoid overflow
3449                             if (value <= 0x10FFFF) {
3450                                 value *= 16;
3451                                 value += c - '0';
3452                             }
3453                             continue;
3454                         } else if (c >= 'A' && c <= 'F') {
3455                             seenDigits = true;
3456                             // Avoid overflow
3457                             if (value <= 0x10FFFF) {
3458                                 value *= 16;
3459                                 value += c - 'A' + 10;
3460                             }
3461                             continue;
3462                         } else if (c >= 'a' && c <= 'f') {
3463                             seenDigits = true;
3464                             // Avoid overflow
3465                             if (value <= 0x10FFFF) {
3466                                 value *= 16;
3467                                 value += c - 'a' + 10;
3468                             }
3469                             continue;
3470                         } else if (c == ';') {
3471                             if (seenDigits) {
3472                                 if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
3473                                     cstart = pos + 1;
3474                                 }
3475                                 state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos);
3476                                 continue stateloop;
3477                             } else {
3478                                 errNoDigitsInNCR();
3479                                 appendCharRefBuf(';');
3480                                 emitOrAppendCharRefBuf(returnState);
3481                                 if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
3482                                     cstart = pos + 1;
3483                                 }
3484                                 state = transition(state, returnState, reconsume, pos);
3485                                 continue stateloop;
3486                             }
3487                         } else {
3488                             /*
3489                              * If no characters match the range, then don't
3490                              * consume any characters (and unconsume the U+0023
3491                              * NUMBER SIGN character and, if appropriate, the X
3492                              * character). This is a parse error; nothing is
3493                              * returned.
3494                              *
3495                              * Otherwise, if the next character is a U+003B
3496                              * SEMICOLON, consume that too. If it isn't, there
3497                              * is a parse error.
3498                              */
3499                             if (!seenDigits) {
3500                                 errNoDigitsInNCR();
3501                                 emitOrAppendCharRefBuf(returnState);
3502                                 if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
3503                                     cstart = pos;
3504                                 }
3505                                 reconsume = true;
3506                                 state = transition(state, returnState, reconsume, pos);
3507                                 continue stateloop;
3508                             } else {
3509                                 errCharRefLacksSemicolon();
3510                                 if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
3511                                     cstart = pos;
3512                                 }
3513                                 reconsume = true;
3514                                 state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos);
3515                                 continue stateloop;
3516                             }
3517                         }
3518                     }
3519                     // XXX reorder point
3520                 case PLAINTEXT:
3521                     plaintextloop: for (;;) {
3522                         if (reconsume) {
3523                             reconsume = false;
3524                         } else {
3525                             if (++pos == endPos) {
3526                                 break stateloop;
3527                             }
3528                             c = checkChar(buf, pos);
3529                         }
3530                         switch (c) {
3531                             case '\u0000':
3532                                 emitPlaintextReplacementCharacter(buf, pos);
3533                                 continue;
3534                             case '\r':
3535                                 emitCarriageReturn(buf, pos);
3536                                 break stateloop;
3537                             case '\n':
3538                                 silentLineFeed();
3539                             default:
3540                                 /*
3541                                  * Anything else Emit the current input
3542                                  * character as a character token. Stay in the
3543                                  * PLAINTEXT state.
3544                                  */
3545                                 continue;
3546                         }
3547                     }
3548                     // XXX reorder point
3549                 case CLOSE_TAG_OPEN:
3550                     if (++pos == endPos) {
3551                         break stateloop;
3552                     }
3553                     c = checkChar(buf, pos);
3554                     /*
3555                      * Otherwise, if the content model flag is set to the PCDATA
3556                      * state, or if the next few characters do match that tag
3557                      * name, consume the next input character:
3558                      */
3559                     switch (c) {
3560                         case '>':
3561                             /* U+003E GREATER-THAN SIGN (>) Parse error. */
3562                             errLtSlashGt();
3563                             /*
3564                              * Switch to the data state.
3565                              */
3566                             cstart = pos + 1;
3567                             state = transition(state, Tokenizer.DATA, reconsume, pos);
3568                             continue stateloop;
3569                         case '\r':
3570                             silentCarriageReturn();
3571                             /* Anything else Parse error. */
3572                             errGarbageAfterLtSlash();
3573                             /*
3574                              * Switch to the bogus comment state.
3575                              */
3576                             clearStrBufBeforeUse();
3577                             appendStrBuf('\n');
3578                             state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
3579                             break stateloop;
3580                         case '\n':
3581                             silentLineFeed();
3582                             /* Anything else Parse error. */
3583                             errGarbageAfterLtSlash();
3584                             /*
3585                              * Switch to the bogus comment state.
3586                              */
3587                             clearStrBufBeforeUse();
3588                             appendStrBuf(c);
3589                             state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
3590                             continue stateloop;
3591                         case '\u0000':
3592                             c = '\uFFFD';
3593                             // fall thru
3594                         default:
3595                             if (c >= 'A' && c <= 'Z') {
3596                                 c += 0x20;
3597                             }
3598                             if (c >= 'a' && c <= 'z') {
3599                                 /*
3600                                  * U+0061 LATIN SMALL LETTER A through to U+007A
3601                                  * LATIN SMALL LETTER Z Create a new end tag
3602                                  * token,
3603                                  */
3604                                 endTag = true;
3605                                 /*
3606                                  * set its tag name to the input character,
3607                                  */
3608                                 clearStrBufBeforeUse();
3609                                 appendStrBuf(c);
3610                                 /*
3611                                  * then switch to the tag name state. (Don't
3612                                  * emit the token yet; further details will be
3613                                  * filled in before it is emitted.)
3614                                  */
3615                                 state = transition(state, Tokenizer.TAG_NAME, reconsume, pos);
3616                                 continue stateloop;
3617                             } else {
3618                                 /* Anything else Parse error. */
3619                                 errGarbageAfterLtSlash();
3620                                 /*
3621                                  * Switch to the bogus comment state.
3622                                  */
3623                                 clearStrBufBeforeUse();
3624                                 appendStrBuf(c);
3625                                 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
3626                                 continue stateloop;
3627                             }
3628                     }
3629                     // XXX reorder point
3630                 case RCDATA:
3631                     rcdataloop: for (;;) {
3632                         if (reconsume) {
3633                             reconsume = false;
3634                         } else {
3635                             if (++pos == endPos) {
3636                                 break stateloop;
3637                             }
3638                             c = checkChar(buf, pos);
3639                         }
3640                         switch (c) {
3641                             case '&':
3642                                 /*
3643                                  * U+0026 AMPERSAND (&) Switch to the character
3644                                  * reference in RCDATA state.
3645                                  */
3646                                 flushChars(buf, pos);
3647                                 assert charRefBufLen == 0: "charRefBufLen not reset after previous use!";
3648                                 appendCharRefBuf(c);
3649                                 setAdditionalAndRememberAmpersandLocation('\u0000');
3650                                 returnState = state;
3651                                 state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos);
3652                                 continue stateloop;
3653                             case '<':
3654                                 /*
3655                                  * U+003C LESS-THAN SIGN (<) Switch to the
3656                                  * RCDATA less-than sign state.
3657                                  */
3658                                 flushChars(buf, pos);
3659 
3660                                 returnState = state;
3661                                 state = transition(state, Tokenizer.RAWTEXT_RCDATA_LESS_THAN_SIGN, reconsume, pos);
3662                                 continue stateloop;
3663                             case '\u0000':
3664                                 emitReplacementCharacter(buf, pos);
3665                                 continue;
3666                             case '\r':
3667                                 emitCarriageReturn(buf, pos);
3668                                 break stateloop;
3669                             case '\n':
3670                                 silentLineFeed();
3671                             default:
3672                                 /*
3673                                  * Emit the current input character as a
3674                                  * character token. Stay in the RCDATA state.
3675                                  */
3676                                 continue;
3677                         }
3678                     }
3679                     // XXX reorder point
3680                 case RAWTEXT:
3681                     rawtextloop: for (;;) {
3682                         if (reconsume) {
3683                             reconsume = false;
3684                         } else {
3685                             if (++pos == endPos) {
3686                                 break stateloop;
3687                             }
3688                             c = checkChar(buf, pos);
3689                         }
3690                         switch (c) {
3691                             case '<':
3692                                 /*
3693                                  * U+003C LESS-THAN SIGN (<) Switch to the
3694                                  * RAWTEXT less-than sign state.
3695                                  */
3696                                 flushChars(buf, pos);
3697 
3698                                 returnState = state;
3699                                 state = transition(state, Tokenizer.RAWTEXT_RCDATA_LESS_THAN_SIGN, reconsume, pos);
3700                                 break rawtextloop;
3701                             // FALL THRU continue stateloop;
3702                             case '\u0000':
3703                                 emitReplacementCharacter(buf, pos);
3704                                 continue;
3705                             case '\r':
3706                                 emitCarriageReturn(buf, pos);
3707                                 break stateloop;
3708                             case '\n':
3709                                 silentLineFeed();
3710                             default:
3711                                 /*
3712                                  * Emit the current input character as a
3713                                  * character token. Stay in the RAWTEXT state.
3714                                  */
3715                                 continue;
3716                         }
3717                     }
3718                     // XXX fallthru don't reorder
3719                 case RAWTEXT_RCDATA_LESS_THAN_SIGN:
3720                     rawtextrcdatalessthansignloop: for (;;) {
3721                         if (++pos == endPos) {
3722                             break stateloop;
3723                         }
3724                         c = checkChar(buf, pos);
3725                         switch (c) {
3726                             case '/':
3727                                 /*
3728                                  * U+002F SOLIDUS (/) Set the temporary buffer
3729                                  * to the empty string. Switch to the non-data
3730                                  * end tag name state (NON_DATA_END_TAG_NAME).
3731                                  */
3732                                 index = 0;
3733                                 clearStrBufBeforeUse();
3734                                 state = transition(state, Tokenizer.NON_DATA_END_TAG_NAME, reconsume, pos);
3735                                 break rawtextrcdatalessthansignloop;
3736                             // FALL THRU continue stateloop;
3737                             default:
3738                                 /*
3739                                  * Otherwise, emit a U+003C LESS-THAN SIGN
3740                                  * character token
3741                                  */
3742                                 tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
3743                                 /*
3744                                  * and reconsume the current input character in
3745                                  * the state saved in returnState.
3746                                  */
3747                                 cstart = pos;
3748                                 reconsume = true;
3749                                 state = transition(state, returnState, reconsume, pos);
3750                                 continue stateloop;
3751                         }
3752                     }
3753                     // XXX fall thru. don't reorder.
3754                 case NON_DATA_END_TAG_NAME:
3755                     for (;;) {
3756                         if (++pos == endPos) {
3757                             break stateloop;
3758                         }
3759                         c = checkChar(buf, pos);
3760                         /*
3761                          * Precondition: before entering this state, set index to 0,
3762                          * call clearStrBufBeforeUse(), and have a non-null end tag
3763                          * expectation (historically "contentModelElement"). The name is
3764                          * matched without lookahead; strBuf is the 'temporary buffer'.
3765                          */
3766                         if (index < endTagExpectationAsArray.length) {
3767                             char e = endTagExpectationAsArray[index];
3768                             char folded = c;
3769                             if (c >= 'A' && c <= 'Z') {
3770                                 folded += 0x20;
3771                             }
3772                             if (folded != e) {
3773                                 // [NOCPP[
3774                                 errHtml4LtSlashInRcdata(folded);
3775                                 // ]NOCPP]
3776                                 tokenHandler.characters(Tokenizer.LT_SOLIDUS,
3777                                         0, 2);
3778                                 emitStrBuf();
3779                                 cstart = pos;
3780                                 reconsume = true;
3781                                 state = transition(state, returnState, reconsume, pos);
3782                                 continue stateloop;
3783                             }
3784                             appendStrBuf(c);
3785                             index++;
3786                             continue;
3787                         } else {
3788                             endTag = true;
3789                             // XXX replace contentModelElement with different
3790                             // type
3791                             tagName = endTagExpectation;
3792                             switch (c) {
3793                                 case '\r':
3794                                     silentCarriageReturn();
3795                                     clearStrBufAfterUse(); // strBuf not used
3796                                     state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
3797                                     break stateloop;
3798                                 case '\n':
3799                                     silentLineFeed();
3800                                     // fall thru
3801                                 case ' ':
3802                                 case '\t':
3803                                 case '\u000C':
3804                                     /*
3805                                      * U+0009 CHARACTER TABULATION U+000A LINE
3806                                      * FEED (LF) U+000C FORM FEED (FF) U+0020
3807                                      * SPACE If the current end tag token is an
3808                                      * appropriate end tag token, then switch to
3809                                      * the before attribute name state.
3810                                      */
3811                                     clearStrBufAfterUse(); // strBuf not used
3812                                     state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
3813                                     continue stateloop;
3814                                 case '/':
3815                                     /*
3816                                      * U+002F SOLIDUS (/) If the current end tag
3817                                      * token is an appropriate end tag token,
3818                                      * then switch to the self-closing start tag
3819                                      * state.
3820                                      */
3821                                     clearStrBufAfterUse(); // strBuf not used
3822                                     state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);
3823                                     continue stateloop;
3824                                 case '>':
3825                                     /*
3826                                      * U+003E GREATER-THAN SIGN (>) If the
3827                                      * current end tag token is an appropriate
3828                                      * end tag token, then emit the current tag
3829                                      * token and switch to the data state.
3830                                      */
3831                                     clearStrBufAfterUse(); // strBuf not used
3832                                     state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
3833                                     if (shouldSuspend) {
3834                                         break stateloop;
3835                                     }
3836                                     continue stateloop;
3837                                 default:
3838                                     /*
3839                                      * Emit a U+003C LESS-THAN SIGN character
3840                                      * token, a U+002F SOLIDUS character token,
3841                                      * a character token for each of the
3842                                      * characters in the temporary buffer (in
3843                                      * the order they were added to the buffer),
3844                                      * and reconsume the current input character
3845                                      * in the RAWTEXT state.
3846                                      */
3847                                     // [NOCPP[
3848                                     errWarnLtSlashInRcdata();
3849                                     // ]NOCPP]
3850                                     tokenHandler.characters(
3851                                             Tokenizer.LT_SOLIDUS, 0, 2);
3852                                     emitStrBuf();
3853                                     if (c == '\u0000') {
3854                                         emitReplacementCharacter(buf, pos);
3855                                     } else {
3856                                         cstart = pos; // don't drop the
3857                                         // character
3858                                     }
3859                                     state = transition(state, returnState, reconsume, pos);
3860                                     continue stateloop;
3861                             }
3862                         }
3863                     }
3864                     // XXX reorder point
3865                     // BEGIN HOTSPOT WORKAROUND
3866                 case BOGUS_COMMENT:
3867                     boguscommentloop: for (;;) {
3868                         if (reconsume) {
3869                             reconsume = false;
3870                         } else {
3871                             if (++pos == endPos) {
3872                                 break stateloop;
3873                             }
3874                             c = checkChar(buf, pos);
3875                         }
3876                         /*
3877                          * Consume every character up to and including the first
3878                          * U+003E GREATER-THAN SIGN character (>) or the end of
3879                          * the file (EOF), whichever comes first. Emit a comment
3880                          * token whose data is the concatenation of all the
3881                          * characters starting from and including the character
3882                          * that caused the state machine to switch into the
3883                          * bogus comment state, up to and including the
3884                          * character immediately before the last consumed
3885                          * character (i.e. up to the character just before the
3886                          * U+003E or EOF character). (If the comment was started
3887                          * by the end of the file (EOF), the token is empty.)
3888                          *
3889                          * Switch to the data state.
3890                          *
3891                          * If the end of the file was reached, reconsume the EOF
3892                          * character.
3893                          */
3894                         switch (c) {
3895                             case '>':
3896                                 emitComment(0, pos);
3897                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
3898                                 continue stateloop;
3899                             case '-':
3900                                 appendStrBuf(c);
3901                                 state = transition(state, Tokenizer.BOGUS_COMMENT_HYPHEN, reconsume, pos);
3902                                 break boguscommentloop;
3903                             case '\r':
3904                                 appendStrBufCarriageReturn();
3905                                 break stateloop;
3906                             case '\n':
3907                                 appendStrBufLineFeed();
3908                                 continue;
3909                             case '\u0000':
3910                                 c = '\uFFFD';
3911                                 // fall thru
3912                             default:
3913                                 appendStrBuf(c);
3914                                 continue;
3915                         }
3916                     }
3917                     // FALLTHRU DON'T REORDER
3918                 case BOGUS_COMMENT_HYPHEN:
3919                     boguscommenthyphenloop: for (;;) {
3920                         if (++pos == endPos) {
3921                             break stateloop;
3922                         }
3923                         c = checkChar(buf, pos);
3924                         switch (c) {
3925                             case '>':
3926                                 // [NOCPP[
3927                                 maybeAppendSpaceToBogusComment();
3928                                 // ]NOCPP]
3929                                 emitComment(0, pos);
3930                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
3931                                 continue stateloop;
3932                             case '-':
3933                                 appendSecondHyphenToBogusComment();
3934                                 continue boguscommenthyphenloop;
3935                             case '\r':
3936                                 appendStrBufCarriageReturn();
3937                                 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
3938                                 break stateloop;
3939                             case '\n':
3940                                 appendStrBufLineFeed();
3941                                 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
3942                                 continue stateloop;
3943                             case '\u0000':
3944                                 c = '\uFFFD';
3945                                 // fall thru
3946                             default:
3947                                 appendStrBuf(c);
3948                                 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
3949                                 continue stateloop;
3950                         }
3951                     }
3952                     // XXX reorder point
3953                 case SCRIPT_DATA:
3954                     scriptdataloop: for (;;) {
3955                         if (reconsume) {
3956                             reconsume = false;
3957                         } else {
3958                             if (++pos == endPos) {
3959                                 break stateloop;
3960                             }
3961                             c = checkChar(buf, pos);
3962                         }
3963                         switch (c) {
3964                             case '<':
3965                                 /*
3966                                  * U+003C LESS-THAN SIGN (<) Switch to the
3967                                  * script data less-than sign state.
3968                                  */
3969                                 flushChars(buf, pos);
3970                                 returnState = state;
3971                                 state = transition(state, Tokenizer.SCRIPT_DATA_LESS_THAN_SIGN, reconsume, pos);
3972                                 break scriptdataloop; // FALL THRU continue
3973                             // stateloop;
3974                             case '\u0000':
3975                                 emitReplacementCharacter(buf, pos);
3976                                 continue;
3977                             case '\r':
3978                                 emitCarriageReturn(buf, pos);
3979                                 break stateloop;
3980                             case '\n':
3981                                 silentLineFeed();
3982                             default:
3983                                 /*
3984                                  * Anything else Emit the current input
3985                                  * character as a character token. Stay in the
3986                                  * script data state.
3987                                  */
3988                                 continue;
3989                         }
3990                     }
3991                     // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
3992                 case SCRIPT_DATA_LESS_THAN_SIGN:
3993                     scriptdatalessthansignloop: for (;;) {
3994                         if (++pos == endPos) {
3995                             break stateloop;
3996                         }
3997                         c = checkChar(buf, pos);
3998                         switch (c) {
3999                             case '/':
4000                                 /*
4001                                  * U+002F SOLIDUS (/) Set the temporary buffer
4002                                  * to the empty string. Switch to the script
4003                                  * data end tag open state.
4004                                  */
4005                                 index = 0;
4006                                 clearStrBufBeforeUse();
4007                                 state = transition(state, Tokenizer.NON_DATA_END_TAG_NAME, reconsume, pos);
4008                                 continue stateloop;
4009                             case '!':
4010                                 tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
4011                                 cstart = pos;
4012                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPE_START, reconsume, pos);
4013                                 break scriptdatalessthansignloop; // FALL THRU
4014                             // continue
4015                             // stateloop;
4016                             default:
4017                                 /*
4018                                  * Otherwise, emit a U+003C LESS-THAN SIGN
4019                                  * character token
4020                                  */
4021                                 tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
4022                                 /*
4023                                  * and reconsume the current input character in
4024                                  * the script data state.
4025                                  */
4026                                 cstart = pos;
4027                                 reconsume = true;
4028                                 state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos);
4029                                 continue stateloop;
4030                         }
4031                     }
4032                     // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
4033                 case SCRIPT_DATA_ESCAPE_START:
4034                     scriptdataescapestartloop: for (;;) {
4035                         if (++pos == endPos) {
4036                             break stateloop;
4037                         }
4038                         c = checkChar(buf, pos);
4039                         /*
4040                          * Consume the next input character:
4041                          */
4042                         switch (c) {
4043                             case '-':
4044                                 /*
4045                                  * U+002D HYPHEN-MINUS (-) Emit a U+002D
4046                                  * HYPHEN-MINUS character token. Switch to the
4047                                  * script data escape start dash state.
4048                                  */
4049                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPE_START_DASH, reconsume, pos);
4050                                 break scriptdataescapestartloop; // FALL THRU
4051                             // continue
4052                             // stateloop;
4053                             default:
4054                                 /*
4055                                  * Anything else Reconsume the current input
4056                                  * character in the script data state.
4057                                  */
4058                                 reconsume = true;
4059                                 state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos);
4060                                 continue stateloop;
4061                         }
4062                     }
4063                     // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
4064                 case SCRIPT_DATA_ESCAPE_START_DASH:
4065                     scriptdataescapestartdashloop: for (;;) {
4066                         if (++pos == endPos) {
4067                             break stateloop;
4068                         }
4069                         c = checkChar(buf, pos);
4070                         /*
4071                          * Consume the next input character:
4072                          */
4073                         switch (c) {
4074                             case '-':
4075                                 /*
4076                                  * U+002D HYPHEN-MINUS (-) Emit a U+002D
4077                                  * HYPHEN-MINUS character token. Switch to the
4078                                  * script data escaped dash dash state.
4079                                  */
4080                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_DASH_DASH, reconsume, pos);
4081                                 break scriptdataescapestartdashloop;
4082                             // continue stateloop;
4083                             default:
4084                                 /*
4085                                  * Anything else Reconsume the current input
4086                                  * character in the script data state.
4087                                  */
4088                                 reconsume = true;
4089                                 state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos);
4090                                 continue stateloop;
4091                         }
4092                     }
4093                     // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
4094                 case SCRIPT_DATA_ESCAPED_DASH_DASH:
4095                     scriptdataescapeddashdashloop: for (;;) {
4096                         if (++pos == endPos) {
4097                             break stateloop;
4098                         }
4099                         c = checkChar(buf, pos);
4100                         /*
4101                          * Consume the next input character:
4102                          */
4103                         switch (c) {
4104                             case '-':
4105                                 /*
4106                                  * U+002D HYPHEN-MINUS (-) Emit a U+002D
4107                                  * HYPHEN-MINUS character token. Stay in the
4108                                  * script data escaped dash dash state.
4109                                  */
4110                                 continue;
4111                             case '<':
4112                                 /*
4113                                  * U+003C LESS-THAN SIGN (<) Switch to the
4114                                  * script data escaped less-than sign state.
4115                                  */
4116                                 flushChars(buf, pos);
4117                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN, reconsume, pos);
4118                                 continue stateloop;
4119                             case '>':
4120                                 /*
4121                                  * U+003E GREATER-THAN SIGN (>) Emit a U+003E
4122                                  * GREATER-THAN SIGN character token. Switch to
4123                                  * the script data state.
4124                                  */
4125                                 state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos);
4126                                 continue stateloop;
4127                             case '\u0000':
4128                                 emitReplacementCharacter(buf, pos);
4129                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
4130                                 break scriptdataescapeddashdashloop;
4131                             case '\r':
4132                                 emitCarriageReturn(buf, pos);
4133                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
4134                                 break stateloop;
4135                             case '\n':
4136                                 silentLineFeed();
4137                             default:
4138                                 /*
4139                                  * Anything else Emit the current input
4140                                  * character as a character token. Switch to the
4141                                  * script data escaped state.
4142                                  */
4143                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
4144                                 break scriptdataescapeddashdashloop;
4145                             // continue stateloop;
4146                         }
4147                     }
4148                     // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
4149                 case SCRIPT_DATA_ESCAPED:
4150                     scriptdataescapedloop: for (;;) {
4151                         if (reconsume) {
4152                             reconsume = false;
4153                         } else {
4154                             if (++pos == endPos) {
4155                                 break stateloop;
4156                             }
4157                             c = checkChar(buf, pos);
4158                         }
4159                         /*
4160                          * Consume the next input character:
4161                          */
4162                         switch (c) {
4163                             case '-':
4164                                 /*
4165                                  * U+002D HYPHEN-MINUS (-) Emit a U+002D
4166                                  * HYPHEN-MINUS character token. Switch to the
4167                                  * script data escaped dash state.
4168                                  */
4169                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_DASH, reconsume, pos);
4170                                 break scriptdataescapedloop; // FALL THRU
4171                             // continue
4172                             // stateloop;
4173                             case '<':
4174                                 /*
4175                                  * U+003C LESS-THAN SIGN (<) Switch to the
4176                                  * script data escaped less-than sign state.
4177                                  */
4178                                 flushChars(buf, pos);
4179                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN, reconsume, pos);
4180                                 continue stateloop;
4181                             case '\u0000':
4182                                 emitReplacementCharacter(buf, pos);
4183                                 continue;
4184                             case '\r':
4185                                 emitCarriageReturn(buf, pos);
4186                                 break stateloop;
4187                             case '\n':
4188                                 silentLineFeed();
4189                             default:
4190                                 /*
4191                                  * Anything else Emit the current input
4192                                  * character as a character token. Stay in the
4193                                  * script data escaped state.
4194                                  */
4195                                 continue;
4196                         }
4197                     }
4198                     // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
4199                 case SCRIPT_DATA_ESCAPED_DASH:
4200                     scriptdataescapeddashloop: for (;;) {
4201                         if (++pos == endPos) {
4202                             break stateloop;
4203                         }
4204                         c = checkChar(buf, pos);
4205                         /*
4206                          * Consume the next input character:
4207                          */
4208                         switch (c) {
4209                             case '-':
4210                                 /*
4211                                  * U+002D HYPHEN-MINUS (-) Emit a U+002D
4212                                  * HYPHEN-MINUS character token. Switch to the
4213                                  * script data escaped dash dash state.
4214                                  */
4215                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_DASH_DASH, reconsume, pos);
4216                                 continue stateloop;
4217                             case '<':
4218                                 /*
4219                                  * U+003C LESS-THAN SIGN (<) Switch to the
4220                                  * script data escaped less-than sign state.
4221                                  */
4222                                 flushChars(buf, pos);
4223                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN, reconsume, pos);
4224                                 break scriptdataescapeddashloop;
4225                             // continue stateloop;
4226                             case '\u0000':
4227                                 emitReplacementCharacter(buf, pos);
4228                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
4229                                 continue stateloop;
4230                             case '\r':
4231                                 emitCarriageReturn(buf, pos);
4232                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
4233                                 break stateloop;
4234                             case '\n':
4235                                 silentLineFeed();
4236                             default:
4237                                 /*
4238                                  * Anything else Emit the current input
4239                                  * character as a character token. Switch to the
4240                                  * script data escaped state.
4241                                  */
4242                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
4243                                 continue stateloop;
4244                         }
4245                     }
4246                     // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
4247                 case SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN:
4248                     scriptdataescapedlessthanloop: for (;;) {
4249                         if (++pos == endPos) {
4250                             break stateloop;
4251                         }
4252                         c = checkChar(buf, pos);
4253                         /*
4254                          * Consume the next input character:
4255                          */
4256                         switch (c) {
4257                             case '/':
4258                                 /*
4259                                  * U+002F SOLIDUS (/) Set the temporary buffer
4260                                  * to the empty string. Switch to the script
4261                                  * data escaped end tag open state.
4262                                  */
4263                                 index = 0;
4264                                 clearStrBufBeforeUse();
4265                                 returnState = Tokenizer.SCRIPT_DATA_ESCAPED;
4266                                 state = transition(state, Tokenizer.NON_DATA_END_TAG_NAME, reconsume, pos);
4267                                 continue stateloop;
4268                             case 'S':
4269                             case 's':
4270                                 /*
4271                                  * U+0041 LATIN CAPITAL LETTER A through to
4272                                  * U+005A LATIN CAPITAL LETTER Z Emit a U+003C
4273                                  * LESS-THAN SIGN character token and the
4274                                  * current input character as a character token.
4275                                  */
4276                                 tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
4277                                 cstart = pos;
4278                                 index = 1;
4279                                 /*
4280                                  * Set the temporary buffer to the empty string.
4281                                  * Append the lowercase version of the current
4282                                  * input character (add 0x0020 to the
4283                                  * character's code point) to the temporary
4284                                  * buffer. Switch to the script data double
4285                                  * escape start state.
4286                                  */
4287                                 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPE_START, reconsume, pos);
4288                                 break scriptdataescapedlessthanloop;
4289                             // continue stateloop;
4290                             default:
4291                                 /*
4292                                  * Anything else Emit a U+003C LESS-THAN SIGN
4293                                  * character token and reconsume the current
4294                                  * input character in the script data escaped
4295                                  * state.
4296                                  */
4297                                 tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
4298                                 cstart = pos;
4299                                 reconsume = true;
4300                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
4301                                 continue stateloop;
4302                         }
4303                     }
4304                     // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
4305                 case SCRIPT_DATA_DOUBLE_ESCAPE_START:
4306                     scriptdatadoubleescapestartloop: for (;;) {
4307                         if (++pos == endPos) {
4308                             break stateloop;
4309                         }
4310                         c = checkChar(buf, pos);
4311                         assert index > 0;
4312                         if (index < 6) { // SCRIPT_ARR.length
4313                             char folded = c;
4314                             if (c >= 'A' && c <= 'Z') {
4315                                 folded += 0x20;
4316                             }
4317                             if (folded != Tokenizer.SCRIPT_ARR[index]) {
4318                                 reconsume = true;
4319                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
4320                                 continue stateloop;
4321                             }
4322                             index++;
4323                             continue;
4324                         }
4325                         switch (c) {
4326                             case '\r':
4327                                 emitCarriageReturn(buf, pos);
4328                                 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
4329                                 break stateloop;
4330                             case '\n':
4331                                 silentLineFeed();
4332                             case ' ':
4333                             case '\t':
4334                             case '\u000C':
4335                             case '/':
4336                             case '>':
4337                                 /*
4338                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED
4339                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE
4340                                  * U+002F SOLIDUS (/) U+003E GREATER-THAN SIGN
4341                                  * (>) Emit the current input character as a
4342                                  * character token. If the temporary buffer is
4343                                  * the string "script", then switch to the
4344                                  * script data double escaped state.
4345                                  */
4346                                 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
4347                                 break scriptdatadoubleescapestartloop;
4348                             // continue stateloop;
4349                             default:
4350                                 /*
4351                                  * Anything else Reconsume the current input
4352                                  * character in the script data escaped state.
4353                                  */
4354                                 reconsume = true;
4355                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
4356                                 continue stateloop;
4357                         }
4358                     }
4359                     // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
4360                 case SCRIPT_DATA_DOUBLE_ESCAPED:
4361                     scriptdatadoubleescapedloop: for (;;) {
4362                         if (reconsume) {
4363                             reconsume = false;
4364                         } else {
4365                             if (++pos == endPos) {
4366                                 break stateloop;
4367                             }
4368                             c = checkChar(buf, pos);
4369                         }
4370                         /*
4371                          * Consume the next input character:
4372                          */
4373                         switch (c) {
4374                             case '-':
4375                                 /*
4376                                  * U+002D HYPHEN-MINUS (-) Emit a U+002D
4377                                  * HYPHEN-MINUS character token. Switch to the
4378                                  * script data double escaped dash state.
4379                                  */
4380                                 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_DASH, reconsume, pos);
4381                                 break scriptdatadoubleescapedloop; // FALL THRU
4382                             // continue
4383                             // stateloop;
4384                             case '<':
4385                                 /*
4386                                  * U+003C LESS-THAN SIGN (<) Emit a U+003C
4387                                  * LESS-THAN SIGN character token. Switch to the
4388                                  * script data double escaped less-than sign
4389                                  * state.
4390                                  */
4391                                 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN, reconsume, pos);
4392                                 continue stateloop;
4393                             case '\u0000':
4394                                 emitReplacementCharacter(buf, pos);
4395                                 continue;
4396                             case '\r':
4397                                 emitCarriageReturn(buf, pos);
4398                                 break stateloop;
4399                             case '\n':
4400                                 silentLineFeed();
4401                             default:
4402                                 /*
4403                                  * Anything else Emit the current input
4404                                  * character as a character token. Stay in the
4405                                  * script data double escaped state.
4406                                  */
4407                                 continue;
4408                         }
4409                     }
4410                     // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
4411                 case SCRIPT_DATA_DOUBLE_ESCAPED_DASH:
4412                     scriptdatadoubleescapeddashloop: for (;;) {
4413                         if (++pos == endPos) {
4414                             break stateloop;
4415                         }
4416                         c = checkChar(buf, pos);
4417                         /*
4418                          * Consume the next input character:
4419                          */
4420                         switch (c) {
4421                             case '-':
4422                                 /*
4423                                  * U+002D HYPHEN-MINUS (-) Emit a U+002D
4424                                  * HYPHEN-MINUS character token. Switch to the
4425                                  * script data double escaped dash dash state.
4426                                  */
4427                                 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH, reconsume, pos);
4428                                 break scriptdatadoubleescapeddashloop;
4429                             // continue stateloop;
4430                             case '<':
4431                                 /*
4432                                  * U+003C LESS-THAN SIGN (<) Emit a U+003C
4433                                  * LESS-THAN SIGN character token. Switch to the
4434                                  * script data double escaped less-than sign
4435                                  * state.
4436                                  */
4437                                 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN, reconsume, pos);
4438                                 continue stateloop;
4439                             case '\u0000':
4440                                 emitReplacementCharacter(buf, pos);
4441                                 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
4442                                 continue stateloop;
4443                             case '\r':
4444                                 emitCarriageReturn(buf, pos);
4445                                 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
4446                                 break stateloop;
4447                             case '\n':
4448                                 silentLineFeed();
4449                             default:
4450                                 /*
4451                                  * Anything else Emit the current input
4452                                  * character as a character token. Switch to the
4453                                  * script data double escaped state.
4454                                  */
4455                                 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
4456                                 continue stateloop;
4457                         }
4458                     }
4459                     // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
4460                 case SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH:
4461                     scriptdatadoubleescapeddashdashloop: for (;;) {
4462                         if (++pos == endPos) {
4463                             break stateloop;
4464                         }
4465                         c = checkChar(buf, pos);
4466                         /*
4467                          * Consume the next input character:
4468                          */
4469                         switch (c) {
4470                             case '-':
4471                                 /*
4472                                  * U+002D HYPHEN-MINUS (-) Emit a U+002D
4473                                  * HYPHEN-MINUS character token. Stay in the
4474                                  * script data double escaped dash dash state.
4475                                  */
4476                                 continue;
4477                             case '<':
4478                                 /*
4479                                  * U+003C LESS-THAN SIGN (<) Emit a U+003C
4480                                  * LESS-THAN SIGN character token. Switch to the
4481                                  * script data double escaped less-than sign
4482                                  * state.
4483                                  */
4484                                 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN, reconsume, pos);
4485                                 break scriptdatadoubleescapeddashdashloop;
4486                             case '>':
4487                                 /*
4488                                  * U+003E GREATER-THAN SIGN (>) Emit a U+003E
4489                                  * GREATER-THAN SIGN character token. Switch to
4490                                  * the script data state.
4491                                  */
4492                                 state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos);
4493                                 continue stateloop;
4494                             case '\u0000':
4495                                 emitReplacementCharacter(buf, pos);
4496                                 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
4497                                 continue stateloop;
4498                             case '\r':
4499                                 emitCarriageReturn(buf, pos);
4500                                 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
4501                                 break stateloop;
4502                             case '\n':
4503                                 silentLineFeed();
4504                             default:
4505                                 /*
4506                                  * Anything else Emit the current input
4507                                  * character as a character token. Switch to the
4508                                  * script data double escaped state.
4509                                  */
4510                                 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
4511                                 continue stateloop;
4512                         }
4513                     }
4514                     // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
4515                 case SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN:
4516                     scriptdatadoubleescapedlessthanloop: for (;;) {
4517                         if (++pos == endPos) {
4518                             break stateloop;
4519                         }
4520                         c = checkChar(buf, pos);
4521                         /*
4522                          * Consume the next input character:
4523                          */
4524                         switch (c) {
4525                             case '/':
4526                                 /*
4527                                  * U+002F SOLIDUS (/) Emit a U+002F SOLIDUS
4528                                  * character token. Set the temporary buffer to
4529                                  * the empty string. Switch to the script data
4530                                  * double escape end state.
4531                                  */
4532                                 index = 0;
4533                                 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPE_END, reconsume, pos);
4534                                 break scriptdatadoubleescapedlessthanloop;
4535                             default:
4536                                 /*
4537                                  * Anything else Reconsume the current input
4538                                  * character in the script data double escaped
4539                                  * state.
4540                                  */
4541                                 reconsume = true;
4542                                 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
4543                                 continue stateloop;
4544                         }
4545                     }
4546                     // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
4547                 case SCRIPT_DATA_DOUBLE_ESCAPE_END:
4548                     scriptdatadoubleescapeendloop: for (;;) {
4549                         if (++pos == endPos) {
4550                             break stateloop;
4551                         }
4552                         c = checkChar(buf, pos);
4553                         if (index < 6) { // SCRIPT_ARR.length
4554                             char folded = c;
4555                             if (c >= 'A' && c <= 'Z') {
4556                                 folded += 0x20;
4557                             }
4558                             if (folded != Tokenizer.SCRIPT_ARR[index]) {
4559                                 reconsume = true;
4560                                 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
4561                                 continue stateloop;
4562                             }
4563                             index++;
4564                             continue;
4565                         }
4566                         switch (c) {
4567                             case '\r':
4568                                 emitCarriageReturn(buf, pos);
4569                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
4570                                 break stateloop;
4571                             case '\n':
4572                                 silentLineFeed();
4573                             case ' ':
4574                             case '\t':
4575                             case '\u000C':
4576                             case '/':
4577                             case '>':
4578                                 /*
4579                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED
4580                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE
4581                                  * U+002F SOLIDUS (/) U+003E GREATER-THAN SIGN
4582                                  * (>) Emit the current input character as a
4583                                  * character token. If the temporary buffer is
4584                                  * the string "script", then switch to the
4585                                  * script data escaped state.
4586                                  */
4587                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
4588                                 continue stateloop;
4589                             default:
4590                                 /*
4591                                  * Reconsume the current input character in the
4592                                  * script data double escaped state.
4593                                  */
4594                                 reconsume = true;
4595                                 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
4596                                 continue stateloop;
4597                         }
4598                     }
4599                     // XXX reorder point
4600                 case MARKUP_DECLARATION_OCTYPE:
4601                     markupdeclarationdoctypeloop: for (;;) {
4602                         if (++pos == endPos) {
4603                             break stateloop;
4604                         }
4605                         c = checkChar(buf, pos);
4606                         if (index < 6) { // OCTYPE.length
4607                             char folded = c;
4608                             if (c >= 'A' && c <= 'Z') {
4609                                 folded += 0x20;
4610                             }
4611                             if (folded == Tokenizer.OCTYPE[index]) {
4612                                 appendStrBuf(c);
4613                             } else {
4614                                 errBogusComment();
4615                                 reconsume = true;
4616                                 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
4617                                 continue stateloop;
4618                             }
4619                             index++;
4620                             continue;
4621                         } else {
4622                             reconsume = true;
4623                             state = transition(state, Tokenizer.DOCTYPE, reconsume, pos);
4624                             break markupdeclarationdoctypeloop;
4625                             // continue stateloop;
4626                         }
4627                     }
4628                     // FALLTHRU DON'T REORDER
4629                 case DOCTYPE:
4630                     doctypeloop: for (;;) {
4631                         if (reconsume) {
4632                             reconsume = false;
4633                         } else {
4634                             if (++pos == endPos) {
4635                                 break stateloop;
4636                             }
4637                             c = checkChar(buf, pos);
4638                         }
4639                         initDoctypeFields();
4640                         /*
4641                          * Consume the next input character:
4642                          */
4643                         switch (c) {
4644                             case '\r':
4645                                 silentCarriageReturn();
4646                                 state = transition(state, Tokenizer.BEFORE_DOCTYPE_NAME, reconsume, pos);
4647                                 break stateloop;
4648                             case '\n':
4649                                 silentLineFeed();
4650                                 // fall thru
4651                             case ' ':
4652                             case '\t':
4653                             case '\u000C':
4654                                 /*
4655                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED
4656                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE
4657                                  * Switch to the before DOCTYPE name state.
4658                                  */
4659                                 state = transition(state, Tokenizer.BEFORE_DOCTYPE_NAME, reconsume, pos);
4660                                 break doctypeloop;
4661                             // continue stateloop;
4662                             default:
4663                                 /*
4664                                  * Anything else Parse error.
4665                                  */
4666                                 errMissingSpaceBeforeDoctypeName();
4667                                 /*
4668                                  * Reconsume the current character in the before
4669                                  * DOCTYPE name state.
4670                                  */
4671                                 reconsume = true;
4672                                 state = transition(state, Tokenizer.BEFORE_DOCTYPE_NAME, reconsume, pos);
4673                                 break doctypeloop;
4674                             // continue stateloop;
4675                         }
4676                     }
4677                     // FALLTHRU DON'T REORDER
4678                 case BEFORE_DOCTYPE_NAME:
4679                     beforedoctypenameloop: for (;;) {
4680                         if (reconsume) {
4681                             reconsume = false;
4682                         } else {
4683                             if (++pos == endPos) {
4684                                 break stateloop;
4685                             }
4686                             c = checkChar(buf, pos);
4687                         }
4688                         /*
4689                          * Consume the next input character:
4690                          */
4691                         switch (c) {
4692                             case '\r':
4693                                 silentCarriageReturn();
4694                                 break stateloop;
4695                             case '\n':
4696                                 silentLineFeed();
4697                                 // fall thru
4698                             case ' ':
4699                             case '\t':
4700                             case '\u000C':
4701                                 /*
4702                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED
4703                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
4704                                  * in the before DOCTYPE name state.
4705                                  */
4706                                 continue;
4707                             case '>':
4708                                 /*
4709                                  * U+003E GREATER-THAN SIGN (>) Parse error.
4710                                  */
4711                                 errNamelessDoctype();
4712                                 /*
4713                                  * Create a new DOCTYPE token. Set its
4714                                  * force-quirks flag to on.
4715                                  */
4716                                 forceQuirks = true;
4717                                 /*
4718                                  * Emit the token.
4719                                  */
4720                                 emitDoctypeToken(pos);
4721                                 /*
4722                                  * Switch to the data state.
4723                                  */
4724                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
4725                                 continue stateloop;
4726                             case '\u0000':
4727                                 c = '\uFFFD';
4728                                 // fall thru
4729                             default:
4730                                 if (c >= 'A' && c <= 'Z') {
4731                                     /*
4732                                      * U+0041 LATIN CAPITAL LETTER A through to
4733                                      * U+005A LATIN CAPITAL LETTER Z Create a
4734                                      * new DOCTYPE token. Set the token's name
4735                                      * to the lowercase version of the input
4736                                      * character (add 0x0020 to the character's
4737                                      * code point).
4738                                      */
4739                                     c += 0x20;
4740                                 }
4741                                 /* Anything else Create a new DOCTYPE token. */
4742                                 /*
                                 * Set the token's name to the current input
                                 * character.
4745                                  */
4746                                 clearStrBufBeforeUse();
4747                                 appendStrBuf(c);
4748                                 /*
4749                                  * Switch to the DOCTYPE name state.
4750                                  */
4751                                 state = transition(state, Tokenizer.DOCTYPE_NAME, reconsume, pos);
4752                                 break beforedoctypenameloop;
4753                             // continue stateloop;
4754                         }
4755                     }
4756                     // FALLTHRU DON'T REORDER
4757                 case DOCTYPE_NAME:
4758                     doctypenameloop: for (;;) {
4759                         if (++pos == endPos) {
4760                             break stateloop;
4761                         }
4762                         c = checkChar(buf, pos);
4763                         /*
4764                          * Consume the next input character:
4765                          */
4766                         switch (c) {
4767                             case '\r':
4768                                 silentCarriageReturn();
4769                                 strBufToDoctypeName();
4770                                 state = transition(state, Tokenizer.AFTER_DOCTYPE_NAME, reconsume, pos);
4771                                 break stateloop;
4772                             case '\n':
4773                                 silentLineFeed();
4774                                 // fall thru
4775                             case ' ':
4776                             case '\t':
4777                             case '\u000C':
4778                                 /*
4779                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED
4780                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE
4781                                  * Switch to the after DOCTYPE name state.
4782                                  */
4783                                 strBufToDoctypeName();
4784                                 state = transition(state, Tokenizer.AFTER_DOCTYPE_NAME, reconsume, pos);
4785                                 break doctypenameloop;
4786                             // continue stateloop;
4787                             case '>':
4788                                 /*
4789                                  * U+003E GREATER-THAN SIGN (>) Emit the current
4790                                  * DOCTYPE token.
4791                                  */
4792                                 strBufToDoctypeName();
4793                                 emitDoctypeToken(pos);
4794                                 /*
4795                                  * Switch to the data state.
4796                                  */
4797                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
4798                                 continue stateloop;
4799                             case '\u0000':
4800                                 c = '\uFFFD';
4801                                 // fall thru
4802                             default:
4803                                 /*
4804                                  * U+0041 LATIN CAPITAL LETTER A through to
4805                                  * U+005A LATIN CAPITAL LETTER Z Append the
4806                                  * lowercase version of the input character (add
4807                                  * 0x0020 to the character's code point) to the
4808                                  * current DOCTYPE token's name.
4809                                  */
4810                                 if (c >= 'A' && c <= 'Z') {
4811                                     c += 0x0020;
4812                                 }
4813                                 /*
4814                                  * Anything else Append the current input
4815                                  * character to the current DOCTYPE token's
4816                                  * name.
4817                                  */
4818                                 appendStrBuf(c);
4819                                 /*
4820                                  * Stay in the DOCTYPE name state.
4821                                  */
4822                                 continue;
4823                         }
4824                     }
4825                     // FALLTHRU DON'T REORDER
4826                 case AFTER_DOCTYPE_NAME:
4827                     afterdoctypenameloop: for (;;) {
4828                         if (++pos == endPos) {
4829                             break stateloop;
4830                         }
4831                         c = checkChar(buf, pos);
4832                         /*
4833                          * Consume the next input character:
4834                          */
4835                         switch (c) {
4836                             case '\r':
4837                                 silentCarriageReturn();
4838                                 break stateloop;
4839                             case '\n':
4840                                 silentLineFeed();
4841                                 // fall thru
4842                             case ' ':
4843                             case '\t':
4844                             case '\u000C':
4845                                 /*
4846                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED
4847                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
4848                                  * in the after DOCTYPE name state.
4849                                  */
4850                                 continue;
4851                             case '>':
4852                                 /*
4853                                  * U+003E GREATER-THAN SIGN (>) Emit the current
4854                                  * DOCTYPE token.
4855                                  */
4856                                 emitDoctypeToken(pos);
4857                                 /*
4858                                  * Switch to the data state.
4859                                  */
4860                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
4861                                 continue stateloop;
4862                             case 'p':
4863                             case 'P':
4864                                 index = 0;
4865                                 state = transition(state, Tokenizer.DOCTYPE_UBLIC, reconsume, pos);
4866                                 break afterdoctypenameloop;
4867                             // continue stateloop;
4868                             case 's':
4869                             case 'S':
4870                                 index = 0;
4871                                 state = transition(state, Tokenizer.DOCTYPE_YSTEM, reconsume, pos);
4872                                 continue stateloop;
4873                             default:
4874                                 /*
                                 * Otherwise, this is a parse error.
4876                                  */
4877                                 bogusDoctype();
4878 
4879                                 /*
4880                                  * Set the DOCTYPE token's force-quirks flag to
4881                                  * on.
4882                                  */
4883                                 // done by bogusDoctype();
4884                                 /*
4885                                  * Switch to the bogus DOCTYPE state.
4886                                  */
4887                                 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
4888                                 continue stateloop;
4889                         }
4890                     }
4891                     // FALLTHRU DON'T REORDER
4892                 case DOCTYPE_UBLIC:
4893                     doctypeublicloop: for (;;) {
4894                         if (++pos == endPos) {
4895                             break stateloop;
4896                         }
4897                         c = checkChar(buf, pos);
4898                         /*
4899                          * If the six characters starting from the current input
4900                          * character are an ASCII case-insensitive match for the
4901                          * word "PUBLIC", then consume those characters and
4902                          * switch to the before DOCTYPE public identifier state.
4903                          */
4904                         if (index < 5) { // UBLIC.length
4905                             char folded = c;
4906                             if (c >= 'A' && c <= 'Z') {
4907                                 folded += 0x20;
4908                             }
4909                             if (folded != Tokenizer.UBLIC[index]) {
4910                                 bogusDoctype();
4911                                 // forceQuirks = true;
4912                                 reconsume = true;
4913                                 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
4914                                 continue stateloop;
4915                             }
4916                             index++;
4917                             continue;
4918                         } else {
4919                             reconsume = true;
4920                             state = transition(state, Tokenizer.AFTER_DOCTYPE_PUBLIC_KEYWORD, reconsume, pos);
4921                             break doctypeublicloop;
4922                             // continue stateloop;
4923                         }
4924                     }
4925                     // FALLTHRU DON'T REORDER
4926                 case AFTER_DOCTYPE_PUBLIC_KEYWORD:
4927                     afterdoctypepublickeywordloop: for (;;) {
4928                         if (reconsume) {
4929                             reconsume = false;
4930                         } else {
4931                             if (++pos == endPos) {
4932                                 break stateloop;
4933                             }
4934                             c = checkChar(buf, pos);
4935                         }
4936                         /*
4937                          * Consume the next input character:
4938                          */
4939                         switch (c) {
4940                             case '\r':
4941                                 silentCarriageReturn();
4942                                 state = transition(state, Tokenizer.BEFORE_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos);
4943                                 break stateloop;
4944                             case '\n':
4945                                 silentLineFeed();
4946                                 // fall thru
4947                             case ' ':
4948                             case '\t':
4949                             case '\u000C':
4950                                 /*
4951                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED
4952                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE
4953                                  * Switch to the before DOCTYPE public
4954                                  * identifier state.
4955                                  */
4956                                 state = transition(state, Tokenizer.BEFORE_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos);
4957                                 break afterdoctypepublickeywordloop;
                            // The labeled break above falls through to the next
                            // state case; equivalent to: continue stateloop;
4959                             case '"':
4960                                 /*
4961                                  * U+0022 QUOTATION MARK (") Parse Error.
4962                                  */
4963                                 errNoSpaceBetweenDoctypePublicKeywordAndQuote();
4964                                 /*
4965                                  * Set the DOCTYPE token's public identifier to
4966                                  * the empty string (not missing),
4967                                  */
4968                                 clearStrBufBeforeUse();
4969                                 /*
4970                                  * then switch to the DOCTYPE public identifier
4971                                  * (double-quoted) state.
4972                                  */
4973                                 state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);
4974                                 continue stateloop;
4975                             case '\'':
4976                                 /*
4977                                  * U+0027 APOSTROPHE (') Parse Error.
4978                                  */
4979                                 errNoSpaceBetweenDoctypePublicKeywordAndQuote();
4980                                 /*
4981                                  * Set the DOCTYPE token's public identifier to
4982                                  * the empty string (not missing),
4983                                  */
4984                                 clearStrBufBeforeUse();
4985                                 /*
4986                                  * then switch to the DOCTYPE public identifier
4987                                  * (single-quoted) state.
4988                                  */
4989                                 state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);
4990                                 continue stateloop;
4991                             case '>':
4992                                 /* U+003E GREATER-THAN SIGN (>) Parse error. */
4993                                 errExpectedPublicId();
4994                                 /*
4995                                  * Set the DOCTYPE token's force-quirks flag to
4996                                  * on.
4997                                  */
4998                                 forceQuirks = true;
4999                                 /*
5000                                  * Emit that DOCTYPE token.
5001                                  */
5002                                 emitDoctypeToken(pos);
5003                                 /*
5004                                  * Switch to the data state.
5005                                  */
5006                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
5007                                 continue stateloop;
5008                             default:
5009                                 bogusDoctype();
5010                                 /*
5011                                  * Set the DOCTYPE token's force-quirks flag to
5012                                  * on.
5013                                  */
5014                                 // done by bogusDoctype();
5015                                 /*
5016                                  * Switch to the bogus DOCTYPE state.
5017                                  */
5018                                 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
5019                                 continue stateloop;
5020                         }
5021                     }
5022                     // FALLTHRU DON'T REORDER
5023                 case BEFORE_DOCTYPE_PUBLIC_IDENTIFIER:
5024                     beforedoctypepublicidentifierloop: for (;;) {
5025                         if (++pos == endPos) {
5026                             break stateloop;
5027                         }
5028                         c = checkChar(buf, pos);
5029                         /*
5030                          * Consume the next input character:
5031                          */
5032                         switch (c) {
5033                             case '\r':
5034                                 silentCarriageReturn();
5035                                 break stateloop;
5036                             case '\n':
5037                                 silentLineFeed();
5038                                 // fall thru
5039                             case ' ':
5040                             case '\t':
5041                             case '\u000C':
5042                                 /*
5043                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED
5044                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
5045                                  * in the before DOCTYPE public identifier
5046                                  * state.
5047                                  */
5048                                 continue;
5049                             case '"':
5050                                 /*
5051                                  * U+0022 QUOTATION MARK (") Set the DOCTYPE
5052                                  * token's public identifier to the empty string
5053                                  * (not missing),
5054                                  */
5055                                 clearStrBufBeforeUse();
5056                                 /*
5057                                  * then switch to the DOCTYPE public identifier
5058                                  * (double-quoted) state.
5059                                  */
5060                                 state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);
5061                                 break beforedoctypepublicidentifierloop;
5062                             // continue stateloop;
5063                             case '\'':
5064                                 /*
5065                                  * U+0027 APOSTROPHE (') Set the DOCTYPE token's
5066                                  * public identifier to the empty string (not
5067                                  * missing),
5068                                  */
5069                                 clearStrBufBeforeUse();
5070                                 /*
5071                                  * then switch to the DOCTYPE public identifier
5072                                  * (single-quoted) state.
5073                                  */
5074                                 state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);
5075                                 continue stateloop;
5076                             case '>':
5077                                 /* U+003E GREATER-THAN SIGN (>) Parse error. */
5078                                 errExpectedPublicId();
5079                                 /*
5080                                  * Set the DOCTYPE token's force-quirks flag to
5081                                  * on.
5082                                  */
5083                                 forceQuirks = true;
5084                                 /*
5085                                  * Emit that DOCTYPE token.
5086                                  */
5087                                 emitDoctypeToken(pos);
5088                                 /*
5089                                  * Switch to the data state.
5090                                  */
5091                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
5092                                 continue stateloop;
5093                             default:
5094                                 bogusDoctype();
5095                                 /*
5096                                  * Set the DOCTYPE token's force-quirks flag to
5097                                  * on.
5098                                  */
5099                                 // done by bogusDoctype();
5100                                 /*
5101                                  * Switch to the bogus DOCTYPE state.
5102                                  */
5103                                 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
5104                                 continue stateloop;
5105                         }
5106                     }
5107                     // FALLTHRU DON'T REORDER
5108                 case DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED:
5109                     doctypepublicidentifierdoublequotedloop: for (;;) {
5110                         if (++pos == endPos) {
5111                             break stateloop;
5112                         }
5113                         c = checkChar(buf, pos);
5114                         /*
5115                          * Consume the next input character:
5116                          */
5117                         switch (c) {
5118                             case '"':
5119                                 /*
5120                                  * U+0022 QUOTATION MARK (") Switch to the after
5121                                  * DOCTYPE public identifier state.
5122                                  */
5123                                 publicIdentifier = strBufToString();
5124                                 state = transition(state, Tokenizer.AFTER_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos);
5125                                 break doctypepublicidentifierdoublequotedloop;
5126                             // continue stateloop;
5127                             case '>':
5128                                 /*
5129                                  * U+003E GREATER-THAN SIGN (>) Parse error.
5130                                  */
5131                                 errGtInPublicId();
5132                                 /*
5133                                  * Set the DOCTYPE token's force-quirks flag to
5134                                  * on.
5135                                  */
5136                                 forceQuirks = true;
5137                                 /*
5138                                  * Emit that DOCTYPE token.
5139                                  */
5140                                 publicIdentifier = strBufToString();
5141                                 emitDoctypeToken(pos);
5142                                 /*
5143                                  * Switch to the data state.
5144                                  */
5145                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
5146                                 continue stateloop;
5147                             case '\r':
5148                                 appendStrBufCarriageReturn();
5149                                 break stateloop;
5150                             case '\n':
5151                                 appendStrBufLineFeed();
5152                                 continue;
5153                             case '\u0000':
5154                                 c = '\uFFFD';
5155                                 // fall thru
5156                             default:
5157                                 /*
5158                                  * Anything else Append the current input
5159                                  * character to the current DOCTYPE token's
5160                                  * public identifier.
5161                                  */
5162                                 appendStrBuf(c);
5163                                 /*
5164                                  * Stay in the DOCTYPE public identifier
5165                                  * (double-quoted) state.
5166                                  */
5167                                 continue;
5168                         }
5169                     }
5170                     // FALLTHRU DON'T REORDER
5171                 case AFTER_DOCTYPE_PUBLIC_IDENTIFIER:
5172                     afterdoctypepublicidentifierloop: for (;;) {
5173                         if (++pos == endPos) {
5174                             break stateloop;
5175                         }
5176                         c = checkChar(buf, pos);
5177                         /*
5178                          * Consume the next input character:
5179                          */
5180                         switch (c) {
5181                             case '\r':
5182                                 silentCarriageReturn();
5183                                 state = transition(state, Tokenizer.BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS, reconsume, pos);
5184                                 break stateloop;
5185                             case '\n':
5186                                 silentLineFeed();
5187                                 // fall thru
5188                             case ' ':
5189                             case '\t':
5190                             case '\u000C':
5191                                 /*
5192                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED
5193                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE
5194                                  * Switch to the between DOCTYPE public and
5195                                  * system identifiers state.
5196                                  */
5197                                 state = transition(state, Tokenizer.BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS, reconsume, pos);
5198                                 break afterdoctypepublicidentifierloop;
5199                             // continue stateloop;
5200                             case '>':
5201                                 /*
5202                                  * U+003E GREATER-THAN SIGN (>) Emit the current
5203                                  * DOCTYPE token.
5204                                  */
5205                                 emitDoctypeToken(pos);
5206                                 /*
5207                                  * Switch to the data state.
5208                                  */
5209                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
5210                                 continue stateloop;
5211                             case '"':
5212                                 /*
5213                                  * U+0022 QUOTATION MARK (") Parse error.
5214                                  */
5215                                 errNoSpaceBetweenPublicAndSystemIds();
5216                                 /*
5217                                  * Set the DOCTYPE token's system identifier to
5218                                  * the empty string (not missing),
5219                                  */
5220                                 clearStrBufBeforeUse();
5221                                 /*
5222                                  * then switch to the DOCTYPE system identifier
5223                                  * (double-quoted) state.
5224                                  */
5225                                 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);
5226                                 continue stateloop;
5227                             case '\'':
5228                                 /*
5229                                  * U+0027 APOSTROPHE (') Parse error.
5230                                  */
5231                                 errNoSpaceBetweenPublicAndSystemIds();
5232                                 /*
5233                                  * Set the DOCTYPE token's system identifier to
5234                                  * the empty string (not missing),
5235                                  */
5236                                 clearStrBufBeforeUse();
5237                                 /*
5238                                  * then switch to the DOCTYPE system identifier
5239                                  * (single-quoted) state.
5240                                  */
5241                                 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);
5242                                 continue stateloop;
5243                             default:
5244                                 bogusDoctype();
5245                                 /*
5246                                  * Set the DOCTYPE token's force-quirks flag to
5247                                  * on.
5248                                  */
5249                                 // done by bogusDoctype();
5250                                 /*
5251                                  * Switch to the bogus DOCTYPE state.
5252                                  */
5253                                 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
5254                                 continue stateloop;
5255                         }
5256                     }
5257                     // FALLTHRU DON'T REORDER
5258                 case BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS:
5259                     betweendoctypepublicandsystemidentifiersloop: for (;;) {
5260                         if (++pos == endPos) {
5261                             break stateloop;
5262                         }
5263                         c = checkChar(buf, pos);
5264                         /*
5265                          * Consume the next input character:
5266                          */
5267                         switch (c) {
5268                             case '\r':
5269                                 silentCarriageReturn();
5270                                 break stateloop;
5271                             case '\n':
5272                                 silentLineFeed();
5273                                 // fall thru
5274                             case ' ':
5275                             case '\t':
5276                             case '\u000C':
5277                                 /*
5278                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED
5279                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
5280                                  * in the between DOCTYPE public and system
5281                                  * identifiers state.
5282                                  */
5283                                 continue;
5284                             case '>':
5285                                 /*
5286                                  * U+003E GREATER-THAN SIGN (>) Emit the current
5287                                  * DOCTYPE token.
5288                                  */
5289                                 emitDoctypeToken(pos);
5290                                 /*
5291                                  * Switch to the data state.
5292                                  */
5293                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
5294                                 continue stateloop;
5295                             case '"':
5296                                 /*
5297                                  * U+0022 QUOTATION MARK (") Set the DOCTYPE
5298                                  * token's system identifier to the empty string
5299                                  * (not missing),
5300                                  */
5301                                 clearStrBufBeforeUse();
5302                                 /*
5303                                  * then switch to the DOCTYPE system identifier
5304                                  * (double-quoted) state.
5305                                  */
5306                                 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);
5307                                 break betweendoctypepublicandsystemidentifiersloop;
5308                             // continue stateloop;
5309                             case '\'':
5310                                 /*
5311                                  * U+0027 APOSTROPHE (') Set the DOCTYPE token's
5312                                  * system identifier to the empty string (not
5313                                  * missing),
5314                                  */
5315                                 clearStrBufBeforeUse();
5316                                 /*
5317                                  * then switch to the DOCTYPE system identifier
5318                                  * (single-quoted) state.
5319                                  */
5320                                 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);
5321                                 continue stateloop;
5322                             default:
5323                                 bogusDoctype();
5324                                 /*
5325                                  * Set the DOCTYPE token's force-quirks flag to
5326                                  * on.
5327                                  */
5328                                 // done by bogusDoctype();
5329                                 /*
5330                                  * Switch to the bogus DOCTYPE state.
5331                                  */
5332                                 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
5333                                 continue stateloop;
5334                         }
5335                     }
5336                     // FALLTHRU DON'T REORDER
5337                 case DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED:
5338                     doctypesystemidentifierdoublequotedloop: for (;;) {
5339                         if (++pos == endPos) {
5340                             break stateloop;
5341                         }
5342                         c = checkChar(buf, pos);
5343                         /*
5344                          * Consume the next input character:
5345                          */
5346                         switch (c) {
5347                             case '"':
5348                                 /*
5349                                  * U+0022 QUOTATION MARK (") Switch to the after
5350                                  * DOCTYPE system identifier state.
5351                                  */
5352                                 systemIdentifier = strBufToString();
5353                                 state = transition(state, Tokenizer.AFTER_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos);
5354                                 continue stateloop;
5355                             case '>':
5356                                 /*
5357                                  * U+003E GREATER-THAN SIGN (>) Parse error.
5358                                  */
5359                                 errGtInSystemId();
5360                                 /*
5361                                  * Set the DOCTYPE token's force-quirks flag to
5362                                  * on.
5363                                  */
5364                                 forceQuirks = true;
5365                                 /*
5366                                  * Emit that DOCTYPE token.
5367                                  */
5368                                 systemIdentifier = strBufToString();
5369                                 emitDoctypeToken(pos);
5370                                 /*
5371                                  * Switch to the data state.
5372                                  */
5373                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
5374                                 continue stateloop;
5375                             case '\r':
5376                                 appendStrBufCarriageReturn();
5377                                 break stateloop;
5378                             case '\n':
5379                                 appendStrBufLineFeed();
5380                                 continue;
5381                             case '\u0000':
5382                                 c = '\uFFFD';
5383                                 // fall thru
5384                             default:
5385                                 /*
5386                                  * Anything else Append the current input
5387                                  * character to the current DOCTYPE token's
5388                                  * system identifier.
5389                                  */
5390                                 appendStrBuf(c);
5391                                 /*
5392                                  * Stay in the DOCTYPE system identifier
5393                                  * (double-quoted) state.
5394                                  */
5395                                 continue;
5396                         }
5397                     }
5398                     // FALLTHRU DON'T REORDER
5399                 case AFTER_DOCTYPE_SYSTEM_IDENTIFIER:
5400                     afterdoctypesystemidentifierloop: for (;;) {
5401                         if (++pos == endPos) {
5402                             break stateloop;
5403                         }
5404                         c = checkChar(buf, pos);
5405                         /*
5406                          * Consume the next input character:
5407                          */
5408                         switch (c) {
5409                             case '\r':
5410                                 silentCarriageReturn();
5411                                 break stateloop;
5412                             case '\n':
5413                                 silentLineFeed();
5414                                 // fall thru
5415                             case ' ':
5416                             case '\t':
5417                             case '\u000C':
5418                                 /*
5419                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED
5420                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
5421                                  * in the after DOCTYPE system identifier state.
5422                                  */
5423                                 continue;
5424                             case '>':
5425                                 /*
5426                                  * U+003E GREATER-THAN SIGN (>) Emit the current
5427                                  * DOCTYPE token.
5428                                  */
5429                                 emitDoctypeToken(pos);
5430                                 /*
5431                                  * Switch to the data state.
5432                                  */
5433                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
5434                                 continue stateloop;
5435                             default:
5436                                 /*
5437                                  * Switch to the bogus DOCTYPE state. (This does
5438                                  * not set the DOCTYPE token's force-quirks flag
5439                                  * to on.)
5440                                  */
5441                                 bogusDoctypeWithoutQuirks();
5442                                 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
5443                                 break afterdoctypesystemidentifierloop;
5444                             // continue stateloop;
5445                         }
5446                     }
5447                     // FALLTHRU DON'T REORDER
5448                 case BOGUS_DOCTYPE:
5449                     for (;;) {
5450                         if (reconsume) {
5451                             reconsume = false;
5452                         } else {
5453                             if (++pos == endPos) {
5454                                 break stateloop;
5455                             }
5456                             c = checkChar(buf, pos);
5457                         }
5458                         /*
5459                          * Consume the next input character:
5460                          */
5461                         switch (c) {
5462                             case '>':
5463                                 /*
5464                                  * U+003E GREATER-THAN SIGN (>) Emit that
5465                                  * DOCTYPE token.
5466                                  */
5467                                 emitDoctypeToken(pos);
5468                                 /*
5469                                  * Switch to the data state.
5470                                  */
5471                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
5472                                 continue stateloop;
5473                             case '\r':
5474                                 silentCarriageReturn();
5475                                 break stateloop;
5476                             case '\n':
5477                                 silentLineFeed();
5478                                 // fall thru
5479                             default:
5480                                 /*
5481                                  * Anything else Stay in the bogus DOCTYPE
5482                                  * state.
5483                                  */
5484                                 continue;
5485                         }
5486                     }
5487                     // XXX reorder point
5488                 case DOCTYPE_YSTEM:
5489                     doctypeystemloop: for (;;) {
5490                         if (++pos == endPos) {
5491                             break stateloop;
5492                         }
5493                         c = checkChar(buf, pos);
5494                         /*
5495                          * Otherwise, if the six characters starting from the
5496                          * current input character are an ASCII case-insensitive
5497                          * match for the word "SYSTEM", then consume those
5498                          * characters and switch to the after DOCTYPE system
5499                          * keyword state.
5500                          */
5501                         if (index < 5) { // YSTEM.length
5502                             char folded = c;
5503                             if (c >= 'A' && c <= 'Z') {
5504                                 folded += 0x20;
5505                             }
5506                             if (folded != Tokenizer.YSTEM[index]) {
5507                                 bogusDoctype();
5508                                 reconsume = true;
5509                                 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
5510                                 continue stateloop;
5511                             }
5512                             index++;
5513                             continue stateloop;
5514                         } else {
5515                             reconsume = true;
5516                             state = transition(state, Tokenizer.AFTER_DOCTYPE_SYSTEM_KEYWORD, reconsume, pos);
5517                             break doctypeystemloop;
5518                             // continue stateloop;
5519                         }
5520                     }
5521                     // FALLTHRU DON'T REORDER
5522                 case AFTER_DOCTYPE_SYSTEM_KEYWORD:
5523                     afterdoctypesystemkeywordloop: for (;;) {
5524                         if (reconsume) {
5525                             reconsume = false;
5526                         } else {
5527                             if (++pos == endPos) {
5528                                 break stateloop;
5529                             }
5530                             c = checkChar(buf, pos);
5531                         }
5532                         /*
5533                          * Consume the next input character:
5534                          */
5535                         switch (c) {
5536                             case '\r':
5537                                 silentCarriageReturn();
5538                                 state = transition(state, Tokenizer.BEFORE_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos);
5539                                 break stateloop;
5540                             case '\n':
5541                                 silentLineFeed();
5542                                 // fall thru
5543                             case ' ':
5544                             case '\t':
5545                             case '\u000C':
5546                                 /*
5547                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED
5548                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE
5549                                  * Switch to the before DOCTYPE system
5550                                  * identifier state.
5551                                  */
5552                                 state = transition(state, Tokenizer.BEFORE_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos);
5553                                 break afterdoctypesystemkeywordloop;
5554                             // FALL THROUGH continue stateloop
5555                             case '"':
5556                                 /*
5557                                  * U+0022 QUOTATION MARK (") Parse Error.
5558                                  */
5559                                 errNoSpaceBetweenDoctypeSystemKeywordAndQuote();
5560                                 /*
5561                                  * Set the DOCTYPE token's system identifier to
5562                                  * the empty string (not missing),
5563                                  */
5564                                 clearStrBufBeforeUse();
5565                                 /*
5566                                  * then switch to the DOCTYPE system identifier
5567                                  * (double-quoted) state.
5568                                  */
5569                                 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);
5570                                 continue stateloop;
5571                             case '\'':
5572                                 /*
5573                                  * U+0027 APOSTROPHE (') Parse Error.
5574                                  */
5575                                 errNoSpaceBetweenDoctypeSystemKeywordAndQuote();
                                /*
                                 * Set the DOCTYPE token's system identifier to
                                 * the empty string (not missing),
                                 */
5580                                 clearStrBufBeforeUse();
                                /*
                                 * then switch to the DOCTYPE system identifier
                                 * (single-quoted) state.
                                 */
5585                                 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);
5586                                 continue stateloop;
5587                             case '>':
5588                                 /* U+003E GREATER-THAN SIGN (>) Parse error. */
5589                                 errExpectedPublicId();
5590                                 /*
5591                                  * Set the DOCTYPE token's force-quirks flag to
5592                                  * on.
5593                                  */
5594                                 forceQuirks = true;
5595                                 /*
5596                                  * Emit that DOCTYPE token.
5597                                  */
5598                                 emitDoctypeToken(pos);
5599                                 /*
5600                                  * Switch to the data state.
5601                                  */
5602                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
5603                                 continue stateloop;
5604                             default:
5605                                 bogusDoctype();
5606                                 /*
5607                                  * Set the DOCTYPE token's force-quirks flag to
5608                                  * on.
5609                                  */
5610                                 // done by bogusDoctype();
5611                                 /*
5612                                  * Switch to the bogus DOCTYPE state.
5613                                  */
5614                                 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
5615                                 continue stateloop;
5616                         }
5617                     }
5618                     // FALLTHRU DON'T REORDER
5619                 case BEFORE_DOCTYPE_SYSTEM_IDENTIFIER:
5620                     beforedoctypesystemidentifierloop: for (;;) {
5621                         if (++pos == endPos) {
5622                             break stateloop;
5623                         }
5624                         c = checkChar(buf, pos);
5625                         /*
5626                          * Consume the next input character:
5627                          */
5628                         switch (c) {
5629                             case '\r':
5630                                 silentCarriageReturn();
5631                                 break stateloop;
5632                             case '\n':
5633                                 silentLineFeed();
5634                                 // fall thru
5635                             case ' ':
5636                             case '\t':
5637                             case '\u000C':
5638                                 /*
5639                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED
5640                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
5641                                  * in the before DOCTYPE system identifier
5642                                  * state.
5643                                  */
5644                                 continue;
5645                             case '"':
5646                                 /*
5647                                  * U+0022 QUOTATION MARK (") Set the DOCTYPE
5648                                  * token's system identifier to the empty string
5649                                  * (not missing),
5650                                  */
5651                                 clearStrBufBeforeUse();
5652                                 /*
5653                                  * then switch to the DOCTYPE system identifier
5654                                  * (double-quoted) state.
5655                                  */
5656                                 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);
5657                                 continue stateloop;
5658                             case '\'':
5659                                 /*
5660                                  * U+0027 APOSTROPHE (') Set the DOCTYPE token's
5661                                  * system identifier to the empty string (not
5662                                  * missing),
5663                                  */
5664                                 clearStrBufBeforeUse();
5665                                 /*
5666                                  * then switch to the DOCTYPE system identifier
5667                                  * (single-quoted) state.
5668                                  */
5669                                 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);
5670                                 break beforedoctypesystemidentifierloop;
5671                             // continue stateloop;
5672                             case '>':
5673                                 /* U+003E GREATER-THAN SIGN (>) Parse error. */
5674                                 errExpectedSystemId();
5675                                 /*
5676                                  * Set the DOCTYPE token's force-quirks flag to
5677                                  * on.
5678                                  */
5679                                 forceQuirks = true;
5680                                 /*
5681                                  * Emit that DOCTYPE token.
5682                                  */
5683                                 emitDoctypeToken(pos);
5684                                 /*
5685                                  * Switch to the data state.
5686                                  */
5687                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
5688                                 continue stateloop;
5689                             default:
5690                                 bogusDoctype();
5691                                 /*
5692                                  * Set the DOCTYPE token's force-quirks flag to
5693                                  * on.
5694                                  */
5695                                 // done by bogusDoctype();
5696                                 /*
5697                                  * Switch to the bogus DOCTYPE state.
5698                                  */
5699                                 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
5700                                 continue stateloop;
5701                         }
5702                     }
5703                     // FALLTHRU DON'T REORDER
5704                 case DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED:
5705                     for (;;) {
5706                         if (++pos == endPos) {
5707                             break stateloop;
5708                         }
5709                         c = checkChar(buf, pos);
5710                         /*
5711                          * Consume the next input character:
5712                          */
5713                         switch (c) {
5714                             case '\'':
5715                                 /*
5716                                  * U+0027 APOSTROPHE (') Switch to the after
5717                                  * DOCTYPE system identifier state.
5718                                  */
5719                                 systemIdentifier = strBufToString();
5720                                 state = transition(state, Tokenizer.AFTER_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos);
5721                                 continue stateloop;
5722                             case '>':
5723                                 errGtInSystemId();
5724                                 /*
5725                                  * Set the DOCTYPE token's force-quirks flag to
5726                                  * on.
5727                                  */
5728                                 forceQuirks = true;
5729                                 /*
5730                                  * Emit that DOCTYPE token.
5731                                  */
5732                                 systemIdentifier = strBufToString();
5733                                 emitDoctypeToken(pos);
5734                                 /*
5735                                  * Switch to the data state.
5736                                  */
5737                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
5738                                 continue stateloop;
5739                             case '\r':
5740                                 appendStrBufCarriageReturn();
5741                                 break stateloop;
5742                             case '\n':
5743                                 appendStrBufLineFeed();
5744                                 continue;
5745                             case '\u0000':
5746                                 c = '\uFFFD';
5747                                 // fall thru
5748                             default:
5749                                 /*
5750                                  * Anything else Append the current input
5751                                  * character to the current DOCTYPE token's
5752                                  * system identifier.
5753                                  */
5754                                 appendStrBuf(c);
                                /*
                                 * Stay in the DOCTYPE system identifier
                                 * (single-quoted) state.
                                 */
5759                                 continue;
5760                         }
5761                     }
5762                     // XXX reorder point
5763                 case DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED:
5764                     for (;;) {
5765                         if (++pos == endPos) {
5766                             break stateloop;
5767                         }
5768                         c = checkChar(buf, pos);
5769                         /*
5770                          * Consume the next input character:
5771                          */
5772                         switch (c) {
5773                             case '\'':
5774                                 /*
5775                                  * U+0027 APOSTROPHE (') Switch to the after
5776                                  * DOCTYPE public identifier state.
5777                                  */
5778                                 publicIdentifier = strBufToString();
5779                                 state = transition(state, Tokenizer.AFTER_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos);
5780                                 continue stateloop;
5781                             case '>':
5782                                 errGtInPublicId();
5783                                 /*
5784                                  * Set the DOCTYPE token's force-quirks flag to
5785                                  * on.
5786                                  */
5787                                 forceQuirks = true;
5788                                 /*
5789                                  * Emit that DOCTYPE token.
5790                                  */
5791                                 publicIdentifier = strBufToString();
5792                                 emitDoctypeToken(pos);
5793                                 /*
5794                                  * Switch to the data state.
5795                                  */
5796                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
5797                                 continue stateloop;
5798                             case '\r':
5799                                 appendStrBufCarriageReturn();
5800                                 break stateloop;
5801                             case '\n':
5802                                 appendStrBufLineFeed();
5803                                 continue;
5804                             case '\u0000':
5805                                 c = '\uFFFD';
5806                                 // fall thru
5807                             default:
5808                                 /*
5809                                  * Anything else Append the current input
5810                                  * character to the current DOCTYPE token's
5811                                  * public identifier.
5812                                  */
5813                                 appendStrBuf(c);
5814                                 /*
5815                                  * Stay in the DOCTYPE public identifier
5816                                  * (single-quoted) state.
5817                                  */
5818                                 continue;
5819                         }
5820                     }
5821                     // XXX reorder point
5822                 case PROCESSING_INSTRUCTION:
5823                     processinginstructionloop: for (;;) {
5824                         if (++pos == endPos) {
5825                             break stateloop;
5826                         }
5827                         c = checkChar(buf, pos);
5828                         switch (c) {
5829                             case '?':
5830                                 state = transition(
5831                                         state,
5832                                         Tokenizer.PROCESSING_INSTRUCTION_QUESTION_MARK,
5833                                         reconsume, pos);
5834                                 break processinginstructionloop;
5835                             // continue stateloop;
5836                             default:
5837                                 continue;
5838                         }
5839                     }
5840                 case PROCESSING_INSTRUCTION_QUESTION_MARK:
5841                     if (++pos == endPos) {
5842                         break stateloop;
5843                     }
5844                     c = checkChar(buf, pos);
5845                     switch (c) {
5846                         case '>':
5847                             state = transition(state, Tokenizer.DATA,
5848                                     reconsume, pos);
5849                             continue stateloop;
5850                         default:
5851                             state = transition(state,
5852                                     Tokenizer.PROCESSING_INSTRUCTION,
5853                                     reconsume, pos);
5854                             continue stateloop;
5855                     }
5856                     // END HOTSPOT WORKAROUND
5857             }
5858         }
5859         flushChars(buf, pos);
5860         /*
5861          * if (prevCR && pos != endPos) { // why is this needed? pos--; col--; }
5862          */
5863         // Save locals
5864         stateSave = state;
5865         returnStateSave = returnState;
5866         return pos;
5867     }
5868 
5869     // HOTSPOT WORKAROUND INSERTION POINT
5870 
5871     // [NOCPP[
5872 
    /**
     * Returns the state the tokenizer should move to. The state loop routes
     * every state change through this method
     * (<code>state = transition(state, target, reconsume, pos)</code>).
     * This implementation ignores <code>from</code>, <code>reconsume</code>
     * and <code>pos</code> and simply returns <code>to</code>.
     *
     * NOTE(review): being <code>protected</code>, this is presumably a hook
     * for subclasses that want to observe or alter transitions -- confirm
     * against the subclasses.
     *
     * @param from
     *            the state being left (callers pass their current state)
     * @param to
     *            the requested target state
     * @param reconsume
     *            the caller's reconsume flag; unused here
     * @param pos
     *            the caller's current buffer position; unused here
     * @return <code>to</code>, unchanged
     * @throws SAXException
     *             never thrown by this implementation
     */
    protected int transition(int from, int to, boolean reconsume, int pos) throws SAXException {
        return to;
    }
5876 
5877     // ]NOCPP]
5878 
    /**
     * Resets the fields that describe the DOCTYPE token being built:
     * clears <code>strBuf</code>, sets <code>doctypeName</code> to the empty
     * string, releases and nulls out both identifiers and turns
     * <code>forceQuirks</code> off.
     */
    private void initDoctypeFields() {
        // Discard the characters "DOCTYPE" accumulated as a potential bogus
        // comment into strBuf.
        clearStrBufAfterUse();
        doctypeName = "";
        // Release a previously held system identifier, if any, before
        // dropping the reference.
        if (systemIdentifier != null) {
            Portability.releaseString(systemIdentifier);
            systemIdentifier = null;
        }
        // Same for the public identifier.
        if (publicIdentifier != null) {
            Portability.releaseString(publicIdentifier);
            publicIdentifier = null;
        }
        forceQuirks = false;
    }
5894 
    /**
     * Handles a carriage return for callers that append through
     * <code>adjustDoubleHyphenAndAppendToStrBufAndErr</code>: performs the
     * line bookkeeping via <code>silentCarriageReturn()</code> and then
     * appends a line feed ('\n'), not the carriage return itself.
     *
     * @throws SAXException
     *             propagated from
     *             <code>adjustDoubleHyphenAndAppendToStrBufAndErr</code>
     */
    @Inline private void adjustDoubleHyphenAndAppendToStrBufCarriageReturn()
            throws SAXException {
        silentCarriageReturn();
        adjustDoubleHyphenAndAppendToStrBufAndErr('\n');
    }
5900 
    /**
     * Handles a line feed for callers that append through
     * <code>adjustDoubleHyphenAndAppendToStrBufAndErr</code>: performs the
     * line bookkeeping via <code>silentLineFeed()</code> and then appends
     * '\n'.
     *
     * @throws SAXException
     *             propagated from
     *             <code>adjustDoubleHyphenAndAppendToStrBufAndErr</code>
     */
    @Inline private void adjustDoubleHyphenAndAppendToStrBufLineFeed()
            throws SAXException {
        silentLineFeed();
        adjustDoubleHyphenAndAppendToStrBufAndErr('\n');
    }
5906 
    /**
     * Handles a line feed that belongs to the content being accumulated:
     * performs the line bookkeeping via <code>silentLineFeed()</code> and
     * appends '\n' to <code>strBuf</code> through <code>appendStrBuf</code>.
     */
    @Inline private void appendStrBufLineFeed() {
        silentLineFeed();
        appendStrBuf('\n');
    }
5911 
    /**
     * Handles a carriage return that belongs to the content being
     * accumulated: performs the bookkeeping via
     * <code>silentCarriageReturn()</code> and appends a line feed ('\n')
     * rather than the carriage return itself.
     */
    @Inline private void appendStrBufCarriageReturn() {
        silentCarriageReturn();
        appendStrBuf('\n');
    }
5916 
    /**
     * Records a carriage return without emitting or buffering anything:
     * increments <code>line</code> and sets <code>lastCR</code>.
     *
     * NOTE(review): <code>lastCR</code> presumably lets a line feed that
     * immediately follows be treated as the second half of a CRLF pair --
     * confirm where the flag is read.
     */
    @Inline protected void silentCarriageReturn() {
        ++line;
        lastCR = true;
    }
5921 
    /**
     * Records a line feed without emitting or buffering anything:
     * increments <code>line</code>.
     */
    @Inline protected void silentLineFeed() {
        ++line;
    }
5925 
    /**
     * Handles a carriage return in character data: does the carriage-return
     * bookkeeping, flushes via <code>flushChars(buf, pos)</code> and then
     * reports a single character from <code>Tokenizer.LF</code> to the token
     * handler in place of the carriage return.
     *
     * @param buf
     *            the buffer handed on to <code>flushChars</code>
     * @param pos
     *            the position handed on to <code>flushChars</code>
     * @throws SAXException
     *             propagated from flushing or from the token handler
     */
    private void emitCarriageReturn(@NoLength char[] buf, int pos)
            throws SAXException {
        silentCarriageReturn();
        flushChars(buf, pos);
        tokenHandler.characters(Tokenizer.LF, 0, 1);
        // NOTE(review): Integer.MAX_VALUE presumably acts as a "no pending
        // run start" sentinel for cstart -- confirm against flushChars.
        cstart = Integer.MAX_VALUE;
    }
5933 
    /**
     * Flushes via <code>flushChars(buf, pos)</code>, notifies the token
     * handler through <code>zeroOriginatingReplacementCharacter()</code> and
     * moves <code>cstart</code> to the position after <code>pos</code>.
     *
     * @param buf
     *            the buffer handed on to <code>flushChars</code>
     * @param pos
     *            the position handed on to <code>flushChars</code>;
     *            <code>cstart</code> becomes <code>pos + 1</code>
     * @throws SAXException
     *             propagated from flushing or from the token handler
     */
    private void emitReplacementCharacter(@NoLength char[] buf, int pos)
            throws SAXException {
        flushChars(buf, pos);
        tokenHandler.zeroOriginatingReplacementCharacter();
        cstart = pos + 1;
    }
5940 
    /**
     * Flushes via <code>flushChars(buf, pos)</code>, reports one character
     * from <code>REPLACEMENT_CHARACTER</code> to the token handler as
     * ordinary character data and moves <code>cstart</code> to the position
     * after <code>pos</code>. Unlike <code>emitReplacementCharacter</code>,
     * this goes through <code>tokenHandler.characters</code> rather than
     * <code>zeroOriginatingReplacementCharacter()</code>.
     *
     * @param buf
     *            the buffer handed on to <code>flushChars</code>
     * @param pos
     *            the position handed on to <code>flushChars</code>;
     *            <code>cstart</code> becomes <code>pos + 1</code>
     * @throws SAXException
     *             propagated from flushing or from the token handler
     */
    private void emitPlaintextReplacementCharacter(@NoLength char[] buf, int pos)
            throws SAXException {
        flushChars(buf, pos);
        tokenHandler.characters(REPLACEMENT_CHARACTER, 0, 1);
        cstart = pos + 1;
    }
5947 
    /**
     * Stores <code>add</code> in the <code>additional</code> field and
     * snapshots the tokenizer's current location into
     * <code>ampersandLocation</code> as a new <code>LocatorImpl</code>.
     *
     * @param add
     *            the value to store in <code>additional</code>
     */
    private void setAdditionalAndRememberAmpersandLocation(char add) {
        additional = add;
        // [NOCPP[
        ampersandLocation = new LocatorImpl(this);
        // ]NOCPP]
    }
5954 
    /**
     * Reports a bogus DOCTYPE via <code>errBogusDoctype()</code> and turns
     * the <code>forceQuirks</code> flag on.
     *
     * @throws SAXException
     *             propagated from <code>errBogusDoctype()</code>
     */
    private void bogusDoctype() throws SAXException {
        errBogusDoctype();
        forceQuirks = true;
    }
5959 
    /**
     * Reports a bogus DOCTYPE via <code>errBogusDoctype()</code> but, unlike
     * <code>bogusDoctype()</code>, turns the <code>forceQuirks</code> flag
     * off.
     *
     * @throws SAXException
     *             propagated from <code>errBogusDoctype()</code>
     */
    private void bogusDoctypeWithoutQuirks() throws SAXException {
        errBogusDoctype();
        forceQuirks = false;
    }
5964 
handleNcrValue(int returnState)5965     private void handleNcrValue(int returnState) throws SAXException {
5966         /*
5967          * If one or more characters match the range, then take them all and
5968          * interpret the string of characters as a number (either hexadecimal or
5969          * decimal as appropriate).
5970          */
5971         if (value <= 0xFFFF) {
5972             if (value >= 0x80 && value <= 0x9f) {
5973                 /*
5974                  * If that number is one of the numbers in the first column of
5975                  * the following table, then this is a parse error.
5976                  */
5977                 errNcrInC1Range();
5978                 /*
5979                  * Find the row with that number in the first column, and return
5980                  * a character token for the Unicode character given in the
5981                  * second column of that row.
5982                  */
5983                 @NoLength char[] val = NamedCharacters.WINDOWS_1252[value - 0x80];
5984                 emitOrAppendOne(val, returnState);
5985                 // [NOCPP[
5986             } else if (value == 0xC
5987                     && contentSpacePolicy != XmlViolationPolicy.ALLOW) {
5988                 if (contentSpacePolicy == XmlViolationPolicy.ALTER_INFOSET) {
5989                     emitOrAppendOne(Tokenizer.SPACE, returnState);
5990                 } else if (contentSpacePolicy == XmlViolationPolicy.FATAL) {
5991                     fatal("A character reference expanded to a form feed which is not legal XML 1.0 white space.");
5992                 }
5993                 // ]NOCPP]
5994             } else if (value == 0x0) {
5995                 errNcrZero();
5996                 emitOrAppendOne(Tokenizer.REPLACEMENT_CHARACTER, returnState);
5997             } else if ((value & 0xF800) == 0xD800) {
5998                 errNcrSurrogate();
5999                 emitOrAppendOne(Tokenizer.REPLACEMENT_CHARACTER, returnState);
6000             } else {
6001                 /*
6002                  * Otherwise, return a character token for the Unicode character
6003                  * whose code point is that number.
6004                  */
6005                 char ch = (char) value;
6006                 // [NOCPP[
6007                 if (value == 0x0D) {
6008                     errNcrCr();
6009                 } else if ((value <= 0x0008) || (value == 0x000B)
6010                         || (value >= 0x000E && value <= 0x001F)) {
6011                     ch = errNcrControlChar(ch);
6012                 } else if (value >= 0xFDD0 && value <= 0xFDEF) {
6013                     errNcrUnassigned();
6014                 } else if ((value & 0xFFFE) == 0xFFFE) {
6015                     ch = errNcrNonCharacter(ch);
6016                 } else if (value >= 0x007F && value <= 0x009F) {
6017                     errNcrControlChar();
6018                 } else {
6019                     maybeWarnPrivateUse(ch);
6020                 }
6021                 // ]NOCPP]
6022                 bmpChar[0] = ch;
6023                 emitOrAppendOne(bmpChar, returnState);
6024             }
6025         } else if (value <= 0x10FFFF) {
6026             // [NOCPP[
6027             maybeWarnPrivateUseAstral();
6028             if ((value & 0xFFFE) == 0xFFFE) {
6029                 errAstralNonCharacter(value);
6030             }
6031             // ]NOCPP]
6032             astralChar[0] = (char) (Tokenizer.LEAD_OFFSET + (value >> 10));
6033             astralChar[1] = (char) (0xDC00 + (value & 0x3FF));
6034             emitOrAppendTwo(astralChar, returnState);
6035         } else {
6036             errNcrOutOfRange();
6037             emitOrAppendOne(Tokenizer.REPLACEMENT_CHARACTER, returnState);
6038         }
6039     }
6040 
eof()6041     public void eof() throws SAXException {
6042         int state = stateSave;
6043         int returnState = returnStateSave;
6044 
6045         eofloop: for (;;) {
6046             switch (state) {
6047                 case SCRIPT_DATA_LESS_THAN_SIGN:
6048                 case SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN:
6049                     /*
6050                      * Otherwise, emit a U+003C LESS-THAN SIGN character token
6051                      */
6052                     tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
6053                     /*
6054                      * and reconsume the current input character in the data
6055                      * state.
6056                      */
6057                     break eofloop;
6058                 case TAG_OPEN:
6059                     /*
6060                      * The behavior of this state depends on the content model
6061                      * flag.
6062                      */
6063                     /*
6064                      * Anything else Parse error.
6065                      */
6066                     errEofAfterLt();
6067                     /*
6068                      * Emit a U+003C LESS-THAN SIGN character token
6069                      */
6070                     tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
6071                     /*
6072                      * and reconsume the current input character in the data
6073                      * state.
6074                      */
6075                     break eofloop;
6076                 case RAWTEXT_RCDATA_LESS_THAN_SIGN:
6077                     /*
6078                      * Emit a U+003C LESS-THAN SIGN character token
6079                      */
6080                     tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
6081                     /*
6082                      * and reconsume the current input character in the RCDATA
6083                      * state.
6084                      */
6085                     break eofloop;
6086                 case NON_DATA_END_TAG_NAME:
6087                     /*
6088                      * Emit a U+003C LESS-THAN SIGN character token, a U+002F
6089                      * SOLIDUS character token,
6090                      */
6091                     tokenHandler.characters(Tokenizer.LT_SOLIDUS, 0, 2);
6092                     /*
6093                      * a character token for each of the characters in the
6094                      * temporary buffer (in the order they were added to the
6095                      * buffer),
6096                      */
6097                     emitStrBuf();
6098                     /*
6099                      * and reconsume the current input character in the RCDATA
6100                      * state.
6101                      */
6102                     break eofloop;
6103                 case CLOSE_TAG_OPEN:
6104                     /* EOF Parse error. */
6105                     errEofAfterLt();
6106                     /*
6107                      * Emit a U+003C LESS-THAN SIGN character token and a U+002F
6108                      * SOLIDUS character token.
6109                      */
6110                     tokenHandler.characters(Tokenizer.LT_SOLIDUS, 0, 2);
6111                     /*
6112                      * Reconsume the EOF character in the data state.
6113                      */
6114                     break eofloop;
6115                 case TAG_NAME:
6116                     /*
6117                      * EOF Parse error.
6118                      */
6119                     errEofInTagName();
6120                     /*
6121                      * Reconsume the EOF character in the data state.
6122                      */
6123                     break eofloop;
6124                 case BEFORE_ATTRIBUTE_NAME:
6125                 case AFTER_ATTRIBUTE_VALUE_QUOTED:
6126                 case SELF_CLOSING_START_TAG:
6127                     /* EOF Parse error. */
6128                     errEofWithoutGt();
6129                     /*
6130                      * Reconsume the EOF character in the data state.
6131                      */
6132                     break eofloop;
6133                 case ATTRIBUTE_NAME:
6134                     /*
6135                      * EOF Parse error.
6136                      */
6137                     errEofInAttributeName();
6138                     /*
6139                      * Reconsume the EOF character in the data state.
6140                      */
6141                     break eofloop;
6142                 case AFTER_ATTRIBUTE_NAME:
6143                 case BEFORE_ATTRIBUTE_VALUE:
6144                     /* EOF Parse error. */
6145                     errEofWithoutGt();
6146                     /*
6147                      * Reconsume the EOF character in the data state.
6148                      */
6149                     break eofloop;
6150                 case ATTRIBUTE_VALUE_DOUBLE_QUOTED:
6151                 case ATTRIBUTE_VALUE_SINGLE_QUOTED:
6152                 case ATTRIBUTE_VALUE_UNQUOTED:
6153                     /* EOF Parse error. */
6154                     errEofInAttributeValue();
6155                     /*
6156                      * Reconsume the EOF character in the data state.
6157                      */
6158                     break eofloop;
6159                 case BOGUS_COMMENT:
6160                     emitComment(0, 0);
6161                     break eofloop;
6162                 case BOGUS_COMMENT_HYPHEN:
6163                     // [NOCPP[
6164                     maybeAppendSpaceToBogusComment();
6165                     // ]NOCPP]
6166                     emitComment(0, 0);
6167                     break eofloop;
6168                 case MARKUP_DECLARATION_OPEN:
6169                     errBogusComment();
6170                     emitComment(0, 0);
6171                     break eofloop;
6172                 case MARKUP_DECLARATION_HYPHEN:
6173                     errBogusComment();
6174                     emitComment(0, 0);
6175                     break eofloop;
6176                 case MARKUP_DECLARATION_OCTYPE:
6177                     if (index < 6) {
6178                         errBogusComment();
6179                         emitComment(0, 0);
6180                     } else {
6181                         /* EOF Parse error. */
6182                         errEofInDoctype();
6183                         /*
6184                          * Create a new DOCTYPE token. Set its force-quirks flag
6185                          * to on.
6186                          */
6187                         doctypeName = "";
6188                         if (systemIdentifier != null) {
6189                             Portability.releaseString(systemIdentifier);
6190                             systemIdentifier = null;
6191                         }
6192                         if (publicIdentifier != null) {
6193                             Portability.releaseString(publicIdentifier);
6194                             publicIdentifier = null;
6195                         }
6196                         forceQuirks = true;
6197                         /*
6198                          * Emit the token.
6199                          */
6200                         emitDoctypeToken(0);
6201                         /*
6202                          * Reconsume the EOF character in the data state.
6203                          */
6204                         break eofloop;
6205                     }
6206                     break eofloop;
6207                 case COMMENT_START:
6208                 case COMMENT:
6209                     /*
6210                      * EOF Parse error.
6211                      */
6212                     errEofInComment();
6213                     /* Emit the comment token. */
6214                     emitComment(0, 0);
6215                     /*
6216                      * Reconsume the EOF character in the data state.
6217                      */
6218                     break eofloop;
6219                 case COMMENT_END:
6220                     errEofInComment();
6221                     /* Emit the comment token. */
6222                     emitComment(2, 0);
6223                     /*
6224                      * Reconsume the EOF character in the data state.
6225                      */
6226                     break eofloop;
6227                 case COMMENT_END_DASH:
6228                 case COMMENT_START_DASH:
6229                     errEofInComment();
6230                     /* Emit the comment token. */
6231                     emitComment(1, 0);
6232                     /*
6233                      * Reconsume the EOF character in the data state.
6234                      */
6235                     break eofloop;
6236                 case COMMENT_END_BANG:
6237                     errEofInComment();
6238                     /* Emit the comment token. */
6239                     emitComment(3, 0);
6240                     /*
6241                      * Reconsume the EOF character in the data state.
6242                      */
6243                     break eofloop;
6244                 case DOCTYPE:
6245                 case BEFORE_DOCTYPE_NAME:
6246                     errEofInDoctype();
6247                     /*
6248                      * Create a new DOCTYPE token. Set its force-quirks flag to
6249                      * on.
6250                      */
6251                     forceQuirks = true;
6252                     /*
6253                      * Emit the token.
6254                      */
6255                     emitDoctypeToken(0);
6256                     /*
6257                      * Reconsume the EOF character in the data state.
6258                      */
6259                     break eofloop;
6260                 case DOCTYPE_NAME:
6261                     errEofInDoctype();
6262                     strBufToDoctypeName();
6263                     /*
6264                      * Set the DOCTYPE token's force-quirks flag to on.
6265                      */
6266                     forceQuirks = true;
6267                     /*
6268                      * Emit that DOCTYPE token.
6269                      */
6270                     emitDoctypeToken(0);
6271                     /*
6272                      * Reconsume the EOF character in the data state.
6273                      */
6274                     break eofloop;
6275                 case DOCTYPE_UBLIC:
6276                 case DOCTYPE_YSTEM:
6277                 case AFTER_DOCTYPE_NAME:
6278                 case AFTER_DOCTYPE_PUBLIC_KEYWORD:
6279                 case AFTER_DOCTYPE_SYSTEM_KEYWORD:
6280                 case BEFORE_DOCTYPE_PUBLIC_IDENTIFIER:
6281                     errEofInDoctype();
6282                     /*
6283                      * Set the DOCTYPE token's force-quirks flag to on.
6284                      */
6285                     forceQuirks = true;
6286                     /*
6287                      * Emit that DOCTYPE token.
6288                      */
6289                     emitDoctypeToken(0);
6290                     /*
6291                      * Reconsume the EOF character in the data state.
6292                      */
6293                     break eofloop;
6294                 case DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED:
6295                 case DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED:
6296                     /* EOF Parse error. */
6297                     errEofInPublicId();
6298                     /*
6299                      * Set the DOCTYPE token's force-quirks flag to on.
6300                      */
6301                     forceQuirks = true;
6302                     /*
6303                      * Emit that DOCTYPE token.
6304                      */
6305                     publicIdentifier = strBufToString();
6306                     emitDoctypeToken(0);
6307                     /*
6308                      * Reconsume the EOF character in the data state.
6309                      */
6310                     break eofloop;
6311                 case AFTER_DOCTYPE_PUBLIC_IDENTIFIER:
6312                 case BEFORE_DOCTYPE_SYSTEM_IDENTIFIER:
6313                 case BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS:
6314                     errEofInDoctype();
6315                     /*
6316                      * Set the DOCTYPE token's force-quirks flag to on.
6317                      */
6318                     forceQuirks = true;
6319                     /*
6320                      * Emit that DOCTYPE token.
6321                      */
6322                     emitDoctypeToken(0);
6323                     /*
6324                      * Reconsume the EOF character in the data state.
6325                      */
6326                     break eofloop;
6327                 case DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED:
6328                 case DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED:
6329                     /* EOF Parse error. */
6330                     errEofInSystemId();
6331                     /*
6332                      * Set the DOCTYPE token's force-quirks flag to on.
6333                      */
6334                     forceQuirks = true;
6335                     /*
6336                      * Emit that DOCTYPE token.
6337                      */
6338                     systemIdentifier = strBufToString();
6339                     emitDoctypeToken(0);
6340                     /*
6341                      * Reconsume the EOF character in the data state.
6342                      */
6343                     break eofloop;
6344                 case AFTER_DOCTYPE_SYSTEM_IDENTIFIER:
6345                     errEofInDoctype();
6346                     /*
6347                      * Set the DOCTYPE token's force-quirks flag to on.
6348                      */
6349                     forceQuirks = true;
6350                     /*
6351                      * Emit that DOCTYPE token.
6352                      */
6353                     emitDoctypeToken(0);
6354                     /*
6355                      * Reconsume the EOF character in the data state.
6356                      */
6357                     break eofloop;
6358                 case BOGUS_DOCTYPE:
6359                     /*
6360                      * Emit that DOCTYPE token.
6361                      */
6362                     emitDoctypeToken(0);
6363                     /*
6364                      * Reconsume the EOF character in the data state.
6365                      */
6366                     break eofloop;
6367                 case CONSUME_CHARACTER_REFERENCE:
6368                     /*
                     * Unlike the definition in the spec, this state does not
6370                      * return a value and never requires the caller to
6371                      * backtrack. This state takes care of emitting characters
6372                      * or appending to the current attribute value. It also
6373                      * takes care of that in the case when consuming the entity
6374                      * fails.
6375                      */
6376                     /*
6377                      * This section defines how to consume an entity. This
6378                      * definition is used when parsing entities in text and in
6379                      * attributes.
6380                      *
6381                      * The behavior depends on the identity of the next
6382                      * character (the one immediately after the U+0026 AMPERSAND
6383                      * character):
6384                      */
6385 
6386                     emitOrAppendCharRefBuf(returnState);
6387                     state = returnState;
6388                     continue;
6389                 case CHARACTER_REFERENCE_HILO_LOOKUP:
6390                     errNoNamedCharacterMatch();
6391                     emitOrAppendCharRefBuf(returnState);
6392                     state = returnState;
6393                     continue;
6394                 case CHARACTER_REFERENCE_TAIL:
6395                     outer: for (;;) {
6396                         char c = '\u0000';
6397                         entCol++;
6398                         /*
6399                          * Consume the maximum number of characters possible,
6400                          * with the consumed characters matching one of the
6401                          * identifiers in the first column of the named
6402                          * character references table (in a case-sensitive
6403                          * manner).
6404                          */
6405                         hiloop: for (;;) {
6406                             if (hi == -1) {
6407                                 break hiloop;
6408                             }
6409                             if (entCol == NamedCharacters.NAMES[hi].length()) {
6410                                 break hiloop;
6411                             }
6412                             if (entCol > NamedCharacters.NAMES[hi].length()) {
6413                                 break outer;
6414                             } else if (c < NamedCharacters.NAMES[hi].charAt(entCol)) {
6415                                 hi--;
6416                             } else {
6417                                 break hiloop;
6418                             }
6419                         }
6420 
6421                         loloop: for (;;) {
6422                             if (hi < lo) {
6423                                 break outer;
6424                             }
6425                             if (entCol == NamedCharacters.NAMES[lo].length()) {
6426                                 candidate = lo;
6427                                 charRefBufMark = charRefBufLen;
6428                                 lo++;
6429                             } else if (entCol > NamedCharacters.NAMES[lo].length()) {
6430                                 break outer;
6431                             } else if (c > NamedCharacters.NAMES[lo].charAt(entCol)) {
6432                                 lo++;
6433                             } else {
6434                                 break loloop;
6435                             }
6436                         }
6437                         if (hi < lo) {
6438                             break outer;
6439                         }
6440                         continue;
6441                     }
6442 
6443                     if (candidate == -1) {
6444                         /*
6445                          * If no match can be made, then this is a parse error.
6446                          */
6447                         errNoNamedCharacterMatch();
6448                         emitOrAppendCharRefBuf(returnState);
6449                         state = returnState;
6450                         continue eofloop;
6451                     } else {
6452                         @Const @CharacterName String candidateName = NamedCharacters.NAMES[candidate];
6453                         if (candidateName.length() == 0
6454                                 || candidateName.charAt(candidateName.length() - 1) != ';') {
6455                             /*
6456                              * If the last character matched is not a U+003B
6457                              * SEMICOLON (;), there is a parse error.
6458                              */
6459                             if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
6460                                 /*
6461                                  * If the entity is being consumed as part of an
6462                                  * attribute, and the last character matched is
6463                                  * not a U+003B SEMICOLON (;),
6464                                  */
6465                                 char ch;
6466                                 if (charRefBufMark == charRefBufLen) {
6467                                     ch = '\u0000';
6468                                 } else {
6469                                     ch = charRefBuf[charRefBufMark];
6470                                 }
6471                                 if ((ch >= '0' && ch <= '9')
6472                                         || (ch >= 'A' && ch <= 'Z')
6473                                         || (ch >= 'a' && ch <= 'z')) {
6474                                     /*
6475                                      * and the next character is in the range
6476                                      * U+0030 DIGIT ZERO to U+0039 DIGIT NINE,
6477                                      * U+0041 LATIN CAPITAL LETTER A to U+005A
6478                                      * LATIN CAPITAL LETTER Z, or U+0061 LATIN
6479                                      * SMALL LETTER A to U+007A LATIN SMALL
6480                                      * LETTER Z, then, for historical reasons,
6481                                      * all the characters that were matched
6482                                      * after the U+0026 AMPERSAND (&) must be
6483                                      * unconsumed, and nothing is returned.
6484                                      */
6485                                     errNoNamedCharacterMatch();
6486                                     appendCharRefBufToStrBuf();
6487                                     state = returnState;
6488                                     continue eofloop;
6489                                 }
6490                             }
6491                             if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
6492                                 errUnescapedAmpersandInterpretedAsCharacterReference();
6493                             } else {
6494                                 errNotSemicolonTerminated();
6495                             }
6496                         }
6497 
6498                         /*
6499                          * Otherwise, return a character token for the character
6500                          * corresponding to the entity name (as given by the
6501                          * second column of the named character references
6502                          * table).
6503                          */
6504                         @Const @NoLength char[] val = NamedCharacters.VALUES[candidate];
6505                         if (
6506                         // [NOCPP[
6507                         val.length == 1
6508                         // ]NOCPP]
6509                         // CPPONLY: val[1] == 0
6510                         ) {
6511                             emitOrAppendOne(val, returnState);
6512                         } else {
6513                             emitOrAppendTwo(val, returnState);
6514                         }
6515                         // this is so complicated!
6516                         if (charRefBufMark < charRefBufLen) {
6517                             if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
6518                                 appendStrBuf(charRefBuf, charRefBufMark,
6519                                         charRefBufLen - charRefBufMark);
6520                             } else {
6521                                 tokenHandler.characters(charRefBuf, charRefBufMark,
6522                                         charRefBufLen - charRefBufMark);
6523                             }
6524                         }
6525                         charRefBufLen = 0;
6526                         state = returnState;
6527                         continue eofloop;
6528                         /*
6529                          * If the markup contains I'm &notit; I tell you, the
6530                          * entity is parsed as "not", as in, I'm ¬it; I tell
6531                          * you. But if the markup was I'm &notin; I tell you,
6532                          * the entity would be parsed as "notin;", resulting in
6533                          * I'm ∉ I tell you.
6534                          */
6535                     }
6536                 case CONSUME_NCR:
6537                 case DECIMAL_NRC_LOOP:
6538                 case HEX_NCR_LOOP:
6539                     /*
6540                      * If no characters match the range, then don't consume any
6541                      * characters (and unconsume the U+0023 NUMBER SIGN
6542                      * character and, if appropriate, the X character). This is
6543                      * a parse error; nothing is returned.
6544                      *
6545                      * Otherwise, if the next character is a U+003B SEMICOLON,
6546                      * consume that too. If it isn't, there is a parse error.
6547                      */
6548                     if (!seenDigits) {
6549                         errNoDigitsInNCR();
6550                         emitOrAppendCharRefBuf(returnState);
6551                         state = returnState;
6552                         continue;
6553                     } else {
6554                         errCharRefLacksSemicolon();
6555                     }
6556                     // WARNING previous state sets reconsume
6557                     handleNcrValue(returnState);
6558                     state = returnState;
6559                     continue;
6560                 case CDATA_RSQB:
6561                     tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 1);
6562                     break eofloop;
6563                 case CDATA_RSQB_RSQB:
6564                     tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 2);
6565                     break eofloop;
6566                 case DATA:
6567                 default:
6568                     break eofloop;
6569             }
6570         }
6571         // case DATA:
6572         /*
6573          * EOF Emit an end-of-file token.
6574          */
6575         tokenHandler.eof();
6576         return;
6577     }
6578 
emitDoctypeToken(int pos)6579     private void emitDoctypeToken(int pos) throws SAXException {
6580         cstart = pos + 1;
6581         tokenHandler.doctype(doctypeName, publicIdentifier, systemIdentifier,
6582                 forceQuirks);
6583         // It is OK and sufficient to release these here, since
6584         // there's no way out of the doctype states than through paths
6585         // that call this method.
6586         doctypeName = null;
6587         Portability.releaseString(publicIdentifier);
6588         publicIdentifier = null;
6589         Portability.releaseString(systemIdentifier);
6590         systemIdentifier = null;
6591     }
6592 
    /**
     * Reads one character from a buffer. This implementation simply returns
     * <code>buf[pos]</code>; the method is protected, so a subclass may
     * substitute its own behavior (presumably character validation, given
     * the name -- TODO confirm).
     *
     * @param buf the buffer to read from
     * @param pos index of the character to read
     * @return the character at <code>pos</code>
     * @throws SAXException permitted by the signature; not thrown by this
     *         implementation
     */
    @Inline protected char checkChar(@NoLength char[] buf, int pos)
            throws SAXException {
        return buf[pos];
    }
6597 
internalEncodingDeclaration(String internalCharset)6598     public boolean internalEncodingDeclaration(String internalCharset)
6599             throws SAXException {
6600         if (encodingDeclarationHandler != null) {
6601             return encodingDeclarationHandler.internalEncodingDeclaration(internalCharset);
6602         }
6603         return false;
6604     }
6605 
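    /**
     * Delivers <code>val[0]</code> and <code>val[1]</code>: appended through
     * <code>appendStrBuf</code> when <code>returnState</code> has a bit of
     * <code>DATA_AND_RCDATA_MASK</code> set, otherwise passed to the token
     * handler as two characters.
     *
     * @param val array whose first two elements are delivered
     * @param returnState state value tested against the mask
     * @throws SAXException if raised while delivering the characters
     */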
emitOrAppendTwo(@onst @oLength char[] val, int returnState)6610     private void emitOrAppendTwo(@Const @NoLength char[] val, int returnState)
6611             throws SAXException {
6612         if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
6613             appendStrBuf(val[0]);
6614             appendStrBuf(val[1]);
6615         } else {
6616             tokenHandler.characters(val, 0, 2);
6617         }
6618     }
6619 
emitOrAppendOne(@onst @oLength char[] val, int returnState)6620     private void emitOrAppendOne(@Const @NoLength char[] val, int returnState)
6621             throws SAXException {
6622         if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
6623             appendStrBuf(val[0]);
6624         } else {
6625             tokenHandler.characters(val, 0, 1);
6626         }
6627     }
6628 
    /**
     * Drops the references this tokenizer holds (string buffer, doctype
     * fields, tag name, attribute name, attributes) and tells the token
     * handler that tokenization has ended.
     *
     * @throws SAXException declared by the signature; presumably propagated
     *         from <code>tokenHandler.endTokenization()</code> -- TODO confirm
     */
    public void end() throws SAXException {
        strBuf = null;
        doctypeName = null;
        // Identifiers and names are released before their fields are nulled.
        if (systemIdentifier != null) {
            Portability.releaseString(systemIdentifier);
            systemIdentifier = null;
        }
        if (publicIdentifier != null) {
            Portability.releaseString(publicIdentifier);
            publicIdentifier = null;
        }
        if (tagName != null) {
            tagName.release();
            tagName = null;
        }
        if (attributeName != null) {
            attributeName.release();
            attributeName = null;
        }
        // The handler is notified after the fields above are cleared and
        // before the attributes holder is dealt with.
        tokenHandler.endTokenization();
        if (attributes != null) {
            // The marker comments below are kept verbatim; they look like
            // directives for a Java-to-C++ translation step -- TODO confirm.
            // [NOCPP[
            attributes = null;
            // ]NOCPP]
            // CPPONLY: attributes.clear(mappingLangToXmlLang);
        }
    }
6656 
    /**
     * Sets the <code>shouldSuspend</code> flag. The flag is cleared again by
     * <code>resetToDataState()</code> and <code>loadState(Tokenizer)</code>;
     * where it is read is outside this part of the file.
     */
    public void requestSuspension() {
        shouldSuspend = true;
    }
6660 
6661     // [NOCPP[
6662 
    /**
     * Sets the <code>confident</code> flag to <code>true</code>.
     * <code>initializeWithoutStarting()</code> resets it to
     * <code>false</code>. Presumably this is confidence about the character
     * encoding -- TODO confirm.
     */
    public void becomeConfident() {
        confident = true;
    }
6666 
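    /**
     * Reports whether the next character is on a new line. This
     * implementation does not consult any field and always answers
     * <code>false</code>.
     *
     * @return <code>false</code> in this implementation
     */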
    public boolean isNextCharOnNewLine() {
        // Constant answer: no field is consulted here.
        return false;
    }
6675 
    /**
     * Returns the current value of the <code>lastCR</code> flag. Judging by
     * the names, the flag records that the previously seen character was a
     * carriage return -- TODO confirm.
     *
     * @return the value of <code>lastCR</code>
     */
    public boolean isPrevCR() {
        return lastCR;
    }
6679 
    /**
     * Returns the line. This implementation does not consult any field and
     * always answers <code>-1</code>.
     *
     * @return <code>-1</code> in this implementation
     */
    public int getLine() {
        // Constant answer: no field is consulted here.
        return -1;
    }
6688 
    /**
     * Returns the col. This implementation does not consult any field and
     * always answers <code>-1</code>.
     *
     * @return <code>-1</code> in this implementation
     */
    public int getCol() {
        // Constant answer: no field is consulted here.
        return -1;
    }
6697 
6698     // ]NOCPP]
6699 
    /**
     * Tells whether the saved tokenizer state is the data state.
     *
     * @return <code>true</code> if <code>stateSave</code> equals
     *         <code>DATA</code>
     */
    public boolean isInDataState() {
        return (stateSave == DATA);
    }
6703 
resetToDataState()6704     public void resetToDataState() {
6705         clearStrBufAfterUse();
6706         charRefBufLen = 0;
6707         stateSave = Tokenizer.DATA;
6708         // line = 1; XXX line numbers
6709         lastCR = false;
6710         index = 0;
6711         forceQuirks = false;
6712         additional = '\u0000';
6713         entCol = -1;
6714         firstCharKey = -1;
6715         lo = 0;
6716         hi = 0; // will always be overwritten before use anyway
6717         candidate = -1;
6718         charRefBufMark = 0;
6719         value = 0;
6720         seenDigits = false;
6721         endTag = false;
6722         shouldSuspend = false;
6723         initDoctypeFields();
6724         if (tagName != null) {
6725             tagName.release();
6726             tagName = null;
6727         }
6728         if (attributeName != null) {
6729             attributeName.release();
6730             attributeName = null;
6731         }
6732         if (newAttributesEachTime) {
6733             if (attributes != null) {
6734                 Portability.delete(attributes);
6735                 attributes = null;
6736             }
6737         }
6738     }
6739 
loadState(Tokenizer other)6740     public void loadState(Tokenizer other) throws SAXException {
6741         strBufLen = other.strBufLen;
6742         if (strBufLen > strBuf.length) {
6743             strBuf = new char[strBufLen];
6744         }
6745         System.arraycopy(other.strBuf, 0, strBuf, 0, strBufLen);
6746 
6747         charRefBufLen = other.charRefBufLen;
6748         System.arraycopy(other.charRefBuf, 0, charRefBuf, 0, charRefBufLen);
6749 
6750         stateSave = other.stateSave;
6751         returnStateSave = other.returnStateSave;
6752         endTagExpectation = other.endTagExpectation;
6753         endTagExpectationAsArray = other.endTagExpectationAsArray;
6754         // line = 1; XXX line numbers
6755         lastCR = other.lastCR;
6756         index = other.index;
6757         forceQuirks = other.forceQuirks;
6758         additional = other.additional;
6759         entCol = other.entCol;
6760         firstCharKey = other.firstCharKey;
6761         lo = other.lo;
6762         hi = other.hi;
6763         candidate = other.candidate;
6764         charRefBufMark = other.charRefBufMark;
6765         value = other.value;
6766         seenDigits = other.seenDigits;
6767         endTag = other.endTag;
6768         shouldSuspend = false;
6769 
6770         if (other.doctypeName == null) {
6771             doctypeName = null;
6772         } else {
6773             doctypeName = Portability.newLocalFromLocal(other.doctypeName,
6774                     interner);
6775         }
6776 
6777         Portability.releaseString(systemIdentifier);
6778         if (other.systemIdentifier == null) {
6779             systemIdentifier = null;
6780         } else {
6781             systemIdentifier = Portability.newStringFromString(other.systemIdentifier);
6782         }
6783 
6784         Portability.releaseString(publicIdentifier);
6785         if (other.publicIdentifier == null) {
6786             publicIdentifier = null;
6787         } else {
6788             publicIdentifier = Portability.newStringFromString(other.publicIdentifier);
6789         }
6790 
6791         if (tagName != null) {
6792             tagName.release();
6793         }
6794         if (other.tagName == null) {
6795             tagName = null;
6796         } else {
6797             tagName = other.tagName.cloneElementName(interner);
6798         }
6799 
6800         if (attributeName != null) {
6801             attributeName.release();
6802         }
6803         if (other.attributeName == null) {
6804             attributeName = null;
6805         } else {
6806             attributeName = other.attributeName.cloneAttributeName(interner);
6807         }
6808 
6809         Portability.delete(attributes);
6810         if (other.attributes == null) {
6811             attributes = null;
6812         } else {
6813             attributes = other.attributes.cloneAttributes(interner);
6814         }
6815     }
6816 
    /**
     * Puts the tokenizer into its initial configuration and then calls
     * <code>resetToDataState()</code>. It clears the confidence flag, drops
     * the string buffer, sets the line counter to 1, clears the
     * <code>html4</code> and <code>metaBoundaryPassed</code> flags, and asks
     * the token handler whether it wants comments.
     *
     * @throws SAXException declared by the signature; presumably propagated
     *         from the token handler -- TODO confirm
     */
    public void initializeWithoutStarting() throws SAXException {
        confident = false;
        strBuf = null;
        line = 1;
        // The marker comments below are kept verbatim; they look like
        // directives for a Java-to-C++ translation step -- TODO confirm.
        // CPPONLY: attributeLine = 1;
        // [NOCPP[
        html4 = false;
        metaBoundaryPassed = false;
        wantsComments = tokenHandler.wantsComments();
        // A single attributes holder is created up front only when a new
        // one is not made each time.
        if (!newAttributesEachTime) {
            attributes = new HtmlAttributes(mappingLangToXmlLang);
        }
        // ]NOCPP]
        resetToDataState();
    }
6832 
    /**
     * Hook with an empty body in this class; a subclass may override it.
     * Judging by the name, it reports unexpected content after
     * <code>&lt;/</code> (call site not visible here -- TODO confirm).
     */
    protected void errGarbageAfterLtSlash() throws SAXException {
    }
6835 
    /**
     * Hook with an empty body in this class; a subclass may override it.
     * Judging by the name, it reports the sequence <code>&lt;/&gt;</code>
     * (call site not visible here -- TODO confirm).
     */
    protected void errLtSlashGt() throws SAXException {
    }
6838 
    /**
     * Hook with an empty body in this class; a subclass may override it.
     * Judging by the name, it warns about <code>&lt;/</code> inside RCDATA
     * content (call site not visible here -- TODO confirm).
     */
    protected void errWarnLtSlashInRcdata() throws SAXException {
    }
6841 
    /**
     * Hook with an empty body in this class; a subclass may override it.
     * Judging by the name, it is an HTML4-specific report about
     * <code>&lt;/</code> inside RCDATA content (call site not visible here
     * -- TODO confirm).
     *
     * @param folded a character supplied by the caller; presumably
     *        case-folded -- TODO confirm
     */
    protected void errHtml4LtSlashInRcdata(char folded) throws SAXException {
    }
6844 
    /**
     * Hook with an empty body in this class; a subclass may override it.
     * The end-of-file handling calls it for the numeric character reference
     * states (CONSUME_NCR, DECIMAL_NRC_LOOP, HEX_NCR_LOOP) when digits were
     * seen before the input ended. Other call sites may exist elsewhere in
     * the file.
     */
    protected void errCharRefLacksSemicolon() throws SAXException {
    }
6847 
    /**
     * Hook with an empty body in this class; a subclass may override it.
     * The end-of-file handling calls it for the numeric character reference
     * states (CONSUME_NCR, DECIMAL_NRC_LOOP, HEX_NCR_LOOP) when no digits
     * were seen. Other call sites may exist elsewhere in the file.
     */
    protected void errNoDigitsInNCR() throws SAXException {
    }
6850 
    /**
     * Hook with an empty body in this class; a subclass may override it.
     * Judging by the name, it reports a <code>&gt;</code> inside a doctype
     * system identifier (call site not visible here -- TODO confirm).
     */
    protected void errGtInSystemId() throws SAXException {
    }
6853 
    /**
     * Hook with an empty body in this class; a subclass may override it.
     * Judging by the name, it reports a <code>&gt;</code> inside a doctype
     * public identifier (call site not visible here -- TODO confirm).
     */
    protected void errGtInPublicId() throws SAXException {
    }
6856 
    /**
     * Hook with an empty body in this class; a subclass may override it.
     * Judging by the name, it reports a doctype without a name (call site
     * not visible here -- TODO confirm).
     */
    protected void errNamelessDoctype() throws SAXException {
    }
6859 
    /**
     * Hook with an empty body in this class; a subclass may override it.
     * Judging by the name, it reports consecutive hyphens, presumably inside
     * a comment (call site not visible here -- TODO confirm).
     */
    protected void errConsecutiveHyphens() throws SAXException {
    }
6862 
    /**
     * Hook with an empty body in this class; a subclass may override it.
     * Judging by the name, it reports a comment that ends prematurely (call
     * site not visible here -- TODO confirm).
     */
    protected void errPrematureEndOfComment() throws SAXException {
    }
6865 
    /**
     * Hook with an empty body in this class; a subclass may override it.
     * Judging by the name, it reports a bogus comment (call site not visible
     * here -- TODO confirm).
     */
    protected void errBogusComment() throws SAXException {
    }
6868 
    /**
     * Hook with an empty body in this class; a subclass may override it.
     * Judging by the name, it reports a problematic character (or a NUL) in
     * an unquoted attribute value (call site not visible here -- TODO
     * confirm).
     *
     * @param c the character supplied by the caller
     */
    protected void errUnquotedAttributeValOrNull(char c) throws SAXException {
    }
6871 
    /**
     * Hook with an empty body in this class; a subclass may override it.
     * Judging by the name, it reports a <code>/</code> that is not followed
     * by <code>&gt;</code> (call site not visible here -- TODO confirm).
     */
    protected void errSlashNotFollowedByGt() throws SAXException {
    }
6874 
    /**
     * Hook with an empty body in this class; a subclass may override it.
     * Judging by the name, it is an HTML4-specific report about XML-style
     * void element syntax (call site not visible here -- TODO confirm).
     */
    protected void errHtml4XmlVoidSyntax() throws SAXException {
    }
6877 
    /**
     * Hook with an empty body in this class; a subclass may override it.
     * Judging by the name, it reports attributes that are not separated by
     * whitespace (call site not visible here -- TODO confirm).
     */
    protected void errNoSpaceBetweenAttributes() throws SAXException {
    }
6880 
    /**
     * Hook with an empty body in this class; a subclass may override it.
     * Judging by the name, it is an HTML4-specific report about a non-name
     * character in an unquoted attribute value (call site not visible here
     * -- TODO confirm).
     *
     * @param c the character supplied by the caller
     */
    protected void errHtml4NonNameInUnquotedAttribute(char c)
            throws SAXException {
    }
6884 
    /**
     * Hook with an empty body in this class; a subclass may override it.
     * Judging by the name, it reports <code>&lt;</code>, <code>=</code>, a
     * grave accent or a NUL in an unquoted attribute value (call site not
     * visible here -- TODO confirm).
     *
     * @param c the character supplied by the caller
     */
    protected void errLtOrEqualsOrGraveInUnquotedAttributeOrNull(char c)
            throws SAXException {
    }
6888 
    /**
     * Hook with an empty body in this class; a subclass may override it.
     * Judging by the name, it reports a missing attribute value (call site
     * not visible here -- TODO confirm).
     */
    protected void errAttributeValueMissing() throws SAXException {
    }
6891 
    /**
     * Hook with an empty body in this class; a subclass may override it.
     * Judging by the name, it reports a bad character (or a NUL) before an
     * attribute name (call site not visible here -- TODO confirm).
     *
     * @param c the character supplied by the caller
     */
    protected void errBadCharBeforeAttributeNameOrNull(char c)
            throws SAXException {
    }
6895 
    /**
     * Hook with an empty body in this class; a subclass may override it.
     * Judging by the name, it reports an equals sign before an attribute
     * name (call site not visible here -- TODO confirm).
     */
    protected void errEqualsSignBeforeAttributeName() throws SAXException {
    }
6898 
    /**
     * Hook with an empty body in this class; a subclass may override it.
     * Judging by the name, it reports a bad character after
     * <code>&lt;</code> (call site not visible here -- TODO confirm).
     *
     * @param c the character supplied by the caller
     */
    protected void errBadCharAfterLt(char c) throws SAXException {
    }
6901 
    /**
     * Hook with an empty body in this class; a subclass may override it.
     * Judging by the name, it reports the sequence <code>&lt;&gt;</code>
     * (call site not visible here -- TODO confirm).
     */
    protected void errLtGt() throws SAXException {
    }
6904 
    /**
     * Hook with an empty body in this class; a subclass may override it.
     * Judging by the name, it reports something that looks like a processing
     * instruction (call site not visible here -- TODO confirm).
     */
    protected void errProcessingInstruction() throws SAXException {
    }
6907 
    /**
     * Hook with an empty body in this class; a subclass may override it.
     * The end-of-file handling of CHARACTER_REFERENCE_TAIL calls it when the
     * matched named character reference does not end in a semicolon and the
     * return state has a bit of <code>DATA_AND_RCDATA_MASK</code> set. Other
     * call sites may exist elsewhere in the file.
     */
    protected void errUnescapedAmpersandInterpretedAsCharacterReference()
            throws SAXException {
    }
6911 
    /**
     * Hook with an empty body in this class; a subclass may override it.
     * The end-of-file handling of CHARACTER_REFERENCE_TAIL calls it when the
     * matched named character reference does not end in a semicolon and the
     * return state has no bit of <code>DATA_AND_RCDATA_MASK</code> set.
     * Other call sites may exist elsewhere in the file.
     */
    protected void errNotSemicolonTerminated() throws SAXException {
    }
6914 
    /**
     * Hook with an empty body in this class; a subclass may override it.
     * The end-of-file handling calls it when the input ends in
     * CHARACTER_REFERENCE_HILO_LOOKUP, and in CHARACTER_REFERENCE_TAIL when
     * no candidate name matched. It is also called there when an
     * unterminated match is followed by an ASCII letter or digit while the
     * return state has a bit of <code>DATA_AND_RCDATA_MASK</code> set. Other
     * call sites may exist elsewhere in the file.
     */
    protected void errNoNamedCharacterMatch() throws SAXException {
    }
6917 
    /**
     * Hook with an empty body in this class; a subclass may override it.
     * Judging by the name, it reports a quote character before an attribute
     * name (call site not visible here -- TODO confirm).
     *
     * @param c the character supplied by the caller
     */
    protected void errQuoteBeforeAttributeName(char c) throws SAXException {
    }
6920 
    /**
     * Hook with an empty body in this class; a subclass may override it.
     * Judging by the name, it reports a quote, <code>&lt;</code> or NUL
     * inside an attribute name (call site not visible here -- TODO confirm).
     *
     * @param c the character supplied by the caller
     */
    protected void errQuoteOrLtInAttributeNameOrNull(char c)
            throws SAXException {
    }
6924 
    /**
     * Hook with an empty body in this class; a subclass may override it.
     * Judging by the name, it reports that a doctype public identifier was
     * expected (call site not visible here -- TODO confirm).
     */
    protected void errExpectedPublicId() throws SAXException {
    }
6927 
    /**
     * Hook with an empty body in this class; a subclass may override it.
     * Judging by the name, it reports a bogus doctype (call site not visible
     * here -- TODO confirm).
     */
    protected void errBogusDoctype() throws SAXException {
    }
6930 
    /**
     * Hook with an empty body in this class; a subclass may override it.
     * Judging by the name, it may warn about an astral private-use character
     * (call site not visible here -- TODO confirm).
     */
    protected void maybeWarnPrivateUseAstral() throws SAXException {
    }
6933 
    /**
     * Hook with an empty body in this class; a subclass may override it.
     * Judging by the name, it may warn when the character is a private-use
     * character (call site not visible here -- TODO confirm).
     *
     * @param ch the character supplied by the caller
     */
    protected void maybeWarnPrivateUse(char ch) throws SAXException {
    }
6936 
    /**
     * Hook with an empty body in this class; a subclass may override it.
     * Judging by the name, it may report attributes on an end tag (call site
     * not visible here -- TODO confirm).
     *
     * @param attrs the attributes holder supplied by the caller
     */
    protected void maybeErrAttributesOnEndTag(HtmlAttributes attrs)
            throws SAXException {
    }
6940 
    /**
     * Hook with an empty body in this class; a subclass may override it.
     * Judging by the name, it may report a self-closing slash on an end tag
     * (call site not visible here -- TODO confirm).
     *
     * @param selfClosing flag supplied by the caller
     */
    protected void maybeErrSlashInEndTag(boolean selfClosing)
            throws SAXException {
    }
6944 
    /**
     * Hook that returns its argument unchanged in this class; a subclass
     * may override it. Judging by the name, it concerns a numeric character
     * reference that expands to a non-character (call site not visible here
     * -- TODO confirm).
     *
     * @param ch the character supplied by the caller
     * @return <code>ch</code>, unchanged in this implementation
     */
    protected char errNcrNonCharacter(char ch) throws SAXException {
        return ch;
    }
6948 
    /**
     * Hook with an empty body in this class; a subclass may override it.
     * Judging by the name, it reports an astral non-character (call site not
     * visible here -- TODO confirm).
     *
     * @param ch the value supplied by the caller; presumably a code point
     *        -- TODO confirm
     */
    protected void errAstralNonCharacter(int ch) throws SAXException {
    }
6951 
    /**
     * Hook with an empty body in this class; a subclass may override it.
     * Judging by the name, it reports a numeric character reference that
     * expands to a surrogate (call site not visible here -- TODO confirm).
     */
    protected void errNcrSurrogate() throws SAXException {
    }
6954 
    /**
     * Hook that returns its argument unchanged in this class; a subclass
     * may override it. Judging by the name, it concerns a numeric character
     * reference that expands to a control character (call site not visible
     * here -- TODO confirm). A no-argument overload with the same name also
     * exists.
     *
     * @param ch the character supplied by the caller
     * @return <code>ch</code>, unchanged in this implementation
     */
    protected char errNcrControlChar(char ch) throws SAXException {
        return ch;
    }
6958 
    /**
     * Hook with an empty body in this class; a subclass may override it.
     * Judging by the name, it reports a numeric character reference that
     * expands to a carriage return (call site not visible here -- TODO
     * confirm).
     */
    protected void errNcrCr() throws SAXException {
    }
6961 
    /**
     * Hook with an empty body in this class; a subclass may override it.
     * Judging by the name, it reports a numeric character reference in the
     * C1 controls range (call site not visible here -- TODO confirm).
     */
    protected void errNcrInC1Range() throws SAXException {
    }
6964 
    /**
     * Hook with an empty body in this class; a subclass may override it.
     * The end-of-file handling calls it when the input ends in
     * DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED or
     * DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED, before the doctype token is
     * emitted with force-quirks set.
     */
    protected void errEofInPublicId() throws SAXException {
    }
6967 
    /**
     * Hook with an empty body in this class; a subclass may override it.
     * The end-of-file handling calls it when the input ends in one of the
     * comment states (COMMENT_START, COMMENT, COMMENT_END, COMMENT_END_DASH,
     * COMMENT_START_DASH, COMMENT_END_BANG), before the comment is emitted.
     */
    protected void errEofInComment() throws SAXException {
    }
6970 
    /**
     * Hook with an empty body in this class; a subclass may override it.
     * The end-of-file handling calls it when the input ends in a doctype
     * state other than the quoted-identifier states and BOGUS_DOCTYPE (for
     * example DOCTYPE, DOCTYPE_NAME or AFTER_DOCTYPE_SYSTEM_IDENTIFIER),
     * before the doctype token is emitted with force-quirks set.
     */
    protected void errEofInDoctype() throws SAXException {
    }
6973 
    /**
     * Hook with an empty body in this class; a subclass may override it.
     * Judging by the name, it reports end of file inside an attribute value
     * (call site not visible here -- TODO confirm).
     */
    protected void errEofInAttributeValue() throws SAXException {
    }
6976 
    /**
     * Hook with an empty body in this class; a subclass may override it.
     * Judging by the name, it reports end of file inside an attribute name
     * (call site not visible here -- TODO confirm).
     */
    protected void errEofInAttributeName() throws SAXException {
    }
6979 
errEofWithoutGt()6980     protected void errEofWithoutGt() throws SAXException {
6981     }
6982 
errEofInTagName()6983     protected void errEofInTagName() throws SAXException {
6984     }
6985 
errEofInEndTag()6986     protected void errEofInEndTag() throws SAXException {
6987     }
6988 
errEofAfterLt()6989     protected void errEofAfterLt() throws SAXException {
6990     }
6991 
errNcrOutOfRange()6992     protected void errNcrOutOfRange() throws SAXException {
6993     }
6994 
errNcrUnassigned()6995     protected void errNcrUnassigned() throws SAXException {
6996     }
6997 
errDuplicateAttribute()6998     protected void errDuplicateAttribute() throws SAXException {
6999     }
7000 
errEofInSystemId()7001     protected void errEofInSystemId() throws SAXException {
7002     }
7003 
errExpectedSystemId()7004     protected void errExpectedSystemId() throws SAXException {
7005     }
7006 
errMissingSpaceBeforeDoctypeName()7007     protected void errMissingSpaceBeforeDoctypeName() throws SAXException {
7008     }
7009 
errHyphenHyphenBang()7010     protected void errHyphenHyphenBang() throws SAXException {
7011     }
7012 
errNcrControlChar()7013     protected void errNcrControlChar() throws SAXException {
7014     }
7015 
errNcrZero()7016     protected void errNcrZero() throws SAXException {
7017     }
7018 
errNoSpaceBetweenDoctypeSystemKeywordAndQuote()7019     protected void errNoSpaceBetweenDoctypeSystemKeywordAndQuote()
7020             throws SAXException {
7021     }
7022 
errNoSpaceBetweenPublicAndSystemIds()7023     protected void errNoSpaceBetweenPublicAndSystemIds() throws SAXException {
7024     }
7025 
errNoSpaceBetweenDoctypePublicKeywordAndQuote()7026     protected void errNoSpaceBetweenDoctypePublicKeywordAndQuote()
7027             throws SAXException {
7028     }
7029 
noteAttributeWithoutValue()7030     protected void noteAttributeWithoutValue() throws SAXException {
7031     }
7032 
noteUnquotedAttributeValue()7033     protected void noteUnquotedAttributeValue() throws SAXException {
7034     }
7035 
7036     /**
7037      * Sets the encodingDeclarationHandler.
7038      *
7039      * @param encodingDeclarationHandler
7040      *            the encodingDeclarationHandler to set
7041      */
setEncodingDeclarationHandler( EncodingDeclarationHandler encodingDeclarationHandler)7042     public void setEncodingDeclarationHandler(
7043             EncodingDeclarationHandler encodingDeclarationHandler) {
7044         this.encodingDeclarationHandler = encodingDeclarationHandler;
7045     }
7046 
destructor()7047     void destructor() {
7048         // The translator will write refcount tracing stuff here
7049         Portability.delete(attributes);
7050         attributes = null;
7051     }
7052 
7053     // [NOCPP[
7054 
7055     /**
7056      * Sets an offset to be added to the position reported to
7057      * <code>TransitionHandler</code>.
7058      *
7059      * @param offset the offset
7060      */
setTransitionBaseOffset(int offset)7061     public void setTransitionBaseOffset(int offset) {
7062 
7063     }
7064 
7065     // ]NOCPP]
7066 
7067 }
7068