1 /*
2  * Copyright (c) 2005-2007 Henri Sivonen
3  * Copyright (c) 2007-2015 Mozilla Foundation
4  * Portions of comments Copyright 2004-2010 Apple Computer, Inc., Mozilla
5  * Foundation, and Opera Software ASA.
6  *
7  * Permission is hereby granted, free of charge, to any person obtaining a
8  * copy of this software and associated documentation files (the "Software"),
9  * to deal in the Software without restriction, including without limitation
10  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
11  * and/or sell copies of the Software, and to permit persons to whom the
12  * Software is furnished to do so, subject to the following conditions:
13  *
14  * The above copyright notice and this permission notice shall be included in
15  * all copies or substantial portions of the Software.
16  *
17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
23  * DEALINGS IN THE SOFTWARE.
24  */
25 
26 /*
27  * The comments following this one that use the same comment syntax as this
28  * comment are quotes from the WHATWG HTML 5 spec as of 2 June 2007
29  * amended as of June 18 2008 and May 31 2010.
30  * That document came with this statement:
31  * "© Copyright 2004-2010 Apple Computer, Inc., Mozilla Foundation, and
32  * Opera Software ASA. You are granted a license to use, reproduce and
33  * create derivative works of this document."
34  */
35 
36 package nu.validator.htmlparser.impl;
37 
38 import org.xml.sax.ErrorHandler;
39 import org.xml.sax.Locator;
40 import org.xml.sax.SAXException;
41 import org.xml.sax.SAXParseException;
42 
43 import nu.validator.htmlparser.annotation.Auto;
44 import nu.validator.htmlparser.annotation.CharacterName;
45 import nu.validator.htmlparser.annotation.Const;
46 import nu.validator.htmlparser.annotation.Inline;
47 import nu.validator.htmlparser.annotation.Local;
48 import nu.validator.htmlparser.annotation.NoLength;
49 import nu.validator.htmlparser.common.EncodingDeclarationHandler;
50 import nu.validator.htmlparser.common.Interner;
51 import nu.validator.htmlparser.common.TokenHandler;
52 import nu.validator.htmlparser.common.XmlViolationPolicy;
53 
54 /**
55  * An implementation of
56  * https://html.spec.whatwg.org/multipage/syntax.html#tokenization
57  *
58  * This class implements the <code>Locator</code> interface. This is not an
59  * incidental implementation detail: Users of this class are encouraged to make
60  * use of the <code>Locator</code> nature.
61  *
62  * By default, the tokenizer may report data that XML 1.0 bans. The tokenizer
63  * can be configured to treat these conditions as fatal or to coerce the infoset
64  * to something that XML 1.0 allows.
65  *
66  * @version $Id$
67  * @author hsivonen
68  */
69 public class Tokenizer implements Locator {
70 
// Mask with every bit set except the lowest one. Since DATA is 0 and RCDATA
// is 1, (state & DATA_AND_RCDATA_MASK) evaluates to zero exactly when the
// state value is 0 or 1; emitOrAppendCharRefBuf() relies on this to tell
// those two return states apart from all the others.
private static final int DATA_AND_RCDATA_MASK = ~1;
72 
73     public static final int DATA = 0;
74 
75     public static final int RCDATA = 1;
76 
77     public static final int SCRIPT_DATA = 2;
78 
79     public static final int RAWTEXT = 3;
80 
81     public static final int SCRIPT_DATA_ESCAPED = 4;
82 
83     public static final int ATTRIBUTE_VALUE_DOUBLE_QUOTED = 5;
84 
85     public static final int ATTRIBUTE_VALUE_SINGLE_QUOTED = 6;
86 
87     public static final int ATTRIBUTE_VALUE_UNQUOTED = 7;
88 
89     public static final int PLAINTEXT = 8;
90 
91     public static final int TAG_OPEN = 9;
92 
93     public static final int CLOSE_TAG_OPEN = 10;
94 
95     public static final int TAG_NAME = 11;
96 
97     public static final int BEFORE_ATTRIBUTE_NAME = 12;
98 
99     public static final int ATTRIBUTE_NAME = 13;
100 
101     public static final int AFTER_ATTRIBUTE_NAME = 14;
102 
103     public static final int BEFORE_ATTRIBUTE_VALUE = 15;
104 
105     public static final int AFTER_ATTRIBUTE_VALUE_QUOTED = 16;
106 
107     public static final int BOGUS_COMMENT = 17;
108 
109     public static final int MARKUP_DECLARATION_OPEN = 18;
110 
111     public static final int DOCTYPE = 19;
112 
113     public static final int BEFORE_DOCTYPE_NAME = 20;
114 
115     public static final int DOCTYPE_NAME = 21;
116 
117     public static final int AFTER_DOCTYPE_NAME = 22;
118 
119     public static final int BEFORE_DOCTYPE_PUBLIC_IDENTIFIER = 23;
120 
121     public static final int DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED = 24;
122 
123     public static final int DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED = 25;
124 
125     public static final int AFTER_DOCTYPE_PUBLIC_IDENTIFIER = 26;
126 
127     public static final int BEFORE_DOCTYPE_SYSTEM_IDENTIFIER = 27;
128 
129     public static final int DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED = 28;
130 
131     public static final int DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED = 29;
132 
133     public static final int AFTER_DOCTYPE_SYSTEM_IDENTIFIER = 30;
134 
135     public static final int BOGUS_DOCTYPE = 31;
136 
137     public static final int COMMENT_START = 32;
138 
139     public static final int COMMENT_START_DASH = 33;
140 
141     public static final int COMMENT = 34;
142 
143     public static final int COMMENT_END_DASH = 35;
144 
145     public static final int COMMENT_END = 36;
146 
147     public static final int COMMENT_END_BANG = 37;
148 
149     public static final int NON_DATA_END_TAG_NAME = 38;
150 
151     public static final int MARKUP_DECLARATION_HYPHEN = 39;
152 
153     public static final int MARKUP_DECLARATION_OCTYPE = 40;
154 
155     public static final int DOCTYPE_UBLIC = 41;
156 
157     public static final int DOCTYPE_YSTEM = 42;
158 
159     public static final int AFTER_DOCTYPE_PUBLIC_KEYWORD = 43;
160 
161     public static final int BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS = 44;
162 
163     public static final int AFTER_DOCTYPE_SYSTEM_KEYWORD = 45;
164 
165     public static final int CONSUME_CHARACTER_REFERENCE = 46;
166 
167     public static final int CONSUME_NCR = 47;
168 
169     public static final int CHARACTER_REFERENCE_TAIL = 48;
170 
171     public static final int HEX_NCR_LOOP = 49;
172 
173     public static final int DECIMAL_NRC_LOOP = 50;
174 
175     public static final int HANDLE_NCR_VALUE = 51;
176 
177     public static final int HANDLE_NCR_VALUE_RECONSUME = 52;
178 
179     public static final int CHARACTER_REFERENCE_HILO_LOOKUP = 53;
180 
181     public static final int SELF_CLOSING_START_TAG = 54;
182 
183     public static final int CDATA_START = 55;
184 
185     public static final int CDATA_SECTION = 56;
186 
187     public static final int CDATA_RSQB = 57;
188 
189     public static final int CDATA_RSQB_RSQB = 58;
190 
191     public static final int SCRIPT_DATA_LESS_THAN_SIGN = 59;
192 
193     public static final int SCRIPT_DATA_ESCAPE_START = 60;
194 
195     public static final int SCRIPT_DATA_ESCAPE_START_DASH = 61;
196 
197     public static final int SCRIPT_DATA_ESCAPED_DASH = 62;
198 
199     public static final int SCRIPT_DATA_ESCAPED_DASH_DASH = 63;
200 
201     public static final int BOGUS_COMMENT_HYPHEN = 64;
202 
203     public static final int RAWTEXT_RCDATA_LESS_THAN_SIGN = 65;
204 
205     public static final int SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN = 66;
206 
207     public static final int SCRIPT_DATA_DOUBLE_ESCAPE_START = 67;
208 
209     public static final int SCRIPT_DATA_DOUBLE_ESCAPED = 68;
210 
211     public static final int SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN = 69;
212 
213     public static final int SCRIPT_DATA_DOUBLE_ESCAPED_DASH = 70;
214 
215     public static final int SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH = 71;
216 
217     public static final int SCRIPT_DATA_DOUBLE_ESCAPE_END = 72;
218 
219     public static final int PROCESSING_INSTRUCTION = 73;
220 
221     public static final int PROCESSING_INSTRUCTION_QUESTION_MARK = 74;
222 
223     /**
224      * Magic value for UTF-16 operations.
225      */
226     private static final int LEAD_OFFSET = (0xD800 - (0x10000 >> 10));
227 
228     /**
229      * UTF-16 code unit array containing less than and greater than for emitting
230      * those characters on certain parse errors.
231      */
232     private static final @NoLength char[] LT_GT = { '<', '>' };
233 
234     /**
235      * UTF-16 code unit array containing less than and solidus for emitting
236      * those characters on certain parse errors.
237      */
238     private static final @NoLength char[] LT_SOLIDUS = { '<', '/' };
239 
240     /**
241      * UTF-16 code unit array containing ]] for emitting those characters on
242      * state transitions.
243      */
244     private static final @NoLength char[] RSQB_RSQB = { ']', ']' };
245 
246     /**
247      * Array version of U+FFFD.
248      */
249     private static final @NoLength char[] REPLACEMENT_CHARACTER = { '\uFFFD' };
250 
251     // [NOCPP[
252 
253     /**
254      * Array version of space.
255      */
256     private static final @NoLength char[] SPACE = { ' ' };
257 
258     // ]NOCPP]
259 
260     /**
261      * Array version of line feed.
262      */
263     private static final @NoLength char[] LF = { '\n' };
264 
265     /**
266      * "CDATA[" as <code>char[]</code>
267      */
268     private static final @NoLength char[] CDATA_LSQB = { 'C', 'D', 'A', 'T',
269             'A', '[' };
270 
271     /**
272      * "octype" as <code>char[]</code>
273      */
274     private static final @NoLength char[] OCTYPE = { 'o', 'c', 't', 'y', 'p',
275             'e' };
276 
277     /**
278      * "ublic" as <code>char[]</code>
279      */
280     private static final @NoLength char[] UBLIC = { 'u', 'b', 'l', 'i', 'c' };
281 
282     /**
283      * "ystem" as <code>char[]</code>
284      */
285     private static final @NoLength char[] YSTEM = { 'y', 's', 't', 'e', 'm' };
286 
287     private static final char[] TITLE_ARR = { 't', 'i', 't', 'l', 'e' };
288 
289     private static final char[] SCRIPT_ARR = { 's', 'c', 'r', 'i', 'p', 't' };
290 
291     private static final char[] STYLE_ARR = { 's', 't', 'y', 'l', 'e' };
292 
293     private static final char[] PLAINTEXT_ARR = { 'p', 'l', 'a', 'i', 'n', 't',
294             'e', 'x', 't' };
295 
296     private static final char[] XMP_ARR = { 'x', 'm', 'p' };
297 
298     private static final char[] TEXTAREA_ARR = { 't', 'e', 'x', 't', 'a', 'r',
299             'e', 'a' };
300 
301     private static final char[] IFRAME_ARR = { 'i', 'f', 'r', 'a', 'm', 'e' };
302 
303     private static final char[] NOEMBED_ARR = { 'n', 'o', 'e', 'm', 'b', 'e',
304             'd' };
305 
306     private static final char[] NOSCRIPT_ARR = { 'n', 'o', 's', 'c', 'r', 'i',
307             'p', 't' };
308 
309     private static final char[] NOFRAMES_ARR = { 'n', 'o', 'f', 'r', 'a', 'm',
310             'e', 's' };
311 
312     /**
313      * The token handler.
314      */
315     protected final TokenHandler tokenHandler;
316 
317     protected EncodingDeclarationHandler encodingDeclarationHandler;
318 
319     // [NOCPP[
320 
321     /**
322      * The error handler.
323      */
324     protected ErrorHandler errorHandler;
325 
326     // ]NOCPP]
327 
328     /**
329      * Whether the previous char read was CR.
330      */
331     protected boolean lastCR;
332 
333     protected int stateSave;
334 
335     private int returnStateSave;
336 
337     protected int index;
338 
339     private boolean forceQuirks;
340 
341     private char additional;
342 
343     private int entCol;
344 
345     private int firstCharKey;
346 
347     private int lo;
348 
349     private int hi;
350 
351     private int candidate;
352 
353     private int charRefBufMark;
354 
355     protected int value;
356 
357     private boolean seenDigits;
358 
359     protected int cstart;
360 
/**
 * The SAX public id for the resource being tokenized. (Only passed back
 * as part of locator data.)
 */
365     private String publicId;
366 
/**
 * The SAX system id for the resource being tokenized. (Only passed back
 * as part of locator data.)
 */
371     private String systemId;
372 
373     /**
374      * Buffer for bufferable things other than those that fit the description
375      * of <code>charRefBuf</code>.
376      */
377     private @Auto char[] strBuf;
378 
379     /**
380      * Number of significant <code>char</code>s in <code>strBuf</code>.
381      */
382     private int strBufLen;
383 
384     /**
385      * Buffer for characters that might form a character reference but may
386      * end up not forming one.
387      */
388     private final @Auto char[] charRefBuf;
389 
390     /**
391      * Number of significant <code>char</code>s in <code>charRefBuf</code>.
392      */
393     private int charRefBufLen;
394 
395     /**
396      * Buffer for expanding NCRs falling into the Basic Multilingual Plane.
397      */
398     private final @Auto char[] bmpChar;
399 
400     /**
401      * Buffer for expanding astral NCRs.
402      */
403     private final @Auto char[] astralChar;
404 
405     /**
406      * The element whose end tag closes the current CDATA or RCDATA element.
407      */
408     protected ElementName endTagExpectation = null;
409 
410     private char[] endTagExpectationAsArray; // not @Auto!
411 
412     /**
413      * <code>true</code> if tokenizing an end tag
414      */
415     protected boolean endTag;
416 
417     /**
418      * <code>true</code> iff the current element/attribute name contains
419      * a hyphen.
420      */
421     private boolean containsHyphen;
422 
423     /**
424      * The current tag token name. One of
425      * 1) null,
426      * 2) non-owning reference to nonInternedTagName
427      * 3) non-owning reference to a pre-interned ElementName
428      */
429     private ElementName tagName = null;
430 
431     /**
432      * The recycled ElementName instance for the non-pre-interned cases.
433      */
434     private ElementName nonInternedTagName = null;
435 
436     /**
437      * The current attribute name.
438      */
439     protected AttributeName attributeName = null;
440 
441     // CPPONLY: private AttributeName nonInternedAttributeName = null;
442 
443     // [NOCPP[
444 
445     /**
446      * Whether comment tokens are emitted.
447      */
448     private boolean wantsComments = false;
449 
450     /**
451      * <code>true</code> when HTML4-specific additional errors are requested.
452      */
453     protected boolean html4;
454 
455     /**
456      * Whether the stream is past the first 1024 bytes.
457      */
458     private boolean metaBoundaryPassed;
459 
460     // ]NOCPP]
461 
462     /**
463      * The name of the current doctype token.
464      */
465     private @Local String doctypeName;
466 
467     /**
468      * The public id of the current doctype token.
469      */
470     private String publicIdentifier;
471 
472     /**
473      * The system id of the current doctype token.
474      */
475     private String systemIdentifier;
476 
477     /**
478      * The attribute holder.
479      */
480     private HtmlAttributes attributes;
481 
482     // [NOCPP[
483 
484     /**
485      * The policy for vertical tab and form feed.
486      */
487     private XmlViolationPolicy contentSpacePolicy = XmlViolationPolicy.ALTER_INFOSET;
488 
489     /**
490      * The policy for comments.
491      */
492     private XmlViolationPolicy commentPolicy = XmlViolationPolicy.ALTER_INFOSET;
493 
494     private XmlViolationPolicy xmlnsPolicy = XmlViolationPolicy.ALTER_INFOSET;
495 
496     private XmlViolationPolicy namePolicy = XmlViolationPolicy.ALTER_INFOSET;
497 
498     private boolean html4ModeCompatibleWithXhtml1Schemata;
499 
500     private int mappingLangToXmlLang;
501 
502     // ]NOCPP]
503 
504     private final boolean newAttributesEachTime;
505 
506     private boolean shouldSuspend;
507 
508     protected boolean confident;
509 
510     private int line;
511 
512     /*
513      * The line number of the current attribute. First set to the line of the
514      * attribute name and if there is a value, set to the line the value
515      * started on.
516      */
517     // CPPONLY: private int attributeLine;
518 
519     private Interner interner;
520 
521     // CPPONLY: private boolean viewingXmlSource;
522 
523     // [NOCPP[
524 
525     protected LocatorImpl ampersandLocation;
526 
/**
 * Creates a tokenizer and lets the caller decide how empty attribute
 * holders are obtained.
 *
 * @param tokenHandler
 *            the handler for receiving tokens
 * @param newAttributesEachTime
 *            if <code>true</code>, <code>emptyAttributes()</code> creates a
 *            new <code>HtmlAttributes</code> on each call instead of
 *            returning <code>HtmlAttributes.EMPTY_ATTRIBUTES</code>
 */
public Tokenizer(TokenHandler tokenHandler, boolean newAttributesEachTime) {
    // Collaborators and configuration supplied by the caller.
    this.tokenHandler = tokenHandler;
    this.newAttributesEachTime = newAttributesEachTime;
    this.encodingDeclarationHandler = null;

    // Fixed-size scratch buffers. 32 chars suffice for charRefBuf because
    // &CounterClockwiseContourIntegral; is the longest valid char ref and
    // the semicolon never gets appended to the buffer.
    this.charRefBuf = new char[32];
    this.bmpChar = new char[1];
    this.astralChar = new char[2];

    // Recycled ElementName instance for the non-pre-interned cases.
    this.nonInternedTagName = new ElementName();

    // No tag, attribute or doctype token is in progress yet.
    this.containsHyphen = false;
    this.tagName = null;
    this.attributeName = null;
    // CPPONLY: this.nonInternedAttributeName = new AttributeName();
    this.doctypeName = null;
    this.publicIdentifier = null;
    this.systemIdentifier = null;
    this.attributes = null;
}
546 
547     // ]NOCPP]
548 
/**
 * Creates a tokenizer that reports tokens to the given handler.
 *
 * <p>
 * In the Java code path this constructor fixes
 * <code>newAttributesEachTime</code> to <code>false</code> and leaves
 * <code>attributes</code> as <code>null</code>. The <code>CPPONLY</code>
 * lines are plain comments for the Java compiler; they are presumably
 * activated when the file is ported to C++ (TODO confirm against the
 * translator), so statement order and marker placement are kept as they
 * are.
 *
 * @param tokenHandler
 *            the handler for receiving tokens
 */
public Tokenizer(TokenHandler tokenHandler
// CPPONLY: , boolean viewingXmlSource
) {
    this.tokenHandler = tokenHandler;
    this.encodingDeclarationHandler = null;
    // [NOCPP[
    this.newAttributesEachTime = false;
    // ]NOCPP]
    // &CounterClockwiseContourIntegral; is the longest valid char ref and
    // the semicolon never gets appended to the buffer.
    this.charRefBuf = new char[32];
    // One code unit for a BMP character, two (a surrogate pair) for an
    // astral one.
    this.bmpChar = new char[1];
    this.astralChar = new char[2];
    this.containsHyphen = false;
    this.tagName = null;
    // Recycled ElementName instance for the non-pre-interned cases.
    this.nonInternedTagName = new ElementName();
    this.attributeName = null;
    // CPPONLY: this.nonInternedAttributeName = new AttributeName();
    this.doctypeName = null;
    this.publicIdentifier = null;
    this.systemIdentifier = null;
    // [NOCPP[
    this.attributes = null;
    // ]NOCPP]
    // CPPONLY: this.attributes = tokenHandler.HasBuilder() ? new HtmlAttributes(mappingLangToXmlLang) : null;
    // CPPONLY: this.newAttributesEachTime = !tokenHandler.HasBuilder();
    // CPPONLY: this.viewingXmlSource = viewingXmlSource;
}
583 
setInterner(Interner interner)584     public void setInterner(Interner interner) {
585         this.interner = interner;
586     }
587 
initLocation(String newPublicId, String newSystemId)588     public void initLocation(String newPublicId, String newSystemId) {
589         this.systemId = newSystemId;
590         this.publicId = newPublicId;
591 
592     }
593 
594     // CPPONLY: boolean isViewingXmlSource() {
595     // CPPONLY: return viewingXmlSource;
596     // CPPONLY: }
597 
598     // [NOCPP[
599 
600     /**
601      * Returns the mappingLangToXmlLang.
602      *
603      * @return the mappingLangToXmlLang
604      */
isMappingLangToXmlLang()605     public boolean isMappingLangToXmlLang() {
606         return mappingLangToXmlLang == AttributeName.HTML_LANG;
607     }
608 
609     /**
610      * Sets the mappingLangToXmlLang.
611      *
612      * @param mappingLangToXmlLang
613      *            the mappingLangToXmlLang to set
614      */
setMappingLangToXmlLang(boolean mappingLangToXmlLang)615     public void setMappingLangToXmlLang(boolean mappingLangToXmlLang) {
616         this.mappingLangToXmlLang = mappingLangToXmlLang ? AttributeName.HTML_LANG
617                 : AttributeName.HTML;
618     }
619 
620     /**
621      * Sets the error handler.
622      *
623      * @see org.xml.sax.XMLReader#setErrorHandler(org.xml.sax.ErrorHandler)
624      */
setErrorHandler(ErrorHandler eh)625     public void setErrorHandler(ErrorHandler eh) {
626         this.errorHandler = eh;
627     }
628 
getErrorHandler()629     public ErrorHandler getErrorHandler() {
630         return this.errorHandler;
631     }
632 
633     /**
634      * Sets the commentPolicy.
635      *
636      * @param commentPolicy
637      *            the commentPolicy to set
638      */
setCommentPolicy(XmlViolationPolicy commentPolicy)639     public void setCommentPolicy(XmlViolationPolicy commentPolicy) {
640         this.commentPolicy = commentPolicy;
641     }
642 
643     /**
644      * Sets the contentNonXmlCharPolicy.
645      *
646      * @param contentNonXmlCharPolicy
647      *            the contentNonXmlCharPolicy to set
648      */
setContentNonXmlCharPolicy( XmlViolationPolicy contentNonXmlCharPolicy)649     public void setContentNonXmlCharPolicy(
650             XmlViolationPolicy contentNonXmlCharPolicy) {
651         if (contentNonXmlCharPolicy != XmlViolationPolicy.ALLOW) {
652             throw new IllegalArgumentException(
653                     "Must use ErrorReportingTokenizer to set contentNonXmlCharPolicy to non-ALLOW.");
654         }
655     }
656 
657     /**
658      * Sets the contentSpacePolicy.
659      *
660      * @param contentSpacePolicy
661      *            the contentSpacePolicy to set
662      */
setContentSpacePolicy(XmlViolationPolicy contentSpacePolicy)663     public void setContentSpacePolicy(XmlViolationPolicy contentSpacePolicy) {
664         this.contentSpacePolicy = contentSpacePolicy;
665     }
666 
667     /**
668      * Sets the xmlnsPolicy.
669      *
670      * @param xmlnsPolicy
671      *            the xmlnsPolicy to set
672      */
setXmlnsPolicy(XmlViolationPolicy xmlnsPolicy)673     public void setXmlnsPolicy(XmlViolationPolicy xmlnsPolicy) {
674         if (xmlnsPolicy == XmlViolationPolicy.FATAL) {
675             throw new IllegalArgumentException("Can't use FATAL here.");
676         }
677         this.xmlnsPolicy = xmlnsPolicy;
678     }
679 
setNamePolicy(XmlViolationPolicy namePolicy)680     public void setNamePolicy(XmlViolationPolicy namePolicy) {
681         this.namePolicy = namePolicy;
682     }
683 
684     /**
685      * Sets the html4ModeCompatibleWithXhtml1Schemata.
686      *
687      * @param html4ModeCompatibleWithXhtml1Schemata
688      *            the html4ModeCompatibleWithXhtml1Schemata to set
689      */
setHtml4ModeCompatibleWithXhtml1Schemata( boolean html4ModeCompatibleWithXhtml1Schemata)690     public void setHtml4ModeCompatibleWithXhtml1Schemata(
691             boolean html4ModeCompatibleWithXhtml1Schemata) {
692         this.html4ModeCompatibleWithXhtml1Schemata = html4ModeCompatibleWithXhtml1Schemata;
693     }
694 
695     // ]NOCPP]
696 
697     // For the token handler to call
698 
699     /**
700      * Sets the tokenizer state and the associated element name. This should
701      * only ever used to put the tokenizer into one of the states that have
702      * a special end tag expectation.
703      *
704      * @param specialTokenizerState
705      *            the tokenizer state to set
706      */
setState(int specialTokenizerState)707     public void setState(int specialTokenizerState) {
708         this.stateSave = specialTokenizerState;
709         this.endTagExpectation = null;
710         this.endTagExpectationAsArray = null;
711     }
712 
713     // [NOCPP[
714 
715     /**
716      * Sets the tokenizer state and the associated element name. This should
717      * only ever used to put the tokenizer into one of the states that have
718      * a special end tag expectation. For use from the tokenizer test harness.
719      *
720      * @param specialTokenizerState
721      *            the tokenizer state to set
722      * @param endTagExpectation
723      *            the expected end tag for transitioning back to normal
724      */
setStateAndEndTagExpectation(int specialTokenizerState, @Local String endTagExpectation)725     public void setStateAndEndTagExpectation(int specialTokenizerState,
726             @Local String endTagExpectation) {
727         this.stateSave = specialTokenizerState;
728         if (specialTokenizerState == Tokenizer.DATA) {
729             return;
730         }
731         @Auto char[] asArray = Portability.newCharArrayFromLocal(endTagExpectation);
732         this.endTagExpectation = ElementName.elementNameByBuffer(asArray,
733                 asArray.length, interner);
734         assert this.endTagExpectation != null;
735         endTagExpectationToArray();
736     }
737 
738     // ]NOCPP]
739 
740     /**
741      * Sets the tokenizer state and the associated element name. This should
742      * only ever used to put the tokenizer into one of the states that have
743      * a special end tag expectation.
744      *
745      * @param specialTokenizerState
746      *            the tokenizer state to set
747      * @param endTagExpectation
748      *            the expected end tag for transitioning back to normal
749      */
setStateAndEndTagExpectation(int specialTokenizerState, ElementName endTagExpectation)750     public void setStateAndEndTagExpectation(int specialTokenizerState,
751             ElementName endTagExpectation) {
752         this.stateSave = specialTokenizerState;
753         this.endTagExpectation = endTagExpectation;
754         endTagExpectationToArray();
755     }
756 
endTagExpectationToArray()757     private void endTagExpectationToArray() {
758         switch (endTagExpectation.getGroup()) {
759             case TreeBuilder.TITLE:
760                 endTagExpectationAsArray = TITLE_ARR;
761                 return;
762             case TreeBuilder.SCRIPT:
763                 endTagExpectationAsArray = SCRIPT_ARR;
764                 return;
765             case TreeBuilder.STYLE:
766                 endTagExpectationAsArray = STYLE_ARR;
767                 return;
768             case TreeBuilder.PLAINTEXT:
769                 endTagExpectationAsArray = PLAINTEXT_ARR;
770                 return;
771             case TreeBuilder.XMP:
772                 endTagExpectationAsArray = XMP_ARR;
773                 return;
774             case TreeBuilder.TEXTAREA:
775                 endTagExpectationAsArray = TEXTAREA_ARR;
776                 return;
777             case TreeBuilder.IFRAME:
778                 endTagExpectationAsArray = IFRAME_ARR;
779                 return;
780             case TreeBuilder.NOEMBED:
781                 endTagExpectationAsArray = NOEMBED_ARR;
782                 return;
783             case TreeBuilder.NOSCRIPT:
784                 endTagExpectationAsArray = NOSCRIPT_ARR;
785                 return;
786             case TreeBuilder.NOFRAMES:
787                 endTagExpectationAsArray = NOFRAMES_ARR;
788                 return;
789             default:
790                 assert false: "Bad end tag expectation.";
791                 return;
792         }
793     }
794 
/**
 * Overwrites the line number reported by <code>getLineNumber()</code>.
 * For C++ use only.
 *
 * @param line
 *            the line number to store
 */
public void setLineNumber(int line) {
    // CPPONLY: this.attributeLine = line; // XXX is this needed?
    this.line = line;
}
802 
803     // start Locator impl
804 
805     /**
806      * @see org.xml.sax.Locator#getLineNumber()
807      */
getLineNumber()808     @Inline public int getLineNumber() {
809         return line;
810     }
811 
812     // [NOCPP[
813 
814     /**
815      * @see org.xml.sax.Locator#getColumnNumber()
816      */
getColumnNumber()817     @Inline public int getColumnNumber() {
818         return -1;
819     }
820 
821     /**
822      * @see org.xml.sax.Locator#getPublicId()
823      */
getPublicId()824     public String getPublicId() {
825         return publicId;
826     }
827 
828     /**
829      * @see org.xml.sax.Locator#getSystemId()
830      */
getSystemId()831     public String getSystemId() {
832         return systemId;
833     }
834 
835     // end Locator impl
836 
837     // end public API
838 
notifyAboutMetaBoundary()839     public void notifyAboutMetaBoundary() {
840         metaBoundaryPassed = true;
841     }
842 
turnOnAdditionalHtml4Errors()843     void turnOnAdditionalHtml4Errors() {
844         html4 = true;
845     }
846 
847     // ]NOCPP]
848 
/**
 * Supplies an attribute holder for a tag that has no attributes.
 *
 * <p>
 * In the Java code path a new <code>HtmlAttributes</code> is created when
 * <code>newAttributesEachTime</code> is set; otherwise the shared
 * <code>HtmlAttributes.EMPTY_ATTRIBUTES</code> instance is returned. The
 * NOCPP markers fence off everything except the shared-instance return, so
 * presumably only that line survives in the C++ port (TODO confirm).
 *
 * @return an empty attribute holder
 */
HtmlAttributes emptyAttributes() {
    // [NOCPP[
    if (newAttributesEachTime) {
        return new HtmlAttributes(mappingLangToXmlLang);
    } else {
        // ]NOCPP]
        return HtmlAttributes.EMPTY_ATTRIBUTES;
        // [NOCPP[
    }
    // ]NOCPP]
}
860 
/**
 * Appends one UTF-16 code unit to <code>charRefBuf</code> and advances
 * <code>charRefBufLen</code>. The Java code performs no explicit capacity
 * check; the buffer is allocated with 32 elements by the constructors.
 *
 * @param c
 *            the UTF-16 code unit to append
 */
@Inline private void appendCharRefBuf(char c) {
    // CPPONLY: assert charRefBufLen < charRefBuf.length:
    // CPPONLY:     "RELEASE: Attempted to overrun charRefBuf!";
    charRefBuf[charRefBufLen++] = c;
}
866 
emitOrAppendCharRefBuf(int returnState)867     private void emitOrAppendCharRefBuf(int returnState) throws SAXException {
868         if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
869             appendCharRefBufToStrBuf();
870         } else {
871             if (charRefBufLen > 0) {
872                 tokenHandler.characters(charRefBuf, 0, charRefBufLen);
873                 charRefBufLen = 0;
874             }
875         }
876     }
877 
clearStrBufAfterUse()878     @Inline private void clearStrBufAfterUse() {
879         strBufLen = 0;
880     }
881 
clearStrBufBeforeUse()882     @Inline private void clearStrBufBeforeUse() {
883         assert strBufLen == 0: "strBufLen not reset after previous use!";
884         strBufLen = 0; // no-op in the absence of bugs
885     }
886 
clearStrBufAfterOneHyphen()887     @Inline private void clearStrBufAfterOneHyphen() {
888         assert strBufLen == 1: "strBufLen length not one!";
889         assert strBuf[0] == '-': "strBuf does not start with a hyphen!";
890         strBufLen = 0;
891     }
892 
/**
 * Appends one UTF-16 code unit to <code>strBuf</code> and advances
 * <code>strBufLen</code>.
 *
 * <p>
 * The Java code performs no capacity check here. The CPPONLY lines are
 * comments for the Java compiler and presumably add a capacity
 * check-and-grow step in the C++ port (TODO confirm).
 *
 * @param c
 *            the UTF-16 code unit to append
 */
@Inline private void appendStrBuf(char c) {
    // CPPONLY: assert strBufLen < strBuf.length: "Previous buffer length insufficient.";
    // CPPONLY: if (strBufLen == strBuf.length) {
    // CPPONLY:     if (!EnsureBufferSpace(1)) {
    // CPPONLY:         assert false: "RELEASE: Unable to recover from buffer reallocation failure";
    // CPPONLY:     } // TODO: Add telemetry when outer if fires but inner does not
    // CPPONLY: }
    strBuf[strBufLen++] = c;
}
908 
/**
 * The buffer as a String. Currently only used for error reporting.
 *
 * <p>
 * Side effect: the buffer is emptied (<code>strBufLen</code> is reset to
 * zero) after the string has been created.
 *
 * <p>
 * C++ memory note: The return value must be released.
 *
 * @return the first <code>strBufLen</code> code units of
 *         <code>strBuf</code> as a string
 */
protected String strBufToString() {
    String str = Portability.newStringFromBuffer(strBuf, 0, strBufLen
        // CPPONLY: , tokenHandler, !newAttributesEachTime && attributeName == AttributeName.CLASS
    );
    clearStrBufAfterUse();
    return str;
}
924 
    /**
     * Converts the buffer contents to a local name, stores it in the
     * <code>doctypeName</code> field and then empties the buffer. Nothing is
     * returned; callers read the field.
     *
     * <p>
     * C++ memory note carried over from the previous comment: the stored
     * name is released in emitDoctypeToken() (not visible from this method).
     */
    private void strBufToDoctypeName() {
        doctypeName = Portability.newLocalNameFromBuffer(strBuf, strBufLen, interner);
        clearStrBufAfterUse();
    }
935 
936     /**
937      * Emits the buffer as character tokens.
938      *
939      * @throws SAXException
940      *             if the token handler threw
941      */
emitStrBuf()942     private void emitStrBuf() throws SAXException {
943         if (strBufLen > 0) {
944             tokenHandler.characters(strBuf, 0, strBufLen);
945             clearStrBufAfterUse();
946         }
947     }
948 
    /**
     * Appends a hyphen to the string buffer, applying the configured comment
     * policy to the resulting pair of consecutive hyphens.
     *
     * <p>
     * In the Java build the behavior depends on <code>commentPolicy</code>:
     * ALTER_INFOSET first appends a space and then continues as ALLOW; ALLOW
     * issues a warning and appends the hyphen; FATAL reports a fatal error
     * and appends nothing. With the Java-only regions removed, the method
     * reduces to appending a single hyphen.
     *
     * @throws SAXException
     *             if the error handler threw or the policy is FATAL
     */
    @Inline private void appendSecondHyphenToBogusComment() throws SAXException {
        // [NOCPP[
        switch (commentPolicy) {
            case ALTER_INFOSET:
                appendStrBuf(' ');
                // CPPONLY: MOZ_FALLTHROUGH;
            case ALLOW:
                warn("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment.");
                // ]NOCPP]
                appendStrBuf('-');
                // [NOCPP[
                break;
            case FATAL:
                fatal("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment.");
                break;
        }
        // ]NOCPP]
    }
967 
968     // [NOCPP[
maybeAppendSpaceToBogusComment()969     private void maybeAppendSpaceToBogusComment() throws SAXException {
970         switch (commentPolicy) {
971             case ALTER_INFOSET:
972                 appendStrBuf(' ');
973                 // CPPONLY: MOZ_FALLTHROUGH;
974             case ALLOW:
975                 warn("The document is not mappable to XML 1.0 due to a trailing hyphen in a comment.");
976                 break;
977             case FATAL:
978                 fatal("The document is not mappable to XML 1.0 due to a trailing hyphen in a comment.");
979                 break;
980         }
981     }
982 
983     // ]NOCPP]
984 
    /**
     * Reports consecutive hyphens in a comment and appends <code>c</code> to
     * the string buffer, adjusting the buffer first if the comment policy
     * asks for it.
     *
     * <p>
     * In the Java build: ALTER_INFOSET drops the last buffered code unit
     * (presumably the second hyphen -- confirm against the callers), appends
     * a space and a hyphen in its place and then continues as ALLOW; ALLOW
     * warns and appends <code>c</code>; FATAL reports a fatal error and
     * appends nothing. With the Java-only regions removed, only the
     * errConsecutiveHyphens() call and the append of <code>c</code> remain.
     *
     * @param c
     *            the code unit to append after the adjustment
     * @throws SAXException
     *             if the error handler threw or the policy is FATAL
     */
    @Inline private void adjustDoubleHyphenAndAppendToStrBufAndErr(char c)
            throws SAXException {
        errConsecutiveHyphens();
        // [NOCPP[
        switch (commentPolicy) {
            case ALTER_INFOSET:
                strBufLen--;
                // WARNING!!! This expands the worst case of the buffer length
                // given the length of input!
                appendStrBuf(' ');
                appendStrBuf('-');
                // CPPONLY: MOZ_FALLTHROUGH;
            case ALLOW:
                warn("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment.");
                // ]NOCPP]
                appendStrBuf(c);
                // [NOCPP[
                break;
            case FATAL:
                fatal("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment.");
                break;
        }
        // ]NOCPP]
    }
1009 
    /**
     * Appends a range of UTF-16 code units to the end of the string buffer
     * and advances the buffer length accordingly. The Java code does not
     * grow the buffer here; the commented C++-only lines add a capacity
     * check and reallocation for the C++ build.
     *
     * @param buffer
     *            the array to copy from
     * @param offset
     *            index in <code>buffer</code> of the first code unit to copy
     * @param length
     *            the number of code units to copy
     */
    private void appendStrBuf(@NoLength char[] buffer, int offset, int length) {
        int newLen = strBufLen + length;
        // CPPONLY: assert newLen <= strBuf.length: "Previous buffer length insufficient.";
        // CPPONLY: if (strBuf.length < newLen) {
        // CPPONLY:     if (!EnsureBufferSpace(length)) {
        // CPPONLY:         assert false: "RELEASE: Unable to recover from buffer reallocation failure";
        // CPPONLY:     } // TODO: Add telemetry when outer if fires but inner does not
        // CPPONLY: }
        System.arraycopy(buffer, offset, strBuf, strBufLen, length);
        strBufLen = newLen;
    }
1021 
    /**
     * Appends the contents of the char reference buffer to the main string
     * buffer and then empties the char reference buffer by resetting its
     * length to zero.
     */
    @Inline private void appendCharRefBufToStrBuf() {
        appendStrBuf(charRefBuf, 0, charRefBufLen);
        charRefBufLen = 0;
    }
1029 
    /**
     * Emits the current comment token from the string buffer, empties the
     * buffer and restarts character coalescing after <code>pos</code>.
     *
     * <p>
     * In the Java build the comment is passed to the token handler only when
     * <code>wantsComments</code> is set; the buffer is cleared either way.
     *
     * @param provisionalHyphens
     *            the number of code units at the end of the buffer that are
     *            left out of the emitted comment
     * @param pos
     *            the current position in the input buffer; the next run of
     *            character tokens is recorded as starting at
     *            <code>pos + 1</code>
     *
     * @throws SAXException
     *             if the token handler threw
     */
    private void emitComment(int provisionalHyphens, int pos)
            throws SAXException {
        // [NOCPP[
        if (wantsComments) {
            // ]NOCPP]
            tokenHandler.comment(strBuf, 0, strBufLen
                    - provisionalHyphens);
            // [NOCPP[
        }
        // ]NOCPP]
        clearStrBufAfterUse();
        cstart = pos + 1;
    }
1051 
1052     /**
1053      * Flushes coalesced character tokens.
1054      *
1055      * @param buf
1056      *            TODO
1057      * @param pos
1058      *            TODO
1059      *
1060      * @throws SAXException
1061      */
flushChars(@oLength char[] buf, int pos)1062     protected void flushChars(@NoLength char[] buf, int pos)
1063             throws SAXException {
1064         if (pos > cstart) {
1065             tokenHandler.characters(buf, cstart, pos - cstart);
1066         }
1067         cstart = Integer.MAX_VALUE;
1068     }
1069 
1070     /**
1071      * Reports an condition that would make the infoset incompatible with XML
1072      * 1.0 as fatal.
1073      *
1074      * @param message
1075      *            the message
1076      * @throws SAXException
1077      * @throws SAXParseException
1078      */
fatal(String message)1079     public void fatal(String message) throws SAXException {
1080         SAXParseException spe = new SAXParseException(message, this);
1081         if (errorHandler != null) {
1082             errorHandler.fatalError(spe);
1083         }
1084         throw spe;
1085     }
1086 
1087     /**
1088      * Reports a Parse Error.
1089      *
1090      * @param message
1091      *            the message
1092      * @throws SAXException
1093      */
err(String message)1094     public void err(String message) throws SAXException {
1095         if (errorHandler == null) {
1096             return;
1097         }
1098         SAXParseException spe = new SAXParseException(message, this);
1099         errorHandler.error(spe);
1100     }
1101 
errTreeBuilder(String message)1102     public void errTreeBuilder(String message) throws SAXException {
1103         ErrorHandler eh = null;
1104         if (tokenHandler instanceof TreeBuilder<?>) {
1105             TreeBuilder<?> treeBuilder = (TreeBuilder<?>) tokenHandler;
1106             eh = treeBuilder.getErrorHandler();
1107         }
1108         if (eh == null) {
1109             eh = errorHandler;
1110         }
1111         if (eh == null) {
1112             return;
1113         }
1114         SAXParseException spe = new SAXParseException(message, this);
1115         eh.error(spe);
1116     }
1117 
1118     /**
1119      * Reports a warning
1120      *
1121      * @param message
1122      *            the message
1123      * @throws SAXException
1124      */
warn(String message)1125     public void warn(String message) throws SAXException {
1126         if (errorHandler == null) {
1127             return;
1128         }
1129         SAXParseException spe = new SAXParseException(message, this);
1130         errorHandler.warning(spe);
1131     }
1132 
    /**
     * Resolves the tag name held in the string buffer into
     * <code>tagName</code>, then resets <code>containsHyphen</code> and
     * empties the buffer.
     *
     * <p>
     * A name containing a hyphen is compared against the name of
     * ElementName.ANNOTATION_XML; any other hyphenated name is stored in the
     * reusable <code>nonInternedTagName</code> holder. A name without a
     * hyphen is looked up with ElementName.elementNameByBuffer(), and falls
     * back to the same holder when the lookup returns null.
     */
    private void strBufToElementNameString() {
        if (containsHyphen) {
            // We've got a custom element or annotation-xml.
            @Local String annotationName = ElementName.ANNOTATION_XML.getName();
            if (Portability.localEqualsBuffer(annotationName, strBuf, strBufLen)) {
                tagName = ElementName.ANNOTATION_XML;
            } else {
                nonInternedTagName.setNameForNonInterned(Portability.newLocalNameFromBuffer(strBuf, strBufLen,
                        interner)
                        // CPPONLY: , true
                        );
                tagName = nonInternedTagName;
            }
        } else {
            tagName = ElementName.elementNameByBuffer(strBuf, strBufLen, interner);
            if (tagName == null) {
                // Not a known element name: use the reusable holder instead.
                nonInternedTagName.setNameForNonInterned(Portability.newLocalNameFromBuffer(strBuf, strBufLen,
                    interner)
                        // CPPONLY: , false
                        );
                tagName = nonInternedTagName;
            }
        }
        containsHyphen = false;
        clearStrBufAfterUse();
    }
1159 
    /**
     * Emits the current start or end tag token to the token handler and
     * resets the per-tag state (<code>tagName</code> and
     * <code>attributes</code>).
     *
     * @param selfClosing
     *            whether the tag was written with a trailing solidus; passed
     *            to startTag() and to maybeErrSlashInEndTag()
     * @param pos
     *            the current position in the input buffer; the next run of
     *            character tokens is recorded as starting at
     *            <code>pos + 1</code>
     * @return the value of <code>stateSave</code> after the token handler
     *         has run: Tokenizer.DATA unless the handler changed it
     * @throws SAXException
     *             if the token handler or the error handler threw
     */
    private int emitCurrentTagToken(boolean selfClosing, int pos)
            throws SAXException {
        cstart = pos + 1;
        maybeErrSlashInEndTag(selfClosing);
        // Default next state; the token handler may override it below.
        stateSave = Tokenizer.DATA;
        // Never hand a null attribute holder to the token handler.
        HtmlAttributes attrs = (attributes == null ? HtmlAttributes.EMPTY_ATTRIBUTES
                : attributes);
        if (endTag) {
            /*
             * When an end tag token is emitted, the content model flag must be
             * switched to the PCDATA state.
             */
            maybeErrAttributesOnEndTag(attrs);
            // CPPONLY: if (!viewingXmlSource) {
            tokenHandler.endTag(tagName);
            // CPPONLY: }
            // CPPONLY: if (newAttributesEachTime) {
            // CPPONLY:   Portability.delete(attributes);
            // CPPONLY:   attributes = null;
            // CPPONLY: }
        } else {
            // CPPONLY: if (viewingXmlSource) {
            // CPPONLY:   assert newAttributesEachTime;
            // CPPONLY:   Portability.delete(attributes);
            // CPPONLY:   attributes = null;
            // CPPONLY: } else {
            tokenHandler.startTag(tagName, attrs, selfClosing);
            // CPPONLY: }
        }
        tagName = null;
        if (newAttributesEachTime) {
            attributes = null;
        } else {
            // NOTE(review): this dereferences attributes without a null
            // check; presumably the holder is always allocated when
            // newAttributesEachTime is false -- confirm.
            attributes.clear(mappingLangToXmlLang);
        }
        /*
         * The token handler may have called setStateAndEndTagExpectation
         * and changed stateSave since the start of this method.
         */
        return stateSave;
    }
1201 
    /**
     * Finishes the attribute name held in the string buffer: resolves it to
     * an AttributeName, empties the buffer, makes sure an attribute holder
     * exists and drops the name if the current tag already has an attribute
     * with that name.
     *
     * <p>
     * After this method <code>attributeName</code> is null when the
     * attribute is a duplicate; addAttributeWithValue() and
     * addAttributeWithoutValue() check for that and discard the attribute.
     *
     * @throws SAXException
     *             if the error handler threw
     */
    private void attributeNameComplete() throws SAXException {
        attributeName = AttributeName.nameByBuffer(strBuf, strBufLen, interner);
        if (attributeName == null) {
            // Unknown name: build an attribute name from the buffer contents.
            // [NOCPP[
            attributeName = AttributeName.createAttributeName(
                    Portability.newLocalNameFromBuffer(strBuf, strBufLen,
                            interner),
                    namePolicy != XmlViolationPolicy.ALLOW);
            // ]NOCPP]
            // CPPONLY:     nonInternedAttributeName.setNameForNonInterned(Portability.newLocalNameFromBuffer(strBuf, strBufLen, interner));
            // CPPONLY:     attributeName = nonInternedAttributeName;
        }
        clearStrBufAfterUse();

        if (attributes == null) {
            attributes = new HtmlAttributes(mappingLangToXmlLang);
        }

        /*
         * When the user agent leaves the attribute name state (and before
         * emitting the tag token, if appropriate), the complete attribute's
         * name must be compared to the other attributes on the same token; if
         * there is already an attribute on the token with the exact same name,
         * then this is a parse error and the new attribute must be dropped,
         * along with the value that gets associated with it (if any).
         */
        if (attributes.contains(attributeName)) {
            errDuplicateAttribute();
            attributeName = null;
        }
    }
1233 
    /**
     * Adds the current attribute, which was written without a value, to the
     * attribute holder and clears <code>attributeName</code>.
     *
     * <p>
     * In the Java build the value depends on the mode. In HTML4 mode a
     * boolean attribute gets its own local name as the value when
     * <code>html4ModeCompatibleWithXhtml1Schemata</code> is set and the
     * empty string otherwise; a non-boolean attribute other than
     * <code>border</code> is reported as an error and added with the empty
     * string, while a valueless <code>border</code> is not added at all.
     * Outside HTML4 mode the attribute is added with an empty string, after
     * a warning for <code>src</code> and <code>href</code>.
     *
     * <p>
     * When <code>attributeName</code> is null (attributeNameComplete() sets
     * it to null for a duplicate attribute) nothing is added and the string
     * buffer is cleared.
     *
     * @throws SAXException
     *             if the error handler threw
     */
    private void addAttributeWithoutValue() throws SAXException {
        noteAttributeWithoutValue();

        // [NOCPP[
        if (metaBoundaryPassed && AttributeName.CHARSET == attributeName
                && ElementName.META == tagName) {
            err("A \u201Ccharset\u201D attribute on a \u201Cmeta\u201D element found after the first 512 bytes.");
        }
        // ]NOCPP]
        if (attributeName != null) {
            // [NOCPP[
            if (html4) {
                if (attributeName.isBoolean()) {
                    if (html4ModeCompatibleWithXhtml1Schemata) {
                        attributes.addAttribute(attributeName,
                                attributeName.getLocal(AttributeName.HTML),
                                xmlnsPolicy);
                    } else {
                        attributes.addAttribute(attributeName, "", xmlnsPolicy);
                    }
                } else {
                    if (AttributeName.BORDER != attributeName) {
                        err("Attribute value omitted for a non-boolean attribute. (HTML4-only error.)");
                        attributes.addAttribute(attributeName, "", xmlnsPolicy);
                    }
                }
            } else {
                if (AttributeName.SRC == attributeName
                        || AttributeName.HREF == attributeName) {
                    warn("Attribute \u201C"
                            + attributeName.getLocal(AttributeName.HTML)
                            + "\u201D without an explicit value seen. The attribute may be dropped by IE7.");
                }
                // ]NOCPP]
                attributes.addAttribute(attributeName,
                        Portability.newEmptyString()
                        // [NOCPP[
                        , xmlnsPolicy
                // ]NOCPP]
                // CPPONLY: , attributeLine
                );
                // [NOCPP[
            }
            // ]NOCPP]
            attributeName = null;
        } else {
            clearStrBufAfterUse();
        }
    }
1283 
    /**
     * Adds the current attribute to the attribute holder, using the contents
     * of the string buffer as its value, and clears
     * <code>attributeName</code>. The buffer is emptied in every case.
     *
     * <p>
     * In the Java build, the value is ASCII-lowercased first when this is a
     * start tag in HTML4 mode with
     * <code>html4ModeCompatibleWithXhtml1Schemata</code> set and the
     * attribute is case-folded. When <code>attributeName</code> is null
     * (a duplicate attribute) the buffered value is discarded.
     *
     * @throws SAXException
     *             if the error handler threw
     */
    private void addAttributeWithValue() throws SAXException {
        // [NOCPP[
        if (metaBoundaryPassed && ElementName.META == tagName
                && AttributeName.CHARSET == attributeName) {
            err("A \u201Ccharset\u201D attribute on a \u201Cmeta\u201D element found after the first 512 bytes.");
        }
        // ]NOCPP]
        if (attributeName != null) {
            String val = strBufToString(); // Ownership transferred to
            // HtmlAttributes
            // CPPONLY: if (mViewSource) {
            // CPPONLY:   mViewSource.MaybeLinkifyAttributeValue(attributeName, val);
            // CPPONLY: }
            // [NOCPP[
            if (!endTag && html4 && html4ModeCompatibleWithXhtml1Schemata
                    && attributeName.isCaseFolded()) {
                val = newAsciiLowerCaseStringFromString(val);
            }
            // ]NOCPP]
            attributes.addAttribute(attributeName, val
            // [NOCPP[
                    , xmlnsPolicy
            // ]NOCPP]
            // CPPONLY: , attributeLine
            );
            attributeName = null;
        } else {
            // We have a duplicate attribute. Explicitly discard its value.
            clearStrBufAfterUse();
        }
    }
1315 
1316     // [NOCPP[
1317 
newAsciiLowerCaseStringFromString(String str)1318     private static String newAsciiLowerCaseStringFromString(String str) {
1319         if (str == null) {
1320             return null;
1321         }
1322         char[] buf = new char[str.length()];
1323         for (int i = 0; i < str.length(); i++) {
1324             char c = str.charAt(i);
1325             if (c >= 'A' && c <= 'Z') {
1326                 c += 0x20;
1327             }
1328             buf[i] = c;
1329         }
1330         return new String(buf);
1331     }
1332 
    /**
     * Hook called by start() after the token handler has been notified that
     * tokenization is starting. This implementation does nothing.
     * NOTE(review): presumably intended to be overridden by subclasses that
     * report errors -- confirm.
     *
     * @throws SAXException
     *             never thrown by this empty implementation
     */
    protected void startErrorReporting() throws SAXException {

    }
1336 
1337     // ]NOCPP]
1338 
    /**
     * Starts a tokenization run: calls initializeWithoutStarting(), notifies
     * the token handler through startTokenization(this) and, in the Java
     * build, invokes the startErrorReporting() hook.
     *
     * @throws SAXException
     *             if the token handler or the error reporting hook threw
     */
    public void start() throws SAXException {
        initializeWithoutStarting();
        tokenHandler.startTokenization(this);
        // [NOCPP[
        startErrorReporting();
        // ]NOCPP]
    }
1346 
    /**
     * Tokenizes the unread part of the given buffer, resuming from the saved
     * tokenizer state, and advances the buffer's start past what was
     * consumed.
     *
     * @param buffer
     *            the buffer to read from; its start index is updated before
     *            returning
     * @return the value of <code>lastCR</code>, which is cleared on entry;
     *         per the CR handling notes in stateLoop() it is set when
     *         tokenization stopped on a CR
     * @throws SAXException
     *             if the token handler or the error handler threw
     */
    public boolean tokenizeBuffer(UTF16Buffer buffer) throws SAXException {
        int state = stateSave;
        int returnState = returnStateSave;
        char c = '\u0000';
        shouldSuspend = false;
        lastCR = false;

        int start = buffer.getStart();
        int end = buffer.getEnd();

        // In C++, the caller of tokenizeBuffer needs to do this explicitly.
        // [NOCPP[
        ensureBufferSpace(end - start);
        // ]NOCPP]

        /**
         * The index of the last <code>char</code> read from <code>buf</code>.
         */
        int pos = start - 1;

        /*
         * Initialize cstart, the index of the first char in the buffer that
         * is part of a coalesced run of character tokens. States listed
         * below start a run at the beginning of this buffer; in every other
         * state cstart is set to Integer.MAX_VALUE to indicate that no run
         * is currently being coalesced.
         */
        switch (state) {
            case DATA:
            case RCDATA:
            case SCRIPT_DATA:
            case PLAINTEXT:
            case RAWTEXT:
            case CDATA_SECTION:
            case SCRIPT_DATA_ESCAPED:
            case SCRIPT_DATA_ESCAPE_START:
            case SCRIPT_DATA_ESCAPE_START_DASH:
            case SCRIPT_DATA_ESCAPED_DASH:
            case SCRIPT_DATA_ESCAPED_DASH_DASH:
            case SCRIPT_DATA_DOUBLE_ESCAPE_START:
            case SCRIPT_DATA_DOUBLE_ESCAPED:
            case SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN:
            case SCRIPT_DATA_DOUBLE_ESCAPED_DASH:
            case SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH:
            case SCRIPT_DATA_DOUBLE_ESCAPE_END:
                cstart = start;
                break;
            default:
                cstart = Integer.MAX_VALUE;
                break;
        }

        /*
         * Run the state machine up to the end index of the buffer. (The rest
         * of the underlying array is garbage and should not be examined.)
         */
        // CPPONLY: if (mViewSource) {
        // CPPONLY:   mViewSource.SetBuffer(buffer);
        // CPPONLY:   pos = stateLoop(state, c, pos, buffer.getBuffer(), false, returnState, buffer.getEnd());
        // CPPONLY:   mViewSource.DropBuffer((pos == buffer.getEnd()) ? pos : pos + 1);
        // CPPONLY: } else {
        // CPPONLY:   pos = stateLoop(state, c, pos, buffer.getBuffer(), false, returnState, buffer.getEnd());
        // CPPONLY: }
        // [NOCPP[
        pos = stateLoop(state, c, pos, buffer.getBuffer(), false, returnState,
                end);
        // ]NOCPP]
        if (pos == end) {
            // exiting due to end of buffer
            buffer.setStart(pos);
        } else {
            // stopped early at pos: resume with the character after it
            buffer.setStart(pos + 1);
        }
        return lastCR;
    }
1422 
1423     // [NOCPP[
ensureBufferSpace(int inputLength)1424     private void ensureBufferSpace(int inputLength) throws SAXException {
1425         // Add 2 to account for emissions of LT_GT, LT_SOLIDUS and RSQB_RSQB.
1426         // Adding to the general worst case instead of only the
1427         // TreeBuilder-exposed worst case to avoid re-introducing a bug when
1428         // unifying the tokenizer and tree builder buffers in the future.
1429         int worstCase = strBufLen + inputLength + charRefBufLen + 2;
1430         tokenHandler.ensureBufferSpace(worstCase);
1431         if (commentPolicy == XmlViolationPolicy.ALTER_INFOSET) {
1432             // When altering infoset, if the comment contents are consecutive
1433             // hyphens, each hyphen generates a space, too. These buffer
1434             // contents never get emitted as characters() to the tokenHandler,
1435             // which is why this calculation happens after the call to
1436             // ensureBufferSpace on tokenHandler.
1437             worstCase *= 2;
1438         }
1439         if (strBuf == null) {
1440             // Add an arbitrary small value to avoid immediate reallocation
1441             // once there are a few characters in the buffer.
1442             strBuf = new char[worstCase + 128];
1443         } else if (worstCase > strBuf.length) {
1444             // HotSpot reportedly allocates memory with 8-byte accuracy, so
1445             // there's no point in trying to do math here to avoid slop.
1446             // Maybe we should add some small constant to worstCase here
1447             // but not doing that without profiling. In C++ with jemalloc,
1448             // the corresponding method should do math to round up here
1449             // to avoid slop.
1450             char[] newBuf = new char[worstCase];
1451             System.arraycopy(strBuf, 0, newBuf, 0, strBufLen);
1452             strBuf = newBuf;
1453         }
1454     }
1455     // ]NOCPP]
1456 
stateLoop(int state, char c, int pos, @NoLength char[] buf, boolean reconsume, int returnState, int endPos)1457     @SuppressWarnings("unused") private int stateLoop(int state, char c,
1458             int pos, @NoLength char[] buf, boolean reconsume, int returnState,
1459             int endPos) throws SAXException {
1460         /*
1461          * Idioms used in this code:
1462          *
1463          *
1464          * Consuming the next input character
1465          *
1466          * To consume the next input character, the code does this: if (++pos ==
1467          * endPos) { break stateloop; } c = checkChar(buf, pos);
1468          *
1469          *
1470          * Staying in a state
1471          *
1472          * When there's a state that the tokenizer may stay in over multiple
1473          * input characters, the state has a wrapper |for(;;)| loop and staying
1474          * in the state continues the loop.
1475          *
1476          *
1477          * Switching to another state
1478          *
1479          * To switch to another state, the code sets the state variable to the
1480          * magic number of the new state. Then it either continues stateloop or
1481          * breaks out of the state's own wrapper loop if the target state is
1482          * right after the current state in source order. (This is a partial
1483          * workaround for Java's lack of goto.)
1484          *
1485          *
1486          * Reconsume support
1487          *
1488          * The spec sometimes says that an input character is reconsumed in
1489          * another state. If a state can ever be entered so that an input
1490          * character can be reconsumed in it, the state's code starts with an
1491          * |if (reconsume)| that sets reconsume to false and skips over the
1492          * normal code for consuming a new character.
1493          *
1494          * To reconsume the current character in another state, the code sets
1495          * |reconsume| to true and then switches to the other state.
1496          *
1497          *
1498          * Emitting character tokens
1499          *
1500          * This method emits character tokens lazily. Whenever a new range of
1501          * character tokens starts, the field cstart must be set to the start
1502          * index of the range. The flushChars() method must be called at the end
1503          * of a range to flush it.
1504          *
1505          *
1506          * U+0000 handling
1507          *
1508          * The various states have to handle the replacement of U+0000 with
1509          * U+FFFD. However, if U+0000 would be reconsumed in another state, the
1510          * replacement doesn't need to happen, because it's handled by the
1511          * reconsuming state.
1512          *
1513          *
1514          * LF handling
1515          *
1516          * Every state needs to increment the line number upon LF unless the LF
1517          * gets reconsumed by another state which increments the line number.
1518          *
1519          *
1520          * CR handling
1521          *
1522          * Every state needs to handle CR unless the CR gets reconsumed and is
1523          * handled by the reconsuming state. The CR needs to be handled as if it
         * were an LF, the lastCR field must be set to true and then this
1525          * method must return. The IO driver will then swallow the next
1526          * character if it is an LF to coalesce CRLF.
1527          */
1528         stateloop: for (;;) {
1529             switch (state) {
1530                 case DATA:
1531                     dataloop: for (;;) {
1532                         if (reconsume) {
1533                             reconsume = false;
1534                         } else {
1535                             if (++pos == endPos) {
1536                                 break stateloop;
1537                             }
1538                             c = checkChar(buf, pos);
1539                         }
1540                         switch (c) {
1541                             case '&':
1542                                 /*
1543                                  * U+0026 AMPERSAND (&) Switch to the character
1544                                  * reference in data state.
1545                                  */
1546                                 flushChars(buf, pos);
1547                                 assert charRefBufLen == 0: "charRefBufLen not reset after previous use!";
1548                                 appendCharRefBuf(c);
1549                                 setAdditionalAndRememberAmpersandLocation('\u0000');
1550                                 returnState = state;
1551                                 state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos);
1552                                 continue stateloop;
1553                             case '<':
1554                                 /*
1555                                  * U+003C LESS-THAN SIGN (<) Switch to the tag
1556                                  * open state.
1557                                  */
1558                                 flushChars(buf, pos);
1559 
1560                                 state = transition(state, Tokenizer.TAG_OPEN, reconsume, pos);
1561                                 break dataloop; // FALL THROUGH continue
1562                             // stateloop;
1563                             case '\u0000':
1564                                 emitReplacementCharacter(buf, pos);
1565                                 continue;
1566                             case '\r':
1567                                 emitCarriageReturn(buf, pos);
1568                                 break stateloop;
1569                             case '\n':
1570                                 silentLineFeed();
1571                                 // CPPONLY: MOZ_FALLTHROUGH;
1572                             default:
1573                                 /*
1574                                  * Anything else Emit the input character as a
1575                                  * character token.
1576                                  *
1577                                  * Stay in the data state.
1578                                  */
1579                                 continue;
1580                         }
1581                     }
1582                     // CPPONLY: MOZ_FALLTHROUGH;
1583                 case TAG_OPEN:
1584                     tagopenloop: for (;;) {
1585                         /*
1586                          * The behavior of this state depends on the content
1587                          * model flag.
1588                          */
1589                         if (++pos == endPos) {
1590                             break stateloop;
1591                         }
1592                         c = checkChar(buf, pos);
1593                         /*
1594                          * If the content model flag is set to the PCDATA state
1595                          * Consume the next input character:
1596                          */
1597                         if (c >= 'A' && c <= 'Z') {
1598                             /*
1599                              * U+0041 LATIN CAPITAL LETTER A through to U+005A
1600                              * LATIN CAPITAL LETTER Z Create a new start tag
1601                              * token,
1602                              */
1603                             endTag = false;
1604                             /*
1605                              * set its tag name to the lowercase version of the
1606                              * input character (add 0x0020 to the character's
1607                              * code point),
1608                              */
1609                             clearStrBufBeforeUse();
1610                             appendStrBuf((char) (c + 0x20));
1611                             containsHyphen = false;
1612                             /* then switch to the tag name state. */
1613                             state = transition(state, Tokenizer.TAG_NAME, reconsume, pos);
1614                             /*
1615                              * (Don't emit the token yet; further details will
1616                              * be filled in before it is emitted.)
1617                              */
1618                             break tagopenloop;
1619                             // continue stateloop;
1620                         } else if (c >= 'a' && c <= 'z') {
1621                             /*
1622                              * U+0061 LATIN SMALL LETTER A through to U+007A
1623                              * LATIN SMALL LETTER Z Create a new start tag
1624                              * token,
1625                              */
1626                             endTag = false;
1627                             /*
1628                              * set its tag name to the input character,
1629                              */
1630                             clearStrBufBeforeUse();
1631                             appendStrBuf(c);
1632                             containsHyphen = false;
1633                             /* then switch to the tag name state. */
1634                             state = transition(state, Tokenizer.TAG_NAME, reconsume, pos);
1635                             /*
1636                              * (Don't emit the token yet; further details will
1637                              * be filled in before it is emitted.)
1638                              */
1639                             break tagopenloop;
1640                             // continue stateloop;
1641                         }
1642                         switch (c) {
1643                             case '!':
1644                                 /*
1645                                  * U+0021 EXCLAMATION MARK (!) Switch to the
1646                                  * markup declaration open state.
1647                                  */
1648                                 state = transition(state, Tokenizer.MARKUP_DECLARATION_OPEN, reconsume, pos);
1649                                 continue stateloop;
1650                             case '/':
1651                                 /*
1652                                  * U+002F SOLIDUS (/) Switch to the close tag
1653                                  * open state.
1654                                  */
1655                                 state = transition(state, Tokenizer.CLOSE_TAG_OPEN, reconsume, pos);
1656                                 continue stateloop;
1657                             case '?':
1658                                 // CPPONLY: if (viewingXmlSource) {
1659                                 // CPPONLY: state = transition(state,
1660                                 // CPPONLY: Tokenizer.PROCESSING_INSTRUCTION,
1661                                 // CPPONLY: reconsume,
1662                                 // CPPONLY: pos);
1663                                 // CPPONLY: continue stateloop;
1664                                 // CPPONLY: }
1665                                 /*
1666                                  * U+003F QUESTION MARK (?) Parse error.
1667                                  */
1668                                 errProcessingInstruction();
1669                                 /*
1670                                  * Switch to the bogus comment state.
1671                                  */
1672                                 clearStrBufBeforeUse();
1673                                 appendStrBuf(c);
1674                                 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
1675                                 continue stateloop;
1676                             case '>':
1677                                 /*
1678                                  * U+003E GREATER-THAN SIGN (>) Parse error.
1679                                  */
1680                                 errLtGt();
1681                                 /*
1682                                  * Emit a U+003C LESS-THAN SIGN character token
1683                                  * and a U+003E GREATER-THAN SIGN character
1684                                  * token.
1685                                  */
1686                                 tokenHandler.characters(Tokenizer.LT_GT, 0, 2);
1687                                 /* Switch to the data state. */
1688                                 cstart = pos + 1;
1689                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
1690                                 continue stateloop;
1691                             default:
1692                                 /*
1693                                  * Anything else Parse error.
1694                                  */
1695                                 errBadCharAfterLt(c);
1696                                 /*
1697                                  * Emit a U+003C LESS-THAN SIGN character token
1698                                  */
1699                                 tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
1700                                 /*
1701                                  * and reconsume the current input character in
1702                                  * the data state.
1703                                  */
1704                                 cstart = pos;
1705                                 reconsume = true;
1706                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
1707                                 continue stateloop;
1708                         }
1709                     }
1710                     // CPPONLY: MOZ_FALLTHROUGH;
1711                 case TAG_NAME:
1712                     tagnameloop: for (;;) {
1713                         if (++pos == endPos) {
1714                             break stateloop;
1715                         }
1716                         c = checkChar(buf, pos);
1717                         /*
1718                          * Consume the next input character:
1719                          */
1720                         switch (c) {
1721                             case '\r':
1722                                 silentCarriageReturn();
1723                                 strBufToElementNameString();
1724                                 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
1725                                 break stateloop;
1726                             case '\n':
1727                                 silentLineFeed();
1728                                 // CPPONLY: MOZ_FALLTHROUGH;
1729                             case ' ':
1730                             case '\t':
1731                             case '\u000C':
1732                                 /*
1733                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED
1734                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE
1735                                  * Switch to the before attribute name state.
1736                                  */
1737                                 strBufToElementNameString();
1738                                 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
1739                                 break tagnameloop;
1740                             // continue stateloop;
1741                             case '/':
1742                                 /*
1743                                  * U+002F SOLIDUS (/) Switch to the self-closing
1744                                  * start tag state.
1745                                  */
1746                                 strBufToElementNameString();
1747                                 state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);
1748                                 continue stateloop;
1749                             case '>':
1750                                 /*
1751                                  * U+003E GREATER-THAN SIGN (>) Emit the current
1752                                  * tag token.
1753                                  */
1754                                 strBufToElementNameString();
1755                                 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
1756                                 if (shouldSuspend) {
1757                                     break stateloop;
1758                                 }
1759                                 /*
1760                                  * Switch to the data state.
1761                                  */
1762                                 continue stateloop;
1763                             case '\u0000':
1764                                 c = '\uFFFD';
1765                                 // CPPONLY: MOZ_FALLTHROUGH;
1766                             default:
1767                                 if (c >= 'A' && c <= 'Z') {
1768                                     /*
1769                                      * U+0041 LATIN CAPITAL LETTER A through to
1770                                      * U+005A LATIN CAPITAL LETTER Z Append the
1771                                      * lowercase version of the current input
1772                                      * character (add 0x0020 to the character's
1773                                      * code point) to the current tag token's
1774                                      * tag name.
1775                                      */
1776                                     c += 0x20;
1777                                 } else if (c == '-') {
1778                                     containsHyphen = true;
1779                                 }
1780                                 /*
1781                                  * Anything else Append the current input
1782                                  * character to the current tag token's tag
1783                                  * name.
1784                                  */
1785                                 appendStrBuf(c);
1786                                 /*
1787                                  * Stay in the tag name state.
1788                                  */
1789                                 continue;
1790                         }
1791                     }
1792                     // CPPONLY: MOZ_FALLTHROUGH;
1793                 case BEFORE_ATTRIBUTE_NAME:
1794                     beforeattributenameloop: for (;;) {
1795                         if (reconsume) {
1796                             reconsume = false;
1797                         } else {
1798                             if (++pos == endPos) {
1799                                 break stateloop;
1800                             }
1801                             c = checkChar(buf, pos);
1802                         }
1803                         /*
1804                          * Consume the next input character:
1805                          */
1806                         switch (c) {
1807                             case '\r':
1808                                 silentCarriageReturn();
1809                                 break stateloop;
1810                             case '\n':
1811                                 silentLineFeed();
1812                                 // CPPONLY: MOZ_FALLTHROUGH;
1813                             case ' ':
1814                             case '\t':
1815                             case '\u000C':
1816                                 /*
1817                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED
1818                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
1819                                  * in the before attribute name state.
1820                                  */
1821                                 continue;
1822                             case '/':
1823                                 /*
1824                                  * U+002F SOLIDUS (/) Switch to the self-closing
1825                                  * start tag state.
1826                                  */
1827                                 state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);
1828                                 continue stateloop;
1829                             case '>':
1830                                 /*
1831                                  * U+003E GREATER-THAN SIGN (>) Emit the current
1832                                  * tag token.
1833                                  */
1834                                 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
1835                                 if (shouldSuspend) {
1836                                     break stateloop;
1837                                 }
1838                                 /*
1839                                  * Switch to the data state.
1840                                  */
1841                                 continue stateloop;
1842                             case '\u0000':
1843                                 c = '\uFFFD';
1844                                 // CPPONLY: MOZ_FALLTHROUGH;
1845                             case '\"':
1846                             case '\'':
1847                             case '<':
1848                             case '=':
1849                                 /*
1850                                  * U+0022 QUOTATION MARK (") U+0027 APOSTROPHE
1851                                  * (') U+003C LESS-THAN SIGN (<) U+003D EQUALS
1852                                  * SIGN (=) Parse error.
1853                                  */
1854                                 errBadCharBeforeAttributeNameOrNull(c);
1855                                 /*
1856                                  * Treat it as per the "anything else" entry
1857                                  * below.
1858                                  */
1859                                 // CPPONLY: MOZ_FALLTHROUGH;
1860                             default:
1861                                 /*
1862                                  * Anything else Start a new attribute in the
1863                                  * current tag token.
1864                                  */
1865                                 if (c >= 'A' && c <= 'Z') {
1866                                     /*
1867                                      * U+0041 LATIN CAPITAL LETTER A through to
1868                                      * U+005A LATIN CAPITAL LETTER Z Set that
1869                                      * attribute's name to the lowercase version
1870                                      * of the current input character (add
1871                                      * 0x0020 to the character's code point)
1872                                      */
1873                                     c += 0x20;
1874                                 }
1875                                 // CPPONLY: attributeLine = line;
1876                                 /*
1877                                  * Set that attribute's name to the current
1878                                  * input character,
1879                                  */
1880                                 clearStrBufBeforeUse();
1881                                 appendStrBuf(c);
1882                                 /*
1883                                  * and its value to the empty string.
1884                                  */
1885                                 // Will do later.
1886                                 /*
1887                                  * Switch to the attribute name state.
1888                                  */
1889                                 state = transition(state, Tokenizer.ATTRIBUTE_NAME, reconsume, pos);
1890                                 break beforeattributenameloop;
1891                             // continue stateloop;
1892                         }
1893                     }
1894                     // CPPONLY: MOZ_FALLTHROUGH;
1895                 case ATTRIBUTE_NAME:
1896                     attributenameloop: for (;;) {
1897                         if (++pos == endPos) {
1898                             break stateloop;
1899                         }
1900                         c = checkChar(buf, pos);
1901                         /*
1902                          * Consume the next input character:
1903                          */
1904                         switch (c) {
1905                             case '\r':
1906                                 silentCarriageReturn();
1907                                 attributeNameComplete();
1908                                 state = transition(state, Tokenizer.AFTER_ATTRIBUTE_NAME, reconsume, pos);
1909                                 break stateloop;
1910                             case '\n':
1911                                 silentLineFeed();
1912                                 // CPPONLY: MOZ_FALLTHROUGH;
1913                             case ' ':
1914                             case '\t':
1915                             case '\u000C':
1916                                 /*
1917                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED
1918                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE
1919                                  * Switch to the after attribute name state.
1920                                  */
1921                                 attributeNameComplete();
1922                                 state = transition(state, Tokenizer.AFTER_ATTRIBUTE_NAME, reconsume, pos);
1923                                 continue stateloop;
1924                             case '/':
1925                                 /*
1926                                  * U+002F SOLIDUS (/) Switch to the self-closing
1927                                  * start tag state.
1928                                  */
1929                                 attributeNameComplete();
1930                                 addAttributeWithoutValue();
1931                                 state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);
1932                                 continue stateloop;
1933                             case '=':
1934                                 /*
1935                                  * U+003D EQUALS SIGN (=) Switch to the before
1936                                  * attribute value state.
1937                                  */
1938                                 attributeNameComplete();
1939                                 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_VALUE, reconsume, pos);
1940                                 break attributenameloop;
1941                             // continue stateloop;
1942                             case '>':
1943                                 /*
1944                                  * U+003E GREATER-THAN SIGN (>) Emit the current
1945                                  * tag token.
1946                                  */
1947                                 attributeNameComplete();
1948                                 addAttributeWithoutValue();
1949                                 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
1950                                 if (shouldSuspend) {
1951                                     break stateloop;
1952                                 }
1953                                 /*
1954                                  * Switch to the data state.
1955                                  */
1956                                 continue stateloop;
1957                             case '\u0000':
1958                                 c = '\uFFFD';
1959                                 // CPPONLY: MOZ_FALLTHROUGH;
1960                             case '\"':
1961                             case '\'':
1962                             case '<':
1963                                 /*
1964                                  * U+0022 QUOTATION MARK (") U+0027 APOSTROPHE
1965                                  * (') U+003C LESS-THAN SIGN (<) Parse error.
1966                                  */
1967                                 errQuoteOrLtInAttributeNameOrNull(c);
1968                                 /*
1969                                  * Treat it as per the "anything else" entry
1970                                  * below.
1971                                  */
1972                                 // CPPONLY: MOZ_FALLTHROUGH;
1973                             default:
1974                                 if (c >= 'A' && c <= 'Z') {
1975                                     /*
1976                                      * U+0041 LATIN CAPITAL LETTER A through to
1977                                      * U+005A LATIN CAPITAL LETTER Z Append the
1978                                      * lowercase version of the current input
1979                                      * character (add 0x0020 to the character's
1980                                      * code point) to the current attribute's
1981                                      * name.
1982                                      */
1983                                     c += 0x20;
1984                                 }
1985                                 /*
1986                                  * Anything else Append the current input
1987                                  * character to the current attribute's name.
1988                                  */
1989                                 appendStrBuf(c);
1990                                 /*
1991                                  * Stay in the attribute name state.
1992                                  */
1993                                 continue;
1994                         }
1995                     }
1996                     // CPPONLY: MOZ_FALLTHROUGH;
1997                 case BEFORE_ATTRIBUTE_VALUE:
1998                     beforeattributevalueloop: for (;;) {
1999                         if (++pos == endPos) {
2000                             break stateloop;
2001                         }
2002                         c = checkChar(buf, pos);
2003                         /*
2004                          * Consume the next input character:
2005                          */
2006                         switch (c) {
2007                             case '\r':
2008                                 silentCarriageReturn();
2009                                 break stateloop;
2010                             case '\n':
2011                                 silentLineFeed();
2012                                 // CPPONLY: MOZ_FALLTHROUGH;
2013                             case ' ':
2014                             case '\t':
2015                             case '\u000C':
2016                                 /*
2017                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED
2018                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
2019                                  * in the before attribute value state.
2020                                  */
2021                                 continue;
2022                             case '"':
2023                                 /*
2024                                  * U+0022 QUOTATION MARK (") Switch to the
2025                                  * attribute value (double-quoted) state.
2026                                  */
2027                                 // CPPONLY: attributeLine = line;
2028                                 clearStrBufBeforeUse();
2029                                 state = transition(state, Tokenizer.ATTRIBUTE_VALUE_DOUBLE_QUOTED, reconsume, pos);
2030                                 break beforeattributevalueloop;
2031                             // continue stateloop;
2032                             case '&':
2033                                 /*
2034                                  * U+0026 AMPERSAND (&) Switch to the attribute
2035                                  * value (unquoted) state and reconsume this
2036                                  * input character.
2037                                  */
2038                                 // CPPONLY: attributeLine = line;
2039                                 clearStrBufBeforeUse();
2040                                 reconsume = true;
2041                                 state = transition(state, Tokenizer.ATTRIBUTE_VALUE_UNQUOTED, reconsume, pos);
2042                                 noteUnquotedAttributeValue();
2043                                 continue stateloop;
2044                             case '\'':
2045                                 /*
2046                                  * U+0027 APOSTROPHE (') Switch to the attribute
2047                                  * value (single-quoted) state.
2048                                  */
2049                                 // CPPONLY: attributeLine = line;
2050                                 clearStrBufBeforeUse();
2051                                 state = transition(state, Tokenizer.ATTRIBUTE_VALUE_SINGLE_QUOTED, reconsume, pos);
2052                                 continue stateloop;
2053                             case '>':
2054                                 /*
2055                                  * U+003E GREATER-THAN SIGN (>) Parse error.
2056                                  */
2057                                 errAttributeValueMissing();
2058                                 /*
2059                                  * Emit the current tag token.
2060                                  */
2061                                 addAttributeWithoutValue();
2062                                 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
2063                                 if (shouldSuspend) {
2064                                     break stateloop;
2065                                 }
2066                                 /*
2067                                  * Switch to the data state.
2068                                  */
2069                                 continue stateloop;
2070                             case '\u0000':
2071                                 c = '\uFFFD';
2072                                 // CPPONLY: MOZ_FALLTHROUGH;
2073                             case '<':
2074                             case '=':
2075                             case '`':
2076                                 /*
2077                                  * U+003C LESS-THAN SIGN (<) U+003D EQUALS SIGN
2078                                  * (=) U+0060 GRAVE ACCENT (`)
2079                                  */
2080                                 errLtOrEqualsOrGraveInUnquotedAttributeOrNull(c);
2081                                 /*
2082                                  * Treat it as per the "anything else" entry
2083                                  * below.
2084                                  */
2085                                 // CPPONLY: MOZ_FALLTHROUGH;
2086                             default:
2087                                 // [NOCPP[
2088                                 errHtml4NonNameInUnquotedAttribute(c);
2089                                 // ]NOCPP]
2090                                 /*
2091                                  * Anything else Append the current input
2092                                  * character to the current attribute's value.
2093                                  */
2094                                 // CPPONLY: attributeLine = line;
2095                                 clearStrBufBeforeUse();
2096                                 appendStrBuf(c);
2097                                 /*
2098                                  * Switch to the attribute value (unquoted)
2099                                  * state.
2100                                  */
2101 
2102                                 state = transition(state, Tokenizer.ATTRIBUTE_VALUE_UNQUOTED, reconsume, pos);
2103                                 noteUnquotedAttributeValue();
2104                                 continue stateloop;
2105                         }
2106                     }
2107                     // CPPONLY: MOZ_FALLTHROUGH;
2108                 case ATTRIBUTE_VALUE_DOUBLE_QUOTED:
2109                     attributevaluedoublequotedloop: for (;;) {
2110                         if (reconsume) {
2111                             reconsume = false;
2112                         } else {
2113                             if (++pos == endPos) {
2114                                 break stateloop;
2115                             }
2116                             c = checkChar(buf, pos);
2117                         }
2118                         /*
2119                          * Consume the next input character:
2120                          */
2121                         switch (c) {
2122                             case '"':
2123                                 /*
2124                                  * U+0022 QUOTATION MARK (") Switch to the after
2125                                  * attribute value (quoted) state.
2126                                  */
2127                                 addAttributeWithValue();
2128 
2129                                 state = transition(state, Tokenizer.AFTER_ATTRIBUTE_VALUE_QUOTED, reconsume, pos);
2130                                 break attributevaluedoublequotedloop;
2131                             // continue stateloop;
2132                             case '&':
2133                                 /*
2134                                  * U+0026 AMPERSAND (&) Switch to the character
2135                                  * reference in attribute value state, with the
2136                                  * additional allowed character being U+0022
2137                                  * QUOTATION MARK (").
2138                                  */
2139                                 assert charRefBufLen == 0: "charRefBufLen not reset after previous use!";
2140                                 appendCharRefBuf(c);
2141                                 setAdditionalAndRememberAmpersandLocation('\"');
2142                                 returnState = state;
2143                                 state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos);
2144                                 continue stateloop;
2145                             case '\r':
2146                                 appendStrBufCarriageReturn();
2147                                 break stateloop;
2148                             case '\n':
2149                                 appendStrBufLineFeed();
2150                                 continue;
2151                             case '\u0000':
2152                                 c = '\uFFFD';
2153                                 // CPPONLY: MOZ_FALLTHROUGH;
2154                             default:
2155                                 /*
2156                                  * Anything else Append the current input
2157                                  * character to the current attribute's value.
2158                                  */
2159                                 appendStrBuf(c);
2160                                 /*
2161                                  * Stay in the attribute value (double-quoted)
2162                                  * state.
2163                                  */
2164                                 continue;
2165                         }
2166                     }
2167                     // CPPONLY: MOZ_FALLTHROUGH;
2168                 case AFTER_ATTRIBUTE_VALUE_QUOTED:
2169                     afterattributevaluequotedloop: for (;;) {
2170                         if (++pos == endPos) {
2171                             break stateloop;
2172                         }
2173                         c = checkChar(buf, pos);
2174                         /*
2175                          * Consume the next input character:
2176                          */
2177                         switch (c) {
2178                             case '\r':
2179                                 silentCarriageReturn();
2180                                 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
2181                                 break stateloop;
2182                             case '\n':
2183                                 silentLineFeed();
2184                                 // CPPONLY: MOZ_FALLTHROUGH;
2185                             case ' ':
2186                             case '\t':
2187                             case '\u000C':
2188                                 /*
2189                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED
2190                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE
2191                                  * Switch to the before attribute name state.
2192                                  */
2193                                 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
2194                                 continue stateloop;
2195                             case '/':
2196                                 /*
2197                                  * U+002F SOLIDUS (/) Switch to the self-closing
2198                                  * start tag state.
2199                                  */
2200                                 state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);
2201                                 break afterattributevaluequotedloop;
2202                             // continue stateloop;
2203                             case '>':
2204                                 /*
2205                                  * U+003E GREATER-THAN SIGN (>) Emit the current
2206                                  * tag token.
2207                                  */
2208                                 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
2209                                 if (shouldSuspend) {
2210                                     break stateloop;
2211                                 }
2212                                 /*
2213                                  * Switch to the data state.
2214                                  */
2215                                 continue stateloop;
2216                             default:
2217                                 /*
2218                                  * Anything else Parse error.
2219                                  */
2220                                 errNoSpaceBetweenAttributes();
2221                                 /*
2222                                  * Reconsume the character in the before
2223                                  * attribute name state.
2224                                  */
2225                                 reconsume = true;
2226                                 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
2227                                 continue stateloop;
2228                         }
2229                     }
2230                     // CPPONLY: MOZ_FALLTHROUGH;
2231                 case SELF_CLOSING_START_TAG:
2232                     if (++pos == endPos) {
2233                         break stateloop;
2234                     }
2235                     c = checkChar(buf, pos);
2236                     /*
2237                      * Consume the next input character:
2238                      */
2239                     switch (c) {
2240                         case '>':
2241                             /*
2242                              * U+003E GREATER-THAN SIGN (>) Set the self-closing
2243                              * flag of the current tag token. Emit the current
2244                              * tag token.
2245                              */
2246                             // [NOCPP[
2247                             errHtml4XmlVoidSyntax();
2248                             // ]NOCPP]
2249                             state = transition(state, emitCurrentTagToken(true, pos), reconsume, pos);
2250                             if (shouldSuspend) {
2251                                 break stateloop;
2252                             }
2253                             /*
2254                              * Switch to the data state.
2255                              */
2256                             continue stateloop;
2257                         default:
2258                             /* Anything else Parse error. */
2259                             errSlashNotFollowedByGt();
2260                             /*
2261                              * Reconsume the character in the before attribute
2262                              * name state.
2263                              */
2264                             reconsume = true;
2265                             state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
2266                             continue stateloop;
2267                     }
2268                 case ATTRIBUTE_VALUE_UNQUOTED:
2269                     for (;;) {
2270                         if (reconsume) {
2271                             reconsume = false;
2272                         } else {
2273                             if (++pos == endPos) {
2274                                 break stateloop;
2275                             }
2276                             c = checkChar(buf, pos);
2277                         }
2278                         /*
2279                          * Consume the next input character:
2280                          */
2281                         switch (c) {
2282                             case '\r':
2283                                 silentCarriageReturn();
2284                                 addAttributeWithValue();
2285                                 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
2286                                 break stateloop;
2287                             case '\n':
2288                                 silentLineFeed();
2289                                 // CPPONLY: MOZ_FALLTHROUGH;
2290                             case ' ':
2291                             case '\t':
2292                             case '\u000C':
2293                                 /*
2294                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED
2295                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE
2296                                  * Switch to the before attribute name state.
2297                                  */
2298                                 addAttributeWithValue();
2299                                 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
2300                                 continue stateloop;
2301                             case '&':
2302                                 /*
2303                                  * U+0026 AMPERSAND (&) Switch to the character
2304                                  * reference in attribute value state, with the
2305                                  * additional allowed character being U+003E
2306                                  * GREATER-THAN SIGN (>)
2307                                  */
2308                                 assert charRefBufLen == 0: "charRefBufLen not reset after previous use!";
2309                                 appendCharRefBuf(c);
2310                                 setAdditionalAndRememberAmpersandLocation('>');
2311                                 returnState = state;
2312                                 state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos);
2313                                 continue stateloop;
2314                             case '>':
2315                                 /*
2316                                  * U+003E GREATER-THAN SIGN (>) Emit the current
2317                                  * tag token.
2318                                  */
2319                                 addAttributeWithValue();
2320                                 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
2321                                 if (shouldSuspend) {
2322                                     break stateloop;
2323                                 }
2324                                 /*
2325                                  * Switch to the data state.
2326                                  */
2327                                 continue stateloop;
2328                             case '\u0000':
2329                                 c = '\uFFFD';
2330                                 // CPPONLY: MOZ_FALLTHROUGH;
2331                             case '<':
2332                             case '\"':
2333                             case '\'':
2334                             case '=':
2335                             case '`':
2336                                 /*
2337                                  * U+0022 QUOTATION MARK (") U+0027 APOSTROPHE
2338                                  * (') U+003C LESS-THAN SIGN (<) U+003D EQUALS
2339                                  * SIGN (=) U+0060 GRAVE ACCENT (`) Parse error.
2340                                  */
2341                                 errUnquotedAttributeValOrNull(c);
2342                                 /*
2343                                  * Treat it as per the "anything else" entry
2344                                  * below.
2345                                  */
2346                                 // CPPONLY: MOZ_FALLTHROUGH;
2347                             default:
                                // [NOCPP[
2349                                 errHtml4NonNameInUnquotedAttribute(c);
2350                                 // ]NOCPP]
2351                                 /*
2352                                  * Anything else Append the current input
2353                                  * character to the current attribute's value.
2354                                  */
2355                                 appendStrBuf(c);
2356                                 /*
2357                                  * Stay in the attribute value (unquoted) state.
2358                                  */
2359                                 continue;
2360                         }
2361                     }
2362                 case AFTER_ATTRIBUTE_NAME:
2363                     for (;;) {
2364                         if (++pos == endPos) {
2365                             break stateloop;
2366                         }
2367                         c = checkChar(buf, pos);
2368                         /*
2369                          * Consume the next input character:
2370                          */
2371                         switch (c) {
2372                             case '\r':
2373                                 silentCarriageReturn();
2374                                 break stateloop;
2375                             case '\n':
2376                                 silentLineFeed();
2377                                 // CPPONLY: MOZ_FALLTHROUGH;
2378                             case ' ':
2379                             case '\t':
2380                             case '\u000C':
2381                                 /*
2382                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED
2383                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
2384                                  * in the after attribute name state.
2385                                  */
2386                                 continue;
2387                             case '/':
2388                                 /*
2389                                  * U+002F SOLIDUS (/) Switch to the self-closing
2390                                  * start tag state.
2391                                  */
2392                                 addAttributeWithoutValue();
2393                                 state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);
2394                                 continue stateloop;
2395                             case '=':
2396                                 /*
2397                                  * U+003D EQUALS SIGN (=) Switch to the before
2398                                  * attribute value state.
2399                                  */
2400                                 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_VALUE, reconsume, pos);
2401                                 continue stateloop;
2402                             case '>':
2403                                 /*
2404                                  * U+003E GREATER-THAN SIGN (>) Emit the current
2405                                  * tag token.
2406                                  */
2407                                 addAttributeWithoutValue();
2408                                 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
2409                                 if (shouldSuspend) {
2410                                     break stateloop;
2411                                 }
2412                                 /*
2413                                  * Switch to the data state.
2414                                  */
2415                                 continue stateloop;
2416                             case '\u0000':
2417                                 c = '\uFFFD';
2418                                 // CPPONLY: MOZ_FALLTHROUGH;
2419                             case '\"':
2420                             case '\'':
2421                             case '<':
2422                                 errQuoteOrLtInAttributeNameOrNull(c);
2423                                 /*
2424                                  * Treat it as per the "anything else" entry
2425                                  * below.
2426                                  */
2427                                 // CPPONLY: MOZ_FALLTHROUGH;
2428                             default:
2429                                 addAttributeWithoutValue();
2430                                 /*
2431                                  * Anything else Start a new attribute in the
2432                                  * current tag token.
2433                                  */
2434                                 if (c >= 'A' && c <= 'Z') {
2435                                     /*
2436                                      * U+0041 LATIN CAPITAL LETTER A through to
2437                                      * U+005A LATIN CAPITAL LETTER Z Set that
2438                                      * attribute's name to the lowercase version
2439                                      * of the current input character (add
2440                                      * 0x0020 to the character's code point)
2441                                      */
2442                                     c += 0x20;
2443                                 }
2444                                 /*
2445                                  * Set that attribute's name to the current
2446                                  * input character,
2447                                  */
2448                                 clearStrBufBeforeUse();
2449                                 appendStrBuf(c);
2450                                 /*
2451                                  * and its value to the empty string.
2452                                  */
2453                                 // Will do later.
2454                                 /*
2455                                  * Switch to the attribute name state.
2456                                  */
2457                                 state = transition(state, Tokenizer.ATTRIBUTE_NAME, reconsume, pos);
2458                                 continue stateloop;
2459                         }
2460                     }
2461                 case MARKUP_DECLARATION_OPEN:
2462                     markupdeclarationopenloop: for (;;) {
2463                         if (++pos == endPos) {
2464                             break stateloop;
2465                         }
2466                         c = checkChar(buf, pos);
2467                         /*
2468                          * If the next two characters are both U+002D
2469                          * HYPHEN-MINUS characters (-), consume those two
2470                          * characters, create a comment token whose data is the
2471                          * empty string, and switch to the comment start state.
2472                          *
2473                          * Otherwise, if the next seven characters are an ASCII
2474                          * case-insensitive match for the word "DOCTYPE", then
2475                          * consume those characters and switch to the DOCTYPE
2476                          * state.
2477                          *
2478                          * Otherwise, if the insertion mode is
2479                          * "in foreign content" and the current node is not an
2480                          * element in the HTML namespace and the next seven
                         * characters are a case-sensitive match for the string
2482                          * "[CDATA[" (the five uppercase letters "CDATA" with a
2483                          * U+005B LEFT SQUARE BRACKET character before and
2484                          * after), then consume those characters and switch to
2485                          * the CDATA section state.
2486                          *
                         * Otherwise, it is a parse error. Switch to the bogus
2488                          * comment state. The next character that is consumed,
2489                          * if any, is the first character that will be in the
2490                          * comment.
2491                          */
2492                         switch (c) {
2493                             case '-':
2494                                 clearStrBufBeforeUse();
2495                                 appendStrBuf(c);
2496                                 state = transition(state, Tokenizer.MARKUP_DECLARATION_HYPHEN, reconsume, pos);
2497                                 break markupdeclarationopenloop;
2498                             // continue stateloop;
2499                             case 'd':
2500                             case 'D':
2501                                 clearStrBufBeforeUse();
2502                                 appendStrBuf(c);
2503                                 index = 0;
2504                                 state = transition(state, Tokenizer.MARKUP_DECLARATION_OCTYPE, reconsume, pos);
2505                                 continue stateloop;
2506                             case '[':
2507                                 if (tokenHandler.cdataSectionAllowed()) {
2508                                     clearStrBufBeforeUse();
2509                                     appendStrBuf(c);
2510                                     index = 0;
2511                                     state = transition(state, Tokenizer.CDATA_START, reconsume, pos);
2512                                     continue stateloop;
2513                                 }
2514                                 // CPPONLY: MOZ_FALLTHROUGH;
2515                             default:
2516                                 errBogusComment();
2517                                 clearStrBufBeforeUse();
2518                                 reconsume = true;
2519                                 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
2520                                 continue stateloop;
2521                         }
2522                     }
2523                     // CPPONLY: MOZ_FALLTHROUGH;
2524                 case MARKUP_DECLARATION_HYPHEN:
2525                     markupdeclarationhyphenloop: for (;;) {
2526                         if (++pos == endPos) {
2527                             break stateloop;
2528                         }
2529                         c = checkChar(buf, pos);
2530                         switch (c) {
2531                             case '\u0000':
2532                                 break stateloop;
2533                             case '-':
2534                                 clearStrBufAfterOneHyphen();
2535                                 state = transition(state, Tokenizer.COMMENT_START, reconsume, pos);
2536                                 break markupdeclarationhyphenloop;
2537                             // continue stateloop;
2538                             default:
2539                                 errBogusComment();
2540                                 reconsume = true;
2541                                 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
2542                                 continue stateloop;
2543                         }
2544                     }
2545                     // CPPONLY: MOZ_FALLTHROUGH;
2546                 case COMMENT_START:
2547                     commentstartloop: for (;;) {
2548                         if (++pos == endPos) {
2549                             break stateloop;
2550                         }
2551                         c = checkChar(buf, pos);
2552                         /*
2553                          * Comment start state
2554                          *
2555                          *
2556                          * Consume the next input character:
2557                          */
2558                         switch (c) {
2559                             case '-':
2560                                 /*
2561                                  * U+002D HYPHEN-MINUS (-) Switch to the comment
2562                                  * start dash state.
2563                                  */
2564                                 appendStrBuf(c);
2565                                 state = transition(state, Tokenizer.COMMENT_START_DASH, reconsume, pos);
2566                                 continue stateloop;
2567                             case '>':
2568                                 /*
2569                                  * U+003E GREATER-THAN SIGN (>) Parse error.
2570                                  */
2571                                 errPrematureEndOfComment();
2572                                 /* Emit the comment token. */
2573                                 emitComment(0, pos);
2574                                 /*
2575                                  * Switch to the data state.
2576                                  */
2577                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
2578                                 continue stateloop;
2579                             case '\r':
2580                                 appendStrBufCarriageReturn();
2581                                 state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2582                                 break stateloop;
2583                             case '\n':
2584                                 appendStrBufLineFeed();
2585                                 state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2586                                 break commentstartloop;
2587                             case '\u0000':
2588                                 c = '\uFFFD';
2589                                 // CPPONLY: MOZ_FALLTHROUGH;
2590                             default:
2591                                 /*
2592                                  * Anything else Append the input character to
2593                                  * the comment token's data.
2594                                  */
2595                                 appendStrBuf(c);
2596                                 /*
2597                                  * Switch to the comment state.
2598                                  */
2599                                 state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2600                                 break commentstartloop;
2601                             // continue stateloop;
2602                         }
2603                     }
2604                     // CPPONLY: MOZ_FALLTHROUGH;
2605                 case COMMENT:
2606                     commentloop: for (;;) {
2607                         if (++pos == endPos) {
2608                             break stateloop;
2609                         }
2610                         c = checkChar(buf, pos);
2611                         /*
2612                          * Comment state Consume the next input character:
2613                          */
2614                         switch (c) {
2615                             case '-':
2616                                 /*
2617                                  * U+002D HYPHEN-MINUS (-) Switch to the comment
2618                                  * end dash state
2619                                  */
2620                                 appendStrBuf(c);
2621                                 state = transition(state, Tokenizer.COMMENT_END_DASH, reconsume, pos);
2622                                 break commentloop;
2623                             // continue stateloop;
2624                             case '\r':
2625                                 appendStrBufCarriageReturn();
2626                                 break stateloop;
2627                             case '\n':
2628                                 appendStrBufLineFeed();
2629                                 continue;
2630                             case '\u0000':
2631                                 c = '\uFFFD';
2632                                 // CPPONLY: MOZ_FALLTHROUGH;
2633                             default:
2634                                 /*
2635                                  * Anything else Append the input character to
2636                                  * the comment token's data.
2637                                  */
2638                                 appendStrBuf(c);
2639                                 /*
2640                                  * Stay in the comment state.
2641                                  */
2642                                 continue;
2643                         }
2644                     }
2645                     // CPPONLY: MOZ_FALLTHROUGH;
2646                 case COMMENT_END_DASH:
2647                     commentenddashloop: for (;;) {
2648                         if (++pos == endPos) {
2649                             break stateloop;
2650                         }
2651                         c = checkChar(buf, pos);
2652                         /*
2653                          * Comment end dash state Consume the next input
2654                          * character:
2655                          */
2656                         switch (c) {
2657                             case '-':
2658                                 /*
2659                                  * U+002D HYPHEN-MINUS (-) Switch to the comment
2660                                  * end state
2661                                  */
2662                                 appendStrBuf(c);
2663                                 state = transition(state, Tokenizer.COMMENT_END, reconsume, pos);
2664                                 break commentenddashloop;
2665                             // continue stateloop;
2666                             case '\r':
2667                                 appendStrBufCarriageReturn();
2668                                 state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2669                                 break stateloop;
2670                             case '\n':
2671                                 appendStrBufLineFeed();
2672                                 state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2673                                 continue stateloop;
2674                             case '\u0000':
2675                                 c = '\uFFFD';
2676                                 // CPPONLY: MOZ_FALLTHROUGH;
2677                             default:
2678                                 /*
2679                                  * Anything else Append a U+002D HYPHEN-MINUS
2680                                  * (-) character and the input character to the
2681                                  * comment token's data.
2682                                  */
2683                                 appendStrBuf(c);
2684                                 /*
2685                                  * Switch to the comment state.
2686                                  */
2687                                 state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2688                                 continue stateloop;
2689                         }
2690                     }
2691                     // CPPONLY: MOZ_FALLTHROUGH;
2692                 case COMMENT_END:
2693                     commentendloop: for (;;) {
2694                         if (++pos == endPos) {
2695                             break stateloop;
2696                         }
2697                         c = checkChar(buf, pos);
2698                         /*
                         * Comment end state Consume the next input
2700                          * character:
2701                          */
2702                         switch (c) {
2703                             case '>':
2704                                 /*
2705                                  * U+003E GREATER-THAN SIGN (>) Emit the comment
2706                                  * token.
2707                                  */
2708                                 emitComment(2, pos);
2709                                 /*
2710                                  * Switch to the data state.
2711                                  */
2712                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
2713                                 continue stateloop;
2714                             case '-':
2715                                 /* U+002D HYPHEN-MINUS (-) Parse error. */
2716                                 /*
2717                                  * Append a U+002D HYPHEN-MINUS (-) character to
2718                                  * the comment token's data.
2719                                  */
2720                                 adjustDoubleHyphenAndAppendToStrBufAndErr(c);
2721                                 /*
2722                                  * Stay in the comment end state.
2723                                  */
2724                                 continue;
2725                             case '\r':
2726                                 adjustDoubleHyphenAndAppendToStrBufCarriageReturn();
2727                                 state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2728                                 break stateloop;
2729                             case '\n':
2730                                 adjustDoubleHyphenAndAppendToStrBufLineFeed();
2731                                 state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2732                                 continue stateloop;
2733                             case '!':
2734                                 errHyphenHyphenBang();
2735                                 appendStrBuf(c);
2736                                 state = transition(state, Tokenizer.COMMENT_END_BANG, reconsume, pos);
2737                                 continue stateloop;
2738                             case '\u0000':
2739                                 c = '\uFFFD';
2740                                 // CPPONLY: MOZ_FALLTHROUGH;
2741                             default:
2742                                 /*
2743                                  * Append two U+002D HYPHEN-MINUS (-) characters
2744                                  * and the input character to the comment
2745                                  * token's data.
2746                                  */
2747                                 adjustDoubleHyphenAndAppendToStrBufAndErr(c);
2748                                 /*
2749                                  * Switch to the comment state.
2750                                  */
2751                                 state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2752                                 continue stateloop;
2753                         }
2754                     }
2755                 case COMMENT_END_BANG:
2756                     for (;;) {
2757                         if (++pos == endPos) {
2758                             break stateloop;
2759                         }
2760                         c = checkChar(buf, pos);
2761                         /*
2762                          * Comment end bang state
2763                          *
2764                          * Consume the next input character:
2765                          */
2766                         switch (c) {
2767                             case '>':
2768                                 /*
2769                                  * U+003E GREATER-THAN SIGN (>) Emit the comment
2770                                  * token.
2771                                  */
2772                                 emitComment(3, pos);
2773                                 /*
2774                                  * Switch to the data state.
2775                                  */
2776                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
2777                                 continue stateloop;
2778                             case '-':
2779                                 /*
2780                                  * Append two U+002D HYPHEN-MINUS (-) characters
2781                                  * and a U+0021 EXCLAMATION MARK (!) character
2782                                  * to the comment token's data.
2783                                  */
2784                                 appendStrBuf(c);
2785                                 /*
2786                                  * Switch to the comment end dash state.
2787                                  */
2788                                 state = transition(state, Tokenizer.COMMENT_END_DASH, reconsume, pos);
2789                                 continue stateloop;
2790                             case '\r':
2791                                 appendStrBufCarriageReturn();
2792                                 break stateloop;
2793                             case '\n':
2794                                 appendStrBufLineFeed();
2795                                 continue;
2796                             case '\u0000':
2797                                 c = '\uFFFD';
2798                                 // CPPONLY: MOZ_FALLTHROUGH;
2799                             default:
2800                                 /*
2801                                  * Anything else Append two U+002D HYPHEN-MINUS
2802                                  * (-) characters, a U+0021 EXCLAMATION MARK (!)
2803                                  * character, and the input character to the
2804                                  * comment token's data. Switch to the comment
2805                                  * state.
2806                                  */
2807                                 appendStrBuf(c);
2808                                 /*
2809                                  * Switch to the comment state.
2810                                  */
2811                                 state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2812                                 continue stateloop;
2813                         }
2814                     }
2815                 case COMMENT_START_DASH:
2816                     if (++pos == endPos) {
2817                         break stateloop;
2818                     }
2819                     c = checkChar(buf, pos);
2820                     /*
2821                      * Comment start dash state
2822                      *
2823                      * Consume the next input character:
2824                      */
2825                     switch (c) {
2826                         case '-':
2827                             /*
2828                              * U+002D HYPHEN-MINUS (-) Switch to the comment end
2829                              * state
2830                              */
2831                             appendStrBuf(c);
2832                             state = transition(state, Tokenizer.COMMENT_END, reconsume, pos);
2833                             continue stateloop;
2834                         case '>':
2835                             errPrematureEndOfComment();
2836                             /* Emit the comment token. */
2837                             emitComment(1, pos);
2838                             /*
2839                              * Switch to the data state.
2840                              */
2841                             state = transition(state, Tokenizer.DATA, reconsume, pos);
2842                             continue stateloop;
2843                         case '\r':
2844                             appendStrBufCarriageReturn();
2845                             state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2846                             break stateloop;
2847                         case '\n':
2848                             appendStrBufLineFeed();
2849                             state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2850                             continue stateloop;
2851                         case '\u0000':
2852                             c = '\uFFFD';
2853                             // CPPONLY: MOZ_FALLTHROUGH;
2854                         default:
2855                             /*
2856                              * Append a U+002D HYPHEN-MINUS character (-) and
2857                              * the current input character to the comment
2858                              * token's data.
2859                              */
2860                             appendStrBuf(c);
2861                             /*
2862                              * Switch to the comment state.
2863                              */
2864                             state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2865                             continue stateloop;
2866                     }
2867                 case CDATA_START:
2868                     for (;;) {
2869                         if (++pos == endPos) {
2870                             break stateloop;
2871                         }
2872                         c = checkChar(buf, pos);
2873                         if (index < 6) { // CDATA_LSQB.length
2874                             if (c == Tokenizer.CDATA_LSQB[index]) {
2875                                 appendStrBuf(c);
2876                             } else {
2877                                 errBogusComment();
2878                                 reconsume = true;
2879                                 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
2880                                 continue stateloop;
2881                             }
2882                             index++;
2883                             continue;
2884                         } else {
2885                             clearStrBufAfterUse();
2886                             cstart = pos; // start coalescing
2887                             reconsume = true;
2888                             state = transition(state, Tokenizer.CDATA_SECTION, reconsume, pos);
2889                             break; // FALL THROUGH continue stateloop;
2890                         }
2891                     }
2892                     // CPPONLY: MOZ_FALLTHROUGH;
2893                 case CDATA_SECTION:
2894                     cdatasectionloop: for (;;) {
2895                         if (reconsume) {
2896                             reconsume = false;
2897                         } else {
2898                             if (++pos == endPos) {
2899                                 break stateloop;
2900                             }
2901                             c = checkChar(buf, pos);
2902                         }
2903                         switch (c) {
2904                             case ']':
2905                                 flushChars(buf, pos);
2906                                 state = transition(state, Tokenizer.CDATA_RSQB, reconsume, pos);
2907                                 break cdatasectionloop; // FALL THROUGH
2908                             case '\u0000':
2909                                 emitReplacementCharacter(buf, pos);
2910                                 continue;
2911                             case '\r':
2912                                 emitCarriageReturn(buf, pos);
2913                                 break stateloop;
2914                             case '\n':
2915                                 silentLineFeed();
2916                                 // CPPONLY: MOZ_FALLTHROUGH;
2917                             default:
2918                                 continue;
2919                         }
2920                     }
2921                     // CPPONLY: MOZ_FALLTHROUGH;
2922                 case CDATA_RSQB:
2923                     cdatarsqb: for (;;) {
2924                         if (++pos == endPos) {
2925                             break stateloop;
2926                         }
2927                         c = checkChar(buf, pos);
2928                         switch (c) {
2929                             case ']':
2930                                 state = transition(state, Tokenizer.CDATA_RSQB_RSQB, reconsume, pos);
2931                                 break cdatarsqb;
2932                             default:
2933                                 tokenHandler.characters(Tokenizer.RSQB_RSQB, 0,
2934                                         1);
2935                                 cstart = pos;
2936                                 reconsume = true;
2937                                 state = transition(state, Tokenizer.CDATA_SECTION, reconsume, pos);
2938                                 continue stateloop;
2939                         }
2940                     }
2941                     // CPPONLY: MOZ_FALLTHROUGH;
2942                 case CDATA_RSQB_RSQB:
2943                     cdatarsqbrsqb: for (;;) {
2944                         if (++pos == endPos) {
2945                             break stateloop;
2946                         }
2947                         c = checkChar(buf, pos);
2948                         switch (c) {
2949                             case ']':
2950                                 // Saw a third ]. Emit one ] (logically the
2951                                 // first one) and stay in this state to
2952                                 // remember that the last two characters seen
2953                                 // have been ]].
2954                                 tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 1);
2955                                 continue;
2956                             case '>':
2957                                 cstart = pos + 1;
2958                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
2959                                 continue stateloop;
2960                             default:
2961                                 tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 2);
2962                                 cstart = pos;
2963                                 reconsume = true;
2964                                 state = transition(state, Tokenizer.CDATA_SECTION, reconsume, pos);
2965                                 continue stateloop;
2966                         }
2967                     }
2968                 case ATTRIBUTE_VALUE_SINGLE_QUOTED:
2969                     attributevaluesinglequotedloop: for (;;) {
2970                         if (reconsume) {
2971                             reconsume = false;
2972                         } else {
2973                             if (++pos == endPos) {
2974                                 break stateloop;
2975                             }
2976                             c = checkChar(buf, pos);
2977                         }
2978                         /*
2979                          * Consume the next input character:
2980                          */
2981                         switch (c) {
2982                             case '\'':
2983                                 /*
2984                                  * U+0027 APOSTROPHE (') Switch to the after
2985                                  * attribute value (quoted) state.
2986                                  */
2987                                 addAttributeWithValue();
2988 
2989                                 state = transition(state, Tokenizer.AFTER_ATTRIBUTE_VALUE_QUOTED, reconsume, pos);
2990                                 continue stateloop;
2991                             case '&':
2992                                 /*
2993                                  * U+0026 AMPERSAND (&) Switch to the character
2994                                  * reference in attribute value state, with the
2995                                  * additional allowed character being U+0027
2996                                  * APOSTROPHE (').
2997                                  */
2998                                 assert charRefBufLen == 0: "charRefBufLen not reset after previous use!";
2999                                 appendCharRefBuf(c);
3000                                 setAdditionalAndRememberAmpersandLocation('\'');
3001                                 returnState = state;
3002                                 state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos);
3003                                 break attributevaluesinglequotedloop;
3004                             // continue stateloop;
3005                             case '\r':
3006                                 appendStrBufCarriageReturn();
3007                                 break stateloop;
3008                             case '\n':
3009                                 appendStrBufLineFeed();
3010                                 continue;
3011                             case '\u0000':
3012                                 c = '\uFFFD';
3013                                 // CPPONLY: MOZ_FALLTHROUGH;
3014                             default:
3015                                 /*
3016                                  * Anything else Append the current input
3017                                  * character to the current attribute's value.
3018                                  */
3019                                 appendStrBuf(c);
3020                                 /*
3021                                  * Stay in the attribute value (single-quoted)
3022                                  * state.
3023                                  */
3024                                 continue;
3025                         }
3026                     }
3027                     // CPPONLY: MOZ_FALLTHROUGH;
3028                 case CONSUME_CHARACTER_REFERENCE:
3029                     if (++pos == endPos) {
3030                         break stateloop;
3031                     }
3032                     c = checkChar(buf, pos);
3033                     if (c == '\u0000') {
3034                         break stateloop;
3035                     }
3036                     /*
3037                      * Unlike the definition in the spec, this state does not
3038                      * return a value and never requires the caller to
3039                      * backtrack. This state takes care of emitting characters
3040                      * or appending to the current attribute value. It also
3041                      * takes care of that in the case when consuming the
3042                      * character reference fails.
3043                      */
3044                     /*
3045                      * This section defines how to consume a character
3046                      * reference. This definition is used when parsing character
3047                      * references in text and in attributes.
3048                      *
3049                      * The behavior depends on the identity of the next
3050                      * character (the one immediately after the U+0026 AMPERSAND
3051                      * character):
3052                      */
3053                     switch (c) {
3054                         case ' ':
3055                         case '\t':
3056                         case '\n':
3057                         case '\r': // we'll reconsume!
3058                         case '\u000C':
3059                         case '<':
3060                         case '&':
3061                             emitOrAppendCharRefBuf(returnState);
3062                             if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
3063                                 cstart = pos;
3064                             }
3065                             reconsume = true;
3066                             state = transition(state, returnState, reconsume, pos);
3067                             continue stateloop;
3068                         case '#':
3069                             /*
3070                              * U+0023 NUMBER SIGN (#) Consume the U+0023 NUMBER
3071                              * SIGN.
3072                              */
3073                             appendCharRefBuf('#');
3074                             state = transition(state, Tokenizer.CONSUME_NCR, reconsume, pos);
3075                             continue stateloop;
3076                         default:
3077                             if (c == additional) {
3078                                 emitOrAppendCharRefBuf(returnState);
3079                                 reconsume = true;
3080                                 state = transition(state, returnState, reconsume, pos);
3081                                 continue stateloop;
3082                             }
3083                             if (c >= 'a' && c <= 'z') {
3084                                 firstCharKey = c - 'a' + 26;
3085                             } else if (c >= 'A' && c <= 'Z') {
3086                                 firstCharKey = c - 'A';
3087                             } else {
3088                                 // No match
3089                                 /*
3090                                  * If no match can be made, then this is a parse
3091                                  * error.
3092                                  */
3093                                 errNoNamedCharacterMatch();
3094                                 emitOrAppendCharRefBuf(returnState);
3095                                 if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
3096                                     cstart = pos;
3097                                 }
3098                                 reconsume = true;
3099                                 state = transition(state, returnState, reconsume, pos);
3100                                 continue stateloop;
3101                             }
3102                             // Didn't fail yet
3103                             appendCharRefBuf(c);
3104                             state = transition(state, Tokenizer.CHARACTER_REFERENCE_HILO_LOOKUP, reconsume, pos);
3105                             // FALL THROUGH continue stateloop;
3106                     }
3107                     // CPPONLY: MOZ_FALLTHROUGH;
3108                 case CHARACTER_REFERENCE_HILO_LOOKUP:
3109                     {
3110                         if (++pos == endPos) {
3111                             break stateloop;
3112                         }
3113                         c = checkChar(buf, pos);
3114                         if (c == '\u0000') {
3115                             break stateloop;
3116                         }
3117                         /*
3118                          * The data structure is as follows:
3119                          *
3120                          * HILO_ACCEL is a two-dimensional int array whose major
3121                          * index corresponds to the second character of the
3122                          * character reference (code point as index) and the
3123                          * minor index corresponds to the first character of the
3124                          * character reference (packed so that A-Z runs from 0
3125                          * to 25 and a-z runs from 26 to 51). This layout makes
3126                          * it easier to use the sparseness of the data structure
3127                          * to omit parts of it: The second dimension of the
3128                          * table is null when no character reference starts with
3129                          * the character corresponding to that row.
3130                          *
3131                          * The int value HILO_ACCEL (by these indices) is zero
3132                          * if there exists no character reference starting with
3133                          * that two-letter prefix. Otherwise, the value is an
3134                          * int that packs two shorts so that the higher short is
3135                          * the index of the highest character reference name
3136                          * with that prefix in NAMES and the lower short
3137                          * corresponds to the index of the lowest character
3138                          * reference name with that prefix. (It happens that the
3139                          * first two character reference names share their
3140                          * prefix so the packed int cannot be 0 by packing the
3141                          * two shorts.)
3142                          *
3143                          * NAMES is an array of byte arrays where each byte
3144                          * array encodes the name of a character reference as
3145                          * ASCII. The names omit the first two letters of the
3146                          * name. (Since storing the first two letters would be
3147                          * redundant with the data contained in HILO_ACCEL.) The
3148                          * entries are lexically sorted.
3149                          *
3150                          * For a given index in NAMES, the same index in VALUES
3151                          * contains the corresponding expansion as an array of
3152                          * two UTF-16 code units (either the character and
3153                          * U+0000 or a surrogate pair).
3154                          */
3155                         int hilo = 0;
3156                         if (c <= 'z') {
3157                             @Const @NoLength int[] row = NamedCharactersAccel.HILO_ACCEL[c];
3158                             if (row != null) {
3159                                 hilo = row[firstCharKey];
3160                             }
3161                         }
3162                         if (hilo == 0) {
3163                             /*
3164                              * If no match can be made, then this is a parse
3165                              * error.
3166                              */
3167                             errNoNamedCharacterMatch();
3168                             emitOrAppendCharRefBuf(returnState);
3169                             if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
3170                                 cstart = pos;
3171                             }
3172                             reconsume = true;
3173                             state = transition(state, returnState, reconsume, pos);
3174                             continue stateloop;
3175                         }
3176                         // Didn't fail yet
3177                         appendCharRefBuf(c);
3178                         lo = hilo & 0xFFFF;
3179                         hi = hilo >> 16;
3180                         entCol = -1;
3181                         candidate = -1;
3182                         charRefBufMark = 0;
3183                         state = transition(state, Tokenizer.CHARACTER_REFERENCE_TAIL, reconsume, pos);
3184                         // FALL THROUGH continue stateloop;
3185                     }
3186                     // CPPONLY: MOZ_FALLTHROUGH;
3187                 case CHARACTER_REFERENCE_TAIL:
3188                     outer: for (;;) {
3189                         if (++pos == endPos) {
3190                             break stateloop;
3191                         }
3192                         c = checkChar(buf, pos);
3193                         if (c == '\u0000') {
3194                             break stateloop;
3195                         }
3196                         entCol++;
3197                         /*
3198                          * Consume the maximum number of characters possible,
3199                          * with the consumed characters matching one of the
3200                          * identifiers in the first column of the named
3201                          * character references table (in a case-sensitive
3202                          * manner).
3203                          */
3204                         loloop: for (;;) {
3205                             if (hi < lo) {
3206                                 break outer;
3207                             }
3208                             if (entCol == NamedCharacters.NAMES[lo].length()) {
3209                                 candidate = lo;
3210                                 charRefBufMark = charRefBufLen;
3211                                 lo++;
3212                             } else if (entCol > NamedCharacters.NAMES[lo].length()) {
3213                                 break outer;
3214                             } else if (c > NamedCharacters.NAMES[lo].charAt(entCol)) {
3215                                 lo++;
3216                             } else {
3217                                 break loloop;
3218                             }
3219                         }
3220 
3221                         hiloop: for (;;) {
3222                             if (hi < lo) {
3223                                 break outer;
3224                             }
3225                             if (entCol == NamedCharacters.NAMES[hi].length()) {
3226                                 break hiloop;
3227                             }
3228                             if (entCol > NamedCharacters.NAMES[hi].length()) {
3229                                 break outer;
3230                             } else if (c < NamedCharacters.NAMES[hi].charAt(entCol)) {
3231                                 hi--;
3232                             } else {
3233                                 break hiloop;
3234                             }
3235                         }
3236 
3237                         if (c == ';') {
3238                             // If we see a semicolon, there cannot be a
3239                             // longer match. Break the loop. However, before
3240                             // breaking, take the longest match so far as the
3241                             // candidate, if we are just about to complete a
3242                             // match.
3243                             if (entCol + 1 == NamedCharacters.NAMES[lo].length()) {
3244                                 candidate = lo;
3245                                 charRefBufMark = charRefBufLen;
3246                             }
3247                             break outer;
3248                         }
3249 
3250                         if (hi < lo) {
3251                             break outer;
3252                         }
3253                         appendCharRefBuf(c);
3254                         continue;
3255                     }
3256 
3257                     if (candidate == -1) {
3258                         // reconsume deals with CR, LF or nul
3259                         /*
3260                          * If no match can be made, then this is a parse error.
3261                          */
3262                         errNoNamedCharacterMatch();
3263                         emitOrAppendCharRefBuf(returnState);
3264                         if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
3265                             cstart = pos;
3266                         }
3267                         reconsume = true;
3268                         state = transition(state, returnState, reconsume, pos);
3269                         continue stateloop;
3270                     } else {
3271                         // c can't be CR, LF or nul if we got here
3272                         @Const @CharacterName String candidateName = NamedCharacters.NAMES[candidate];
3273                         if (candidateName.length() == 0
3274                                 || candidateName.charAt(candidateName.length() - 1) != ';') {
3275                             /*
3276                              * If the last character matched is not a U+003B
3277                              * SEMICOLON (;), there is a parse error.
3278                              */
3279                             if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
3280                                 /*
3281                                  * If the entity is being consumed as part of an
3282                                  * attribute, and the last character matched is
3283                                  * not a U+003B SEMICOLON (;),
3284                                  */
3285                                 char ch;
3286                                 if (charRefBufMark == charRefBufLen) {
3287                                     ch = c;
3288                                 } else {
3289                                     ch = charRefBuf[charRefBufMark];
3290                                 }
3291                                 if (ch == '=' || (ch >= '0' && ch <= '9')
3292                                         || (ch >= 'A' && ch <= 'Z')
3293                                         || (ch >= 'a' && ch <= 'z')) {
3294                                     /*
3295                                      * and the next character is either a U+003D
3296                                      * EQUALS SIGN character (=) or in the range
3297                                      * U+0030 DIGIT ZERO to U+0039 DIGIT NINE,
3298                                      * U+0041 LATIN CAPITAL LETTER A to U+005A
3299                                      * LATIN CAPITAL LETTER Z, or U+0061 LATIN
3300                                      * SMALL LETTER A to U+007A LATIN SMALL
3301                                      * LETTER Z, then, for historical reasons,
3302                                      * all the characters that were matched
3303                                      * after the U+0026 AMPERSAND (&) must be
3304                                      * unconsumed, and nothing is returned.
3305                                      */
3306                                     errNoNamedCharacterMatch();
3307                                     appendCharRefBufToStrBuf();
3308                                     reconsume = true;
3309                                     state = transition(state, returnState, reconsume, pos);
3310                                     continue stateloop;
3311                                 }
3312                             }
3313                             if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
3314                                 errUnescapedAmpersandInterpretedAsCharacterReference();
3315                             } else {
3316                                 errNotSemicolonTerminated();
3317                             }
3318                         }
3319 
3320                         /*
3321                          * Otherwise, return a character token for the character
3322                          * corresponding to the entity name (as given by the
3323                          * second column of the named character references
3324                          * table).
3325                          */
3326                         // CPPONLY: completedNamedCharacterReference();
3327                         @Const @NoLength char[] val = NamedCharacters.VALUES[candidate];
3328                         if (
3329                         // [NOCPP[
3330                         val.length == 1
3331                         // ]NOCPP]
3332                         // CPPONLY: val[1] == 0
3333                         ) {
3334                             emitOrAppendOne(val, returnState);
3335                         } else {
3336                             emitOrAppendTwo(val, returnState);
3337                         }
3338                         // this is so complicated!
3339                         if (charRefBufMark < charRefBufLen) {
3340                             if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
3341                                 appendStrBuf(charRefBuf, charRefBufMark,
3342                                         charRefBufLen - charRefBufMark);
3343                             } else {
3344                                 tokenHandler.characters(charRefBuf, charRefBufMark,
3345                                         charRefBufLen - charRefBufMark);
3346                             }
3347                         }
3348                         // charRefBufLen will be zeroed below!
3349 
3350                         // Check if we broke out early with c being the last
3351                         // character that matched as opposed to being the
3352                         // first one that didn't match. In the case of an
3353                         // early break, the next run on text should start
3354                         // *after* the current character and the current
3355                         // character shouldn't be reconsumed.
3356                         boolean earlyBreak = (c == ';' && charRefBufMark == charRefBufLen);
3357                         charRefBufLen = 0;
3358                         if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
3359                             cstart = earlyBreak ? pos + 1 : pos;
3360                         }
3361                         reconsume = !earlyBreak;
3362                         state = transition(state, returnState, reconsume, pos);
3363                         continue stateloop;
3364                         /*
3365                          * If the markup contains I'm &notit; I tell you, the
3366                          * entity is parsed as "not", as in, I'm ¬it; I tell
3367                          * you. But if the markup was I'm &notin; I tell you,
3368                          * the entity would be parsed as "notin;", resulting in
3369                          * I'm ∉ I tell you.
3370                          */
3371                     }
3372                 case CONSUME_NCR:
3373                     if (++pos == endPos) {
3374                         break stateloop;
3375                     }
3376                     c = checkChar(buf, pos);
3377                     value = 0;
3378                     seenDigits = false;
3379                     /*
3380                      * The behavior further depends on the character after the
3381                      * U+0023 NUMBER SIGN:
3382                      */
3383                     switch (c) {
3384                         case 'x':
3385                         case 'X':
3386 
3387                             /*
3388                              * U+0078 LATIN SMALL LETTER X U+0058 LATIN CAPITAL
3389                              * LETTER X Consume the X.
3390                              *
3391                              * Follow the steps below, but using the range of
3392                              * characters U+0030 DIGIT ZERO through to U+0039
3393                              * DIGIT NINE, U+0061 LATIN SMALL LETTER A through
3394                              * to U+0066 LATIN SMALL LETTER F, and U+0041 LATIN
3395                              * CAPITAL LETTER A, through to U+0046 LATIN CAPITAL
3396                              * LETTER F (in other words, 0-9, A-F, a-f).
3397                              *
3398                              * When it comes to interpreting the number,
3399                              * interpret it as a hexadecimal number.
3400                              */
3401                             appendCharRefBuf(c);
3402                             state = transition(state, Tokenizer.HEX_NCR_LOOP, reconsume, pos);
3403                             continue stateloop;
3404                         default:
3405                             /*
3406                              * Anything else Follow the steps below, but using
3407                              * the range of characters U+0030 DIGIT ZERO through
3408                              * to U+0039 DIGIT NINE (i.e. just 0-9).
3409                              *
3410                              * When it comes to interpreting the number,
3411                              * interpret it as a decimal number.
3412                              */
3413                             reconsume = true;
3414                             state = transition(state, Tokenizer.DECIMAL_NRC_LOOP, reconsume, pos);
3415                             // FALL THROUGH continue stateloop;
3416                     }
3417                     // CPPONLY: MOZ_FALLTHROUGH;
3418                 case DECIMAL_NRC_LOOP:
3419                     decimalloop: for (;;) {
3420                         if (reconsume) {
3421                             reconsume = false;
3422                         } else {
3423                             if (++pos == endPos) {
3424                                 break stateloop;
3425                             }
3426                             c = checkChar(buf, pos);
3427                         }
3428                         /*
3429                          * Consume as many characters as match the range of
3430                          * characters given above.
3431                          */
3432                         assert value >= 0: "value must not become negative.";
3433                         if (c >= '0' && c <= '9') {
3434                             seenDigits = true;
3435                             // Avoid overflow
3436                             if (value <= 0x10FFFF) {
3437                                 value *= 10;
3438                                 value += c - '0';
3439                             }
3440                             continue;
3441                         } else if (c == ';') {
3442                             if (seenDigits) {
3443                                 if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
3444                                     cstart = pos + 1;
3445                                 }
3446                                 state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos);
3447                                 // FALL THROUGH continue stateloop;
3448                                 break decimalloop;
3449                             } else {
3450                                 errNoDigitsInNCR();
3451                                 appendCharRefBuf(';');
3452                                 emitOrAppendCharRefBuf(returnState);
3453                                 if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
3454                                     cstart = pos + 1;
3455                                 }
3456                                 state = transition(state, returnState, reconsume, pos);
3457                                 continue stateloop;
3458                             }
3459                         } else {
3460                             /*
3461                              * If no characters match the range, then don't
3462                              * consume any characters (and unconsume the U+0023
3463                              * NUMBER SIGN character and, if appropriate, the X
3464                              * character). This is a parse error; nothing is
3465                              * returned.
3466                              *
3467                              * Otherwise, if the next character is a U+003B
3468                              * SEMICOLON, consume that too. If it isn't, there
3469                              * is a parse error.
3470                              */
3471                             if (!seenDigits) {
3472                                 errNoDigitsInNCR();
3473                                 emitOrAppendCharRefBuf(returnState);
3474                                 if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
3475                                     cstart = pos;
3476                                 }
3477                                 reconsume = true;
3478                                 state = transition(state, returnState, reconsume, pos);
3479                                 continue stateloop;
3480                             } else {
3481                                 errCharRefLacksSemicolon();
3482                                 if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
3483                                     cstart = pos;
3484                                 }
3485                                 reconsume = true;
3486                                 state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos);
3487                                 // FALL THROUGH continue stateloop;
3488                                 break decimalloop;
3489                             }
3490                         }
3491                     }
3492                     // CPPONLY: MOZ_FALLTHROUGH;
3493                 case HANDLE_NCR_VALUE:
3494                     // WARNING previous state sets reconsume
3495                     // We are not going to emit the contents of charRefBuf.
3496                     charRefBufLen = 0;
3497                     // XXX inline this case if the method size can take it
3498                     handleNcrValue(returnState);
3499                     state = transition(state, returnState, reconsume, pos);
3500                     continue stateloop;
3501                 case HEX_NCR_LOOP:
3502                     for (;;) {
3503                         if (++pos == endPos) {
3504                             break stateloop;
3505                         }
3506                         c = checkChar(buf, pos);
3507                         /*
3508                          * Consume as many characters as match the range of
3509                          * characters given above.
3510                          */
3511                         assert value >= 0: "value must not become negative.";
3512                         if (c >= '0' && c <= '9') {
3513                             seenDigits = true;
3514                             // Avoid overflow
3515                             if (value <= 0x10FFFF) {
3516                                 value *= 16;
3517                                 value += c - '0';
3518                             }
3519                             continue;
3520                         } else if (c >= 'A' && c <= 'F') {
3521                             seenDigits = true;
3522                             // Avoid overflow
3523                             if (value <= 0x10FFFF) {
3524                                 value *= 16;
3525                                 value += c - 'A' + 10;
3526                             }
3527                             continue;
3528                         } else if (c >= 'a' && c <= 'f') {
3529                             seenDigits = true;
3530                             // Avoid overflow
3531                             if (value <= 0x10FFFF) {
3532                                 value *= 16;
3533                                 value += c - 'a' + 10;
3534                             }
3535                             continue;
3536                         } else if (c == ';') {
3537                             if (seenDigits) {
3538                                 if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
3539                                     cstart = pos + 1;
3540                                 }
3541                                 state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos);
3542                                 continue stateloop;
3543                             } else {
3544                                 errNoDigitsInNCR();
3545                                 appendCharRefBuf(';');
3546                                 emitOrAppendCharRefBuf(returnState);
3547                                 if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
3548                                     cstart = pos + 1;
3549                                 }
3550                                 state = transition(state, returnState, reconsume, pos);
3551                                 continue stateloop;
3552                             }
3553                         } else {
3554                             /*
3555                              * If no characters match the range, then don't
3556                              * consume any characters (and unconsume the U+0023
3557                              * NUMBER SIGN character and, if appropriate, the X
3558                              * character). This is a parse error; nothing is
3559                              * returned.
3560                              *
3561                              * Otherwise, if the next character is a U+003B
3562                              * SEMICOLON, consume that too. If it isn't, there
3563                              * is a parse error.
3564                              */
3565                             if (!seenDigits) {
3566                                 errNoDigitsInNCR();
3567                                 emitOrAppendCharRefBuf(returnState);
3568                                 if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
3569                                     cstart = pos;
3570                                 }
3571                                 reconsume = true;
3572                                 state = transition(state, returnState, reconsume, pos);
3573                                 continue stateloop;
3574                             } else {
3575                                 errCharRefLacksSemicolon();
3576                                 if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
3577                                     cstart = pos;
3578                                 }
3579                                 reconsume = true;
3580                                 state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos);
3581                                 continue stateloop;
3582                             }
3583                         }
3584                     }
3585                 case PLAINTEXT:
3586                     plaintextloop: for (;;) {
3587                         if (reconsume) {
3588                             reconsume = false;
3589                         } else {
3590                             if (++pos == endPos) {
3591                                 break stateloop;
3592                             }
3593                             c = checkChar(buf, pos);
3594                         }
3595                         switch (c) {
3596                             case '\u0000':
3597                                 emitPlaintextReplacementCharacter(buf, pos);
3598                                 continue;
3599                             case '\r':
3600                                 emitCarriageReturn(buf, pos);
3601                                 break stateloop;
3602                             case '\n':
3603                                 silentLineFeed();
3604                                 // CPPONLY: MOZ_FALLTHROUGH;
3605                             default:
3606                                 /*
3607                                  * Anything else Emit the current input
                                 * character as a character token. Stay in the
                                 * PLAINTEXT state.
3610                                  */
3611                                 continue;
3612                         }
3613                     }
3614                 case CLOSE_TAG_OPEN:
3615                     if (++pos == endPos) {
3616                         break stateloop;
3617                     }
3618                     c = checkChar(buf, pos);
3619                     /*
3620                      * Otherwise, if the content model flag is set to the PCDATA
3621                      * state, or if the next few characters do match that tag
3622                      * name, consume the next input character:
3623                      */
3624                     switch (c) {
3625                         case '>':
3626                             /* U+003E GREATER-THAN SIGN (>) Parse error. */
3627                             errLtSlashGt();
3628                             /*
3629                              * Switch to the data state.
3630                              */
3631                             cstart = pos + 1;
3632                             state = transition(state, Tokenizer.DATA, reconsume, pos);
3633                             continue stateloop;
3634                         case '\r':
3635                             silentCarriageReturn();
3636                             /* Anything else Parse error. */
3637                             errGarbageAfterLtSlash();
3638                             /*
3639                              * Switch to the bogus comment state.
3640                              */
3641                             clearStrBufBeforeUse();
3642                             appendStrBuf('\n');
3643                             state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
3644                             break stateloop;
3645                         case '\n':
3646                             silentLineFeed();
3647                             /* Anything else Parse error. */
3648                             errGarbageAfterLtSlash();
3649                             /*
3650                              * Switch to the bogus comment state.
3651                              */
3652                             clearStrBufBeforeUse();
3653                             appendStrBuf(c);
3654                             state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
3655                             continue stateloop;
3656                         case '\u0000':
3657                             c = '\uFFFD';
3658                             // CPPONLY: MOZ_FALLTHROUGH;
3659                         default:
3660                             if (c >= 'A' && c <= 'Z') {
3661                                 c += 0x20;
3662                             }
3663                             if (c >= 'a' && c <= 'z') {
3664                                 /*
3665                                  * U+0061 LATIN SMALL LETTER A through to U+007A
3666                                  * LATIN SMALL LETTER Z Create a new end tag
3667                                  * token,
3668                                  */
3669                                 endTag = true;
3670                                 /*
3671                                  * set its tag name to the input character,
3672                                  */
3673                                 clearStrBufBeforeUse();
3674                                 appendStrBuf(c);
3675                                 containsHyphen = false;
3676                                 /*
3677                                  * then switch to the tag name state. (Don't
3678                                  * emit the token yet; further details will be
3679                                  * filled in before it is emitted.)
3680                                  */
3681                                 state = transition(state, Tokenizer.TAG_NAME, reconsume, pos);
3682                                 continue stateloop;
3683                             } else {
3684                                 /* Anything else Parse error. */
3685                                 errGarbageAfterLtSlash();
3686                                 /*
3687                                  * Switch to the bogus comment state.
3688                                  */
3689                                 clearStrBufBeforeUse();
3690                                 appendStrBuf(c);
3691                                 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
3692                                 continue stateloop;
3693                             }
3694                     }
3695                 case RCDATA:
3696                     rcdataloop: for (;;) {
3697                         if (reconsume) {
3698                             reconsume = false;
3699                         } else {
3700                             if (++pos == endPos) {
3701                                 break stateloop;
3702                             }
3703                             c = checkChar(buf, pos);
3704                         }
3705                         switch (c) {
3706                             case '&':
3707                                 /*
3708                                  * U+0026 AMPERSAND (&) Switch to the character
3709                                  * reference in RCDATA state.
3710                                  */
3711                                 flushChars(buf, pos);
3712                                 assert charRefBufLen == 0: "charRefBufLen not reset after previous use!";
3713                                 appendCharRefBuf(c);
3714                                 setAdditionalAndRememberAmpersandLocation('\u0000');
3715                                 returnState = state;
3716                                 state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos);
3717                                 continue stateloop;
3718                             case '<':
3719                                 /*
3720                                  * U+003C LESS-THAN SIGN (<) Switch to the
3721                                  * RCDATA less-than sign state.
3722                                  */
3723                                 flushChars(buf, pos);
3724 
3725                                 returnState = state;
3726                                 state = transition(state, Tokenizer.RAWTEXT_RCDATA_LESS_THAN_SIGN, reconsume, pos);
3727                                 continue stateloop;
3728                             case '\u0000':
3729                                 emitReplacementCharacter(buf, pos);
3730                                 continue;
3731                             case '\r':
3732                                 emitCarriageReturn(buf, pos);
3733                                 break stateloop;
3734                             case '\n':
3735                                 silentLineFeed();
3736                                 // CPPONLY: MOZ_FALLTHROUGH;
3737                             default:
3738                                 /*
3739                                  * Emit the current input character as a
3740                                  * character token. Stay in the RCDATA state.
3741                                  */
3742                                 continue;
3743                         }
3744                     }
3745                 case RAWTEXT:
3746                     rawtextloop: for (;;) {
3747                         if (reconsume) {
3748                             reconsume = false;
3749                         } else {
3750                             if (++pos == endPos) {
3751                                 break stateloop;
3752                             }
3753                             c = checkChar(buf, pos);
3754                         }
3755                         switch (c) {
3756                             case '<':
3757                                 /*
3758                                  * U+003C LESS-THAN SIGN (<) Switch to the
3759                                  * RAWTEXT less-than sign state.
3760                                  */
3761                                 flushChars(buf, pos);
3762 
3763                                 returnState = state;
3764                                 state = transition(state, Tokenizer.RAWTEXT_RCDATA_LESS_THAN_SIGN, reconsume, pos);
3765                                 break rawtextloop;
3766                             // FALL THRU continue stateloop;
3767                             case '\u0000':
3768                                 emitReplacementCharacter(buf, pos);
3769                                 continue;
3770                             case '\r':
3771                                 emitCarriageReturn(buf, pos);
3772                                 break stateloop;
3773                             case '\n':
3774                                 silentLineFeed();
3775                                 // CPPONLY: MOZ_FALLTHROUGH;
3776                             default:
3777                                 /*
3778                                  * Emit the current input character as a
3779                                  * character token. Stay in the RAWTEXT state.
3780                                  */
3781                                 continue;
3782                         }
3783                     }
3784                     // CPPONLY: MOZ_FALLTHROUGH;
3785                 case RAWTEXT_RCDATA_LESS_THAN_SIGN:
3786                     rawtextrcdatalessthansignloop: for (;;) {
3787                         if (++pos == endPos) {
3788                             break stateloop;
3789                         }
3790                         c = checkChar(buf, pos);
3791                         switch (c) {
3792                             case '/':
3793                                 /*
3794                                  * U+002F SOLIDUS (/) Set the temporary buffer
                                 * to the empty string. Switch to the non-data
                                 * end tag name state.
3797                                  */
3798                                 index = 0;
3799                                 clearStrBufBeforeUse();
3800                                 state = transition(state, Tokenizer.NON_DATA_END_TAG_NAME, reconsume, pos);
3801                                 break rawtextrcdatalessthansignloop;
3802                             // FALL THRU continue stateloop;
3803                             default:
3804                                 /*
3805                                  * Otherwise, emit a U+003C LESS-THAN SIGN
3806                                  * character token
3807                                  */
3808                                 tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
3809                                 /*
                                 * and reconsume the current input character in
                                 * the return state.
3812                                  */
3813                                 cstart = pos;
3814                                 reconsume = true;
3815                                 state = transition(state, returnState, reconsume, pos);
3816                                 continue stateloop;
3817                         }
3818                     }
3819                     // CPPONLY: MOZ_FALLTHROUGH;
3820                 case NON_DATA_END_TAG_NAME:
3821                     for (;;) {
3822                         if (++pos == endPos) {
3823                             break stateloop;
3824                         }
3825                         c = checkChar(buf, pos);
3826                         /*
3827                          * ASSERT! when entering this state, set index to 0 and
3828                          * call clearStrBufBeforeUse(); Let's implement the above
3829                          * without lookahead. strBuf is the 'temporary buffer'.
3830                          */
3831                         if (endTagExpectationAsArray == null) {
3832                             tokenHandler.characters(Tokenizer.LT_SOLIDUS,
3833                                     0, 2);
3834                             cstart = pos;
3835                             reconsume = true;
3836                             state = transition(state, returnState, reconsume, pos);
3837                             continue stateloop;
3838                         } else if (index < endTagExpectationAsArray.length) {
3839                             char e = endTagExpectationAsArray[index];
3840                             char folded = c;
3841                             if (c >= 'A' && c <= 'Z') {
3842                                 folded += 0x20;
3843                             }
3844                             if (folded != e) {
3845                                 // [NOCPP[
3846                                 errHtml4LtSlashInRcdata(folded);
3847                                 // ]NOCPP]
3848                                 tokenHandler.characters(Tokenizer.LT_SOLIDUS,
3849                                         0, 2);
3850                                 emitStrBuf();
3851                                 cstart = pos;
3852                                 reconsume = true;
3853                                 state = transition(state, returnState, reconsume, pos);
3854                                 continue stateloop;
3855                             }
3856                             appendStrBuf(c);
3857                             index++;
3858                             continue;
3859                         } else {
3860                             endTag = true;
3861                             // XXX replace contentModelElement with different
3862                             // type
3863                             tagName = endTagExpectation;
3864                             switch (c) {
3865                                 case '\r':
3866                                     silentCarriageReturn();
3867                                     clearStrBufAfterUse(); // strBuf not used
3868                                     state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
3869                                     break stateloop;
3870                                 case '\n':
3871                                     silentLineFeed();
3872                                     // CPPONLY: MOZ_FALLTHROUGH;
3873                                 case ' ':
3874                                 case '\t':
3875                                 case '\u000C':
3876                                     /*
3877                                      * U+0009 CHARACTER TABULATION U+000A LINE
3878                                      * FEED (LF) U+000C FORM FEED (FF) U+0020
3879                                      * SPACE If the current end tag token is an
3880                                      * appropriate end tag token, then switch to
3881                                      * the before attribute name state.
3882                                      */
3883                                     clearStrBufAfterUse(); // strBuf not used
3884                                     state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
3885                                     continue stateloop;
3886                                 case '/':
3887                                     /*
3888                                      * U+002F SOLIDUS (/) If the current end tag
3889                                      * token is an appropriate end tag token,
3890                                      * then switch to the self-closing start tag
3891                                      * state.
3892                                      */
3893                                     clearStrBufAfterUse(); // strBuf not used
3894                                     state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);
3895                                     continue stateloop;
3896                                 case '>':
3897                                     /*
3898                                      * U+003E GREATER-THAN SIGN (>) If the
3899                                      * current end tag token is an appropriate
3900                                      * end tag token, then emit the current tag
3901                                      * token and switch to the data state.
3902                                      */
3903                                     clearStrBufAfterUse(); // strBuf not used
3904                                     state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
3905                                     if (shouldSuspend) {
3906                                         break stateloop;
3907                                     }
3908                                     continue stateloop;
3909                                 default:
3910                                     /*
3911                                      * Emit a U+003C LESS-THAN SIGN character
3912                                      * token, a U+002F SOLIDUS character token,
3913                                      * a character token for each of the
3914                                      * characters in the temporary buffer (in
3915                                      * the order they were added to the buffer),
3916                                      * and reconsume the current input character
3917                                      * in the RAWTEXT state.
3918                                      */
3919                                     // [NOCPP[
3920                                     errWarnLtSlashInRcdata();
3921                                     // ]NOCPP]
3922                                     tokenHandler.characters(
3923                                             Tokenizer.LT_SOLIDUS, 0, 2);
3924                                     emitStrBuf();
3925                                     cstart = pos; // don't drop the
3926                                                   // character
3927                                     reconsume = true;
3928                                     state = transition(state, returnState, reconsume, pos);
3929                                     continue stateloop;
3930                             }
3931                         }
3932                     }
3933                     // BEGIN HOTSPOT WORKAROUND
3934                 case BOGUS_COMMENT:
3935                     boguscommentloop: for (;;) {
3936                         if (reconsume) {
3937                             reconsume = false;
3938                         } else {
3939                             if (++pos == endPos) {
3940                                 break stateloop;
3941                             }
3942                             c = checkChar(buf, pos);
3943                         }
3944                         /*
3945                          * Consume every character up to and including the first
3946                          * U+003E GREATER-THAN SIGN character (>) or the end of
3947                          * the file (EOF), whichever comes first. Emit a comment
3948                          * token whose data is the concatenation of all the
3949                          * characters starting from and including the character
3950                          * that caused the state machine to switch into the
3951                          * bogus comment state, up to and including the
3952                          * character immediately before the last consumed
3953                          * character (i.e. up to the character just before the
3954                          * U+003E or EOF character). (If the comment was started
3955                          * by the end of the file (EOF), the token is empty.)
3956                          *
3957                          * Switch to the data state.
3958                          *
3959                          * If the end of the file was reached, reconsume the EOF
3960                          * character.
3961                          */
3962                         switch (c) {
3963                             case '>':
3964                                 emitComment(0, pos);
3965                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
3966                                 continue stateloop;
3967                             case '-':
3968                                 appendStrBuf(c);
3969                                 state = transition(state, Tokenizer.BOGUS_COMMENT_HYPHEN, reconsume, pos);
3970                                 break boguscommentloop;
3971                             case '\r':
3972                                 appendStrBufCarriageReturn();
3973                                 break stateloop;
3974                             case '\n':
3975                                 appendStrBufLineFeed();
3976                                 continue;
3977                             case '\u0000':
3978                                 c = '\uFFFD';
3979                                 // CPPONLY: MOZ_FALLTHROUGH;
3980                             default:
3981                                 appendStrBuf(c);
3982                                 continue;
3983                         }
3984                     }
3985                     // CPPONLY: MOZ_FALLTHROUGH;
3986                 case BOGUS_COMMENT_HYPHEN:
3987                     boguscommenthyphenloop: for (;;) {
3988                         if (++pos == endPos) {
3989                             break stateloop;
3990                         }
3991                         c = checkChar(buf, pos);
3992                         switch (c) {
3993                             case '>':
3994                                 // [NOCPP[
3995                                 maybeAppendSpaceToBogusComment();
3996                                 // ]NOCPP]
3997                                 emitComment(0, pos);
3998                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
3999                                 continue stateloop;
4000                             case '-':
4001                                 appendSecondHyphenToBogusComment();
4002                                 continue boguscommenthyphenloop;
4003                             case '\r':
4004                                 appendStrBufCarriageReturn();
4005                                 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
4006                                 break stateloop;
4007                             case '\n':
4008                                 appendStrBufLineFeed();
4009                                 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
4010                                 continue stateloop;
4011                             case '\u0000':
4012                                 c = '\uFFFD';
4013                                 // CPPONLY: MOZ_FALLTHROUGH;
4014                             default:
4015                                 appendStrBuf(c);
4016                                 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
4017                                 continue stateloop;
4018                         }
4019                     }
4020                 case SCRIPT_DATA:
4021                     scriptdataloop: for (;;) {
4022                         if (reconsume) {
4023                             reconsume = false;
4024                         } else {
4025                             if (++pos == endPos) {
4026                                 break stateloop;
4027                             }
4028                             c = checkChar(buf, pos);
4029                         }
4030                         switch (c) {
4031                             case '<':
4032                                 /*
4033                                  * U+003C LESS-THAN SIGN (<) Switch to the
4034                                  * script data less-than sign state.
4035                                  */
4036                                 flushChars(buf, pos);
4037                                 returnState = state;
4038                                 state = transition(state, Tokenizer.SCRIPT_DATA_LESS_THAN_SIGN, reconsume, pos);
4039                                 break scriptdataloop; // FALL THRU continue
4040                             // stateloop;
4041                             case '\u0000':
4042                                 emitReplacementCharacter(buf, pos);
4043                                 continue;
4044                             case '\r':
4045                                 emitCarriageReturn(buf, pos);
4046                                 break stateloop;
4047                             case '\n':
4048                                 silentLineFeed();
4049                                 // CPPONLY: MOZ_FALLTHROUGH;
4050                             default:
4051                                 /*
4052                                  * Anything else Emit the current input
4053                                  * character as a character token. Stay in the
4054                                  * script data state.
4055                                  */
4056                                 continue;
4057                         }
4058                     }
4059                     // CPPONLY: MOZ_FALLTHROUGH;
4060                 case SCRIPT_DATA_LESS_THAN_SIGN:
4061                     scriptdatalessthansignloop: for (;;) {
4062                         if (++pos == endPos) {
4063                             break stateloop;
4064                         }
4065                         c = checkChar(buf, pos);
4066                         switch (c) {
4067                             case '/':
4068                                 /*
4069                                  * U+002F SOLIDUS (/) Set the temporary buffer
4070                                  * to the empty string. Switch to the script
4071                                  * data end tag open state.
4072                                  */
4073                                 index = 0;
4074                                 clearStrBufBeforeUse();
4075                                 state = transition(state, Tokenizer.NON_DATA_END_TAG_NAME, reconsume, pos);
4076                                 continue stateloop;
4077                             case '!':
4078                                 tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
4079                                 cstart = pos;
4080                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPE_START, reconsume, pos);
4081                                 break scriptdatalessthansignloop; // FALL THRU
4082                             // continue
4083                             // stateloop;
4084                             default:
4085                                 /*
4086                                  * Otherwise, emit a U+003C LESS-THAN SIGN
4087                                  * character token
4088                                  */
4089                                 tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
4090                                 /*
4091                                  * and reconsume the current input character in
4092                                  * the data state.
4093                                  */
4094                                 cstart = pos;
4095                                 reconsume = true;
4096                                 state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos);
4097                                 continue stateloop;
4098                         }
4099                     }
4100                     // CPPONLY: MOZ_FALLTHROUGH;
4101                 case SCRIPT_DATA_ESCAPE_START:
4102                     scriptdataescapestartloop: for (;;) {
4103                         if (++pos == endPos) {
4104                             break stateloop;
4105                         }
4106                         c = checkChar(buf, pos);
4107                         /*
4108                          * Consume the next input character:
4109                          */
4110                         switch (c) {
4111                             case '-':
4112                                 /*
4113                                  * U+002D HYPHEN-MINUS (-) Emit a U+002D
4114                                  * HYPHEN-MINUS character token. Switch to the
4115                                  * script data escape start dash state.
4116                                  */
4117                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPE_START_DASH, reconsume, pos);
4118                                 break scriptdataescapestartloop; // FALL THRU
4119                             // continue
4120                             // stateloop;
4121                             default:
4122                                 /*
4123                                  * Anything else Reconsume the current input
4124                                  * character in the script data state.
4125                                  */
4126                                 reconsume = true;
4127                                 state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos);
4128                                 continue stateloop;
4129                         }
4130                     }
4131                     // CPPONLY: MOZ_FALLTHROUGH;
4132                 case SCRIPT_DATA_ESCAPE_START_DASH:
4133                     scriptdataescapestartdashloop: for (;;) {
4134                         if (++pos == endPos) {
4135                             break stateloop;
4136                         }
4137                         c = checkChar(buf, pos);
4138                         /*
4139                          * Consume the next input character:
4140                          */
4141                         switch (c) {
4142                             case '-':
4143                                 /*
4144                                  * U+002D HYPHEN-MINUS (-) Emit a U+002D
4145                                  * HYPHEN-MINUS character token. Switch to the
4146                                  * script data escaped dash dash state.
4147                                  */
4148                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_DASH_DASH, reconsume, pos);
4149                                 break scriptdataescapestartdashloop;
4150                             // continue stateloop;
4151                             default:
4152                                 /*
4153                                  * Anything else Reconsume the current input
4154                                  * character in the script data state.
4155                                  */
4156                                 reconsume = true;
4157                                 state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos);
4158                                 continue stateloop;
4159                         }
4160                     }
4161                     // CPPONLY: MOZ_FALLTHROUGH;
4162                 case SCRIPT_DATA_ESCAPED_DASH_DASH:
4163                     scriptdataescapeddashdashloop: for (;;) {
4164                         if (++pos == endPos) {
4165                             break stateloop;
4166                         }
4167                         c = checkChar(buf, pos);
4168                         /*
4169                          * Consume the next input character:
4170                          */
4171                         switch (c) {
4172                             case '-':
4173                                 /*
4174                                  * U+002D HYPHEN-MINUS (-) Emit a U+002D
4175                                  * HYPHEN-MINUS character token. Stay in the
4176                                  * script data escaped dash dash state.
4177                                  */
4178                                 continue;
4179                             case '<':
4180                                 /*
4181                                  * U+003C LESS-THAN SIGN (<) Switch to the
4182                                  * script data escaped less-than sign state.
4183                                  */
4184                                 flushChars(buf, pos);
4185                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN, reconsume, pos);
4186                                 continue stateloop;
4187                             case '>':
4188                                 /*
4189                                  * U+003E GREATER-THAN SIGN (>) Emit a U+003E
4190                                  * GREATER-THAN SIGN character token. Switch to
4191                                  * the script data state.
4192                                  */
4193                                 state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos);
4194                                 continue stateloop;
4195                             case '\u0000':
4196                                 emitReplacementCharacter(buf, pos);
4197                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
4198                                 break scriptdataescapeddashdashloop;
4199                             case '\r':
4200                                 emitCarriageReturn(buf, pos);
4201                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
4202                                 break stateloop;
4203                             case '\n':
4204                                 silentLineFeed();
4205                                 // CPPONLY: MOZ_FALLTHROUGH;
4206                             default:
4207                                 /*
4208                                  * Anything else Emit the current input
4209                                  * character as a character token. Switch to the
4210                                  * script data escaped state.
4211                                  */
4212                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
4213                                 break scriptdataescapeddashdashloop;
4214                             // continue stateloop;
4215                         }
4216                     }
4217                     // CPPONLY: MOZ_FALLTHROUGH;
4218                 case SCRIPT_DATA_ESCAPED:
4219                     scriptdataescapedloop: for (;;) {
4220                         if (reconsume) {
4221                             reconsume = false;
4222                         } else {
4223                             if (++pos == endPos) {
4224                                 break stateloop;
4225                             }
4226                             c = checkChar(buf, pos);
4227                         }
4228                         /*
4229                          * Consume the next input character:
4230                          */
4231                         switch (c) {
4232                             case '-':
4233                                 /*
4234                                  * U+002D HYPHEN-MINUS (-) Emit a U+002D
4235                                  * HYPHEN-MINUS character token. Switch to the
4236                                  * script data escaped dash state.
4237                                  */
4238                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_DASH, reconsume, pos);
4239                                 break scriptdataescapedloop; // FALL THRU
4240                             // continue
4241                             // stateloop;
4242                             case '<':
4243                                 /*
4244                                  * U+003C LESS-THAN SIGN (<) Switch to the
4245                                  * script data escaped less-than sign state.
4246                                  */
4247                                 flushChars(buf, pos);
4248                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN, reconsume, pos);
4249                                 continue stateloop;
4250                             case '\u0000':
4251                                 emitReplacementCharacter(buf, pos);
4252                                 continue;
4253                             case '\r':
4254                                 emitCarriageReturn(buf, pos);
4255                                 break stateloop;
4256                             case '\n':
4257                                 silentLineFeed();
4258                                 // CPPONLY: MOZ_FALLTHROUGH;
4259                             default:
4260                                 /*
4261                                  * Anything else Emit the current input
4262                                  * character as a character token. Stay in the
4263                                  * script data escaped state.
4264                                  */
4265                                 continue;
4266                         }
4267                     }
4268                     // CPPONLY: MOZ_FALLTHROUGH;
4269                 case SCRIPT_DATA_ESCAPED_DASH:
4270                     scriptdataescapeddashloop: for (;;) {
4271                         if (++pos == endPos) {
4272                             break stateloop;
4273                         }
4274                         c = checkChar(buf, pos);
4275                         /*
4276                          * Consume the next input character:
4277                          */
4278                         switch (c) {
4279                             case '-':
4280                                 /*
4281                                  * U+002D HYPHEN-MINUS (-) Emit a U+002D
4282                                  * HYPHEN-MINUS character token. Switch to the
4283                                  * script data escaped dash dash state.
4284                                  */
4285                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_DASH_DASH, reconsume, pos);
4286                                 continue stateloop;
4287                             case '<':
4288                                 /*
4289                                  * U+003C LESS-THAN SIGN (<) Switch to the
4290                                  * script data escaped less-than sign state.
4291                                  */
4292                                 flushChars(buf, pos);
4293                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN, reconsume, pos);
4294                                 break scriptdataescapeddashloop;
4295                             // continue stateloop;
4296                             case '\u0000':
4297                                 emitReplacementCharacter(buf, pos);
4298                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
4299                                 continue stateloop;
4300                             case '\r':
4301                                 emitCarriageReturn(buf, pos);
4302                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
4303                                 break stateloop;
4304                             case '\n':
4305                                 silentLineFeed();
4306                                 // CPPONLY: MOZ_FALLTHROUGH;
4307                             default:
4308                                 /*
4309                                  * Anything else Emit the current input
4310                                  * character as a character token. Switch to the
4311                                  * script data escaped state.
4312                                  */
4313                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
4314                                 continue stateloop;
4315                         }
4316                     }
4317                     // CPPONLY: MOZ_FALLTHROUGH;
4318                 case SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN:
4319                     scriptdataescapedlessthanloop: for (;;) {
4320                         if (++pos == endPos) {
4321                             break stateloop;
4322                         }
4323                         c = checkChar(buf, pos);
4324                         /*
4325                          * Consume the next input character:
4326                          */
4327                         switch (c) {
4328                             case '/':
4329                                 /*
4330                                  * U+002F SOLIDUS (/) Set the temporary buffer
4331                                  * to the empty string. Switch to the script
4332                                  * data escaped end tag open state.
4333                                  */
4334                                 index = 0;
4335                                 clearStrBufBeforeUse();
4336                                 returnState = Tokenizer.SCRIPT_DATA_ESCAPED;
4337                                 state = transition(state, Tokenizer.NON_DATA_END_TAG_NAME, reconsume, pos);
4338                                 continue stateloop;
4339                             case 'S':
4340                             case 's':
4341                                 /*
4342                                  * U+0041 LATIN CAPITAL LETTER A through to
4343                                  * U+005A LATIN CAPITAL LETTER Z Emit a U+003C
4344                                  * LESS-THAN SIGN character token and the
4345                                  * current input character as a character token.
4346                                  */
4347                                 tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
4348                                 cstart = pos;
4349                                 index = 1;
4350                                 /*
4351                                  * Set the temporary buffer to the empty string.
4352                                  * Append the lowercase version of the current
4353                                  * input character (add 0x0020 to the
4354                                  * character's code point) to the temporary
4355                                  * buffer. Switch to the script data double
4356                                  * escape start state.
4357                                  */
4358                                 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPE_START, reconsume, pos);
4359                                 break scriptdataescapedlessthanloop;
4360                             // continue stateloop;
4361                             default:
4362                                 /*
4363                                  * Anything else Emit a U+003C LESS-THAN SIGN
4364                                  * character token and reconsume the current
4365                                  * input character in the script data escaped
4366                                  * state.
4367                                  */
4368                                 tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
4369                                 cstart = pos;
4370                                 reconsume = true;
4371                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
4372                                 continue stateloop;
4373                         }
4374                     }
4375                     // CPPONLY: MOZ_FALLTHROUGH;
4376                 case SCRIPT_DATA_DOUBLE_ESCAPE_START:
4377                     scriptdatadoubleescapestartloop: for (;;) {
4378                         if (++pos == endPos) {
4379                             break stateloop;
4380                         }
4381                         c = checkChar(buf, pos);
4382                         assert index > 0;
4383                         if (index < 6) { // SCRIPT_ARR.length
4384                             char folded = c;
4385                             if (c >= 'A' && c <= 'Z') {
4386                                 folded += 0x20;
4387                             }
4388                             if (folded != Tokenizer.SCRIPT_ARR[index]) {
4389                                 reconsume = true;
4390                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
4391                                 continue stateloop;
4392                             }
4393                             index++;
4394                             continue;
4395                         }
4396                         switch (c) {
4397                             case '\r':
4398                                 emitCarriageReturn(buf, pos);
4399                                 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
4400                                 break stateloop;
4401                             case '\n':
4402                                 silentLineFeed();
4403                                 // CPPONLY: MOZ_FALLTHROUGH;
4404                             case ' ':
4405                             case '\t':
4406                             case '\u000C':
4407                             case '/':
4408                             case '>':
4409                                 /*
4410                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED
4411                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE
4412                                  * U+002F SOLIDUS (/) U+003E GREATER-THAN SIGN
4413                                  * (>) Emit the current input character as a
4414                                  * character token. If the temporary buffer is
4415                                  * the string "script", then switch to the
4416                                  * script data double escaped state.
4417                                  */
4418                                 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
4419                                 break scriptdatadoubleescapestartloop;
4420                             // continue stateloop;
4421                             default:
4422                                 /*
4423                                  * Anything else Reconsume the current input
4424                                  * character in the script data escaped state.
4425                                  */
4426                                 reconsume = true;
4427                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
4428                                 continue stateloop;
4429                         }
4430                     }
4431                     // CPPONLY: MOZ_FALLTHROUGH;
4432                 case SCRIPT_DATA_DOUBLE_ESCAPED:
4433                     scriptdatadoubleescapedloop: for (;;) {
4434                         if (reconsume) {
4435                             reconsume = false;
4436                         } else {
4437                             if (++pos == endPos) {
4438                                 break stateloop;
4439                             }
4440                             c = checkChar(buf, pos);
4441                         }
4442                         /*
4443                          * Consume the next input character:
4444                          */
4445                         switch (c) {
4446                             case '-':
4447                                 /*
4448                                  * U+002D HYPHEN-MINUS (-) Emit a U+002D
4449                                  * HYPHEN-MINUS character token. Switch to the
4450                                  * script data double escaped dash state.
4451                                  */
4452                                 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_DASH, reconsume, pos);
4453                                 break scriptdatadoubleescapedloop; // FALL THRU
4454                             // continue
4455                             // stateloop;
4456                             case '<':
4457                                 /*
4458                                  * U+003C LESS-THAN SIGN (<) Emit a U+003C
4459                                  * LESS-THAN SIGN character token. Switch to the
4460                                  * script data double escaped less-than sign
4461                                  * state.
4462                                  */
4463                                 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN, reconsume, pos);
4464                                 continue stateloop;
4465                             case '\u0000':
4466                                 emitReplacementCharacter(buf, pos);
4467                                 continue;
4468                             case '\r':
4469                                 emitCarriageReturn(buf, pos);
4470                                 break stateloop;
4471                             case '\n':
4472                                 silentLineFeed();
4473                                 // CPPONLY: MOZ_FALLTHROUGH;
4474                             default:
4475                                 /*
4476                                  * Anything else Emit the current input
4477                                  * character as a character token. Stay in the
4478                                  * script data double escaped state.
4479                                  */
4480                                 continue;
4481                         }
4482                     }
4483                     // CPPONLY: MOZ_FALLTHROUGH;
4484                 case SCRIPT_DATA_DOUBLE_ESCAPED_DASH:
4485                     scriptdatadoubleescapeddashloop: for (;;) {
4486                         if (++pos == endPos) {
4487                             break stateloop;
4488                         }
4489                         c = checkChar(buf, pos);
4490                         /*
4491                          * Consume the next input character:
4492                          */
4493                         switch (c) {
4494                             case '-':
4495                                 /*
4496                                  * U+002D HYPHEN-MINUS (-) Emit a U+002D
4497                                  * HYPHEN-MINUS character token. Switch to the
4498                                  * script data double escaped dash dash state.
4499                                  */
4500                                 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH, reconsume, pos);
4501                                 break scriptdatadoubleescapeddashloop;
4502                             // continue stateloop;
4503                             case '<':
4504                                 /*
4505                                  * U+003C LESS-THAN SIGN (<) Emit a U+003C
4506                                  * LESS-THAN SIGN character token. Switch to the
4507                                  * script data double escaped less-than sign
4508                                  * state.
4509                                  */
4510                                 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN, reconsume, pos);
4511                                 continue stateloop;
4512                             case '\u0000':
4513                                 emitReplacementCharacter(buf, pos);
4514                                 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
4515                                 continue stateloop;
4516                             case '\r':
4517                                 emitCarriageReturn(buf, pos);
4518                                 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
4519                                 break stateloop;
4520                             case '\n':
4521                                 silentLineFeed();
4522                                 // CPPONLY: MOZ_FALLTHROUGH;
4523                             default:
4524                                 /*
4525                                  * Anything else Emit the current input
4526                                  * character as a character token. Switch to the
4527                                  * script data double escaped state.
4528                                  */
4529                                 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
4530                                 continue stateloop;
4531                         }
4532                     }
4533                     // CPPONLY: MOZ_FALLTHROUGH;
4534                 case SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH:
4535                     scriptdatadoubleescapeddashdashloop: for (;;) {
4536                         if (++pos == endPos) {
4537                             break stateloop;
4538                         }
4539                         c = checkChar(buf, pos);
4540                         /*
4541                          * Consume the next input character:
4542                          */
4543                         switch (c) {
4544                             case '-':
4545                                 /*
4546                                  * U+002D HYPHEN-MINUS (-) Emit a U+002D
4547                                  * HYPHEN-MINUS character token. Stay in the
4548                                  * script data double escaped dash dash state.
4549                                  */
4550                                 continue;
4551                             case '<':
4552                                 /*
4553                                  * U+003C LESS-THAN SIGN (<) Emit a U+003C
4554                                  * LESS-THAN SIGN character token. Switch to the
4555                                  * script data double escaped less-than sign
4556                                  * state.
4557                                  */
4558                                 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN, reconsume, pos);
4559                                 break scriptdatadoubleescapeddashdashloop;
4560                             case '>':
4561                                 /*
4562                                  * U+003E GREATER-THAN SIGN (>) Emit a U+003E
4563                                  * GREATER-THAN SIGN character token. Switch to
4564                                  * the script data state.
4565                                  */
4566                                 state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos);
4567                                 continue stateloop;
4568                             case '\u0000':
4569                                 emitReplacementCharacter(buf, pos);
4570                                 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
4571                                 continue stateloop;
4572                             case '\r':
4573                                 emitCarriageReturn(buf, pos);
4574                                 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
4575                                 break stateloop;
4576                             case '\n':
4577                                 silentLineFeed();
4578                                 // CPPONLY: MOZ_FALLTHROUGH;
4579                             default:
4580                                 /*
4581                                  * Anything else Emit the current input
4582                                  * character as a character token. Switch to the
4583                                  * script data double escaped state.
4584                                  */
4585                                 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
4586                                 continue stateloop;
4587                         }
4588                     }
4589                     // CPPONLY: MOZ_FALLTHROUGH;
4590                 case SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN:
4591                     scriptdatadoubleescapedlessthanloop: for (;;) {
4592                         if (++pos == endPos) {
4593                             break stateloop;
4594                         }
4595                         c = checkChar(buf, pos);
4596                         /*
4597                          * Consume the next input character:
4598                          */
4599                         switch (c) {
4600                             case '/':
4601                                 /*
4602                                  * U+002F SOLIDUS (/) Emit a U+002F SOLIDUS
4603                                  * character token. Set the temporary buffer to
4604                                  * the empty string. Switch to the script data
4605                                  * double escape end state.
4606                                  */
4607                                 index = 0;
4608                                 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPE_END, reconsume, pos);
4609                                 break scriptdatadoubleescapedlessthanloop;
4610                             default:
4611                                 /*
4612                                  * Anything else Reconsume the current input
4613                                  * character in the script data double escaped
4614                                  * state.
4615                                  */
4616                                 reconsume = true;
4617                                 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
4618                                 continue stateloop;
4619                         }
4620                     }
4621                     // CPPONLY: MOZ_FALLTHROUGH;
4622                 case SCRIPT_DATA_DOUBLE_ESCAPE_END:
4623                     scriptdatadoubleescapeendloop: for (;;) {
4624                         if (++pos == endPos) {
4625                             break stateloop;
4626                         }
4627                         c = checkChar(buf, pos);
4628                         if (index < 6) { // SCRIPT_ARR.length
4629                             char folded = c;
4630                             if (c >= 'A' && c <= 'Z') {
4631                                 folded += 0x20;
4632                             }
4633                             if (folded != Tokenizer.SCRIPT_ARR[index]) {
4634                                 reconsume = true;
4635                                 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
4636                                 continue stateloop;
4637                             }
4638                             index++;
4639                             continue;
4640                         }
4641                         switch (c) {
4642                             case '\r':
4643                                 emitCarriageReturn(buf, pos);
4644                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
4645                                 break stateloop;
4646                             case '\n':
4647                                 silentLineFeed();
4648                                 // CPPONLY: MOZ_FALLTHROUGH;
4649                             case ' ':
4650                             case '\t':
4651                             case '\u000C':
4652                             case '/':
4653                             case '>':
4654                                 /*
4655                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED
4656                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE
4657                                  * U+002F SOLIDUS (/) U+003E GREATER-THAN SIGN
4658                                  * (>) Emit the current input character as a
4659                                  * character token. If the temporary buffer is
4660                                  * the string "script", then switch to the
4661                                  * script data escaped state.
4662                                  */
4663                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
4664                                 continue stateloop;
4665                             default:
4666                                 /*
4667                                  * Reconsume the current input character in the
4668                                  * script data double escaped state.
4669                                  */
4670                                 reconsume = true;
4671                                 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
4672                                 continue stateloop;
4673                         }
4674                     }
4675                 case MARKUP_DECLARATION_OCTYPE:
4676                     markupdeclarationdoctypeloop: for (;;) {
4677                         if (++pos == endPos) {
4678                             break stateloop;
4679                         }
4680                         c = checkChar(buf, pos);
4681                         if (index < 6) { // OCTYPE.length
4682                             char folded = c;
4683                             if (c >= 'A' && c <= 'Z') {
4684                                 folded += 0x20;
4685                             }
4686                             if (folded == Tokenizer.OCTYPE[index]) {
4687                                 appendStrBuf(c);
4688                             } else {
4689                                 errBogusComment();
4690                                 reconsume = true;
4691                                 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
4692                                 continue stateloop;
4693                             }
4694                             index++;
4695                             continue;
4696                         } else {
4697                             reconsume = true;
4698                             state = transition(state, Tokenizer.DOCTYPE, reconsume, pos);
4699                             break markupdeclarationdoctypeloop;
4700                             // continue stateloop;
4701                         }
4702                     }
4703                     // CPPONLY: MOZ_FALLTHROUGH;
4704                 case DOCTYPE:
4705                     doctypeloop: for (;;) {
4706                         if (reconsume) {
4707                             reconsume = false;
4708                         } else {
4709                             if (++pos == endPos) {
4710                                 break stateloop;
4711                             }
4712                             c = checkChar(buf, pos);
4713                         }
4714                         initDoctypeFields();
4715                         /*
4716                          * Consume the next input character:
4717                          */
4718                         switch (c) {
4719                             case '\r':
4720                                 silentCarriageReturn();
4721                                 state = transition(state, Tokenizer.BEFORE_DOCTYPE_NAME, reconsume, pos);
4722                                 break stateloop;
4723                             case '\n':
4724                                 silentLineFeed();
4725                                 // CPPONLY: MOZ_FALLTHROUGH;
4726                             case ' ':
4727                             case '\t':
4728                             case '\u000C':
4729                                 /*
4730                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED
4731                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE
4732                                  * Switch to the before DOCTYPE name state.
4733                                  */
4734                                 state = transition(state, Tokenizer.BEFORE_DOCTYPE_NAME, reconsume, pos);
4735                                 break doctypeloop;
4736                             // continue stateloop;
4737                             default:
4738                                 /*
4739                                  * Anything else Parse error.
4740                                  */
4741                                 errMissingSpaceBeforeDoctypeName();
4742                                 /*
4743                                  * Reconsume the current character in the before
4744                                  * DOCTYPE name state.
4745                                  */
4746                                 reconsume = true;
4747                                 state = transition(state, Tokenizer.BEFORE_DOCTYPE_NAME, reconsume, pos);
4748                                 break doctypeloop;
4749                             // continue stateloop;
4750                         }
4751                     }
4752                     // CPPONLY: MOZ_FALLTHROUGH;
4753                 case BEFORE_DOCTYPE_NAME:
4754                     beforedoctypenameloop: for (;;) {
4755                         if (reconsume) {
4756                             reconsume = false;
4757                         } else {
4758                             if (++pos == endPos) {
4759                                 break stateloop;
4760                             }
4761                             c = checkChar(buf, pos);
4762                         }
4763                         /*
4764                          * Consume the next input character:
4765                          */
4766                         switch (c) {
4767                             case '\r':
4768                                 silentCarriageReturn();
4769                                 break stateloop;
4770                             case '\n':
4771                                 silentLineFeed();
4772                                 // CPPONLY: MOZ_FALLTHROUGH;
4773                             case ' ':
4774                             case '\t':
4775                             case '\u000C':
4776                                 /*
4777                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED
4778                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
4779                                  * in the before DOCTYPE name state.
4780                                  */
4781                                 continue;
4782                             case '>':
4783                                 /*
4784                                  * U+003E GREATER-THAN SIGN (>) Parse error.
4785                                  */
4786                                 errNamelessDoctype();
4787                                 /*
4788                                  * Create a new DOCTYPE token. Set its
4789                                  * force-quirks flag to on.
4790                                  */
4791                                 forceQuirks = true;
4792                                 /*
4793                                  * Emit the token.
4794                                  */
4795                                 emitDoctypeToken(pos);
4796                                 /*
4797                                  * Switch to the data state.
4798                                  */
4799                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
4800                                 continue stateloop;
4801                             case '\u0000':
4802                                 c = '\uFFFD';
4803                                 // CPPONLY: MOZ_FALLTHROUGH;
4804                             default:
4805                                 if (c >= 'A' && c <= 'Z') {
4806                                     /*
4807                                      * U+0041 LATIN CAPITAL LETTER A through to
4808                                      * U+005A LATIN CAPITAL LETTER Z Create a
4809                                      * new DOCTYPE token. Set the token's name
4810                                      * to the lowercase version of the input
4811                                      * character (add 0x0020 to the character's
4812                                      * code point).
4813                                      */
4814                                     c += 0x20;
4815                                 }
4816                                 /* Anything else Create a new DOCTYPE token. */
4817                                 /*
4818                                  * Set the token's name to the current input
4819                                  * character.
4820                                  */
4821                                 clearStrBufBeforeUse();
4822                                 appendStrBuf(c);
4823                                 /*
4824                                  * Switch to the DOCTYPE name state.
4825                                  */
4826                                 state = transition(state, Tokenizer.DOCTYPE_NAME, reconsume, pos);
4827                                 break beforedoctypenameloop;
4828                             // continue stateloop;
4829                         }
4830                     }
4831                     // CPPONLY: MOZ_FALLTHROUGH;
4832                 case DOCTYPE_NAME:
4833                     doctypenameloop: for (;;) {
4834                         if (++pos == endPos) {
4835                             break stateloop;
4836                         }
4837                         c = checkChar(buf, pos);
4838                         /*
4839                          * Consume the next input character:
4840                          */
4841                         switch (c) {
4842                             case '\r':
4843                                 silentCarriageReturn();
4844                                 strBufToDoctypeName();
4845                                 state = transition(state, Tokenizer.AFTER_DOCTYPE_NAME, reconsume, pos);
4846                                 break stateloop;
4847                             case '\n':
4848                                 silentLineFeed();
4849                                 // CPPONLY: MOZ_FALLTHROUGH;
4850                             case ' ':
4851                             case '\t':
4852                             case '\u000C':
4853                                 /*
4854                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED
4855                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE
4856                                  * Switch to the after DOCTYPE name state.
4857                                  */
4858                                 strBufToDoctypeName();
4859                                 state = transition(state, Tokenizer.AFTER_DOCTYPE_NAME, reconsume, pos);
4860                                 break doctypenameloop;
4861                             // continue stateloop;
4862                             case '>':
4863                                 /*
4864                                  * U+003E GREATER-THAN SIGN (>) Emit the current
4865                                  * DOCTYPE token.
4866                                  */
4867                                 strBufToDoctypeName();
4868                                 emitDoctypeToken(pos);
4869                                 /*
4870                                  * Switch to the data state.
4871                                  */
4872                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
4873                                 continue stateloop;
4874                             case '\u0000':
4875                                 c = '\uFFFD';
4876                                 // CPPONLY: MOZ_FALLTHROUGH;
4877                             default:
4878                                 /*
4879                                  * U+0041 LATIN CAPITAL LETTER A through to
4880                                  * U+005A LATIN CAPITAL LETTER Z Append the
4881                                  * lowercase version of the input character (add
4882                                  * 0x0020 to the character's code point) to the
4883                                  * current DOCTYPE token's name.
4884                                  */
4885                                 if (c >= 'A' && c <= 'Z') {
4886                                     c += 0x0020;
4887                                 }
4888                                 /*
4889                                  * Anything else Append the current input
4890                                  * character to the current DOCTYPE token's
4891                                  * name.
4892                                  */
4893                                 appendStrBuf(c);
4894                                 /*
4895                                  * Stay in the DOCTYPE name state.
4896                                  */
4897                                 continue;
4898                         }
4899                     }
4900                     // CPPONLY: MOZ_FALLTHROUGH;
4901                 case AFTER_DOCTYPE_NAME:
4902                     afterdoctypenameloop: for (;;) {
4903                         if (++pos == endPos) {
4904                             break stateloop;
4905                         }
4906                         c = checkChar(buf, pos);
4907                         /*
4908                          * Consume the next input character:
4909                          */
4910                         switch (c) {
4911                             case '\r':
4912                                 silentCarriageReturn();
4913                                 break stateloop;
4914                             case '\n':
4915                                 silentLineFeed();
4916                                 // CPPONLY: MOZ_FALLTHROUGH;
4917                             case ' ':
4918                             case '\t':
4919                             case '\u000C':
4920                                 /*
4921                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED
4922                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
4923                                  * in the after DOCTYPE name state.
4924                                  */
4925                                 continue;
4926                             case '>':
4927                                 /*
4928                                  * U+003E GREATER-THAN SIGN (>) Emit the current
4929                                  * DOCTYPE token.
4930                                  */
4931                                 emitDoctypeToken(pos);
4932                                 /*
4933                                  * Switch to the data state.
4934                                  */
4935                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
4936                                 continue stateloop;
4937                             case 'p':
4938                             case 'P':
4939                                 index = 0;
4940                                 state = transition(state, Tokenizer.DOCTYPE_UBLIC, reconsume, pos);
4941                                 break afterdoctypenameloop;
4942                             // continue stateloop;
4943                             case 's':
4944                             case 'S':
4945                                 index = 0;
4946                                 state = transition(state, Tokenizer.DOCTYPE_YSTEM, reconsume, pos);
4947                                 continue stateloop;
4948                             default:
4949                                 /*
4950                                  * Otherwise, this is a parse error.
4951                                  */
4952                                 bogusDoctype();
4953 
4954                                 /*
4955                                  * Set the DOCTYPE token's force-quirks flag to
4956                                  * on.
4957                                  */
4958                                 // done by bogusDoctype();
4959                                 /*
4960                                  * Switch to the bogus DOCTYPE state.
4961                                  */
4962                                 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
4963                                 continue stateloop;
4964                         }
4965                     }
4966                     // CPPONLY: MOZ_FALLTHROUGH;
4967                 case DOCTYPE_UBLIC:
4968                     doctypeublicloop: for (;;) {
4969                         if (++pos == endPos) {
4970                             break stateloop;
4971                         }
4972                         c = checkChar(buf, pos);
4973                         /*
4974                          * If the six characters starting from the current input
4975                          * character are an ASCII case-insensitive match for the
4976                          * word "PUBLIC", then consume those characters and
4977                          * switch to the before DOCTYPE public identifier state.
4978                          */
4979                         if (index < 5) { // UBLIC.length
4980                             char folded = c;
4981                             if (c >= 'A' && c <= 'Z') {
4982                                 folded += 0x20;
4983                             }
4984                             if (folded != Tokenizer.UBLIC[index]) {
4985                                 bogusDoctype();
4986                                 // forceQuirks = true;
4987                                 reconsume = true;
4988                                 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
4989                                 continue stateloop;
4990                             }
4991                             index++;
4992                             continue;
4993                         } else {
4994                             reconsume = true;
4995                             state = transition(state, Tokenizer.AFTER_DOCTYPE_PUBLIC_KEYWORD, reconsume, pos);
4996                             break doctypeublicloop;
4997                             // continue stateloop;
4998                         }
4999                     }
5000                     // CPPONLY: MOZ_FALLTHROUGH;
5001                 case AFTER_DOCTYPE_PUBLIC_KEYWORD:
                         /*
                          * After DOCTYPE public keyword state: decides what follows the
                          * keyword. Every branch below either leaves stateloop, re-dispatches
                          * with "continue stateloop", or breaks out of this loop, so the loop
                          * body never runs twice for one entry. Breaking out of the loop
                          * falls through into the BEFORE_DOCTYPE_PUBLIC_IDENTIFIER case that
                          * follows.
                          */
5002                     afterdoctypepublickeywordloop: for (;;) {
                             // When reconsume is set, the character already held in c is
                             // processed again instead of reading a new one.
5003                         if (reconsume) {
5004                             reconsume = false;
5005                         } else {
5006                             if (++pos == endPos) {
5007                                 break stateloop;
5008                             }
5009                             c = checkChar(buf, pos);
5010                         }
5011                         /*
5012                          * Consume the next input character:
5013                          */
5014                         switch (c) {
5015                             case '\r':
5016                                 silentCarriageReturn();
                                     // Record the new state first, then leave stateloop entirely.
                                     // NOTE(review): presumably the caller resumes from this state
                                     // after dealing with the CR -- confirm.
5017                                 state = transition(state, Tokenizer.BEFORE_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos);
5018                                 break stateloop;
5019                             case '\n':
5020                                 silentLineFeed();
5021                                 // CPPONLY: MOZ_FALLTHROUGH;
5022                             case ' ':
5023                             case '\t':
5024                             case '\u000C':
5025                                 /*
5026                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED
5027                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE
5028                                  * Switch to the before DOCTYPE public
5029                                  * identifier state.
5030                                  */
5031                                 state = transition(state, Tokenizer.BEFORE_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos);
5032                                 break afterdoctypepublickeywordloop;
5033                             // Leaving the loop falls through to the next outer case (the state just set); used in place of continue stateloop
5034                             case '"':
5035                                 /*
5036                                  * U+0022 QUOTATION MARK (") Parse Error.
5037                                  */
5038                                 errNoSpaceBetweenDoctypePublicKeywordAndQuote();
5039                                 /*
5040                                  * Set the DOCTYPE token's public identifier to
5041                                  * the empty string (not missing),
5042                                  */
5043                                 clearStrBufBeforeUse();
5044                                 /*
5045                                  * then switch to the DOCTYPE public identifier
5046                                  * (double-quoted) state.
5047                                  */
5048                                 state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);
5049                                 continue stateloop;
5050                             case '\'':
5051                                 /*
5052                                  * U+0027 APOSTROPHE (') Parse Error.
5053                                  */
5054                                 errNoSpaceBetweenDoctypePublicKeywordAndQuote();
5055                                 /*
5056                                  * Set the DOCTYPE token's public identifier to
5057                                  * the empty string (not missing),
5058                                  */
5059                                 clearStrBufBeforeUse();
5060                                 /*
5061                                  * then switch to the DOCTYPE public identifier
5062                                  * (single-quoted) state.
5063                                  */
5064                                 state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);
5065                                 continue stateloop;
5066                             case '>':
5067                                 /* U+003E GREATER-THAN SIGN (>) Parse error. */
5068                                 errExpectedPublicId();
5069                                 /*
5070                                  * Set the DOCTYPE token's force-quirks flag to
5071                                  * on.
5072                                  */
5073                                 forceQuirks = true;
5074                                 /*
5075                                  * Emit that DOCTYPE token.
5076                                  */
5077                                 emitDoctypeToken(pos);
5078                                 /*
5079                                  * Switch to the data state.
5080                                  */
5081                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
5082                                 continue stateloop;
5083                             default:
5084                                 bogusDoctype();
5085                                 /*
5086                                  * Set the DOCTYPE token's force-quirks flag to
5087                                  * on.
5088                                  */
5089                                 // done by bogusDoctype();
5090                                 /*
5091                                  * Switch to the bogus DOCTYPE state.
5092                                  */
5093                                 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
5094                                 continue stateloop;
5095                         }
5096                     }
5097                     // CPPONLY: MOZ_FALLTHROUGH;
5098                 case BEFORE_DOCTYPE_PUBLIC_IDENTIFIER:
                         /*
                          * Before DOCTYPE public identifier state: skips whitespace (the
                          * loop iterates once per whitespace character) until a quote, '>'
                          * or any other character decides the next state. Only the
                          * double-quote branch breaks out of the loop, falling through into
                          * the DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED case that follows;
                          * the other exits go through stateloop.
                          */
5099                     beforedoctypepublicidentifierloop: for (;;) {
5100                         if (++pos == endPos) {
5101                             break stateloop;
5102                         }
5103                         c = checkChar(buf, pos);
5104                         /*
5105                          * Consume the next input character:
5106                          */
5107                         switch (c) {
5108                             case '\r':
5109                                 silentCarriageReturn();
                                     // No transition here: the state is left unchanged while
                                     // stateloop is exited.
5110                                 break stateloop;
5111                             case '\n':
5112                                 silentLineFeed();
5113                                 // CPPONLY: MOZ_FALLTHROUGH;
5114                             case ' ':
5115                             case '\t':
5116                             case '\u000C':
5117                                 /*
5118                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED
5119                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
5120                                  * in the before DOCTYPE public identifier
5121                                  * state.
5122                                  */
5123                                 continue;
5124                             case '"':
5125                                 /*
5126                                  * U+0022 QUOTATION MARK (") Set the DOCTYPE
5127                                  * token's public identifier to the empty string
5128                                  * (not missing),
5129                                  */
5130                                 clearStrBufBeforeUse();
5131                                 /*
5132                                  * then switch to the DOCTYPE public identifier
5133                                  * (double-quoted) state.
5134                                  */
5135                                 state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);
                                     // The target state is the next outer case, so leaving the
                                     // loop falls straight into it.
5136                                 break beforedoctypepublicidentifierloop;
5137                             // continue stateloop;
5138                             case '\'':
5139                                 /*
5140                                  * U+0027 APOSTROPHE (') Set the DOCTYPE token's
5141                                  * public identifier to the empty string (not
5142                                  * missing),
5143                                  */
5144                                 clearStrBufBeforeUse();
5145                                 /*
5146                                  * then switch to the DOCTYPE public identifier
5147                                  * (single-quoted) state.
5148                                  */
5149                                 state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);
5150                                 continue stateloop;
5151                             case '>':
5152                                 /* U+003E GREATER-THAN SIGN (>) Parse error. */
5153                                 errExpectedPublicId();
5154                                 /*
5155                                  * Set the DOCTYPE token's force-quirks flag to
5156                                  * on.
5157                                  */
5158                                 forceQuirks = true;
5159                                 /*
5160                                  * Emit that DOCTYPE token.
5161                                  */
5162                                 emitDoctypeToken(pos);
5163                                 /*
5164                                  * Switch to the data state.
5165                                  */
5166                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
5167                                 continue stateloop;
5168                             default:
5169                                 bogusDoctype();
5170                                 /*
5171                                  * Set the DOCTYPE token's force-quirks flag to
5172                                  * on.
5173                                  */
5174                                 // done by bogusDoctype();
5175                                 /*
5176                                  * Switch to the bogus DOCTYPE state.
5177                                  */
5178                                 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
5179                                 continue stateloop;
5180                         }
5181                     }
5182                     // CPPONLY: MOZ_FALLTHROUGH;
5183                 case DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED:
                         /*
                          * DOCTYPE public identifier (double-quoted) state: appends each
                          * character to the string buffer until the closing quote or a
                          * premature '>' is seen; in both cases the buffer contents are
                          * stored in publicIdentifier via strBufToString(). The closing
                          * quote breaks out of the loop and falls through into the
                          * AFTER_DOCTYPE_PUBLIC_IDENTIFIER case that follows.
                          */
5184                     doctypepublicidentifierdoublequotedloop: for (;;) {
5185                         if (++pos == endPos) {
5186                             break stateloop;
5187                         }
5188                         c = checkChar(buf, pos);
5189                         /*
5190                          * Consume the next input character:
5191                          */
5192                         switch (c) {
5193                             case '"':
5194                                 /*
5195                                  * U+0022 QUOTATION MARK (") Switch to the after
5196                                  * DOCTYPE public identifier state.
5197                                  */
5198                                 publicIdentifier = strBufToString();
5199                                 state = transition(state, Tokenizer.AFTER_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos);
                                     // The target state is the next outer case, so leaving the
                                     // loop falls straight into it.
5200                                 break doctypepublicidentifierdoublequotedloop;
5201                             // continue stateloop;
5202                             case '>':
5203                                 /*
5204                                  * U+003E GREATER-THAN SIGN (>) Parse error.
5205                                  */
5206                                 errGtInPublicId();
5207                                 /*
5208                                  * Set the DOCTYPE token's force-quirks flag to
5209                                  * on.
5210                                  */
5211                                 forceQuirks = true;
5212                                 /*
5213                                  * Emit that DOCTYPE token.
5214                                  */
                                     // Keep whatever was accumulated so far as the public identifier.
5215                                 publicIdentifier = strBufToString();
5216                                 emitDoctypeToken(pos);
5217                                 /*
5218                                  * Switch to the data state.
5219                                  */
5220                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
5221                                 continue stateloop;
5222                             case '\r':
5223                                 appendStrBufCarriageReturn();
                                     // No transition here: the state is left unchanged while
                                     // stateloop is exited.
5224                                 break stateloop;
5225                             case '\n':
5226                                 appendStrBufLineFeed();
5227                                 continue;
5228                             case '\u0000':
                                     // NUL is replaced with U+FFFD and then appended by the
                                     // default branch below.
5229                                 c = '\uFFFD';
5230                                 // CPPONLY: MOZ_FALLTHROUGH;
5231                             default:
5232                                 /*
5233                                  * Anything else Append the current input
5234                                  * character to the current DOCTYPE token's
5235                                  * public identifier.
5236                                  */
5237                                 appendStrBuf(c);
5238                                 /*
5239                                  * Stay in the DOCTYPE public identifier
5240                                  * (double-quoted) state.
5241                                  */
5242                                 continue;
5243                         }
5244                     }
5245                     // CPPONLY: MOZ_FALLTHROUGH;
5246                 case AFTER_DOCTYPE_PUBLIC_IDENTIFIER:
                         /*
                          * After DOCTYPE public identifier state: examines the single
                          * character following the closing quote of the public identifier.
                          * Every branch exits the loop on its first pass; the whitespace
                          * branch breaks out of the loop and falls through into the
                          * BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS case that follows.
                          */
5247                     afterdoctypepublicidentifierloop: for (;;) {
5248                         if (++pos == endPos) {
5249                             break stateloop;
5250                         }
5251                         c = checkChar(buf, pos);
5252                         /*
5253                          * Consume the next input character:
5254                          */
5255                         switch (c) {
5256                             case '\r':
5257                                 silentCarriageReturn();
                                     // Record the new state first, then leave stateloop entirely.
5258                                 state = transition(state, Tokenizer.BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS, reconsume, pos);
5259                                 break stateloop;
5260                             case '\n':
5261                                 silentLineFeed();
5262                                 // CPPONLY: MOZ_FALLTHROUGH;
5263                             case ' ':
5264                             case '\t':
5265                             case '\u000C':
5266                                 /*
5267                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED
5268                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE
5269                                  * Switch to the between DOCTYPE public and
5270                                  * system identifiers state.
5271                                  */
5272                                 state = transition(state, Tokenizer.BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS, reconsume, pos);
                                     // The target state is the next outer case, so leaving the
                                     // loop falls straight into it.
5273                                 break afterdoctypepublicidentifierloop;
5274                             // continue stateloop;
5275                             case '>':
5276                                 /*
5277                                  * U+003E GREATER-THAN SIGN (>) Emit the current
5278                                  * DOCTYPE token.
5279                                  */
5280                                 emitDoctypeToken(pos);
5281                                 /*
5282                                  * Switch to the data state.
5283                                  */
5284                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
5285                                 continue stateloop;
5286                             case '"':
5287                                 /*
5288                                  * U+0022 QUOTATION MARK (") Parse error.
5289                                  */
5290                                 errNoSpaceBetweenPublicAndSystemIds();
5291                                 /*
5292                                  * Set the DOCTYPE token's system identifier to
5293                                  * the empty string (not missing),
5294                                  */
5295                                 clearStrBufBeforeUse();
5296                                 /*
5297                                  * then switch to the DOCTYPE system identifier
5298                                  * (double-quoted) state.
5299                                  */
5300                                 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);
5301                                 continue stateloop;
5302                             case '\'':
5303                                 /*
5304                                  * U+0027 APOSTROPHE (') Parse error.
5305                                  */
5306                                 errNoSpaceBetweenPublicAndSystemIds();
5307                                 /*
5308                                  * Set the DOCTYPE token's system identifier to
5309                                  * the empty string (not missing),
5310                                  */
5311                                 clearStrBufBeforeUse();
5312                                 /*
5313                                  * then switch to the DOCTYPE system identifier
5314                                  * (single-quoted) state.
5315                                  */
5316                                 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);
5317                                 continue stateloop;
5318                             default:
5319                                 bogusDoctype();
5320                                 /*
5321                                  * Set the DOCTYPE token's force-quirks flag to
5322                                  * on.
5323                                  */
5324                                 // done by bogusDoctype();
5325                                 /*
5326                                  * Switch to the bogus DOCTYPE state.
5327                                  */
5328                                 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
5329                                 continue stateloop;
5330                         }
5331                     }
5332                     // CPPONLY: MOZ_FALLTHROUGH;
5333                 case BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS:
                         /*
                          * Between DOCTYPE public and system identifiers state: skips
                          * whitespace (one loop iteration per whitespace character) until
                          * '>', a quote, or any other character decides the next state.
                          * Only the double-quote branch breaks out of the loop, falling
                          * through into the DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED case
                          * that follows.
                          */
5334                     betweendoctypepublicandsystemidentifiersloop: for (;;) {
5335                         if (++pos == endPos) {
5336                             break stateloop;
5337                         }
5338                         c = checkChar(buf, pos);
5339                         /*
5340                          * Consume the next input character:
5341                          */
5342                         switch (c) {
5343                             case '\r':
5344                                 silentCarriageReturn();
                                     // No transition here: the state is left unchanged while
                                     // stateloop is exited.
5345                                 break stateloop;
5346                             case '\n':
5347                                 silentLineFeed();
5348                                 // CPPONLY: MOZ_FALLTHROUGH;
5349                             case ' ':
5350                             case '\t':
5351                             case '\u000C':
5352                                 /*
5353                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED
5354                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
5355                                  * in the between DOCTYPE public and system
5356                                  * identifiers state.
5357                                  */
5358                                 continue;
5359                             case '>':
5360                                 /*
5361                                  * U+003E GREATER-THAN SIGN (>) Emit the current
5362                                  * DOCTYPE token.
5363                                  */
5364                                 emitDoctypeToken(pos);
5365                                 /*
5366                                  * Switch to the data state.
5367                                  */
5368                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
5369                                 continue stateloop;
5370                             case '"':
5371                                 /*
5372                                  * U+0022 QUOTATION MARK (") Set the DOCTYPE
5373                                  * token's system identifier to the empty string
5374                                  * (not missing),
5375                                  */
5376                                 clearStrBufBeforeUse();
5377                                 /*
5378                                  * then switch to the DOCTYPE system identifier
5379                                  * (double-quoted) state.
5380                                  */
5381                                 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);
                                     // The target state is the next outer case, so leaving the
                                     // loop falls straight into it.
5382                                 break betweendoctypepublicandsystemidentifiersloop;
5383                             // continue stateloop;
5384                             case '\'':
5385                                 /*
5386                                  * U+0027 APOSTROPHE (') Set the DOCTYPE token's
5387                                  * system identifier to the empty string (not
5388                                  * missing),
5389                                  */
5390                                 clearStrBufBeforeUse();
5391                                 /*
5392                                  * then switch to the DOCTYPE system identifier
5393                                  * (single-quoted) state.
5394                                  */
5395                                 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);
5396                                 continue stateloop;
5397                             default:
5398                                 bogusDoctype();
5399                                 /*
5400                                  * Set the DOCTYPE token's force-quirks flag to
5401                                  * on.
5402                                  */
5403                                 // done by bogusDoctype();
5404                                 /*
5405                                  * Switch to the bogus DOCTYPE state.
5406                                  */
5407                                 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
5408                                 continue stateloop;
5409                         }
5410                     }
5411                     // CPPONLY: MOZ_FALLTHROUGH;
5412                 case DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED:
                         /*
                          * DOCTYPE system identifier (double-quoted) state: appends each
                          * character to the string buffer until the closing quote or a
                          * premature '>' is seen; in both cases the buffer contents are
                          * stored in systemIdentifier via strBufToString(). No branch
                          * breaks out of this loop directly, so control never falls
                          * through from here into the case that follows; every exit goes
                          * through stateloop.
                          */
5413                     doctypesystemidentifierdoublequotedloop: for (;;) {
5414                         if (++pos == endPos) {
5415                             break stateloop;
5416                         }
5417                         c = checkChar(buf, pos);
5418                         /*
5419                          * Consume the next input character:
5420                          */
5421                         switch (c) {
5422                             case '"':
5423                                 /*
5424                                  * U+0022 QUOTATION MARK (") Switch to the after
5425                                  * DOCTYPE system identifier state.
5426                                  */
5427                                 systemIdentifier = strBufToString();
5428                                 state = transition(state, Tokenizer.AFTER_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos);
5429                                 continue stateloop;
5430                             case '>':
5431                                 /*
5432                                  * U+003E GREATER-THAN SIGN (>) Parse error.
5433                                  */
5434                                 errGtInSystemId();
5435                                 /*
5436                                  * Set the DOCTYPE token's force-quirks flag to
5437                                  * on.
5438                                  */
5439                                 forceQuirks = true;
5440                                 /*
5441                                  * Emit that DOCTYPE token.
5442                                  */
                                     // Keep whatever was accumulated so far as the system identifier.
5443                                 systemIdentifier = strBufToString();
5444                                 emitDoctypeToken(pos);
5445                                 /*
5446                                  * Switch to the data state.
5447                                  */
5448                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
5449                                 continue stateloop;
5450                             case '\r':
5451                                 appendStrBufCarriageReturn();
                                     // No transition here: the state is left unchanged while
                                     // stateloop is exited.
5452                                 break stateloop;
5453                             case '\n':
5454                                 appendStrBufLineFeed();
5455                                 continue;
5456                             case '\u0000':
                                     // NUL is replaced with U+FFFD and then appended by the
                                     // default branch below.
5457                                 c = '\uFFFD';
5458                                 // CPPONLY: MOZ_FALLTHROUGH;
5459                             default:
5460                                 /*
5461                                  * Anything else Append the current input
5462                                  * character to the current DOCTYPE token's
5463                                  * system identifier.
5464                                  */
5465                                 appendStrBuf(c);
5466                                 /*
5467                                  * Stay in the DOCTYPE system identifier
5468                                  * (double-quoted) state.
5469                                  */
5470                                 continue;
5471                         }
5472                     }
5473                 case AFTER_DOCTYPE_SYSTEM_IDENTIFIER:
                         /*
                          * After DOCTYPE system identifier state: skips trailing whitespace
                          * (one loop iteration per whitespace character) and emits the
                          * DOCTYPE token on '>'. Any other character breaks out of the
                          * loop and falls through into the BOGUS_DOCTYPE case that follows.
                          */
5474                     afterdoctypesystemidentifierloop: for (;;) {
5475                         if (++pos == endPos) {
5476                             break stateloop;
5477                         }
5478                         c = checkChar(buf, pos);
5479                         /*
5480                          * Consume the next input character:
5481                          */
5482                         switch (c) {
5483                             case '\r':
5484                                 silentCarriageReturn();
                                     // No transition here: the state is left unchanged while
                                     // stateloop is exited.
5485                                 break stateloop;
5486                             case '\n':
5487                                 silentLineFeed();
5488                                 // CPPONLY: MOZ_FALLTHROUGH;
5489                             case ' ':
5490                             case '\t':
5491                             case '\u000C':
5492                                 /*
5493                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED
5494                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
5495                                  * in the after DOCTYPE system identifier state.
5496                                  */
5497                                 continue;
5498                             case '>':
5499                                 /*
5500                                  * U+003E GREATER-THAN SIGN (>) Emit the current
5501                                  * DOCTYPE token.
5502                                  */
5503                                 emitDoctypeToken(pos);
5504                                 /*
5505                                  * Switch to the data state.
5506                                  */
5507                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
5508                                 continue stateloop;
5509                             default:
5510                                 /*
5511                                  * Switch to the bogus DOCTYPE state. (This does
5512                                  * not set the DOCTYPE token's force-quirks flag
5513                                  * to on.)
5514                                  */
5515                                 bogusDoctypeWithoutQuirks();
5516                                 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
                                     // reconsume is not set on this path; leaving the loop falls
                                     // straight into the BOGUS_DOCTYPE case below.
5517                                 break afterdoctypesystemidentifierloop;
5518                             // continue stateloop;
5519                         }
5520                     }
5521                     // CPPONLY: MOZ_FALLTHROUGH;
5522                 case BOGUS_DOCTYPE:
                         /*
                          * Bogus DOCTYPE state: discards characters until '>' is seen,
                          * then emits the DOCTYPE token and switches to the data state.
                          * The loop is unlabeled because it is only ever left through
                          * stateloop (break or continue), never by falling through to the
                          * next case.
                          */
5523                     for (;;) {
                             // When reconsume is set, the character already held in c is
                             // processed again instead of reading a new one.
5524                         if (reconsume) {
5525                             reconsume = false;
5526                         } else {
5527                             if (++pos == endPos) {
5528                                 break stateloop;
5529                             }
5530                             c = checkChar(buf, pos);
5531                         }
5532                         /*
5533                          * Consume the next input character:
5534                          */
5535                         switch (c) {
5536                             case '>':
5537                                 /*
5538                                  * U+003E GREATER-THAN SIGN (>) Emit that
5539                                  * DOCTYPE token.
5540                                  */
5541                                 emitDoctypeToken(pos);
5542                                 /*
5543                                  * Switch to the data state.
5544                                  */
5545                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
5546                                 continue stateloop;
5547                             case '\r':
5548                                 silentCarriageReturn();
                                     // No transition here: the state is left unchanged while
                                     // stateloop is exited.
5549                                 break stateloop;
5550                             case '\n':
5551                                 silentLineFeed();
5552                                 // CPPONLY: MOZ_FALLTHROUGH;
5553                             default:
5554                                 /*
5555                                  * Anything else Stay in the bogus DOCTYPE
5556                                  * state.
5557                                  */
5558                                 continue;
5559                         }
5560                     }
5561                 case DOCTYPE_YSTEM:
5562                     doctypeystemloop: for (;;) {
5563                         if (++pos == endPos) {
5564                             break stateloop;
5565                         }
5566                         c = checkChar(buf, pos);
5567                         /*
5568                          * Otherwise, if the six characters starting from the
5569                          * current input character are an ASCII case-insensitive
5570                          * match for the word "SYSTEM", then consume those
                         * characters and switch to the after DOCTYPE system
                         * keyword state.
5573                          */
5574                         if (index < 5) { // YSTEM.length
5575                             char folded = c;
5576                             if (c >= 'A' && c <= 'Z') {
5577                                 folded += 0x20;
5578                             }
5579                             if (folded != Tokenizer.YSTEM[index]) {
5580                                 bogusDoctype();
5581                                 reconsume = true;
5582                                 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
5583                                 continue stateloop;
5584                             }
5585                             index++;
5586                             continue stateloop;
5587                         } else {
5588                             reconsume = true;
5589                             state = transition(state, Tokenizer.AFTER_DOCTYPE_SYSTEM_KEYWORD, reconsume, pos);
5590                             break doctypeystemloop;
5591                             // continue stateloop;
5592                         }
5593                     }
5594                     // CPPONLY: MOZ_FALLTHROUGH;
5595                 case AFTER_DOCTYPE_SYSTEM_KEYWORD:
5596                     afterdoctypesystemkeywordloop: for (;;) {
5597                         if (reconsume) {
5598                             reconsume = false;
5599                         } else {
5600                             if (++pos == endPos) {
5601                                 break stateloop;
5602                             }
5603                             c = checkChar(buf, pos);
5604                         }
5605                         /*
5606                          * Consume the next input character:
5607                          */
5608                         switch (c) {
5609                             case '\r':
5610                                 silentCarriageReturn();
5611                                 state = transition(state, Tokenizer.BEFORE_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos);
5612                                 break stateloop;
5613                             case '\n':
5614                                 silentLineFeed();
5615                                 // CPPONLY: MOZ_FALLTHROUGH;
5616                             case ' ':
5617                             case '\t':
5618                             case '\u000C':
5619                                 /*
5620                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED
5621                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE
                                 * Switch to the before DOCTYPE system
                                 * identifier state.
5624                                  */
5625                                 state = transition(state, Tokenizer.BEFORE_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos);
5626                                 break afterdoctypesystemkeywordloop;
5627                             // FALL THROUGH continue stateloop
5628                             case '"':
5629                                 /*
5630                                  * U+0022 QUOTATION MARK (") Parse Error.
5631                                  */
5632                                 errNoSpaceBetweenDoctypeSystemKeywordAndQuote();
5633                                 /*
5634                                  * Set the DOCTYPE token's system identifier to
5635                                  * the empty string (not missing),
5636                                  */
5637                                 clearStrBufBeforeUse();
5638                                 /*
                                 * then switch to the DOCTYPE system identifier
                                 * (double-quoted) state.
5641                                  */
5642                                 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);
5643                                 continue stateloop;
5644                             case '\'':
5645                                 /*
5646                                  * U+0027 APOSTROPHE (') Parse Error.
5647                                  */
5648                                 errNoSpaceBetweenDoctypeSystemKeywordAndQuote();
5649                                 /*
                                 * Set the DOCTYPE token's system identifier to
                                 * the empty string (not missing),
5652                                  */
5653                                 clearStrBufBeforeUse();
5654                                 /*
                                 * then switch to the DOCTYPE system identifier
                                 * (single-quoted) state.
5657                                  */
5658                                 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);
5659                                 continue stateloop;
5660                             case '>':
5661                                 /* U+003E GREATER-THAN SIGN (>) Parse error. */
5662                                 errExpectedPublicId();
5663                                 /*
5664                                  * Set the DOCTYPE token's force-quirks flag to
5665                                  * on.
5666                                  */
5667                                 forceQuirks = true;
5668                                 /*
5669                                  * Emit that DOCTYPE token.
5670                                  */
5671                                 emitDoctypeToken(pos);
5672                                 /*
5673                                  * Switch to the data state.
5674                                  */
5675                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
5676                                 continue stateloop;
5677                             default:
5678                                 bogusDoctype();
5679                                 /*
5680                                  * Set the DOCTYPE token's force-quirks flag to
5681                                  * on.
5682                                  */
5683                                 // done by bogusDoctype();
5684                                 /*
5685                                  * Switch to the bogus DOCTYPE state.
5686                                  */
5687                                 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
5688                                 continue stateloop;
5689                         }
5690                     }
5691                     // CPPONLY: MOZ_FALLTHROUGH;
5692                 case BEFORE_DOCTYPE_SYSTEM_IDENTIFIER:
5693                     beforedoctypesystemidentifierloop: for (;;) {
5694                         if (++pos == endPos) {
5695                             break stateloop;
5696                         }
5697                         c = checkChar(buf, pos);
5698                         /*
5699                          * Consume the next input character:
5700                          */
5701                         switch (c) {
5702                             case '\r':
5703                                 silentCarriageReturn();
5704                                 break stateloop;
5705                             case '\n':
5706                                 silentLineFeed();
5707                                 // CPPONLY: MOZ_FALLTHROUGH;
5708                             case ' ':
5709                             case '\t':
5710                             case '\u000C':
5711                                 /*
5712                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED
5713                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
5714                                  * in the before DOCTYPE system identifier
5715                                  * state.
5716                                  */
5717                                 continue;
5718                             case '"':
5719                                 /*
5720                                  * U+0022 QUOTATION MARK (") Set the DOCTYPE
5721                                  * token's system identifier to the empty string
5722                                  * (not missing),
5723                                  */
5724                                 clearStrBufBeforeUse();
5725                                 /*
5726                                  * then switch to the DOCTYPE system identifier
5727                                  * (double-quoted) state.
5728                                  */
5729                                 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);
5730                                 continue stateloop;
5731                             case '\'':
5732                                 /*
5733                                  * U+0027 APOSTROPHE (') Set the DOCTYPE token's
5734                                  * system identifier to the empty string (not
5735                                  * missing),
5736                                  */
5737                                 clearStrBufBeforeUse();
5738                                 /*
5739                                  * then switch to the DOCTYPE system identifier
5740                                  * (single-quoted) state.
5741                                  */
5742                                 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);
5743                                 break beforedoctypesystemidentifierloop;
5744                             // continue stateloop;
5745                             case '>':
5746                                 /* U+003E GREATER-THAN SIGN (>) Parse error. */
5747                                 errExpectedSystemId();
5748                                 /*
5749                                  * Set the DOCTYPE token's force-quirks flag to
5750                                  * on.
5751                                  */
5752                                 forceQuirks = true;
5753                                 /*
5754                                  * Emit that DOCTYPE token.
5755                                  */
5756                                 emitDoctypeToken(pos);
5757                                 /*
5758                                  * Switch to the data state.
5759                                  */
5760                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
5761                                 continue stateloop;
5762                             default:
5763                                 bogusDoctype();
5764                                 /*
5765                                  * Set the DOCTYPE token's force-quirks flag to
5766                                  * on.
5767                                  */
5768                                 // done by bogusDoctype();
5769                                 /*
5770                                  * Switch to the bogus DOCTYPE state.
5771                                  */
5772                                 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
5773                                 continue stateloop;
5774                         }
5775                     }
5776                     // CPPONLY: MOZ_FALLTHROUGH;
5777                 case DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED:
5778                     for (;;) {
5779                         if (++pos == endPos) {
5780                             break stateloop;
5781                         }
5782                         c = checkChar(buf, pos);
5783                         /*
5784                          * Consume the next input character:
5785                          */
5786                         switch (c) {
5787                             case '\'':
5788                                 /*
5789                                  * U+0027 APOSTROPHE (') Switch to the after
5790                                  * DOCTYPE system identifier state.
5791                                  */
5792                                 systemIdentifier = strBufToString();
5793                                 state = transition(state, Tokenizer.AFTER_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos);
5794                                 continue stateloop;
5795                             case '>':
5796                                 errGtInSystemId();
5797                                 /*
5798                                  * Set the DOCTYPE token's force-quirks flag to
5799                                  * on.
5800                                  */
5801                                 forceQuirks = true;
5802                                 /*
5803                                  * Emit that DOCTYPE token.
5804                                  */
5805                                 systemIdentifier = strBufToString();
5806                                 emitDoctypeToken(pos);
5807                                 /*
5808                                  * Switch to the data state.
5809                                  */
5810                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
5811                                 continue stateloop;
5812                             case '\r':
5813                                 appendStrBufCarriageReturn();
5814                                 break stateloop;
5815                             case '\n':
5816                                 appendStrBufLineFeed();
5817                                 continue;
5818                             case '\u0000':
5819                                 c = '\uFFFD';
5820                                 // CPPONLY: MOZ_FALLTHROUGH;
5821                             default:
5822                                 /*
5823                                  * Anything else Append the current input
5824                                  * character to the current DOCTYPE token's
5825                                  * system identifier.
5826                                  */
5827                                 appendStrBuf(c);
5828                                 /*
                                 * Stay in the DOCTYPE system identifier
                                 * (single-quoted) state.
5831                                  */
5832                                 continue;
5833                         }
5834                     }
5835                 case DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED:
5836                     for (;;) {
5837                         if (++pos == endPos) {
5838                             break stateloop;
5839                         }
5840                         c = checkChar(buf, pos);
5841                         /*
5842                          * Consume the next input character:
5843                          */
5844                         switch (c) {
5845                             case '\'':
5846                                 /*
5847                                  * U+0027 APOSTROPHE (') Switch to the after
5848                                  * DOCTYPE public identifier state.
5849                                  */
5850                                 publicIdentifier = strBufToString();
5851                                 state = transition(state, Tokenizer.AFTER_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos);
5852                                 continue stateloop;
5853                             case '>':
5854                                 errGtInPublicId();
5855                                 /*
5856                                  * Set the DOCTYPE token's force-quirks flag to
5857                                  * on.
5858                                  */
5859                                 forceQuirks = true;
5860                                 /*
5861                                  * Emit that DOCTYPE token.
5862                                  */
5863                                 publicIdentifier = strBufToString();
5864                                 emitDoctypeToken(pos);
5865                                 /*
5866                                  * Switch to the data state.
5867                                  */
5868                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
5869                                 continue stateloop;
5870                             case '\r':
5871                                 appendStrBufCarriageReturn();
5872                                 break stateloop;
5873                             case '\n':
5874                                 appendStrBufLineFeed();
5875                                 continue;
5876                             case '\u0000':
5877                                 c = '\uFFFD';
5878                                 // CPPONLY: MOZ_FALLTHROUGH;
5879                             default:
5880                                 /*
5881                                  * Anything else Append the current input
5882                                  * character to the current DOCTYPE token's
5883                                  * public identifier.
5884                                  */
5885                                 appendStrBuf(c);
5886                                 /*
5887                                  * Stay in the DOCTYPE public identifier
5888                                  * (single-quoted) state.
5889                                  */
5890                                 continue;
5891                         }
5892                     }
5893                 case PROCESSING_INSTRUCTION:
5894                     processinginstructionloop: for (;;) {
5895                         if (++pos == endPos) {
5896                             break stateloop;
5897                         }
5898                         c = checkChar(buf, pos);
5899                         switch (c) {
5900                             case '?':
5901                                 state = transition(
5902                                         state,
5903                                         Tokenizer.PROCESSING_INSTRUCTION_QUESTION_MARK,
5904                                         reconsume, pos);
5905                                 break processinginstructionloop;
5906                             // continue stateloop;
5907                             default:
5908                                 continue;
5909                         }
5910                     }
5911                     // CPPONLY: MOZ_FALLTHROUGH;
5912                 case PROCESSING_INSTRUCTION_QUESTION_MARK:
5913                     if (++pos == endPos) {
5914                         break stateloop;
5915                     }
5916                     c = checkChar(buf, pos);
5917                     switch (c) {
5918                         case '>':
5919                             state = transition(state, Tokenizer.DATA,
5920                                     reconsume, pos);
5921                             continue stateloop;
5922                         default:
5923                             state = transition(state,
5924                                     Tokenizer.PROCESSING_INSTRUCTION,
5925                                     reconsume, pos);
5926                             continue stateloop;
5927                     }
5928                     // END HOTSPOT WORKAROUND
5929             }
5930         }
5931         flushChars(buf, pos);
5932         /*
5933          * if (prevCR && pos != endPos) { // why is this needed? pos--; col--; }
5934          */
5935         // Save locals
5936         stateSave = state;
5937         returnStateSave = returnState;
5938         return pos;
5939     }
5940 
5941     // HOTSPOT WORKAROUND INSERTION POINT
5942 
5943     // [NOCPP[
5944 
transition(int from, int to, boolean reconsume, int pos)5945     protected int transition(int from, int to, boolean reconsume, int pos) throws SAXException {
5946         return to;
5947     }
5948 
5949     // ]NOCPP]
5950 
initDoctypeFields()5951     private void initDoctypeFields() {
5952         // Discard the characters "DOCTYPE" accumulated as a potential bogus
5953         // comment into strBuf.
5954         clearStrBufAfterUse();
5955         doctypeName = "";
5956         if (systemIdentifier != null) {
5957             Portability.releaseString(systemIdentifier);
5958             systemIdentifier = null;
5959         }
5960         if (publicIdentifier != null) {
5961             Portability.releaseString(publicIdentifier);
5962             publicIdentifier = null;
5963         }
5964         forceQuirks = false;
5965     }
5966 
adjustDoubleHyphenAndAppendToStrBufCarriageReturn()5967     @Inline private void adjustDoubleHyphenAndAppendToStrBufCarriageReturn()
5968             throws SAXException {
5969         silentCarriageReturn();
5970         adjustDoubleHyphenAndAppendToStrBufAndErr('\n');
5971     }
5972 
adjustDoubleHyphenAndAppendToStrBufLineFeed()5973     @Inline private void adjustDoubleHyphenAndAppendToStrBufLineFeed()
5974             throws SAXException {
5975         silentLineFeed();
5976         adjustDoubleHyphenAndAppendToStrBufAndErr('\n');
5977     }
5978 
appendStrBufLineFeed()5979     @Inline private void appendStrBufLineFeed() {
5980         silentLineFeed();
5981         appendStrBuf('\n');
5982     }
5983 
appendStrBufCarriageReturn()5984     @Inline private void appendStrBufCarriageReturn() {
5985         silentCarriageReturn();
5986         appendStrBuf('\n');
5987     }
5988 
silentCarriageReturn()5989     @Inline protected void silentCarriageReturn() {
5990         ++line;
5991         lastCR = true;
5992     }
5993 
silentLineFeed()5994     @Inline protected void silentLineFeed() {
5995         ++line;
5996     }
5997 
emitCarriageReturn(@oLength char[] buf, int pos)5998     private void emitCarriageReturn(@NoLength char[] buf, int pos)
5999             throws SAXException {
6000         silentCarriageReturn();
6001         flushChars(buf, pos);
6002         tokenHandler.characters(Tokenizer.LF, 0, 1);
6003         cstart = Integer.MAX_VALUE;
6004     }
6005 
emitReplacementCharacter(@oLength char[] buf, int pos)6006     private void emitReplacementCharacter(@NoLength char[] buf, int pos)
6007             throws SAXException {
6008         flushChars(buf, pos);
6009         tokenHandler.zeroOriginatingReplacementCharacter();
6010         cstart = pos + 1;
6011     }
6012 
emitPlaintextReplacementCharacter(@oLength char[] buf, int pos)6013     private void emitPlaintextReplacementCharacter(@NoLength char[] buf, int pos)
6014             throws SAXException {
6015         flushChars(buf, pos);
6016         tokenHandler.characters(REPLACEMENT_CHARACTER, 0, 1);
6017         cstart = pos + 1;
6018     }
6019 
    /**
     * Stores <code>add</code> in the <code>additional</code> field and, in the
     * Java-only build, snapshots the current location into
     * <code>ampersandLocation</code>.
     *
     * @param add the value to store in <code>additional</code>
     */
    private void setAdditionalAndRememberAmpersandLocation(char add) {
        additional = add;
        // The NOCPP markers below fence off Java-only code; keep them intact.
        // [NOCPP[
        ampersandLocation = new LocatorImpl(this);
        // ]NOCPP]
    }
6026 
bogusDoctype()6027     private void bogusDoctype() throws SAXException {
6028         errBogusDoctype();
6029         forceQuirks = true;
6030     }
6031 
bogusDoctypeWithoutQuirks()6032     private void bogusDoctypeWithoutQuirks() throws SAXException {
6033         errBogusDoctype();
6034         forceQuirks = false;
6035     }
6036 
    /**
     * Turns the numeric character reference value accumulated in the
     * <code>value</code> field into one or two UTF-16 code units and passes
     * them, together with <code>returnState</code>, to
     * <code>emitOrAppendOne</code> / <code>emitOrAppendTwo</code>.
     *
     * The branches, in order: 0x80..0x9F is remapped through
     * <code>NamedCharacters.WINDOWS_1252</code>; a form feed is handled per
     * <code>contentSpacePolicy</code> (Java-only); zero and surrogate values
     * become <code>Tokenizer.REPLACEMENT_CHARACTER</code>; any other value up
     * to 0xFFFF is emitted as a single <code>char</code> (with Java-only
     * diagnostics); values up to 0x10FFFF are emitted as a surrogate pair;
     * anything larger is reported and replaced.
     *
     * @param returnState forwarded unchanged to the emit-or-append helpers
     *        (presumably it selects between emitting and appending; confirm
     *        against those helpers)
     * @throws SAXException propagated from the error reporters, from
     *         <code>fatal</code>, or from the emit-or-append helpers
     */
    private void handleNcrValue(int returnState) throws SAXException {
        /*
         * If one or more characters match the range, then take them all and
         * interpret the string of characters as a number (either hexadecimal or
         * decimal as appropriate).
         */
        if (value <= 0xFFFF) {
            if (value >= 0x80 && value <= 0x9f) {
                /*
                 * If that number is one of the numbers in the first column of
                 * the following table, then this is a parse error.
                 */
                errNcrInC1Range();
                /*
                 * Find the row with that number in the first column, and return
                 * a character token for the Unicode character given in the
                 * second column of that row.
                 */
                @NoLength char[] val = NamedCharacters.WINDOWS_1252[value - 0x80];
                emitOrAppendOne(val, returnState);
                // [NOCPP[
            } else if (value == 0xC
                    && contentSpacePolicy != XmlViolationPolicy.ALLOW) {
                // Form feed under a non-ALLOW policy: substitute a space for
                // ALTER_INFOSET, report a fatal error for FATAL. Any other
                // policy value emits nothing here.
                if (contentSpacePolicy == XmlViolationPolicy.ALTER_INFOSET) {
                    emitOrAppendOne(Tokenizer.SPACE, returnState);
                } else if (contentSpacePolicy == XmlViolationPolicy.FATAL) {
                    fatal("A character reference expanded to a form feed which is not legal XML 1.0 white space.");
                }
                // ]NOCPP]
            } else if (value == 0x0) {
                errNcrZero();
                emitOrAppendOne(Tokenizer.REPLACEMENT_CHARACTER, returnState);
            } else if ((value & 0xF800) == 0xD800) {
                // The mask matches exactly the surrogate range 0xD800..0xDFFF.
                errNcrSurrogate();
                emitOrAppendOne(Tokenizer.REPLACEMENT_CHARACTER, returnState);
            } else {
                /*
                 * Otherwise, return a character token for the Unicode character
                 * whose code point is that number.
                 */
                char ch = (char) value;
                // [NOCPP[
                if (value == 0x0D) {
                    errNcrCr();
                } else if ((value <= 0x0008) || (value == 0x000B)
                        || (value >= 0x000E && value <= 0x001F)) {
                    // The reporter may substitute a different character.
                    ch = errNcrControlChar(ch);
                } else if (value >= 0xFDD0 && value <= 0xFDEF) {
                    errNcrUnassigned();
                } else if ((value & 0xFFFE) == 0xFFFE) {
                    // Matches 0xFFFE and 0xFFFF within this BMP-only branch.
                    ch = errNcrNonCharacter(ch);
                } else if (value >= 0x007F && value <= 0x009F) {
                    // 0x80..0x9F was already consumed by the first branch
                    // above, so only 0x7F can reach this point.
                    errNcrControlChar();
                } else {
                    maybeWarnPrivateUse(ch);
                }
                // ]NOCPP]
                bmpChar[0] = ch;
                emitOrAppendOne(bmpChar, returnState);
            }
        } else if (value <= 0x10FFFF) {
            // [NOCPP[
            maybeWarnPrivateUseAstral();
            if ((value & 0xFFFE) == 0xFFFE) {
                errAstralNonCharacter(value);
            }
            // ]NOCPP]
            // Split the supplementary code point into two UTF-16 code units:
            // the high bits go into the first unit (LEAD_OFFSET presumably
            // folds in the 0x10000 bias; confirm against its definition) and
            // the low ten bits go into the second unit, based at 0xDC00.
            astralChar[0] = (char) (Tokenizer.LEAD_OFFSET + (value >> 10));
            astralChar[1] = (char) (0xDC00 + (value & 0x3FF));
            emitOrAppendTwo(astralChar, returnState);
        } else {
            // Beyond the Unicode range: report and substitute.
            errNcrOutOfRange();
            emitOrAppendOne(Tokenizer.REPLACEMENT_CHARACTER, returnState);
        }
    }
6112 
eof()6113     public void eof() throws SAXException {
6114         int state = stateSave;
6115         int returnState = returnStateSave;
6116 
6117         eofloop: for (;;) {
6118             switch (state) {
6119                 case SCRIPT_DATA_LESS_THAN_SIGN:
6120                 case SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN:
6121                     /*
6122                      * Otherwise, emit a U+003C LESS-THAN SIGN character token
6123                      */
6124                     tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
6125                     /*
6126                      * and reconsume the current input character in the data
6127                      * state.
6128                      */
6129                     break eofloop;
6130                 case TAG_OPEN:
6131                     /*
6132                      * The behavior of this state depends on the content model
6133                      * flag.
6134                      */
6135                     /*
6136                      * Anything else Parse error.
6137                      */
6138                     errEofAfterLt();
6139                     /*
6140                      * Emit a U+003C LESS-THAN SIGN character token
6141                      */
6142                     tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
6143                     /*
6144                      * and reconsume the current input character in the data
6145                      * state.
6146                      */
6147                     break eofloop;
6148                 case RAWTEXT_RCDATA_LESS_THAN_SIGN:
6149                     /*
6150                      * Emit a U+003C LESS-THAN SIGN character token
6151                      */
6152                     tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
6153                     /*
6154                      * and reconsume the current input character in the RCDATA
6155                      * state.
6156                      */
6157                     break eofloop;
6158                 case NON_DATA_END_TAG_NAME:
6159                     /*
6160                      * Emit a U+003C LESS-THAN SIGN character token, a U+002F
6161                      * SOLIDUS character token,
6162                      */
6163                     tokenHandler.characters(Tokenizer.LT_SOLIDUS, 0, 2);
6164                     /*
6165                      * a character token for each of the characters in the
6166                      * temporary buffer (in the order they were added to the
6167                      * buffer),
6168                      */
6169                     emitStrBuf();
6170                     /*
6171                      * and reconsume the current input character in the RCDATA
6172                      * state.
6173                      */
6174                     break eofloop;
6175                 case CLOSE_TAG_OPEN:
6176                     /* EOF Parse error. */
6177                     errEofAfterLt();
6178                     /*
6179                      * Emit a U+003C LESS-THAN SIGN character token and a U+002F
6180                      * SOLIDUS character token.
6181                      */
6182                     tokenHandler.characters(Tokenizer.LT_SOLIDUS, 0, 2);
6183                     /*
6184                      * Reconsume the EOF character in the data state.
6185                      */
6186                     break eofloop;
6187                 case TAG_NAME:
6188                     /*
6189                      * EOF Parse error.
6190                      */
6191                     errEofInTagName();
6192                     /*
6193                      * Reconsume the EOF character in the data state.
6194                      */
6195                     break eofloop;
6196                 case BEFORE_ATTRIBUTE_NAME:
6197                 case AFTER_ATTRIBUTE_VALUE_QUOTED:
6198                 case SELF_CLOSING_START_TAG:
6199                     /* EOF Parse error. */
6200                     errEofWithoutGt();
6201                     /*
6202                      * Reconsume the EOF character in the data state.
6203                      */
6204                     break eofloop;
6205                 case ATTRIBUTE_NAME:
6206                     /*
6207                      * EOF Parse error.
6208                      */
6209                     errEofInAttributeName();
6210                     /*
6211                      * Reconsume the EOF character in the data state.
6212                      */
6213                     break eofloop;
6214                 case AFTER_ATTRIBUTE_NAME:
6215                 case BEFORE_ATTRIBUTE_VALUE:
6216                     /* EOF Parse error. */
6217                     errEofWithoutGt();
6218                     /*
6219                      * Reconsume the EOF character in the data state.
6220                      */
6221                     break eofloop;
6222                 case ATTRIBUTE_VALUE_DOUBLE_QUOTED:
6223                 case ATTRIBUTE_VALUE_SINGLE_QUOTED:
6224                 case ATTRIBUTE_VALUE_UNQUOTED:
6225                     /* EOF Parse error. */
6226                     errEofInAttributeValue();
6227                     /*
6228                      * Reconsume the EOF character in the data state.
6229                      */
6230                     break eofloop;
6231                 case BOGUS_COMMENT:
6232                     emitComment(0, 0);
6233                     break eofloop;
6234                 case BOGUS_COMMENT_HYPHEN:
6235                     // [NOCPP[
6236                     maybeAppendSpaceToBogusComment();
6237                     // ]NOCPP]
6238                     emitComment(0, 0);
6239                     break eofloop;
6240                 case MARKUP_DECLARATION_OPEN:
6241                     errBogusComment();
6242                     emitComment(0, 0);
6243                     break eofloop;
6244                 case MARKUP_DECLARATION_HYPHEN:
6245                     errBogusComment();
6246                     emitComment(0, 0);
6247                     break eofloop;
6248                 case MARKUP_DECLARATION_OCTYPE:
6249                     if (index < 6) {
6250                         errBogusComment();
6251                         emitComment(0, 0);
6252                     } else {
6253                         /* EOF Parse error. */
6254                         errEofInDoctype();
6255                         /*
6256                          * Create a new DOCTYPE token. Set its force-quirks flag
6257                          * to on.
6258                          */
6259                         doctypeName = "";
6260                         if (systemIdentifier != null) {
6261                             Portability.releaseString(systemIdentifier);
6262                             systemIdentifier = null;
6263                         }
6264                         if (publicIdentifier != null) {
6265                             Portability.releaseString(publicIdentifier);
6266                             publicIdentifier = null;
6267                         }
6268                         forceQuirks = true;
6269                         /*
6270                          * Emit the token.
6271                          */
6272                         emitDoctypeToken(0);
6273                         /*
6274                          * Reconsume the EOF character in the data state.
6275                          */
6276                         break eofloop;
6277                     }
6278                     break eofloop;
6279                 case COMMENT_START:
6280                 case COMMENT:
6281                     /*
6282                      * EOF Parse error.
6283                      */
6284                     errEofInComment();
6285                     /* Emit the comment token. */
6286                     emitComment(0, 0);
6287                     /*
6288                      * Reconsume the EOF character in the data state.
6289                      */
6290                     break eofloop;
6291                 case COMMENT_END:
6292                     errEofInComment();
6293                     /* Emit the comment token. */
6294                     emitComment(2, 0);
6295                     /*
6296                      * Reconsume the EOF character in the data state.
6297                      */
6298                     break eofloop;
6299                 case COMMENT_END_DASH:
6300                 case COMMENT_START_DASH:
6301                     errEofInComment();
6302                     /* Emit the comment token. */
6303                     emitComment(1, 0);
6304                     /*
6305                      * Reconsume the EOF character in the data state.
6306                      */
6307                     break eofloop;
6308                 case COMMENT_END_BANG:
6309                     errEofInComment();
6310                     /* Emit the comment token. */
6311                     emitComment(3, 0);
6312                     /*
6313                      * Reconsume the EOF character in the data state.
6314                      */
6315                     break eofloop;
6316                 case DOCTYPE:
6317                 case BEFORE_DOCTYPE_NAME:
6318                     errEofInDoctype();
6319                     /*
6320                      * Create a new DOCTYPE token. Set its force-quirks flag to
6321                      * on.
6322                      */
6323                     forceQuirks = true;
6324                     /*
6325                      * Emit the token.
6326                      */
6327                     emitDoctypeToken(0);
6328                     /*
6329                      * Reconsume the EOF character in the data state.
6330                      */
6331                     break eofloop;
6332                 case DOCTYPE_NAME:
6333                     errEofInDoctype();
6334                     strBufToDoctypeName();
6335                     /*
6336                      * Set the DOCTYPE token's force-quirks flag to on.
6337                      */
6338                     forceQuirks = true;
6339                     /*
6340                      * Emit that DOCTYPE token.
6341                      */
6342                     emitDoctypeToken(0);
6343                     /*
6344                      * Reconsume the EOF character in the data state.
6345                      */
6346                     break eofloop;
6347                 case DOCTYPE_UBLIC:
6348                 case DOCTYPE_YSTEM:
6349                 case AFTER_DOCTYPE_NAME:
6350                 case AFTER_DOCTYPE_PUBLIC_KEYWORD:
6351                 case AFTER_DOCTYPE_SYSTEM_KEYWORD:
6352                 case BEFORE_DOCTYPE_PUBLIC_IDENTIFIER:
6353                     errEofInDoctype();
6354                     /*
6355                      * Set the DOCTYPE token's force-quirks flag to on.
6356                      */
6357                     forceQuirks = true;
6358                     /*
6359                      * Emit that DOCTYPE token.
6360                      */
6361                     emitDoctypeToken(0);
6362                     /*
6363                      * Reconsume the EOF character in the data state.
6364                      */
6365                     break eofloop;
6366                 case DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED:
6367                 case DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED:
6368                     /* EOF Parse error. */
6369                     errEofInPublicId();
6370                     /*
6371                      * Set the DOCTYPE token's force-quirks flag to on.
6372                      */
6373                     forceQuirks = true;
6374                     /*
6375                      * Emit that DOCTYPE token.
6376                      */
6377                     publicIdentifier = strBufToString();
6378                     emitDoctypeToken(0);
6379                     /*
6380                      * Reconsume the EOF character in the data state.
6381                      */
6382                     break eofloop;
6383                 case AFTER_DOCTYPE_PUBLIC_IDENTIFIER:
6384                 case BEFORE_DOCTYPE_SYSTEM_IDENTIFIER:
6385                 case BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS:
6386                     errEofInDoctype();
6387                     /*
6388                      * Set the DOCTYPE token's force-quirks flag to on.
6389                      */
6390                     forceQuirks = true;
6391                     /*
6392                      * Emit that DOCTYPE token.
6393                      */
6394                     emitDoctypeToken(0);
6395                     /*
6396                      * Reconsume the EOF character in the data state.
6397                      */
6398                     break eofloop;
6399                 case DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED:
6400                 case DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED:
6401                     /* EOF Parse error. */
6402                     errEofInSystemId();
6403                     /*
6404                      * Set the DOCTYPE token's force-quirks flag to on.
6405                      */
6406                     forceQuirks = true;
6407                     /*
6408                      * Emit that DOCTYPE token.
6409                      */
6410                     systemIdentifier = strBufToString();
6411                     emitDoctypeToken(0);
6412                     /*
6413                      * Reconsume the EOF character in the data state.
6414                      */
6415                     break eofloop;
6416                 case AFTER_DOCTYPE_SYSTEM_IDENTIFIER:
6417                     errEofInDoctype();
6418                     /*
6419                      * Set the DOCTYPE token's force-quirks flag to on.
6420                      */
6421                     forceQuirks = true;
6422                     /*
6423                      * Emit that DOCTYPE token.
6424                      */
6425                     emitDoctypeToken(0);
6426                     /*
6427                      * Reconsume the EOF character in the data state.
6428                      */
6429                     break eofloop;
6430                 case BOGUS_DOCTYPE:
6431                     /*
6432                      * Emit that DOCTYPE token.
6433                      */
6434                     emitDoctypeToken(0);
6435                     /*
6436                      * Reconsume the EOF character in the data state.
6437                      */
6438                     break eofloop;
6439                 case CONSUME_CHARACTER_REFERENCE:
6440                     /*
                     * Unlike the definition in the spec, this state does not
6442                      * return a value and never requires the caller to
6443                      * backtrack. This state takes care of emitting characters
6444                      * or appending to the current attribute value. It also
6445                      * takes care of that in the case when consuming the entity
6446                      * fails.
6447                      */
6448                     /*
6449                      * This section defines how to consume an entity. This
6450                      * definition is used when parsing entities in text and in
6451                      * attributes.
6452                      *
6453                      * The behavior depends on the identity of the next
6454                      * character (the one immediately after the U+0026 AMPERSAND
6455                      * character):
6456                      */
6457 
6458                     emitOrAppendCharRefBuf(returnState);
6459                     state = returnState;
6460                     continue;
6461                 case CHARACTER_REFERENCE_HILO_LOOKUP:
6462                     errNoNamedCharacterMatch();
6463                     emitOrAppendCharRefBuf(returnState);
6464                     state = returnState;
6465                     continue;
6466                 case CHARACTER_REFERENCE_TAIL:
6467                     outer: for (;;) {
6468                         char c = '\u0000';
6469                         entCol++;
6470                         /*
6471                          * Consume the maximum number of characters possible,
6472                          * with the consumed characters matching one of the
6473                          * identifiers in the first column of the named
6474                          * character references table (in a case-sensitive
6475                          * manner).
6476                          */
6477                         hiloop: for (;;) {
6478                             if (hi == -1) {
6479                                 break hiloop;
6480                             }
6481                             if (entCol == NamedCharacters.NAMES[hi].length()) {
6482                                 break hiloop;
6483                             }
6484                             if (entCol > NamedCharacters.NAMES[hi].length()) {
6485                                 break outer;
6486                             } else if (c < NamedCharacters.NAMES[hi].charAt(entCol)) {
6487                                 hi--;
6488                             } else {
6489                                 break hiloop;
6490                             }
6491                         }
6492 
6493                         loloop: for (;;) {
6494                             if (hi < lo) {
6495                                 break outer;
6496                             }
6497                             if (entCol == NamedCharacters.NAMES[lo].length()) {
6498                                 candidate = lo;
6499                                 charRefBufMark = charRefBufLen;
6500                                 lo++;
6501                             } else if (entCol > NamedCharacters.NAMES[lo].length()) {
6502                                 break outer;
6503                             } else if (c > NamedCharacters.NAMES[lo].charAt(entCol)) {
6504                                 lo++;
6505                             } else {
6506                                 break loloop;
6507                             }
6508                         }
6509                         if (hi < lo) {
6510                             break outer;
6511                         }
6512                         continue;
6513                     }
6514 
6515                     if (candidate == -1) {
6516                         /*
6517                          * If no match can be made, then this is a parse error.
6518                          */
6519                         errNoNamedCharacterMatch();
6520                         emitOrAppendCharRefBuf(returnState);
6521                         state = returnState;
6522                         continue eofloop;
6523                     } else {
6524                         @Const @CharacterName String candidateName = NamedCharacters.NAMES[candidate];
6525                         if (candidateName.length() == 0
6526                                 || candidateName.charAt(candidateName.length() - 1) != ';') {
6527                             /*
6528                              * If the last character matched is not a U+003B
6529                              * SEMICOLON (;), there is a parse error.
6530                              */
6531                             if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
6532                                 /*
6533                                  * If the entity is being consumed as part of an
6534                                  * attribute, and the last character matched is
6535                                  * not a U+003B SEMICOLON (;),
6536                                  */
6537                                 char ch;
6538                                 if (charRefBufMark == charRefBufLen) {
6539                                     ch = '\u0000';
6540                                 } else {
6541                                     ch = charRefBuf[charRefBufMark];
6542                                 }
6543                                 if ((ch >= '0' && ch <= '9')
6544                                         || (ch >= 'A' && ch <= 'Z')
6545                                         || (ch >= 'a' && ch <= 'z')) {
6546                                     /*
6547                                      * and the next character is in the range
6548                                      * U+0030 DIGIT ZERO to U+0039 DIGIT NINE,
6549                                      * U+0041 LATIN CAPITAL LETTER A to U+005A
6550                                      * LATIN CAPITAL LETTER Z, or U+0061 LATIN
6551                                      * SMALL LETTER A to U+007A LATIN SMALL
6552                                      * LETTER Z, then, for historical reasons,
6553                                      * all the characters that were matched
6554                                      * after the U+0026 AMPERSAND (&) must be
6555                                      * unconsumed, and nothing is returned.
6556                                      */
6557                                     errNoNamedCharacterMatch();
6558                                     appendCharRefBufToStrBuf();
6559                                     state = returnState;
6560                                     continue eofloop;
6561                                 }
6562                             }
6563                             if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
6564                                 errUnescapedAmpersandInterpretedAsCharacterReference();
6565                             } else {
6566                                 errNotSemicolonTerminated();
6567                             }
6568                         }
6569 
6570                         /*
6571                          * Otherwise, return a character token for the character
6572                          * corresponding to the entity name (as given by the
6573                          * second column of the named character references
6574                          * table).
6575                          */
6576                         @Const @NoLength char[] val = NamedCharacters.VALUES[candidate];
6577                         if (
6578                         // [NOCPP[
6579                         val.length == 1
6580                         // ]NOCPP]
6581                         // CPPONLY: val[1] == 0
6582                         ) {
6583                             emitOrAppendOne(val, returnState);
6584                         } else {
6585                             emitOrAppendTwo(val, returnState);
6586                         }
6587                         // this is so complicated!
6588                         if (charRefBufMark < charRefBufLen) {
6589                             if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
6590                                 appendStrBuf(charRefBuf, charRefBufMark,
6591                                         charRefBufLen - charRefBufMark);
6592                             } else {
6593                                 tokenHandler.characters(charRefBuf, charRefBufMark,
6594                                         charRefBufLen - charRefBufMark);
6595                             }
6596                         }
6597                         charRefBufLen = 0;
6598                         state = returnState;
6599                         continue eofloop;
6600                         /*
6601                          * If the markup contains I'm &notit; I tell you, the
6602                          * entity is parsed as "not", as in, I'm ¬it; I tell
6603                          * you. But if the markup was I'm &notin; I tell you,
6604                          * the entity would be parsed as "notin;", resulting in
6605                          * I'm ∉ I tell you.
6606                          */
6607                     }
6608                 case CONSUME_NCR:
6609                 case DECIMAL_NRC_LOOP:
6610                 case HEX_NCR_LOOP:
6611                     /*
6612                      * If no characters match the range, then don't consume any
6613                      * characters (and unconsume the U+0023 NUMBER SIGN
6614                      * character and, if appropriate, the X character). This is
6615                      * a parse error; nothing is returned.
6616                      *
6617                      * Otherwise, if the next character is a U+003B SEMICOLON,
6618                      * consume that too. If it isn't, there is a parse error.
6619                      */
6620                     if (!seenDigits) {
6621                         errNoDigitsInNCR();
6622                         emitOrAppendCharRefBuf(returnState);
6623                         state = returnState;
6624                         continue;
6625                     } else {
6626                         errCharRefLacksSemicolon();
6627                     }
6628                     // WARNING previous state sets reconsume
6629                     handleNcrValue(returnState);
6630                     state = returnState;
6631                     continue;
6632                 case CDATA_RSQB:
6633                     tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 1);
6634                     break eofloop;
6635                 case CDATA_RSQB_RSQB:
6636                     tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 2);
6637                     break eofloop;
6638                 case DATA:
6639                 default:
6640                     break eofloop;
6641             }
6642         }
6643         // case DATA:
6644         /*
6645          * EOF Emit an end-of-file token.
6646          */
6647         tokenHandler.eof();
6648         return;
6649     }
6650 
    /**
     * Hands the currently accumulated DOCTYPE fields to the token handler
     * and then clears them on this tokenizer.
     *
     * @param pos
     *            <code>cstart</code> is set to <code>pos + 1</code> before
     *            the handler is called (the EOF handling passes 0)
     * @throws SAXException
     *             if <code>tokenHandler.doctype</code> throws
     */
    private void emitDoctypeToken(int pos) throws SAXException {
        cstart = pos + 1;
        tokenHandler.doctype(doctypeName, publicIdentifier, systemIdentifier,
                forceQuirks);
        // It is OK and sufficient to release these here, since
        // there's no way out of the doctype states than through paths
        // that call this method.
        doctypeName = null;
        Portability.releaseString(publicIdentifier);
        publicIdentifier = null;
        Portability.releaseString(systemIdentifier);
        systemIdentifier = null;
    }
6664 
    /**
     * Returns the character stored at index <code>pos</code> of
     * <code>buf</code>. This implementation does nothing else and never
     * raises a <code>SAXException</code> itself; the method is
     * <code>protected</code>, so presumably subclasses override it to
     * inspect characters as they are read (confirm against subclasses).
     *
     * @param buf
     *            the buffer to read from
     * @param pos
     *            the index to read
     * @return <code>buf[pos]</code>
     * @throws SAXException
     *             never thrown by this implementation
     */
    @Inline protected char checkChar(@NoLength char[] buf, int pos)
            throws SAXException {
        return buf[pos];
    }
6669 
internalEncodingDeclaration(String internalCharset)6670     public boolean internalEncodingDeclaration(String internalCharset)
6671             throws SAXException {
6672         if (encodingDeclarationHandler != null) {
6673             return encodingDeclarationHandler.internalEncodingDeclaration(internalCharset);
6674         }
6675         return false;
6676     }
6677 
6678     /**
6679      * @param val
6680      * @throws SAXException
6681      */
emitOrAppendTwo(@onst @oLength char[] val, int returnState)6682     private void emitOrAppendTwo(@Const @NoLength char[] val, int returnState)
6683             throws SAXException {
6684         if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
6685             appendStrBuf(val[0]);
6686             appendStrBuf(val[1]);
6687         } else {
6688             tokenHandler.characters(val, 0, 2);
6689         }
6690     }
6691 
emitOrAppendOne(@onst @oLength char[] val, int returnState)6692     private void emitOrAppendOne(@Const @NoLength char[] val, int returnState)
6693             throws SAXException {
6694         if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
6695             appendStrBuf(val[0]);
6696         } else {
6697             tokenHandler.characters(val, 0, 1);
6698         }
6699     }
6700 
    /**
     * Drops the tokenizer's per-document references and notifies the token
     * handler that tokenization has ended.
     *
     * @throws SAXException
     *             if <code>tokenHandler.endTokenization()</code> throws
     */
    public void end() throws SAXException {
        // Drop the string buffer and the DOCTYPE fields.
        strBuf = null;
        doctypeName = null;
        if (systemIdentifier != null) {
            Portability.releaseString(systemIdentifier);
            systemIdentifier = null;
        }
        if (publicIdentifier != null) {
            Portability.releaseString(publicIdentifier);
            publicIdentifier = null;
        }
        // Drop tag and attribute name references, including the name held
        // by the reusable non-interned tag name holder.
        tagName = null;
        nonInternedTagName.setNameForNonInterned(null
                // CPPONLY: , false
                );
        attributeName = null;
        // CPPONLY: nonInternedAttributeName.setNameForNonInterned(null);
        // The handler is notified before the attributes holder is dealt
        // with below.
        tokenHandler.endTokenization();
        if (attributes != null) {
            // [NOCPP[
            attributes = null;
            // ]NOCPP]
            // CPPONLY: attributes.clear(mappingLangToXmlLang);
        }
    }
6726 
    /**
     * Sets the <code>shouldSuspend</code> flag. The flag is cleared again by
     * <code>resetToDataState()</code> and <code>loadState()</code>;
     * presumably the tokenization loop polls it to stop early (confirm at
     * the loop).
     */
    public void requestSuspension() {
        shouldSuspend = true;
    }
6730 
6731     // [NOCPP[
6732 
    /**
     * Sets the <code>confident</code> flag to <code>true</code>.
     * <code>initializeWithoutStarting()</code> resets it to
     * <code>false</code>. Presumably this is the encoding-confidence flag
     * (confirm where <code>confident</code> is read).
     */
    public void becomeConfident() {
        confident = true;
    }
6736 
    /**
     * Reports whether the next character is on a new line. This
     * implementation does not track that information and always answers
     * <code>false</code>.
     *
     * @return always <code>false</code> in this class
     */
    public boolean isNextCharOnNewLine() {
        return false;
    }
6745 
    /**
     * Returns the current value of the <code>lastCR</code> flag.
     *
     * @return <code>lastCR</code>; judging by the names, whether the
     *         previously seen character was a carriage return (confirm where
     *         the flag is set)
     */
    public boolean isPrevCR() {
        return lastCR;
    }
6749 
    /**
     * Returns the line number. This implementation does not expose a line
     * number and always answers <code>-1</code>; it does not read the
     * <code>line</code> field.
     *
     * @return always <code>-1</code> in this class
     */
    public int getLine() {
        return -1;
    }
6758 
    /**
     * Returns the column number. This implementation does not expose a
     * column number and always answers <code>-1</code>.
     *
     * @return always <code>-1</code> in this class
     */
    public int getCol() {
        return -1;
    }
6767 
6768     // ]NOCPP]
6769 
    /**
     * Tells whether the saved tokenizer state is the data state.
     *
     * @return <code>true</code> if <code>stateSave</code> equals
     *         <code>DATA</code>
     */
    public boolean isInDataState() {
        return (stateSave == DATA);
    }
6773 
resetToDataState()6774     public void resetToDataState() {
6775         clearStrBufAfterUse();
6776         charRefBufLen = 0;
6777         stateSave = Tokenizer.DATA;
6778         // line = 1; XXX line numbers
6779         lastCR = false;
6780         index = 0;
6781         forceQuirks = false;
6782         additional = '\u0000';
6783         entCol = -1;
6784         firstCharKey = -1;
6785         lo = 0;
6786         hi = 0; // will always be overwritten before use anyway
6787         candidate = -1;
6788         charRefBufMark = 0;
6789         value = 0;
6790         seenDigits = false;
6791         endTag = false;
6792         shouldSuspend = false;
6793         initDoctypeFields();
6794         containsHyphen = false;
6795         tagName = null;
6796         attributeName = null;
6797         if (newAttributesEachTime) {
6798             if (attributes != null) {
6799                 Portability.delete(attributes);
6800                 attributes = null;
6801             }
6802         }
6803     }
6804 
    /**
     * Copies the tokenization state of <code>other</code> into this
     * tokenizer: buffer contents, saved states, character reference
     * bookkeeping, DOCTYPE fields, tag and attribute names and the
     * attributes holder. <code>shouldSuspend</code> is not copied; it is
     * cleared.
     *
     * NOTE(review): <code>strBuf.length</code> is dereferenced below, while
     * <code>end()</code> and <code>initializeWithoutStarting()</code> set
     * <code>strBuf</code> to <code>null</code>; presumably callers guarantee
     * both tokenizers have an allocated buffer at this point - confirm.
     *
     * @param other
     *            the tokenizer whose state is copied; it is only read
     * @throws SAXException
     *             declared by the signature; presumably raised by one of
     *             the helper calls - confirm
     */
    public void loadState(Tokenizer other) throws SAXException {
        // Copy the string buffer contents, growing our buffer if needed.
        strBufLen = other.strBufLen;
        if (strBufLen > strBuf.length) {
            strBuf = new char[strBufLen];
        }
        System.arraycopy(other.strBuf, 0, strBuf, 0, strBufLen);

        // No growth check here: assumes charRefBuf can hold
        // other.charRefBufLen chars - TODO confirm both buffers are
        // allocated with the same size.
        charRefBufLen = other.charRefBufLen;
        System.arraycopy(other.charRefBuf, 0, charRefBuf, 0, charRefBufLen);

        stateSave = other.stateSave;
        returnStateSave = other.returnStateSave;
        endTagExpectation = other.endTagExpectation;
        endTagExpectationAsArray = other.endTagExpectationAsArray;
        // line = 1; XXX line numbers
        lastCR = other.lastCR;
        index = other.index;
        forceQuirks = other.forceQuirks;
        additional = other.additional;
        entCol = other.entCol;
        firstCharKey = other.firstCharKey;
        lo = other.lo;
        hi = other.hi;
        candidate = other.candidate;
        charRefBufMark = other.charRefBufMark;
        value = other.value;
        seenDigits = other.seenDigits;
        endTag = other.endTag;
        // Deliberately not copied from other.
        shouldSuspend = false;

        if (other.doctypeName == null) {
            doctypeName = null;
        } else {
            doctypeName = Portability.newLocalFromLocal(other.doctypeName,
                    interner);
        }

        // Release our own identifiers before taking copies of the other
        // tokenizer's.
        Portability.releaseString(systemIdentifier);
        if (other.systemIdentifier == null) {
            systemIdentifier = null;
        } else {
            systemIdentifier = Portability.newStringFromString(other.systemIdentifier);
        }

        Portability.releaseString(publicIdentifier);
        if (other.publicIdentifier == null) {
            publicIdentifier = null;
        } else {
            publicIdentifier = Portability.newStringFromString(other.publicIdentifier);
        }

        containsHyphen = other.containsHyphen;
        if (other.tagName == null) {
            tagName = null;
        } else if (other.tagName.isInterned()) {
            // Interned names are shared as-is.
            tagName = other.tagName;
        } else {
            // In the C++ case, the atoms in the other tokenizer are from a
            // different tokenizer-scoped atom table. Therefore, we have to
            // obtain the corresponding atom from our own atom table.
            nonInternedTagName.setNameForNonInterned(Portability.newLocalFromLocal(other.tagName.getName(), interner)
                    // CPPONLY: , other.tagName.isCustom()
                    );
            tagName = nonInternedTagName;
        }

        // [NOCPP[
        attributeName = other.attributeName;
        // ]NOCPP]
        // CPPONLY: if (other.attributeName == null) {
        // CPPONLY:     attributeName = null;
        // CPPONLY: } else if (other.attributeName.isInterned()) {
        // CPPONLY:     attributeName = other.attributeName;
        // CPPONLY: } else {
        // CPPONLY:     // In the C++ case, the atoms in the other tokenizer are from a
        // CPPONLY:     // different tokenizer-scoped atom table. Therefore, we have to
        // CPPONLY:     // obtain the correspoding atom from our own atom table.
        // CPPONLY:     nonInternedAttributeName.setNameForNonInterned(Portability.newLocalFromLocal(other.attributeName.getLocal(AttributeName.HTML), interner));
        // CPPONLY:     attributeName = nonInternedAttributeName;
        // CPPONLY: }

        // Replace our attributes holder with a clone of the other's (or
        // null when the other has none).
        Portability.delete(attributes);
        if (other.attributes == null) {
            attributes = null;
        } else {
            attributes = other.attributes.cloneAttributes(interner);
        }
    }
6893 
    /**
     * Puts the tokenizer into its initial configuration without notifying
     * the token handler that tokenization starts: clears
     * <code>confident</code>, drops the string buffer, sets
     * <code>line</code> to 1 and finishes with
     * <code>resetToDataState()</code>.
     *
     * @throws SAXException
     *             declared by the signature; presumably raised by
     *             <code>tokenHandler.wantsComments()</code> - confirm
     */
    public void initializeWithoutStarting() throws SAXException {
        confident = false;
        strBuf = null;
        line = 1;
        // CPPONLY: attributeLine = 1;
        // [NOCPP[
        html4 = false;
        metaBoundaryPassed = false;
        wantsComments = tokenHandler.wantsComments();
        // With a reusable holder, allocate it once up front.
        if (!newAttributesEachTime) {
            attributes = new HtmlAttributes(mappingLangToXmlLang);
        }
        // ]NOCPP]
        resetToDataState();
    }
6909 
    /**
     * Error-reporting hook with an empty body in this class. Judging by the
     * name, it is meant for unexpected characters following
     * <code>&lt;/</code> (confirm at the call site).
     *
     * @throws SAXException
     *             never thrown by this empty implementation
     */
    protected void errGarbageAfterLtSlash() throws SAXException {
    }
6912 
    /**
     * Error-reporting hook with an empty body in this class. Judging by the
     * name, it is meant for the sequence <code>&lt;/&gt;</code> (confirm at
     * the call site).
     *
     * @throws SAXException
     *             never thrown by this empty implementation
     */
    protected void errLtSlashGt() throws SAXException {
    }
6915 
    /**
     * Reporting hook with an empty body in this class. Judging by the name,
     * it is meant as a warning about <code>&lt;/</code> inside RCDATA
     * content (confirm at the call site).
     *
     * @throws SAXException
     *             never thrown by this empty implementation
     */
    protected void errWarnLtSlashInRcdata() throws SAXException {
    }
6918 
errHtml4LtSlashInRcdata(char folded)6919     protected void errHtml4LtSlashInRcdata(char folded) throws SAXException {
6920     }
6921 
errCharRefLacksSemicolon()6922     protected void errCharRefLacksSemicolon() throws SAXException {
6923     }
6924 
errNoDigitsInNCR()6925     protected void errNoDigitsInNCR() throws SAXException {
6926     }
6927 
errGtInSystemId()6928     protected void errGtInSystemId() throws SAXException {
6929     }
6930 
errGtInPublicId()6931     protected void errGtInPublicId() throws SAXException {
6932     }
6933 
errNamelessDoctype()6934     protected void errNamelessDoctype() throws SAXException {
6935     }
6936 
errConsecutiveHyphens()6937     protected void errConsecutiveHyphens() throws SAXException {
6938     }
6939 
errPrematureEndOfComment()6940     protected void errPrematureEndOfComment() throws SAXException {
6941     }
6942 
errBogusComment()6943     protected void errBogusComment() throws SAXException {
6944     }
6945 
errUnquotedAttributeValOrNull(char c)6946     protected void errUnquotedAttributeValOrNull(char c) throws SAXException {
6947     }
6948 
errSlashNotFollowedByGt()6949     protected void errSlashNotFollowedByGt() throws SAXException {
6950     }
6951 
errHtml4XmlVoidSyntax()6952     protected void errHtml4XmlVoidSyntax() throws SAXException {
6953     }
6954 
errNoSpaceBetweenAttributes()6955     protected void errNoSpaceBetweenAttributes() throws SAXException {
6956     }
6957 
errHtml4NonNameInUnquotedAttribute(char c)6958     protected void errHtml4NonNameInUnquotedAttribute(char c)
6959             throws SAXException {
6960     }
6961 
errLtOrEqualsOrGraveInUnquotedAttributeOrNull(char c)6962     protected void errLtOrEqualsOrGraveInUnquotedAttributeOrNull(char c)
6963             throws SAXException {
6964     }
6965 
errAttributeValueMissing()6966     protected void errAttributeValueMissing() throws SAXException {
6967     }
6968 
errBadCharBeforeAttributeNameOrNull(char c)6969     protected void errBadCharBeforeAttributeNameOrNull(char c)
6970             throws SAXException {
6971     }
6972 
errEqualsSignBeforeAttributeName()6973     protected void errEqualsSignBeforeAttributeName() throws SAXException {
6974     }
6975 
errBadCharAfterLt(char c)6976     protected void errBadCharAfterLt(char c) throws SAXException {
6977     }
6978 
errLtGt()6979     protected void errLtGt() throws SAXException {
6980     }
6981 
errProcessingInstruction()6982     protected void errProcessingInstruction() throws SAXException {
6983     }
6984 
errUnescapedAmpersandInterpretedAsCharacterReference()6985     protected void errUnescapedAmpersandInterpretedAsCharacterReference()
6986             throws SAXException {
6987     }
6988 
errNotSemicolonTerminated()6989     protected void errNotSemicolonTerminated() throws SAXException {
6990     }
6991 
errNoNamedCharacterMatch()6992     protected void errNoNamedCharacterMatch() throws SAXException {
6993     }
6994 
errQuoteBeforeAttributeName(char c)6995     protected void errQuoteBeforeAttributeName(char c) throws SAXException {
6996     }
6997 
errQuoteOrLtInAttributeNameOrNull(char c)6998     protected void errQuoteOrLtInAttributeNameOrNull(char c)
6999             throws SAXException {
7000     }
7001 
errExpectedPublicId()7002     protected void errExpectedPublicId() throws SAXException {
7003     }
7004 
errBogusDoctype()7005     protected void errBogusDoctype() throws SAXException {
7006     }
7007 
maybeWarnPrivateUseAstral()7008     protected void maybeWarnPrivateUseAstral() throws SAXException {
7009     }
7010 
maybeWarnPrivateUse(char ch)7011     protected void maybeWarnPrivateUse(char ch) throws SAXException {
7012     }
7013 
maybeErrAttributesOnEndTag(HtmlAttributes attrs)7014     protected void maybeErrAttributesOnEndTag(HtmlAttributes attrs)
7015             throws SAXException {
7016     }
7017 
maybeErrSlashInEndTag(boolean selfClosing)7018     protected void maybeErrSlashInEndTag(boolean selfClosing)
7019             throws SAXException {
7020     }
7021 
errNcrNonCharacter(char ch)7022     protected char errNcrNonCharacter(char ch) throws SAXException {
7023         return ch;
7024     }
7025 
errAstralNonCharacter(int ch)7026     protected void errAstralNonCharacter(int ch) throws SAXException {
7027     }
7028 
errNcrSurrogate()7029     protected void errNcrSurrogate() throws SAXException {
7030     }
7031 
errNcrControlChar(char ch)7032     protected char errNcrControlChar(char ch) throws SAXException {
7033         return ch;
7034     }
7035 
errNcrCr()7036     protected void errNcrCr() throws SAXException {
7037     }
7038 
errNcrInC1Range()7039     protected void errNcrInC1Range() throws SAXException {
7040     }
7041 
errEofInPublicId()7042     protected void errEofInPublicId() throws SAXException {
7043     }
7044 
errEofInComment()7045     protected void errEofInComment() throws SAXException {
7046     }
7047 
errEofInDoctype()7048     protected void errEofInDoctype() throws SAXException {
7049     }
7050 
errEofInAttributeValue()7051     protected void errEofInAttributeValue() throws SAXException {
7052     }
7053 
errEofInAttributeName()7054     protected void errEofInAttributeName() throws SAXException {
7055     }
7056 
errEofWithoutGt()7057     protected void errEofWithoutGt() throws SAXException {
7058     }
7059 
errEofInTagName()7060     protected void errEofInTagName() throws SAXException {
7061     }
7062 
errEofInEndTag()7063     protected void errEofInEndTag() throws SAXException {
7064     }
7065 
errEofAfterLt()7066     protected void errEofAfterLt() throws SAXException {
7067     }
7068 
errNcrOutOfRange()7069     protected void errNcrOutOfRange() throws SAXException {
7070     }
7071 
errNcrUnassigned()7072     protected void errNcrUnassigned() throws SAXException {
7073     }
7074 
errDuplicateAttribute()7075     protected void errDuplicateAttribute() throws SAXException {
7076     }
7077 
errEofInSystemId()7078     protected void errEofInSystemId() throws SAXException {
7079     }
7080 
errExpectedSystemId()7081     protected void errExpectedSystemId() throws SAXException {
7082     }
7083 
errMissingSpaceBeforeDoctypeName()7084     protected void errMissingSpaceBeforeDoctypeName() throws SAXException {
7085     }
7086 
errHyphenHyphenBang()7087     protected void errHyphenHyphenBang() throws SAXException {
7088     }
7089 
errNcrControlChar()7090     protected void errNcrControlChar() throws SAXException {
7091     }
7092 
errNcrZero()7093     protected void errNcrZero() throws SAXException {
7094     }
7095 
errNoSpaceBetweenDoctypeSystemKeywordAndQuote()7096     protected void errNoSpaceBetweenDoctypeSystemKeywordAndQuote()
7097             throws SAXException {
7098     }
7099 
errNoSpaceBetweenPublicAndSystemIds()7100     protected void errNoSpaceBetweenPublicAndSystemIds() throws SAXException {
7101     }
7102 
errNoSpaceBetweenDoctypePublicKeywordAndQuote()7103     protected void errNoSpaceBetweenDoctypePublicKeywordAndQuote()
7104             throws SAXException {
7105     }
7106 
noteAttributeWithoutValue()7107     protected void noteAttributeWithoutValue() throws SAXException {
7108     }
7109 
noteUnquotedAttributeValue()7110     protected void noteUnquotedAttributeValue() throws SAXException {
7111     }
7112 
7113     /**
7114      * Sets the encodingDeclarationHandler.
7115      *
7116      * @param encodingDeclarationHandler
7117      *            the encodingDeclarationHandler to set
7118      */
setEncodingDeclarationHandler( EncodingDeclarationHandler encodingDeclarationHandler)7119     public void setEncodingDeclarationHandler(
7120             EncodingDeclarationHandler encodingDeclarationHandler) {
7121         this.encodingDeclarationHandler = encodingDeclarationHandler;
7122     }
7123 
    /**
     * Passes <code>nonInternedTagName</code> and <code>attributes</code> to
     * <code>Portability.delete</code> and sets both fields to
     * <code>null</code>.
     *
     * <p>The <code>CPPONLY</code> line additionally deletes
     * <code>nonInternedAttributeName</code>; judging by the marker, that
     * statement applies only to the translated C++ code.
     *
     * <p>NOTE(review): unlike the other two fields,
     * <code>nonInternedAttributeName</code> is not reset to
     * <code>null</code> after deletion in the <code>CPPONLY</code> path;
     * confirm that this is intentional.
     */
    void destructor() {
        Portability.delete(nonInternedTagName);
        // CPPONLY: Portability.delete(nonInternedAttributeName);
        nonInternedTagName = null;
        // The translator will write refcount tracing stuff here
        Portability.delete(attributes);
        attributes = null;
    }
7132 
7133     // [NOCPP[
7134 
7135     /**
7136      * Sets an offset to be added to the position reported to
7137      * <code>TransitionHandler</code>.
7138      *
7139      * @param offset the offset
7140      */
setTransitionBaseOffset(int offset)7141     public void setTransitionBaseOffset(int offset) {
7142 
7143     }
7144 
7145     // ]NOCPP]
7146 
7147 }
7148