1 /*
2  * Copyright (c) 2005-2007 Henri Sivonen
3  * Copyright (c) 2007-2017 Mozilla Foundation
4  * Portions of comments Copyright 2004-2010 Apple Computer, Inc., Mozilla
5  * Foundation, and Opera Software ASA.
6  *
7  * Permission is hereby granted, free of charge, to any person obtaining a
8  * copy of this software and associated documentation files (the "Software"),
9  * to deal in the Software without restriction, including without limitation
10  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
11  * and/or sell copies of the Software, and to permit persons to whom the
12  * Software is furnished to do so, subject to the following conditions:
13  *
14  * The above copyright notice and this permission notice shall be included in
15  * all copies or substantial portions of the Software.
16  *
17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
23  * DEALINGS IN THE SOFTWARE.
24  */
25 
26 /*
27  * The comments following this one that use the same comment syntax as this
28  * comment are quotes from the WHATWG HTML 5 spec as of 2 June 2007
29  * amended as of June 18 2008 and May 31 2010.
30  * That document came with this statement:
31  * "© Copyright 2004-2010 Apple Computer, Inc., Mozilla Foundation, and
32  * Opera Software ASA. You are granted a license to use, reproduce and
33  * create derivative works of this document."
34  */
35 
36 package nu.validator.htmlparser.impl;
37 
38 import org.xml.sax.ErrorHandler;
39 import org.xml.sax.Locator;
40 import org.xml.sax.ext.Locator2;
41 import org.xml.sax.SAXException;
42 import org.xml.sax.SAXParseException;
43 
44 import nu.validator.htmlparser.annotation.Auto;
45 import nu.validator.htmlparser.annotation.CharacterName;
46 import nu.validator.htmlparser.annotation.Const;
47 import nu.validator.htmlparser.annotation.Inline;
48 import nu.validator.htmlparser.annotation.Local;
49 import nu.validator.htmlparser.annotation.NoLength;
50 import nu.validator.htmlparser.common.EncodingDeclarationHandler;
51 import nu.validator.htmlparser.common.Interner;
52 import nu.validator.htmlparser.common.TokenHandler;
53 import nu.validator.htmlparser.common.XmlViolationPolicy;
54 
55 /**
56  * An implementation of
57  * https://html.spec.whatwg.org/multipage/syntax.html#tokenization
58  *
59  * This class implements the <code>Locator</code> interface. This is not an
60  * incidental implementation detail: Users of this class are encouraged to make
61  * use of the <code>Locator</code> nature.
62  *
63  * By default, the tokenizer may report data that XML 1.0 bans. The tokenizer
64  * can be configured to treat these conditions as fatal or to coerce the infoset
65  * to something that XML 1.0 allows.
66  *
67  * @version $Id$
68  * @author hsivonen
69  */
70 public class Tokenizer implements Locator, Locator2 {
71 
72     private static final int DATA_AND_RCDATA_MASK = ~1;
73 
74     public static final int DATA = 0;
75 
76     public static final int RCDATA = 1;
77 
78     public static final int SCRIPT_DATA = 2;
79 
80     public static final int RAWTEXT = 3;
81 
82     public static final int SCRIPT_DATA_ESCAPED = 4;
83 
84     public static final int ATTRIBUTE_VALUE_DOUBLE_QUOTED = 5;
85 
86     public static final int ATTRIBUTE_VALUE_SINGLE_QUOTED = 6;
87 
88     public static final int ATTRIBUTE_VALUE_UNQUOTED = 7;
89 
90     public static final int PLAINTEXT = 8;
91 
92     public static final int TAG_OPEN = 9;
93 
94     public static final int CLOSE_TAG_OPEN = 10;
95 
96     public static final int TAG_NAME = 11;
97 
98     public static final int BEFORE_ATTRIBUTE_NAME = 12;
99 
100     public static final int ATTRIBUTE_NAME = 13;
101 
102     public static final int AFTER_ATTRIBUTE_NAME = 14;
103 
104     public static final int BEFORE_ATTRIBUTE_VALUE = 15;
105 
106     public static final int AFTER_ATTRIBUTE_VALUE_QUOTED = 16;
107 
108     public static final int BOGUS_COMMENT = 17;
109 
110     public static final int MARKUP_DECLARATION_OPEN = 18;
111 
112     public static final int DOCTYPE = 19;
113 
114     public static final int BEFORE_DOCTYPE_NAME = 20;
115 
116     public static final int DOCTYPE_NAME = 21;
117 
118     public static final int AFTER_DOCTYPE_NAME = 22;
119 
120     public static final int BEFORE_DOCTYPE_PUBLIC_IDENTIFIER = 23;
121 
122     public static final int DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED = 24;
123 
124     public static final int DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED = 25;
125 
126     public static final int AFTER_DOCTYPE_PUBLIC_IDENTIFIER = 26;
127 
128     public static final int BEFORE_DOCTYPE_SYSTEM_IDENTIFIER = 27;
129 
130     public static final int DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED = 28;
131 
132     public static final int DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED = 29;
133 
134     public static final int AFTER_DOCTYPE_SYSTEM_IDENTIFIER = 30;
135 
136     public static final int BOGUS_DOCTYPE = 31;
137 
138     public static final int COMMENT_START = 32;
139 
140     public static final int COMMENT_START_DASH = 33;
141 
142     public static final int COMMENT = 34;
143 
144     public static final int COMMENT_END_DASH = 35;
145 
146     public static final int COMMENT_END = 36;
147 
148     public static final int COMMENT_END_BANG = 37;
149 
150     public static final int NON_DATA_END_TAG_NAME = 38;
151 
152     public static final int MARKUP_DECLARATION_HYPHEN = 39;
153 
154     public static final int MARKUP_DECLARATION_OCTYPE = 40;
155 
156     public static final int DOCTYPE_UBLIC = 41;
157 
158     public static final int DOCTYPE_YSTEM = 42;
159 
160     public static final int AFTER_DOCTYPE_PUBLIC_KEYWORD = 43;
161 
162     public static final int BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS = 44;
163 
164     public static final int AFTER_DOCTYPE_SYSTEM_KEYWORD = 45;
165 
166     public static final int CONSUME_CHARACTER_REFERENCE = 46;
167 
168     public static final int CONSUME_NCR = 47;
169 
170     public static final int CHARACTER_REFERENCE_TAIL = 48;
171 
172     public static final int HEX_NCR_LOOP = 49;
173 
174     public static final int DECIMAL_NRC_LOOP = 50;
175 
176     public static final int HANDLE_NCR_VALUE = 51;
177 
178     public static final int HANDLE_NCR_VALUE_RECONSUME = 52;
179 
180     public static final int CHARACTER_REFERENCE_HILO_LOOKUP = 53;
181 
182     public static final int SELF_CLOSING_START_TAG = 54;
183 
184     public static final int CDATA_START = 55;
185 
186     public static final int CDATA_SECTION = 56;
187 
188     public static final int CDATA_RSQB = 57;
189 
190     public static final int CDATA_RSQB_RSQB = 58;
191 
192     public static final int SCRIPT_DATA_LESS_THAN_SIGN = 59;
193 
194     public static final int SCRIPT_DATA_ESCAPE_START = 60;
195 
196     public static final int SCRIPT_DATA_ESCAPE_START_DASH = 61;
197 
198     public static final int SCRIPT_DATA_ESCAPED_DASH = 62;
199 
200     public static final int SCRIPT_DATA_ESCAPED_DASH_DASH = 63;
201 
202     public static final int BOGUS_COMMENT_HYPHEN = 64;
203 
204     public static final int RAWTEXT_RCDATA_LESS_THAN_SIGN = 65;
205 
206     public static final int SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN = 66;
207 
208     public static final int SCRIPT_DATA_DOUBLE_ESCAPE_START = 67;
209 
210     public static final int SCRIPT_DATA_DOUBLE_ESCAPED = 68;
211 
212     public static final int SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN = 69;
213 
214     public static final int SCRIPT_DATA_DOUBLE_ESCAPED_DASH = 70;
215 
216     public static final int SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH = 71;
217 
218     public static final int SCRIPT_DATA_DOUBLE_ESCAPE_END = 72;
219 
220     public static final int PROCESSING_INSTRUCTION = 73;
221 
222     public static final int PROCESSING_INSTRUCTION_QUESTION_MARK = 74;
223 
224     /**
225      * Magic value for UTF-16 operations.
226      */
227     private static final int LEAD_OFFSET = (0xD800 - (0x10000 >> 10));
228 
229     /**
230      * UTF-16 code unit array containing less than and greater than for emitting
231      * those characters on certain parse errors.
232      */
233     private static final @NoLength char[] LT_GT = { '<', '>' };
234 
235     /**
236      * UTF-16 code unit array containing less than and solidus for emitting
237      * those characters on certain parse errors.
238      */
239     private static final @NoLength char[] LT_SOLIDUS = { '<', '/' };
240 
241     /**
242      * UTF-16 code unit array containing ]] for emitting those characters on
243      * state transitions.
244      */
245     private static final @NoLength char[] RSQB_RSQB = { ']', ']' };
246 
247     /**
248      * Array version of U+FFFD.
249      */
250     private static final @NoLength char[] REPLACEMENT_CHARACTER = { '\uFFFD' };
251 
252     // [NOCPP[
253 
254     /**
255      * Array version of space.
256      */
257     private static final @NoLength char[] SPACE = { ' ' };
258 
259     // ]NOCPP]
260 
261     /**
262      * Array version of line feed.
263      */
264     private static final @NoLength char[] LF = { '\n' };
265 
266     /**
267      * "CDATA[" as <code>char[]</code>
268      */
269     private static final @NoLength char[] CDATA_LSQB = { 'C', 'D', 'A', 'T',
270             'A', '[' };
271 
272     /**
273      * "octype" as <code>char[]</code>
274      */
275     private static final @NoLength char[] OCTYPE = { 'o', 'c', 't', 'y', 'p',
276             'e' };
277 
278     /**
279      * "ublic" as <code>char[]</code>
280      */
281     private static final @NoLength char[] UBLIC = { 'u', 'b', 'l', 'i', 'c' };
282 
283     /**
284      * "ystem" as <code>char[]</code>
285      */
286     private static final @NoLength char[] YSTEM = { 'y', 's', 't', 'e', 'm' };
287 
288     private static final char[] TITLE_ARR = { 't', 'i', 't', 'l', 'e' };
289 
290     private static final char[] SCRIPT_ARR = { 's', 'c', 'r', 'i', 'p', 't' };
291 
292     private static final char[] STYLE_ARR = { 's', 't', 'y', 'l', 'e' };
293 
294     private static final char[] PLAINTEXT_ARR = { 'p', 'l', 'a', 'i', 'n', 't',
295             'e', 'x', 't' };
296 
297     private static final char[] XMP_ARR = { 'x', 'm', 'p' };
298 
299     private static final char[] TEXTAREA_ARR = { 't', 'e', 'x', 't', 'a', 'r',
300             'e', 'a' };
301 
302     private static final char[] IFRAME_ARR = { 'i', 'f', 'r', 'a', 'm', 'e' };
303 
304     private static final char[] NOEMBED_ARR = { 'n', 'o', 'e', 'm', 'b', 'e',
305             'd' };
306 
307     private static final char[] NOSCRIPT_ARR = { 'n', 'o', 's', 'c', 'r', 'i',
308             'p', 't' };
309 
310     private static final char[] NOFRAMES_ARR = { 'n', 'o', 'f', 'r', 'a', 'm',
311             'e', 's' };
312 
313     /**
314      * The token handler.
315      */
316     protected final TokenHandler tokenHandler;
317 
318     protected EncodingDeclarationHandler encodingDeclarationHandler;
319 
320     // [NOCPP[
321 
322     /**
323      * The error handler.
324      */
325     protected ErrorHandler errorHandler;
326 
327     // ]NOCPP]
328 
329     /**
330      * Whether the previous char read was CR.
331      */
332     protected boolean lastCR;
333 
334     protected int stateSave;
335 
336     private int returnStateSave;
337 
338     protected int index;
339 
340     private boolean forceQuirks;
341 
342     private char additional;
343 
344     private int entCol;
345 
346     private int firstCharKey;
347 
348     private int lo;
349 
350     private int hi;
351 
352     private int candidate;
353 
354     private int charRefBufMark;
355 
356     protected int value;
357 
358     private boolean seenDigits;
359 
360     protected int cstart;
361 
362     /**
363      * The SAX public id for the resource being tokenized. (Only passed back
364      * as part of locator data.)
365      */
366     private String publicId;
367 
368     /**
369      * The SAX system id for the resource being tokenized. (Only passed back
370      * as part of locator data.)
371      */
372     private String systemId;
373 
374     /**
375      * Buffer for bufferable things other than those that fit the description
376      * of <code>charRefBuf</code>.
377      */
378     private @Auto char[] strBuf;
379 
380     /**
381      * Number of significant <code>char</code>s in <code>strBuf</code>.
382      */
383     private int strBufLen;
384 
385     /**
386      * Buffer for characters that might form a character reference but may
387      * end up not forming one.
388      */
389     private final @Auto char[] charRefBuf;
390 
391     /**
392      * Number of significant <code>char</code>s in <code>charRefBuf</code>.
393      */
394     private int charRefBufLen;
395 
396     /**
397      * Buffer for expanding NCRs falling into the Basic Multilingual Plane.
398      */
399     private final @Auto char[] bmpChar;
400 
401     /**
402      * Buffer for expanding astral NCRs.
403      */
404     private final @Auto char[] astralChar;
405 
406     /**
407      * The element whose end tag closes the current CDATA or RCDATA element.
408      */
409     protected ElementName endTagExpectation = null;
410 
411     private char[] endTagExpectationAsArray; // not @Auto!
412 
413     /**
414      * <code>true</code> if tokenizing an end tag
415      */
416     protected boolean endTag;
417 
418     /**
419      * <code>true</code> iff the current element/attribute name contains
420      * a hyphen.
421      */
422     private boolean containsHyphen;
423 
424     /**
425      * The current tag token name. One of
426      * 1) null,
427      * 2) non-owning reference to nonInternedTagName
428      * 3) non-owning reference to a pre-interned ElementName
429      */
430     private ElementName tagName = null;
431 
432     /**
433      * The recycled ElementName instance for the non-pre-interned cases.
434      */
435     private ElementName nonInternedTagName = null;
436 
437     /**
438      * The current attribute name.
439      */
440     protected AttributeName attributeName = null;
441 
442     // CPPONLY: private AttributeName nonInternedAttributeName = null;
443 
444     // [NOCPP[
445 
446     /**
447      * Whether comment tokens are emitted.
448      */
449     private boolean wantsComments = false;
450 
451     /**
452      * Whether the stream is past the first 1024 bytes.
453      */
454     private boolean metaBoundaryPassed;
455 
456     // ]NOCPP]
457 
458     /**
459      * The name of the current doctype token.
460      */
461     private @Local String doctypeName;
462 
463     /**
464      * The public id of the current doctype token.
465      */
466     private String publicIdentifier;
467 
468     /**
469      * The system id of the current doctype token.
470      */
471     private String systemIdentifier;
472 
473     /**
474      * The attribute holder.
475      */
476     private HtmlAttributes attributes;
477 
478     // [NOCPP[
479 
480     /**
481      * The policy for vertical tab and form feed.
482      */
483     private XmlViolationPolicy contentSpacePolicy = XmlViolationPolicy.ALTER_INFOSET;
484 
485     /**
486      * The policy for comments.
487      */
488     private XmlViolationPolicy commentPolicy = XmlViolationPolicy.ALTER_INFOSET;
489 
490     private XmlViolationPolicy xmlnsPolicy = XmlViolationPolicy.ALTER_INFOSET;
491 
492     private XmlViolationPolicy namePolicy = XmlViolationPolicy.ALTER_INFOSET;
493 
494     private int mappingLangToXmlLang;
495 
496     // ]NOCPP]
497 
498     private final boolean newAttributesEachTime;
499 
500     private boolean shouldSuspend;
501 
502     protected boolean confident;
503 
504     private int line;
505 
506     /*
507      * The line number of the current attribute. First set to the line of the
508      * attribute name and if there is a value, set to the line the value
509      * started on.
510      */
511     // CPPONLY: private int attributeLine;
512 
513     private Interner interner;
514 
515     // CPPONLY: private boolean viewingXmlSource;
516 
517     // [NOCPP[
518 
519     protected LocatorImpl ampersandLocation;
520 
/**
 * Creates a tokenizer that reports tokens to the given handler and lets
 * the caller decide how {@link #emptyAttributes()} behaves.
 *
 * @param tokenHandler
 *            the handler for receiving tokens
 * @param newAttributesEachTime
 *            <code>true</code> to make <code>emptyAttributes()</code>
 *            return a new <code>HtmlAttributes</code> on every call
 *            instead of the shared empty instance
 */
public Tokenizer(TokenHandler tokenHandler, boolean newAttributesEachTime) {
    // Constructor arguments; the parameters shadow the fields.
    this.tokenHandler = tokenHandler;
    this.newAttributesEachTime = newAttributesEachTime;

    // Optional collaborators are installed later through their setters.
    encodingDeclarationHandler = null;
    interner = null;
    attributes = null;

    // Fixed-size scratch buffers.
    // &CounterClockwiseContourIntegral; is the longest valid char ref and
    // the semicolon never gets appended to the buffer.
    charRefBuf = new char[32];
    bmpChar = new char[1];
    astralChar = new char[2];
    charRefBufLen = 0;
    charRefBufMark = 0;
    strBufLen = 0;

    // Saved states and position tracking; state 0 is DATA.
    stateSave = 0;
    returnStateSave = 0;
    lastCR = false;
    index = 0;
    cstart = 0;
    line = 0;
    // CPPONLY: this.attributeLine = 0;
    shouldSuspend = false;
    confident = false;

    // Remaining scalar scratch values start from their zero value.
    additional = '\u0000';
    entCol = 0;
    firstCharKey = 0;
    lo = 0;
    hi = 0;
    candidate = 0;
    value = 0;
    seenDigits = false;

    // Tag and attribute tokens under construction.
    endTagExpectation = null;
    endTagExpectationAsArray = null;
    endTag = false;
    containsHyphen = false;
    tagName = null;
    nonInternedTagName = new ElementName();
    attributeName = null;
    // CPPONLY: this.nonInternedAttributeName = new AttributeName();

    // Doctype token under construction.
    forceQuirks = false;
    doctypeName = null;
    publicIdentifier = null;
    systemIdentifier = null;
}
565 
566     // ]NOCPP]
567 
568     /**
569      * The constructor.
570      *
571      * @param tokenHandler
572      *            the handler for receiving tokens
573      */
public Tokenizer(TokenHandler tokenHandler
// CPPONLY: , boolean viewingXmlSource
) {
    // NOTE(review): lines tagged CPPONLY are plain comments in Java and the
    // [NOCPP[ ... ]NOCPP] pairs bracket Java-only code; presumably both are
    // consumed by a Java-to-C++ translation step, so keep them and the
    // statement order intact -- confirm before restructuring.
    this.tokenHandler = tokenHandler;
    this.encodingDeclarationHandler = null;
    // [NOCPP[
    // In Java this constructor always selects the shared empty attribute
    // holder in emptyAttributes().
    this.newAttributesEachTime = false;
    // ]NOCPP]
    this.lastCR = false;
    // State 0 is DATA.
    this.stateSave = 0;
    this.returnStateSave = 0;
    this.index = 0;
    this.forceQuirks = false;
    this.additional = '\u0000';
    this.entCol = 0;
    this.firstCharKey = 0;
    this.lo = 0;
    this.hi = 0;
    this.candidate = 0;
    this.charRefBufMark = 0;
    this.value = 0;
    this.seenDigits = false;
    this.cstart = 0;
    this.strBufLen = 0;
    // &CounterClockwiseContourIntegral; is the longest valid char ref and
    // the semicolon never gets appended to the buffer.
    this.charRefBuf = new char[32];
    this.charRefBufLen = 0;
    // One code unit for a BMP character, two (a surrogate pair) for an
    // astral one.
    this.bmpChar = new char[1];
    this.astralChar = new char[2];
    this.endTagExpectation = null;
    this.endTagExpectationAsArray = null;
    this.endTag = false;
    this.containsHyphen = false;
    this.tagName = null;
    // Recycled instance for names that are not pre-interned.
    this.nonInternedTagName = new ElementName();
    this.attributeName = null;
    // CPPONLY: this.nonInternedAttributeName = new AttributeName();
    this.doctypeName = null;
    this.publicIdentifier = null;
    this.systemIdentifier = null;
    // [NOCPP[
    this.attributes = null;
    // ]NOCPP]
    // CPPONLY: this.attributes = tokenHandler.HasBuilder() ? new HtmlAttributes(mappingLangToXmlLang) : null;
    // CPPONLY: this.newAttributesEachTime = !tokenHandler.HasBuilder();
    this.shouldSuspend = false;
    this.confident = false;
    this.line = 0;
    // CPPONLY: this.attributeLine = 0;
    this.interner = null;
    // CPPONLY: this.viewingXmlSource = viewingXmlSource;
}
627 
/**
 * Sets the interner that this tokenizer passes along when it builds
 * element names and doctype names from its buffers.
 *
 * @param interner
 *            the interner to use
 */
public void setInterner(Interner interner) {
    this.interner = interner;
}
631 
initLocation(String newPublicId, String newSystemId)632     public void initLocation(String newPublicId, String newSystemId) {
633         this.systemId = newSystemId;
634         this.publicId = newPublicId;
635 
636     }
637 
638     // CPPONLY: boolean isViewingXmlSource() {
639     // CPPONLY: return viewingXmlSource;
640     // CPPONLY: }
641 
642     // [NOCPP[
643 
644     /**
645      * Returns the mappingLangToXmlLang.
646      *
647      * @return the mappingLangToXmlLang
648      */
isMappingLangToXmlLang()649     public boolean isMappingLangToXmlLang() {
650         return mappingLangToXmlLang == AttributeName.HTML_LANG;
651     }
652 
653     /**
654      * Sets the mappingLangToXmlLang.
655      *
656      * @param mappingLangToXmlLang
657      *            the mappingLangToXmlLang to set
658      */
setMappingLangToXmlLang(boolean mappingLangToXmlLang)659     public void setMappingLangToXmlLang(boolean mappingLangToXmlLang) {
660         this.mappingLangToXmlLang = mappingLangToXmlLang ? AttributeName.HTML_LANG
661                 : AttributeName.HTML;
662     }
663 
664     /**
665      * Sets the error handler.
666      *
667      * @see org.xml.sax.XMLReader#setErrorHandler(org.xml.sax.ErrorHandler)
668      */
setErrorHandler(ErrorHandler eh)669     public void setErrorHandler(ErrorHandler eh) {
670         this.errorHandler = eh;
671     }
672 
getErrorHandler()673     public ErrorHandler getErrorHandler() {
674         return this.errorHandler;
675     }
676 
677     /**
678      * Sets the commentPolicy.
679      *
680      * @param commentPolicy
681      *            the commentPolicy to set
682      */
public void setCommentPolicy(XmlViolationPolicy commentPolicy) {
    // Stored as given; this setter performs no validation of the value.
    this.commentPolicy = commentPolicy;
}
686 
687     /**
688      * Sets the contentNonXmlCharPolicy.
689      *
690      * @param contentNonXmlCharPolicy
691      *            the contentNonXmlCharPolicy to set
692      */
setContentNonXmlCharPolicy( XmlViolationPolicy contentNonXmlCharPolicy)693     public void setContentNonXmlCharPolicy(
694             XmlViolationPolicy contentNonXmlCharPolicy) {
695         if (contentNonXmlCharPolicy != XmlViolationPolicy.ALLOW) {
696             throw new IllegalArgumentException(
697                     "Must use ErrorReportingTokenizer to set contentNonXmlCharPolicy to non-ALLOW.");
698         }
699     }
700 
701     /**
702      * Sets the contentSpacePolicy.
703      *
704      * @param contentSpacePolicy
705      *            the contentSpacePolicy to set
706      */
public void setContentSpacePolicy(XmlViolationPolicy contentSpacePolicy) {
    // Stored as given; this setter performs no validation of the value.
    this.contentSpacePolicy = contentSpacePolicy;
}
710 
711     /**
712      * Sets the xmlnsPolicy.
713      *
714      * @param xmlnsPolicy
715      *            the xmlnsPolicy to set
716      */
setXmlnsPolicy(XmlViolationPolicy xmlnsPolicy)717     public void setXmlnsPolicy(XmlViolationPolicy xmlnsPolicy) {
718         if (xmlnsPolicy == XmlViolationPolicy.FATAL) {
719             throw new IllegalArgumentException("Can't use FATAL here.");
720         }
721         this.xmlnsPolicy = xmlnsPolicy;
722     }
723 
/**
 * Sets the namePolicy.
 *
 * @param namePolicy
 *            the namePolicy to set
 */
public void setNamePolicy(XmlViolationPolicy namePolicy) {
    this.namePolicy = namePolicy;
}
727 
728     // ]NOCPP]
729 
730     // For the token handler to call
731 
732     /**
733      * Sets the tokenizer state and the associated element name. This should
734      * only ever be used to put the tokenizer into one of the states that have
735      * a special end tag expectation.
736      *
737      * @param specialTokenizerState
738      *            the tokenizer state to set
739      */
setState(int specialTokenizerState)740     public void setState(int specialTokenizerState) {
741         this.stateSave = specialTokenizerState;
742         this.endTagExpectation = null;
743         this.endTagExpectationAsArray = null;
744     }
745 
746     // [NOCPP[
747 
748     /**
749      * Sets the tokenizer state and the associated element name. This should
750      * only ever be used to put the tokenizer into one of the states that have
751      * a special end tag expectation. For use from the tokenizer test harness.
752      *
753      * @param specialTokenizerState
754      *            the tokenizer state to set
755      * @param endTagExpectation
756      *            the expected end tag for transitioning back to normal
757      */
setStateAndEndTagExpectation(int specialTokenizerState, @Local String endTagExpectation)758     public void setStateAndEndTagExpectation(int specialTokenizerState,
759             @Local String endTagExpectation) {
760         this.stateSave = specialTokenizerState;
761         if (specialTokenizerState == Tokenizer.DATA) {
762             return;
763         }
764         @Auto char[] asArray = Portability.newCharArrayFromLocal(endTagExpectation);
765         this.endTagExpectation = ElementName.elementNameByBuffer(asArray,
766                 asArray.length, interner);
767         assert this.endTagExpectation != null;
768         endTagExpectationToArray();
769     }
770 
771     // ]NOCPP]
772 
773     /**
774      * Sets the tokenizer state and the associated element name. This should
775      * only ever be used to put the tokenizer into one of the states that have
776      * a special end tag expectation.
777      *
778      * @param specialTokenizerState
779      *            the tokenizer state to set
780      * @param endTagExpectation
781      *            the expected end tag for transitioning back to normal
782      */
setStateAndEndTagExpectation(int specialTokenizerState, ElementName endTagExpectation)783     public void setStateAndEndTagExpectation(int specialTokenizerState,
784             ElementName endTagExpectation) {
785         this.stateSave = specialTokenizerState;
786         this.endTagExpectation = endTagExpectation;
787         endTagExpectationToArray();
788     }
789 
endTagExpectationToArray()790     private void endTagExpectationToArray() {
791         switch (endTagExpectation.getGroup()) {
792             case TreeBuilder.TITLE:
793                 endTagExpectationAsArray = TITLE_ARR;
794                 return;
795             case TreeBuilder.SCRIPT:
796                 endTagExpectationAsArray = SCRIPT_ARR;
797                 return;
798             case TreeBuilder.STYLE:
799                 endTagExpectationAsArray = STYLE_ARR;
800                 return;
801             case TreeBuilder.PLAINTEXT:
802                 endTagExpectationAsArray = PLAINTEXT_ARR;
803                 return;
804             case TreeBuilder.XMP:
805                 endTagExpectationAsArray = XMP_ARR;
806                 return;
807             case TreeBuilder.TEXTAREA:
808                 endTagExpectationAsArray = TEXTAREA_ARR;
809                 return;
810             case TreeBuilder.IFRAME:
811                 endTagExpectationAsArray = IFRAME_ARR;
812                 return;
813             case TreeBuilder.NOEMBED:
814                 endTagExpectationAsArray = NOEMBED_ARR;
815                 return;
816             case TreeBuilder.NOSCRIPT:
817                 endTagExpectationAsArray = NOSCRIPT_ARR;
818                 return;
819             case TreeBuilder.NOFRAMES:
820                 endTagExpectationAsArray = NOFRAMES_ARR;
821                 return;
822             default:
823                 assert false: "Bad end tag expectation.";
824                 return;
825         }
826     }
827 
828     /**
829      * For C++ use only.
830      */
public void setLineNumber(int line) {
    // CPPONLY: this.attributeLine = line; // XXX is this needed?
    // Overwrites the line counter reported by getLineNumber().
    this.line = line;
}
835 
836     // start Locator impl
837 
838     /**
839      * @see org.xml.sax.Locator#getLineNumber()
840      */
getLineNumber()841     @Inline public int getLineNumber() {
842         return line;
843     }
844 
845     // [NOCPP[
846 
847     /**
848      * @see org.xml.sax.Locator#getColumnNumber()
849      */
@Inline public int getColumnNumber() {
    // This implementation always reports -1, the value the SAX Locator
    // contract uses for "no column number available".
    return -1;
}
853 
854     /**
855      * @see org.xml.sax.Locator#getPublicId()
856      */
getPublicId()857     public String getPublicId() {
858         return publicId;
859     }
860 
861     /**
862      * @see org.xml.sax.Locator#getSystemId()
863      */
getSystemId()864     public String getSystemId() {
865         return systemId;
866     }
867 
868     /**
869      * @see org.xml.sax.ext.Locator2#getXMLVersion()
870      */
public String getXMLVersion() {
    // Always reports XML version 1.0, independent of the input.
    return "1.0";
}
874 
875     /**
876      * @see org.xml.sax.ext.Locator2#getEncoding()
877      */
getEncoding()878     public String getEncoding() {
879         try {
880             return encodingDeclarationHandler == null ? null : encodingDeclarationHandler.getCharacterEncoding();
881         } catch (SAXException e) {
882             return null;
883         }
884     }
885 
886     // end Locator impl
887 
888     // end public API
889 
notifyAboutMetaBoundary()890     public void notifyAboutMetaBoundary() {
891         metaBoundaryPassed = true;
892     }
893 
894     // ]NOCPP]
895 
emptyAttributes()896     HtmlAttributes emptyAttributes() {
897         // [NOCPP[
898         if (newAttributesEachTime) {
899             return new HtmlAttributes(mappingLangToXmlLang);
900         } else {
901             // ]NOCPP]
902             return HtmlAttributes.EMPTY_ATTRIBUTES;
903             // [NOCPP[
904         }
905         // ]NOCPP]
906     }
907 
/**
 * Appends one UTF-16 code unit to <code>charRefBuf</code> and advances
 * <code>charRefBufLen</code>.
 *
 * @param c
 *            the UTF-16 code unit to append
 */
@Inline private void appendCharRefBuf(char c) {
    // CPPONLY: assert charRefBufLen < charRefBuf.length:
    // CPPONLY:     "RELEASE: Attempted to overrun charRefBuf!";
    // No explicit capacity check in Java; the array access itself is
    // bounds-checked.
    charRefBuf[charRefBufLen++] = c;
}
913 
emitOrAppendCharRefBuf(int returnState)914     private void emitOrAppendCharRefBuf(int returnState) throws SAXException {
915         if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
916             appendCharRefBufToStrBuf();
917         } else {
918             if (charRefBufLen > 0) {
919                 tokenHandler.characters(charRefBuf, 0, charRefBufLen);
920                 charRefBufLen = 0;
921             }
922         }
923     }
924 
clearStrBufAfterUse()925     @Inline private void clearStrBufAfterUse() {
926         strBufLen = 0;
927     }
928 
clearStrBufBeforeUse()929     @Inline private void clearStrBufBeforeUse() {
930         assert strBufLen == 0: "strBufLen not reset after previous use!";
931         strBufLen = 0; // no-op in the absence of bugs
932     }
933 
clearStrBufAfterOneHyphen()934     @Inline private void clearStrBufAfterOneHyphen() {
935         assert strBufLen == 1: "strBufLen length not one!";
936         assert strBuf[0] == '-': "strBuf does not start with a hyphen!";
937         strBufLen = 0;
938     }
939 
940     /**
941      * Appends to the buffer.
942      *
943      * @param c
944      *            the UTF-16 code unit to append
945      */
appendStrBuf(char c)946     @Inline private void appendStrBuf(char c) {
947         // CPPONLY: assert strBufLen < strBuf.length: "Previous buffer length insufficient.";
948         // CPPONLY: if (strBufLen == strBuf.length) {
949         // CPPONLY:     if (!EnsureBufferSpace(1)) {
950         // CPPONLY:         assert false: "RELEASE: Unable to recover from buffer reallocation failure";
951         // CPPONLY:     } // TODO: Add telemetry when outer if fires but inner does not
952         // CPPONLY: }
953         strBuf[strBufLen++] = c;
954     }
955 
    /**
     * Converts the first <code>strBufLen</code> code units of
     * <code>strBuf</code> into a string and then empties the buffer. Used
     * by <code>addAttributeWithValue()</code> to obtain the attribute value.
     *
     * <p>
     * C++ memory note: The return value must be released.
     *
     * @return the buffer as a string
     */
    protected String strBufToString() {
        String str = Portability.newStringFromBuffer(strBuf, 0, strBufLen
            // CPPONLY: , tokenHandler, !newAttributesEachTime && attributeName == AttributeName.CLASS
        );
        // The buffer is consumed by this call; callers must not reuse its contents.
        clearStrBufAfterUse();
        return str;
    }
971 
972     /**
973      * Returns the buffer as a local name. The return value is released in
974      * emitDoctypeToken().
975      *
976      * @return the buffer as local name
977      */
strBufToDoctypeName()978     private void strBufToDoctypeName() {
979         doctypeName = Portability.newLocalNameFromBuffer(strBuf, strBufLen, interner);
980         clearStrBufAfterUse();
981     }
982 
983     /**
984      * Emits the buffer as character tokens.
985      *
986      * @throws SAXException
987      *             if the token handler threw
988      */
emitStrBuf()989     private void emitStrBuf() throws SAXException {
990         if (strBufLen > 0) {
991             tokenHandler.characters(strBuf, 0, strBufLen);
992             clearStrBufAfterUse();
993         }
994     }
995 
    /**
     * Appends a hyphen to <code>strBuf</code>, subject to
     * <code>commentPolicy</code>.
     *
     * <p>
     * Only the <code>appendStrBuf('-')</code> call lies outside the NOCPP
     * markers. Inside them, <code>ALTER_INFOSET</code> first appends a space
     * and then continues as <code>ALLOW</code>, which warns and appends the
     * hyphen; <code>FATAL</code> reports a fatal error and appends nothing.
     *
     * @throws SAXException
     *             if reporting the warning or the fatal error threw
     */
    @Inline private void appendSecondHyphenToBogusComment() throws SAXException {
        // [NOCPP[
        switch (commentPolicy) {
            case ALTER_INFOSET:
                // Separate the two hyphens with a space, then fall through.
                appendStrBuf(' ');
                // CPPONLY: MOZ_FALLTHROUGH;
            case ALLOW:
                warn("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment.");
                // ]NOCPP]
                appendStrBuf('-');
                // [NOCPP[
                break;
            case FATAL:
                fatal("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment.");
                break;
        }
        // ]NOCPP]
    }
1014 
1015     // [NOCPP[
maybeAppendSpaceToBogusComment()1016     private void maybeAppendSpaceToBogusComment() throws SAXException {
1017         switch (commentPolicy) {
1018             case ALTER_INFOSET:
1019                 appendStrBuf(' ');
1020                 // CPPONLY: MOZ_FALLTHROUGH;
1021             case ALLOW:
1022                 warn("The document is not mappable to XML 1.0 due to a trailing hyphen in a comment.");
1023                 break;
1024             case FATAL:
1025                 fatal("The document is not mappable to XML 1.0 due to a trailing hyphen in a comment.");
1026                 break;
1027         }
1028     }
1029 
1030     // ]NOCPP]
1031 
    /**
     * Reports consecutive hyphens via <code>errConsecutiveHyphens()</code>
     * and appends <code>c</code> to <code>strBuf</code>, subject to
     * <code>commentPolicy</code>.
     *
     * <p>
     * Outside the NOCPP markers only the error report and
     * <code>appendStrBuf(c)</code> remain. Inside them,
     * <code>ALTER_INFOSET</code> drops the last buffered code unit and
     * appends a space followed by a hyphen before continuing as
     * <code>ALLOW</code>, which warns and appends <code>c</code>;
     * <code>FATAL</code> reports a fatal error and appends nothing.
     *
     * @param c
     *            the code unit to append
     * @throws SAXException
     *             if reporting the error, warning or fatal error threw
     */
    @Inline private void adjustDoubleHyphenAndAppendToStrBufAndErr(char c)
            throws SAXException {
        errConsecutiveHyphens();
        // [NOCPP[
        switch (commentPolicy) {
            case ALTER_INFOSET:
                // Remove the last buffered code unit so it can be re-added
                // after a separating space.
                strBufLen--;
                // WARNING!!! This expands the worst case of the buffer length
                // given the length of input!
                appendStrBuf(' ');
                appendStrBuf('-');
                // CPPONLY: MOZ_FALLTHROUGH;
            case ALLOW:
                warn("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment.");
                // ]NOCPP]
                appendStrBuf(c);
                // [NOCPP[
                break;
            case FATAL:
                fatal("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment.");
                break;
        }
        // ]NOCPP]
    }
1056 
    /**
     * Copies <code>length</code> code units from <code>buffer</code>,
     * starting at <code>offset</code>, to the end of <code>strBuf</code> and
     * advances <code>strBufLen</code> accordingly. The Java code path does
     * not grow <code>strBuf</code> here.
     *
     * @param buffer
     *            the source of the code units
     * @param offset
     *            index of the first code unit to copy
     * @param length
     *            number of code units to copy
     * @throws SAXException
     *             presumably raised by <code>Portability.checkedAdd</code>
     *             when the new length overflows -- TODO confirm
     */
    private void appendStrBuf(@NoLength char[] buffer, int offset, int length) throws SAXException {
        // Compute the new length up front so the addition is checked
        // before anything is copied.
        int newLen = Portability.checkedAdd(strBufLen, length);
        // CPPONLY: assert newLen <= strBuf.length: "Previous buffer length insufficient.";
        // CPPONLY: if (strBuf.length < newLen) {
        // CPPONLY:     if (!EnsureBufferSpace(length)) {
        // CPPONLY:         assert false: "RELEASE: Unable to recover from buffer reallocation failure";
        // CPPONLY:     } // TODO: Add telemetry when outer if fires but inner does not
        // CPPONLY: }
        System.arraycopy(buffer, offset, strBuf, strBufLen, length);
        strBufLen = newLen;
    }
1068 
1069     /**
1070      * Append the contents of the char reference buffer to the main one.
1071      */
appendCharRefBufToStrBuf()1072     @Inline private void appendCharRefBufToStrBuf() throws SAXException {
1073         appendStrBuf(charRefBuf, 0, charRefBufLen);
1074         charRefBufLen = 0;
1075     }
1076 
    /**
     * Emits the current comment token and resets the buffer.
     *
     * <p>
     * The comment passed to the token handler consists of the first
     * <code>strBufLen - provisionalHyphens</code> code units of
     * <code>strBuf</code>. In the code fenced by the NOCPP markers the
     * handler is only called when <code>wantsComments</code> is set. The
     * buffer is emptied and <code>cstart</code> is set to
     * <code>pos + 1</code> in every case.
     *
     * @param provisionalHyphens
     *            number of code units at the end of the buffer to leave out
     *            of the emitted comment
     * @param pos
     *            the current buffer index; <code>cstart</code> becomes
     *            <code>pos + 1</code>
     *
     * @throws SAXException
     *             if the token handler threw
     */
    private void emitComment(int provisionalHyphens, int pos)
            throws SAXException {
        // [NOCPP[
        if (wantsComments) {
            // ]NOCPP]
            tokenHandler.comment(strBuf, 0, strBufLen
                    - provisionalHyphens);
            // [NOCPP[
        }
        // ]NOCPP]
        clearStrBufAfterUse();
        cstart = pos + 1;
    }
1098 
1099     /**
1100      * Flushes coalesced character tokens.
1101      *
1102      * @param buf
1103      *            TODO
1104      * @param pos
1105      *            TODO
1106      *
1107      * @throws SAXException
1108      */
flushChars(@oLength char[] buf, int pos)1109     protected void flushChars(@NoLength char[] buf, int pos)
1110             throws SAXException {
1111         if (pos > cstart) {
1112             tokenHandler.characters(buf, cstart, pos - cstart);
1113         }
1114         cstart = Integer.MAX_VALUE;
1115     }
1116 
1117     /**
1118      * Reports an condition that would make the infoset incompatible with XML
1119      * 1.0 as fatal.
1120      *
1121      * @param message
1122      *            the message
1123      * @throws SAXException
1124      * @throws SAXParseException
1125      */
fatal(String message)1126     public void fatal(String message) throws SAXException {
1127         SAXParseException spe = new SAXParseException(message, this);
1128         if (errorHandler != null) {
1129             errorHandler.fatalError(spe);
1130         }
1131         throw spe;
1132     }
1133 
1134     /**
1135      * Reports a Parse Error.
1136      *
1137      * @param message
1138      *            the message
1139      * @throws SAXException
1140      */
err(String message)1141     public void err(String message) throws SAXException {
1142         if (errorHandler == null) {
1143             return;
1144         }
1145         SAXParseException spe = new SAXParseException(message, this);
1146         errorHandler.error(spe);
1147     }
1148 
errTreeBuilder(String message)1149     public void errTreeBuilder(String message) throws SAXException {
1150         ErrorHandler eh = null;
1151         if (tokenHandler instanceof TreeBuilder<?>) {
1152             TreeBuilder<?> treeBuilder = (TreeBuilder<?>) tokenHandler;
1153             eh = treeBuilder.getErrorHandler();
1154         }
1155         if (eh == null) {
1156             eh = errorHandler;
1157         }
1158         if (eh == null) {
1159             return;
1160         }
1161         SAXParseException spe = new SAXParseException(message, this);
1162         eh.error(spe);
1163     }
1164 
1165     /**
1166      * Reports a warning
1167      *
1168      * @param message
1169      *            the message
1170      * @throws SAXException
1171      */
warn(String message)1172     public void warn(String message) throws SAXException {
1173         if (errorHandler == null) {
1174             return;
1175         }
1176         SAXParseException spe = new SAXParseException(message, this);
1177         errorHandler.warning(spe);
1178     }
1179 
    /**
     * Resolves the buffer contents to an element name, stores it in
     * <code>tagName</code>, resets <code>containsHyphen</code> and empties
     * the buffer.
     *
     * <p>
     * A name flagged by <code>containsHyphen</code> is compared against the
     * name of <code>ElementName.ANNOTATION_XML</code> only. Any other name
     * is looked up with <code>ElementName.elementNameByBuffer</code>. When
     * no predefined element matches, <code>nonInternedTagName</code> is
     * given the name and used instead.
     */
    private void strBufToElementNameString() {
        if (containsHyphen) {
            // We've got a custom element or annotation-xml.
            @Local String annotationName = ElementName.ANNOTATION_XML.getName();
            if (Portability.localEqualsBuffer(annotationName, strBuf, strBufLen)) {
                tagName = ElementName.ANNOTATION_XML;
            } else {
                nonInternedTagName.setNameForNonInterned(Portability.newLocalNameFromBuffer(strBuf, strBufLen,
                        interner)
                        // CPPONLY: , true
                        );
                tagName = nonInternedTagName;
            }
        } else {
            tagName = ElementName.elementNameByBuffer(strBuf, strBufLen, interner);
            if (tagName == null) {
                // Not a predefined element: fall back to the reusable
                // non-interned element name object.
                nonInternedTagName.setNameForNonInterned(Portability.newLocalNameFromBuffer(strBuf, strBufLen,
                    interner)
                        // CPPONLY: , false
                        );
                tagName = nonInternedTagName;
            }
        }
        containsHyphen = false;
        clearStrBufAfterUse();
    }
1206 
    /**
     * Emits the current start or end tag token to the token handler and
     * resets the per-tag state.
     *
     * <p>
     * Sets <code>cstart</code> to <code>pos + 1</code> and
     * <code>stateSave</code> to <code>Tokenizer.DATA</code> before calling
     * the token handler. Afterwards <code>tagName</code> is cleared, and
     * <code>attributes</code> is either dropped (when
     * <code>newAttributesEachTime</code>) or cleared for reuse.
     *
     * @param selfClosing
     *            whether the tag had the self-closing flag; passed to
     *            <code>maybeErrSlashInEndTag</code> and to
     *            <code>startTag</code>
     * @param pos
     *            the current buffer index
     * @return the value of <code>stateSave</code> after the token handler
     *         has run
     * @throws SAXException
     *             if error reporting or the token handler threw
     */
    private int emitCurrentTagToken(boolean selfClosing, int pos)
            throws SAXException {
        cstart = pos + 1;
        maybeErrSlashInEndTag(selfClosing);
        stateSave = Tokenizer.DATA;
        // Tags without attributes are reported with the shared empty holder.
        HtmlAttributes attrs = (attributes == null ? HtmlAttributes.EMPTY_ATTRIBUTES
                : attributes);
        if (endTag) {
            /*
             * When an end tag token is emitted, the content model flag must be
             * switched to the PCDATA state.
             */
            maybeErrAttributesOnEndTag(attrs);
            // CPPONLY: if (!viewingXmlSource) {
            tokenHandler.endTag(tagName);
            // CPPONLY: }
            // CPPONLY: if (newAttributesEachTime) {
            // CPPONLY:   Portability.delete(attributes);
            // CPPONLY:   attributes = null;
            // CPPONLY: }
        } else {
            // CPPONLY: if (viewingXmlSource) {
            // CPPONLY:   assert newAttributesEachTime;
            // CPPONLY:   Portability.delete(attributes);
            // CPPONLY:   attributes = null;
            // CPPONLY: } else {
            tokenHandler.startTag(tagName, attrs, selfClosing);
            // CPPONLY: }
        }
        tagName = null;
        if (newAttributesEachTime) {
            attributes = null;
        } else {
            // NOTE(review): this dereferences attributes without the null
            // check used for attrs above; presumably attributes is always
            // non-null when newAttributesEachTime is false -- confirm
            // against the initialization code.
            attributes.clear(mappingLangToXmlLang);
        }
        /*
         * The token handler may have called setStateAndEndTagExpectation
         * and changed stateSave since the start of this method.
         */
        return stateSave;
    }
1248 
    /**
     * Finishes the attribute name held in the buffer.
     *
     * <p>
     * The buffer is resolved to an <code>AttributeName</code> (a predefined
     * one when <code>AttributeName.nameByBuffer</code> finds a match,
     * otherwise a newly created or non-interned one) and then emptied. The
     * attribute holder is allocated on demand. If the holder already
     * contains the name, a duplicate-attribute error is reported and
     * <code>attributeName</code> is set to <code>null</code>, which the
     * attribute-adding methods treat as "drop this attribute".
     *
     * @throws SAXException
     *             if reporting the duplicate attribute threw
     */
    private void attributeNameComplete() throws SAXException {
        attributeName = AttributeName.nameByBuffer(strBuf, strBufLen, interner);
        if (attributeName == null) {
            // Not a predefined attribute name.
            // [NOCPP[
            attributeName = AttributeName.createAttributeName(
                    Portability.newLocalNameFromBuffer(strBuf, strBufLen,
                            interner),
                    namePolicy != XmlViolationPolicy.ALLOW);
            // ]NOCPP]
            // CPPONLY:     nonInternedAttributeName.setNameForNonInterned(Portability.newLocalNameFromBuffer(strBuf, strBufLen, interner));
            // CPPONLY:     attributeName = nonInternedAttributeName;
        }
        clearStrBufAfterUse();

        if (attributes == null) {
            attributes = new HtmlAttributes(mappingLangToXmlLang);
        }

        /*
         * When the user agent leaves the attribute name state (and before
         * emitting the tag token, if appropriate), the complete attribute's
         * name must be compared to the other attributes on the same token; if
         * there is already an attribute on the token with the exact same name,
         * then this is a parse error and the new attribute must be dropped,
         * along with the value that gets associated with it (if any).
         */
        if (attributes.contains(attributeName)) {
            errDuplicateAttribute();
            attributeName = null;
        }
    }
1280 
    /**
     * Adds the current attribute to the attribute holder with an empty
     * string as its value.
     *
     * <p>
     * <code>noteAttributeWithoutValue()</code> is called first. In the code
     * fenced by the NOCPP markers, a <code>charset</code> attribute on
     * <code>meta</code> seen after the meta boundary is reported as an
     * error, and a valueless <code>src</code> or <code>href</code> triggers
     * a warning. When <code>attributeName</code> is <code>null</code>
     * nothing is added and the buffer is emptied instead.
     *
     * @throws SAXException
     *             if error reporting threw
     */
    private void addAttributeWithoutValue() throws SAXException {
        noteAttributeWithoutValue();

        // [NOCPP[
        if (metaBoundaryPassed && AttributeName.CHARSET == attributeName
                && ElementName.META == tagName) {
            err("A \u201Ccharset\u201D attribute on a \u201Cmeta\u201D element found after the first 1024 bytes.");
        }
        // ]NOCPP]
        if (attributeName != null) {
            // [NOCPP[
            if (AttributeName.SRC == attributeName
                    || AttributeName.HREF == attributeName) {
                warn("Attribute \u201C"
                        + attributeName.getLocal(AttributeName.HTML)
                        + "\u201D without an explicit value seen. The attribute may be dropped by IE7.");
            }
            // ]NOCPP]
            attributes.addAttribute(attributeName,
                    Portability.newEmptyString()
                    // [NOCPP[
                    , xmlnsPolicy
            // ]NOCPP]
            // CPPONLY: , attributeLine
            );
            // The name has been handed over; forget it until the next attribute.
            attributeName = null;
        } else {
            // attributeName is null here; attributeNameComplete() sets it to
            // null for a duplicate attribute.
            clearStrBufAfterUse();
        }
    }
1311 
    /**
     * Adds the current attribute to the attribute holder, using the buffer
     * contents as its value.
     *
     * <p>
     * In the code fenced by the NOCPP markers, a <code>charset</code>
     * attribute on <code>meta</code> seen after the meta boundary is
     * reported as an error. When <code>attributeName</code> is
     * <code>null</code> the buffered value is discarded. The buffer is empty
     * on return in both cases.
     *
     * @throws SAXException
     *             if error reporting threw
     */
    private void addAttributeWithValue() throws SAXException {
        // [NOCPP[
        if (metaBoundaryPassed && ElementName.META == tagName
                && AttributeName.CHARSET == attributeName) {
            err("A \u201Ccharset\u201D attribute on a \u201Cmeta\u201D element found after the first 1024 bytes.");
        }
        // ]NOCPP]
        if (attributeName != null) {
            // strBufToString() also empties the buffer.
            String val = strBufToString(); // Ownership transferred to
            // HtmlAttributes
            // CPPONLY: if (mViewSource) {
            // CPPONLY:   mViewSource.MaybeLinkifyAttributeValue(attributeName, val);
            // CPPONLY: }
            attributes.addAttribute(attributeName, val
            // [NOCPP[
                    , xmlnsPolicy
            // ]NOCPP]
            // CPPONLY: , attributeLine
            );
            attributeName = null;
        } else {
            // We have a duplicate attribute. Explicitly discard its value.
            clearStrBufAfterUse();
        }
    }
1337 
1338     // [NOCPP[
1339 
    /**
     * Hook called from <code>start()</code> after the token handler has
     * been told that tokenization starts. This implementation does nothing;
     * being <code>protected</code>, it can be overridden by a subclass.
     *
     * @throws SAXException
     *             never thrown by this implementation
     */
    protected void startErrorReporting() throws SAXException {

    }
1343 
1344     // ]NOCPP]
1345 
    /**
     * Starts tokenization: initializes the tokenizer, notifies the token
     * handler through <code>startTokenization(this)</code> and, in the code
     * fenced by the NOCPP markers, calls <code>startErrorReporting()</code>.
     *
     * @throws SAXException
     *             if the token handler or error reporting threw
     */
    public void start() throws SAXException {
        initializeWithoutStarting();
        tokenHandler.startTokenization(this);
        // [NOCPP[
        startErrorReporting();
        // ]NOCPP]
    }
1353 
    /**
     * Tokenizes the unread part of <code>buffer</code>, resuming from the
     * saved state, and advances the buffer's start index past what was
     * consumed.
     *
     * @param buffer
     *            the buffer to read from; its start index is updated
     * @return the value of <code>lastCR</code> after the state loop; per the
     *         CR handling notes in <code>stateLoop</code>, it is set when
     *         the loop returned right after handling a CR
     * @throws SAXException
     *             if the token handler or error reporting threw
     */
    public boolean tokenizeBuffer(UTF16Buffer buffer) throws SAXException {
        // Resume from the state saved at the end of the previous call.
        int state = stateSave;
        int returnState = returnStateSave;
        char c = '\u0000';
        shouldSuspend = false;
        lastCR = false;

        int start = buffer.getStart();
        int end = buffer.getEnd();

        // In C++, the caller of tokenizeBuffer needs to do this explicitly.
        // [NOCPP[
        ensureBufferSpace(end - start);
        // ]NOCPP]

        /**
         * The index of the last <code>char</code> read from <code>buf</code>.
         */
        int pos = start - 1;

        /**
         * The field <code>cstart</code> is the index of the first
         * <code>char</code> in <code>buf</code> that is part of a coalesced
         * run of character tokens or <code>Integer.MAX_VALUE</code> if there
         * is not a current run being coalesced. For the states listed below
         * a run begins at the start of this buffer.
         */
        switch (state) {
            case DATA:
            case RCDATA:
            case SCRIPT_DATA:
            case PLAINTEXT:
            case RAWTEXT:
            case CDATA_SECTION:
            case SCRIPT_DATA_ESCAPED:
            case SCRIPT_DATA_ESCAPE_START:
            case SCRIPT_DATA_ESCAPE_START_DASH:
            case SCRIPT_DATA_ESCAPED_DASH:
            case SCRIPT_DATA_ESCAPED_DASH_DASH:
            case SCRIPT_DATA_DOUBLE_ESCAPE_START:
            case SCRIPT_DATA_DOUBLE_ESCAPED:
            case SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN:
            case SCRIPT_DATA_DOUBLE_ESCAPED_DASH:
            case SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH:
            case SCRIPT_DATA_DOUBLE_ESCAPE_END:
                cstart = start;
                break;
            default:
                cstart = Integer.MAX_VALUE;
                break;
        }

        /**
         * The number of <code>char</code>s in <code>buf</code> that have
         * meaning. (The rest of the array is garbage and should not be
         * examined.)
         *
         * NOTE(review): this comment no longer sits on a declaration; it
         * appears to describe the end index passed to stateLoop below.
         */
        // CPPONLY: if (mViewSource) {
        // CPPONLY:   mViewSource.SetBuffer(buffer);
        // CPPONLY:   pos = stateLoop(state, c, pos, buffer.getBuffer(), false, returnState, buffer.getEnd());
        // CPPONLY:   mViewSource.DropBuffer((pos == buffer.getEnd()) ? pos : pos + 1);
        // CPPONLY: } else {
        // CPPONLY:   pos = stateLoop(state, c, pos, buffer.getBuffer(), false, returnState, buffer.getEnd());
        // CPPONLY: }
        // [NOCPP[
        pos = stateLoop(state, c, pos, buffer.getBuffer(), false, returnState,
                end);
        // ]NOCPP]
        if (pos == end) {
            // exiting due to end of buffer
            buffer.setStart(pos);
        } else {
            // The loop stopped early; pos is the index of the last char
            // read, so the next read starts right after it.
            buffer.setStart(pos + 1);
        }
        return lastCR;
    }
1429 
1430     // [NOCPP[
ensureBufferSpace(int inputLength)1431     private void ensureBufferSpace(int inputLength) throws SAXException {
1432         // Add 2 to account for emissions of LT_GT, LT_SOLIDUS and RSQB_RSQB.
1433         // Adding to the general worst case instead of only the
1434         // TreeBuilder-exposed worst case to avoid re-introducing a bug when
1435         // unifying the tokenizer and tree builder buffers in the future.
1436         int worstCase = strBufLen + inputLength + charRefBufLen + 2;
1437         tokenHandler.ensureBufferSpace(worstCase);
1438         if (commentPolicy == XmlViolationPolicy.ALTER_INFOSET) {
1439             // When altering infoset, if the comment contents are consecutive
1440             // hyphens, each hyphen generates a space, too. These buffer
1441             // contents never get emitted as characters() to the tokenHandler,
1442             // which is why this calculation happens after the call to
1443             // ensureBufferSpace on tokenHandler.
1444             worstCase *= 2;
1445         }
1446         if (strBuf == null) {
1447             // Add an arbitrary small value to avoid immediate reallocation
1448             // once there are a few characters in the buffer.
1449             strBuf = new char[worstCase + 128];
1450         } else if (worstCase > strBuf.length) {
1451             // HotSpot reportedly allocates memory with 8-byte accuracy, so
1452             // there's no point in trying to do math here to avoid slop.
1453             // Maybe we should add some small constant to worstCase here
1454             // but not doing that without profiling. In C++ with jemalloc,
1455             // the corresponding method should do math to round up here
1456             // to avoid slop.
1457             char[] newBuf = new char[worstCase];
1458             System.arraycopy(strBuf, 0, newBuf, 0, strBufLen);
1459             strBuf = newBuf;
1460         }
1461     }
1462     // ]NOCPP]
1463 
stateLoop(int state, char c, int pos, @NoLength char[] buf, boolean reconsume, int returnState, int endPos)1464     @SuppressWarnings("unused") private int stateLoop(int state, char c,
1465             int pos, @NoLength char[] buf, boolean reconsume, int returnState,
1466             int endPos) throws SAXException {
1467         /*
1468          * Idioms used in this code:
1469          *
1470          *
1471          * Consuming the next input character
1472          *
1473          * To consume the next input character, the code does this: if (++pos ==
1474          * endPos) { break stateloop; } c = checkChar(buf, pos);
1475          *
1476          *
1477          * Staying in a state
1478          *
1479          * When there's a state that the tokenizer may stay in over multiple
1480          * input characters, the state has a wrapper |for(;;)| loop and staying
1481          * in the state continues the loop.
1482          *
1483          *
1484          * Switching to another state
1485          *
1486          * To switch to another state, the code sets the state variable to the
1487          * magic number of the new state. Then it either continues stateloop or
1488          * breaks out of the state's own wrapper loop if the target state is
1489          * right after the current state in source order. (This is a partial
1490          * workaround for Java's lack of goto.)
1491          *
1492          *
1493          * Reconsume support
1494          *
1495          * The spec sometimes says that an input character is reconsumed in
1496          * another state. If a state can ever be entered so that an input
1497          * character can be reconsumed in it, the state's code starts with an
1498          * |if (reconsume)| that sets reconsume to false and skips over the
1499          * normal code for consuming a new character.
1500          *
1501          * To reconsume the current character in another state, the code sets
1502          * |reconsume| to true and then switches to the other state.
1503          *
1504          *
1505          * Emitting character tokens
1506          *
1507          * This method emits character tokens lazily. Whenever a new range of
1508          * character tokens starts, the field cstart must be set to the start
1509          * index of the range. The flushChars() method must be called at the end
1510          * of a range to flush it.
1511          *
1512          *
1513          * U+0000 handling
1514          *
1515          * The various states have to handle the replacement of U+0000 with
1516          * U+FFFD. However, if U+0000 would be reconsumed in another state, the
1517          * replacement doesn't need to happen, because it's handled by the
1518          * reconsuming state.
1519          *
1520          *
1521          * LF handling
1522          *
1523          * Every state needs to increment the line number upon LF unless the LF
1524          * gets reconsumed by another state which increments the line number.
1525          *
1526          *
1527          * CR handling
1528          *
1529          * Every state needs to handle CR unless the CR gets reconsumed and is
1530          * handled by the reconsuming state. The CR needs to be handled as if it
1531          * were and LF, the lastCR field must be set to true and then this
1532          * method must return. The IO driver will then swallow the next
1533          * character if it is an LF to coalesce CRLF.
1534          */
1535         stateloop: for (;;) {
1536             switch (state) {
1537                 case DATA:
1538                     dataloop: for (;;) {
1539                         if (reconsume) {
1540                             reconsume = false;
1541                         } else {
1542                             if (++pos == endPos) {
1543                                 break stateloop;
1544                             }
1545                             c = checkChar(buf, pos);
1546                         }
1547                         switch (c) {
1548                             case '&':
1549                                 /*
1550                                  * U+0026 AMPERSAND (&) Switch to the character
1551                                  * reference in data state.
1552                                  */
1553                                 flushChars(buf, pos);
1554                                 assert charRefBufLen == 0: "charRefBufLen not reset after previous use!";
1555                                 appendCharRefBuf(c);
1556                                 setAdditionalAndRememberAmpersandLocation('\u0000');
1557                                 returnState = state;
1558                                 state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos);
1559                                 continue stateloop;
1560                             case '<':
1561                                 /*
1562                                  * U+003C LESS-THAN SIGN (<) Switch to the tag
1563                                  * open state.
1564                                  */
1565                                 flushChars(buf, pos);
1566 
1567                                 state = transition(state, Tokenizer.TAG_OPEN, reconsume, pos);
1568                                 break dataloop; // FALL THROUGH continue
1569                             // stateloop;
1570                             case '\u0000':
1571                                 emitReplacementCharacter(buf, pos);
1572                                 continue;
1573                             case '\r':
1574                                 emitCarriageReturn(buf, pos);
1575                                 break stateloop;
1576                             case '\n':
1577                                 silentLineFeed();
1578                                 // CPPONLY: MOZ_FALLTHROUGH;
1579                             default:
1580                                 /*
1581                                  * Anything else Emit the input character as a
1582                                  * character token.
1583                                  *
1584                                  * Stay in the data state.
1585                                  */
1586                                 continue;
1587                         }
1588                     }
1589                     // CPPONLY: MOZ_FALLTHROUGH;
1590                 case TAG_OPEN:
1591                     tagopenloop: for (;;) {
1592                         /*
1593                          * The behavior of this state depends on the content
1594                          * model flag.
1595                          */
1596                         if (++pos == endPos) {
1597                             break stateloop;
1598                         }
1599                         c = checkChar(buf, pos);
1600                         /*
1601                          * If the content model flag is set to the PCDATA state
1602                          * Consume the next input character:
1603                          */
1604                         if (c >= 'A' && c <= 'Z') {
1605                             /*
1606                              * U+0041 LATIN CAPITAL LETTER A through to U+005A
1607                              * LATIN CAPITAL LETTER Z Create a new start tag
1608                              * token,
1609                              */
1610                             endTag = false;
1611                             /*
1612                              * set its tag name to the lowercase version of the
1613                              * input character (add 0x0020 to the character's
1614                              * code point),
1615                              */
1616                             clearStrBufBeforeUse();
1617                             appendStrBuf((char) (c + 0x20));
1618                             containsHyphen = false;
1619                             /* then switch to the tag name state. */
1620                             state = transition(state, Tokenizer.TAG_NAME, reconsume, pos);
1621                             /*
1622                              * (Don't emit the token yet; further details will
1623                              * be filled in before it is emitted.)
1624                              */
1625                             break tagopenloop;
1626                             // continue stateloop;
1627                         } else if (c >= 'a' && c <= 'z') {
1628                             /*
1629                              * U+0061 LATIN SMALL LETTER A through to U+007A
1630                              * LATIN SMALL LETTER Z Create a new start tag
1631                              * token,
1632                              */
1633                             endTag = false;
1634                             /*
1635                              * set its tag name to the input character,
1636                              */
1637                             clearStrBufBeforeUse();
1638                             appendStrBuf(c);
1639                             containsHyphen = false;
1640                             /* then switch to the tag name state. */
1641                             state = transition(state, Tokenizer.TAG_NAME, reconsume, pos);
1642                             /*
1643                              * (Don't emit the token yet; further details will
1644                              * be filled in before it is emitted.)
1645                              */
1646                             break tagopenloop;
1647                             // continue stateloop;
1648                         }
1649                         switch (c) {
1650                             case '!':
1651                                 /*
1652                                  * U+0021 EXCLAMATION MARK (!) Switch to the
1653                                  * markup declaration open state.
1654                                  */
1655                                 state = transition(state, Tokenizer.MARKUP_DECLARATION_OPEN, reconsume, pos);
1656                                 continue stateloop;
1657                             case '/':
1658                                 /*
1659                                  * U+002F SOLIDUS (/) Switch to the close tag
1660                                  * open state.
1661                                  */
1662                                 state = transition(state, Tokenizer.CLOSE_TAG_OPEN, reconsume, pos);
1663                                 continue stateloop;
1664                             case '?':
1665                                 // CPPONLY: if (viewingXmlSource) {
1666                                 // CPPONLY: state = transition(state,
1667                                 // CPPONLY: Tokenizer.PROCESSING_INSTRUCTION,
1668                                 // CPPONLY: reconsume,
1669                                 // CPPONLY: pos);
1670                                 // CPPONLY: continue stateloop;
1671                                 // CPPONLY: }
1672                                 /*
1673                                  * U+003F QUESTION MARK (?) Parse error.
1674                                  */
1675                                 errProcessingInstruction();
1676                                 /*
1677                                  * Switch to the bogus comment state.
1678                                  */
1679                                 clearStrBufBeforeUse();
1680                                 appendStrBuf(c);
1681                                 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
1682                                 continue stateloop;
1683                             case '>':
1684                                 /*
1685                                  * U+003E GREATER-THAN SIGN (>) Parse error.
1686                                  */
1687                                 errLtGt();
1688                                 /*
1689                                  * Emit a U+003C LESS-THAN SIGN character token
1690                                  * and a U+003E GREATER-THAN SIGN character
1691                                  * token.
1692                                  */
1693                                 tokenHandler.characters(Tokenizer.LT_GT, 0, 2);
1694                                 /* Switch to the data state. */
1695                                 cstart = pos + 1;
1696                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
1697                                 continue stateloop;
1698                             default:
1699                                 /*
1700                                  * Anything else Parse error.
1701                                  */
1702                                 errBadCharAfterLt(c);
1703                                 /*
1704                                  * Emit a U+003C LESS-THAN SIGN character token
1705                                  */
1706                                 tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
1707                                 /*
1708                                  * and reconsume the current input character in
1709                                  * the data state.
1710                                  */
1711                                 cstart = pos;
1712                                 reconsume = true;
1713                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
1714                                 continue stateloop;
1715                         }
1716                     }
1717                     // CPPONLY: MOZ_FALLTHROUGH;
1718                 case TAG_NAME:
1719                     tagnameloop: for (;;) {
1720                         if (++pos == endPos) {
1721                             break stateloop;
1722                         }
1723                         c = checkChar(buf, pos);
1724                         /*
1725                          * Consume the next input character:
1726                          */
1727                         switch (c) {
1728                             case '\r':
1729                                 silentCarriageReturn();
1730                                 strBufToElementNameString();
1731                                 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
1732                                 break stateloop;
1733                             case '\n':
1734                                 silentLineFeed();
1735                                 // CPPONLY: MOZ_FALLTHROUGH;
1736                             case ' ':
1737                             case '\t':
1738                             case '\u000C':
1739                                 /*
1740                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED
1741                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE
1742                                  * Switch to the before attribute name state.
1743                                  */
1744                                 strBufToElementNameString();
1745                                 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
1746                                 break tagnameloop;
                            // Same effect as "continue stateloop": falls through into the BEFORE_ATTRIBUTE_NAME case.
1748                             case '/':
1749                                 /*
1750                                  * U+002F SOLIDUS (/) Switch to the self-closing
1751                                  * start tag state.
1752                                  */
1753                                 strBufToElementNameString();
1754                                 state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);
1755                                 continue stateloop;
1756                             case '>':
1757                                 /*
1758                                  * U+003E GREATER-THAN SIGN (>) Emit the current
1759                                  * tag token.
1760                                  */
1761                                 strBufToElementNameString();
1762                                 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
1763                                 if (shouldSuspend) {
1764                                     break stateloop;
1765                                 }
1766                                 /*
1767                                  * Switch to the data state.
1768                                  */
1769                                 continue stateloop;
1770                             case '\u0000':
1771                                 c = '\uFFFD';
1772                                 // CPPONLY: MOZ_FALLTHROUGH;
1773                             default:
1774                                 if (c >= 'A' && c <= 'Z') {
1775                                     /*
1776                                      * U+0041 LATIN CAPITAL LETTER A through to
1777                                      * U+005A LATIN CAPITAL LETTER Z Append the
1778                                      * lowercase version of the current input
1779                                      * character (add 0x0020 to the character's
1780                                      * code point) to the current tag token's
1781                                      * tag name.
1782                                      */
1783                                     c += 0x20;
1784                                 } else if (c == '-') {
1785                                     containsHyphen = true;
1786                                 }
1787                                 /*
1788                                  * Anything else Append the current input
1789                                  * character to the current tag token's tag
1790                                  * name.
1791                                  */
1792                                 appendStrBuf(c);
1793                                 /*
1794                                  * Stay in the tag name state.
1795                                  */
1796                                 continue;
1797                         }
1798                     }
1799                     // CPPONLY: MOZ_FALLTHROUGH;
1800                 case BEFORE_ATTRIBUTE_NAME:
1801                     beforeattributenameloop: for (;;) {
1802                         if (reconsume) {
1803                             reconsume = false;
1804                         } else {
1805                             if (++pos == endPos) {
1806                                 break stateloop;
1807                             }
1808                             c = checkChar(buf, pos);
1809                         }
1810                         /*
1811                          * Consume the next input character:
1812                          */
1813                         switch (c) {
1814                             case '\r':
1815                                 silentCarriageReturn();
1816                                 break stateloop;
1817                             case '\n':
1818                                 silentLineFeed();
1819                                 // CPPONLY: MOZ_FALLTHROUGH;
1820                             case ' ':
1821                             case '\t':
1822                             case '\u000C':
1823                                 /*
1824                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED
1825                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
1826                                  * in the before attribute name state.
1827                                  */
1828                                 continue;
1829                             case '/':
1830                                 /*
1831                                  * U+002F SOLIDUS (/) Switch to the self-closing
1832                                  * start tag state.
1833                                  */
1834                                 state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);
1835                                 continue stateloop;
1836                             case '>':
1837                                 /*
1838                                  * U+003E GREATER-THAN SIGN (>) Emit the current
1839                                  * tag token.
1840                                  */
1841                                 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
1842                                 if (shouldSuspend) {
1843                                     break stateloop;
1844                                 }
1845                                 /*
1846                                  * Switch to the data state.
1847                                  */
1848                                 continue stateloop;
1849                             case '\u0000':
1850                                 c = '\uFFFD';
1851                                 // CPPONLY: MOZ_FALLTHROUGH;
1852                             case '\"':
1853                             case '\'':
1854                             case '<':
1855                             case '=':
1856                                 /*
1857                                  * U+0022 QUOTATION MARK (") U+0027 APOSTROPHE
1858                                  * (') U+003C LESS-THAN SIGN (<) U+003D EQUALS
1859                                  * SIGN (=) Parse error.
1860                                  */
1861                                 errBadCharBeforeAttributeNameOrNull(c);
1862                                 /*
1863                                  * Treat it as per the "anything else" entry
1864                                  * below.
1865                                  */
1866                                 // CPPONLY: MOZ_FALLTHROUGH;
1867                             default:
1868                                 /*
1869                                  * Anything else Start a new attribute in the
1870                                  * current tag token.
1871                                  */
1872                                 if (c >= 'A' && c <= 'Z') {
1873                                     /*
1874                                      * U+0041 LATIN CAPITAL LETTER A through to
1875                                      * U+005A LATIN CAPITAL LETTER Z Set that
1876                                      * attribute's name to the lowercase version
1877                                      * of the current input character (add
1878                                      * 0x0020 to the character's code point)
1879                                      */
1880                                     c += 0x20;
1881                                 }
1882                                 // CPPONLY: attributeLine = line;
1883                                 /*
1884                                  * Set that attribute's name to the current
1885                                  * input character,
1886                                  */
1887                                 clearStrBufBeforeUse();
1888                                 appendStrBuf(c);
1889                                 /*
1890                                  * and its value to the empty string.
1891                                  */
1892                                 // Will do later.
1893                                 /*
1894                                  * Switch to the attribute name state.
1895                                  */
1896                                 state = transition(state, Tokenizer.ATTRIBUTE_NAME, reconsume, pos);
1897                                 break beforeattributenameloop;
                            // Same effect as "continue stateloop": falls through into the ATTRIBUTE_NAME case.
1899                         }
1900                     }
1901                     // CPPONLY: MOZ_FALLTHROUGH;
1902                 case ATTRIBUTE_NAME:
1903                     attributenameloop: for (;;) {
1904                         if (++pos == endPos) {
1905                             break stateloop;
1906                         }
1907                         c = checkChar(buf, pos);
1908                         /*
1909                          * Consume the next input character:
1910                          */
1911                         switch (c) {
1912                             case '\r':
1913                                 silentCarriageReturn();
1914                                 attributeNameComplete();
1915                                 state = transition(state, Tokenizer.AFTER_ATTRIBUTE_NAME, reconsume, pos);
1916                                 break stateloop;
1917                             case '\n':
1918                                 silentLineFeed();
1919                                 // CPPONLY: MOZ_FALLTHROUGH;
1920                             case ' ':
1921                             case '\t':
1922                             case '\u000C':
1923                                 /*
1924                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED
1925                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE
1926                                  * Switch to the after attribute name state.
1927                                  */
1928                                 attributeNameComplete();
1929                                 state = transition(state, Tokenizer.AFTER_ATTRIBUTE_NAME, reconsume, pos);
1930                                 continue stateloop;
1931                             case '/':
1932                                 /*
1933                                  * U+002F SOLIDUS (/) Switch to the self-closing
1934                                  * start tag state.
1935                                  */
1936                                 attributeNameComplete();
1937                                 addAttributeWithoutValue();
1938                                 state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);
1939                                 continue stateloop;
1940                             case '=':
1941                                 /*
1942                                  * U+003D EQUALS SIGN (=) Switch to the before
1943                                  * attribute value state.
1944                                  */
1945                                 attributeNameComplete();
1946                                 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_VALUE, reconsume, pos);
1947                                 break attributenameloop;
                            // Same effect as "continue stateloop": falls through into the BEFORE_ATTRIBUTE_VALUE case.
1949                             case '>':
1950                                 /*
1951                                  * U+003E GREATER-THAN SIGN (>) Emit the current
1952                                  * tag token.
1953                                  */
1954                                 attributeNameComplete();
1955                                 addAttributeWithoutValue();
1956                                 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
1957                                 if (shouldSuspend) {
1958                                     break stateloop;
1959                                 }
1960                                 /*
1961                                  * Switch to the data state.
1962                                  */
1963                                 continue stateloop;
1964                             case '\u0000':
1965                                 c = '\uFFFD';
1966                                 // CPPONLY: MOZ_FALLTHROUGH;
1967                             case '\"':
1968                             case '\'':
1969                             case '<':
1970                                 /*
1971                                  * U+0022 QUOTATION MARK (") U+0027 APOSTROPHE
1972                                  * (') U+003C LESS-THAN SIGN (<) Parse error.
1973                                  */
1974                                 errQuoteOrLtInAttributeNameOrNull(c);
1975                                 /*
1976                                  * Treat it as per the "anything else" entry
1977                                  * below.
1978                                  */
1979                                 // CPPONLY: MOZ_FALLTHROUGH;
1980                             default:
1981                                 if (c >= 'A' && c <= 'Z') {
1982                                     /*
1983                                      * U+0041 LATIN CAPITAL LETTER A through to
1984                                      * U+005A LATIN CAPITAL LETTER Z Append the
1985                                      * lowercase version of the current input
1986                                      * character (add 0x0020 to the character's
1987                                      * code point) to the current attribute's
1988                                      * name.
1989                                      */
1990                                     c += 0x20;
1991                                 }
1992                                 /*
1993                                  * Anything else Append the current input
1994                                  * character to the current attribute's name.
1995                                  */
1996                                 appendStrBuf(c);
1997                                 /*
1998                                  * Stay in the attribute name state.
1999                                  */
2000                                 continue;
2001                         }
2002                     }
2003                     // CPPONLY: MOZ_FALLTHROUGH;
2004                 case BEFORE_ATTRIBUTE_VALUE:
2005                     beforeattributevalueloop: for (;;) {
2006                         if (++pos == endPos) {
2007                             break stateloop;
2008                         }
2009                         c = checkChar(buf, pos);
2010                         /*
2011                          * Consume the next input character:
2012                          */
2013                         switch (c) {
2014                             case '\r':
2015                                 silentCarriageReturn();
2016                                 break stateloop;
2017                             case '\n':
2018                                 silentLineFeed();
2019                                 // CPPONLY: MOZ_FALLTHROUGH;
2020                             case ' ':
2021                             case '\t':
2022                             case '\u000C':
2023                                 /*
2024                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED
2025                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
2026                                  * in the before attribute value state.
2027                                  */
2028                                 continue;
2029                             case '"':
2030                                 /*
2031                                  * U+0022 QUOTATION MARK (") Switch to the
2032                                  * attribute value (double-quoted) state.
2033                                  */
2034                                 // CPPONLY: attributeLine = line;
2035                                 clearStrBufBeforeUse();
2036                                 state = transition(state, Tokenizer.ATTRIBUTE_VALUE_DOUBLE_QUOTED, reconsume, pos);
2037                                 break beforeattributevalueloop;
                            // Same effect as "continue stateloop": falls through into the ATTRIBUTE_VALUE_DOUBLE_QUOTED case.
2039                             case '&':
2040                                 /*
2041                                  * U+0026 AMPERSAND (&) Switch to the attribute
2042                                  * value (unquoted) state and reconsume this
2043                                  * input character.
2044                                  */
2045                                 // CPPONLY: attributeLine = line;
2046                                 clearStrBufBeforeUse();
2047                                 reconsume = true;
2048                                 state = transition(state, Tokenizer.ATTRIBUTE_VALUE_UNQUOTED, reconsume, pos);
2049                                 noteUnquotedAttributeValue();
2050                                 continue stateloop;
2051                             case '\'':
2052                                 /*
2053                                  * U+0027 APOSTROPHE (') Switch to the attribute
2054                                  * value (single-quoted) state.
2055                                  */
2056                                 // CPPONLY: attributeLine = line;
2057                                 clearStrBufBeforeUse();
2058                                 state = transition(state, Tokenizer.ATTRIBUTE_VALUE_SINGLE_QUOTED, reconsume, pos);
2059                                 continue stateloop;
2060                             case '>':
2061                                 /*
2062                                  * U+003E GREATER-THAN SIGN (>) Parse error.
2063                                  */
2064                                 errAttributeValueMissing();
2065                                 /*
2066                                  * Emit the current tag token.
2067                                  */
2068                                 addAttributeWithoutValue();
2069                                 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
2070                                 if (shouldSuspend) {
2071                                     break stateloop;
2072                                 }
2073                                 /*
2074                                  * Switch to the data state.
2075                                  */
2076                                 continue stateloop;
2077                             case '\u0000':
2078                                 c = '\uFFFD';
2079                                 // CPPONLY: MOZ_FALLTHROUGH;
2080                             case '<':
2081                             case '=':
2082                             case '`':
                                /*
                                 * U+003C LESS-THAN SIGN (<) U+003D EQUALS SIGN
                                 * (=) U+0060 GRAVE ACCENT (`) Parse error.
                                 */
2087                                 errLtOrEqualsOrGraveInUnquotedAttributeOrNull(c);
2088                                 /*
2089                                  * Treat it as per the "anything else" entry
2090                                  * below.
2091                                  */
2092                                 // CPPONLY: MOZ_FALLTHROUGH;
2093                             default:
2094                                 /*
2095                                  * Anything else Append the current input
2096                                  * character to the current attribute's value.
2097                                  */
2098                                 // CPPONLY: attributeLine = line;
2099                                 clearStrBufBeforeUse();
2100                                 appendStrBuf(c);
2101                                 /*
2102                                  * Switch to the attribute value (unquoted)
2103                                  * state.
2104                                  */
2105 
2106                                 state = transition(state, Tokenizer.ATTRIBUTE_VALUE_UNQUOTED, reconsume, pos);
2107                                 noteUnquotedAttributeValue();
2108                                 continue stateloop;
2109                         }
2110                     }
2111                     // CPPONLY: MOZ_FALLTHROUGH;
2112                 case ATTRIBUTE_VALUE_DOUBLE_QUOTED:
2113                     attributevaluedoublequotedloop: for (;;) {
2114                         if (reconsume) {
2115                             reconsume = false;
2116                         } else {
2117                             if (++pos == endPos) {
2118                                 break stateloop;
2119                             }
2120                             c = checkChar(buf, pos);
2121                         }
2122                         /*
2123                          * Consume the next input character:
2124                          */
2125                         switch (c) {
2126                             case '"':
2127                                 /*
2128                                  * U+0022 QUOTATION MARK (") Switch to the after
2129                                  * attribute value (quoted) state.
2130                                  */
2131                                 addAttributeWithValue();
2132 
2133                                 state = transition(state, Tokenizer.AFTER_ATTRIBUTE_VALUE_QUOTED, reconsume, pos);
2134                                 break attributevaluedoublequotedloop;
                            // Same effect as "continue stateloop": falls through into the AFTER_ATTRIBUTE_VALUE_QUOTED case.
2136                             case '&':
2137                                 /*
2138                                  * U+0026 AMPERSAND (&) Switch to the character
2139                                  * reference in attribute value state, with the
2140                                  * additional allowed character being U+0022
2141                                  * QUOTATION MARK (").
2142                                  */
2143                                 assert charRefBufLen == 0: "charRefBufLen not reset after previous use!";
2144                                 appendCharRefBuf(c);
2145                                 setAdditionalAndRememberAmpersandLocation('\"');
2146                                 returnState = state;
2147                                 state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos);
2148                                 continue stateloop;
2149                             case '\r':
2150                                 appendStrBufCarriageReturn();
2151                                 break stateloop;
2152                             case '\n':
2153                                 appendStrBufLineFeed();
2154                                 continue;
2155                             case '\u0000':
2156                                 c = '\uFFFD';
2157                                 // CPPONLY: MOZ_FALLTHROUGH;
2158                             default:
2159                                 /*
2160                                  * Anything else Append the current input
2161                                  * character to the current attribute's value.
2162                                  */
2163                                 appendStrBuf(c);
2164                                 /*
2165                                  * Stay in the attribute value (double-quoted)
2166                                  * state.
2167                                  */
2168                                 continue;
2169                         }
2170                     }
2171                     // CPPONLY: MOZ_FALLTHROUGH;
2172                 case AFTER_ATTRIBUTE_VALUE_QUOTED:
2173                     afterattributevaluequotedloop: for (;;) {
2174                         if (++pos == endPos) {
2175                             break stateloop;
2176                         }
2177                         c = checkChar(buf, pos);
2178                         /*
2179                          * Consume the next input character:
2180                          */
2181                         switch (c) {
2182                             case '\r':
2183                                 silentCarriageReturn();
2184                                 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
2185                                 break stateloop;
2186                             case '\n':
2187                                 silentLineFeed();
2188                                 // CPPONLY: MOZ_FALLTHROUGH;
2189                             case ' ':
2190                             case '\t':
2191                             case '\u000C':
2192                                 /*
2193                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED
2194                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE
2195                                  * Switch to the before attribute name state.
2196                                  */
2197                                 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
2198                                 continue stateloop;
2199                             case '/':
2200                                 /*
2201                                  * U+002F SOLIDUS (/) Switch to the self-closing
2202                                  * start tag state.
2203                                  */
2204                                 state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);
2205                                 break afterattributevaluequotedloop;
                            // Same effect as "continue stateloop": falls through into the SELF_CLOSING_START_TAG case.
2207                             case '>':
2208                                 /*
2209                                  * U+003E GREATER-THAN SIGN (>) Emit the current
2210                                  * tag token.
2211                                  */
2212                                 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
2213                                 if (shouldSuspend) {
2214                                     break stateloop;
2215                                 }
2216                                 /*
2217                                  * Switch to the data state.
2218                                  */
2219                                 continue stateloop;
2220                             default:
2221                                 /*
2222                                  * Anything else Parse error.
2223                                  */
2224                                 errNoSpaceBetweenAttributes();
2225                                 /*
2226                                  * Reconsume the character in the before
2227                                  * attribute name state.
2228                                  */
2229                                 reconsume = true;
2230                                 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
2231                                 continue stateloop;
2232                         }
2233                     }
2234                     // CPPONLY: MOZ_FALLTHROUGH;
2235                 case SELF_CLOSING_START_TAG:
2236                     if (++pos == endPos) {
2237                         break stateloop;
2238                     }
2239                     c = checkChar(buf, pos);
2240                     /*
2241                      * Consume the next input character:
2242                      */
2243                     switch (c) {
2244                         case '>':
2245                             /*
2246                              * U+003E GREATER-THAN SIGN (>) Set the self-closing
2247                              * flag of the current tag token. Emit the current
2248                              * tag token.
2249                              */
2250                             state = transition(state, emitCurrentTagToken(true, pos), reconsume, pos);
2251                             if (shouldSuspend) {
2252                                 break stateloop;
2253                             }
2254                             /*
2255                              * Switch to the data state.
2256                              */
2257                             continue stateloop;
2258                         default:
2259                             /* Anything else Parse error. */
2260                             errSlashNotFollowedByGt();
2261                             /*
2262                              * Reconsume the character in the before attribute
2263                              * name state.
2264                              */
2265                             reconsume = true;
2266                             state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
2267                             continue stateloop;
2268                     }
2269                 case ATTRIBUTE_VALUE_UNQUOTED:
2270                     for (;;) {
2271                         if (reconsume) {
2272                             reconsume = false;
2273                         } else {
2274                             if (++pos == endPos) {
2275                                 break stateloop;
2276                             }
2277                             c = checkChar(buf, pos);
2278                         }
2279                         /*
2280                          * Consume the next input character:
2281                          */
2282                         switch (c) {
2283                             case '\r':
2284                                 silentCarriageReturn();
2285                                 addAttributeWithValue();
2286                                 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
2287                                 break stateloop;
2288                             case '\n':
2289                                 silentLineFeed();
2290                                 // CPPONLY: MOZ_FALLTHROUGH;
2291                             case ' ':
2292                             case '\t':
2293                             case '\u000C':
2294                                 /*
2295                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED
2296                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE
2297                                  * Switch to the before attribute name state.
2298                                  */
2299                                 addAttributeWithValue();
2300                                 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
2301                                 continue stateloop;
2302                             case '&':
2303                                 /*
2304                                  * U+0026 AMPERSAND (&) Switch to the character
2305                                  * reference in attribute value state, with the
2306                                  * additional allowed character being U+003E
2307                                  * GREATER-THAN SIGN (>)
2308                                  */
2309                                 assert charRefBufLen == 0: "charRefBufLen not reset after previous use!";
2310                                 appendCharRefBuf(c);
2311                                 setAdditionalAndRememberAmpersandLocation('>');
2312                                 returnState = state;
2313                                 state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos);
2314                                 continue stateloop;
2315                             case '>':
2316                                 /*
2317                                  * U+003E GREATER-THAN SIGN (>) Emit the current
2318                                  * tag token.
2319                                  */
2320                                 addAttributeWithValue();
2321                                 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
2322                                 if (shouldSuspend) {
2323                                     break stateloop;
2324                                 }
2325                                 /*
2326                                  * Switch to the data state.
2327                                  */
2328                                 continue stateloop;
2329                             case '\u0000':
2330                                 c = '\uFFFD';
2331                                 // CPPONLY: MOZ_FALLTHROUGH;
2332                             case '<':
2333                             case '\"':
2334                             case '\'':
2335                             case '=':
2336                             case '`':
2337                                 /*
2338                                  * U+0022 QUOTATION MARK (") U+0027 APOSTROPHE
2339                                  * (') U+003C LESS-THAN SIGN (<) U+003D EQUALS
2340                                  * SIGN (=) U+0060 GRAVE ACCENT (`) Parse error.
2341                                  */
2342                                 errUnquotedAttributeValOrNull(c);
2343                                 /*
2344                                  * Treat it as per the "anything else" entry
2345                                  * below.
2346                                  */
2347                                 // CPPONLY: MOZ_FALLTHROUGH;
2348                             default:
2349                                 /*
2350                                  * Anything else Append the current input
2351                                  * character to the current attribute's value.
2352                                  */
2353                                 appendStrBuf(c);
2354                                 /*
2355                                  * Stay in the attribute value (unquoted) state.
2356                                  */
2357                                 continue;
2358                         }
2359                     }
2360                 case AFTER_ATTRIBUTE_NAME:
2361                     for (;;) {
2362                         if (++pos == endPos) {
2363                             break stateloop;
2364                         }
2365                         c = checkChar(buf, pos);
2366                         /*
2367                          * Consume the next input character:
2368                          */
2369                         switch (c) {
2370                             case '\r':
2371                                 silentCarriageReturn();
2372                                 break stateloop;
2373                             case '\n':
2374                                 silentLineFeed();
2375                                 // CPPONLY: MOZ_FALLTHROUGH;
2376                             case ' ':
2377                             case '\t':
2378                             case '\u000C':
2379                                 /*
2380                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED
2381                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
2382                                  * in the after attribute name state.
2383                                  */
2384                                 continue;
2385                             case '/':
2386                                 /*
2387                                  * U+002F SOLIDUS (/) Switch to the self-closing
2388                                  * start tag state.
2389                                  */
2390                                 addAttributeWithoutValue();
2391                                 state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);
2392                                 continue stateloop;
2393                             case '=':
2394                                 /*
2395                                  * U+003D EQUALS SIGN (=) Switch to the before
2396                                  * attribute value state.
2397                                  */
2398                                 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_VALUE, reconsume, pos);
2399                                 continue stateloop;
2400                             case '>':
2401                                 /*
2402                                  * U+003E GREATER-THAN SIGN (>) Emit the current
2403                                  * tag token.
2404                                  */
2405                                 addAttributeWithoutValue();
2406                                 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
2407                                 if (shouldSuspend) {
2408                                     break stateloop;
2409                                 }
2410                                 /*
2411                                  * Switch to the data state.
2412                                  */
2413                                 continue stateloop;
2414                             case '\u0000':
2415                                 c = '\uFFFD';
2416                                 // CPPONLY: MOZ_FALLTHROUGH;
2417                             case '\"':
2418                             case '\'':
2419                             case '<':
2420                                 errQuoteOrLtInAttributeNameOrNull(c);
2421                                 /*
2422                                  * Treat it as per the "anything else" entry
2423                                  * below.
2424                                  */
2425                                 // CPPONLY: MOZ_FALLTHROUGH;
2426                             default:
2427                                 addAttributeWithoutValue();
2428                                 /*
2429                                  * Anything else Start a new attribute in the
2430                                  * current tag token.
2431                                  */
2432                                 if (c >= 'A' && c <= 'Z') {
2433                                     /*
2434                                      * U+0041 LATIN CAPITAL LETTER A through to
2435                                      * U+005A LATIN CAPITAL LETTER Z Set that
2436                                      * attribute's name to the lowercase version
2437                                      * of the current input character (add
2438                                      * 0x0020 to the character's code point)
2439                                      */
2440                                     c += 0x20;
2441                                 }
2442                                 /*
2443                                  * Set that attribute's name to the current
2444                                  * input character,
2445                                  */
2446                                 clearStrBufBeforeUse();
2447                                 appendStrBuf(c);
2448                                 /*
2449                                  * and its value to the empty string.
2450                                  */
2451                                 // Will do later.
2452                                 /*
2453                                  * Switch to the attribute name state.
2454                                  */
2455                                 state = transition(state, Tokenizer.ATTRIBUTE_NAME, reconsume, pos);
2456                                 continue stateloop;
2457                         }
2458                     }
2459                 case MARKUP_DECLARATION_OPEN:
2460                     markupdeclarationopenloop: for (;;) {
2461                         if (++pos == endPos) {
2462                             break stateloop;
2463                         }
2464                         c = checkChar(buf, pos);
2465                         /*
2466                          * If the next two characters are both U+002D
2467                          * HYPHEN-MINUS characters (-), consume those two
2468                          * characters, create a comment token whose data is the
2469                          * empty string, and switch to the comment start state.
2470                          *
2471                          * Otherwise, if the next seven characters are an ASCII
2472                          * case-insensitive match for the word "DOCTYPE", then
2473                          * consume those characters and switch to the DOCTYPE
2474                          * state.
2475                          *
2476                          * Otherwise, if the insertion mode is
2477                          * "in foreign content" and the current node is not an
2478                          * element in the HTML namespace and the next seven
                         * characters are a case-sensitive match for the string
2480                          * "[CDATA[" (the five uppercase letters "CDATA" with a
2481                          * U+005B LEFT SQUARE BRACKET character before and
2482                          * after), then consume those characters and switch to
2483                          * the CDATA section state.
2484                          *
                         * Otherwise, this is a parse error. Switch to the bogus
2486                          * comment state. The next character that is consumed,
2487                          * if any, is the first character that will be in the
2488                          * comment.
2489                          */
2490                         switch (c) {
2491                             case '-':
2492                                 clearStrBufBeforeUse();
2493                                 appendStrBuf(c);
2494                                 state = transition(state, Tokenizer.MARKUP_DECLARATION_HYPHEN, reconsume, pos);
2495                                 break markupdeclarationopenloop;
2496                             // continue stateloop;
2497                             case 'd':
2498                             case 'D':
2499                                 clearStrBufBeforeUse();
2500                                 appendStrBuf(c);
2501                                 index = 0;
2502                                 state = transition(state, Tokenizer.MARKUP_DECLARATION_OCTYPE, reconsume, pos);
2503                                 continue stateloop;
2504                             case '[':
2505                                 if (tokenHandler.cdataSectionAllowed()) {
2506                                     clearStrBufBeforeUse();
2507                                     appendStrBuf(c);
2508                                     index = 0;
2509                                     state = transition(state, Tokenizer.CDATA_START, reconsume, pos);
2510                                     continue stateloop;
2511                                 }
2512                                 // CPPONLY: MOZ_FALLTHROUGH;
2513                             default:
2514                                 errBogusComment();
2515                                 clearStrBufBeforeUse();
2516                                 reconsume = true;
2517                                 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
2518                                 continue stateloop;
2519                         }
2520                     }
2521                     // CPPONLY: MOZ_FALLTHROUGH;
2522                 case MARKUP_DECLARATION_HYPHEN:
2523                     markupdeclarationhyphenloop: for (;;) {
2524                         if (++pos == endPos) {
2525                             break stateloop;
2526                         }
2527                         c = checkChar(buf, pos);
2528                         switch (c) {
2529                             case '-':
2530                                 clearStrBufAfterOneHyphen();
2531                                 state = transition(state, Tokenizer.COMMENT_START, reconsume, pos);
2532                                 break markupdeclarationhyphenloop;
2533                             // continue stateloop;
2534                             default:
2535                                 errBogusComment();
2536                                 reconsume = true;
2537                                 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
2538                                 continue stateloop;
2539                         }
2540                     }
2541                     // CPPONLY: MOZ_FALLTHROUGH;
2542                 case COMMENT_START:
2543                     commentstartloop: for (;;) {
2544                         if (++pos == endPos) {
2545                             break stateloop;
2546                         }
2547                         c = checkChar(buf, pos);
2548                         /*
2549                          * Comment start state
2550                          *
2551                          *
2552                          * Consume the next input character:
2553                          */
2554                         switch (c) {
2555                             case '-':
2556                                 /*
2557                                  * U+002D HYPHEN-MINUS (-) Switch to the comment
2558                                  * start dash state.
2559                                  */
2560                                 appendStrBuf(c);
2561                                 state = transition(state, Tokenizer.COMMENT_START_DASH, reconsume, pos);
2562                                 continue stateloop;
2563                             case '>':
2564                                 /*
2565                                  * U+003E GREATER-THAN SIGN (>) Parse error.
2566                                  */
2567                                 errPrematureEndOfComment();
2568                                 /* Emit the comment token. */
2569                                 emitComment(0, pos);
2570                                 /*
2571                                  * Switch to the data state.
2572                                  */
2573                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
2574                                 continue stateloop;
2575                             case '\r':
2576                                 appendStrBufCarriageReturn();
2577                                 state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2578                                 break stateloop;
2579                             case '\n':
2580                                 appendStrBufLineFeed();
2581                                 state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2582                                 break commentstartloop;
2583                             case '\u0000':
2584                                 c = '\uFFFD';
2585                                 // CPPONLY: MOZ_FALLTHROUGH;
2586                             default:
2587                                 /*
2588                                  * Anything else Append the input character to
2589                                  * the comment token's data.
2590                                  */
2591                                 appendStrBuf(c);
2592                                 /*
2593                                  * Switch to the comment state.
2594                                  */
2595                                 state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2596                                 break commentstartloop;
2597                             // continue stateloop;
2598                         }
2599                     }
2600                     // CPPONLY: MOZ_FALLTHROUGH;
2601                 case COMMENT:
2602                     commentloop: for (;;) {
2603                         if (++pos == endPos) {
2604                             break stateloop;
2605                         }
2606                         c = checkChar(buf, pos);
2607                         /*
2608                          * Comment state Consume the next input character:
2609                          */
2610                         switch (c) {
2611                             case '-':
2612                                 /*
2613                                  * U+002D HYPHEN-MINUS (-) Switch to the comment
2614                                  * end dash state
2615                                  */
2616                                 appendStrBuf(c);
2617                                 state = transition(state, Tokenizer.COMMENT_END_DASH, reconsume, pos);
2618                                 break commentloop;
2619                             // continue stateloop;
2620                             case '\r':
2621                                 appendStrBufCarriageReturn();
2622                                 break stateloop;
2623                             case '\n':
2624                                 appendStrBufLineFeed();
2625                                 continue;
2626                             case '\u0000':
2627                                 c = '\uFFFD';
2628                                 // CPPONLY: MOZ_FALLTHROUGH;
2629                             default:
2630                                 /*
2631                                  * Anything else Append the input character to
2632                                  * the comment token's data.
2633                                  */
2634                                 appendStrBuf(c);
2635                                 /*
2636                                  * Stay in the comment state.
2637                                  */
2638                                 continue;
2639                         }
2640                     }
2641                     // CPPONLY: MOZ_FALLTHROUGH;
2642                 case COMMENT_END_DASH:
2643                     commentenddashloop: for (;;) {
2644                         if (++pos == endPos) {
2645                             break stateloop;
2646                         }
2647                         c = checkChar(buf, pos);
2648                         /*
2649                          * Comment end dash state Consume the next input
2650                          * character:
2651                          */
2652                         switch (c) {
2653                             case '-':
2654                                 /*
2655                                  * U+002D HYPHEN-MINUS (-) Switch to the comment
2656                                  * end state
2657                                  */
2658                                 appendStrBuf(c);
2659                                 state = transition(state, Tokenizer.COMMENT_END, reconsume, pos);
2660                                 break commentenddashloop;
2661                             // continue stateloop;
2662                             case '\r':
2663                                 appendStrBufCarriageReturn();
2664                                 state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2665                                 break stateloop;
2666                             case '\n':
2667                                 appendStrBufLineFeed();
2668                                 state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2669                                 continue stateloop;
2670                             case '\u0000':
2671                                 c = '\uFFFD';
2672                                 // CPPONLY: MOZ_FALLTHROUGH;
2673                             default:
2674                                 /*
2675                                  * Anything else Append a U+002D HYPHEN-MINUS
2676                                  * (-) character and the input character to the
2677                                  * comment token's data.
2678                                  */
2679                                 appendStrBuf(c);
2680                                 /*
2681                                  * Switch to the comment state.
2682                                  */
2683                                 state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2684                                 continue stateloop;
2685                         }
2686                     }
2687                     // CPPONLY: MOZ_FALLTHROUGH;
2688                 case COMMENT_END:
2689                     commentendloop: for (;;) {
2690                         if (++pos == endPos) {
2691                             break stateloop;
2692                         }
2693                         c = checkChar(buf, pos);
2694                         /*
                         * Comment end state Consume the next input
2696                          * character:
2697                          */
2698                         switch (c) {
2699                             case '>':
2700                                 /*
2701                                  * U+003E GREATER-THAN SIGN (>) Emit the comment
2702                                  * token.
2703                                  */
2704                                 emitComment(2, pos);
2705                                 /*
2706                                  * Switch to the data state.
2707                                  */
2708                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
2709                                 continue stateloop;
2710                             case '-':
2711                                 /* U+002D HYPHEN-MINUS (-) Parse error. */
2712                                 /*
2713                                  * Append a U+002D HYPHEN-MINUS (-) character to
2714                                  * the comment token's data.
2715                                  */
2716                                 adjustDoubleHyphenAndAppendToStrBufAndErr(c);
2717                                 /*
2718                                  * Stay in the comment end state.
2719                                  */
2720                                 continue;
2721                             case '\r':
2722                                 adjustDoubleHyphenAndAppendToStrBufCarriageReturn();
2723                                 state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2724                                 break stateloop;
2725                             case '\n':
2726                                 adjustDoubleHyphenAndAppendToStrBufLineFeed();
2727                                 state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2728                                 continue stateloop;
2729                             case '!':
2730                                 errHyphenHyphenBang();
2731                                 appendStrBuf(c);
2732                                 state = transition(state, Tokenizer.COMMENT_END_BANG, reconsume, pos);
2733                                 continue stateloop;
2734                             case '\u0000':
2735                                 c = '\uFFFD';
2736                                 // CPPONLY: MOZ_FALLTHROUGH;
2737                             default:
2738                                 /*
2739                                  * Append two U+002D HYPHEN-MINUS (-) characters
2740                                  * and the input character to the comment
2741                                  * token's data.
2742                                  */
2743                                 adjustDoubleHyphenAndAppendToStrBufAndErr(c);
2744                                 /*
2745                                  * Switch to the comment state.
2746                                  */
2747                                 state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2748                                 continue stateloop;
2749                         }
2750                     }
2751                 case COMMENT_END_BANG:
2752                     for (;;) {
2753                         if (++pos == endPos) {
2754                             break stateloop;
2755                         }
2756                         c = checkChar(buf, pos);
2757                         /*
2758                          * Comment end bang state
2759                          *
2760                          * Consume the next input character:
2761                          */
2762                         switch (c) {
2763                             case '>':
2764                                 /*
2765                                  * U+003E GREATER-THAN SIGN (>) Emit the comment
2766                                  * token.
2767                                  */
2768                                 emitComment(3, pos);
2769                                 /*
2770                                  * Switch to the data state.
2771                                  */
2772                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
2773                                 continue stateloop;
2774                             case '-':
2775                                 /*
2776                                  * Append two U+002D HYPHEN-MINUS (-) characters
2777                                  * and a U+0021 EXCLAMATION MARK (!) character
2778                                  * to the comment token's data.
2779                                  */
2780                                 appendStrBuf(c);
2781                                 /*
2782                                  * Switch to the comment end dash state.
2783                                  */
2784                                 state = transition(state, Tokenizer.COMMENT_END_DASH, reconsume, pos);
2785                                 continue stateloop;
2786                             case '\r':
2787                                 appendStrBufCarriageReturn();
2788                                 state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2789                                 break stateloop;
2790                             case '\n':
2791                                 appendStrBufLineFeed();
2792                                 state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2793                                 continue stateloop;
2794                             case '\u0000':
2795                                 c = '\uFFFD';
2796                                 // CPPONLY: MOZ_FALLTHROUGH;
2797                             default:
2798                                 /*
2799                                  * Anything else Append two U+002D HYPHEN-MINUS
2800                                  * (-) characters, a U+0021 EXCLAMATION MARK (!)
2801                                  * character, and the input character to the
2802                                  * comment token's data. Switch to the comment
2803                                  * state.
2804                                  */
2805                                 appendStrBuf(c);
2806                                 /*
2807                                  * Switch to the comment state.
2808                                  */
2809                                 state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2810                                 continue stateloop;
2811                         }
2812                     }
2813                 case COMMENT_START_DASH:
2814                     if (++pos == endPos) {
2815                         break stateloop;
2816                     }
2817                     c = checkChar(buf, pos);
2818                     /*
2819                      * Comment start dash state
2820                      *
2821                      * Consume the next input character:
2822                      */
2823                     switch (c) {
2824                         case '-':
2825                             /*
2826                              * U+002D HYPHEN-MINUS (-) Switch to the comment end
2827                              * state
2828                              */
2829                             appendStrBuf(c);
2830                             state = transition(state, Tokenizer.COMMENT_END, reconsume, pos);
2831                             continue stateloop;
2832                         case '>':
2833                             errPrematureEndOfComment();
2834                             /* Emit the comment token. */
2835                             emitComment(1, pos);
2836                             /*
2837                              * Switch to the data state.
2838                              */
2839                             state = transition(state, Tokenizer.DATA, reconsume, pos);
2840                             continue stateloop;
2841                         case '\r':
2842                             appendStrBufCarriageReturn();
2843                             state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2844                             break stateloop;
2845                         case '\n':
2846                             appendStrBufLineFeed();
2847                             state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2848                             continue stateloop;
2849                         case '\u0000':
2850                             c = '\uFFFD';
2851                             // CPPONLY: MOZ_FALLTHROUGH;
2852                         default:
2853                             /*
2854                              * Append a U+002D HYPHEN-MINUS character (-) and
2855                              * the current input character to the comment
2856                              * token's data.
2857                              */
2858                             appendStrBuf(c);
2859                             /*
2860                              * Switch to the comment state.
2861                              */
2862                             state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2863                             continue stateloop;
2864                     }
2865                 case CDATA_START:
2866                     for (;;) {
2867                         if (++pos == endPos) {
2868                             break stateloop;
2869                         }
2870                         c = checkChar(buf, pos);
2871                         if (index < 6) { // CDATA_LSQB.length
2872                             if (c == Tokenizer.CDATA_LSQB[index]) {
2873                                 appendStrBuf(c);
2874                             } else {
2875                                 errBogusComment();
2876                                 reconsume = true;
2877                                 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
2878                                 continue stateloop;
2879                             }
2880                             index++;
2881                             continue;
2882                         } else {
2883                             clearStrBufAfterUse();
2884                             cstart = pos; // start coalescing
2885                             reconsume = true;
2886                             state = transition(state, Tokenizer.CDATA_SECTION, reconsume, pos);
2887                             break; // FALL THROUGH continue stateloop;
2888                         }
2889                     }
2890                     // CPPONLY: MOZ_FALLTHROUGH;
2891                 case CDATA_SECTION:
2892                     cdatasectionloop: for (;;) {
2893                         if (reconsume) {
2894                             reconsume = false;
2895                         } else {
2896                             if (++pos == endPos) {
2897                                 break stateloop;
2898                             }
2899                             c = checkChar(buf, pos);
2900                         }
2901                         switch (c) {
2902                             case ']':
2903                                 flushChars(buf, pos);
2904                                 state = transition(state, Tokenizer.CDATA_RSQB, reconsume, pos);
2905                                 break cdatasectionloop; // FALL THROUGH
2906                             case '\u0000':
2907                                 emitReplacementCharacter(buf, pos);
2908                                 continue;
2909                             case '\r':
2910                                 emitCarriageReturn(buf, pos);
2911                                 break stateloop;
2912                             case '\n':
2913                                 silentLineFeed();
2914                                 // CPPONLY: MOZ_FALLTHROUGH;
2915                             default:
2916                                 continue;
2917                         }
2918                     }
2919                     // CPPONLY: MOZ_FALLTHROUGH;
2920                 case CDATA_RSQB:
2921                     cdatarsqb: for (;;) {
2922                         if (++pos == endPos) {
2923                             break stateloop;
2924                         }
2925                         c = checkChar(buf, pos);
2926                         switch (c) {
2927                             case ']':
2928                                 state = transition(state, Tokenizer.CDATA_RSQB_RSQB, reconsume, pos);
2929                                 break cdatarsqb;
2930                             default:
2931                                 tokenHandler.characters(Tokenizer.RSQB_RSQB, 0,
2932                                         1);
2933                                 cstart = pos;
2934                                 reconsume = true;
2935                                 state = transition(state, Tokenizer.CDATA_SECTION, reconsume, pos);
2936                                 continue stateloop;
2937                         }
2938                     }
2939                     // CPPONLY: MOZ_FALLTHROUGH;
2940                 case CDATA_RSQB_RSQB:
2941                     cdatarsqbrsqb: for (;;) {
2942                         if (++pos == endPos) {
2943                             break stateloop;
2944                         }
2945                         c = checkChar(buf, pos);
2946                         switch (c) {
2947                             case ']':
2948                                 // Saw a third ]. Emit one ] (logically the
2949                                 // first one) and stay in this state to
2950                                 // remember that the last two characters seen
2951                                 // have been ]].
2952                                 tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 1);
2953                                 continue;
2954                             case '>':
2955                                 cstart = pos + 1;
2956                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
2957                                 continue stateloop;
2958                             default:
2959                                 tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 2);
2960                                 cstart = pos;
2961                                 reconsume = true;
2962                                 state = transition(state, Tokenizer.CDATA_SECTION, reconsume, pos);
2963                                 continue stateloop;
2964                         }
2965                     }
2966                 case ATTRIBUTE_VALUE_SINGLE_QUOTED:
2967                     attributevaluesinglequotedloop: for (;;) {
2968                         if (reconsume) {
2969                             reconsume = false;
2970                         } else {
2971                             if (++pos == endPos) {
2972                                 break stateloop;
2973                             }
2974                             c = checkChar(buf, pos);
2975                         }
2976                         /*
2977                          * Consume the next input character:
2978                          */
2979                         switch (c) {
2980                             case '\'':
2981                                 /*
2982                                  * U+0027 APOSTROPHE (') Switch to the after
2983                                  * attribute value (quoted) state.
2984                                  */
2985                                 addAttributeWithValue();
2986 
2987                                 state = transition(state, Tokenizer.AFTER_ATTRIBUTE_VALUE_QUOTED, reconsume, pos);
2988                                 continue stateloop;
2989                             case '&':
2990                                 /*
2991                                  * U+0026 AMPERSAND (&) Switch to the character
2992                                  * reference in attribute value state, with the
2993                                  * additional allowed character being U+0027
2994                                  * APOSTROPHE (').
2995                                  */
2996                                 assert charRefBufLen == 0: "charRefBufLen not reset after previous use!";
2997                                 appendCharRefBuf(c);
2998                                 setAdditionalAndRememberAmpersandLocation('\'');
2999                                 returnState = state;
3000                                 state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos);
3001                                 break attributevaluesinglequotedloop;
3002                             // continue stateloop;
3003                             case '\r':
3004                                 appendStrBufCarriageReturn();
3005                                 break stateloop;
3006                             case '\n':
3007                                 appendStrBufLineFeed();
3008                                 continue;
3009                             case '\u0000':
3010                                 c = '\uFFFD';
3011                                 // CPPONLY: MOZ_FALLTHROUGH;
3012                             default:
3013                                 /*
3014                                  * Anything else Append the current input
3015                                  * character to the current attribute's value.
3016                                  */
3017                                 appendStrBuf(c);
3018                                 /*
3019                                  * Stay in the attribute value (single-quoted)
3020                                  * state.
3021                                  */
3022                                 continue;
3023                         }
3024                     }
3025                     // CPPONLY: MOZ_FALLTHROUGH;
3026                 case CONSUME_CHARACTER_REFERENCE:
3027                     if (++pos == endPos) {
3028                         break stateloop;
3029                     }
3030                     c = checkChar(buf, pos);
3031                     /*
3032                      * Unlike the definition in the spec, this state does not
3033                      * return a value and never requires the caller to
3034                      * backtrack. This state takes care of emitting characters
3035                      * or appending to the current attribute value. It also
3036                      * takes care of that in the case when consuming the
3037                      * character reference fails.
3038                      */
3039                     /*
3040                      * This section defines how to consume a character
3041                      * reference. This definition is used when parsing character
3042                      * references in text and in attributes.
3043                      *
3044                      * The behavior depends on the identity of the next
3045                      * character (the one immediately after the U+0026 AMPERSAND
3046                      * character):
3047                      */
3048                     switch (c) {
3049                         case ' ':
3050                         case '\t':
3051                         case '\n':
3052                         case '\r': // we'll reconsume!
3053                         case '\u000C':
3054                         case '<':
3055                         case '&':
3056                         case '\u0000':
3057                             emitOrAppendCharRefBuf(returnState);
3058                             if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
3059                                 cstart = pos;
3060                             }
3061                             reconsume = true;
3062                             state = transition(state, returnState, reconsume, pos);
3063                             continue stateloop;
3064                         case '#':
3065                             /*
3066                              * U+0023 NUMBER SIGN (#) Consume the U+0023 NUMBER
3067                              * SIGN.
3068                              */
3069                             appendCharRefBuf('#');
3070                             state = transition(state, Tokenizer.CONSUME_NCR, reconsume, pos);
3071                             continue stateloop;
3072                         default:
3073                             if (c == additional) {
3074                                 emitOrAppendCharRefBuf(returnState);
3075                                 reconsume = true;
3076                                 state = transition(state, returnState, reconsume, pos);
3077                                 continue stateloop;
3078                             }
3079                             if (c >= 'a' && c <= 'z') {
3080                                 firstCharKey = c - 'a' + 26;
3081                             } else if (c >= 'A' && c <= 'Z') {
3082                                 firstCharKey = c - 'A';
3083                             } else {
3084                                 // No match
3085                                 /*
3086                                  * If no match can be made, then this is a parse
3087                                  * error.
3088                                  */
3089                                 errNoNamedCharacterMatch();
3090                                 emitOrAppendCharRefBuf(returnState);
3091                                 if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
3092                                     cstart = pos;
3093                                 }
3094                                 reconsume = true;
3095                                 state = transition(state, returnState, reconsume, pos);
3096                                 continue stateloop;
3097                             }
3098                             // Didn't fail yet
3099                             appendCharRefBuf(c);
3100                             state = transition(state, Tokenizer.CHARACTER_REFERENCE_HILO_LOOKUP, reconsume, pos);
3101                             // FALL THROUGH continue stateloop;
3102                     }
3103                     // CPPONLY: MOZ_FALLTHROUGH;
3104                 case CHARACTER_REFERENCE_HILO_LOOKUP:
3105                     {
3106                         if (++pos == endPos) {
3107                             break stateloop;
3108                         }
3109                         c = checkChar(buf, pos);
3110                         /*
3111                          * The data structure is as follows:
3112                          *
3113                          * HILO_ACCEL is a two-dimensional int array whose major
3114                          * index corresponds to the second character of the
3115                          * character reference (code point as index) and the
3116                          * minor index corresponds to the first character of the
3117                          * character reference (packed so that A-Z runs from 0
3118                          * to 25 and a-z runs from 26 to 51). This layout makes
3119                          * it easier to use the sparseness of the data structure
3120                          * to omit parts of it: The second dimension of the
3121                          * table is null when no character reference starts with
3122                          * the character corresponding to that row.
3123                          *
3124                          * The int value HILO_ACCEL (by these indices) is zero
3125                          * if there exists no character reference starting with
3126                          * that two-letter prefix. Otherwise, the value is an
3127                          * int that packs two shorts so that the higher short is
3128                          * the index of the highest character reference name
3129                          * with that prefix in NAMES and the lower short
3130                          * corresponds to the index of the lowest character
3131                          * reference name with that prefix. (It happens that the
3132                          * first two character reference names share their
3133                          * prefix so the packed int cannot be 0 by packing the
3134                          * two shorts.)
3135                          *
3136                          * NAMES is an array of byte arrays where each byte
3137                          * array encodes the name of a character reference as
3138                          * ASCII. The names omit the first two letters of the
3139                          * name. (Since storing the first two letters would be
3140                          * redundant with the data contained in HILO_ACCEL.) The
3141                          * entries are lexically sorted.
3142                          *
3143                          * For a given index in NAMES, the same index in VALUES
3144                          * contains the corresponding expansion as an array of
3145                          * two UTF-16 code units (either the character and
3146                          * U+0000 or a surrogate pair).
3147                          */
3148                         int hilo = 0;
3149                         if (c <= 'z') {
3150                             @Const @NoLength int[] row = NamedCharactersAccel.HILO_ACCEL[c];
3151                             if (row != null) {
3152                                 hilo = row[firstCharKey];
3153                             }
3154                         }
3155                         if (hilo == 0) {
3156                             /*
3157                              * If no match can be made, then this is a parse
3158                              * error.
3159                              */
3160                             errNoNamedCharacterMatch();
3161                             emitOrAppendCharRefBuf(returnState);
3162                             if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
3163                                 cstart = pos;
3164                             }
3165                             reconsume = true;
3166                             state = transition(state, returnState, reconsume, pos);
3167                             continue stateloop;
3168                         }
3169                         // Didn't fail yet
3170                         appendCharRefBuf(c);
3171                         lo = hilo & 0xFFFF;
3172                         hi = hilo >> 16;
3173                         entCol = -1;
3174                         candidate = -1;
3175                         charRefBufMark = 0;
3176                         state = transition(state, Tokenizer.CHARACTER_REFERENCE_TAIL, reconsume, pos);
3177                         // FALL THROUGH continue stateloop;
3178                     }
3179                     // CPPONLY: MOZ_FALLTHROUGH;
3180                 case CHARACTER_REFERENCE_TAIL:
3181                     outer: for (;;) {
3182                         if (++pos == endPos) {
3183                             break stateloop;
3184                         }
3185                         c = checkChar(buf, pos);
3186                         entCol++;
3187                         /*
3188                          * Consume the maximum number of characters possible,
3189                          * with the consumed characters matching one of the
3190                          * identifiers in the first column of the named
3191                          * character references table (in a case-sensitive
3192                          * manner).
3193                          */
3194                         loloop: for (;;) {
3195                             if (hi < lo) {
3196                                 break outer;
3197                             }
3198                             if (entCol == NamedCharacters.NAMES[lo].length()) {
3199                                 candidate = lo;
3200                                 charRefBufMark = charRefBufLen;
3201                                 lo++;
3202                             } else if (entCol > NamedCharacters.NAMES[lo].length()) {
3203                                 break outer;
3204                             } else if (c > NamedCharacters.NAMES[lo].charAt(entCol)) {
3205                                 lo++;
3206                             } else {
3207                                 break loloop;
3208                             }
3209                         }
3210 
3211                         hiloop: for (;;) {
3212                             if (hi < lo) {
3213                                 break outer;
3214                             }
3215                             if (entCol == NamedCharacters.NAMES[hi].length()) {
3216                                 break hiloop;
3217                             }
3218                             if (entCol > NamedCharacters.NAMES[hi].length()) {
3219                                 break outer;
3220                             } else if (c < NamedCharacters.NAMES[hi].charAt(entCol)) {
3221                                 hi--;
3222                             } else {
3223                                 break hiloop;
3224                             }
3225                         }
3226 
3227                         if (c == ';') {
3228                             // If we see a semicolon, there cannot be a
3229                             // longer match. Break the loop. However, before
3230                             // breaking, take the longest match so far as the
3231                             // candidate, if we are just about to complete a
3232                             // match.
3233                             if (entCol + 1 == NamedCharacters.NAMES[lo].length()) {
3234                                 candidate = lo;
3235                                 charRefBufMark = charRefBufLen;
3236                             }
3237                             break outer;
3238                         }
3239 
3240                         if (hi < lo) {
3241                             break outer;
3242                         }
3243                         appendCharRefBuf(c);
3244                         continue;
3245                     }
3246 
3247                     if (candidate == -1) {
3248                         // reconsume deals with CR, LF or nul
3249                         /*
3250                          * If no match can be made, then this is a parse error.
3251                          */
3252                         errNoNamedCharacterMatch();
3253                         emitOrAppendCharRefBuf(returnState);
3254                         if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
3255                             cstart = pos;
3256                         }
3257                         reconsume = true;
3258                         state = transition(state, returnState, reconsume, pos);
3259                         continue stateloop;
3260                     } else {
3261                         // c can't be CR, LF or nul if we got here
3262                         @Const @CharacterName String candidateName = NamedCharacters.NAMES[candidate];
3263                         if (candidateName.length() == 0
3264                                 || candidateName.charAt(candidateName.length() - 1) != ';') {
3265                             /*
3266                              * If the last character matched is not a U+003B
3267                              * SEMICOLON (;), there is a parse error.
3268                              */
3269                             if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
3270                                 /*
3271                                  * If the entity is being consumed as part of an
3272                                  * attribute, and the last character matched is
3273                                  * not a U+003B SEMICOLON (;),
3274                                  */
3275                                 char ch;
3276                                 if (charRefBufMark == charRefBufLen) {
3277                                     ch = c;
3278                                 } else {
3279                                     ch = charRefBuf[charRefBufMark];
3280                                 }
3281                                 if (ch == '=' || (ch >= '0' && ch <= '9')
3282                                         || (ch >= 'A' && ch <= 'Z')
3283                                         || (ch >= 'a' && ch <= 'z')) {
3284                                     /*
3285                                      * and the next character is either a U+003D
3286                                      * EQUALS SIGN character (=) or in the range
3287                                      * U+0030 DIGIT ZERO to U+0039 DIGIT NINE,
3288                                      * U+0041 LATIN CAPITAL LETTER A to U+005A
3289                                      * LATIN CAPITAL LETTER Z, or U+0061 LATIN
3290                                      * SMALL LETTER A to U+007A LATIN SMALL
3291                                      * LETTER Z, then, for historical reasons,
3292                                      * all the characters that were matched
3293                                      * after the U+0026 AMPERSAND (&) must be
3294                                      * unconsumed, and nothing is returned.
3295                                      */
3296                                     errNoNamedCharacterMatch();
3297                                     appendCharRefBufToStrBuf();
3298                                     reconsume = true;
3299                                     state = transition(state, returnState, reconsume, pos);
3300                                     continue stateloop;
3301                                 }
3302                             }
3303                             if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
3304                                 errUnescapedAmpersandInterpretedAsCharacterReference();
3305                             } else {
3306                                 errNotSemicolonTerminated();
3307                             }
3308                         }
3309 
3310                         /*
3311                          * Otherwise, return a character token for the character
3312                          * corresponding to the entity name (as given by the
3313                          * second column of the named character references
3314                          * table).
3315                          */
3316                         // CPPONLY: completedNamedCharacterReference();
3317                         @Const @NoLength char[] val = NamedCharacters.VALUES[candidate];
3318                         if (
3319                         // [NOCPP[
3320                         val.length == 1
3321                         // ]NOCPP]
3322                         // CPPONLY: val[1] == 0
3323                         ) {
3324                             emitOrAppendOne(val, returnState);
3325                         } else {
3326                             emitOrAppendTwo(val, returnState);
3327                         }
3328                         // this is so complicated!
3329                         if (charRefBufMark < charRefBufLen) {
3330                             if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
3331                                 appendStrBuf(charRefBuf, charRefBufMark,
3332                                         charRefBufLen - charRefBufMark);
3333                             } else {
3334                                 tokenHandler.characters(charRefBuf, charRefBufMark,
3335                                         charRefBufLen - charRefBufMark);
3336                             }
3337                         }
3338                         // charRefBufLen will be zeroed below!
3339 
3340                         // Check if we broke out early with c being the last
3341                         // character that matched as opposed to being the
3342                         // first one that didn't match. In the case of an
3343                         // early break, the next run on text should start
3344                         // *after* the current character and the current
3345                         // character shouldn't be reconsumed.
3346                         boolean earlyBreak = (c == ';' && charRefBufMark == charRefBufLen);
3347                         charRefBufLen = 0;
3348                         if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
3349                             cstart = earlyBreak ? pos + 1 : pos;
3350                         }
3351                         reconsume = !earlyBreak;
3352                         state = transition(state, returnState, reconsume, pos);
3353                         continue stateloop;
3354                         /*
3355                          * If the markup contains I'm &notit; I tell you, the
3356                          * entity is parsed as "not", as in, I'm ¬it; I tell
3357                          * you. But if the markup was I'm &notin; I tell you,
3358                          * the entity would be parsed as "notin;", resulting in
3359                          * I'm ∉ I tell you.
3360                          */
3361                     }
3362                 case CONSUME_NCR:
3363                     if (++pos == endPos) {
3364                         break stateloop;
3365                     }
3366                     c = checkChar(buf, pos);
3367                     value = 0;
3368                     seenDigits = false;
3369                     /*
3370                      * The behavior further depends on the character after the
3371                      * U+0023 NUMBER SIGN:
3372                      */
3373                     switch (c) {
3374                         case 'x':
3375                         case 'X':
3376 
3377                             /*
3378                              * U+0078 LATIN SMALL LETTER X U+0058 LATIN CAPITAL
3379                              * LETTER X Consume the X.
3380                              *
3381                              * Follow the steps below, but using the range of
3382                              * characters U+0030 DIGIT ZERO through to U+0039
3383                              * DIGIT NINE, U+0061 LATIN SMALL LETTER A through
3384                              * to U+0066 LATIN SMALL LETTER F, and U+0041 LATIN
3385                              * CAPITAL LETTER A, through to U+0046 LATIN CAPITAL
3386                              * LETTER F (in other words, 0-9, A-F, a-f).
3387                              *
3388                              * When it comes to interpreting the number,
3389                              * interpret it as a hexadecimal number.
3390                              */
3391                             appendCharRefBuf(c);
3392                             state = transition(state, Tokenizer.HEX_NCR_LOOP, reconsume, pos);
3393                             continue stateloop;
3394                         default:
3395                             /*
3396                              * Anything else Follow the steps below, but using
3397                              * the range of characters U+0030 DIGIT ZERO through
3398                              * to U+0039 DIGIT NINE (i.e. just 0-9).
3399                              *
3400                              * When it comes to interpreting the number,
3401                              * interpret it as a decimal number.
3402                              */
3403                             reconsume = true;
3404                             state = transition(state, Tokenizer.DECIMAL_NRC_LOOP, reconsume, pos);
3405                             // FALL THROUGH continue stateloop;
3406                     }
3407                     // CPPONLY: MOZ_FALLTHROUGH;
3408                 case DECIMAL_NRC_LOOP:
3409                     decimalloop: for (;;) {
3410                         if (reconsume) {
3411                             reconsume = false;
3412                         } else {
3413                             if (++pos == endPos) {
3414                                 break stateloop;
3415                             }
3416                             c = checkChar(buf, pos);
3417                         }
3418                         /*
3419                          * Consume as many characters as match the range of
3420                          * characters given above.
3421                          */
3422                         assert value >= 0: "value must not become negative.";
3423                         if (c >= '0' && c <= '9') {
3424                             seenDigits = true;
3425                             // Avoid overflow
3426                             if (value <= 0x10FFFF) {
3427                                 value *= 10;
3428                                 value += c - '0';
3429                             }
3430                             continue;
3431                         } else if (c == ';') {
3432                             if (seenDigits) {
3433                                 if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
3434                                     cstart = pos + 1;
3435                                 }
3436                                 state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos);
3437                                 // FALL THROUGH continue stateloop;
3438                                 break decimalloop;
3439                             } else {
3440                                 errNoDigitsInNCR();
3441                                 appendCharRefBuf(';');
3442                                 emitOrAppendCharRefBuf(returnState);
3443                                 if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
3444                                     cstart = pos + 1;
3445                                 }
3446                                 state = transition(state, returnState, reconsume, pos);
3447                                 continue stateloop;
3448                             }
3449                         } else {
3450                             /*
3451                              * If no characters match the range, then don't
3452                              * consume any characters (and unconsume the U+0023
3453                              * NUMBER SIGN character and, if appropriate, the X
3454                              * character). This is a parse error; nothing is
3455                              * returned.
3456                              *
3457                              * Otherwise, if the next character is a U+003B
3458                              * SEMICOLON, consume that too. If it isn't, there
3459                              * is a parse error.
3460                              */
3461                             if (!seenDigits) {
3462                                 errNoDigitsInNCR();
3463                                 emitOrAppendCharRefBuf(returnState);
3464                                 if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
3465                                     cstart = pos;
3466                                 }
3467                                 reconsume = true;
3468                                 state = transition(state, returnState, reconsume, pos);
3469                                 continue stateloop;
3470                             } else {
3471                                 errCharRefLacksSemicolon();
3472                                 if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
3473                                     cstart = pos;
3474                                 }
3475                                 reconsume = true;
3476                                 state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos);
3477                                 // FALL THROUGH continue stateloop;
3478                                 break decimalloop;
3479                             }
3480                         }
3481                     }
3482                     // CPPONLY: MOZ_FALLTHROUGH;
3483                 case HANDLE_NCR_VALUE:
3484                     // WARNING previous state sets reconsume
3485                     // We are not going to emit the contents of charRefBuf.
3486                     charRefBufLen = 0;
3487                     // XXX inline this case if the method size can take it
3488                     handleNcrValue(returnState);
3489                     state = transition(state, returnState, reconsume, pos);
3490                     continue stateloop;
3491                 case HEX_NCR_LOOP:
3492                     for (;;) {
3493                         if (++pos == endPos) {
3494                             break stateloop;
3495                         }
3496                         c = checkChar(buf, pos);
3497                         /*
3498                          * Consume as many characters as match the range of
3499                          * characters given above.
3500                          */
3501                         assert value >= 0: "value must not become negative.";
3502                         if (c >= '0' && c <= '9') {
3503                             seenDigits = true;
3504                             // Avoid overflow
3505                             if (value <= 0x10FFFF) {
3506                                 value *= 16;
3507                                 value += c - '0';
3508                             }
3509                             continue;
3510                         } else if (c >= 'A' && c <= 'F') {
3511                             seenDigits = true;
3512                             // Avoid overflow
3513                             if (value <= 0x10FFFF) {
3514                                 value *= 16;
3515                                 value += c - 'A' + 10;
3516                             }
3517                             continue;
3518                         } else if (c >= 'a' && c <= 'f') {
3519                             seenDigits = true;
3520                             // Avoid overflow
3521                             if (value <= 0x10FFFF) {
3522                                 value *= 16;
3523                                 value += c - 'a' + 10;
3524                             }
3525                             continue;
3526                         } else if (c == ';') {
3527                             if (seenDigits) {
3528                                 if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
3529                                     cstart = pos + 1;
3530                                 }
3531                                 state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos);
3532                                 continue stateloop;
3533                             } else {
3534                                 errNoDigitsInNCR();
3535                                 appendCharRefBuf(';');
3536                                 emitOrAppendCharRefBuf(returnState);
3537                                 if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
3538                                     cstart = pos + 1;
3539                                 }
3540                                 state = transition(state, returnState, reconsume, pos);
3541                                 continue stateloop;
3542                             }
3543                         } else {
3544                             /*
3545                              * If no characters match the range, then don't
3546                              * consume any characters (and unconsume the U+0023
3547                              * NUMBER SIGN character and, if appropriate, the X
3548                              * character). This is a parse error; nothing is
3549                              * returned.
3550                              *
3551                              * Otherwise, if the next character is a U+003B
3552                              * SEMICOLON, consume that too. If it isn't, there
3553                              * is a parse error.
3554                              */
3555                             if (!seenDigits) {
3556                                 errNoDigitsInNCR();
3557                                 emitOrAppendCharRefBuf(returnState);
3558                                 if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
3559                                     cstart = pos;
3560                                 }
3561                                 reconsume = true;
3562                                 state = transition(state, returnState, reconsume, pos);
3563                                 continue stateloop;
3564                             } else {
3565                                 errCharRefLacksSemicolon();
3566                                 if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
3567                                     cstart = pos;
3568                                 }
3569                                 reconsume = true;
3570                                 state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos);
3571                                 continue stateloop;
3572                             }
3573                         }
3574                     }
3575                 case PLAINTEXT:
3576                     plaintextloop: for (;;) {
3577                         if (reconsume) {
3578                             reconsume = false;
3579                         } else {
3580                             if (++pos == endPos) {
3581                                 break stateloop;
3582                             }
3583                             c = checkChar(buf, pos);
3584                         }
3585                         switch (c) {
3586                             case '\u0000':
3587                                 emitPlaintextReplacementCharacter(buf, pos);
3588                                 continue;
3589                             case '\r':
3590                                 emitCarriageReturn(buf, pos);
3591                                 break stateloop;
3592                             case '\n':
3593                                 silentLineFeed();
3594                                 // CPPONLY: MOZ_FALLTHROUGH;
3595                             default:
3596                                 /*
3597                                  * Anything else Emit the current input
3598                                  * character as a character token. Stay in the
3599                                  * RAWTEXT state.
3600                                  */
3601                                 continue;
3602                         }
3603                     }
3604                 case CLOSE_TAG_OPEN:
3605                     if (++pos == endPos) {
3606                         break stateloop;
3607                     }
3608                     c = checkChar(buf, pos);
3609                     /*
3610                      * Otherwise, if the content model flag is set to the PCDATA
3611                      * state, or if the next few characters do match that tag
3612                      * name, consume the next input character:
3613                      */
3614                     switch (c) {
3615                         case '>':
3616                             /* U+003E GREATER-THAN SIGN (>) Parse error. */
3617                             errLtSlashGt();
3618                             /*
3619                              * Switch to the data state.
3620                              */
3621                             cstart = pos + 1;
3622                             state = transition(state, Tokenizer.DATA, reconsume, pos);
3623                             continue stateloop;
3624                         case '\r':
3625                             silentCarriageReturn();
3626                             /* Anything else Parse error. */
3627                             errGarbageAfterLtSlash();
3628                             /*
3629                              * Switch to the bogus comment state.
3630                              */
3631                             clearStrBufBeforeUse();
3632                             appendStrBuf('\n');
3633                             state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
3634                             break stateloop;
3635                         case '\n':
3636                             silentLineFeed();
3637                             /* Anything else Parse error. */
3638                             errGarbageAfterLtSlash();
3639                             /*
3640                              * Switch to the bogus comment state.
3641                              */
3642                             clearStrBufBeforeUse();
3643                             appendStrBuf(c);
3644                             state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
3645                             continue stateloop;
3646                         case '\u0000':
3647                             c = '\uFFFD';
3648                             // CPPONLY: MOZ_FALLTHROUGH;
3649                         default:
3650                             if (c >= 'A' && c <= 'Z') {
3651                                 c += 0x20;
3652                             }
3653                             if (c >= 'a' && c <= 'z') {
3654                                 /*
3655                                  * U+0061 LATIN SMALL LETTER A through to U+007A
3656                                  * LATIN SMALL LETTER Z Create a new end tag
3657                                  * token,
3658                                  */
3659                                 endTag = true;
3660                                 /*
3661                                  * set its tag name to the input character,
3662                                  */
3663                                 clearStrBufBeforeUse();
3664                                 appendStrBuf(c);
3665                                 containsHyphen = false;
3666                                 /*
3667                                  * then switch to the tag name state. (Don't
3668                                  * emit the token yet; further details will be
3669                                  * filled in before it is emitted.)
3670                                  */
3671                                 state = transition(state, Tokenizer.TAG_NAME, reconsume, pos);
3672                                 continue stateloop;
3673                             } else {
3674                                 /* Anything else Parse error. */
3675                                 errGarbageAfterLtSlash();
3676                                 /*
3677                                  * Switch to the bogus comment state.
3678                                  */
3679                                 clearStrBufBeforeUse();
3680                                 appendStrBuf(c);
3681                                 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
3682                                 continue stateloop;
3683                             }
3684                     }
3685                 case RCDATA:
3686                     rcdataloop: for (;;) {
3687                         if (reconsume) {
3688                             reconsume = false;
3689                         } else {
3690                             if (++pos == endPos) {
3691                                 break stateloop;
3692                             }
3693                             c = checkChar(buf, pos);
3694                         }
3695                         switch (c) {
3696                             case '&':
3697                                 /*
3698                                  * U+0026 AMPERSAND (&) Switch to the character
3699                                  * reference in RCDATA state.
3700                                  */
3701                                 flushChars(buf, pos);
3702                                 assert charRefBufLen == 0: "charRefBufLen not reset after previous use!";
3703                                 appendCharRefBuf(c);
3704                                 setAdditionalAndRememberAmpersandLocation('\u0000');
3705                                 returnState = state;
3706                                 state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos);
3707                                 continue stateloop;
3708                             case '<':
3709                                 /*
3710                                  * U+003C LESS-THAN SIGN (<) Switch to the
3711                                  * RCDATA less-than sign state.
3712                                  */
3713                                 flushChars(buf, pos);
3714 
3715                                 returnState = state;
3716                                 state = transition(state, Tokenizer.RAWTEXT_RCDATA_LESS_THAN_SIGN, reconsume, pos);
3717                                 continue stateloop;
3718                             case '\u0000':
3719                                 emitReplacementCharacter(buf, pos);
3720                                 continue;
3721                             case '\r':
3722                                 emitCarriageReturn(buf, pos);
3723                                 break stateloop;
3724                             case '\n':
3725                                 silentLineFeed();
3726                                 // CPPONLY: MOZ_FALLTHROUGH;
3727                             default:
3728                                 /*
3729                                  * Emit the current input character as a
3730                                  * character token. Stay in the RCDATA state.
3731                                  */
3732                                 continue;
3733                         }
3734                     }
3735                 case RAWTEXT:
3736                     rawtextloop: for (;;) {
3737                         if (reconsume) {
3738                             reconsume = false;
3739                         } else {
3740                             if (++pos == endPos) {
3741                                 break stateloop;
3742                             }
3743                             c = checkChar(buf, pos);
3744                         }
3745                         switch (c) {
3746                             case '<':
3747                                 /*
3748                                  * U+003C LESS-THAN SIGN (<) Switch to the
3749                                  * RAWTEXT less-than sign state.
3750                                  */
3751                                 flushChars(buf, pos);
3752 
3753                                 returnState = state;
3754                                 state = transition(state, Tokenizer.RAWTEXT_RCDATA_LESS_THAN_SIGN, reconsume, pos);
3755                                 break rawtextloop;
3756                             // FALL THRU continue stateloop;
3757                             case '\u0000':
3758                                 emitReplacementCharacter(buf, pos);
3759                                 continue;
3760                             case '\r':
3761                                 emitCarriageReturn(buf, pos);
3762                                 break stateloop;
3763                             case '\n':
3764                                 silentLineFeed();
3765                                 // CPPONLY: MOZ_FALLTHROUGH;
3766                             default:
3767                                 /*
3768                                  * Emit the current input character as a
3769                                  * character token. Stay in the RAWTEXT state.
3770                                  */
3771                                 continue;
3772                         }
3773                     }
3774                     // CPPONLY: MOZ_FALLTHROUGH;
3775                 case RAWTEXT_RCDATA_LESS_THAN_SIGN:
3776                     rawtextrcdatalessthansignloop: for (;;) {
3777                         if (++pos == endPos) {
3778                             break stateloop;
3779                         }
3780                         c = checkChar(buf, pos);
3781                         switch (c) {
3782                             case '/':
3783                                 /*
3784                                  * U+002F SOLIDUS (/) Set the temporary buffer
3785                                  * to the empty string. Switch to the script
3786                                  * data end tag open state.
3787                                  */
3788                                 index = 0;
3789                                 clearStrBufBeforeUse();
3790                                 state = transition(state, Tokenizer.NON_DATA_END_TAG_NAME, reconsume, pos);
3791                                 break rawtextrcdatalessthansignloop;
3792                             // FALL THRU continue stateloop;
3793                             default:
3794                                 /*
3795                                  * Otherwise, emit a U+003C LESS-THAN SIGN
3796                                  * character token
3797                                  */
3798                                 tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
3799                                 /*
3800                                  * and reconsume the current input character in
3801                                  * the data state.
3802                                  */
3803                                 cstart = pos;
3804                                 reconsume = true;
3805                                 state = transition(state, returnState, reconsume, pos);
3806                                 continue stateloop;
3807                         }
3808                     }
3809                     // CPPONLY: MOZ_FALLTHROUGH;
3810                 case NON_DATA_END_TAG_NAME:
3811                     for (;;) {
3812                         if (++pos == endPos) {
3813                             break stateloop;
3814                         }
3815                         c = checkChar(buf, pos);
3816                         /*
3817                          * ASSERT! when entering this state, set index to 0 and
3818                          * call clearStrBufBeforeUse(); Let's implement the above
3819                          * without lookahead. strBuf is the 'temporary buffer'.
3820                          */
3821                         if (endTagExpectationAsArray == null) {
3822                             tokenHandler.characters(Tokenizer.LT_SOLIDUS,
3823                                     0, 2);
3824                             cstart = pos;
3825                             reconsume = true;
3826                             state = transition(state, returnState, reconsume, pos);
3827                             continue stateloop;
3828                         } else if (index < endTagExpectationAsArray.length) {
3829                             char e = endTagExpectationAsArray[index];
3830                             char folded = c;
3831                             if (c >= 'A' && c <= 'Z') {
3832                                 folded += 0x20;
3833                             }
3834                             if (folded != e) {
3835                                 // [NOCPP[
3836                                 errHtml4LtSlashInRcdata(folded);
3837                                 // ]NOCPP]
3838                                 tokenHandler.characters(Tokenizer.LT_SOLIDUS,
3839                                         0, 2);
3840                                 emitStrBuf();
3841                                 cstart = pos;
3842                                 reconsume = true;
3843                                 state = transition(state, returnState, reconsume, pos);
3844                                 continue stateloop;
3845                             }
3846                             appendStrBuf(c);
3847                             index++;
3848                             continue;
3849                         } else {
3850                             endTag = true;
3851                             // XXX replace contentModelElement with different
3852                             // type
3853                             tagName = endTagExpectation;
3854                             switch (c) {
3855                                 case '\r':
3856                                     silentCarriageReturn();
3857                                     clearStrBufAfterUse(); // strBuf not used
3858                                     state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
3859                                     break stateloop;
3860                                 case '\n':
3861                                     silentLineFeed();
3862                                     // CPPONLY: MOZ_FALLTHROUGH;
3863                                 case ' ':
3864                                 case '\t':
3865                                 case '\u000C':
3866                                     /*
3867                                      * U+0009 CHARACTER TABULATION U+000A LINE
3868                                      * FEED (LF) U+000C FORM FEED (FF) U+0020
3869                                      * SPACE If the current end tag token is an
3870                                      * appropriate end tag token, then switch to
3871                                      * the before attribute name state.
3872                                      */
3873                                     clearStrBufAfterUse(); // strBuf not used
3874                                     state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
3875                                     continue stateloop;
3876                                 case '/':
3877                                     /*
3878                                      * U+002F SOLIDUS (/) If the current end tag
3879                                      * token is an appropriate end tag token,
3880                                      * then switch to the self-closing start tag
3881                                      * state.
3882                                      */
3883                                     clearStrBufAfterUse(); // strBuf not used
3884                                     state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);
3885                                     continue stateloop;
3886                                 case '>':
3887                                     /*
3888                                      * U+003E GREATER-THAN SIGN (>) If the
3889                                      * current end tag token is an appropriate
3890                                      * end tag token, then emit the current tag
3891                                      * token and switch to the data state.
3892                                      */
3893                                     clearStrBufAfterUse(); // strBuf not used
3894                                     state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
3895                                     if (shouldSuspend) {
3896                                         break stateloop;
3897                                     }
3898                                     continue stateloop;
3899                                 default:
3900                                     /*
3901                                      * Emit a U+003C LESS-THAN SIGN character
3902                                      * token, a U+002F SOLIDUS character token,
3903                                      * a character token for each of the
3904                                      * characters in the temporary buffer (in
3905                                      * the order they were added to the buffer),
3906                                      * and reconsume the current input character
3907                                      * in the RAWTEXT state.
3908                                      */
3909                                     // [NOCPP[
3910                                     errWarnLtSlashInRcdata();
3911                                     // ]NOCPP]
3912                                     tokenHandler.characters(
3913                                             Tokenizer.LT_SOLIDUS, 0, 2);
3914                                     emitStrBuf();
3915                                     cstart = pos; // don't drop the
3916                                                   // character
3917                                     reconsume = true;
3918                                     state = transition(state, returnState, reconsume, pos);
3919                                     continue stateloop;
3920                             }
3921                         }
3922                     }
3923                     // BEGIN HOTSPOT WORKAROUND
3924                 case BOGUS_COMMENT:
3925                     boguscommentloop: for (;;) {
3926                         if (reconsume) {
3927                             reconsume = false;
3928                         } else {
3929                             if (++pos == endPos) {
3930                                 break stateloop;
3931                             }
3932                             c = checkChar(buf, pos);
3933                         }
3934                         /*
3935                          * Consume every character up to and including the first
3936                          * U+003E GREATER-THAN SIGN character (>) or the end of
3937                          * the file (EOF), whichever comes first. Emit a comment
3938                          * token whose data is the concatenation of all the
3939                          * characters starting from and including the character
3940                          * that caused the state machine to switch into the
3941                          * bogus comment state, up to and including the
3942                          * character immediately before the last consumed
3943                          * character (i.e. up to the character just before the
3944                          * U+003E or EOF character). (If the comment was started
3945                          * by the end of the file (EOF), the token is empty.)
3946                          *
3947                          * Switch to the data state.
3948                          *
3949                          * If the end of the file was reached, reconsume the EOF
3950                          * character.
3951                          */
3952                         switch (c) {
3953                             case '>':
3954                                 emitComment(0, pos);
3955                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
3956                                 continue stateloop;
3957                             case '-':
3958                                 appendStrBuf(c);
3959                                 state = transition(state, Tokenizer.BOGUS_COMMENT_HYPHEN, reconsume, pos);
3960                                 break boguscommentloop;
3961                             case '\r':
3962                                 appendStrBufCarriageReturn();
3963                                 break stateloop;
3964                             case '\n':
3965                                 appendStrBufLineFeed();
3966                                 continue;
3967                             case '\u0000':
3968                                 c = '\uFFFD';
3969                                 // CPPONLY: MOZ_FALLTHROUGH;
3970                             default:
3971                                 appendStrBuf(c);
3972                                 continue;
3973                         }
3974                     }
3975                     // CPPONLY: MOZ_FALLTHROUGH;
3976                 case BOGUS_COMMENT_HYPHEN:
3977                     boguscommenthyphenloop: for (;;) {
3978                         if (++pos == endPos) {
3979                             break stateloop;
3980                         }
3981                         c = checkChar(buf, pos);
3982                         switch (c) {
3983                             case '>':
3984                                 // [NOCPP[
3985                                 maybeAppendSpaceToBogusComment();
3986                                 // ]NOCPP]
3987                                 emitComment(0, pos);
3988                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
3989                                 continue stateloop;
3990                             case '-':
3991                                 appendSecondHyphenToBogusComment();
3992                                 continue boguscommenthyphenloop;
3993                             case '\r':
3994                                 appendStrBufCarriageReturn();
3995                                 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
3996                                 break stateloop;
3997                             case '\n':
3998                                 appendStrBufLineFeed();
3999                                 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
4000                                 continue stateloop;
4001                             case '\u0000':
4002                                 c = '\uFFFD';
4003                                 // CPPONLY: MOZ_FALLTHROUGH;
4004                             default:
4005                                 appendStrBuf(c);
4006                                 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
4007                                 continue stateloop;
4008                         }
4009                     }
4010                 case SCRIPT_DATA:
4011                     scriptdataloop: for (;;) {
4012                         if (reconsume) {
4013                             reconsume = false;
4014                         } else {
4015                             if (++pos == endPos) {
4016                                 break stateloop;
4017                             }
4018                             c = checkChar(buf, pos);
4019                         }
4020                         switch (c) {
4021                             case '<':
4022                                 /*
4023                                  * U+003C LESS-THAN SIGN (<) Switch to the
4024                                  * script data less-than sign state.
4025                                  */
4026                                 flushChars(buf, pos);
4027                                 returnState = state;
4028                                 state = transition(state, Tokenizer.SCRIPT_DATA_LESS_THAN_SIGN, reconsume, pos);
4029                                 break scriptdataloop; // FALL THRU continue
4030                             // stateloop;
4031                             case '\u0000':
4032                                 emitReplacementCharacter(buf, pos);
4033                                 continue;
4034                             case '\r':
4035                                 emitCarriageReturn(buf, pos);
4036                                 break stateloop;
4037                             case '\n':
4038                                 silentLineFeed();
4039                                 // CPPONLY: MOZ_FALLTHROUGH;
4040                             default:
4041                                 /*
4042                                  * Anything else Emit the current input
4043                                  * character as a character token. Stay in the
4044                                  * script data state.
4045                                  */
4046                                 continue;
4047                         }
4048                     }
4049                     // CPPONLY: MOZ_FALLTHROUGH;
4050                 case SCRIPT_DATA_LESS_THAN_SIGN:
4051                     scriptdatalessthansignloop: for (;;) {
4052                         if (++pos == endPos) {
4053                             break stateloop;
4054                         }
4055                         c = checkChar(buf, pos);
4056                         switch (c) {
4057                             case '/':
4058                                 /*
4059                                  * U+002F SOLIDUS (/) Set the temporary buffer
4060                                  * to the empty string. Switch to the script
4061                                  * data end tag open state.
4062                                  */
4063                                 index = 0;
4064                                 clearStrBufBeforeUse();
4065                                 state = transition(state, Tokenizer.NON_DATA_END_TAG_NAME, reconsume, pos);
4066                                 continue stateloop;
4067                             case '!':
4068                                 tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
4069                                 cstart = pos;
4070                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPE_START, reconsume, pos);
4071                                 break scriptdatalessthansignloop; // FALL THRU
4072                             // continue
4073                             // stateloop;
4074                             default:
4075                                 /*
4076                                  * Otherwise, emit a U+003C LESS-THAN SIGN
4077                                  * character token
4078                                  */
4079                                 tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
4080                                 /*
4081                                  * and reconsume the current input character in
4082                                  * the data state.
4083                                  */
4084                                 cstart = pos;
4085                                 reconsume = true;
4086                                 state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos);
4087                                 continue stateloop;
4088                         }
4089                     }
4090                     // CPPONLY: MOZ_FALLTHROUGH;
4091                 case SCRIPT_DATA_ESCAPE_START:
4092                     scriptdataescapestartloop: for (;;) {
4093                         if (++pos == endPos) {
4094                             break stateloop;
4095                         }
4096                         c = checkChar(buf, pos);
4097                         /*
4098                          * Consume the next input character:
4099                          */
4100                         switch (c) {
4101                             case '-':
4102                                 /*
4103                                  * U+002D HYPHEN-MINUS (-) Emit a U+002D
4104                                  * HYPHEN-MINUS character token. Switch to the
4105                                  * script data escape start dash state.
4106                                  */
4107                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPE_START_DASH, reconsume, pos);
4108                                 break scriptdataescapestartloop; // FALL THRU
4109                             // continue
4110                             // stateloop;
4111                             default:
4112                                 /*
4113                                  * Anything else Reconsume the current input
4114                                  * character in the script data state.
4115                                  */
4116                                 reconsume = true;
4117                                 state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos);
4118                                 continue stateloop;
4119                         }
4120                     }
4121                     // CPPONLY: MOZ_FALLTHROUGH;
4122                 case SCRIPT_DATA_ESCAPE_START_DASH:
4123                     scriptdataescapestartdashloop: for (;;) {
4124                         if (++pos == endPos) {
4125                             break stateloop;
4126                         }
4127                         c = checkChar(buf, pos);
4128                         /*
4129                          * Consume the next input character:
4130                          */
4131                         switch (c) {
4132                             case '-':
4133                                 /*
4134                                  * U+002D HYPHEN-MINUS (-) Emit a U+002D
4135                                  * HYPHEN-MINUS character token. Switch to the
4136                                  * script data escaped dash dash state.
4137                                  */
4138                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_DASH_DASH, reconsume, pos);
4139                                 break scriptdataescapestartdashloop;
4140                             // continue stateloop;
4141                             default:
4142                                 /*
4143                                  * Anything else Reconsume the current input
4144                                  * character in the script data state.
4145                                  */
4146                                 reconsume = true;
4147                                 state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos);
4148                                 continue stateloop;
4149                         }
4150                     }
4151                     // CPPONLY: MOZ_FALLTHROUGH;
4152                 case SCRIPT_DATA_ESCAPED_DASH_DASH:
4153                     scriptdataescapeddashdashloop: for (;;) {
4154                         if (++pos == endPos) {
4155                             break stateloop;
4156                         }
4157                         c = checkChar(buf, pos);
4158                         /*
4159                          * Consume the next input character:
4160                          */
4161                         switch (c) {
4162                             case '-':
4163                                 /*
4164                                  * U+002D HYPHEN-MINUS (-) Emit a U+002D
4165                                  * HYPHEN-MINUS character token. Stay in the
4166                                  * script data escaped dash dash state.
4167                                  */
4168                                 continue;
4169                             case '<':
4170                                 /*
4171                                  * U+003C LESS-THAN SIGN (<) Switch to the
4172                                  * script data escaped less-than sign state.
4173                                  */
4174                                 flushChars(buf, pos);
4175                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN, reconsume, pos);
4176                                 continue stateloop;
4177                             case '>':
4178                                 /*
4179                                  * U+003E GREATER-THAN SIGN (>) Emit a U+003E
4180                                  * GREATER-THAN SIGN character token. Switch to
4181                                  * the script data state.
4182                                  */
4183                                 state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos);
4184                                 continue stateloop;
4185                             case '\u0000':
4186                                 emitReplacementCharacter(buf, pos);
4187                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
4188                                 break scriptdataescapeddashdashloop;
4189                             case '\r':
4190                                 emitCarriageReturn(buf, pos);
4191                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
4192                                 break stateloop;
4193                             case '\n':
4194                                 silentLineFeed();
4195                                 // CPPONLY: MOZ_FALLTHROUGH;
4196                             default:
4197                                 /*
4198                                  * Anything else Emit the current input
4199                                  * character as a character token. Switch to the
4200                                  * script data escaped state.
4201                                  */
4202                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
4203                                 break scriptdataescapeddashdashloop;
4204                             // continue stateloop;
4205                         }
4206                     }
4207                     // CPPONLY: MOZ_FALLTHROUGH;
4208                 case SCRIPT_DATA_ESCAPED:
4209                     scriptdataescapedloop: for (;;) {
4210                         if (reconsume) {
4211                             reconsume = false;
4212                         } else {
4213                             if (++pos == endPos) {
4214                                 break stateloop;
4215                             }
4216                             c = checkChar(buf, pos);
4217                         }
4218                         /*
4219                          * Consume the next input character:
4220                          */
4221                         switch (c) {
4222                             case '-':
4223                                 /*
4224                                  * U+002D HYPHEN-MINUS (-) Emit a U+002D
4225                                  * HYPHEN-MINUS character token. Switch to the
4226                                  * script data escaped dash state.
4227                                  */
4228                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_DASH, reconsume, pos);
4229                                 break scriptdataescapedloop; // FALL THRU
4230                             // continue
4231                             // stateloop;
4232                             case '<':
4233                                 /*
4234                                  * U+003C LESS-THAN SIGN (<) Switch to the
4235                                  * script data escaped less-than sign state.
4236                                  */
4237                                 flushChars(buf, pos);
4238                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN, reconsume, pos);
4239                                 continue stateloop;
4240                             case '\u0000':
4241                                 emitReplacementCharacter(buf, pos);
4242                                 continue;
4243                             case '\r':
4244                                 emitCarriageReturn(buf, pos);
4245                                 break stateloop;
4246                             case '\n':
4247                                 silentLineFeed();
4248                                 // CPPONLY: MOZ_FALLTHROUGH;
4249                             default:
4250                                 /*
4251                                  * Anything else Emit the current input
4252                                  * character as a character token. Stay in the
4253                                  * script data escaped state.
4254                                  */
4255                                 continue;
4256                         }
4257                     }
4258                     // CPPONLY: MOZ_FALLTHROUGH;
4259                 case SCRIPT_DATA_ESCAPED_DASH:
4260                     scriptdataescapeddashloop: for (;;) {
4261                         if (++pos == endPos) {
4262                             break stateloop;
4263                         }
4264                         c = checkChar(buf, pos);
4265                         /*
4266                          * Consume the next input character:
4267                          */
4268                         switch (c) {
4269                             case '-':
4270                                 /*
4271                                  * U+002D HYPHEN-MINUS (-) Emit a U+002D
4272                                  * HYPHEN-MINUS character token. Switch to the
4273                                  * script data escaped dash dash state.
4274                                  */
4275                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_DASH_DASH, reconsume, pos);
4276                                 continue stateloop;
4277                             case '<':
4278                                 /*
4279                                  * U+003C LESS-THAN SIGN (<) Switch to the
4280                                  * script data escaped less-than sign state.
4281                                  */
4282                                 flushChars(buf, pos);
4283                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN, reconsume, pos);
4284                                 break scriptdataescapeddashloop;
4285                             // continue stateloop;
4286                             case '\u0000':
4287                                 emitReplacementCharacter(buf, pos);
4288                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
4289                                 continue stateloop;
4290                             case '\r':
4291                                 emitCarriageReturn(buf, pos);
4292                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
4293                                 break stateloop;
4294                             case '\n':
4295                                 silentLineFeed();
4296                                 // CPPONLY: MOZ_FALLTHROUGH;
4297                             default:
4298                                 /*
4299                                  * Anything else Emit the current input
4300                                  * character as a character token. Switch to the
4301                                  * script data escaped state.
4302                                  */
4303                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
4304                                 continue stateloop;
4305                         }
4306                     }
4307                     // CPPONLY: MOZ_FALLTHROUGH;
4308                 case SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN:
4309                     scriptdataescapedlessthanloop: for (;;) {
4310                         if (++pos == endPos) {
4311                             break stateloop;
4312                         }
4313                         c = checkChar(buf, pos);
4314                         /*
4315                          * Consume the next input character:
4316                          */
4317                         switch (c) {
4318                             case '/':
4319                                 /*
4320                                  * U+002F SOLIDUS (/) Set the temporary buffer
4321                                  * to the empty string. Switch to the script
4322                                  * data escaped end tag open state.
4323                                  */
4324                                 index = 0;
4325                                 clearStrBufBeforeUse();
4326                                 returnState = Tokenizer.SCRIPT_DATA_ESCAPED;
4327                                 state = transition(state, Tokenizer.NON_DATA_END_TAG_NAME, reconsume, pos);
4328                                 continue stateloop;
4329                             case 'S':
4330                             case 's':
4331                                 /*
4332                                  * U+0041 LATIN CAPITAL LETTER A through to
4333                                  * U+005A LATIN CAPITAL LETTER Z Emit a U+003C
4334                                  * LESS-THAN SIGN character token and the
4335                                  * current input character as a character token.
4336                                  */
4337                                 tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
4338                                 cstart = pos;
4339                                 index = 1;
4340                                 /*
4341                                  * Set the temporary buffer to the empty string.
4342                                  * Append the lowercase version of the current
4343                                  * input character (add 0x0020 to the
4344                                  * character's code point) to the temporary
4345                                  * buffer. Switch to the script data double
4346                                  * escape start state.
4347                                  */
4348                                 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPE_START, reconsume, pos);
4349                                 break scriptdataescapedlessthanloop;
4350                             // continue stateloop;
4351                             default:
4352                                 /*
4353                                  * Anything else Emit a U+003C LESS-THAN SIGN
4354                                  * character token and reconsume the current
4355                                  * input character in the script data escaped
4356                                  * state.
4357                                  */
4358                                 tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
4359                                 cstart = pos;
4360                                 reconsume = true;
4361                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
4362                                 continue stateloop;
4363                         }
4364                     }
4365                     // CPPONLY: MOZ_FALLTHROUGH;
4366                 case SCRIPT_DATA_DOUBLE_ESCAPE_START:
4367                     scriptdatadoubleescapestartloop: for (;;) {
4368                         if (++pos == endPos) {
4369                             break stateloop;
4370                         }
4371                         c = checkChar(buf, pos);
4372                         assert index > 0;
4373                         if (index < 6) { // SCRIPT_ARR.length
4374                             char folded = c;
4375                             if (c >= 'A' && c <= 'Z') {
4376                                 folded += 0x20;
4377                             }
4378                             if (folded != Tokenizer.SCRIPT_ARR[index]) {
4379                                 reconsume = true;
4380                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
4381                                 continue stateloop;
4382                             }
4383                             index++;
4384                             continue;
4385                         }
4386                         switch (c) {
4387                             case '\r':
4388                                 emitCarriageReturn(buf, pos);
4389                                 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
4390                                 break stateloop;
4391                             case '\n':
4392                                 silentLineFeed();
4393                                 // CPPONLY: MOZ_FALLTHROUGH;
4394                             case ' ':
4395                             case '\t':
4396                             case '\u000C':
4397                             case '/':
4398                             case '>':
4399                                 /*
4400                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED
4401                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE
4402                                  * U+002F SOLIDUS (/) U+003E GREATER-THAN SIGN
4403                                  * (>) Emit the current input character as a
4404                                  * character token. If the temporary buffer is
4405                                  * the string "script", then switch to the
4406                                  * script data double escaped state.
4407                                  */
4408                                 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
4409                                 break scriptdatadoubleescapestartloop;
4410                             // continue stateloop;
4411                             default:
4412                                 /*
4413                                  * Anything else Reconsume the current input
4414                                  * character in the script data escaped state.
4415                                  */
4416                                 reconsume = true;
4417                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
4418                                 continue stateloop;
4419                         }
4420                     }
4421                     // CPPONLY: MOZ_FALLTHROUGH;
4422                 case SCRIPT_DATA_DOUBLE_ESCAPED:
4423                     scriptdatadoubleescapedloop: for (;;) {
4424                         if (reconsume) {
4425                             reconsume = false;
4426                         } else {
4427                             if (++pos == endPos) {
4428                                 break stateloop;
4429                             }
4430                             c = checkChar(buf, pos);
4431                         }
4432                         /*
4433                          * Consume the next input character:
4434                          */
4435                         switch (c) {
4436                             case '-':
4437                                 /*
4438                                  * U+002D HYPHEN-MINUS (-) Emit a U+002D
4439                                  * HYPHEN-MINUS character token. Switch to the
4440                                  * script data double escaped dash state.
4441                                  */
4442                                 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_DASH, reconsume, pos);
4443                                 break scriptdatadoubleescapedloop; // FALL THRU
4444                             // continue
4445                             // stateloop;
4446                             case '<':
4447                                 /*
4448                                  * U+003C LESS-THAN SIGN (<) Emit a U+003C
4449                                  * LESS-THAN SIGN character token. Switch to the
4450                                  * script data double escaped less-than sign
4451                                  * state.
4452                                  */
4453                                 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN, reconsume, pos);
4454                                 continue stateloop;
4455                             case '\u0000':
4456                                 emitReplacementCharacter(buf, pos);
4457                                 continue;
4458                             case '\r':
4459                                 emitCarriageReturn(buf, pos);
4460                                 break stateloop;
4461                             case '\n':
4462                                 silentLineFeed();
4463                                 // CPPONLY: MOZ_FALLTHROUGH;
4464                             default:
4465                                 /*
4466                                  * Anything else Emit the current input
4467                                  * character as a character token. Stay in the
4468                                  * script data double escaped state.
4469                                  */
4470                                 continue;
4471                         }
4472                     }
4473                     // CPPONLY: MOZ_FALLTHROUGH;
4474                 case SCRIPT_DATA_DOUBLE_ESCAPED_DASH:
4475                     scriptdatadoubleescapeddashloop: for (;;) {
4476                         if (++pos == endPos) {
4477                             break stateloop;
4478                         }
4479                         c = checkChar(buf, pos);
4480                         /*
4481                          * Consume the next input character:
4482                          */
4483                         switch (c) {
4484                             case '-':
4485                                 /*
4486                                  * U+002D HYPHEN-MINUS (-) Emit a U+002D
4487                                  * HYPHEN-MINUS character token. Switch to the
4488                                  * script data double escaped dash dash state.
4489                                  */
4490                                 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH, reconsume, pos);
4491                                 break scriptdatadoubleescapeddashloop;
4492                             // continue stateloop;
4493                             case '<':
4494                                 /*
4495                                  * U+003C LESS-THAN SIGN (<) Emit a U+003C
4496                                  * LESS-THAN SIGN character token. Switch to the
4497                                  * script data double escaped less-than sign
4498                                  * state.
4499                                  */
4500                                 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN, reconsume, pos);
4501                                 continue stateloop;
4502                             case '\u0000':
4503                                 emitReplacementCharacter(buf, pos);
4504                                 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
4505                                 continue stateloop;
4506                             case '\r':
4507                                 emitCarriageReturn(buf, pos);
4508                                 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
4509                                 break stateloop;
4510                             case '\n':
4511                                 silentLineFeed();
4512                                 // CPPONLY: MOZ_FALLTHROUGH;
4513                             default:
4514                                 /*
4515                                  * Anything else Emit the current input
4516                                  * character as a character token. Switch to the
4517                                  * script data double escaped state.
4518                                  */
4519                                 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
4520                                 continue stateloop;
4521                         }
4522                     }
4523                     // CPPONLY: MOZ_FALLTHROUGH;
4524                 case SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH:
4525                     scriptdatadoubleescapeddashdashloop: for (;;) {
4526                         if (++pos == endPos) {
4527                             break stateloop;
4528                         }
4529                         c = checkChar(buf, pos);
4530                         /*
4531                          * Consume the next input character:
4532                          */
4533                         switch (c) {
4534                             case '-':
4535                                 /*
4536                                  * U+002D HYPHEN-MINUS (-) Emit a U+002D
4537                                  * HYPHEN-MINUS character token. Stay in the
4538                                  * script data double escaped dash dash state.
4539                                  */
4540                                 continue;
4541                             case '<':
4542                                 /*
4543                                  * U+003C LESS-THAN SIGN (<) Emit a U+003C
4544                                  * LESS-THAN SIGN character token. Switch to the
4545                                  * script data double escaped less-than sign
4546                                  * state.
4547                                  */
4548                                 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN, reconsume, pos);
4549                                 break scriptdatadoubleescapeddashdashloop;
4550                             case '>':
4551                                 /*
4552                                  * U+003E GREATER-THAN SIGN (>) Emit a U+003E
4553                                  * GREATER-THAN SIGN character token. Switch to
4554                                  * the script data state.
4555                                  */
4556                                 state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos);
4557                                 continue stateloop;
4558                             case '\u0000':
4559                                 emitReplacementCharacter(buf, pos);
4560                                 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
4561                                 continue stateloop;
4562                             case '\r':
4563                                 emitCarriageReturn(buf, pos);
4564                                 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
4565                                 break stateloop;
4566                             case '\n':
4567                                 silentLineFeed();
4568                                 // CPPONLY: MOZ_FALLTHROUGH;
4569                             default:
4570                                 /*
4571                                  * Anything else Emit the current input
4572                                  * character as a character token. Switch to the
4573                                  * script data double escaped state.
4574                                  */
4575                                 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
4576                                 continue stateloop;
4577                         }
4578                     }
4579                     // CPPONLY: MOZ_FALLTHROUGH;
4580                 case SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN:
4581                     scriptdatadoubleescapedlessthanloop: for (;;) {
4582                         if (++pos == endPos) {
4583                             break stateloop;
4584                         }
4585                         c = checkChar(buf, pos);
4586                         /*
4587                          * Consume the next input character:
4588                          */
4589                         switch (c) {
4590                             case '/':
4591                                 /*
4592                                  * U+002F SOLIDUS (/) Emit a U+002F SOLIDUS
4593                                  * character token. Set the temporary buffer to
4594                                  * the empty string. Switch to the script data
4595                                  * double escape end state.
4596                                  */
4597                                 index = 0;
4598                                 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPE_END, reconsume, pos);
4599                                 break scriptdatadoubleescapedlessthanloop;
4600                             default:
4601                                 /*
4602                                  * Anything else Reconsume the current input
4603                                  * character in the script data double escaped
4604                                  * state.
4605                                  */
4606                                 reconsume = true;
4607                                 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
4608                                 continue stateloop;
4609                         }
4610                     }
4611                     // CPPONLY: MOZ_FALLTHROUGH;
4612                 case SCRIPT_DATA_DOUBLE_ESCAPE_END:
4613                     scriptdatadoubleescapeendloop: for (;;) {
4614                         if (++pos == endPos) {
4615                             break stateloop;
4616                         }
4617                         c = checkChar(buf, pos);
4618                         if (index < 6) { // SCRIPT_ARR.length
4619                             char folded = c;
4620                             if (c >= 'A' && c <= 'Z') {
4621                                 folded += 0x20;
4622                             }
4623                             if (folded != Tokenizer.SCRIPT_ARR[index]) {
4624                                 reconsume = true;
4625                                 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
4626                                 continue stateloop;
4627                             }
4628                             index++;
4629                             continue;
4630                         }
4631                         switch (c) {
4632                             case '\r':
4633                                 emitCarriageReturn(buf, pos);
4634                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
4635                                 break stateloop;
4636                             case '\n':
4637                                 silentLineFeed();
4638                                 // CPPONLY: MOZ_FALLTHROUGH;
4639                             case ' ':
4640                             case '\t':
4641                             case '\u000C':
4642                             case '/':
4643                             case '>':
4644                                 /*
4645                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED
4646                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE
4647                                  * U+002F SOLIDUS (/) U+003E GREATER-THAN SIGN
4648                                  * (>) Emit the current input character as a
4649                                  * character token. If the temporary buffer is
4650                                  * the string "script", then switch to the
4651                                  * script data escaped state.
4652                                  */
4653                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
4654                                 continue stateloop;
4655                             default:
4656                                 /*
4657                                  * Reconsume the current input character in the
4658                                  * script data double escaped state.
4659                                  */
4660                                 reconsume = true;
4661                                 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
4662                                 continue stateloop;
4663                         }
4664                     }
4665                 case MARKUP_DECLARATION_OCTYPE:
4666                     markupdeclarationdoctypeloop: for (;;) {
4667                         if (++pos == endPos) {
4668                             break stateloop;
4669                         }
4670                         c = checkChar(buf, pos);
4671                         if (index < 6) { // OCTYPE.length
4672                             char folded = c;
4673                             if (c >= 'A' && c <= 'Z') {
4674                                 folded += 0x20;
4675                             }
4676                             if (folded == Tokenizer.OCTYPE[index]) {
4677                                 appendStrBuf(c);
4678                             } else {
4679                                 errBogusComment();
4680                                 reconsume = true;
4681                                 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
4682                                 continue stateloop;
4683                             }
4684                             index++;
4685                             continue;
4686                         } else {
4687                             reconsume = true;
4688                             state = transition(state, Tokenizer.DOCTYPE, reconsume, pos);
4689                             break markupdeclarationdoctypeloop;
4690                             // continue stateloop;
4691                         }
4692                     }
4693                     // CPPONLY: MOZ_FALLTHROUGH;
4694                 case DOCTYPE:
4695                     doctypeloop: for (;;) {
4696                         if (reconsume) {
4697                             reconsume = false;
4698                         } else {
4699                             if (++pos == endPos) {
4700                                 break stateloop;
4701                             }
4702                             c = checkChar(buf, pos);
4703                         }
4704                         initDoctypeFields();
4705                         /*
4706                          * Consume the next input character:
4707                          */
4708                         switch (c) {
4709                             case '\r':
4710                                 silentCarriageReturn();
4711                                 state = transition(state, Tokenizer.BEFORE_DOCTYPE_NAME, reconsume, pos);
4712                                 break stateloop;
4713                             case '\n':
4714                                 silentLineFeed();
4715                                 // CPPONLY: MOZ_FALLTHROUGH;
4716                             case ' ':
4717                             case '\t':
4718                             case '\u000C':
4719                                 /*
4720                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED
4721                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE
4722                                  * Switch to the before DOCTYPE name state.
4723                                  */
4724                                 state = transition(state, Tokenizer.BEFORE_DOCTYPE_NAME, reconsume, pos);
4725                                 break doctypeloop;
4726                             // continue stateloop;
4727                             default:
4728                                 /*
4729                                  * Anything else Parse error.
4730                                  */
4731                                 errMissingSpaceBeforeDoctypeName();
4732                                 /*
4733                                  * Reconsume the current character in the before
4734                                  * DOCTYPE name state.
4735                                  */
4736                                 reconsume = true;
4737                                 state = transition(state, Tokenizer.BEFORE_DOCTYPE_NAME, reconsume, pos);
4738                                 break doctypeloop;
4739                             // continue stateloop;
4740                         }
4741                     }
4742                     // CPPONLY: MOZ_FALLTHROUGH;
4743                 case BEFORE_DOCTYPE_NAME:
4744                     beforedoctypenameloop: for (;;) {
4745                         if (reconsume) {
4746                             reconsume = false;
4747                         } else {
4748                             if (++pos == endPos) {
4749                                 break stateloop;
4750                             }
4751                             c = checkChar(buf, pos);
4752                         }
4753                         /*
4754                          * Consume the next input character:
4755                          */
4756                         switch (c) {
4757                             case '\r':
4758                                 silentCarriageReturn();
4759                                 break stateloop;
4760                             case '\n':
4761                                 silentLineFeed();
4762                                 // CPPONLY: MOZ_FALLTHROUGH;
4763                             case ' ':
4764                             case '\t':
4765                             case '\u000C':
4766                                 /*
4767                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED
4768                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
4769                                  * in the before DOCTYPE name state.
4770                                  */
4771                                 continue;
4772                             case '>':
4773                                 /*
4774                                  * U+003E GREATER-THAN SIGN (>) Parse error.
4775                                  */
4776                                 errNamelessDoctype();
4777                                 /*
4778                                  * Create a new DOCTYPE token. Set its
4779                                  * force-quirks flag to on.
4780                                  */
4781                                 forceQuirks = true;
4782                                 /*
4783                                  * Emit the token.
4784                                  */
4785                                 emitDoctypeToken(pos);
4786                                 /*
4787                                  * Switch to the data state.
4788                                  */
4789                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
4790                                 continue stateloop;
4791                             case '\u0000':
4792                                 c = '\uFFFD';
4793                                 // CPPONLY: MOZ_FALLTHROUGH;
4794                             default:
4795                                 if (c >= 'A' && c <= 'Z') {
4796                                     /*
4797                                      * U+0041 LATIN CAPITAL LETTER A through to
4798                                      * U+005A LATIN CAPITAL LETTER Z Create a
4799                                      * new DOCTYPE token. Set the token's name
4800                                      * to the lowercase version of the input
4801                                      * character (add 0x0020 to the character's
4802                                      * code point).
4803                                      */
4804                                     c += 0x20;
4805                                 }
4806                                 /* Anything else Create a new DOCTYPE token. */
4807                                 /*
                                 * Set the token's name to the current
                                 * input character.
4810                                  */
4811                                 clearStrBufBeforeUse();
4812                                 appendStrBuf(c);
4813                                 /*
4814                                  * Switch to the DOCTYPE name state.
4815                                  */
4816                                 state = transition(state, Tokenizer.DOCTYPE_NAME, reconsume, pos);
4817                                 break beforedoctypenameloop;
4818                             // continue stateloop;
4819                         }
4820                     }
4821                     // CPPONLY: MOZ_FALLTHROUGH;
4822                 case DOCTYPE_NAME:
4823                     doctypenameloop: for (;;) {
4824                         if (++pos == endPos) {
4825                             break stateloop;
4826                         }
4827                         c = checkChar(buf, pos);
4828                         /*
4829                          * Consume the next input character:
4830                          */
4831                         switch (c) {
4832                             case '\r':
4833                                 silentCarriageReturn();
4834                                 strBufToDoctypeName();
4835                                 state = transition(state, Tokenizer.AFTER_DOCTYPE_NAME, reconsume, pos);
4836                                 break stateloop;
4837                             case '\n':
4838                                 silentLineFeed();
4839                                 // CPPONLY: MOZ_FALLTHROUGH;
4840                             case ' ':
4841                             case '\t':
4842                             case '\u000C':
4843                                 /*
4844                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED
4845                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE
4846                                  * Switch to the after DOCTYPE name state.
4847                                  */
4848                                 strBufToDoctypeName();
4849                                 state = transition(state, Tokenizer.AFTER_DOCTYPE_NAME, reconsume, pos);
4850                                 break doctypenameloop;
4851                             // continue stateloop;
4852                             case '>':
4853                                 /*
4854                                  * U+003E GREATER-THAN SIGN (>) Emit the current
4855                                  * DOCTYPE token.
4856                                  */
4857                                 strBufToDoctypeName();
4858                                 emitDoctypeToken(pos);
4859                                 /*
4860                                  * Switch to the data state.
4861                                  */
4862                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
4863                                 continue stateloop;
4864                             case '\u0000':
4865                                 c = '\uFFFD';
4866                                 // CPPONLY: MOZ_FALLTHROUGH;
4867                             default:
4868                                 /*
4869                                  * U+0041 LATIN CAPITAL LETTER A through to
4870                                  * U+005A LATIN CAPITAL LETTER Z Append the
4871                                  * lowercase version of the input character (add
4872                                  * 0x0020 to the character's code point) to the
4873                                  * current DOCTYPE token's name.
4874                                  */
4875                                 if (c >= 'A' && c <= 'Z') {
4876                                     c += 0x0020;
4877                                 }
4878                                 /*
4879                                  * Anything else Append the current input
4880                                  * character to the current DOCTYPE token's
4881                                  * name.
4882                                  */
4883                                 appendStrBuf(c);
4884                                 /*
4885                                  * Stay in the DOCTYPE name state.
4886                                  */
4887                                 continue;
4888                         }
4889                     }
4890                     // CPPONLY: MOZ_FALLTHROUGH;
4891                 case AFTER_DOCTYPE_NAME:
4892                     afterdoctypenameloop: for (;;) {
4893                         if (++pos == endPos) {
4894                             break stateloop;
4895                         }
4896                         c = checkChar(buf, pos);
4897                         /*
4898                          * Consume the next input character:
4899                          */
4900                         switch (c) {
4901                             case '\r':
4902                                 silentCarriageReturn();
4903                                 break stateloop;
4904                             case '\n':
4905                                 silentLineFeed();
4906                                 // CPPONLY: MOZ_FALLTHROUGH;
4907                             case ' ':
4908                             case '\t':
4909                             case '\u000C':
4910                                 /*
4911                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED
4912                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
4913                                  * in the after DOCTYPE name state.
4914                                  */
4915                                 continue;
4916                             case '>':
4917                                 /*
4918                                  * U+003E GREATER-THAN SIGN (>) Emit the current
4919                                  * DOCTYPE token.
4920                                  */
4921                                 emitDoctypeToken(pos);
4922                                 /*
4923                                  * Switch to the data state.
4924                                  */
4925                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
4926                                 continue stateloop;
4927                             case 'p':
4928                             case 'P':
4929                                 index = 0;
4930                                 state = transition(state, Tokenizer.DOCTYPE_UBLIC, reconsume, pos);
4931                                 break afterdoctypenameloop;
4932                             // continue stateloop;
4933                             case 's':
4934                             case 'S':
4935                                 index = 0;
4936                                 state = transition(state, Tokenizer.DOCTYPE_YSTEM, reconsume, pos);
4937                                 continue stateloop;
4938                             default:
4939                                 /*
4940                                  * Otherwise, this is the parse error.
4941                                  */
4942                                 bogusDoctype();
4943 
4944                                 /*
4945                                  * Set the DOCTYPE token's force-quirks flag to
4946                                  * on.
4947                                  */
4948                                 // done by bogusDoctype();
4949                                 /*
4950                                  * Switch to the bogus DOCTYPE state.
4951                                  */
4952                                 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
4953                                 continue stateloop;
4954                         }
4955                     }
4956                     // CPPONLY: MOZ_FALLTHROUGH;
4957                 case DOCTYPE_UBLIC:
4958                     doctypeublicloop: for (;;) {
4959                         if (++pos == endPos) {
4960                             break stateloop;
4961                         }
4962                         c = checkChar(buf, pos);
4963                         /*
4964                          * If the six characters starting from the current input
4965                          * character are an ASCII case-insensitive match for the
4966                          * word "PUBLIC", then consume those characters and
4967                          * switch to the before DOCTYPE public identifier state.
4968                          */
4969                         if (index < 5) { // UBLIC.length
4970                             char folded = c;
4971                             if (c >= 'A' && c <= 'Z') {
4972                                 folded += 0x20;
4973                             }
4974                             if (folded != Tokenizer.UBLIC[index]) {
4975                                 bogusDoctype();
4976                                 // forceQuirks = true;
4977                                 reconsume = true;
4978                                 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
4979                                 continue stateloop;
4980                             }
4981                             index++;
4982                             continue;
4983                         } else {
4984                             reconsume = true;
4985                             state = transition(state, Tokenizer.AFTER_DOCTYPE_PUBLIC_KEYWORD, reconsume, pos);
4986                             break doctypeublicloop;
4987                             // continue stateloop;
4988                         }
4989                     }
4990                     // CPPONLY: MOZ_FALLTHROUGH;
4991                 case AFTER_DOCTYPE_PUBLIC_KEYWORD:
4992                     afterdoctypepublickeywordloop: for (;;) {
4993                         if (reconsume) {
4994                             reconsume = false;
4995                         } else {
4996                             if (++pos == endPos) {
4997                                 break stateloop;
4998                             }
4999                             c = checkChar(buf, pos);
5000                         }
5001                         /*
5002                          * Consume the next input character:
5003                          */
5004                         switch (c) {
5005                             case '\r':
5006                                 silentCarriageReturn();
5007                                 state = transition(state, Tokenizer.BEFORE_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos);
5008                                 break stateloop;
5009                             case '\n':
5010                                 silentLineFeed();
5011                                 // CPPONLY: MOZ_FALLTHROUGH;
5012                             case ' ':
5013                             case '\t':
5014                             case '\u000C':
5015                                 /*
5016                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED
5017                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE
5018                                  * Switch to the before DOCTYPE public
5019                                  * identifier state.
5020                                  */
5021                                 state = transition(state, Tokenizer.BEFORE_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos);
5022                                 break afterdoctypepublickeywordloop;
5023                             // FALL THROUGH continue stateloop
5024                             case '"':
5025                                 /*
5026                                  * U+0022 QUOTATION MARK (") Parse Error.
5027                                  */
5028                                 errNoSpaceBetweenDoctypePublicKeywordAndQuote();
5029                                 /*
5030                                  * Set the DOCTYPE token's public identifier to
5031                                  * the empty string (not missing),
5032                                  */
5033                                 clearStrBufBeforeUse();
5034                                 /*
5035                                  * then switch to the DOCTYPE public identifier
5036                                  * (double-quoted) state.
5037                                  */
5038                                 state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);
5039                                 continue stateloop;
5040                             case '\'':
5041                                 /*
5042                                  * U+0027 APOSTROPHE (') Parse Error.
5043                                  */
5044                                 errNoSpaceBetweenDoctypePublicKeywordAndQuote();
5045                                 /*
5046                                  * Set the DOCTYPE token's public identifier to
5047                                  * the empty string (not missing),
5048                                  */
5049                                 clearStrBufBeforeUse();
5050                                 /*
5051                                  * then switch to the DOCTYPE public identifier
5052                                  * (single-quoted) state.
5053                                  */
5054                                 state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);
5055                                 continue stateloop;
5056                             case '>':
5057                                 /* U+003E GREATER-THAN SIGN (>) Parse error. */
5058                                 errExpectedPublicId();
5059                                 /*
5060                                  * Set the DOCTYPE token's force-quirks flag to
5061                                  * on.
5062                                  */
5063                                 forceQuirks = true;
5064                                 /*
5065                                  * Emit that DOCTYPE token.
5066                                  */
5067                                 emitDoctypeToken(pos);
5068                                 /*
5069                                  * Switch to the data state.
5070                                  */
5071                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
5072                                 continue stateloop;
5073                             default:
5074                                 bogusDoctype();
5075                                 /*
5076                                  * Set the DOCTYPE token's force-quirks flag to
5077                                  * on.
5078                                  */
5079                                 // done by bogusDoctype();
5080                                 /*
5081                                  * Switch to the bogus DOCTYPE state.
5082                                  */
5083                                 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
5084                                 continue stateloop;
5085                         }
5086                     }
5087                     // CPPONLY: MOZ_FALLTHROUGH;
5088                 case BEFORE_DOCTYPE_PUBLIC_IDENTIFIER:
5089                     beforedoctypepublicidentifierloop: for (;;) {
5090                         if (++pos == endPos) {
5091                             break stateloop;
5092                         }
5093                         c = checkChar(buf, pos);
5094                         /*
5095                          * Consume the next input character:
5096                          */
5097                         switch (c) {
5098                             case '\r':
5099                                 silentCarriageReturn();
5100                                 break stateloop;
5101                             case '\n':
5102                                 silentLineFeed();
5103                                 // CPPONLY: MOZ_FALLTHROUGH;
5104                             case ' ':
5105                             case '\t':
5106                             case '\u000C':
5107                                 /*
5108                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED
5109                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
5110                                  * in the before DOCTYPE public identifier
5111                                  * state.
5112                                  */
5113                                 continue;
5114                             case '"':
5115                                 /*
5116                                  * U+0022 QUOTATION MARK (") Set the DOCTYPE
5117                                  * token's public identifier to the empty string
5118                                  * (not missing),
5119                                  */
5120                                 clearStrBufBeforeUse();
5121                                 /*
5122                                  * then switch to the DOCTYPE public identifier
5123                                  * (double-quoted) state.
5124                                  */
5125                                 state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);
5126                                 break beforedoctypepublicidentifierloop;
5127                             // continue stateloop;
5128                             case '\'':
5129                                 /*
5130                                  * U+0027 APOSTROPHE (') Set the DOCTYPE token's
5131                                  * public identifier to the empty string (not
5132                                  * missing),
5133                                  */
5134                                 clearStrBufBeforeUse();
5135                                 /*
5136                                  * then switch to the DOCTYPE public identifier
5137                                  * (single-quoted) state.
5138                                  */
5139                                 state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);
5140                                 continue stateloop;
5141                             case '>':
5142                                 /* U+003E GREATER-THAN SIGN (>) Parse error. */
5143                                 errExpectedPublicId();
5144                                 /*
5145                                  * Set the DOCTYPE token's force-quirks flag to
5146                                  * on.
5147                                  */
5148                                 forceQuirks = true;
5149                                 /*
5150                                  * Emit that DOCTYPE token.
5151                                  */
5152                                 emitDoctypeToken(pos);
5153                                 /*
5154                                  * Switch to the data state.
5155                                  */
5156                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
5157                                 continue stateloop;
5158                             default:
5159                                 bogusDoctype();
5160                                 /*
5161                                  * Set the DOCTYPE token's force-quirks flag to
5162                                  * on.
5163                                  */
5164                                 // done by bogusDoctype();
5165                                 /*
5166                                  * Switch to the bogus DOCTYPE state.
5167                                  */
5168                                 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
5169                                 continue stateloop;
5170                         }
5171                     }
5172                     // CPPONLY: MOZ_FALLTHROUGH;
5173                 case DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED:
5174                     doctypepublicidentifierdoublequotedloop: for (;;) {
5175                         if (++pos == endPos) {
5176                             break stateloop;
5177                         }
5178                         c = checkChar(buf, pos);
5179                         /*
5180                          * Consume the next input character:
5181                          */
5182                         switch (c) {
5183                             case '"':
5184                                 /*
5185                                  * U+0022 QUOTATION MARK (") Switch to the after
5186                                  * DOCTYPE public identifier state.
5187                                  */
5188                                 publicIdentifier = strBufToString();
5189                                 state = transition(state, Tokenizer.AFTER_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos);
5190                                 break doctypepublicidentifierdoublequotedloop;
5191                             // continue stateloop;
5192                             case '>':
5193                                 /*
5194                                  * U+003E GREATER-THAN SIGN (>) Parse error.
5195                                  */
5196                                 errGtInPublicId();
5197                                 /*
5198                                  * Set the DOCTYPE token's force-quirks flag to
5199                                  * on.
5200                                  */
5201                                 forceQuirks = true;
5202                                 /*
5203                                  * Emit that DOCTYPE token.
5204                                  */
5205                                 publicIdentifier = strBufToString();
5206                                 emitDoctypeToken(pos);
5207                                 /*
5208                                  * Switch to the data state.
5209                                  */
5210                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
5211                                 continue stateloop;
5212                             case '\r':
5213                                 appendStrBufCarriageReturn();
5214                                 break stateloop;
5215                             case '\n':
5216                                 appendStrBufLineFeed();
5217                                 continue;
5218                             case '\u0000':
5219                                 c = '\uFFFD';
5220                                 // CPPONLY: MOZ_FALLTHROUGH;
5221                             default:
5222                                 /*
5223                                  * Anything else Append the current input
5224                                  * character to the current DOCTYPE token's
5225                                  * public identifier.
5226                                  */
5227                                 appendStrBuf(c);
5228                                 /*
5229                                  * Stay in the DOCTYPE public identifier
5230                                  * (double-quoted) state.
5231                                  */
5232                                 continue;
5233                         }
5234                     }
5235                     // CPPONLY: MOZ_FALLTHROUGH;
5236                 case AFTER_DOCTYPE_PUBLIC_IDENTIFIER:
5237                     afterdoctypepublicidentifierloop: for (;;) {
5238                         if (++pos == endPos) {
5239                             break stateloop;
5240                         }
5241                         c = checkChar(buf, pos);
5242                         /*
5243                          * Consume the next input character:
5244                          */
5245                         switch (c) {
5246                             case '\r':
5247                                 silentCarriageReturn();
5248                                 state = transition(state, Tokenizer.BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS, reconsume, pos);
5249                                 break stateloop;
5250                             case '\n':
5251                                 silentLineFeed();
5252                                 // CPPONLY: MOZ_FALLTHROUGH;
5253                             case ' ':
5254                             case '\t':
5255                             case '\u000C':
5256                                 /*
5257                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED
5258                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE
5259                                  * Switch to the between DOCTYPE public and
5260                                  * system identifiers state.
5261                                  */
5262                                 state = transition(state, Tokenizer.BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS, reconsume, pos);
5263                                 break afterdoctypepublicidentifierloop;
5264                             // continue stateloop;
5265                             case '>':
5266                                 /*
5267                                  * U+003E GREATER-THAN SIGN (>) Emit the current
5268                                  * DOCTYPE token.
5269                                  */
5270                                 emitDoctypeToken(pos);
5271                                 /*
5272                                  * Switch to the data state.
5273                                  */
5274                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
5275                                 continue stateloop;
5276                             case '"':
5277                                 /*
5278                                  * U+0022 QUOTATION MARK (") Parse error.
5279                                  */
5280                                 errNoSpaceBetweenPublicAndSystemIds();
5281                                 /*
5282                                  * Set the DOCTYPE token's system identifier to
5283                                  * the empty string (not missing),
5284                                  */
5285                                 clearStrBufBeforeUse();
5286                                 /*
5287                                  * then switch to the DOCTYPE system identifier
5288                                  * (double-quoted) state.
5289                                  */
5290                                 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);
5291                                 continue stateloop;
5292                             case '\'':
5293                                 /*
5294                                  * U+0027 APOSTROPHE (') Parse error.
5295                                  */
5296                                 errNoSpaceBetweenPublicAndSystemIds();
5297                                 /*
5298                                  * Set the DOCTYPE token's system identifier to
5299                                  * the empty string (not missing),
5300                                  */
5301                                 clearStrBufBeforeUse();
5302                                 /*
5303                                  * then switch to the DOCTYPE system identifier
5304                                  * (single-quoted) state.
5305                                  */
5306                                 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);
5307                                 continue stateloop;
5308                             default:
5309                                 bogusDoctype();
5310                                 /*
5311                                  * Set the DOCTYPE token's force-quirks flag to
5312                                  * on.
5313                                  */
5314                                 // done by bogusDoctype();
5315                                 /*
5316                                  * Switch to the bogus DOCTYPE state.
5317                                  */
5318                                 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
5319                                 continue stateloop;
5320                         }
5321                     }
5322                     // CPPONLY: MOZ_FALLTHROUGH;
5323                 case BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS:
5324                     betweendoctypepublicandsystemidentifiersloop: for (;;) {
5325                         if (++pos == endPos) {
5326                             break stateloop;
5327                         }
5328                         c = checkChar(buf, pos);
5329                         /*
5330                          * Consume the next input character:
5331                          */
5332                         switch (c) {
5333                             case '\r':
5334                                 silentCarriageReturn();
5335                                 break stateloop;
5336                             case '\n':
5337                                 silentLineFeed();
5338                                 // CPPONLY: MOZ_FALLTHROUGH;
5339                             case ' ':
5340                             case '\t':
5341                             case '\u000C':
5342                                 /*
5343                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED
5344                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
5345                                  * in the between DOCTYPE public and system
5346                                  * identifiers state.
5347                                  */
5348                                 continue;
5349                             case '>':
5350                                 /*
5351                                  * U+003E GREATER-THAN SIGN (>) Emit the current
5352                                  * DOCTYPE token.
5353                                  */
5354                                 emitDoctypeToken(pos);
5355                                 /*
5356                                  * Switch to the data state.
5357                                  */
5358                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
5359                                 continue stateloop;
5360                             case '"':
5361                                 /*
5362                                  * U+0022 QUOTATION MARK (") Set the DOCTYPE
5363                                  * token's system identifier to the empty string
5364                                  * (not missing),
5365                                  */
5366                                 clearStrBufBeforeUse();
5367                                 /*
5368                                  * then switch to the DOCTYPE system identifier
5369                                  * (double-quoted) state.
5370                                  */
5371                                 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);
5372                                 break betweendoctypepublicandsystemidentifiersloop;
5373                             // continue stateloop;
5374                             case '\'':
5375                                 /*
5376                                  * U+0027 APOSTROPHE (') Set the DOCTYPE token's
5377                                  * system identifier to the empty string (not
5378                                  * missing),
5379                                  */
5380                                 clearStrBufBeforeUse();
5381                                 /*
5382                                  * then switch to the DOCTYPE system identifier
5383                                  * (single-quoted) state.
5384                                  */
5385                                 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);
5386                                 continue stateloop;
5387                             default:
5388                                 bogusDoctype();
5389                                 /*
5390                                  * Set the DOCTYPE token's force-quirks flag to
5391                                  * on.
5392                                  */
5393                                 // done by bogusDoctype();
5394                                 /*
5395                                  * Switch to the bogus DOCTYPE state.
5396                                  */
5397                                 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
5398                                 continue stateloop;
5399                         }
5400                     }
5401                     // CPPONLY: MOZ_FALLTHROUGH;
5402                 case DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED:
5403                     doctypesystemidentifierdoublequotedloop: for (;;) {
5404                         if (++pos == endPos) {
5405                             break stateloop;
5406                         }
5407                         c = checkChar(buf, pos);
5408                         /*
5409                          * Consume the next input character:
5410                          */
5411                         switch (c) {
5412                             case '"':
5413                                 /*
5414                                  * U+0022 QUOTATION MARK (") Switch to the after
5415                                  * DOCTYPE system identifier state.
5416                                  */
5417                                 systemIdentifier = strBufToString();
5418                                 state = transition(state, Tokenizer.AFTER_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos);
5419                                 continue stateloop;
5420                             case '>':
5421                                 /*
5422                                  * U+003E GREATER-THAN SIGN (>) Parse error.
5423                                  */
5424                                 errGtInSystemId();
5425                                 /*
5426                                  * Set the DOCTYPE token's force-quirks flag to
5427                                  * on.
5428                                  */
5429                                 forceQuirks = true;
5430                                 /*
5431                                  * Emit that DOCTYPE token.
5432                                  */
5433                                 systemIdentifier = strBufToString();
5434                                 emitDoctypeToken(pos);
5435                                 /*
5436                                  * Switch to the data state.
5437                                  */
5438                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
5439                                 continue stateloop;
5440                             case '\r':
5441                                 appendStrBufCarriageReturn();
5442                                 break stateloop;
5443                             case '\n':
5444                                 appendStrBufLineFeed();
5445                                 continue;
5446                             case '\u0000':
5447                                 c = '\uFFFD';
5448                                 // CPPONLY: MOZ_FALLTHROUGH;
5449                             default:
5450                                 /*
5451                                  * Anything else Append the current input
5452                                  * character to the current DOCTYPE token's
5453                                  * system identifier.
5454                                  */
5455                                 appendStrBuf(c);
5456                                 /*
5457                                  * Stay in the DOCTYPE system identifier
5458                                  * (double-quoted) state.
5459                                  */
5460                                 continue;
5461                         }
5462                     }
5463                 case AFTER_DOCTYPE_SYSTEM_IDENTIFIER:
5464                     afterdoctypesystemidentifierloop: for (;;) {
5465                         if (++pos == endPos) {
5466                             break stateloop;
5467                         }
5468                         c = checkChar(buf, pos);
5469                         /*
5470                          * Consume the next input character:
5471                          */
5472                         switch (c) {
5473                             case '\r':
5474                                 silentCarriageReturn();
5475                                 break stateloop;
5476                             case '\n':
5477                                 silentLineFeed();
5478                                 // CPPONLY: MOZ_FALLTHROUGH;
5479                             case ' ':
5480                             case '\t':
5481                             case '\u000C':
5482                                 /*
5483                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED
5484                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
5485                                  * in the after DOCTYPE system identifier state.
5486                                  */
5487                                 continue;
5488                             case '>':
5489                                 /*
5490                                  * U+003E GREATER-THAN SIGN (>) Emit the current
5491                                  * DOCTYPE token.
5492                                  */
5493                                 emitDoctypeToken(pos);
5494                                 /*
5495                                  * Switch to the data state.
5496                                  */
5497                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
5498                                 continue stateloop;
5499                             default:
5500                                 /*
5501                                  * Switch to the bogus DOCTYPE state. (This does
5502                                  * not set the DOCTYPE token's force-quirks flag
5503                                  * to on.)
5504                                  */
5505                                 bogusDoctypeWithoutQuirks();
5506                                 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
5507                                 break afterdoctypesystemidentifierloop;
5508                             // continue stateloop;
5509                         }
5510                     }
5511                     // CPPONLY: MOZ_FALLTHROUGH;
5512                 case BOGUS_DOCTYPE:
5513                     for (;;) {
5514                         if (reconsume) {
5515                             reconsume = false;
5516                         } else {
5517                             if (++pos == endPos) {
5518                                 break stateloop;
5519                             }
5520                             c = checkChar(buf, pos);
5521                         }
5522                         /*
5523                          * Consume the next input character:
5524                          */
5525                         switch (c) {
5526                             case '>':
5527                                 /*
5528                                  * U+003E GREATER-THAN SIGN (>) Emit that
5529                                  * DOCTYPE token.
5530                                  */
5531                                 emitDoctypeToken(pos);
5532                                 /*
5533                                  * Switch to the data state.
5534                                  */
5535                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
5536                                 continue stateloop;
5537                             case '\r':
5538                                 silentCarriageReturn();
5539                                 break stateloop;
5540                             case '\n':
5541                                 silentLineFeed();
5542                                 // CPPONLY: MOZ_FALLTHROUGH;
5543                             default:
5544                                 /*
5545                                  * Anything else Stay in the bogus DOCTYPE
5546                                  * state.
5547                                  */
5548                                 continue;
5549                         }
5550                     }
5551                 case DOCTYPE_YSTEM:
5552                     doctypeystemloop: for (;;) {
5553                         if (++pos == endPos) {
5554                             break stateloop;
5555                         }
5556                         c = checkChar(buf, pos);
5557                         /*
5558                          * Otherwise, if the six characters starting from the
5559                          * current input character are an ASCII case-insensitive
5560                          * match for the word "SYSTEM", then consume those
5561                          * characters and switch to the before DOCTYPE system
5562                          * identifier state.
5563                          */
5564                         if (index < 5) { // YSTEM.length
5565                             char folded = c;
5566                             if (c >= 'A' && c <= 'Z') {
5567                                 folded += 0x20;
5568                             }
5569                             if (folded != Tokenizer.YSTEM[index]) {
5570                                 bogusDoctype();
5571                                 reconsume = true;
5572                                 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
5573                                 continue stateloop;
5574                             }
5575                             index++;
5576                             continue stateloop;
5577                         } else {
5578                             reconsume = true;
5579                             state = transition(state, Tokenizer.AFTER_DOCTYPE_SYSTEM_KEYWORD, reconsume, pos);
5580                             break doctypeystemloop;
5581                             // continue stateloop;
5582                         }
5583                     }
5584                     // CPPONLY: MOZ_FALLTHROUGH;
5585                 case AFTER_DOCTYPE_SYSTEM_KEYWORD:
5586                     afterdoctypesystemkeywordloop: for (;;) {
5587                         if (reconsume) {
5588                             reconsume = false;
5589                         } else {
5590                             if (++pos == endPos) {
5591                                 break stateloop;
5592                             }
5593                             c = checkChar(buf, pos);
5594                         }
5595                         /*
5596                          * Consume the next input character:
5597                          */
5598                         switch (c) {
5599                             case '\r':
5600                                 silentCarriageReturn();
5601                                 state = transition(state, Tokenizer.BEFORE_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos);
5602                                 break stateloop;
5603                             case '\n':
5604                                 silentLineFeed();
5605                                 // CPPONLY: MOZ_FALLTHROUGH;
5606                             case ' ':
5607                             case '\t':
5608                             case '\u000C':
5609                                 /*
5610                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED
5611                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE
5612                                  * Switch to the before DOCTYPE public
5613                                  * identifier state.
5614                                  */
5615                                 state = transition(state, Tokenizer.BEFORE_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos);
5616                                 break afterdoctypesystemkeywordloop;
5617                             // FALL THROUGH continue stateloop
5618                             case '"':
5619                                 /*
5620                                  * U+0022 QUOTATION MARK (") Parse Error.
5621                                  */
5622                                 errNoSpaceBetweenDoctypeSystemKeywordAndQuote();
5623                                 /*
5624                                  * Set the DOCTYPE token's system identifier to
5625                                  * the empty string (not missing),
5626                                  */
5627                                 clearStrBufBeforeUse();
5628                                 /*
5629                                  * then switch to the DOCTYPE public identifier
5630                                  * (double-quoted) state.
5631                                  */
5632                                 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);
5633                                 continue stateloop;
5634                             case '\'':
5635                                 /*
5636                                  * U+0027 APOSTROPHE (') Parse Error.
5637                                  */
5638                                 errNoSpaceBetweenDoctypeSystemKeywordAndQuote();
5639                                 /*
5640                                  * Set the DOCTYPE token's public identifier to
5641                                  * the empty string (not missing),
5642                                  */
5643                                 clearStrBufBeforeUse();
5644                                 /*
5645                                  * then switch to the DOCTYPE public identifier
5646                                  * (single-quoted) state.
5647                                  */
5648                                 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);
5649                                 continue stateloop;
5650                             case '>':
5651                                 /* U+003E GREATER-THAN SIGN (>) Parse error. */
5652                                 errExpectedPublicId();
5653                                 /*
5654                                  * Set the DOCTYPE token's force-quirks flag to
5655                                  * on.
5656                                  */
5657                                 forceQuirks = true;
5658                                 /*
5659                                  * Emit that DOCTYPE token.
5660                                  */
5661                                 emitDoctypeToken(pos);
5662                                 /*
5663                                  * Switch to the data state.
5664                                  */
5665                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
5666                                 continue stateloop;
5667                             default:
5668                                 bogusDoctype();
5669                                 /*
5670                                  * Set the DOCTYPE token's force-quirks flag to
5671                                  * on.
5672                                  */
5673                                 // done by bogusDoctype();
5674                                 /*
5675                                  * Switch to the bogus DOCTYPE state.
5676                                  */
5677                                 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
5678                                 continue stateloop;
5679                         }
5680                     }
5681                     // CPPONLY: MOZ_FALLTHROUGH;
5682                 case BEFORE_DOCTYPE_SYSTEM_IDENTIFIER:
5683                     beforedoctypesystemidentifierloop: for (;;) {
5684                         if (++pos == endPos) {
5685                             break stateloop;
5686                         }
5687                         c = checkChar(buf, pos);
5688                         /*
5689                          * Consume the next input character:
5690                          */
5691                         switch (c) {
5692                             case '\r':
5693                                 silentCarriageReturn();
5694                                 break stateloop;
5695                             case '\n':
5696                                 silentLineFeed();
5697                                 // CPPONLY: MOZ_FALLTHROUGH;
5698                             case ' ':
5699                             case '\t':
5700                             case '\u000C':
5701                                 /*
5702                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED
5703                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
5704                                  * in the before DOCTYPE system identifier
5705                                  * state.
5706                                  */
5707                                 continue;
5708                             case '"':
5709                                 /*
5710                                  * U+0022 QUOTATION MARK (") Set the DOCTYPE
5711                                  * token's system identifier to the empty string
5712                                  * (not missing),
5713                                  */
5714                                 clearStrBufBeforeUse();
5715                                 /*
5716                                  * then switch to the DOCTYPE system identifier
5717                                  * (double-quoted) state.
5718                                  */
5719                                 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);
5720                                 continue stateloop;
5721                             case '\'':
5722                                 /*
5723                                  * U+0027 APOSTROPHE (') Set the DOCTYPE token's
5724                                  * system identifier to the empty string (not
5725                                  * missing),
5726                                  */
5727                                 clearStrBufBeforeUse();
5728                                 /*
5729                                  * then switch to the DOCTYPE system identifier
5730                                  * (single-quoted) state.
5731                                  */
5732                                 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);
5733                                 break beforedoctypesystemidentifierloop;
5734                             // continue stateloop;
5735                             case '>':
5736                                 /* U+003E GREATER-THAN SIGN (>) Parse error. */
5737                                 errExpectedSystemId();
5738                                 /*
5739                                  * Set the DOCTYPE token's force-quirks flag to
5740                                  * on.
5741                                  */
5742                                 forceQuirks = true;
5743                                 /*
5744                                  * Emit that DOCTYPE token.
5745                                  */
5746                                 emitDoctypeToken(pos);
5747                                 /*
5748                                  * Switch to the data state.
5749                                  */
5750                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
5751                                 continue stateloop;
5752                             default:
5753                                 bogusDoctype();
5754                                 /*
5755                                  * Set the DOCTYPE token's force-quirks flag to
5756                                  * on.
5757                                  */
5758                                 // done by bogusDoctype();
5759                                 /*
5760                                  * Switch to the bogus DOCTYPE state.
5761                                  */
5762                                 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
5763                                 continue stateloop;
5764                         }
5765                     }
5766                     // CPPONLY: MOZ_FALLTHROUGH;
5767                 case DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED:
5768                     for (;;) {
5769                         if (++pos == endPos) {
5770                             break stateloop;
5771                         }
5772                         c = checkChar(buf, pos);
5773                         /*
5774                          * Consume the next input character:
5775                          */
5776                         switch (c) {
5777                             case '\'':
5778                                 /*
5779                                  * U+0027 APOSTROPHE (') Switch to the after
5780                                  * DOCTYPE system identifier state.
5781                                  */
5782                                 systemIdentifier = strBufToString();
5783                                 state = transition(state, Tokenizer.AFTER_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos);
5784                                 continue stateloop;
5785                             case '>':
5786                                 errGtInSystemId();
5787                                 /*
5788                                  * Set the DOCTYPE token's force-quirks flag to
5789                                  * on.
5790                                  */
5791                                 forceQuirks = true;
5792                                 /*
5793                                  * Emit that DOCTYPE token.
5794                                  */
5795                                 systemIdentifier = strBufToString();
5796                                 emitDoctypeToken(pos);
5797                                 /*
5798                                  * Switch to the data state.
5799                                  */
5800                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
5801                                 continue stateloop;
5802                             case '\r':
5803                                 appendStrBufCarriageReturn();
5804                                 break stateloop;
5805                             case '\n':
5806                                 appendStrBufLineFeed();
5807                                 continue;
5808                             case '\u0000':
5809                                 c = '\uFFFD';
5810                                 // CPPONLY: MOZ_FALLTHROUGH;
5811                             default:
5812                                 /*
5813                                  * Anything else Append the current input
5814                                  * character to the current DOCTYPE token's
5815                                  * system identifier.
5816                                  */
5817                                 appendStrBuf(c);
5818                                 /*
5819                                  * Stay in the DOCTYPE system identifier
5820                                  * (double-quoted) state.
5821                                  */
5822                                 continue;
5823                         }
5824                     }
5825                 case DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED:
5826                     for (;;) {
5827                         if (++pos == endPos) {
5828                             break stateloop;
5829                         }
5830                         c = checkChar(buf, pos);
5831                         /*
5832                          * Consume the next input character:
5833                          */
5834                         switch (c) {
5835                             case '\'':
5836                                 /*
5837                                  * U+0027 APOSTROPHE (') Switch to the after
5838                                  * DOCTYPE public identifier state.
5839                                  */
5840                                 publicIdentifier = strBufToString();
5841                                 state = transition(state, Tokenizer.AFTER_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos);
5842                                 continue stateloop;
5843                             case '>':
5844                                 errGtInPublicId();
5845                                 /*
5846                                  * Set the DOCTYPE token's force-quirks flag to
5847                                  * on.
5848                                  */
5849                                 forceQuirks = true;
5850                                 /*
5851                                  * Emit that DOCTYPE token.
5852                                  */
5853                                 publicIdentifier = strBufToString();
5854                                 emitDoctypeToken(pos);
5855                                 /*
5856                                  * Switch to the data state.
5857                                  */
5858                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
5859                                 continue stateloop;
5860                             case '\r':
5861                                 appendStrBufCarriageReturn();
5862                                 break stateloop;
5863                             case '\n':
5864                                 appendStrBufLineFeed();
5865                                 continue;
5866                             case '\u0000':
5867                                 c = '\uFFFD';
5868                                 // CPPONLY: MOZ_FALLTHROUGH;
5869                             default:
5870                                 /*
5871                                  * Anything else Append the current input
5872                                  * character to the current DOCTYPE token's
5873                                  * public identifier.
5874                                  */
5875                                 appendStrBuf(c);
5876                                 /*
5877                                  * Stay in the DOCTYPE public identifier
5878                                  * (single-quoted) state.
5879                                  */
5880                                 continue;
5881                         }
5882                     }
5883                 case PROCESSING_INSTRUCTION:
5884                     processinginstructionloop: for (;;) {
5885                         if (++pos == endPos) {
5886                             break stateloop;
5887                         }
5888                         c = checkChar(buf, pos);
5889                         switch (c) {
5890                             case '?':
5891                                 state = transition(
5892                                         state,
5893                                         Tokenizer.PROCESSING_INSTRUCTION_QUESTION_MARK,
5894                                         reconsume, pos);
5895                                 break processinginstructionloop;
5896                             // continue stateloop;
5897                             default:
5898                                 continue;
5899                         }
5900                     }
5901                     // CPPONLY: MOZ_FALLTHROUGH;
5902                 case PROCESSING_INSTRUCTION_QUESTION_MARK:
5903                     if (++pos == endPos) {
5904                         break stateloop;
5905                     }
5906                     c = checkChar(buf, pos);
5907                     switch (c) {
5908                         case '>':
5909                             state = transition(state, Tokenizer.DATA,
5910                                     reconsume, pos);
5911                             continue stateloop;
5912                         default:
5913                             state = transition(state,
5914                                     Tokenizer.PROCESSING_INSTRUCTION,
5915                                     reconsume, pos);
5916                             continue stateloop;
5917                     }
5918                     // END HOTSPOT WORKAROUND
5919             }
5920         }
5921         flushChars(buf, pos);
5922         /*
5923          * if (prevCR && pos != endPos) { // why is this needed? pos--; col--; }
5924          */
5925         // Save locals
5926         stateSave = state;
5927         returnStateSave = returnState;
5928         return pos;
5929     }
5930 
5931     // HOTSPOT WORKAROUND INSERTION POINT
5932 
5933     // [NOCPP[
5934 
transition(int from, int to, boolean reconsume, int pos)5935     protected int transition(int from, int to, boolean reconsume, int pos) throws SAXException {
5936         return to;
5937     }
5938 
5939     // ]NOCPP]
5940 
initDoctypeFields()5941     private void initDoctypeFields() {
5942         // Discard the characters "DOCTYPE" accumulated as a potential bogus
5943         // comment into strBuf.
5944         clearStrBufAfterUse();
5945         doctypeName = "";
5946         if (systemIdentifier != null) {
5947             Portability.releaseString(systemIdentifier);
5948             systemIdentifier = null;
5949         }
5950         if (publicIdentifier != null) {
5951             Portability.releaseString(publicIdentifier);
5952             publicIdentifier = null;
5953         }
5954         forceQuirks = false;
5955     }
5956 
adjustDoubleHyphenAndAppendToStrBufCarriageReturn()5957     @Inline private void adjustDoubleHyphenAndAppendToStrBufCarriageReturn()
5958             throws SAXException {
5959         silentCarriageReturn();
5960         adjustDoubleHyphenAndAppendToStrBufAndErr('\n');
5961     }
5962 
adjustDoubleHyphenAndAppendToStrBufLineFeed()5963     @Inline private void adjustDoubleHyphenAndAppendToStrBufLineFeed()
5964             throws SAXException {
5965         silentLineFeed();
5966         adjustDoubleHyphenAndAppendToStrBufAndErr('\n');
5967     }
5968 
appendStrBufLineFeed()5969     @Inline private void appendStrBufLineFeed() {
5970         silentLineFeed();
5971         appendStrBuf('\n');
5972     }
5973 
appendStrBufCarriageReturn()5974     @Inline private void appendStrBufCarriageReturn() {
5975         silentCarriageReturn();
5976         appendStrBuf('\n');
5977     }
5978 
silentCarriageReturn()5979     @Inline protected void silentCarriageReturn() {
5980         ++line;
5981         lastCR = true;
5982     }
5983 
silentLineFeed()5984     @Inline protected void silentLineFeed() {
5985         ++line;
5986     }
5987 
emitCarriageReturn(@oLength char[] buf, int pos)5988     private void emitCarriageReturn(@NoLength char[] buf, int pos)
5989             throws SAXException {
5990         silentCarriageReturn();
5991         flushChars(buf, pos);
5992         tokenHandler.characters(Tokenizer.LF, 0, 1);
5993         cstart = Integer.MAX_VALUE;
5994     }
5995 
emitReplacementCharacter(@oLength char[] buf, int pos)5996     private void emitReplacementCharacter(@NoLength char[] buf, int pos)
5997             throws SAXException {
5998         flushChars(buf, pos);
5999         tokenHandler.zeroOriginatingReplacementCharacter();
6000         cstart = pos + 1;
6001     }
6002 
emitPlaintextReplacementCharacter(@oLength char[] buf, int pos)6003     private void emitPlaintextReplacementCharacter(@NoLength char[] buf, int pos)
6004             throws SAXException {
6005         flushChars(buf, pos);
6006         tokenHandler.characters(REPLACEMENT_CHARACTER, 0, 1);
6007         cstart = pos + 1;
6008     }
6009 
setAdditionalAndRememberAmpersandLocation(char add)6010     private void setAdditionalAndRememberAmpersandLocation(char add) {
6011         additional = add;
6012         // [NOCPP[
6013         ampersandLocation = new LocatorImpl(this);
6014         // ]NOCPP]
6015     }
6016 
bogusDoctype()6017     private void bogusDoctype() throws SAXException {
6018         errBogusDoctype();
6019         forceQuirks = true;
6020     }
6021 
bogusDoctypeWithoutQuirks()6022     private void bogusDoctypeWithoutQuirks() throws SAXException {
6023         errBogusDoctype();
6024         forceQuirks = false;
6025     }
6026 
handleNcrValue(int returnState)6027     private void handleNcrValue(int returnState) throws SAXException {
6028         /*
6029          * If one or more characters match the range, then take them all and
6030          * interpret the string of characters as a number (either hexadecimal or
6031          * decimal as appropriate).
6032          */
6033         if (value <= 0xFFFF) {
6034             if (value >= 0x80 && value <= 0x9f) {
6035                 /*
6036                  * If that number is one of the numbers in the first column of
6037                  * the following table, then this is a parse error.
6038                  */
6039                 errNcrInC1Range();
6040                 /*
6041                  * Find the row with that number in the first column, and return
6042                  * a character token for the Unicode character given in the
6043                  * second column of that row.
6044                  */
6045                 @NoLength char[] val = NamedCharacters.WINDOWS_1252[value - 0x80];
6046                 emitOrAppendOne(val, returnState);
6047                 // [NOCPP[
6048             } else if (value == 0xC
6049                     && contentSpacePolicy != XmlViolationPolicy.ALLOW) {
6050                 if (contentSpacePolicy == XmlViolationPolicy.ALTER_INFOSET) {
6051                     emitOrAppendOne(Tokenizer.SPACE, returnState);
6052                 } else if (contentSpacePolicy == XmlViolationPolicy.FATAL) {
6053                     fatal("A character reference expanded to a form feed which is not legal XML 1.0 white space.");
6054                 }
6055                 // ]NOCPP]
6056             } else if (value == 0x0) {
6057                 errNcrZero();
6058                 emitOrAppendOne(Tokenizer.REPLACEMENT_CHARACTER, returnState);
6059             } else if ((value & 0xF800) == 0xD800) {
6060                 errNcrSurrogate();
6061                 emitOrAppendOne(Tokenizer.REPLACEMENT_CHARACTER, returnState);
6062             } else {
6063                 /*
6064                  * Otherwise, return a character token for the Unicode character
6065                  * whose code point is that number.
6066                  */
6067                 char ch = (char) value;
6068                 // [NOCPP[
6069                 if (value == 0x0D) {
6070                     errNcrCr();
6071                 } else if ((value <= 0x0008) || (value == 0x000B)
6072                         || (value >= 0x000E && value <= 0x001F)) {
6073                     ch = errNcrControlChar(ch);
6074                 } else if (value >= 0xFDD0 && value <= 0xFDEF) {
6075                     errNcrUnassigned();
6076                 } else if ((value & 0xFFFE) == 0xFFFE) {
6077                     ch = errNcrNonCharacter(ch);
6078                 } else if (value >= 0x007F && value <= 0x009F) {
6079                     errNcrControlChar();
6080                 } else {
6081                     maybeWarnPrivateUse(ch);
6082                 }
6083                 // ]NOCPP]
6084                 bmpChar[0] = ch;
6085                 emitOrAppendOne(bmpChar, returnState);
6086             }
6087         } else if (value <= 0x10FFFF) {
6088             // [NOCPP[
6089             maybeWarnPrivateUseAstral();
6090             if ((value & 0xFFFE) == 0xFFFE) {
6091                 errAstralNonCharacter(value);
6092             }
6093             // ]NOCPP]
6094             astralChar[0] = (char) (Tokenizer.LEAD_OFFSET + (value >> 10));
6095             astralChar[1] = (char) (0xDC00 + (value & 0x3FF));
6096             emitOrAppendTwo(astralChar, returnState);
6097         } else {
6098             errNcrOutOfRange();
6099             emitOrAppendOne(Tokenizer.REPLACEMENT_CHARACTER, returnState);
6100         }
6101     }
6102 
eof()6103     public void eof() throws SAXException {
6104         int state = stateSave;
6105         int returnState = returnStateSave;
6106 
6107         eofloop: for (;;) {
6108             switch (state) {
6109                 case SCRIPT_DATA_LESS_THAN_SIGN:
6110                 case SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN:
6111                     /*
6112                      * Otherwise, emit a U+003C LESS-THAN SIGN character token
6113                      */
6114                     tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
6115                     /*
6116                      * and reconsume the current input character in the data
6117                      * state.
6118                      */
6119                     break eofloop;
6120                 case TAG_OPEN:
6121                     /*
6122                      * The behavior of this state depends on the content model
6123                      * flag.
6124                      */
6125                     /*
6126                      * Anything else Parse error.
6127                      */
6128                     errEofAfterLt();
6129                     /*
6130                      * Emit a U+003C LESS-THAN SIGN character token
6131                      */
6132                     tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
6133                     /*
6134                      * and reconsume the current input character in the data
6135                      * state.
6136                      */
6137                     break eofloop;
6138                 case RAWTEXT_RCDATA_LESS_THAN_SIGN:
6139                     /*
6140                      * Emit a U+003C LESS-THAN SIGN character token
6141                      */
6142                     tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
6143                     /*
6144                      * and reconsume the current input character in the RCDATA
6145                      * state.
6146                      */
6147                     break eofloop;
6148                 case NON_DATA_END_TAG_NAME:
6149                     /*
6150                      * Emit a U+003C LESS-THAN SIGN character token, a U+002F
6151                      * SOLIDUS character token,
6152                      */
6153                     tokenHandler.characters(Tokenizer.LT_SOLIDUS, 0, 2);
6154                     /*
6155                      * a character token for each of the characters in the
6156                      * temporary buffer (in the order they were added to the
6157                      * buffer),
6158                      */
6159                     emitStrBuf();
6160                     /*
6161                      * and reconsume the current input character in the RCDATA
6162                      * state.
6163                      */
6164                     break eofloop;
6165                 case CLOSE_TAG_OPEN:
6166                     /* EOF Parse error. */
6167                     errEofAfterLt();
6168                     /*
6169                      * Emit a U+003C LESS-THAN SIGN character token and a U+002F
6170                      * SOLIDUS character token.
6171                      */
6172                     tokenHandler.characters(Tokenizer.LT_SOLIDUS, 0, 2);
6173                     /*
6174                      * Reconsume the EOF character in the data state.
6175                      */
6176                     break eofloop;
6177                 case TAG_NAME:
6178                     /*
6179                      * EOF Parse error.
6180                      */
6181                     errEofInTagName();
6182                     /*
6183                      * Reconsume the EOF character in the data state.
6184                      */
6185                     break eofloop;
6186                 case BEFORE_ATTRIBUTE_NAME:
6187                 case AFTER_ATTRIBUTE_VALUE_QUOTED:
6188                 case SELF_CLOSING_START_TAG:
6189                     /* EOF Parse error. */
6190                     errEofWithoutGt();
6191                     /*
6192                      * Reconsume the EOF character in the data state.
6193                      */
6194                     break eofloop;
6195                 case ATTRIBUTE_NAME:
6196                     /*
6197                      * EOF Parse error.
6198                      */
6199                     errEofInAttributeName();
6200                     /*
6201                      * Reconsume the EOF character in the data state.
6202                      */
6203                     break eofloop;
6204                 case AFTER_ATTRIBUTE_NAME:
6205                 case BEFORE_ATTRIBUTE_VALUE:
6206                     /* EOF Parse error. */
6207                     errEofWithoutGt();
6208                     /*
6209                      * Reconsume the EOF character in the data state.
6210                      */
6211                     break eofloop;
6212                 case ATTRIBUTE_VALUE_DOUBLE_QUOTED:
6213                 case ATTRIBUTE_VALUE_SINGLE_QUOTED:
6214                 case ATTRIBUTE_VALUE_UNQUOTED:
6215                     /* EOF Parse error. */
6216                     errEofInAttributeValue();
6217                     /*
6218                      * Reconsume the EOF character in the data state.
6219                      */
6220                     break eofloop;
6221                 case BOGUS_COMMENT:
6222                     emitComment(0, 0);
6223                     break eofloop;
6224                 case BOGUS_COMMENT_HYPHEN:
6225                     // [NOCPP[
6226                     maybeAppendSpaceToBogusComment();
6227                     // ]NOCPP]
6228                     emitComment(0, 0);
6229                     break eofloop;
6230                 case MARKUP_DECLARATION_OPEN:
6231                     errBogusComment();
6232                     emitComment(0, 0);
6233                     break eofloop;
6234                 case MARKUP_DECLARATION_HYPHEN:
6235                     errBogusComment();
6236                     emitComment(0, 0);
6237                     break eofloop;
6238                 case MARKUP_DECLARATION_OCTYPE:
6239                     if (index < 6) {
6240                         errBogusComment();
6241                         emitComment(0, 0);
6242                     } else {
6243                         /* EOF Parse error. */
6244                         errEofInDoctype();
6245                         /*
6246                          * Create a new DOCTYPE token. Set its force-quirks flag
6247                          * to on.
6248                          */
6249                         doctypeName = "";
6250                         if (systemIdentifier != null) {
6251                             Portability.releaseString(systemIdentifier);
6252                             systemIdentifier = null;
6253                         }
6254                         if (publicIdentifier != null) {
6255                             Portability.releaseString(publicIdentifier);
6256                             publicIdentifier = null;
6257                         }
6258                         forceQuirks = true;
6259                         /*
6260                          * Emit the token.
6261                          */
6262                         emitDoctypeToken(0);
6263                         /*
6264                          * Reconsume the EOF character in the data state.
6265                          */
6266                         break eofloop;
6267                     }
6268                     break eofloop;
6269                 case COMMENT_START:
6270                 case COMMENT:
6271                     /*
6272                      * EOF Parse error.
6273                      */
6274                     errEofInComment();
6275                     /* Emit the comment token. */
6276                     emitComment(0, 0);
6277                     /*
6278                      * Reconsume the EOF character in the data state.
6279                      */
6280                     break eofloop;
6281                 case COMMENT_END:
6282                     errEofInComment();
6283                     /* Emit the comment token. */
6284                     emitComment(2, 0);
6285                     /*
6286                      * Reconsume the EOF character in the data state.
6287                      */
6288                     break eofloop;
6289                 case COMMENT_END_DASH:
6290                 case COMMENT_START_DASH:
6291                     errEofInComment();
6292                     /* Emit the comment token. */
6293                     emitComment(1, 0);
6294                     /*
6295                      * Reconsume the EOF character in the data state.
6296                      */
6297                     break eofloop;
6298                 case COMMENT_END_BANG:
6299                     errEofInComment();
6300                     /* Emit the comment token. */
6301                     emitComment(3, 0);
6302                     /*
6303                      * Reconsume the EOF character in the data state.
6304                      */
6305                     break eofloop;
6306                 case DOCTYPE:
6307                 case BEFORE_DOCTYPE_NAME:
6308                     errEofInDoctype();
6309                     /*
6310                      * Create a new DOCTYPE token. Set its force-quirks flag to
6311                      * on.
6312                      */
6313                     forceQuirks = true;
6314                     /*
6315                      * Emit the token.
6316                      */
6317                     emitDoctypeToken(0);
6318                     /*
6319                      * Reconsume the EOF character in the data state.
6320                      */
6321                     break eofloop;
6322                 case DOCTYPE_NAME:
6323                     errEofInDoctype();
6324                     strBufToDoctypeName();
6325                     /*
6326                      * Set the DOCTYPE token's force-quirks flag to on.
6327                      */
6328                     forceQuirks = true;
6329                     /*
6330                      * Emit that DOCTYPE token.
6331                      */
6332                     emitDoctypeToken(0);
6333                     /*
6334                      * Reconsume the EOF character in the data state.
6335                      */
6336                     break eofloop;
6337                 case DOCTYPE_UBLIC:
6338                 case DOCTYPE_YSTEM:
6339                 case AFTER_DOCTYPE_NAME:
6340                 case AFTER_DOCTYPE_PUBLIC_KEYWORD:
6341                 case AFTER_DOCTYPE_SYSTEM_KEYWORD:
6342                 case BEFORE_DOCTYPE_PUBLIC_IDENTIFIER:
6343                     errEofInDoctype();
6344                     /*
6345                      * Set the DOCTYPE token's force-quirks flag to on.
6346                      */
6347                     forceQuirks = true;
6348                     /*
6349                      * Emit that DOCTYPE token.
6350                      */
6351                     emitDoctypeToken(0);
6352                     /*
6353                      * Reconsume the EOF character in the data state.
6354                      */
6355                     break eofloop;
6356                 case DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED:
6357                 case DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED:
6358                     /* EOF Parse error. */
6359                     errEofInPublicId();
6360                     /*
6361                      * Set the DOCTYPE token's force-quirks flag to on.
6362                      */
6363                     forceQuirks = true;
6364                     /*
6365                      * Emit that DOCTYPE token.
6366                      */
6367                     publicIdentifier = strBufToString();
6368                     emitDoctypeToken(0);
6369                     /*
6370                      * Reconsume the EOF character in the data state.
6371                      */
6372                     break eofloop;
6373                 case AFTER_DOCTYPE_PUBLIC_IDENTIFIER:
6374                 case BEFORE_DOCTYPE_SYSTEM_IDENTIFIER:
6375                 case BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS:
6376                     errEofInDoctype();
6377                     /*
6378                      * Set the DOCTYPE token's force-quirks flag to on.
6379                      */
6380                     forceQuirks = true;
6381                     /*
6382                      * Emit that DOCTYPE token.
6383                      */
6384                     emitDoctypeToken(0);
6385                     /*
6386                      * Reconsume the EOF character in the data state.
6387                      */
6388                     break eofloop;
6389                 case DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED:
6390                 case DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED:
6391                     /* EOF Parse error. */
6392                     errEofInSystemId();
6393                     /*
6394                      * Set the DOCTYPE token's force-quirks flag to on.
6395                      */
6396                     forceQuirks = true;
6397                     /*
6398                      * Emit that DOCTYPE token.
6399                      */
6400                     systemIdentifier = strBufToString();
6401                     emitDoctypeToken(0);
6402                     /*
6403                      * Reconsume the EOF character in the data state.
6404                      */
6405                     break eofloop;
6406                 case AFTER_DOCTYPE_SYSTEM_IDENTIFIER:
6407                     errEofInDoctype();
6408                     /*
6409                      * Set the DOCTYPE token's force-quirks flag to on.
6410                      */
6411                     forceQuirks = true;
6412                     /*
6413                      * Emit that DOCTYPE token.
6414                      */
6415                     emitDoctypeToken(0);
6416                     /*
6417                      * Reconsume the EOF character in the data state.
6418                      */
6419                     break eofloop;
6420                 case BOGUS_DOCTYPE:
6421                     /*
6422                      * Emit that DOCTYPE token.
6423                      */
6424                     emitDoctypeToken(0);
6425                     /*
6426                      * Reconsume the EOF character in the data state.
6427                      */
6428                     break eofloop;
6429                 case CONSUME_CHARACTER_REFERENCE:
6430                     /*
                     * Unlike the definition in the spec, this state does not
6432                      * return a value and never requires the caller to
6433                      * backtrack. This state takes care of emitting characters
6434                      * or appending to the current attribute value. It also
6435                      * takes care of that in the case when consuming the entity
6436                      * fails.
6437                      */
6438                     /*
6439                      * This section defines how to consume an entity. This
6440                      * definition is used when parsing entities in text and in
6441                      * attributes.
6442                      *
6443                      * The behavior depends on the identity of the next
6444                      * character (the one immediately after the U+0026 AMPERSAND
6445                      * character):
6446                      */
6447 
6448                     emitOrAppendCharRefBuf(returnState);
6449                     state = returnState;
6450                     continue;
6451                 case CHARACTER_REFERENCE_HILO_LOOKUP:
6452                     errNoNamedCharacterMatch();
6453                     emitOrAppendCharRefBuf(returnState);
6454                     state = returnState;
6455                     continue;
6456                 case CHARACTER_REFERENCE_TAIL:
6457                     outer: for (;;) {
6458                         char c = '\u0000';
6459                         entCol++;
6460                         /*
6461                          * Consume the maximum number of characters possible,
6462                          * with the consumed characters matching one of the
6463                          * identifiers in the first column of the named
6464                          * character references table (in a case-sensitive
6465                          * manner).
6466                          */
6467                         hiloop: for (;;) {
6468                             if (hi == -1) {
6469                                 break hiloop;
6470                             }
6471                             if (entCol == NamedCharacters.NAMES[hi].length()) {
6472                                 break hiloop;
6473                             }
6474                             if (entCol > NamedCharacters.NAMES[hi].length()) {
6475                                 break outer;
6476                             } else if (c < NamedCharacters.NAMES[hi].charAt(entCol)) {
6477                                 hi--;
6478                             } else {
6479                                 break hiloop;
6480                             }
6481                         }
6482 
6483                         loloop: for (;;) {
6484                             if (hi < lo) {
6485                                 break outer;
6486                             }
6487                             if (entCol == NamedCharacters.NAMES[lo].length()) {
6488                                 candidate = lo;
6489                                 charRefBufMark = charRefBufLen;
6490                                 lo++;
6491                             } else if (entCol > NamedCharacters.NAMES[lo].length()) {
6492                                 break outer;
6493                             } else if (c > NamedCharacters.NAMES[lo].charAt(entCol)) {
6494                                 lo++;
6495                             } else {
6496                                 break loloop;
6497                             }
6498                         }
6499                         if (hi < lo) {
6500                             break outer;
6501                         }
6502                         continue;
6503                     }
6504 
6505                     if (candidate == -1) {
6506                         /*
6507                          * If no match can be made, then this is a parse error.
6508                          */
6509                         errNoNamedCharacterMatch();
6510                         emitOrAppendCharRefBuf(returnState);
6511                         state = returnState;
6512                         continue eofloop;
6513                     } else {
6514                         @Const @CharacterName String candidateName = NamedCharacters.NAMES[candidate];
6515                         if (candidateName.length() == 0
6516                                 || candidateName.charAt(candidateName.length() - 1) != ';') {
6517                             /*
6518                              * If the last character matched is not a U+003B
6519                              * SEMICOLON (;), there is a parse error.
6520                              */
6521                             if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
6522                                 /*
6523                                  * If the entity is being consumed as part of an
6524                                  * attribute, and the last character matched is
6525                                  * not a U+003B SEMICOLON (;),
6526                                  */
6527                                 char ch;
6528                                 if (charRefBufMark == charRefBufLen) {
6529                                     ch = '\u0000';
6530                                 } else {
6531                                     ch = charRefBuf[charRefBufMark];
6532                                 }
6533                                 if ((ch >= '0' && ch <= '9')
6534                                         || (ch >= 'A' && ch <= 'Z')
6535                                         || (ch >= 'a' && ch <= 'z')) {
6536                                     /*
6537                                      * and the next character is in the range
6538                                      * U+0030 DIGIT ZERO to U+0039 DIGIT NINE,
6539                                      * U+0041 LATIN CAPITAL LETTER A to U+005A
6540                                      * LATIN CAPITAL LETTER Z, or U+0061 LATIN
6541                                      * SMALL LETTER A to U+007A LATIN SMALL
6542                                      * LETTER Z, then, for historical reasons,
6543                                      * all the characters that were matched
6544                                      * after the U+0026 AMPERSAND (&) must be
6545                                      * unconsumed, and nothing is returned.
6546                                      */
6547                                     errNoNamedCharacterMatch();
6548                                     appendCharRefBufToStrBuf();
6549                                     state = returnState;
6550                                     continue eofloop;
6551                                 }
6552                             }
6553                             if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
6554                                 errUnescapedAmpersandInterpretedAsCharacterReference();
6555                             } else {
6556                                 errNotSemicolonTerminated();
6557                             }
6558                         }
6559 
6560                         /*
6561                          * Otherwise, return a character token for the character
6562                          * corresponding to the entity name (as given by the
6563                          * second column of the named character references
6564                          * table).
6565                          */
6566                         @Const @NoLength char[] val = NamedCharacters.VALUES[candidate];
6567                         if (
6568                         // [NOCPP[
6569                         val.length == 1
6570                         // ]NOCPP]
6571                         // CPPONLY: val[1] == 0
6572                         ) {
6573                             emitOrAppendOne(val, returnState);
6574                         } else {
6575                             emitOrAppendTwo(val, returnState);
6576                         }
6577                         // this is so complicated!
6578                         if (charRefBufMark < charRefBufLen) {
6579                             if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
6580                                 appendStrBuf(charRefBuf, charRefBufMark,
6581                                         charRefBufLen - charRefBufMark);
6582                             } else {
6583                                 tokenHandler.characters(charRefBuf, charRefBufMark,
6584                                         charRefBufLen - charRefBufMark);
6585                             }
6586                         }
6587                         charRefBufLen = 0;
6588                         state = returnState;
6589                         continue eofloop;
6590                         /*
6591                          * If the markup contains I'm &notit; I tell you, the
6592                          * entity is parsed as "not", as in, I'm ¬it; I tell
6593                          * you. But if the markup was I'm &notin; I tell you,
6594                          * the entity would be parsed as "notin;", resulting in
6595                          * I'm ∉ I tell you.
6596                          */
6597                     }
6598                 case CONSUME_NCR:
6599                 case DECIMAL_NRC_LOOP:
6600                 case HEX_NCR_LOOP:
6601                     /*
6602                      * If no characters match the range, then don't consume any
6603                      * characters (and unconsume the U+0023 NUMBER SIGN
6604                      * character and, if appropriate, the X character). This is
6605                      * a parse error; nothing is returned.
6606                      *
6607                      * Otherwise, if the next character is a U+003B SEMICOLON,
6608                      * consume that too. If it isn't, there is a parse error.
6609                      */
6610                     if (!seenDigits) {
6611                         errNoDigitsInNCR();
6612                         emitOrAppendCharRefBuf(returnState);
6613                         state = returnState;
6614                         continue;
6615                     } else {
6616                         errCharRefLacksSemicolon();
6617                     }
6618                     // WARNING previous state sets reconsume
6619                     handleNcrValue(returnState);
6620                     state = returnState;
6621                     continue;
6622                 case CDATA_RSQB:
6623                     tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 1);
6624                     break eofloop;
6625                 case CDATA_RSQB_RSQB:
6626                     tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 2);
6627                     break eofloop;
6628                 case DATA:
6629                 default:
6630                     break eofloop;
6631             }
6632         }
6633         // case DATA:
6634         /*
6635          * EOF Emit an end-of-file token.
6636          */
6637         tokenHandler.eof();
6638         return;
6639     }
6640 
emitDoctypeToken(int pos)6641     private void emitDoctypeToken(int pos) throws SAXException {
6642         cstart = pos + 1;
6643         tokenHandler.doctype(doctypeName, publicIdentifier, systemIdentifier,
6644                 forceQuirks);
6645         // It is OK and sufficient to release these here, since
6646         // there's no way out of the doctype states than through paths
6647         // that call this method.
6648         doctypeName = null;
6649         Portability.releaseString(publicIdentifier);
6650         publicIdentifier = null;
6651         Portability.releaseString(systemIdentifier);
6652         systemIdentifier = null;
6653     }
6654 
    /**
     * Reads one character from a buffer. This implementation returns
     * {@code buf[pos]} without inspecting it.
     * NOTE(review): the method is {@code protected} and declares
     * {@code SAXException}, so it is presumably a point where a subclass can
     * validate characters as they are read — confirm against subclasses.
     *
     * @param buf
     *            the buffer to read from
     * @param pos
     *            the index into {@code buf}
     * @return the character stored at {@code buf[pos]}
     * @throws SAXException
     *             declared in the signature; this body does not throw it
     */
    @Inline protected char checkChar(@NoLength char[] buf, int pos)
            throws SAXException {
        return buf[pos];
    }
6659 
internalEncodingDeclaration(String internalCharset)6660     public boolean internalEncodingDeclaration(String internalCharset)
6661             throws SAXException {
6662         if (encodingDeclarationHandler != null) {
6663             return encodingDeclarationHandler.internalEncodingDeclaration(internalCharset);
6664         }
6665         return false;
6666     }
6667 
6668     /**
6669      * @param val
6670      * @throws SAXException
6671      */
emitOrAppendTwo(@onst @oLength char[] val, int returnState)6672     private void emitOrAppendTwo(@Const @NoLength char[] val, int returnState)
6673             throws SAXException {
6674         if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
6675             appendStrBuf(val[0]);
6676             appendStrBuf(val[1]);
6677         } else {
6678             tokenHandler.characters(val, 0, 2);
6679         }
6680     }
6681 
emitOrAppendOne(@onst @oLength char[] val, int returnState)6682     private void emitOrAppendOne(@Const @NoLength char[] val, int returnState)
6683             throws SAXException {
6684         if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
6685             appendStrBuf(val[0]);
6686         } else {
6687             tokenHandler.characters(val, 0, 1);
6688         }
6689     }
6690 
    /**
     * Ends tokenization: drops the string buffer and the pending doctype,
     * tag and attribute state held by this tokenizer, and calls
     * {@code tokenHandler.endTokenization()}.
     * NOTE(review): the marker comments inside this method look like
     * directives for a Java-to-C++ translation step — keep them exactly as
     * they are.
     *
     * @throws SAXException
     *             declared in the signature; presumably propagated from the
     *             token handler — confirm
     */
    public void end() throws SAXException {
        strBuf = null;
        doctypeName = null;
        // The identifiers are released only if they are still held.
        if (systemIdentifier != null) {
            Portability.releaseString(systemIdentifier);
            systemIdentifier = null;
        }
        if (publicIdentifier != null) {
            Portability.releaseString(publicIdentifier);
            publicIdentifier = null;
        }
        tagName = null;
        // Clear the name stored in the reusable non-interned tag name holder.
        nonInternedTagName.setNameForNonInterned(null
                // CPPONLY: , false
                );
        attributeName = null;
        // CPPONLY: nonInternedAttributeName.setNameForNonInterned(null);
        // The handler is notified before the attribute holder is let go.
        tokenHandler.endTokenization();
        if (attributes != null) {
            // [NOCPP[
            attributes = null;
            // ]NOCPP]
            // CPPONLY: attributes.clear(mappingLangToXmlLang);
        }
    }
6716 
    /**
     * Sets the {@code shouldSuspend} flag to {@code true}.
     * NOTE(review): presumably the tokenizer loop checks this flag and stops
     * consuming input — confirm where the flag is read.
     */
    public void requestSuspension() {
        shouldSuspend = true;
    }
6720 
6721     // [NOCPP[
6722 
    /**
     * Sets the {@code confident} flag to {@code true}.
     * NOTE(review): presumably this refers to confidence in the character
     * encoding — confirm where the flag is read.
     */
    public void becomeConfident() {
        confident = true;
    }
6726 
    /**
     * Always returns {@code false} in this class; no field is consulted.
     * NOTE(review): presumably a subclass that tracks source locations
     * overrides this — confirm.
     *
     * @return {@code false}
     */
    public boolean isNextCharOnNewLine() {
        return false;
    }
6735 
    /**
     * Returns the current value of the {@code lastCR} flag.
     * NOTE(review): judging by the names, the flag records that the
     * previously seen character was a carriage return — confirm.
     *
     * @return the value of {@code lastCR}
     */
    public boolean isPrevCR() {
        return lastCR;
    }
6739 
    /**
     * Always returns {@code -1} in this class; the {@code line} field is not
     * consulted here.
     * NOTE(review): presumably {@code -1} means "line unknown" and a
     * location-tracking subclass overrides this — confirm.
     *
     * @return {@code -1}
     */
    public int getLine() {
        return -1;
    }
6748 
    /**
     * Always returns {@code -1} in this class.
     * NOTE(review): presumably {@code -1} means "column unknown" and a
     * location-tracking subclass overrides this — confirm.
     *
     * @return {@code -1}
     */
    public int getCol() {
        return -1;
    }
6757 
6758     // ]NOCPP]
6759 
isInDataState()6760     public boolean isInDataState() {
6761         return (stateSave == DATA);
6762     }
6763 
resetToDataState()6764     public void resetToDataState() {
6765         clearStrBufAfterUse();
6766         charRefBufLen = 0;
6767         stateSave = Tokenizer.DATA;
6768         // line = 1; XXX line numbers
6769         lastCR = false;
6770         index = 0;
6771         forceQuirks = false;
6772         additional = '\u0000';
6773         entCol = -1;
6774         firstCharKey = -1;
6775         lo = 0;
6776         hi = 0; // will always be overwritten before use anyway
6777         candidate = -1;
6778         charRefBufMark = 0;
6779         value = 0;
6780         seenDigits = false;
6781         endTag = false;
6782         shouldSuspend = false;
6783         initDoctypeFields();
6784         containsHyphen = false;
6785         tagName = null;
6786         attributeName = null;
6787         if (newAttributesEachTime) {
6788             if (attributes != null) {
6789                 Portability.delete(attributes);
6790                 attributes = null;
6791             }
6792         }
6793     }
6794 
loadState(Tokenizer other)6795     public void loadState(Tokenizer other) throws SAXException {
6796         strBufLen = other.strBufLen;
6797         if (strBufLen > strBuf.length) {
6798             strBuf = new char[strBufLen];
6799         }
6800         System.arraycopy(other.strBuf, 0, strBuf, 0, strBufLen);
6801 
6802         charRefBufLen = other.charRefBufLen;
6803         System.arraycopy(other.charRefBuf, 0, charRefBuf, 0, charRefBufLen);
6804 
6805         stateSave = other.stateSave;
6806         returnStateSave = other.returnStateSave;
6807         endTagExpectation = other.endTagExpectation;
6808         endTagExpectationAsArray = other.endTagExpectationAsArray;
6809         // line = 1; XXX line numbers
6810         lastCR = other.lastCR;
6811         index = other.index;
6812         forceQuirks = other.forceQuirks;
6813         additional = other.additional;
6814         entCol = other.entCol;
6815         firstCharKey = other.firstCharKey;
6816         lo = other.lo;
6817         hi = other.hi;
6818         candidate = other.candidate;
6819         charRefBufMark = other.charRefBufMark;
6820         value = other.value;
6821         seenDigits = other.seenDigits;
6822         endTag = other.endTag;
6823         shouldSuspend = false;
6824         doctypeName = other.doctypeName;
6825 
6826         Portability.releaseString(systemIdentifier);
6827         if (other.systemIdentifier == null) {
6828             systemIdentifier = null;
6829         } else {
6830             systemIdentifier = Portability.newStringFromString(other.systemIdentifier);
6831         }
6832 
6833         Portability.releaseString(publicIdentifier);
6834         if (other.publicIdentifier == null) {
6835             publicIdentifier = null;
6836         } else {
6837             publicIdentifier = Portability.newStringFromString(other.publicIdentifier);
6838         }
6839 
6840         containsHyphen = other.containsHyphen;
6841         if (other.tagName == null) {
6842             tagName = null;
6843         } else if (other.tagName.isInterned()) {
6844             tagName = other.tagName;
6845         } else {
6846             // In the C++ case, the atoms in the other tokenizer are from a
6847             // different tokenizer-scoped atom table. Therefore, we have to
6848             // obtain the correspoding atom from our own atom table.
6849             nonInternedTagName.setNameForNonInterned(other.tagName.getName()
6850                     // CPPONLY: , other.tagName.isCustom()
6851                     );
6852             tagName = nonInternedTagName;
6853         }
6854 
6855         // [NOCPP[
6856         attributeName = other.attributeName;
6857         // ]NOCPP]
6858         // CPPONLY: if (other.attributeName == null) {
6859         // CPPONLY:     attributeName = null;
6860         // CPPONLY: } else if (other.attributeName.isInterned()) {
6861         // CPPONLY:     attributeName = other.attributeName;
6862         // CPPONLY: } else {
6863         // CPPONLY:     // In the C++ case, the atoms in the other tokenizer are from a
6864         // CPPONLY:     // different tokenizer-scoped atom table. Therefore, we have to
6865         // CPPONLY:     // obtain the correspoding atom from our own atom table.
6866         // CPPONLY:     nonInternedAttributeName.setNameForNonInterned(other.attributeName.getLocal(AttributeName.HTML));
6867         // CPPONLY:     attributeName = nonInternedAttributeName;
6868         // CPPONLY: }
6869 
6870         Portability.delete(attributes);
6871         if (other.attributes == null) {
6872             attributes = null;
6873         } else {
6874             attributes = other.attributes.cloneAttributes();
6875         }
6876     }
6877 
    /**
     * Prepares the tokenizer for a new run without notifying the token
     * handler of a start: clears the confidence flag and the string buffer,
     * sets the line counter to 1, refreshes the Java-only settings, and
     * finishes with {@code resetToDataState()}.
     * NOTE(review): the marker comments inside this method look like
     * directives for a Java-to-C++ translation step — keep them exactly as
     * they are.
     *
     * @throws SAXException
     *             declared in the signature; presumably propagated from the
     *             token handler — confirm
     */
    public void initializeWithoutStarting() throws SAXException {
        confident = false;
        // The buffer is left unallocated here; code that runs afterwards must
        // not assume strBuf is non-null.
        strBuf = null;
        line = 1;
        // CPPONLY: attributeLine = 1;
        // [NOCPP[
        metaBoundaryPassed = false;
        // Ask the handler once per run whether it wants comments.
        wantsComments = tokenHandler.wantsComments();
        // When a fresh holder is not created each time, one holder is
        // created up front.
        if (!newAttributesEachTime) {
            attributes = new HtmlAttributes(mappingLangToXmlLang);
        }
        // ]NOCPP]
        resetToDataState();
    }
6892 
    /**
     * Reporting hook with an empty body in this class; it is
     * {@code protected}, so a subclass can override it.
     * NOTE(review): judging by the name, it flags unexpected characters
     * after a less-than sign and solidus — confirm at the call sites.
     *
     * @throws SAXException
     *             declared in the signature; this empty body never throws it
     */
    protected void errGarbageAfterLtSlash() throws SAXException {
    }
6895 
    /**
     * Reporting hook with an empty body in this class; it is
     * {@code protected}, so a subclass can override it.
     * NOTE(review): judging by the name, it flags a less-than sign and
     * solidus immediately followed by a greater-than sign — confirm at the
     * call sites.
     *
     * @throws SAXException
     *             declared in the signature; this empty body never throws it
     */
    protected void errLtSlashGt() throws SAXException {
    }
6898 
    /**
     * Reporting hook with an empty body in this class; it is
     * {@code protected}, so a subclass can override it.
     * NOTE(review): judging by the name, it warns about a less-than sign
     * and solidus inside RCDATA content — confirm at the call sites.
     *
     * @throws SAXException
     *             declared in the signature; this empty body never throws it
     */
    protected void errWarnLtSlashInRcdata() throws SAXException {
    }
6901 
    /**
     * Reporting hook with an empty body in this class; it is
     * {@code protected}, so a subclass can override it.
     * NOTE(review): judging by the name, it relates to HTML4 treatment of a
     * less-than sign and solidus inside RCDATA content — confirm at the
     * call sites.
     *
     * @param folded
     *            ignored by this empty implementation; presumably a
     *            case-folded character — confirm
     * @throws SAXException
     *             declared in the signature; this empty body never throws it
     */
    protected void errHtml4LtSlashInRcdata(char folded) throws SAXException {
    }
6904 
    /**
     * Reporting hook with an empty body in this class; it is
     * {@code protected}, so a subclass can override it. The end-of-file
     * handling earlier in this class calls it for the numeric character
     * reference states when digits were seen before the input ended.
     *
     * @throws SAXException
     *             declared in the signature; this empty body never throws it
     */
    protected void errCharRefLacksSemicolon() throws SAXException {
    }
6907 
    /**
     * Reporting hook with an empty body in this class; it is
     * {@code protected}, so a subclass can override it. The end-of-file
     * handling earlier in this class calls it for the numeric character
     * reference states when {@code seenDigits} is false.
     *
     * @throws SAXException
     *             declared in the signature; this empty body never throws it
     */
    protected void errNoDigitsInNCR() throws SAXException {
    }
6910 
    /**
     * Reporting hook with an empty body in this class; it is
     * {@code protected}, so a subclass can override it.
     * NOTE(review): judging by the name, it flags a greater-than sign inside
     * a doctype system identifier — confirm at the call sites.
     *
     * @throws SAXException
     *             declared in the signature; this empty body never throws it
     */
    protected void errGtInSystemId() throws SAXException {
    }
6913 
    /**
     * Reporting hook with an empty body in this class; it is
     * {@code protected}, so a subclass can override it.
     * NOTE(review): judging by the name, it flags a greater-than sign inside
     * a doctype public identifier — confirm at the call sites.
     *
     * @throws SAXException
     *             declared in the signature; this empty body never throws it
     */
    protected void errGtInPublicId() throws SAXException {
    }
6916 
    /**
     * Reporting hook with an empty body in this class; it is
     * {@code protected}, so a subclass can override it.
     * NOTE(review): judging by the name, it flags a doctype that has no
     * name — confirm at the call sites.
     *
     * @throws SAXException
     *             declared in the signature; this empty body never throws it
     */
    protected void errNamelessDoctype() throws SAXException {
    }
6919 
    /**
     * Reporting hook with an empty body in this class; it is
     * {@code protected}, so a subclass can override it.
     * NOTE(review): judging by the name, it flags consecutive hyphens,
     * presumably inside a comment — confirm at the call sites.
     *
     * @throws SAXException
     *             declared in the signature; this empty body never throws it
     */
    protected void errConsecutiveHyphens() throws SAXException {
    }
6922 
    /**
     * Reporting hook with an empty body in this class; it is
     * {@code protected}, so a subclass can override it.
     * NOTE(review): judging by the name, it flags a comment that ends
     * earlier than expected — confirm at the call sites.
     *
     * @throws SAXException
     *             declared in the signature; this empty body never throws it
     */
    protected void errPrematureEndOfComment() throws SAXException {
    }
6925 
    /**
     * Reporting hook with an empty body in this class; it is
     * {@code protected}, so a subclass can override it. The end-of-file
     * handling earlier in this class calls it just before emitting a comment
     * when input ends in the markup-declaration-open states, including the
     * doctype keyword state while {@code index} is below 6.
     *
     * @throws SAXException
     *             declared in the signature; this empty body never throws it
     */
    protected void errBogusComment() throws SAXException {
    }
6928 
    /**
     * Reporting hook with an empty body in this class; it is
     * {@code protected}, so a subclass can override it.
     * NOTE(review): judging by the name, it flags a problematic or null
     * character in an unquoted attribute value — confirm at the call sites.
     *
     * @param c
     *            ignored by this empty implementation; presumably the
     *            offending character — confirm
     * @throws SAXException
     *             declared in the signature; this empty body never throws it
     */
    protected void errUnquotedAttributeValOrNull(char c) throws SAXException {
    }
6931 
    /**
     * Reporting hook with an empty body in this class; it is
     * {@code protected}, so a subclass can override it.
     * NOTE(review): judging by the name, it flags a solidus in a tag that is
     * not directly followed by a greater-than sign — confirm at the call
     * sites.
     *
     * @throws SAXException
     *             declared in the signature; this empty body never throws it
     */
    protected void errSlashNotFollowedByGt() throws SAXException {
    }
6934 
    /**
     * Reporting hook with an empty body in this class; it is
     * {@code protected}, so a subclass can override it.
     * NOTE(review): judging by the name, it flags attributes that are not
     * separated by white space — confirm at the call sites.
     *
     * @throws SAXException
     *             declared in the signature; this empty body never throws it
     */
    protected void errNoSpaceBetweenAttributes() throws SAXException {
    }
6937 
    /**
     * Reporting hook with an empty body in this class; it is
     * {@code protected}, so a subclass can override it.
     * NOTE(review): judging by the name, it flags a less-than sign, equals
     * sign, grave accent or null character in an unquoted attribute value —
     * confirm at the call sites.
     *
     * @param c
     *            ignored by this empty implementation; presumably the
     *            offending character — confirm
     * @throws SAXException
     *             declared in the signature; this empty body never throws it
     */
    protected void errLtOrEqualsOrGraveInUnquotedAttributeOrNull(char c)
            throws SAXException {
    }
6941 
    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the override point for reporting a missing attribute
     * value; confirm against the call sites.
     *
     * @throws SAXException
     *             never thrown by this empty body; overrides may throw it
     */
    protected void errAttributeValueMissing() throws SAXException {
    }
6944 
    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the override point for reporting a bad character (or a
     * null character) found before an attribute name; confirm against the
     * call sites.
     *
     * @param c
     *            unused here; presumably the offending character (TODO
     *            confirm)
     * @throws SAXException
     *             never thrown by this empty body; overrides may throw it
     */
    protected void errBadCharBeforeAttributeNameOrNull(char c)
            throws SAXException {
    }
6948 
    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the override point for reporting an equals sign found
     * before an attribute name; confirm against the call sites.
     *
     * @throws SAXException
     *             never thrown by this empty body; overrides may throw it
     */
    protected void errEqualsSignBeforeAttributeName() throws SAXException {
    }
6951 
    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the override point for reporting a bad character right
     * after <code>&lt;</code>; confirm against the call sites.
     *
     * @param c
     *            unused here; presumably the offending character (TODO
     *            confirm)
     * @throws SAXException
     *             never thrown by this empty body; overrides may throw it
     */
    protected void errBadCharAfterLt(char c) throws SAXException {
    }
6954 
    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the override point for reporting the sequence
     * <code>&lt;&gt;</code>; confirm against the call sites.
     *
     * @throws SAXException
     *             never thrown by this empty body; overrides may throw it
     */
    protected void errLtGt() throws SAXException {
    }
6957 
    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the override point for reporting a processing instruction
     * in the input; confirm against the call sites.
     *
     * @throws SAXException
     *             never thrown by this empty body; overrides may throw it
     */
    protected void errProcessingInstruction() throws SAXException {
    }
6960 
    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the override point for reporting an unescaped ampersand
     * that was interpreted as the start of a character reference; confirm
     * against the call sites.
     *
     * @throws SAXException
     *             never thrown by this empty body; overrides may throw it
     */
    protected void errUnescapedAmpersandInterpretedAsCharacterReference()
            throws SAXException {
    }
6964 
    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the override point for reporting a construct (likely a
     * named character reference) that is not terminated by a semicolon;
     * confirm against the call sites.
     *
     * @throws SAXException
     *             never thrown by this empty body; overrides may throw it
     */
    protected void errNotSemicolonTerminated() throws SAXException {
    }
6967 
    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the override point for reporting text after
     * <code>&amp;</code> that matches no named character reference; confirm
     * against the call sites.
     *
     * @throws SAXException
     *             never thrown by this empty body; overrides may throw it
     */
    protected void errNoNamedCharacterMatch() throws SAXException {
    }
6970 
    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the override point for reporting a quote character found
     * before an attribute name; confirm against the call sites.
     *
     * @param c
     *            unused here; presumably the quote character seen (TODO
     *            confirm)
     * @throws SAXException
     *             never thrown by this empty body; overrides may throw it
     */
    protected void errQuoteBeforeAttributeName(char c) throws SAXException {
    }
6973 
    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the override point for reporting a quote,
     * <code>&lt;</code> or null character inside an attribute name; confirm
     * against the call sites.
     *
     * @param c
     *            unused here; presumably the offending character (TODO
     *            confirm)
     * @throws SAXException
     *             never thrown by this empty body; overrides may throw it
     */
    protected void errQuoteOrLtInAttributeNameOrNull(char c)
            throws SAXException {
    }
6977 
    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the override point for reporting that a doctype public
     * identifier was expected but not found; confirm against the call
     * sites.
     *
     * @throws SAXException
     *             never thrown by this empty body; overrides may throw it
     */
    protected void errExpectedPublicId() throws SAXException {
    }
6980 
    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the override point for reporting a bogus doctype; confirm
     * against the call sites.
     *
     * @throws SAXException
     *             never thrown by this empty body; overrides may throw it
     */
    protected void errBogusDoctype() throws SAXException {
    }
6983 
    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the override point for optionally warning about an astral
     * (non-BMP) private-use character; confirm against the call sites.
     *
     * @throws SAXException
     *             never thrown by this empty body; overrides may throw it
     */
    protected void maybeWarnPrivateUseAstral() throws SAXException {
    }
6986 
    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the override point for optionally warning about a
     * private-use character; confirm against the call sites.
     *
     * @param ch
     *            unused here; presumably the character to examine (TODO
     *            confirm)
     * @throws SAXException
     *             never thrown by this empty body; overrides may throw it
     */
    protected void maybeWarnPrivateUse(char ch) throws SAXException {
    }
6989 
    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the override point for reporting an end tag that carries
     * attributes; confirm against the call sites.
     *
     * @param attrs
     *            unused here; presumably the attributes collected for the
     *            end tag (TODO confirm)
     * @throws SAXException
     *             never thrown by this empty body; overrides may throw it
     */
    protected void maybeErrAttributesOnEndTag(HtmlAttributes attrs)
            throws SAXException {
    }
6993 
    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the override point for reporting an end tag written with
     * a trailing slash; confirm against the call sites.
     *
     * @param selfClosing
     *            unused here; presumably whether the end tag was marked
     *            self-closing (TODO confirm)
     * @throws SAXException
     *             never thrown by this empty body; overrides may throw it
     */
    protected void maybeErrSlashInEndTag(boolean selfClosing)
            throws SAXException {
    }
6997 
    /**
     * Returns <code>ch</code> unchanged and does nothing else in this class.
     * NOTE(review): by its name, presumably the override point for
     * reporting a numeric character reference that expands to a
     * non-character, with the return value letting an override substitute
     * another character; confirm against the call sites.
     *
     * @param ch
     *            the character handed in by the caller
     * @return <code>ch</code>, exactly as passed in
     * @throws SAXException
     *             never thrown by this body; overrides may throw it
     */
    protected char errNcrNonCharacter(char ch) throws SAXException {
        return ch;
    }
7001 
    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the override point for reporting an astral (non-BMP)
     * non-character; confirm against the call sites.
     *
     * @param ch
     *            unused here; presumably the code point concerned (TODO
     *            confirm)
     * @throws SAXException
     *             never thrown by this empty body; overrides may throw it
     */
    protected void errAstralNonCharacter(int ch) throws SAXException {
    }
7004 
    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the override point for reporting a numeric character
     * reference that expands to a surrogate; confirm against the call
     * sites.
     *
     * @throws SAXException
     *             never thrown by this empty body; overrides may throw it
     */
    protected void errNcrSurrogate() throws SAXException {
    }
7007 
    /**
     * Returns <code>ch</code> unchanged and does nothing else in this class.
     * NOTE(review): by its name, presumably the override point for
     * reporting a numeric character reference that expands to a control
     * character, with the return value letting an override substitute
     * another character; confirm against the call sites. A no-argument
     * overload with the same name also exists in this class.
     *
     * @param ch
     *            the character handed in by the caller
     * @return <code>ch</code>, exactly as passed in
     * @throws SAXException
     *             never thrown by this body; overrides may throw it
     */
    protected char errNcrControlChar(char ch) throws SAXException {
        return ch;
    }
7011 
    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the override point for reporting a numeric character
     * reference that expands to a carriage return; confirm against the
     * call sites.
     *
     * @throws SAXException
     *             never thrown by this empty body; overrides may throw it
     */
    protected void errNcrCr() throws SAXException {
    }
7014 
    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the override point for reporting a numeric character
     * reference whose value falls in the C1 controls range; confirm
     * against the call sites.
     *
     * @throws SAXException
     *             never thrown by this empty body; overrides may throw it
     */
    protected void errNcrInC1Range() throws SAXException {
    }
7017 
    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the override point for reporting end of input inside a
     * doctype public identifier; confirm against the call sites.
     *
     * @throws SAXException
     *             never thrown by this empty body; overrides may throw it
     */
    protected void errEofInPublicId() throws SAXException {
    }
7020 
    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the override point for reporting end of input inside a
     * comment; confirm against the call sites.
     *
     * @throws SAXException
     *             never thrown by this empty body; overrides may throw it
     */
    protected void errEofInComment() throws SAXException {
    }
7023 
    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the override point for reporting end of input inside a
     * doctype; confirm against the call sites.
     *
     * @throws SAXException
     *             never thrown by this empty body; overrides may throw it
     */
    protected void errEofInDoctype() throws SAXException {
    }
7026 
    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the override point for reporting end of input inside an
     * attribute value; confirm against the call sites.
     *
     * @throws SAXException
     *             never thrown by this empty body; overrides may throw it
     */
    protected void errEofInAttributeValue() throws SAXException {
    }
7029 
    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the override point for reporting end of input inside an
     * attribute name; confirm against the call sites.
     *
     * @throws SAXException
     *             never thrown by this empty body; overrides may throw it
     */
    protected void errEofInAttributeName() throws SAXException {
    }
7032 
    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the override point for reporting end of input reached
     * before a closing <code>&gt;</code>; confirm against the call sites.
     *
     * @throws SAXException
     *             never thrown by this empty body; overrides may throw it
     */
    protected void errEofWithoutGt() throws SAXException {
    }
7035 
    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the override point for reporting end of input inside a
     * tag name; confirm against the call sites.
     *
     * @throws SAXException
     *             never thrown by this empty body; overrides may throw it
     */
    protected void errEofInTagName() throws SAXException {
    }
7038 
    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the override point for reporting end of input inside an
     * end tag; confirm against the call sites.
     *
     * @throws SAXException
     *             never thrown by this empty body; overrides may throw it
     */
    protected void errEofInEndTag() throws SAXException {
    }
7041 
    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the override point for reporting end of input right after
     * <code>&lt;</code>; confirm against the call sites.
     *
     * @throws SAXException
     *             never thrown by this empty body; overrides may throw it
     */
    protected void errEofAfterLt() throws SAXException {
    }
7044 
    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the override point for reporting a numeric character
     * reference whose value is out of range; confirm against the call
     * sites.
     *
     * @throws SAXException
     *             never thrown by this empty body; overrides may throw it
     */
    protected void errNcrOutOfRange() throws SAXException {
    }
7047 
    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the override point for reporting a numeric character
     * reference that expands to an unassigned code point; confirm against
     * the call sites.
     *
     * @throws SAXException
     *             never thrown by this empty body; overrides may throw it
     */
    protected void errNcrUnassigned() throws SAXException {
    }
7050 
    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the override point for reporting a duplicate attribute;
     * confirm against the call sites.
     *
     * @throws SAXException
     *             never thrown by this empty body; overrides may throw it
     */
    protected void errDuplicateAttribute() throws SAXException {
    }
7053 
    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the override point for reporting end of input inside a
     * doctype system identifier; confirm against the call sites.
     *
     * @throws SAXException
     *             never thrown by this empty body; overrides may throw it
     */
    protected void errEofInSystemId() throws SAXException {
    }
7056 
    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the override point for reporting that a doctype system
     * identifier was expected but not found; confirm against the call
     * sites.
     *
     * @throws SAXException
     *             never thrown by this empty body; overrides may throw it
     */
    protected void errExpectedSystemId() throws SAXException {
    }
7059 
    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the override point for reporting missing whitespace
     * before the doctype name; confirm against the call sites.
     *
     * @throws SAXException
     *             never thrown by this empty body; overrides may throw it
     */
    protected void errMissingSpaceBeforeDoctypeName() throws SAXException {
    }
7062 
    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the override point for reporting the sequence
     * <code>--!</code> (likely inside a comment); confirm against the call
     * sites.
     *
     * @throws SAXException
     *             never thrown by this empty body; overrides may throw it
     */
    protected void errHyphenHyphenBang() throws SAXException {
    }
7065 
    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the override point for reporting a numeric character
     * reference that expands to a control character; confirm against the
     * call sites. An overload taking and returning a <code>char</code> also
     * exists in this class.
     *
     * @throws SAXException
     *             never thrown by this empty body; overrides may throw it
     */
    protected void errNcrControlChar() throws SAXException {
    }
7068 
    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the override point for reporting a numeric character
     * reference with the value zero; confirm against the call sites.
     *
     * @throws SAXException
     *             never thrown by this empty body; overrides may throw it
     */
    protected void errNcrZero() throws SAXException {
    }
7071 
    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the override point for reporting missing whitespace
     * between the doctype <code>SYSTEM</code> keyword and the following
     * quote; confirm against the call sites.
     *
     * @throws SAXException
     *             never thrown by this empty body; overrides may throw it
     */
    protected void errNoSpaceBetweenDoctypeSystemKeywordAndQuote()
            throws SAXException {
    }
7075 
    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the override point for reporting missing whitespace
     * between the doctype public and system identifiers; confirm against
     * the call sites.
     *
     * @throws SAXException
     *             never thrown by this empty body; overrides may throw it
     */
    protected void errNoSpaceBetweenPublicAndSystemIds() throws SAXException {
    }
7078 
    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the override point for reporting missing whitespace
     * between the doctype <code>PUBLIC</code> keyword and the following
     * quote; confirm against the call sites.
     *
     * @throws SAXException
     *             never thrown by this empty body; overrides may throw it
     */
    protected void errNoSpaceBetweenDoctypePublicKeywordAndQuote()
            throws SAXException {
    }
7082 
    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the override point for noting (rather than flagging as an
     * error) an attribute written without a value; confirm against the
     * call sites.
     *
     * @throws SAXException
     *             never thrown by this empty body; overrides may throw it
     */
    protected void noteAttributeWithoutValue() throws SAXException {
    }
7085 
    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the override point for noting (rather than flagging as an
     * error) an unquoted attribute value; confirm against the call sites.
     *
     * @throws SAXException
     *             never thrown by this empty body; overrides may throw it
     */
    protected void noteUnquotedAttributeValue() throws SAXException {
    }
7088 
    /**
     * Stores the given handler in the <code>encodingDeclarationHandler</code>
     * field, replacing whatever was there before. The argument is assigned
     * as-is; no null check is performed here.
     *
     * @param encodingDeclarationHandler
     *            the handler to store
     */
    public void setEncodingDeclarationHandler(
            EncodingDeclarationHandler encodingDeclarationHandler) {
        this.encodingDeclarationHandler = encodingDeclarationHandler;
    }
7099 
    /**
     * Passes <code>nonInternedTagName</code> and <code>attributes</code> to
     * <code>Portability.delete</code> and then sets both fields to
     * <code>null</code>.
     * NOTE(review): the <code>CPPONLY</code> lines and the translator remark
     * in the body look like input for a source translator; keep them
     * verbatim and in place unless that is confirmed otherwise.
     */
    void destructor() {
        Portability.delete(nonInternedTagName);
        nonInternedTagName = null;
        // CPPONLY: Portability.delete(nonInternedAttributeName);
        // CPPONLY: nonInternedAttributeName = null;
        // The translator will write refcount tracing stuff here
        Portability.delete(attributes);
        attributes = null;
    }
7109 
7110     // [NOCPP[
7111 
    /**
     * Intended to set an offset to be added to the position reported to
     * <code>TransitionHandler</code>. The implementation in this class is
     * empty: the argument is ignored and nothing is stored.
     * NOTE(review): presumably meant to be overridden by a subclass that
     * actually reports transitions; confirm.
     *
     * @param offset the offset (unused by this implementation)
     */
    public void setTransitionBaseOffset(int offset) {

    }
7121 
7122     // ]NOCPP]
7123 
7124 }
7125