/*
 * Copyright (c) 2005-2007 Henri Sivonen
 * Copyright (c) 2007-2017 Mozilla Foundation
 * Portions of comments Copyright 2004-2010 Apple Computer, Inc., Mozilla
 * Foundation, and Opera Software ASA.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

/*
 * THIS IS A GENERATED FILE. PLEASE DO NOT EDIT.
 * Please edit Tokenizer.java instead and regenerate.
 */

31 #ifndef nsHtml5Tokenizer_h
32 #define nsHtml5Tokenizer_h
33 
34 #include "jArray.h"
35 #include "nsAHtml5TreeBuilderState.h"
36 #include "nsAtom.h"
37 #include "nsGkAtoms.h"
38 #include "nsHtml5ArrayCopy.h"
39 #include "nsHtml5AtomTable.h"
40 #include "nsHtml5DocumentMode.h"
41 #include "nsHtml5Highlighter.h"
42 #include "nsHtml5Macros.h"
43 #include "nsHtml5NamedCharacters.h"
44 #include "nsHtml5NamedCharactersAccel.h"
45 #include "nsHtml5String.h"
46 #include "nsHtml5TokenizerLoopPolicies.h"
47 #include "nsIContent.h"
48 #include "nsTraceRefcnt.h"
49 
50 class nsHtml5StreamParser;
51 
52 class nsHtml5AttributeName;
53 class nsHtml5ElementName;
54 class nsHtml5TreeBuilder;
55 class nsHtml5UTF16Buffer;
56 class nsHtml5StateSnapshot;
57 class nsHtml5Portability;
58 
59 class nsHtml5Tokenizer {
60  private:
61   static const int32_t DATA_AND_RCDATA_MASK = ~1;
62 
63  public:
64   static const int32_t DATA = 0;
65 
66   static const int32_t RCDATA = 1;
67 
68   static const int32_t SCRIPT_DATA = 2;
69 
70   static const int32_t RAWTEXT = 3;
71 
72   static const int32_t SCRIPT_DATA_ESCAPED = 4;
73 
74   static const int32_t ATTRIBUTE_VALUE_DOUBLE_QUOTED = 5;
75 
76   static const int32_t ATTRIBUTE_VALUE_SINGLE_QUOTED = 6;
77 
78   static const int32_t ATTRIBUTE_VALUE_UNQUOTED = 7;
79 
80   static const int32_t PLAINTEXT = 8;
81 
82   static const int32_t TAG_OPEN = 9;
83 
84   static const int32_t CLOSE_TAG_OPEN = 10;
85 
86   static const int32_t TAG_NAME = 11;
87 
88   static const int32_t BEFORE_ATTRIBUTE_NAME = 12;
89 
90   static const int32_t ATTRIBUTE_NAME = 13;
91 
92   static const int32_t AFTER_ATTRIBUTE_NAME = 14;
93 
94   static const int32_t BEFORE_ATTRIBUTE_VALUE = 15;
95 
96   static const int32_t AFTER_ATTRIBUTE_VALUE_QUOTED = 16;
97 
98   static const int32_t BOGUS_COMMENT = 17;
99 
100   static const int32_t MARKUP_DECLARATION_OPEN = 18;
101 
102   static const int32_t DOCTYPE = 19;
103 
104   static const int32_t BEFORE_DOCTYPE_NAME = 20;
105 
106   static const int32_t DOCTYPE_NAME = 21;
107 
108   static const int32_t AFTER_DOCTYPE_NAME = 22;
109 
110   static const int32_t BEFORE_DOCTYPE_PUBLIC_IDENTIFIER = 23;
111 
112   static const int32_t DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED = 24;
113 
114   static const int32_t DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED = 25;
115 
116   static const int32_t AFTER_DOCTYPE_PUBLIC_IDENTIFIER = 26;
117 
118   static const int32_t BEFORE_DOCTYPE_SYSTEM_IDENTIFIER = 27;
119 
120   static const int32_t DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED = 28;
121 
122   static const int32_t DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED = 29;
123 
124   static const int32_t AFTER_DOCTYPE_SYSTEM_IDENTIFIER = 30;
125 
126   static const int32_t BOGUS_DOCTYPE = 31;
127 
128   static const int32_t COMMENT_START = 32;
129 
130   static const int32_t COMMENT_START_DASH = 33;
131 
132   static const int32_t COMMENT = 34;
133 
134   static const int32_t COMMENT_END_DASH = 35;
135 
136   static const int32_t COMMENT_END = 36;
137 
138   static const int32_t COMMENT_END_BANG = 37;
139 
140   static const int32_t NON_DATA_END_TAG_NAME = 38;
141 
142   static const int32_t MARKUP_DECLARATION_HYPHEN = 39;
143 
144   static const int32_t MARKUP_DECLARATION_OCTYPE = 40;
145 
146   static const int32_t DOCTYPE_UBLIC = 41;
147 
148   static const int32_t DOCTYPE_YSTEM = 42;
149 
150   static const int32_t AFTER_DOCTYPE_PUBLIC_KEYWORD = 43;
151 
152   static const int32_t BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS = 44;
153 
154   static const int32_t AFTER_DOCTYPE_SYSTEM_KEYWORD = 45;
155 
156   static const int32_t CONSUME_CHARACTER_REFERENCE = 46;
157 
158   static const int32_t CONSUME_NCR = 47;
159 
160   static const int32_t CHARACTER_REFERENCE_TAIL = 48;
161 
162   static const int32_t HEX_NCR_LOOP = 49;
163 
164   static const int32_t DECIMAL_NRC_LOOP = 50;
165 
166   static const int32_t HANDLE_NCR_VALUE = 51;
167 
168   static const int32_t HANDLE_NCR_VALUE_RECONSUME = 52;
169 
170   static const int32_t CHARACTER_REFERENCE_HILO_LOOKUP = 53;
171 
172   static const int32_t SELF_CLOSING_START_TAG = 54;
173 
174   static const int32_t CDATA_START = 55;
175 
176   static const int32_t CDATA_SECTION = 56;
177 
178   static const int32_t CDATA_RSQB = 57;
179 
180   static const int32_t CDATA_RSQB_RSQB = 58;
181 
182   static const int32_t SCRIPT_DATA_LESS_THAN_SIGN = 59;
183 
184   static const int32_t SCRIPT_DATA_ESCAPE_START = 60;
185 
186   static const int32_t SCRIPT_DATA_ESCAPE_START_DASH = 61;
187 
188   static const int32_t SCRIPT_DATA_ESCAPED_DASH = 62;
189 
190   static const int32_t SCRIPT_DATA_ESCAPED_DASH_DASH = 63;
191 
192   static const int32_t BOGUS_COMMENT_HYPHEN = 64;
193 
194   static const int32_t RAWTEXT_RCDATA_LESS_THAN_SIGN = 65;
195 
196   static const int32_t SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN = 66;
197 
198   static const int32_t SCRIPT_DATA_DOUBLE_ESCAPE_START = 67;
199 
200   static const int32_t SCRIPT_DATA_DOUBLE_ESCAPED = 68;
201 
202   static const int32_t SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN = 69;
203 
204   static const int32_t SCRIPT_DATA_DOUBLE_ESCAPED_DASH = 70;
205 
206   static const int32_t SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH = 71;
207 
208   static const int32_t SCRIPT_DATA_DOUBLE_ESCAPE_END = 72;
209 
210   static const int32_t PROCESSING_INSTRUCTION = 73;
211 
212   static const int32_t PROCESSING_INSTRUCTION_QUESTION_MARK = 74;
213 
214   static const int32_t COMMENT_LESSTHAN = 76;
215 
216   static const int32_t COMMENT_LESSTHAN_BANG = 77;
217 
218   static const int32_t COMMENT_LESSTHAN_BANG_DASH = 78;
219 
220   static const int32_t COMMENT_LESSTHAN_BANG_DASH_DASH = 79;
221 
222  private:
223   static const int32_t LEAD_OFFSET = (0xD800 - (0x10000 >> 10));
224 
225   static char16_t LT_GT[];
226   static char16_t LT_SOLIDUS[];
227   static char16_t RSQB_RSQB[];
228   static char16_t REPLACEMENT_CHARACTER[];
229   static char16_t LF[];
230   static char16_t CDATA_LSQB[];
231   static char16_t OCTYPE[];
232   static char16_t UBLIC[];
233   static char16_t YSTEM[];
234   static staticJArray<char16_t, int32_t> TITLE_ARR;
235   static staticJArray<char16_t, int32_t> SCRIPT_ARR;
236   static staticJArray<char16_t, int32_t> STYLE_ARR;
237   static staticJArray<char16_t, int32_t> PLAINTEXT_ARR;
238   static staticJArray<char16_t, int32_t> XMP_ARR;
239   static staticJArray<char16_t, int32_t> TEXTAREA_ARR;
240   static staticJArray<char16_t, int32_t> IFRAME_ARR;
241   static staticJArray<char16_t, int32_t> NOEMBED_ARR;
242   static staticJArray<char16_t, int32_t> NOSCRIPT_ARR;
243   static staticJArray<char16_t, int32_t> NOFRAMES_ARR;
244 
245  protected:
246   nsHtml5TreeBuilder* tokenHandler;
247   nsHtml5StreamParser* encodingDeclarationHandler;
248   bool lastCR;
249   int32_t stateSave;
250 
251  private:
252   int32_t returnStateSave;
253 
254  protected:
255   int32_t index;
256 
257  private:
258   bool forceQuirks;
259   char16_t additional;
260   int32_t entCol;
261   int32_t firstCharKey;
262   int32_t lo;
263   int32_t hi;
264   int32_t candidate;
265   int32_t charRefBufMark;
266 
267  protected:
268   int32_t value;
269 
270  private:
271   bool seenDigits;
272   bool suspendAfterCurrentNonTextToken;
273 
274  protected:
275   int32_t cstart;
276 
277  private:
278   nsHtml5String publicId;
279   nsHtml5String systemId;
280   autoJArray<char16_t, int32_t> strBuf;
281   int32_t strBufLen;
282   autoJArray<char16_t, int32_t> charRefBuf;
283   int32_t charRefBufLen;
284   autoJArray<char16_t, int32_t> bmpChar;
285   autoJArray<char16_t, int32_t> astralChar;
286 
287  protected:
288   nsHtml5ElementName* endTagExpectation;
289 
290  private:
291   jArray<char16_t, int32_t> endTagExpectationAsArray;
292 
293  protected:
294   bool endTag;
295 
296  private:
297   bool containsHyphen;
298   nsHtml5ElementName* tagName;
299   nsHtml5ElementName* nonInternedTagName;
300 
301  protected:
302   nsHtml5AttributeName* attributeName;
303 
304  private:
305   nsHtml5AttributeName* nonInternedAttributeName;
306   RefPtr<nsAtom> doctypeName;
307   nsHtml5String publicIdentifier;
308   nsHtml5String systemIdentifier;
309   nsHtml5HtmlAttributes* attributes;
310   bool newAttributesEachTime;
311   bool shouldSuspend;
312 
313  protected:
314   bool confident;
315 
316  private:
317   int32_t line;
318   int32_t attributeLine;
319   nsHtml5AtomTable* interner;
320   bool viewingXmlSource;
321 
322  public:
323   nsHtml5Tokenizer(nsHtml5TreeBuilder* tokenHandler, bool viewingXmlSource);
324   void setInterner(nsHtml5AtomTable* interner);
325   void initLocation(nsHtml5String newPublicId, nsHtml5String newSystemId);
326   bool isViewingXmlSource();
327   void setState(int32_t specialTokenizerState);
328   void setStateAndEndTagExpectation(int32_t specialTokenizerState,
329                                     nsHtml5ElementName* endTagExpectation);
330 
331  private:
332   void endTagExpectationToArray();
333 
334  public:
335   void setLineNumber(int32_t line);
getLineNumber()336   inline int32_t getLineNumber() { return line; }
337 
338   nsHtml5HtmlAttributes* emptyAttributes();
339 
340  private:
appendCharRefBuf(char16_t c)341   inline void appendCharRefBuf(char16_t c) {
342     MOZ_RELEASE_ASSERT(charRefBufLen < charRefBuf.length,
343                        "Attempted to overrun charRefBuf!");
344     charRefBuf[charRefBufLen++] = c;
345   }
346 
347   void emitOrAppendCharRefBuf(int32_t returnState);
clearStrBufAfterUse()348   inline void clearStrBufAfterUse() { strBufLen = 0; }
349 
clearStrBufBeforeUse()350   inline void clearStrBufBeforeUse() {
351     MOZ_ASSERT(!strBufLen, "strBufLen not reset after previous use!");
352     strBufLen = 0;
353   }
354 
clearStrBufAfterOneHyphen()355   inline void clearStrBufAfterOneHyphen() {
356     MOZ_ASSERT(strBufLen == 1, "strBufLen length not one!");
357     MOZ_ASSERT(strBuf[0] == '-', "strBuf does not start with a hyphen!");
358     strBufLen = 0;
359   }
360 
appendStrBuf(char16_t c)361   inline void appendStrBuf(char16_t c) {
362     MOZ_ASSERT(strBufLen < strBuf.length,
363                "Previous buffer length insufficient.");
364     if (MOZ_UNLIKELY(strBufLen == strBuf.length)) {
365       if (MOZ_UNLIKELY(!EnsureBufferSpace(1))) {
366         MOZ_CRASH("Unable to recover from buffer reallocation failure");
367       }
368     }
369     strBuf[strBufLen++] = c;
370   }
371 
372  protected:
373   nsHtml5String strBufToString();
374 
375  private:
376   void strBufToDoctypeName();
377   void emitStrBuf();
appendSecondHyphenToBogusComment()378   inline void appendSecondHyphenToBogusComment() { appendStrBuf('-'); }
379 
adjustDoubleHyphenAndAppendToStrBufAndErr(char16_t c,bool reportedConsecutiveHyphens)380   inline void adjustDoubleHyphenAndAppendToStrBufAndErr(
381       char16_t c, bool reportedConsecutiveHyphens) {
382     appendStrBuf(c);
383   }
384 
385   void appendStrBuf(char16_t* buffer, int32_t offset, int32_t length);
appendCharRefBufToStrBuf()386   inline void appendCharRefBufToStrBuf() {
387     appendStrBuf(charRefBuf, 0, charRefBufLen);
388     charRefBufLen = 0;
389   }
390 
391   void emitComment(int32_t provisionalHyphens, int32_t pos);
392 
393  protected:
394   void flushChars(char16_t* buf, int32_t pos);
395 
396  private:
397   void strBufToElementNameString();
398   int32_t emitCurrentTagToken(bool selfClosing, int32_t pos);
399   void attributeNameComplete();
400   void addAttributeWithoutValue();
401   void addAttributeWithValue();
402 
403  public:
404   void start();
405   bool tokenizeBuffer(nsHtml5UTF16Buffer* buffer);
406 
407  private:
408   template <class P>
409   int32_t stateLoop(int32_t state, char16_t c, int32_t pos, char16_t* buf,
410                     bool reconsume, int32_t returnState, int32_t endPos);
411   void initDoctypeFields();
adjustDoubleHyphenAndAppendToStrBufCarriageReturn()412   inline void adjustDoubleHyphenAndAppendToStrBufCarriageReturn() {
413     silentCarriageReturn();
414     adjustDoubleHyphenAndAppendToStrBufAndErr('\n', false);
415   }
416 
adjustDoubleHyphenAndAppendToStrBufLineFeed()417   inline void adjustDoubleHyphenAndAppendToStrBufLineFeed() {
418     silentLineFeed();
419     adjustDoubleHyphenAndAppendToStrBufAndErr('\n', false);
420   }
421 
appendStrBufLineFeed()422   inline void appendStrBufLineFeed() {
423     silentLineFeed();
424     appendStrBuf('\n');
425   }
426 
appendStrBufCarriageReturn()427   inline void appendStrBufCarriageReturn() {
428     silentCarriageReturn();
429     appendStrBuf('\n');
430   }
431 
432  protected:
silentCarriageReturn()433   inline void silentCarriageReturn() {
434     ++line;
435     lastCR = true;
436   }
437 
silentLineFeed()438   inline void silentLineFeed() { ++line; }
439 
440  private:
441   void emitCarriageReturn(char16_t* buf, int32_t pos);
442   void emitReplacementCharacter(char16_t* buf, int32_t pos);
443   void maybeEmitReplacementCharacter(char16_t* buf, int32_t pos);
444   void emitPlaintextReplacementCharacter(char16_t* buf, int32_t pos);
445   void setAdditionalAndRememberAmpersandLocation(char16_t add);
446   void bogusDoctype();
447   void bogusDoctypeWithoutQuirks();
448   void handleNcrValue(int32_t returnState);
449 
450  public:
451   void eof();
452 
453  private:
454   void emitDoctypeToken(int32_t pos);
455   void suspendIfRequestedAfterCurrentNonTextToken();
456   void suspendAfterCurrentTokenIfNotInText();
457   bool suspensionAfterCurrentNonTextTokenPending();
458 
459  protected:
checkChar(char16_t * buf,int32_t pos)460   inline char16_t checkChar(char16_t* buf, int32_t pos) { return buf[pos]; }
461 
462  public:
463   bool internalEncodingDeclaration(nsHtml5String internalCharset);
464 
465  private:
466   void emitOrAppendTwo(const char16_t* val, int32_t returnState);
467   void emitOrAppendOne(const char16_t* val, int32_t returnState);
468 
469  public:
470   void end();
471   void requestSuspension();
472   bool isInDataState();
473   void resetToDataState();
474   void loadState(nsHtml5Tokenizer* other);
475   void initializeWithoutStarting();
476   void setEncodingDeclarationHandler(
477       nsHtml5StreamParser* encodingDeclarationHandler);
478   ~nsHtml5Tokenizer();
479   static void initializeStatics();
480   static void releaseStatics();
481 
482 #include "nsHtml5TokenizerHSupplement.h"
483 };
484 
485 #endif
486