1 /* 2 * Copyright (c) 2005-2007 Henri Sivonen 3 * Copyright (c) 2007-2017 Mozilla Foundation 4 * Portions of comments Copyright 2004-2010 Apple Computer, Inc., Mozilla 5 * Foundation, and Opera Software ASA. 6 * 7 * Permission is hereby granted, free of charge, to any person obtaining a 8 * copy of this software and associated documentation files (the "Software"), 9 * to deal in the Software without restriction, including without limitation 10 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 11 * and/or sell copies of the Software, and to permit persons to whom the 12 * Software is furnished to do so, subject to the following conditions: 13 * 14 * The above copyright notice and this permission notice shall be included in 15 * all copies or substantial portions of the Software. 16 * 17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 22 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 23 * DEALINGS IN THE SOFTWARE. 24 */ 25 26 /* 27 * THIS IS A GENERATED FILE. PLEASE DO NOT EDIT. 28 * Please edit Tokenizer.java instead and regenerate. 29 */ 30 31 #ifndef nsHtml5Tokenizer_h 32 #define nsHtml5Tokenizer_h 33 34 #include "jArray.h" 35 #include "nsAHtml5TreeBuilderState.h" 36 #include "nsAtom.h" 37 #include "nsGkAtoms.h" 38 #include "nsHtml5ArrayCopy.h" 39 #include "nsHtml5AtomTable.h" 40 #include "nsHtml5DocumentMode.h" 41 #include "nsHtml5Highlighter.h" 42 #include "nsHtml5Macros.h" 43 #include "nsHtml5NamedCharacters.h" 44 #include "nsHtml5NamedCharactersAccel.h" 45 #include "nsHtml5String.h" 46 #include "nsHtml5TokenizerLoopPolicies.h" 47 #include "nsIContent.h" 48 #include "nsTraceRefcnt.h" 49 50 class nsHtml5StreamParser; 51 52 class nsHtml5AttributeName; 53 class nsHtml5ElementName; 54 class nsHtml5TreeBuilder; 55 class nsHtml5UTF16Buffer; 56 class nsHtml5StateSnapshot; 57 class nsHtml5Portability; 58 59 class nsHtml5Tokenizer { 60 private: 61 static const int32_t DATA_AND_RCDATA_MASK = ~1; 62 63 public: 64 static const int32_t DATA = 0; 65 66 static const int32_t RCDATA = 1; 67 68 static const int32_t SCRIPT_DATA = 2; 69 70 static const int32_t RAWTEXT = 3; 71 72 static const int32_t SCRIPT_DATA_ESCAPED = 4; 73 74 static const int32_t ATTRIBUTE_VALUE_DOUBLE_QUOTED = 5; 75 76 static const int32_t ATTRIBUTE_VALUE_SINGLE_QUOTED = 6; 77 78 static const int32_t ATTRIBUTE_VALUE_UNQUOTED = 7; 79 80 static const int32_t PLAINTEXT = 8; 81 82 static const int32_t TAG_OPEN = 9; 83 84 static const int32_t CLOSE_TAG_OPEN = 10; 85 86 static const int32_t TAG_NAME = 11; 87 88 static const int32_t BEFORE_ATTRIBUTE_NAME = 12; 89 90 static const int32_t ATTRIBUTE_NAME = 13; 91 92 static const int32_t AFTER_ATTRIBUTE_NAME = 14; 93 94 static const int32_t BEFORE_ATTRIBUTE_VALUE = 15; 95 96 static const int32_t AFTER_ATTRIBUTE_VALUE_QUOTED = 16; 97 98 static const int32_t BOGUS_COMMENT = 17; 99 100 static const int32_t MARKUP_DECLARATION_OPEN = 18; 101 102 static const int32_t DOCTYPE = 19; 103 104 static const int32_t BEFORE_DOCTYPE_NAME = 20; 105 106 static const int32_t DOCTYPE_NAME = 21; 107 108 static const int32_t AFTER_DOCTYPE_NAME = 22; 109 110 static const int32_t BEFORE_DOCTYPE_PUBLIC_IDENTIFIER = 23; 111 112 static const int32_t DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED = 24; 113 114 static const int32_t DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED = 25; 115 116 static const int32_t AFTER_DOCTYPE_PUBLIC_IDENTIFIER = 26; 117 118 static const int32_t BEFORE_DOCTYPE_SYSTEM_IDENTIFIER = 27; 119 120 static const int32_t DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED = 28; 121 122 static const int32_t DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED = 29; 123 124 static const int32_t AFTER_DOCTYPE_SYSTEM_IDENTIFIER = 30; 125 126 static const int32_t BOGUS_DOCTYPE = 31; 127 128 static const int32_t COMMENT_START = 32; 129 130 static const int32_t COMMENT_START_DASH = 33; 131 132 static const int32_t COMMENT = 34; 133 134 static const int32_t COMMENT_END_DASH = 35; 135 136 static const int32_t COMMENT_END = 36; 137 138 static const int32_t COMMENT_END_BANG = 37; 139 140 static const int32_t NON_DATA_END_TAG_NAME = 38; 141 142 static const int32_t MARKUP_DECLARATION_HYPHEN = 39; 143 144 static const int32_t MARKUP_DECLARATION_OCTYPE = 40; 145 146 static const int32_t DOCTYPE_UBLIC = 41; 147 148 static const int32_t DOCTYPE_YSTEM = 42; 149 150 static const int32_t AFTER_DOCTYPE_PUBLIC_KEYWORD = 43; 151 152 static const int32_t BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS = 44; 153 154 static const int32_t AFTER_DOCTYPE_SYSTEM_KEYWORD = 45; 155 156 static const int32_t CONSUME_CHARACTER_REFERENCE = 46; 157 158 static const int32_t CONSUME_NCR = 47; 159 160 static const int32_t CHARACTER_REFERENCE_TAIL = 48; 161 162 static const int32_t HEX_NCR_LOOP = 49; 163 164 static const int32_t DECIMAL_NRC_LOOP = 50; 165 166 static const int32_t HANDLE_NCR_VALUE = 51; 167 168 static const int32_t HANDLE_NCR_VALUE_RECONSUME = 52; 169 170 static const int32_t CHARACTER_REFERENCE_HILO_LOOKUP = 53; 171 172 static const int32_t SELF_CLOSING_START_TAG = 54; 173 174 static const int32_t CDATA_START = 55; 175 176 static const int32_t CDATA_SECTION = 56; 177 178 static const int32_t CDATA_RSQB = 57; 179 180 static const int32_t CDATA_RSQB_RSQB = 58; 181 182 static const int32_t SCRIPT_DATA_LESS_THAN_SIGN = 59; 183 184 static const int32_t SCRIPT_DATA_ESCAPE_START = 60; 185 186 static const int32_t SCRIPT_DATA_ESCAPE_START_DASH = 61; 187 188 static const int32_t SCRIPT_DATA_ESCAPED_DASH = 62; 189 190 static const int32_t SCRIPT_DATA_ESCAPED_DASH_DASH = 63; 191 192 static const int32_t BOGUS_COMMENT_HYPHEN = 64; 193 194 static const int32_t RAWTEXT_RCDATA_LESS_THAN_SIGN = 65; 195 196 static const int32_t SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN = 66; 197 198 static const int32_t SCRIPT_DATA_DOUBLE_ESCAPE_START = 67; 199 200 static const int32_t SCRIPT_DATA_DOUBLE_ESCAPED = 68; 201 202 static const int32_t SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN = 69; 203 204 static const int32_t SCRIPT_DATA_DOUBLE_ESCAPED_DASH = 70; 205 206 static const int32_t SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH = 71; 207 208 static const int32_t SCRIPT_DATA_DOUBLE_ESCAPE_END = 72; 209 210 static const int32_t PROCESSING_INSTRUCTION = 73; 211 212 static const int32_t PROCESSING_INSTRUCTION_QUESTION_MARK = 74; 213 214 static const int32_t COMMENT_LESSTHAN = 76; 215 216 static const int32_t COMMENT_LESSTHAN_BANG = 77; 217 218 static const int32_t COMMENT_LESSTHAN_BANG_DASH = 78; 219 220 static const int32_t COMMENT_LESSTHAN_BANG_DASH_DASH = 79; 221 222 private: 223 static const int32_t LEAD_OFFSET = (0xD800 - (0x10000 >> 10)); 224 225 static char16_t LT_GT[]; 226 static char16_t LT_SOLIDUS[]; 227 static char16_t RSQB_RSQB[]; 228 static char16_t REPLACEMENT_CHARACTER[]; 229 static char16_t LF[]; 230 static char16_t CDATA_LSQB[]; 231 static char16_t OCTYPE[]; 232 static char16_t UBLIC[]; 233 static char16_t YSTEM[]; 234 static staticJArray<char16_t, int32_t> TITLE_ARR; 235 static staticJArray<char16_t, int32_t> SCRIPT_ARR; 236 static staticJArray<char16_t, int32_t> STYLE_ARR; 237 static staticJArray<char16_t, int32_t> PLAINTEXT_ARR; 238 static staticJArray<char16_t, int32_t> XMP_ARR; 239 static staticJArray<char16_t, int32_t> TEXTAREA_ARR; 240 static staticJArray<char16_t, int32_t> IFRAME_ARR; 241 static staticJArray<char16_t, int32_t> NOEMBED_ARR; 242 static staticJArray<char16_t, int32_t> NOSCRIPT_ARR; 243 static staticJArray<char16_t, int32_t> NOFRAMES_ARR; 244 245 protected: 246 nsHtml5TreeBuilder* tokenHandler; 247 nsHtml5StreamParser* encodingDeclarationHandler; 248 bool lastCR; 249 int32_t stateSave; 250 251 private: 252 int32_t returnStateSave; 253 254 protected: 255 int32_t index; 256 257 private: 258 bool forceQuirks; 259 char16_t additional; 260 int32_t entCol; 261 int32_t firstCharKey; 262 int32_t lo; 263 int32_t hi; 264 int32_t candidate; 265 int32_t charRefBufMark; 266 267 protected: 268 int32_t value; 269 270 private: 271 bool seenDigits; 272 bool suspendAfterCurrentNonTextToken; 273 274 protected: 275 int32_t cstart; 276 277 private: 278 nsHtml5String publicId; 279 nsHtml5String systemId; 280 autoJArray<char16_t, int32_t> strBuf; 281 int32_t strBufLen; 282 autoJArray<char16_t, int32_t> charRefBuf; 283 int32_t charRefBufLen; 284 autoJArray<char16_t, int32_t> bmpChar; 285 autoJArray<char16_t, int32_t> astralChar; 286 287 protected: 288 nsHtml5ElementName* endTagExpectation; 289 290 private: 291 jArray<char16_t, int32_t> endTagExpectationAsArray; 292 293 protected: 294 bool endTag; 295 296 private: 297 bool containsHyphen; 298 nsHtml5ElementName* tagName; 299 nsHtml5ElementName* nonInternedTagName; 300 301 protected: 302 nsHtml5AttributeName* attributeName; 303 304 private: 305 nsHtml5AttributeName* nonInternedAttributeName; 306 RefPtr<nsAtom> doctypeName; 307 nsHtml5String publicIdentifier; 308 nsHtml5String systemIdentifier; 309 nsHtml5HtmlAttributes* attributes; 310 bool newAttributesEachTime; 311 bool shouldSuspend; 312 313 protected: 314 bool confident; 315 316 private: 317 int32_t line; 318 int32_t attributeLine; 319 nsHtml5AtomTable* interner; 320 bool viewingXmlSource; 321 322 public: 323 nsHtml5Tokenizer(nsHtml5TreeBuilder* tokenHandler, bool viewingXmlSource); 324 void setInterner(nsHtml5AtomTable* interner); 325 void initLocation(nsHtml5String newPublicId, nsHtml5String newSystemId); 326 bool isViewingXmlSource(); 327 void setState(int32_t specialTokenizerState); 328 void setStateAndEndTagExpectation(int32_t specialTokenizerState, 329 nsHtml5ElementName* endTagExpectation); 330 331 private: 332 void endTagExpectationToArray(); 333 334 public: 335 void setLineNumber(int32_t line); getLineNumber()336 inline int32_t getLineNumber() { return line; } 337 338 nsHtml5HtmlAttributes* emptyAttributes(); 339 340 private: appendCharRefBuf(char16_t c)341 inline void appendCharRefBuf(char16_t c) { 342 MOZ_RELEASE_ASSERT(charRefBufLen < charRefBuf.length, 343 "Attempted to overrun charRefBuf!"); 344 charRefBuf[charRefBufLen++] = c; 345 } 346 347 void emitOrAppendCharRefBuf(int32_t returnState); clearStrBufAfterUse()348 inline void clearStrBufAfterUse() { strBufLen = 0; } 349 clearStrBufBeforeUse()350 inline void clearStrBufBeforeUse() { 351 MOZ_ASSERT(!strBufLen, "strBufLen not reset after previous use!"); 352 strBufLen = 0; 353 } 354 clearStrBufAfterOneHyphen()355 inline void clearStrBufAfterOneHyphen() { 356 MOZ_ASSERT(strBufLen == 1, "strBufLen length not one!"); 357 MOZ_ASSERT(strBuf[0] == '-', "strBuf does not start with a hyphen!"); 358 strBufLen = 0; 359 } 360 appendStrBuf(char16_t c)361 inline void appendStrBuf(char16_t c) { 362 MOZ_ASSERT(strBufLen < strBuf.length, 363 "Previous buffer length insufficient."); 364 if (MOZ_UNLIKELY(strBufLen == strBuf.length)) { 365 if (MOZ_UNLIKELY(!EnsureBufferSpace(1))) { 366 MOZ_CRASH("Unable to recover from buffer reallocation failure"); 367 } 368 } 369 strBuf[strBufLen++] = c; 370 } 371 372 protected: 373 nsHtml5String strBufToString(); 374 375 private: 376 void strBufToDoctypeName(); 377 void emitStrBuf(); appendSecondHyphenToBogusComment()378 inline void appendSecondHyphenToBogusComment() { appendStrBuf('-'); } 379 adjustDoubleHyphenAndAppendToStrBufAndErr(char16_t c,bool reportedConsecutiveHyphens)380 inline void adjustDoubleHyphenAndAppendToStrBufAndErr( 381 char16_t c, bool reportedConsecutiveHyphens) { 382 appendStrBuf(c); 383 } 384 385 void appendStrBuf(char16_t* buffer, int32_t offset, int32_t length); appendCharRefBufToStrBuf()386 inline void appendCharRefBufToStrBuf() { 387 appendStrBuf(charRefBuf, 0, charRefBufLen); 388 charRefBufLen = 0; 389 } 390 391 void emitComment(int32_t provisionalHyphens, int32_t pos); 392 393 protected: 394 void flushChars(char16_t* buf, int32_t pos); 395 396 private: 397 void strBufToElementNameString(); 398 int32_t emitCurrentTagToken(bool selfClosing, int32_t pos); 399 void attributeNameComplete(); 400 void addAttributeWithoutValue(); 401 void addAttributeWithValue(); 402 403 public: 404 void start(); 405 bool tokenizeBuffer(nsHtml5UTF16Buffer* buffer); 406 407 private: 408 template <class P> 409 int32_t stateLoop(int32_t state, char16_t c, int32_t pos, char16_t* buf, 410 bool reconsume, int32_t returnState, int32_t endPos); 411 void initDoctypeFields(); adjustDoubleHyphenAndAppendToStrBufCarriageReturn()412 inline void adjustDoubleHyphenAndAppendToStrBufCarriageReturn() { 413 silentCarriageReturn(); 414 adjustDoubleHyphenAndAppendToStrBufAndErr('\n', false); 415 } 416 adjustDoubleHyphenAndAppendToStrBufLineFeed()417 inline void adjustDoubleHyphenAndAppendToStrBufLineFeed() { 418 silentLineFeed(); 419 adjustDoubleHyphenAndAppendToStrBufAndErr('\n', false); 420 } 421 appendStrBufLineFeed()422 inline void appendStrBufLineFeed() { 423 silentLineFeed(); 424 appendStrBuf('\n'); 425 } 426 appendStrBufCarriageReturn()427 inline void appendStrBufCarriageReturn() { 428 silentCarriageReturn(); 429 appendStrBuf('\n'); 430 } 431 432 protected: silentCarriageReturn()433 inline void silentCarriageReturn() { 434 ++line; 435 lastCR = true; 436 } 437 silentLineFeed()438 inline void silentLineFeed() { ++line; } 439 440 private: 441 void emitCarriageReturn(char16_t* buf, int32_t pos); 442 void emitReplacementCharacter(char16_t* buf, int32_t pos); 443 void maybeEmitReplacementCharacter(char16_t* buf, int32_t pos); 444 void emitPlaintextReplacementCharacter(char16_t* buf, int32_t pos); 445 void setAdditionalAndRememberAmpersandLocation(char16_t add); 446 void bogusDoctype(); 447 void bogusDoctypeWithoutQuirks(); 448 void handleNcrValue(int32_t returnState); 449 450 public: 451 void eof(); 452 453 private: 454 void emitDoctypeToken(int32_t pos); 455 void suspendIfRequestedAfterCurrentNonTextToken(); 456 void suspendAfterCurrentTokenIfNotInText(); 457 bool suspensionAfterCurrentNonTextTokenPending(); 458 459 protected: checkChar(char16_t * buf,int32_t pos)460 inline char16_t checkChar(char16_t* buf, int32_t pos) { return buf[pos]; } 461 462 public: 463 bool internalEncodingDeclaration(nsHtml5String internalCharset); 464 465 private: 466 void emitOrAppendTwo(const char16_t* val, int32_t returnState); 467 void emitOrAppendOne(const char16_t* val, int32_t returnState); 468 469 public: 470 void end(); 471 void requestSuspension(); 472 bool isInDataState(); 473 void resetToDataState(); 474 void loadState(nsHtml5Tokenizer* other); 475 void initializeWithoutStarting(); 476 void setEncodingDeclarationHandler( 477 nsHtml5StreamParser* encodingDeclarationHandler); 478 ~nsHtml5Tokenizer(); 479 static void initializeStatics(); 480 static void releaseStatics(); 481 482 #include "nsHtml5TokenizerHSupplement.h" 483 }; 484 485 #endif 486