1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 /* This Source Code Form is subject to the terms of the Mozilla Public 3 * License, v. 2.0. If a copy of the MPL was not distributed with this 4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 5 6 #ifndef nsHtml5StreamParser_h 7 #define nsHtml5StreamParser_h 8 9 #include "MainThreadUtils.h" 10 #include "mozilla/AlreadyAddRefed.h" 11 #include "mozilla/Assertions.h" 12 #include "mozilla/Encoding.h" 13 #include "mozilla/Mutex.h" 14 #include "mozilla/NotNull.h" 15 #include "mozilla/RefPtr.h" 16 #include "mozilla/Span.h" 17 #include "mozilla/UniquePtr.h" 18 #include "nsCharsetSource.h" 19 #include "nsCOMPtr.h" 20 #include "nsCycleCollectionParticipant.h" 21 #include "nsDebug.h" 22 #include "nsHtml5AtomTable.h" 23 #include "nsIRequestObserver.h" 24 #include "nsISerialEventTarget.h" 25 #include "nsISupports.h" 26 #include "nsStringFwd.h" 27 #include "nsTArray.h" 28 #include "nscore.h" 29 30 class nsCycleCollectionTraversalCallback; 31 class nsHtml5MetaScanner; 32 class nsHtml5OwningUTF16Buffer; 33 class nsHtml5Parser; 34 class nsHtml5Speculation; 35 class nsHtml5String; 36 class nsHtml5Tokenizer; 37 class nsHtml5TreeBuilder; 38 class nsHtml5TreeOpExecutor; 39 class nsIChannel; 40 class nsIInputStream; 41 class nsIRequest; 42 class nsIRunnable; 43 class nsITimer; 44 class nsIURI; 45 46 namespace mozilla { 47 class EncodingDetector; 48 template <typename T> 49 class Buffer; 50 51 namespace dom { 52 class DocGroup; 53 } 54 } // namespace mozilla 55 56 enum eParserMode { 57 /** 58 * Parse a document normally as HTML. 59 */ 60 NORMAL, 61 62 /** 63 * View document as HTML source. 64 */ 65 VIEW_SOURCE_HTML, 66 67 /** 68 * View document as XML source 69 */ 70 VIEW_SOURCE_XML, 71 72 /** 73 * View document as plain text source 74 */ 75 VIEW_SOURCE_PLAIN, 76 77 /** 78 * View document as plain text 79 */ 80 PLAIN_TEXT, 81 82 /** 83 * Load as data (XHR) 84 */ 85 LOAD_AS_DATA 86 }; 87 88 enum eBomState { 89 /** 90 * BOM sniffing hasn't started. 91 */ 92 BOM_SNIFFING_NOT_STARTED, 93 94 /** 95 * BOM sniffing is ongoing, and the first byte of an UTF-16LE BOM has been 96 * seen. 97 */ 98 SEEN_UTF_16_LE_FIRST_BYTE, 99 100 /** 101 * BOM sniffing is ongoing, and the first byte of an UTF-16BE BOM has been 102 * seen. 103 */ 104 SEEN_UTF_16_BE_FIRST_BYTE, 105 106 /** 107 * BOM sniffing is ongoing, and the first byte of an UTF-8 BOM has been 108 * seen. 109 */ 110 SEEN_UTF_8_FIRST_BYTE, 111 112 /** 113 * BOM sniffing is ongoing, and the first and second bytes of an UTF-8 BOM 114 * have been seen. 115 */ 116 SEEN_UTF_8_SECOND_BYTE, 117 118 /** 119 * Seen \x00 in UTF-16BE bogo-XML declaration. 120 */ 121 SEEN_UTF_16_BE_XML_FIRST, 122 123 /** 124 * Seen \x00< in UTF-16BE bogo-XML declaration. 125 */ 126 SEEN_UTF_16_BE_XML_SECOND, 127 128 /** 129 * Seen \x00<\x00 in UTF-16BE bogo-XML declaration. 130 */ 131 SEEN_UTF_16_BE_XML_THIRD, 132 133 /** 134 * Seen \x00<\x00? in UTF-16BE bogo-XML declaration. 135 */ 136 SEEN_UTF_16_BE_XML_FOURTH, 137 138 /** 139 * Seen \x00<\x00?\x00 in UTF-16BE bogo-XML declaration. 140 */ 141 SEEN_UTF_16_BE_XML_FIFTH, 142 143 /** 144 * Seen < in UTF-16BE bogo-XML declaration. 145 */ 146 SEEN_UTF_16_LE_XML_FIRST, 147 148 /** 149 * Seen <\x00 in UTF-16BE bogo-XML declaration. 150 */ 151 SEEN_UTF_16_LE_XML_SECOND, 152 153 /** 154 * Seen <\x00? in UTF-16BE bogo-XML declaration. 155 */ 156 SEEN_UTF_16_LE_XML_THIRD, 157 158 /** 159 * Seen <\x00?\x00 in UTF-16BE bogo-XML declaration. 160 */ 161 SEEN_UTF_16_LE_XML_FOURTH, 162 163 /** 164 * Seen <\x00?\x00x in UTF-16BE bogo-XML declaration. 165 */ 166 SEEN_UTF_16_LE_XML_FIFTH, 167 168 /** 169 * BOM sniffing was started but is now over for whatever reason. 170 */ 171 BOM_SNIFFING_OVER, 172 }; 173 174 enum eHtml5StreamState { 175 STREAM_NOT_STARTED = 0, 176 STREAM_BEING_READ = 1, 177 STREAM_ENDED = 2 178 }; 179 180 class nsHtml5StreamParser final : public nsISupports { 181 template <typename T> 182 using NotNull = mozilla::NotNull<T>; 183 using Encoding = mozilla::Encoding; 184 185 const uint32_t SNIFFING_BUFFER_SIZE = 1024; 186 const uint32_t READ_BUFFER_SIZE = 1024; 187 const uint32_t LOCAL_FILE_UTF_8_BUFFER_SIZE = 1024 * 1024 * 4; // 4 MB 188 189 friend class nsHtml5RequestStopper; 190 friend class nsHtml5DataAvailable; 191 friend class nsHtml5StreamParserContinuation; 192 friend class nsHtml5TimerKungFu; 193 friend class nsHtml5StreamParserPtr; 194 friend class nsHtml5StreamListener; 195 196 public: 197 NS_DECL_CYCLE_COLLECTING_ISUPPORTS 198 NS_DECL_CYCLE_COLLECTION_CLASS(nsHtml5StreamParser) 199 200 nsHtml5StreamParser(nsHtml5TreeOpExecutor* aExecutor, nsHtml5Parser* aOwner, 201 eParserMode aMode); 202 203 // Methods that nsHtml5StreamListener calls 204 nsresult CheckListenerChain(); 205 206 nsresult OnStartRequest(nsIRequest* aRequest); 207 208 nsresult OnDataAvailable(nsIRequest* aRequest, nsIInputStream* aInStream, 209 uint64_t aSourceOffset, uint32_t aLength); 210 211 nsresult OnStopRequest(nsIRequest* aRequest, nsresult status); 212 213 // EncodingDeclarationHandler 214 // https://hg.mozilla.org/projects/htmlparser/file/tip/src/nu/validator/htmlparser/common/EncodingDeclarationHandler.java 215 /** 216 * Tree builder uses this to report a late <meta charset> 217 */ 218 bool internalEncodingDeclaration(nsHtml5String aEncoding); 219 220 // Not from an external interface 221 222 /** 223 * Pass a buffer to the Japanese or Cyrillic detector as appropriate. 224 */ 225 void FeedDetector(mozilla::Span<const uint8_t> aBuffer, bool aLast); 226 227 /** 228 * Call this method once you've created a parser, and want to instruct it 229 * about what charset to load 230 * 231 * @param aEncoding the charset of a document 232 * @param aCharsetSource the source of the charset 233 */ SetDocumentCharset(NotNull<const Encoding * > aEncoding,int32_t aSource,bool aChannelHadCharset)234 inline void SetDocumentCharset(NotNull<const Encoding*> aEncoding, 235 int32_t aSource, bool aChannelHadCharset) { 236 MOZ_ASSERT(mStreamState == STREAM_NOT_STARTED, 237 "SetDocumentCharset called too late."); 238 NS_ASSERTION(NS_IsMainThread(), "Wrong thread!"); 239 MOZ_ASSERT(!(aSource == kCharsetFromChannel && !aChannelHadCharset), 240 "If charset is from channel, channel must have had charset."); 241 mEncoding = aEncoding; 242 mCharsetSource = aSource; 243 mChannelHadCharset = aChannelHadCharset; 244 } 245 SetObserver(nsIRequestObserver * aObserver)246 inline void SetObserver(nsIRequestObserver* aObserver) { 247 NS_ASSERTION(NS_IsMainThread(), "Wrong thread!"); 248 mObserver = aObserver; 249 } 250 251 nsresult GetChannel(nsIChannel** aChannel); 252 253 /** 254 * The owner parser must call this after script execution 255 * when no scripts are executing and the document.written 256 * buffer has been exhausted. 257 */ 258 void ContinueAfterScripts(nsHtml5Tokenizer* aTokenizer, 259 nsHtml5TreeBuilder* aTreeBuilder, bool aLastWasCR); 260 261 /** 262 * Continues the stream parser if the charset switch failed. 263 */ 264 void ContinueAfterFailedCharsetSwitch(); 265 Terminate()266 void Terminate() { 267 mozilla::MutexAutoLock autoLock(mTerminatedMutex); 268 mTerminated = true; 269 } 270 271 void DropTimer(); 272 273 /** 274 * Sets mEncoding and mCharsetSource appropriately for the XML View Source 275 * case if aEncoding names a supported rough ASCII superset and sets 276 * the mEncoding and mCharsetSource to the UTF-8 default otherwise. 277 */ 278 void SetEncodingFromExpat(const char16_t* aEncoding); 279 280 /** 281 * Sets the URL for View Source title in case this parser ends up being 282 * used for View Source. If aURL is a view-source: URL, takes the inner 283 * URL. data: URLs are shown with an ellipsis instead of the actual data. 284 */ 285 void SetViewSourceTitle(nsIURI* aURL); 286 287 private: 288 virtual ~nsHtml5StreamParser(); 289 290 #ifdef DEBUG IsParserThread()291 bool IsParserThread() { return mEventTarget->IsOnCurrentThread(); } 292 #endif 293 294 void MarkAsBroken(nsresult aRv); 295 296 /** 297 * Marks the stream parser as interrupted. If you ever add calls to this 298 * method, be sure to review Uninterrupt usage very, very carefully to 299 * avoid having a previous in-flight runnable cancel your Interrupt() 300 * call on the other thread too soon. 301 */ Interrupt()302 void Interrupt() { 303 mozilla::MutexAutoLock autoLock(mTerminatedMutex); 304 mInterrupted = true; 305 } 306 Uninterrupt()307 void Uninterrupt() { 308 NS_ASSERTION(IsParserThread(), "Wrong thread!"); 309 mTokenizerMutex.AssertCurrentThreadOwns(); 310 // Not acquiring mTerminatedMutex because mTokenizerMutex is already 311 // held at this point and is already stronger. 312 mInterrupted = false; 313 } 314 315 /** 316 * Flushes the tree ops from the tree builder and disarms the flush 317 * timer. 318 */ 319 void FlushTreeOpsAndDisarmTimer(); 320 321 void ParseAvailableData(); 322 323 void DoStopRequest(); 324 325 void DoDataAvailableBuffer(mozilla::Buffer<uint8_t>&& aBuffer); 326 327 void DoDataAvailable(mozilla::Span<const uint8_t> aBuffer); 328 329 static nsresult CopySegmentsToParser(nsIInputStream* aInStream, 330 void* aClosure, const char* aFromSegment, 331 uint32_t aToOffset, uint32_t aCount, 332 uint32_t* aWriteCount); 333 IsTerminatedOrInterrupted()334 bool IsTerminatedOrInterrupted() { 335 mozilla::MutexAutoLock autoLock(mTerminatedMutex); 336 return mTerminated || mInterrupted; 337 } 338 IsTerminated()339 bool IsTerminated() { 340 mozilla::MutexAutoLock autoLock(mTerminatedMutex); 341 return mTerminated; 342 } 343 344 /** 345 * True when there is a Unicode decoder already 346 */ HasDecoder()347 inline bool HasDecoder() { return !!mUnicodeDecoder; } 348 349 /** 350 * Push bytes from network when there is no Unicode decoder yet 351 */ 352 nsresult SniffStreamBytes(mozilla::Span<const uint8_t> aFromSegment); 353 354 /** 355 * Push bytes from network when there is a Unicode decoder already 356 */ 357 nsresult WriteStreamBytes(mozilla::Span<const uint8_t> aFromSegment); 358 359 /** 360 * Check whether every other byte in the sniffing buffer is zero. 361 */ 362 void SniffBOMlessUTF16BasicLatin(const uint8_t* aBuf, size_t aBufLen); 363 364 /** 365 * Write the start of the stream to detector. 366 */ 367 void FinalizeSniffingWithDetector(mozilla::Span<const uint8_t> aFromSegment, 368 uint32_t aCountToSniffingLimit, bool aEof); 369 370 /** 371 * <meta charset> scan failed. Try chardet if applicable. After this, the 372 * the parser will have some encoding even if a last resolt fallback. 373 * 374 * @param aFromSegment The current network buffer 375 * @param aCountToSniffingLimit The number of unfilled slots in 376 * mSniffingBuffer 377 * @param aEof true iff called upon end of stream 378 */ 379 nsresult FinalizeSniffing(mozilla::Span<const uint8_t> aFromSegment, 380 uint32_t aCountToSniffingLimit, bool aEof); 381 382 /** 383 * Set up the Unicode decoder and write the sniffing buffer into it 384 * followed by the current network buffer. 385 * 386 * @param aFromSegment The current network buffer 387 */ 388 nsresult SetupDecodingAndWriteSniffingBufferAndCurrentSegment( 389 mozilla::Span<const uint8_t> aFromSegment); 390 391 /** 392 * Initialize the Unicode decoder, mark the BOM as the source and 393 * drop the sniffer. 394 * 395 * @param aDecoderCharsetName The name for the decoder's charset 396 * (UTF-16BE, UTF-16LE or UTF-8; the BOM has 397 * been swallowed) 398 */ 399 void SetupDecodingFromBom(NotNull<const Encoding*> aEncoding); 400 401 void SetupDecodingFromUtf16BogoXml(NotNull<const Encoding*> aEncoding); 402 403 /** 404 * When speculatively decoding from file: URL as UTF-8, commit 405 * to UTF-8 as the non-speculative encoding and start processing 406 * the decoded data. 407 */ 408 void CommitLocalFileToEncoding(); 409 410 /** 411 * When speculatively decoding from file: URL as UTF-8, redecode 412 * using fallback and then continue normally with the fallback. 413 */ 414 void ReDecodeLocalFile(); 415 416 /** 417 * Change a final autodetection source to the corresponding initial one. 418 */ 419 int32_t MaybeRollBackSource(int32_t aSource); 420 421 /** 422 * Potentially guess the encoding using mozilla::EncodingDetector. 423 */ 424 void GuessEncoding(bool aEof, bool aInitial); 425 DontGuessEncoding()426 inline void DontGuessEncoding() { 427 mFeedChardet = false; 428 mGuessEncoding = false; 429 if (mDecodingLocalFileWithoutTokenizing) { 430 CommitLocalFileToEncoding(); 431 } 432 } 433 434 /** 435 * Become confident or resolve and encoding name to its preferred form. 436 * @param aEncoding the value of an internal encoding decl. Acts as an 437 * out param, too, when the method returns true. 438 * @return true if the parser needs to start using the new value of 439 * aEncoding and false if the parser became confident or if 440 * the encoding name did not specify a usable encoding 441 */ 442 const Encoding* PreferredForInternalEncodingDecl(const nsACString& aEncoding); 443 444 /** 445 * Callback for mFlushTimer. 446 */ 447 static void TimerCallback(nsITimer* aTimer, void* aClosure); 448 449 /** 450 * Parser thread entry point for (maybe) flushing the ops and posting 451 * a flush runnable back on the main thread. 452 */ 453 void TimerFlush(); 454 455 /** 456 * Called when speculation fails. 457 */ MaybeDisableFutureSpeculation()458 void MaybeDisableFutureSpeculation() { mSpeculationFailureCount++; } 459 460 /** 461 * Used to check whether we're getting too many speculation failures and 462 * should just stop trying. The 100 is picked pretty randomly to be not too 463 * small (so most pages are not affected) but small enough that we don't end 464 * up with failed speculations over and over in pathological cases. 465 */ IsSpeculationEnabled()466 bool IsSpeculationEnabled() { return mSpeculationFailureCount < 100; } 467 468 /** 469 * Dispatch an event to a Quantum DOM main thread-ish thread. 470 * (Not the parser thread.) 471 */ 472 nsresult DispatchToMain(already_AddRefed<nsIRunnable>&& aRunnable); 473 474 /** 475 * Notify any devtools listeners about content newly received for parsing. 476 */ 477 inline void OnNewContent(mozilla::Span<const char16_t> aData); 478 479 /** 480 * Notify any devtools listeners after all parse content has been received. 481 */ 482 inline void OnContentComplete(); 483 484 nsCOMPtr<nsIRequest> mRequest; 485 nsCOMPtr<nsIRequestObserver> mObserver; 486 487 /** 488 * The document title to use if this turns out to be a View Source parser. 489 */ 490 nsCString mViewSourceTitle; 491 492 /** 493 * The Unicode decoder 494 */ 495 mozilla::UniquePtr<mozilla::Decoder> mUnicodeDecoder; 496 497 /** 498 * The buffer for sniffing the character encoding 499 */ 500 mozilla::UniquePtr<uint8_t[]> mSniffingBuffer; 501 502 /** 503 * The number of meaningful bytes in mSniffingBuffer 504 */ 505 uint32_t mSniffingLength; 506 507 /** 508 * BOM sniffing state 509 */ 510 eBomState mBomState; 511 512 /** 513 * <meta> prescan implementation 514 */ 515 mozilla::UniquePtr<nsHtml5MetaScanner> mMetaScanner; 516 517 // encoding-related stuff 518 /** 519 * The source (confidence) of the character encoding in use 520 */ 521 int32_t mCharsetSource; 522 523 /** 524 * The character encoding in use 525 */ 526 NotNull<const Encoding*> mEncoding; 527 528 /** 529 * Whether the generic or Japanese detector should still be fed. 530 */ 531 bool mFeedChardet; 532 533 /** 534 * Whether the generic detector should be still queried for its guess. 535 */ 536 bool mGuessEncoding; 537 538 /** 539 * Whether reparse is forbidden 540 */ 541 bool mReparseForbidden; 542 543 /** 544 * Whether the channel had charset. 545 */ 546 bool mChannelHadCharset; 547 548 // Portable parser objects 549 /** 550 * The first buffer in the pending UTF-16 buffer queue 551 */ 552 RefPtr<nsHtml5OwningUTF16Buffer> mFirstBuffer; 553 554 /** 555 * The last buffer in the pending UTF-16 buffer queue 556 */ 557 nsHtml5OwningUTF16Buffer* 558 mLastBuffer; // weak ref; always points to 559 // a buffer of the size 560 // NS_HTML5_STREAM_PARSER_READ_BUFFER_SIZE 561 562 /** 563 * The tree operation executor 564 */ 565 nsHtml5TreeOpExecutor* mExecutor; 566 567 /** 568 * Network event target for mExecutor->mDocument 569 */ 570 nsCOMPtr<nsISerialEventTarget> mNetworkEventTarget; 571 572 /** 573 * The HTML5 tree builder 574 */ 575 mozilla::UniquePtr<nsHtml5TreeBuilder> mTreeBuilder; 576 577 /** 578 * The HTML5 tokenizer 579 */ 580 mozilla::UniquePtr<nsHtml5Tokenizer> mTokenizer; 581 582 /** 583 * Makes sure the main thread can't mess the tokenizer state while it's 584 * tokenizing. This mutex also protects the current speculation. 585 */ 586 mozilla::Mutex mTokenizerMutex; 587 588 /** 589 * The scoped atom table 590 */ 591 nsHtml5AtomTable mAtomTable; 592 593 /** 594 * The owner parser. 595 */ 596 RefPtr<nsHtml5Parser> mOwner; 597 598 /** 599 * Whether the last character tokenized was a carriage return (for CRLF) 600 */ 601 bool mLastWasCR; 602 603 /** 604 * For tracking stream life cycle 605 */ 606 eHtml5StreamState mStreamState; 607 608 /** 609 * Whether we are speculating. 610 */ 611 bool mSpeculating; 612 613 /** 614 * Whether the tokenizer has reached EOF. (Reset when stream rewinded.) 615 */ 616 bool mAtEOF; 617 618 /** 619 * The speculations. The mutex protects the nsTArray itself. 620 * To access the queue of current speculation, mTokenizerMutex must be 621 * obtained. 622 * The current speculation is the last element 623 */ 624 nsTArray<mozilla::UniquePtr<nsHtml5Speculation>> mSpeculations; 625 mozilla::Mutex mSpeculationMutex; 626 627 /** 628 * Number of times speculation has failed for this parser. 629 */ 630 uint32_t mSpeculationFailureCount; 631 632 /** 633 * Number of bytes already buffered into mBufferedLocalFileData. 634 * Never counts above LOCAL_FILE_UTF_8_BUFFER_SIZE. 635 */ 636 uint32_t mLocalFileBytesBuffered; 637 638 nsTArray<mozilla::Buffer<uint8_t>> mBufferedLocalFileData; 639 640 /** 641 * True to terminate early; protected by mTerminatedMutex 642 */ 643 bool mTerminated; 644 bool mInterrupted; 645 mozilla::Mutex mTerminatedMutex; 646 647 /** 648 * The thread this stream parser runs on. 649 */ 650 nsCOMPtr<nsISerialEventTarget> mEventTarget; 651 652 nsCOMPtr<nsIRunnable> mExecutorFlusher; 653 654 nsCOMPtr<nsIRunnable> mLoadFlusher; 655 656 /** 657 * The generict detector. 658 */ 659 mozilla::UniquePtr<mozilla::EncodingDetector> mDetector; 660 661 /** 662 * The TLD we're loading from or empty if unknown. 663 */ 664 nsCString mTLD; 665 666 /** 667 * Whether the initial charset source was kCharsetFromParentFrame 668 */ 669 bool mInitialEncodingWasFromParentFrame; 670 671 bool mHasHadErrors; 672 673 bool mDetectorHasSeenNonAscii; 674 675 bool mDetectorHadOnlySeenAsciiWhenFirstGuessing; 676 677 /** 678 * If true, we are decoding a local file that lacks an encoding 679 * declaration and we are not tokenizing yet. 680 */ 681 bool mDecodingLocalFileWithoutTokenizing; 682 683 /** 684 * Timer for flushing tree ops once in a while when not speculating. 685 */ 686 nsCOMPtr<nsITimer> mFlushTimer; 687 688 /** 689 * Mutex for protecting access to mFlushTimer (but not for the two 690 * mFlushTimerFoo booleans below). 691 */ 692 mozilla::Mutex mFlushTimerMutex; 693 694 /** 695 * Keeps track whether mFlushTimer has been armed. Unfortunately, 696 * nsITimer doesn't enable querying this from the timer itself. 697 */ 698 bool mFlushTimerArmed; 699 700 /** 701 * False initially and true after the timer has fired at least once. 702 */ 703 bool mFlushTimerEverFired; 704 705 /** 706 * Whether the parser is doing a normal parse, view source or plain text. 707 */ 708 eParserMode mMode; 709 710 /** 711 * If the associated docshell is being watched by the devtools, this is 712 * set to the URI associated with the parse. All parse data is sent to the 713 * devtools, along with this URI. This URI is cleared out after the parse has 714 * been marked as completed. 715 */ 716 nsCOMPtr<nsIURI> mURIToSendToDevtools; 717 718 /** 719 * If content is being sent to the devtools, an encoded UUID for the parser. 720 */ 721 nsString mUUIDForDevtools; 722 }; 723 724 #endif // nsHtml5StreamParser_h 725