1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ 2 /* This Source Code Form is subject to the terms of the Mozilla Public 3 * License, v. 2.0. If a copy of the MPL was not distributed with this 4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 5 6 #ifndef nsHtml5StreamParser_h 7 #define nsHtml5StreamParser_h 8 9 #include "nsAutoPtr.h" 10 #include "nsCOMPtr.h" 11 #include "nsICharsetDetectionObserver.h" 12 #include "nsHtml5MetaScanner.h" 13 #include "nsIUnicodeDecoder.h" 14 #include "nsHtml5TreeOpExecutor.h" 15 #include "nsHtml5OwningUTF16Buffer.h" 16 #include "nsIInputStream.h" 17 #include "mozilla/Mutex.h" 18 #include "mozilla/UniquePtr.h" 19 #include "nsHtml5AtomTable.h" 20 #include "nsHtml5Speculation.h" 21 #include "nsITimer.h" 22 #include "nsICharsetDetector.h" 23 24 class nsHtml5Parser; 25 26 #define NS_HTML5_STREAM_PARSER_READ_BUFFER_SIZE 1024 27 #define NS_HTML5_STREAM_PARSER_SNIFFING_BUFFER_SIZE 1024 28 29 enum eParserMode { 30 /** 31 * Parse a document normally as HTML. 32 */ 33 NORMAL, 34 35 /** 36 * View document as HTML source. 37 */ 38 VIEW_SOURCE_HTML, 39 40 /** 41 * View document as XML source 42 */ 43 VIEW_SOURCE_XML, 44 45 /** 46 * View document as plain text source 47 */ 48 VIEW_SOURCE_PLAIN, 49 50 /** 51 * View document as plain text 52 */ 53 PLAIN_TEXT, 54 55 /** 56 * Load as data (XHR) 57 */ 58 LOAD_AS_DATA 59 }; 60 61 enum eBomState { 62 /** 63 * BOM sniffing hasn't started. 64 */ 65 BOM_SNIFFING_NOT_STARTED = 0, 66 67 /** 68 * BOM sniffing is ongoing, and the first byte of an UTF-16LE BOM has been 69 * seen. 70 */ 71 SEEN_UTF_16_LE_FIRST_BYTE = 1, 72 73 /** 74 * BOM sniffing is ongoing, and the first byte of an UTF-16BE BOM has been 75 * seen. 76 */ 77 SEEN_UTF_16_BE_FIRST_BYTE = 2, 78 79 /** 80 * BOM sniffing is ongoing, and the first byte of an UTF-8 BOM has been 81 * seen. 82 */ 83 SEEN_UTF_8_FIRST_BYTE = 3, 84 85 /** 86 * BOM sniffing is ongoing, and the first and second bytes of an UTF-8 BOM 87 * have been seen. 88 */ 89 SEEN_UTF_8_SECOND_BYTE = 4, 90 91 /** 92 * BOM sniffing was started but is now over for whatever reason. 93 */ 94 BOM_SNIFFING_OVER = 5 95 }; 96 97 enum eHtml5StreamState { 98 STREAM_NOT_STARTED = 0, 99 STREAM_BEING_READ = 1, 100 STREAM_ENDED = 2 101 }; 102 103 class nsHtml5StreamParser : public nsICharsetDetectionObserver { 104 105 friend class nsHtml5RequestStopper; 106 friend class nsHtml5DataAvailable; 107 friend class nsHtml5StreamParserContinuation; 108 friend class nsHtml5TimerKungFu; 109 110 public: 111 NS_DECL_AND_IMPL_ZEROING_OPERATOR_NEW 112 NS_DECL_CYCLE_COLLECTING_ISUPPORTS 113 NS_DECL_CYCLE_COLLECTION_CLASS_AMBIGUOUS(nsHtml5StreamParser, 114 nsICharsetDetectionObserver) 115 116 static void InitializeStatics(); 117 118 nsHtml5StreamParser(nsHtml5TreeOpExecutor* aExecutor, 119 nsHtml5Parser* aOwner, 120 eParserMode aMode); 121 122 // Methods that nsHtml5StreamListener calls 123 nsresult CheckListenerChain(); 124 125 nsresult OnStartRequest(nsIRequest* aRequest, nsISupports* aContext); 126 127 nsresult OnDataAvailable(nsIRequest* aRequest, 128 nsISupports* aContext, 129 nsIInputStream* aInStream, 130 uint64_t aSourceOffset, 131 uint32_t aLength); 132 133 nsresult OnStopRequest(nsIRequest* aRequest, 134 nsISupports* aContext, 135 nsresult status); 136 137 // nsICharsetDetectionObserver 138 /** 139 * Chardet calls this to report the detection result 140 */ 141 NS_IMETHOD Notify(const char* aCharset, nsDetectionConfident aConf) override; 142 143 // EncodingDeclarationHandler 144 // http://hg.mozilla.org/projects/htmlparser/file/tip/src/nu/validator/htmlparser/common/EncodingDeclarationHandler.java 145 /** 146 * Tree builder uses this to report a late <meta charset> 147 */ 148 bool internalEncodingDeclaration(nsString* aEncoding); 149 150 // Not from an external interface 151 152 /** 153 * Call this method once you've created a parser, and want to instruct it 154 * about what charset to load 155 * 156 * @param aCharset the charset of a document 157 * @param aCharsetSource the source of the charset 158 */ SetDocumentCharset(const nsACString & aCharset,int32_t aSource)159 inline void SetDocumentCharset(const nsACString& aCharset, int32_t aSource) { 160 NS_PRECONDITION(mStreamState == STREAM_NOT_STARTED, 161 "SetDocumentCharset called too late."); 162 NS_ASSERTION(NS_IsMainThread(), "Wrong thread!"); 163 mCharset = aCharset; 164 mCharsetSource = aSource; 165 } 166 SetObserver(nsIRequestObserver * aObserver)167 inline void SetObserver(nsIRequestObserver* aObserver) { 168 NS_ASSERTION(NS_IsMainThread(), "Wrong thread!"); 169 mObserver = aObserver; 170 } 171 172 nsresult GetChannel(nsIChannel** aChannel); 173 174 /** 175 * The owner parser must call this after script execution 176 * when no scripts are executing and the document.written 177 * buffer has been exhausted. 178 */ 179 void ContinueAfterScripts(nsHtml5Tokenizer* aTokenizer, 180 nsHtml5TreeBuilder* aTreeBuilder, 181 bool aLastWasCR); 182 183 /** 184 * Continues the stream parser if the charset switch failed. 185 */ 186 void ContinueAfterFailedCharsetSwitch(); 187 Terminate()188 void Terminate() 189 { 190 mozilla::MutexAutoLock autoLock(mTerminatedMutex); 191 mTerminated = true; 192 } 193 194 void DropTimer(); 195 196 /** 197 * Sets mCharset and mCharsetSource appropriately for the XML View Source 198 * case if aEncoding names a supported rough ASCII superset and sets 199 * the mCharset and mCharsetSource to the UTF-8 default otherwise. 200 */ 201 void SetEncodingFromExpat(const char16_t* aEncoding); 202 203 /** 204 * Sets the URL for View Source title in case this parser ends up being 205 * used for View Source. If aURL is a view-source: URL, takes the inner 206 * URL. data: URLs are shown with an ellipsis instead of the actual data. 207 */ 208 void SetViewSourceTitle(nsIURI* aURL); 209 210 private: 211 virtual ~nsHtml5StreamParser(); 212 213 #ifdef DEBUG IsParserThread()214 bool IsParserThread() { 215 bool ret; 216 mThread->IsOnCurrentThread(&ret); 217 return ret; 218 } 219 #endif 220 221 void MarkAsBroken(nsresult aRv); 222 223 /** 224 * Marks the stream parser as interrupted. If you ever add calls to this 225 * method, be sure to review Uninterrupt usage very, very carefully to 226 * avoid having a previous in-flight runnable cancel your Interrupt() 227 * call on the other thread too soon. 228 */ Interrupt()229 void Interrupt() 230 { 231 mozilla::MutexAutoLock autoLock(mTerminatedMutex); 232 mInterrupted = true; 233 } 234 Uninterrupt()235 void Uninterrupt() 236 { 237 NS_ASSERTION(IsParserThread(), "Wrong thread!"); 238 mTokenizerMutex.AssertCurrentThreadOwns(); 239 // Not acquiring mTerminatedMutex because mTokenizerMutex is already 240 // held at this point and is already stronger. 241 mInterrupted = false; 242 } 243 244 /** 245 * Flushes the tree ops from the tree builder and disarms the flush 246 * timer. 247 */ 248 void FlushTreeOpsAndDisarmTimer(); 249 250 void ParseAvailableData(); 251 252 void DoStopRequest(); 253 254 void DoDataAvailable(const uint8_t* aBuffer, uint32_t aLength); 255 256 static nsresult CopySegmentsToParser(nsIInputStream *aInStream, 257 void *aClosure, 258 const char *aFromSegment, 259 uint32_t aToOffset, 260 uint32_t aCount, 261 uint32_t *aWriteCount); 262 IsTerminatedOrInterrupted()263 bool IsTerminatedOrInterrupted() 264 { 265 mozilla::MutexAutoLock autoLock(mTerminatedMutex); 266 return mTerminated || mInterrupted; 267 } 268 IsTerminated()269 bool IsTerminated() 270 { 271 mozilla::MutexAutoLock autoLock(mTerminatedMutex); 272 return mTerminated; 273 } 274 275 /** 276 * True when there is a Unicode decoder already 277 */ HasDecoder()278 inline bool HasDecoder() 279 { 280 return !!mUnicodeDecoder; 281 } 282 283 /** 284 * Push bytes from network when there is no Unicode decoder yet 285 */ 286 nsresult SniffStreamBytes(const uint8_t* aFromSegment, 287 uint32_t aCount, 288 uint32_t* aWriteCount); 289 290 /** 291 * Push bytes from network when there is a Unicode decoder already 292 */ 293 nsresult WriteStreamBytes(const uint8_t* aFromSegment, 294 uint32_t aCount, 295 uint32_t* aWriteCount); 296 297 /** 298 * Check whether every other byte in the sniffing buffer is zero. 299 */ 300 void SniffBOMlessUTF16BasicLatin(const uint8_t* aFromSegment, 301 uint32_t aCountToSniffingLimit); 302 303 /** 304 * <meta charset> scan failed. Try chardet if applicable. After this, the 305 * the parser will have some encoding even if a last resolt fallback. 306 * 307 * @param aFromSegment The current network buffer or null if the sniffing 308 * buffer is being flushed due to network stream ending. 309 * @param aCount The number of bytes in aFromSegment (ignored if 310 * aFromSegment is null) 311 * @param aWriteCount Return value for how many bytes got read from the 312 * buffer. 313 * @param aCountToSniffingLimit The number of unfilled slots in 314 * mSniffingBuffer 315 */ 316 nsresult FinalizeSniffing(const uint8_t* aFromSegment, 317 uint32_t aCount, 318 uint32_t* aWriteCount, 319 uint32_t aCountToSniffingLimit); 320 321 /** 322 * Set up the Unicode decoder and write the sniffing buffer into it 323 * followed by the current network buffer. 324 * 325 * @param aFromSegment The current network buffer or null if the sniffing 326 * buffer is being flushed due to network stream ending. 327 * @param aCount The number of bytes in aFromSegment (ignored if 328 * aFromSegment is null) 329 * @param aWriteCount Return value for how many bytes got read from the 330 * buffer. 331 */ 332 nsresult SetupDecodingAndWriteSniffingBufferAndCurrentSegment(const uint8_t* aFromSegment, 333 uint32_t aCount, 334 uint32_t* aWriteCount); 335 336 /** 337 * Initialize the Unicode decoder, mark the BOM as the source and 338 * drop the sniffer. 339 * 340 * @param aDecoderCharsetName The name for the decoder's charset 341 * (UTF-16BE, UTF-16LE or UTF-8; the BOM has 342 * been swallowed) 343 */ 344 nsresult SetupDecodingFromBom(const char* aDecoderCharsetName); 345 346 /** 347 * Become confident or resolve and encoding name to its preferred form. 348 * @param aEncoding the value of an internal encoding decl. Acts as an 349 * out param, too, when the method returns true. 350 * @return true if the parser needs to start using the new value of 351 * aEncoding and false if the parser became confident or if 352 * the encoding name did not specify a usable encoding 353 */ 354 bool PreferredForInternalEncodingDecl(nsACString& aEncoding); 355 356 /** 357 * Callback for mFlushTimer. 358 */ 359 static void TimerCallback(nsITimer* aTimer, void* aClosure); 360 361 /** 362 * Parser thread entry point for (maybe) flushing the ops and posting 363 * a flush runnable back on the main thread. 364 */ 365 void TimerFlush(); 366 367 /** 368 * Called when speculation fails. 369 */ MaybeDisableFutureSpeculation()370 void MaybeDisableFutureSpeculation() 371 { 372 mSpeculationFailureCount++; 373 } 374 375 /** 376 * Used to check whether we're getting too many speculation failures and 377 * should just stop trying. The 100 is picked pretty randomly to be not too 378 * small (so most pages are not affected) but small enough that we don't end 379 * up with failed speculations over and over in pathological cases. 380 */ IsSpeculationEnabled()381 bool IsSpeculationEnabled() 382 { 383 return mSpeculationFailureCount < 100; 384 } 385 386 nsCOMPtr<nsIRequest> mRequest; 387 nsCOMPtr<nsIRequestObserver> mObserver; 388 389 /** 390 * The document title to use if this turns out to be a View Source parser. 391 */ 392 nsCString mViewSourceTitle; 393 394 /** 395 * The Unicode decoder 396 */ 397 nsCOMPtr<nsIUnicodeDecoder> mUnicodeDecoder; 398 399 /** 400 * The buffer for sniffing the character encoding 401 */ 402 mozilla::UniquePtr<uint8_t[]> mSniffingBuffer; 403 404 /** 405 * The number of meaningful bytes in mSniffingBuffer 406 */ 407 uint32_t mSniffingLength; 408 409 /** 410 * BOM sniffing state 411 */ 412 eBomState mBomState; 413 414 /** 415 * <meta> prescan implementation 416 */ 417 nsAutoPtr<nsHtml5MetaScanner> mMetaScanner; 418 419 // encoding-related stuff 420 /** 421 * The source (confidence) of the character encoding in use 422 */ 423 int32_t mCharsetSource; 424 425 /** 426 * The character encoding in use 427 */ 428 nsCString mCharset; 429 430 /** 431 * Whether reparse is forbidden 432 */ 433 bool mReparseForbidden; 434 435 // Portable parser objects 436 /** 437 * The first buffer in the pending UTF-16 buffer queue 438 */ 439 RefPtr<nsHtml5OwningUTF16Buffer> mFirstBuffer; 440 441 /** 442 * The last buffer in the pending UTF-16 buffer queue 443 */ 444 nsHtml5OwningUTF16Buffer* mLastBuffer; // weak ref; always points to 445 // a buffer of the size NS_HTML5_STREAM_PARSER_READ_BUFFER_SIZE 446 447 /** 448 * The tree operation executor 449 */ 450 nsHtml5TreeOpExecutor* mExecutor; 451 452 /** 453 * The HTML5 tree builder 454 */ 455 nsAutoPtr<nsHtml5TreeBuilder> mTreeBuilder; 456 457 /** 458 * The HTML5 tokenizer 459 */ 460 nsAutoPtr<nsHtml5Tokenizer> mTokenizer; 461 462 /** 463 * Makes sure the main thread can't mess the tokenizer state while it's 464 * tokenizing. This mutex also protects the current speculation. 465 */ 466 mozilla::Mutex mTokenizerMutex; 467 468 /** 469 * The scoped atom table 470 */ 471 nsHtml5AtomTable mAtomTable; 472 473 /** 474 * The owner parser. 475 */ 476 RefPtr<nsHtml5Parser> mOwner; 477 478 /** 479 * Whether the last character tokenized was a carriage return (for CRLF) 480 */ 481 bool mLastWasCR; 482 483 /** 484 * For tracking stream life cycle 485 */ 486 eHtml5StreamState mStreamState; 487 488 /** 489 * Whether we are speculating. 490 */ 491 bool mSpeculating; 492 493 /** 494 * Whether the tokenizer has reached EOF. (Reset when stream rewinded.) 495 */ 496 bool mAtEOF; 497 498 /** 499 * The speculations. The mutex protects the nsTArray itself. 500 * To access the queue of current speculation, mTokenizerMutex must be 501 * obtained. 502 * The current speculation is the last element 503 */ 504 nsTArray<nsAutoPtr<nsHtml5Speculation> > mSpeculations; 505 mozilla::Mutex mSpeculationMutex; 506 507 /** 508 * Number of times speculation has failed for this parser. 509 */ 510 uint32_t mSpeculationFailureCount; 511 512 /** 513 * True to terminate early; protected by mTerminatedMutex 514 */ 515 bool mTerminated; 516 bool mInterrupted; 517 mozilla::Mutex mTerminatedMutex; 518 519 /** 520 * The thread this stream parser runs on. 521 */ 522 nsCOMPtr<nsIThread> mThread; 523 524 nsCOMPtr<nsIRunnable> mExecutorFlusher; 525 526 nsCOMPtr<nsIRunnable> mLoadFlusher; 527 528 /** 529 * The chardet instance if chardet is enabled. 530 */ 531 nsCOMPtr<nsICharsetDetector> mChardet; 532 533 /** 534 * If false, don't push data to chardet. 535 */ 536 bool mFeedChardet; 537 538 /** 539 * Whether the initial charset source was kCharsetFromParentFrame 540 */ 541 bool mInitialEncodingWasFromParentFrame; 542 543 /** 544 * Timer for flushing tree ops once in a while when not speculating. 545 */ 546 nsCOMPtr<nsITimer> mFlushTimer; 547 548 /** 549 * Keeps track whether mFlushTimer has been armed. Unfortunately, 550 * nsITimer doesn't enable querying this from the timer itself. 551 */ 552 bool mFlushTimerArmed; 553 554 /** 555 * False initially and true after the timer has fired at least once. 556 */ 557 bool mFlushTimerEverFired; 558 559 /** 560 * Whether the parser is doing a normal parse, view source or plain text. 561 */ 562 eParserMode mMode; 563 564 /** 565 * The pref html5.flushtimer.initialdelay: Time in milliseconds between 566 * the time a network buffer is seen and the timer firing when the 567 * timer hasn't fired previously in this parse. 568 */ 569 static int32_t sTimerInitialDelay; 570 571 /** 572 * The pref html5.flushtimer.subsequentdelay: Time in milliseconds between 573 * the time a network buffer is seen and the timer firing when the 574 * timer has already fired previously in this parse. 575 */ 576 static int32_t sTimerSubsequentDelay; 577 }; 578 579 #endif // nsHtml5StreamParser_h 580