1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* This Source Code Form is subject to the terms of the Mozilla Public
3  * License, v. 2.0. If a copy of the MPL was not distributed with this
4  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
5 
6 #ifndef nsHtml5StreamParser_h
7 #define nsHtml5StreamParser_h
8 
9 #include "MainThreadUtils.h"
10 #include "mozilla/AlreadyAddRefed.h"
11 #include "mozilla/Assertions.h"
12 #include "mozilla/Encoding.h"
13 #include "mozilla/Mutex.h"
14 #include "mozilla/NotNull.h"
15 #include "mozilla/RefPtr.h"
16 #include "mozilla/Span.h"
17 #include "mozilla/UniquePtr.h"
18 #include "nsCharsetSource.h"
19 #include "nsCOMPtr.h"
20 #include "nsCycleCollectionParticipant.h"
21 #include "nsDebug.h"
22 #include "nsHtml5AtomTable.h"
23 #include "nsIRequestObserver.h"
24 #include "nsISerialEventTarget.h"
25 #include "nsISupports.h"
26 #include "nsStringFwd.h"
27 #include "nsTArray.h"
28 #include "nscore.h"
29 
30 class nsCycleCollectionTraversalCallback;
31 class nsHtml5MetaScanner;
32 class nsHtml5OwningUTF16Buffer;
33 class nsHtml5Parser;
34 class nsHtml5Speculation;
35 class nsHtml5String;
36 class nsHtml5Tokenizer;
37 class nsHtml5TreeBuilder;
38 class nsHtml5TreeOpExecutor;
39 class nsIChannel;
40 class nsIInputStream;
41 class nsIRequest;
42 class nsIRunnable;
43 class nsITimer;
44 class nsIURI;
45 
46 namespace mozilla {
47 class EncodingDetector;
48 template <typename T>
49 class Buffer;
50 
51 namespace dom {
52 class DocGroup;
53 }
54 }  // namespace mozilla
55 
/**
 * The kind of parse a stream parser performs; passed to the
 * nsHtml5StreamParser constructor. Enumerator values are consecutive,
 * starting at zero.
 */
enum eParserMode {
  /** Parse a document normally as HTML. */
  NORMAL = 0,

  /** View document as HTML source. */
  VIEW_SOURCE_HTML = 1,

  /** View document as XML source. */
  VIEW_SOURCE_XML = 2,

  /** View document as plain text source. */
  VIEW_SOURCE_PLAIN = 3,

  /** View document as plain text. */
  PLAIN_TEXT = 4,

  /** Load as data (XHR). */
  LOAD_AS_DATA = 5
};
87 
/**
 * States of the byte-by-byte sniffer that looks for a byte order mark or for
 * the start of a BOMless UTF-16 "bogo-XML" declaration ("<?x" encoded as
 * UTF-16BE or UTF-16LE) at the beginning of the stream.
 */
enum eBomState {
  /**
   * BOM sniffing hasn't started.
   */
  BOM_SNIFFING_NOT_STARTED,

  /**
   * BOM sniffing is ongoing, and the first byte of a UTF-16LE BOM has been
   * seen.
   */
  SEEN_UTF_16_LE_FIRST_BYTE,

  /**
   * BOM sniffing is ongoing, and the first byte of a UTF-16BE BOM has been
   * seen.
   */
  SEEN_UTF_16_BE_FIRST_BYTE,

  /**
   * BOM sniffing is ongoing, and the first byte of a UTF-8 BOM has been
   * seen.
   */
  SEEN_UTF_8_FIRST_BYTE,

  /**
   * BOM sniffing is ongoing, and the first and second bytes of a UTF-8 BOM
   * have been seen.
   */
  SEEN_UTF_8_SECOND_BYTE,

  /**
   * Seen \x00 in UTF-16BE bogo-XML declaration.
   */
  SEEN_UTF_16_BE_XML_FIRST,

  /**
   * Seen \x00< in UTF-16BE bogo-XML declaration.
   */
  SEEN_UTF_16_BE_XML_SECOND,

  /**
   * Seen \x00<\x00 in UTF-16BE bogo-XML declaration.
   */
  SEEN_UTF_16_BE_XML_THIRD,

  /**
   * Seen \x00<\x00? in UTF-16BE bogo-XML declaration.
   */
  SEEN_UTF_16_BE_XML_FOURTH,

  /**
   * Seen \x00<\x00?\x00 in UTF-16BE bogo-XML declaration.
   */
  SEEN_UTF_16_BE_XML_FIFTH,

  /**
   * Seen < in UTF-16LE bogo-XML declaration.
   */
  SEEN_UTF_16_LE_XML_FIRST,

  /**
   * Seen <\x00 in UTF-16LE bogo-XML declaration.
   */
  SEEN_UTF_16_LE_XML_SECOND,

  /**
   * Seen <\x00? in UTF-16LE bogo-XML declaration.
   */
  SEEN_UTF_16_LE_XML_THIRD,

  /**
   * Seen <\x00?\x00 in UTF-16LE bogo-XML declaration.
   */
  SEEN_UTF_16_LE_XML_FOURTH,

  /**
   * Seen <\x00?\x00x in UTF-16LE bogo-XML declaration.
   */
  SEEN_UTF_16_LE_XML_FIFTH,

  /**
   * BOM sniffing was started but is now over for whatever reason.
   */
  BOM_SNIFFING_OVER,
};
173 
/**
 * Life-cycle marker for the incoming stream (the type of
 * nsHtml5StreamParser::mStreamState). The enumerators are numbered
 * implicitly: 0, 1 and 2 in declaration order.
 */
enum eHtml5StreamState {
  STREAM_NOT_STARTED,
  STREAM_BEING_READ,
  STREAM_ENDED
};
179 
180 class nsHtml5StreamParser final : public nsISupports {
181   template <typename T>
182   using NotNull = mozilla::NotNull<T>;
183   using Encoding = mozilla::Encoding;
184 
185   const uint32_t SNIFFING_BUFFER_SIZE = 1024;
186   const uint32_t READ_BUFFER_SIZE = 1024;
187   const uint32_t LOCAL_FILE_UTF_8_BUFFER_SIZE = 1024 * 1024 * 4;  // 4 MB
188 
189   friend class nsHtml5RequestStopper;
190   friend class nsHtml5DataAvailable;
191   friend class nsHtml5StreamParserContinuation;
192   friend class nsHtml5TimerKungFu;
193   friend class nsHtml5StreamParserPtr;
194   friend class nsHtml5StreamListener;
195 
196  public:
197   NS_DECL_CYCLE_COLLECTING_ISUPPORTS
198   NS_DECL_CYCLE_COLLECTION_CLASS(nsHtml5StreamParser)
199 
200   nsHtml5StreamParser(nsHtml5TreeOpExecutor* aExecutor, nsHtml5Parser* aOwner,
201                       eParserMode aMode);
202 
203   // Methods that nsHtml5StreamListener calls
204   nsresult CheckListenerChain();
205 
206   nsresult OnStartRequest(nsIRequest* aRequest);
207 
208   nsresult OnDataAvailable(nsIRequest* aRequest, nsIInputStream* aInStream,
209                            uint64_t aSourceOffset, uint32_t aLength);
210 
211   nsresult OnStopRequest(nsIRequest* aRequest, nsresult status);
212 
213   // EncodingDeclarationHandler
214   // https://hg.mozilla.org/projects/htmlparser/file/tip/src/nu/validator/htmlparser/common/EncodingDeclarationHandler.java
215   /**
216    * Tree builder uses this to report a late <meta charset>
217    */
218   bool internalEncodingDeclaration(nsHtml5String aEncoding);
219 
220   // Not from an external interface
221 
222   /**
223    * Pass a buffer to the Japanese or Cyrillic detector as appropriate.
224    */
225   void FeedDetector(mozilla::Span<const uint8_t> aBuffer, bool aLast);
226 
227   /**
228    *  Call this method once you've created a parser, and want to instruct it
229    *  about what charset to load
230    *
231    *  @param   aEncoding the charset of a document
232    *  @param   aCharsetSource the source of the charset
233    */
SetDocumentCharset(NotNull<const Encoding * > aEncoding,int32_t aSource,bool aChannelHadCharset)234   inline void SetDocumentCharset(NotNull<const Encoding*> aEncoding,
235                                  int32_t aSource, bool aChannelHadCharset) {
236     MOZ_ASSERT(mStreamState == STREAM_NOT_STARTED,
237                "SetDocumentCharset called too late.");
238     NS_ASSERTION(NS_IsMainThread(), "Wrong thread!");
239     MOZ_ASSERT(!(aSource == kCharsetFromChannel && !aChannelHadCharset),
240                "If charset is from channel, channel must have had charset.");
241     mEncoding = aEncoding;
242     mCharsetSource = aSource;
243     mChannelHadCharset = aChannelHadCharset;
244   }
245 
SetObserver(nsIRequestObserver * aObserver)246   inline void SetObserver(nsIRequestObserver* aObserver) {
247     NS_ASSERTION(NS_IsMainThread(), "Wrong thread!");
248     mObserver = aObserver;
249   }
250 
251   nsresult GetChannel(nsIChannel** aChannel);
252 
253   /**
254    * The owner parser must call this after script execution
255    * when no scripts are executing and the document.written
256    * buffer has been exhausted.
257    */
258   void ContinueAfterScripts(nsHtml5Tokenizer* aTokenizer,
259                             nsHtml5TreeBuilder* aTreeBuilder, bool aLastWasCR);
260 
261   /**
262    * Continues the stream parser if the charset switch failed.
263    */
264   void ContinueAfterFailedCharsetSwitch();
265 
Terminate()266   void Terminate() {
267     mozilla::MutexAutoLock autoLock(mTerminatedMutex);
268     mTerminated = true;
269   }
270 
271   void DropTimer();
272 
273   /**
274    * Sets mEncoding and mCharsetSource appropriately for the XML View Source
275    * case if aEncoding names a supported rough ASCII superset and sets
276    * the mEncoding and mCharsetSource to the UTF-8 default otherwise.
277    */
278   void SetEncodingFromExpat(const char16_t* aEncoding);
279 
280   /**
281    * Sets the URL for View Source title in case this parser ends up being
282    * used for View Source. If aURL is a view-source: URL, takes the inner
283    * URL. data: URLs are shown with an ellipsis instead of the actual data.
284    */
285   void SetViewSourceTitle(nsIURI* aURL);
286 
287  private:
288   virtual ~nsHtml5StreamParser();
289 
290 #ifdef DEBUG
IsParserThread()291   bool IsParserThread() { return mEventTarget->IsOnCurrentThread(); }
292 #endif
293 
294   void MarkAsBroken(nsresult aRv);
295 
296   /**
297    * Marks the stream parser as interrupted. If you ever add calls to this
298    * method, be sure to review Uninterrupt usage very, very carefully to
299    * avoid having a previous in-flight runnable cancel your Interrupt()
300    * call on the other thread too soon.
301    */
Interrupt()302   void Interrupt() {
303     mozilla::MutexAutoLock autoLock(mTerminatedMutex);
304     mInterrupted = true;
305   }
306 
Uninterrupt()307   void Uninterrupt() {
308     NS_ASSERTION(IsParserThread(), "Wrong thread!");
309     mTokenizerMutex.AssertCurrentThreadOwns();
310     // Not acquiring mTerminatedMutex because mTokenizerMutex is already
311     // held at this point and is already stronger.
312     mInterrupted = false;
313   }
314 
315   /**
316    * Flushes the tree ops from the tree builder and disarms the flush
317    * timer.
318    */
319   void FlushTreeOpsAndDisarmTimer();
320 
321   void ParseAvailableData();
322 
323   void DoStopRequest();
324 
325   void DoDataAvailableBuffer(mozilla::Buffer<uint8_t>&& aBuffer);
326 
327   void DoDataAvailable(mozilla::Span<const uint8_t> aBuffer);
328 
329   static nsresult CopySegmentsToParser(nsIInputStream* aInStream,
330                                        void* aClosure, const char* aFromSegment,
331                                        uint32_t aToOffset, uint32_t aCount,
332                                        uint32_t* aWriteCount);
333 
IsTerminatedOrInterrupted()334   bool IsTerminatedOrInterrupted() {
335     mozilla::MutexAutoLock autoLock(mTerminatedMutex);
336     return mTerminated || mInterrupted;
337   }
338 
IsTerminated()339   bool IsTerminated() {
340     mozilla::MutexAutoLock autoLock(mTerminatedMutex);
341     return mTerminated;
342   }
343 
344   /**
345    * True when there is a Unicode decoder already
346    */
HasDecoder()347   inline bool HasDecoder() { return !!mUnicodeDecoder; }
348 
349   /**
350    * Push bytes from network when there is no Unicode decoder yet
351    */
352   nsresult SniffStreamBytes(mozilla::Span<const uint8_t> aFromSegment);
353 
354   /**
355    * Push bytes from network when there is a Unicode decoder already
356    */
357   nsresult WriteStreamBytes(mozilla::Span<const uint8_t> aFromSegment);
358 
359   /**
360    * Check whether every other byte in the sniffing buffer is zero.
361    */
362   void SniffBOMlessUTF16BasicLatin(const uint8_t* aBuf, size_t aBufLen);
363 
364   /**
365    * Write the start of the stream to detector.
366    */
367   void FinalizeSniffingWithDetector(mozilla::Span<const uint8_t> aFromSegment,
368                                     uint32_t aCountToSniffingLimit, bool aEof);
369 
370   /**
371    * <meta charset> scan failed. Try chardet if applicable. After this, the
372    * the parser will have some encoding even if a last resolt fallback.
373    *
374    * @param aFromSegment The current network buffer
375    * @param aCountToSniffingLimit The number of unfilled slots in
376    *                              mSniffingBuffer
377    * @param aEof true iff called upon end of stream
378    */
379   nsresult FinalizeSniffing(mozilla::Span<const uint8_t> aFromSegment,
380                             uint32_t aCountToSniffingLimit, bool aEof);
381 
382   /**
383    * Set up the Unicode decoder and write the sniffing buffer into it
384    * followed by the current network buffer.
385    *
386    * @param aFromSegment The current network buffer
387    */
388   nsresult SetupDecodingAndWriteSniffingBufferAndCurrentSegment(
389       mozilla::Span<const uint8_t> aFromSegment);
390 
391   /**
392    * Initialize the Unicode decoder, mark the BOM as the source and
393    * drop the sniffer.
394    *
395    * @param aDecoderCharsetName The name for the decoder's charset
396    *                            (UTF-16BE, UTF-16LE or UTF-8; the BOM has
397    *                            been swallowed)
398    */
399   void SetupDecodingFromBom(NotNull<const Encoding*> aEncoding);
400 
401   void SetupDecodingFromUtf16BogoXml(NotNull<const Encoding*> aEncoding);
402 
403   /**
404    * When speculatively decoding from file: URL as UTF-8, commit
405    * to UTF-8 as the non-speculative encoding and start processing
406    * the decoded data.
407    */
408   void CommitLocalFileToEncoding();
409 
410   /**
411    * When speculatively decoding from file: URL as UTF-8, redecode
412    * using fallback and then continue normally with the fallback.
413    */
414   void ReDecodeLocalFile();
415 
416   /**
417    * Change a final autodetection source to the corresponding initial one.
418    */
419   int32_t MaybeRollBackSource(int32_t aSource);
420 
421   /**
422    * Potentially guess the encoding using mozilla::EncodingDetector.
423    */
424   void GuessEncoding(bool aEof, bool aInitial);
425 
DontGuessEncoding()426   inline void DontGuessEncoding() {
427     mFeedChardet = false;
428     mGuessEncoding = false;
429     if (mDecodingLocalFileWithoutTokenizing) {
430       CommitLocalFileToEncoding();
431     }
432   }
433 
434   /**
435    * Become confident or resolve and encoding name to its preferred form.
436    * @param aEncoding the value of an internal encoding decl. Acts as an
437    *                  out param, too, when the method returns true.
438    * @return true if the parser needs to start using the new value of
439    *         aEncoding and false if the parser became confident or if
440    *         the encoding name did not specify a usable encoding
441    */
442   const Encoding* PreferredForInternalEncodingDecl(const nsACString& aEncoding);
443 
444   /**
445    * Callback for mFlushTimer.
446    */
447   static void TimerCallback(nsITimer* aTimer, void* aClosure);
448 
449   /**
450    * Parser thread entry point for (maybe) flushing the ops and posting
451    * a flush runnable back on the main thread.
452    */
453   void TimerFlush();
454 
455   /**
456    * Called when speculation fails.
457    */
MaybeDisableFutureSpeculation()458   void MaybeDisableFutureSpeculation() { mSpeculationFailureCount++; }
459 
460   /**
461    * Used to check whether we're getting too many speculation failures and
462    * should just stop trying.  The 100 is picked pretty randomly to be not too
463    * small (so most pages are not affected) but small enough that we don't end
464    * up with failed speculations over and over in pathological cases.
465    */
IsSpeculationEnabled()466   bool IsSpeculationEnabled() { return mSpeculationFailureCount < 100; }
467 
468   /**
469    * Dispatch an event to a Quantum DOM main thread-ish thread.
470    * (Not the parser thread.)
471    */
472   nsresult DispatchToMain(already_AddRefed<nsIRunnable>&& aRunnable);
473 
474   /**
475    * Notify any devtools listeners about content newly received for parsing.
476    */
477   inline void OnNewContent(mozilla::Span<const char16_t> aData);
478 
479   /**
480    * Notify any devtools listeners after all parse content has been received.
481    */
482   inline void OnContentComplete();
483 
484   nsCOMPtr<nsIRequest> mRequest;
485   nsCOMPtr<nsIRequestObserver> mObserver;
486 
487   /**
488    * The document title to use if this turns out to be a View Source parser.
489    */
490   nsCString mViewSourceTitle;
491 
492   /**
493    * The Unicode decoder
494    */
495   mozilla::UniquePtr<mozilla::Decoder> mUnicodeDecoder;
496 
497   /**
498    * The buffer for sniffing the character encoding
499    */
500   mozilla::UniquePtr<uint8_t[]> mSniffingBuffer;
501 
502   /**
503    * The number of meaningful bytes in mSniffingBuffer
504    */
505   uint32_t mSniffingLength;
506 
507   /**
508    * BOM sniffing state
509    */
510   eBomState mBomState;
511 
512   /**
513    * <meta> prescan implementation
514    */
515   mozilla::UniquePtr<nsHtml5MetaScanner> mMetaScanner;
516 
517   // encoding-related stuff
518   /**
519    * The source (confidence) of the character encoding in use
520    */
521   int32_t mCharsetSource;
522 
523   /**
524    * The character encoding in use
525    */
526   NotNull<const Encoding*> mEncoding;
527 
528   /**
529    * Whether the generic or Japanese detector should still be fed.
530    */
531   bool mFeedChardet;
532 
533   /**
534    * Whether the generic detector should be still queried for its guess.
535    */
536   bool mGuessEncoding;
537 
538   /**
539    * Whether reparse is forbidden
540    */
541   bool mReparseForbidden;
542 
543   /**
544    * Whether the channel had charset.
545    */
546   bool mChannelHadCharset;
547 
548   // Portable parser objects
549   /**
550    * The first buffer in the pending UTF-16 buffer queue
551    */
552   RefPtr<nsHtml5OwningUTF16Buffer> mFirstBuffer;
553 
554   /**
555    * The last buffer in the pending UTF-16 buffer queue
556    */
557   nsHtml5OwningUTF16Buffer*
558       mLastBuffer;  // weak ref; always points to
559                     // a buffer of the size
560                     // NS_HTML5_STREAM_PARSER_READ_BUFFER_SIZE
561 
562   /**
563    * The tree operation executor
564    */
565   nsHtml5TreeOpExecutor* mExecutor;
566 
567   /**
568    * Network event target for mExecutor->mDocument
569    */
570   nsCOMPtr<nsISerialEventTarget> mNetworkEventTarget;
571 
572   /**
573    * The HTML5 tree builder
574    */
575   mozilla::UniquePtr<nsHtml5TreeBuilder> mTreeBuilder;
576 
577   /**
578    * The HTML5 tokenizer
579    */
580   mozilla::UniquePtr<nsHtml5Tokenizer> mTokenizer;
581 
582   /**
583    * Makes sure the main thread can't mess the tokenizer state while it's
584    * tokenizing. This mutex also protects the current speculation.
585    */
586   mozilla::Mutex mTokenizerMutex;
587 
588   /**
589    * The scoped atom table
590    */
591   nsHtml5AtomTable mAtomTable;
592 
593   /**
594    * The owner parser.
595    */
596   RefPtr<nsHtml5Parser> mOwner;
597 
598   /**
599    * Whether the last character tokenized was a carriage return (for CRLF)
600    */
601   bool mLastWasCR;
602 
603   /**
604    * For tracking stream life cycle
605    */
606   eHtml5StreamState mStreamState;
607 
608   /**
609    * Whether we are speculating.
610    */
611   bool mSpeculating;
612 
613   /**
614    * Whether the tokenizer has reached EOF. (Reset when stream rewinded.)
615    */
616   bool mAtEOF;
617 
618   /**
619    * The speculations. The mutex protects the nsTArray itself.
620    * To access the queue of current speculation, mTokenizerMutex must be
621    * obtained.
622    * The current speculation is the last element
623    */
624   nsTArray<mozilla::UniquePtr<nsHtml5Speculation>> mSpeculations;
625   mozilla::Mutex mSpeculationMutex;
626 
627   /**
628    * Number of times speculation has failed for this parser.
629    */
630   uint32_t mSpeculationFailureCount;
631 
632   /**
633    * Number of bytes already buffered into mBufferedLocalFileData.
634    * Never counts above LOCAL_FILE_UTF_8_BUFFER_SIZE.
635    */
636   uint32_t mLocalFileBytesBuffered;
637 
638   nsTArray<mozilla::Buffer<uint8_t>> mBufferedLocalFileData;
639 
640   /**
641    * True to terminate early; protected by mTerminatedMutex
642    */
643   bool mTerminated;
644   bool mInterrupted;
645   mozilla::Mutex mTerminatedMutex;
646 
647   /**
648    * The thread this stream parser runs on.
649    */
650   nsCOMPtr<nsISerialEventTarget> mEventTarget;
651 
652   nsCOMPtr<nsIRunnable> mExecutorFlusher;
653 
654   nsCOMPtr<nsIRunnable> mLoadFlusher;
655 
656   /**
657    * The generict detector.
658    */
659   mozilla::UniquePtr<mozilla::EncodingDetector> mDetector;
660 
661   /**
662    * The TLD we're loading from or empty if unknown.
663    */
664   nsCString mTLD;
665 
666   /**
667    * Whether the initial charset source was kCharsetFromParentFrame
668    */
669   bool mInitialEncodingWasFromParentFrame;
670 
671   bool mHasHadErrors;
672 
673   bool mDetectorHasSeenNonAscii;
674 
675   bool mDetectorHadOnlySeenAsciiWhenFirstGuessing;
676 
677   /**
678    * If true, we are decoding a local file that lacks an encoding
679    * declaration and we are not tokenizing yet.
680    */
681   bool mDecodingLocalFileWithoutTokenizing;
682 
683   /**
684    * Timer for flushing tree ops once in a while when not speculating.
685    */
686   nsCOMPtr<nsITimer> mFlushTimer;
687 
688   /**
689    * Mutex for protecting access to mFlushTimer (but not for the two
690    * mFlushTimerFoo booleans below).
691    */
692   mozilla::Mutex mFlushTimerMutex;
693 
694   /**
695    * Keeps track whether mFlushTimer has been armed. Unfortunately,
696    * nsITimer doesn't enable querying this from the timer itself.
697    */
698   bool mFlushTimerArmed;
699 
700   /**
701    * False initially and true after the timer has fired at least once.
702    */
703   bool mFlushTimerEverFired;
704 
705   /**
706    * Whether the parser is doing a normal parse, view source or plain text.
707    */
708   eParserMode mMode;
709 
710   /**
711    * If the associated docshell is being watched by the devtools, this is
712    * set to the URI associated with the parse. All parse data is sent to the
713    * devtools, along with this URI. This URI is cleared out after the parse has
714    * been marked as completed.
715    */
716   nsCOMPtr<nsIURI> mURIToSendToDevtools;
717 
718   /**
719    * If content is being sent to the devtools, an encoded UUID for the parser.
720    */
721   nsString mUUIDForDevtools;
722 };
723 
724 #endif  // nsHtml5StreamParser_h
725