/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
5 
#ifndef nsHtml5StreamParser_h
#define nsHtml5StreamParser_h

#include "nsAutoPtr.h"
#include "nsCOMPtr.h"
#include "nsICharsetDetectionObserver.h"
#include "nsHtml5MetaScanner.h"
#include "nsIUnicodeDecoder.h"
#include "nsHtml5TreeOpExecutor.h"
#include "nsHtml5OwningUTF16Buffer.h"
#include "nsIInputStream.h"
#include "mozilla/Mutex.h"
#include "mozilla/UniquePtr.h"
#include "nsHtml5AtomTable.h"
#include "nsHtml5Speculation.h"
#include "nsITimer.h"
#include "nsICharsetDetector.h"
23 
class nsHtml5Parser;

/**
 * Size of the UTF-16 buffers in the stream parser's pending buffer queue
 * (nsHtml5StreamParser::mLastBuffer always points to a buffer of this size).
 */
#define NS_HTML5_STREAM_PARSER_READ_BUFFER_SIZE 1024

/**
 * NOTE(review): presumably the capacity of the encoding sniffing buffer
 * (nsHtml5StreamParser::mSniffingBuffer); the allocation is not visible in
 * this header -- confirm against the implementation before changing it.
 */
#define NS_HTML5_STREAM_PARSER_SNIFFING_BUFFER_SIZE 1024
28 
/**
 * The kind of parse a stream parser performs. A value of this type is passed
 * to the nsHtml5StreamParser constructor and kept in its mMode member.
 */
enum eParserMode {
  /**
   * Parse a document normally as HTML.
   */
  NORMAL,

  /**
   * View document as HTML source.
   */
  VIEW_SOURCE_HTML,

  /**
   * View document as XML source.
   */
  VIEW_SOURCE_XML,

  /**
   * View document as plain text source.
   */
  VIEW_SOURCE_PLAIN,

  /**
   * View document as plain text.
   */
  PLAIN_TEXT,

  /**
   * Load as data (XHR).
   */
  LOAD_AS_DATA
};
60 
/**
 * Progress of byte order mark (BOM) sniffing, as tracked in
 * nsHtml5StreamParser::mBomState. The numeric values are spelled out
 * explicitly and must not be renumbered.
 */
enum eBomState {
  /**
   * BOM sniffing hasn't started.
   */
  BOM_SNIFFING_NOT_STARTED = 0,

  /**
   * BOM sniffing is ongoing, and the first byte of a UTF-16LE BOM has been
   * seen.
   */
  SEEN_UTF_16_LE_FIRST_BYTE = 1,

  /**
   * BOM sniffing is ongoing, and the first byte of a UTF-16BE BOM has been
   * seen.
   */
  SEEN_UTF_16_BE_FIRST_BYTE = 2,

  /**
   * BOM sniffing is ongoing, and the first byte of a UTF-8 BOM has been
   * seen.
   */
  SEEN_UTF_8_FIRST_BYTE = 3,

  /**
   * BOM sniffing is ongoing, and the first and second bytes of a UTF-8 BOM
   * have been seen.
   */
  SEEN_UTF_8_SECOND_BYTE = 4,

  /**
   * BOM sniffing was started but is now over for whatever reason.
   */
  BOM_SNIFFING_OVER = 5
};
96 
/**
 * Life-cycle stage of the stream, as tracked in
 * nsHtml5StreamParser::mStreamState. SetDocumentCharset() may only be called
 * while the state is still STREAM_NOT_STARTED.
 */
enum eHtml5StreamState {
  STREAM_NOT_STARTED = 0,
  STREAM_BEING_READ = 1,
  STREAM_ENDED = 2
};
102 
103 class nsHtml5StreamParser : public nsICharsetDetectionObserver {
104 
105   friend class nsHtml5RequestStopper;
106   friend class nsHtml5DataAvailable;
107   friend class nsHtml5StreamParserContinuation;
108   friend class nsHtml5TimerKungFu;
109 
110   public:
111     NS_DECL_AND_IMPL_ZEROING_OPERATOR_NEW
112     NS_DECL_CYCLE_COLLECTING_ISUPPORTS
113     NS_DECL_CYCLE_COLLECTION_CLASS_AMBIGUOUS(nsHtml5StreamParser,
114                                              nsICharsetDetectionObserver)
115 
116     static void InitializeStatics();
117 
118     nsHtml5StreamParser(nsHtml5TreeOpExecutor* aExecutor,
119                         nsHtml5Parser* aOwner,
120                         eParserMode aMode);
121 
122     // Methods that nsHtml5StreamListener calls
123     nsresult CheckListenerChain();
124 
125     nsresult OnStartRequest(nsIRequest* aRequest, nsISupports* aContext);
126 
127     nsresult OnDataAvailable(nsIRequest* aRequest,
128                              nsISupports* aContext,
129                              nsIInputStream* aInStream,
130                              uint64_t aSourceOffset,
131                              uint32_t aLength);
132 
133     nsresult OnStopRequest(nsIRequest* aRequest,
134                            nsISupports* aContext,
135                            nsresult status);
136 
137     // nsICharsetDetectionObserver
138     /**
139      * Chardet calls this to report the detection result
140      */
141     NS_IMETHOD Notify(const char* aCharset, nsDetectionConfident aConf) override;
142 
143     // EncodingDeclarationHandler
144     // http://hg.mozilla.org/projects/htmlparser/file/tip/src/nu/validator/htmlparser/common/EncodingDeclarationHandler.java
145     /**
146      * Tree builder uses this to report a late <meta charset>
147      */
148     bool internalEncodingDeclaration(nsString* aEncoding);
149 
150     // Not from an external interface
151 
152     /**
153      *  Call this method once you've created a parser, and want to instruct it
154      *  about what charset to load
155      *
156      *  @param   aCharset the charset of a document
157      *  @param   aCharsetSource the source of the charset
158      */
SetDocumentCharset(const nsACString & aCharset,int32_t aSource)159     inline void SetDocumentCharset(const nsACString& aCharset, int32_t aSource) {
160       NS_PRECONDITION(mStreamState == STREAM_NOT_STARTED,
161                       "SetDocumentCharset called too late.");
162       NS_ASSERTION(NS_IsMainThread(), "Wrong thread!");
163       mCharset = aCharset;
164       mCharsetSource = aSource;
165     }
166 
SetObserver(nsIRequestObserver * aObserver)167     inline void SetObserver(nsIRequestObserver* aObserver) {
168       NS_ASSERTION(NS_IsMainThread(), "Wrong thread!");
169       mObserver = aObserver;
170     }
171 
172     nsresult GetChannel(nsIChannel** aChannel);
173 
174     /**
175      * The owner parser must call this after script execution
176      * when no scripts are executing and the document.written
177      * buffer has been exhausted.
178      */
179     void ContinueAfterScripts(nsHtml5Tokenizer* aTokenizer,
180                               nsHtml5TreeBuilder* aTreeBuilder,
181                               bool aLastWasCR);
182 
183     /**
184      * Continues the stream parser if the charset switch failed.
185      */
186     void ContinueAfterFailedCharsetSwitch();
187 
Terminate()188     void Terminate()
189     {
190       mozilla::MutexAutoLock autoLock(mTerminatedMutex);
191       mTerminated = true;
192     }
193 
194     void DropTimer();
195 
196     /**
197      * Sets mCharset and mCharsetSource appropriately for the XML View Source
198      * case if aEncoding names a supported rough ASCII superset and sets
199      * the mCharset and mCharsetSource to the UTF-8 default otherwise.
200      */
201     void SetEncodingFromExpat(const char16_t* aEncoding);
202 
203     /**
204      * Sets the URL for View Source title in case this parser ends up being
205      * used for View Source. If aURL is a view-source: URL, takes the inner
206      * URL. data: URLs are shown with an ellipsis instead of the actual data.
207      */
208     void SetViewSourceTitle(nsIURI* aURL);
209 
210   private:
211     virtual ~nsHtml5StreamParser();
212 
213 #ifdef DEBUG
IsParserThread()214     bool IsParserThread() {
215       bool ret;
216       mThread->IsOnCurrentThread(&ret);
217       return ret;
218     }
219 #endif
220 
221     void MarkAsBroken(nsresult aRv);
222 
223     /**
224      * Marks the stream parser as interrupted. If you ever add calls to this
225      * method, be sure to review Uninterrupt usage very, very carefully to
226      * avoid having a previous in-flight runnable cancel your Interrupt()
227      * call on the other thread too soon.
228      */
Interrupt()229     void Interrupt()
230     {
231       mozilla::MutexAutoLock autoLock(mTerminatedMutex);
232       mInterrupted = true;
233     }
234 
Uninterrupt()235     void Uninterrupt()
236     {
237       NS_ASSERTION(IsParserThread(), "Wrong thread!");
238       mTokenizerMutex.AssertCurrentThreadOwns();
239       // Not acquiring mTerminatedMutex because mTokenizerMutex is already
240       // held at this point and is already stronger.
241       mInterrupted = false;
242     }
243 
244     /**
245      * Flushes the tree ops from the tree builder and disarms the flush
246      * timer.
247      */
248     void FlushTreeOpsAndDisarmTimer();
249 
250     void ParseAvailableData();
251 
252     void DoStopRequest();
253 
254     void DoDataAvailable(const uint8_t* aBuffer, uint32_t aLength);
255 
256     static nsresult CopySegmentsToParser(nsIInputStream *aInStream,
257                                          void *aClosure,
258                                          const char *aFromSegment,
259                                          uint32_t aToOffset,
260                                          uint32_t aCount,
261                                          uint32_t *aWriteCount);
262 
IsTerminatedOrInterrupted()263     bool IsTerminatedOrInterrupted()
264     {
265       mozilla::MutexAutoLock autoLock(mTerminatedMutex);
266       return mTerminated || mInterrupted;
267     }
268 
IsTerminated()269     bool IsTerminated()
270     {
271       mozilla::MutexAutoLock autoLock(mTerminatedMutex);
272       return mTerminated;
273     }
274 
275     /**
276      * True when there is a Unicode decoder already
277      */
HasDecoder()278     inline bool HasDecoder()
279     {
280       return !!mUnicodeDecoder;
281     }
282 
283     /**
284      * Push bytes from network when there is no Unicode decoder yet
285      */
286     nsresult SniffStreamBytes(const uint8_t* aFromSegment,
287                               uint32_t aCount,
288                               uint32_t* aWriteCount);
289 
290     /**
291      * Push bytes from network when there is a Unicode decoder already
292      */
293     nsresult WriteStreamBytes(const uint8_t* aFromSegment,
294                               uint32_t aCount,
295                               uint32_t* aWriteCount);
296 
297     /**
298      * Check whether every other byte in the sniffing buffer is zero.
299      */
300     void SniffBOMlessUTF16BasicLatin(const uint8_t* aFromSegment,
301                                      uint32_t aCountToSniffingLimit);
302 
303     /**
304      * <meta charset> scan failed. Try chardet if applicable. After this, the
305      * the parser will have some encoding even if a last resolt fallback.
306      *
307      * @param aFromSegment The current network buffer or null if the sniffing
308      *                     buffer is being flushed due to network stream ending.
309      * @param aCount       The number of bytes in aFromSegment (ignored if
310      *                     aFromSegment is null)
311      * @param aWriteCount  Return value for how many bytes got read from the
312      *                     buffer.
313      * @param aCountToSniffingLimit The number of unfilled slots in
314      *                              mSniffingBuffer
315      */
316     nsresult FinalizeSniffing(const uint8_t* aFromSegment,
317                               uint32_t aCount,
318                               uint32_t* aWriteCount,
319                               uint32_t aCountToSniffingLimit);
320 
321     /**
322      * Set up the Unicode decoder and write the sniffing buffer into it
323      * followed by the current network buffer.
324      *
325      * @param aFromSegment The current network buffer or null if the sniffing
326      *                     buffer is being flushed due to network stream ending.
327      * @param aCount       The number of bytes in aFromSegment (ignored if
328      *                     aFromSegment is null)
329      * @param aWriteCount  Return value for how many bytes got read from the
330      *                     buffer.
331      */
332     nsresult SetupDecodingAndWriteSniffingBufferAndCurrentSegment(const uint8_t* aFromSegment,
333                                                                   uint32_t aCount,
334                                                                   uint32_t* aWriteCount);
335 
336     /**
337      * Initialize the Unicode decoder, mark the BOM as the source and
338      * drop the sniffer.
339      *
340      * @param aDecoderCharsetName The name for the decoder's charset
341      *                            (UTF-16BE, UTF-16LE or UTF-8; the BOM has
342      *                            been swallowed)
343      */
344     nsresult SetupDecodingFromBom(const char* aDecoderCharsetName);
345 
346     /**
347      * Become confident or resolve and encoding name to its preferred form.
348      * @param aEncoding the value of an internal encoding decl. Acts as an
349      *                  out param, too, when the method returns true.
350      * @return true if the parser needs to start using the new value of
351      *         aEncoding and false if the parser became confident or if
352      *         the encoding name did not specify a usable encoding
353      */
354     bool PreferredForInternalEncodingDecl(nsACString& aEncoding);
355 
356     /**
357      * Callback for mFlushTimer.
358      */
359     static void TimerCallback(nsITimer* aTimer, void* aClosure);
360 
361     /**
362      * Parser thread entry point for (maybe) flushing the ops and posting
363      * a flush runnable back on the main thread.
364      */
365     void TimerFlush();
366 
367     /**
368      * Called when speculation fails.
369      */
MaybeDisableFutureSpeculation()370     void MaybeDisableFutureSpeculation()
371     {
372         mSpeculationFailureCount++;
373     }
374 
375     /**
376      * Used to check whether we're getting too many speculation failures and
377      * should just stop trying.  The 100 is picked pretty randomly to be not too
378      * small (so most pages are not affected) but small enough that we don't end
379      * up with failed speculations over and over in pathological cases.
380      */
IsSpeculationEnabled()381     bool IsSpeculationEnabled()
382     {
383         return mSpeculationFailureCount < 100;
384     }
385 
386     nsCOMPtr<nsIRequest>          mRequest;
387     nsCOMPtr<nsIRequestObserver>  mObserver;
388 
389     /**
390      * The document title to use if this turns out to be a View Source parser.
391      */
392     nsCString                     mViewSourceTitle;
393 
394     /**
395      * The Unicode decoder
396      */
397     nsCOMPtr<nsIUnicodeDecoder>   mUnicodeDecoder;
398 
399     /**
400      * The buffer for sniffing the character encoding
401      */
402     mozilla::UniquePtr<uint8_t[]> mSniffingBuffer;
403 
404     /**
405      * The number of meaningful bytes in mSniffingBuffer
406      */
407     uint32_t                      mSniffingLength;
408 
409     /**
410      * BOM sniffing state
411      */
412     eBomState                     mBomState;
413 
414     /**
415      * <meta> prescan implementation
416      */
417     nsAutoPtr<nsHtml5MetaScanner> mMetaScanner;
418 
419     // encoding-related stuff
420     /**
421      * The source (confidence) of the character encoding in use
422      */
423     int32_t                       mCharsetSource;
424 
425     /**
426      * The character encoding in use
427      */
428     nsCString                     mCharset;
429 
430     /**
431      * Whether reparse is forbidden
432      */
433     bool                          mReparseForbidden;
434 
435     // Portable parser objects
436     /**
437      * The first buffer in the pending UTF-16 buffer queue
438      */
439     RefPtr<nsHtml5OwningUTF16Buffer> mFirstBuffer;
440 
441     /**
442      * The last buffer in the pending UTF-16 buffer queue
443      */
444     nsHtml5OwningUTF16Buffer*     mLastBuffer; // weak ref; always points to
445                       // a buffer of the size NS_HTML5_STREAM_PARSER_READ_BUFFER_SIZE
446 
447     /**
448      * The tree operation executor
449      */
450     nsHtml5TreeOpExecutor*        mExecutor;
451 
452     /**
453      * The HTML5 tree builder
454      */
455     nsAutoPtr<nsHtml5TreeBuilder> mTreeBuilder;
456 
457     /**
458      * The HTML5 tokenizer
459      */
460     nsAutoPtr<nsHtml5Tokenizer>   mTokenizer;
461 
462     /**
463      * Makes sure the main thread can't mess the tokenizer state while it's
464      * tokenizing. This mutex also protects the current speculation.
465      */
466     mozilla::Mutex                mTokenizerMutex;
467 
468     /**
469      * The scoped atom table
470      */
471     nsHtml5AtomTable              mAtomTable;
472 
473     /**
474      * The owner parser.
475      */
476     RefPtr<nsHtml5Parser>       mOwner;
477 
478     /**
479      * Whether the last character tokenized was a carriage return (for CRLF)
480      */
481     bool                          mLastWasCR;
482 
483     /**
484      * For tracking stream life cycle
485      */
486     eHtml5StreamState             mStreamState;
487 
488     /**
489      * Whether we are speculating.
490      */
491     bool                          mSpeculating;
492 
493     /**
494      * Whether the tokenizer has reached EOF. (Reset when stream rewinded.)
495      */
496     bool                          mAtEOF;
497 
498     /**
499      * The speculations. The mutex protects the nsTArray itself.
500      * To access the queue of current speculation, mTokenizerMutex must be
501      * obtained.
502      * The current speculation is the last element
503      */
504     nsTArray<nsAutoPtr<nsHtml5Speculation> >  mSpeculations;
505     mozilla::Mutex                            mSpeculationMutex;
506 
507     /**
508      * Number of times speculation has failed for this parser.
509      */
510     uint32_t                      mSpeculationFailureCount;
511 
512     /**
513      * True to terminate early; protected by mTerminatedMutex
514      */
515     bool                          mTerminated;
516     bool                          mInterrupted;
517     mozilla::Mutex                mTerminatedMutex;
518 
519     /**
520      * The thread this stream parser runs on.
521      */
522     nsCOMPtr<nsIThread>           mThread;
523 
524     nsCOMPtr<nsIRunnable>         mExecutorFlusher;
525 
526     nsCOMPtr<nsIRunnable>         mLoadFlusher;
527 
528     /**
529      * The chardet instance if chardet is enabled.
530      */
531     nsCOMPtr<nsICharsetDetector>  mChardet;
532 
533     /**
534      * If false, don't push data to chardet.
535      */
536     bool                          mFeedChardet;
537 
538     /**
539      * Whether the initial charset source was kCharsetFromParentFrame
540      */
541     bool                          mInitialEncodingWasFromParentFrame;
542 
543     /**
544      * Timer for flushing tree ops once in a while when not speculating.
545      */
546     nsCOMPtr<nsITimer>            mFlushTimer;
547 
548     /**
549      * Keeps track whether mFlushTimer has been armed. Unfortunately,
550      * nsITimer doesn't enable querying this from the timer itself.
551      */
552     bool                          mFlushTimerArmed;
553 
554     /**
555      * False initially and true after the timer has fired at least once.
556      */
557     bool                          mFlushTimerEverFired;
558 
559     /**
560      * Whether the parser is doing a normal parse, view source or plain text.
561      */
562     eParserMode                   mMode;
563 
564     /**
565      * The pref html5.flushtimer.initialdelay: Time in milliseconds between
566      * the time a network buffer is seen and the timer firing when the
567      * timer hasn't fired previously in this parse.
568      */
569     static int32_t                sTimerInitialDelay;
570 
571     /**
572      * The pref html5.flushtimer.subsequentdelay: Time in milliseconds between
573      * the time a network buffer is seen and the timer firing when the
574      * timer has already fired previously in this parse.
575      */
576     static int32_t                sTimerSubsequentDelay;
577 };
578 
#endif // nsHtml5StreamParser_h
580