1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* vim: set sw=2 ts=2 et tw=80: */
3 /* This Source Code Form is subject to the terms of the Mozilla Public
4  * License, v. 2.0. If a copy of the MPL was not distributed with this
5  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6 
7 #include "nsHtml5StreamParser.h"
8 
9 #include <stdlib.h>
10 #include <string.h>
11 #include <algorithm>
12 #include <new>
13 #include <type_traits>
14 #include <utility>
15 #include "GeckoProfiler.h"
16 #include "js/GCAPI.h"
17 #include "mozilla/ArrayIterator.h"
18 #include "mozilla/Buffer.h"
19 #include "mozilla/CheckedInt.h"
20 #include "mozilla/DebugOnly.h"
21 #include "mozilla/Encoding.h"
22 #include "mozilla/EncodingDetector.h"
23 #include "mozilla/Likely.h"
24 #include "mozilla/Maybe.h"
25 #include "mozilla/SchedulerGroup.h"
26 #include "mozilla/ScopeExit.h"
27 #include "mozilla/Services.h"
28 #include "mozilla/StaticPrefs_html5.h"
29 #include "mozilla/StaticPrefs_intl.h"
30 #include "mozilla/TaskCategory.h"
31 #include "mozilla/Tuple.h"
32 #include "mozilla/UniquePtrExtensions.h"
33 #include "mozilla/Unused.h"
34 #include "mozilla/dom/BindingDeclarations.h"
35 #include "mozilla/dom/BrowsingContext.h"
36 #include "mozilla/dom/DebuggerUtilsBinding.h"
37 #include "mozilla/dom/DocGroup.h"
38 #include "mozilla/dom/Document.h"
39 #include "mozilla/mozalloc.h"
40 #include "nsContentSink.h"
41 #include "nsContentUtils.h"
42 #include "nsCycleCollectionTraversalCallback.h"
43 #include "nsHtml5AtomTable.h"
44 #include "nsHtml5ByteReadable.h"
45 #include "nsHtml5Highlighter.h"
46 #include "nsHtml5MetaScanner.h"
47 #include "nsHtml5Module.h"
48 #include "nsHtml5OwningUTF16Buffer.h"
49 #include "nsHtml5Parser.h"
50 #include "nsHtml5Speculation.h"
51 #include "nsHtml5StreamParserPtr.h"
52 #include "nsHtml5Tokenizer.h"
53 #include "nsHtml5TreeBuilder.h"
54 #include "nsHtml5TreeOpExecutor.h"
55 #include "nsHtml5TreeOpStage.h"
56 #include "nsIChannel.h"
57 #include "nsIContentSink.h"
58 #include "nsID.h"
59 #include "nsIDTD.h"
60 #include "nsIDocShell.h"
61 #include "nsIEventTarget.h"
62 #include "nsIHttpChannel.h"
63 #include "nsIInputStream.h"
64 #include "nsINestedURI.h"
65 #include "nsIObserverService.h"
66 #include "nsIRequest.h"
67 #include "nsIRunnable.h"
68 #include "nsIScriptError.h"
69 #include "nsIThread.h"
70 #include "nsIThreadRetargetableRequest.h"
71 #include "nsIThreadRetargetableStreamListener.h"
72 #include "nsITimer.h"
73 #include "nsIURI.h"
74 #include "nsJSEnvironment.h"
75 #include "nsLiteralString.h"
76 #include "nsNetUtil.h"
77 #include "nsString.h"
78 #include "nsTPromiseFlatString.h"
79 #include "nsThreadUtils.h"
80 #include "nsXULAppAPI.h"
81 
82 // Include expat after the other, since it defines XML_NS, which conflicts with
83 // our symbol names.
84 #include "expat_config.h"
85 #include "expat.h"
86 
extern "C" {
// Defined in intl/encoding_glue/src/lib.rs
// Takes a pointer/length pair over raw bytes. The caller in this file passes
// the sniffed prefix of the stream and treats a null return as "no encoding
// obtained from an XML declaration".
const mozilla::Encoding* xmldecl_parse(const uint8_t* buf, size_t buf_len);
};
91 
92 using namespace mozilla;
93 using namespace mozilla::dom;
94 
95 /*
96  * Note that nsHtml5StreamParser implements cycle collecting AddRef and
97  * Release. Therefore, nsHtml5StreamParser must never be refcounted from
98  * the parser thread!
99  *
 * To work around this limitation, runnables posted by the main thread to the
 * parser thread hold their reference to the stream parser in an
 * nsHtml5StreamParserPtr. Upon creation, nsHtml5StreamParserPtr addrefs the
 * object it holds just like a regular RefPtr. This is OK, since the creation
 * of the runnable and the nsHtml5StreamParserPtr happens on the main thread.
106  *
107  * When the runnable is done on the parser thread, the destructor of
108  * nsHtml5StreamParserPtr runs there. It doesn't call Release on the held object
109  * directly. Instead, it posts another runnable back to the main thread where
110  * that runnable calls Release on the wrapped object.
111  *
 * When posting runnables in the other direction, the runnables have to be
 * created on the main thread when nsHtml5StreamParser is instantiated and
 * held for the lifetime of the nsHtml5StreamParser. This works, because the
 * same runnable can be dispatched multiple times and currently runnables
 * posted from the parser thread to the main thread don't need to wrap any
 * runnable-specific data. (In the other direction, the runnables most notably
 * wrap the byte data of the stream.)
119  */
// Cycle-collecting AddRef/Release for nsHtml5StreamParser. As explained in
// the comment above, this is why the object must never be refcounted from the
// parser thread.
NS_IMPL_CYCLE_COLLECTING_ADDREF(nsHtml5StreamParser)
NS_IMPL_CYCLE_COLLECTING_RELEASE(nsHtml5StreamParser)

// Interface table: nsISupports is the only interface listed before handing
// over to the cycle collection segue.
NS_INTERFACE_TABLE_HEAD(nsHtml5StreamParser)
  NS_INTERFACE_TABLE(nsHtml5StreamParser, nsISupports)
  NS_INTERFACE_TABLE_TO_MAP_SEGUE_CYCLE_COLLECTION(nsHtml5StreamParser)
NS_INTERFACE_MAP_END
127 
NS_IMPL_CYCLE_COLLECTION_CLASS(nsHtml5StreamParser)

// Unlink: drops the flush timer, then clears the references that Traverse
// reports (mObserver, mRequest, mOwner and the executor).
NS_IMPL_CYCLE_COLLECTION_UNLINK_BEGIN(nsHtml5StreamParser)
  tmp->DropTimer();
  NS_IMPL_CYCLE_COLLECTION_UNLINK(mObserver)
  NS_IMPL_CYCLE_COLLECTION_UNLINK(mRequest)
  NS_IMPL_CYCLE_COLLECTION_UNLINK(mOwner)
  // Both flusher runnables were constructed with the executor and hold their
  // own reference to it, so they are released together with mExecutor.
  tmp->mExecutorFlusher = nullptr;
  tmp->mLoadFlusher = nullptr;
  tmp->mExecutor = nullptr;
NS_IMPL_CYCLE_COLLECTION_UNLINK_END
139 
// Traverse: reports mObserver, mRequest and mOwner to the cycle collector.
// The executor is reported once per live flusher runnable, because each
// runnable (rather than a directly traversed member) holds a strong reference
// to it.
NS_IMPL_CYCLE_COLLECTION_TRAVERSE_BEGIN(nsHtml5StreamParser)
  NS_IMPL_CYCLE_COLLECTION_TRAVERSE(mObserver)
  NS_IMPL_CYCLE_COLLECTION_TRAVERSE(mRequest)
  NS_IMPL_CYCLE_COLLECTION_TRAVERSE(mOwner)
  // hack: count the strongly owned edge wrapped in the runnable
  if (tmp->mExecutorFlusher) {
    NS_CYCLE_COLLECTION_NOTE_EDGE_NAME(cb, "mExecutorFlusher->mExecutor");
    cb.NoteXPCOMChild(static_cast<nsIContentSink*>(tmp->mExecutor));
  }
  // hack: count the strongly owned edge wrapped in the runnable
  if (tmp->mLoadFlusher) {
    NS_CYCLE_COLLECTION_NOTE_EDGE_NAME(cb, "mLoadFlusher->mExecutor");
    cb.NoteXPCOMChild(static_cast<nsIContentSink*>(tmp->mExecutor));
  }
NS_IMPL_CYCLE_COLLECTION_TRAVERSE_END
155 
156 class nsHtml5ExecutorFlusher : public Runnable {
157  private:
158   RefPtr<nsHtml5TreeOpExecutor> mExecutor;
159 
160  public:
nsHtml5ExecutorFlusher(nsHtml5TreeOpExecutor * aExecutor)161   explicit nsHtml5ExecutorFlusher(nsHtml5TreeOpExecutor* aExecutor)
162       : Runnable("nsHtml5ExecutorFlusher"), mExecutor(aExecutor) {}
Run()163   NS_IMETHOD Run() override {
164     if (!mExecutor->isInList()) {
165       Document* doc = mExecutor->GetDocument();
166       if (XRE_IsContentProcess() &&
167           nsContentUtils::
168               HighPriorityEventPendingForTopLevelDocumentBeforeContentfulPaint(
169                   doc)) {
170         // Possible early paint pending, reuse the runnable and try to
171         // call RunFlushLoop later.
172         nsCOMPtr<nsIRunnable> flusher = this;
173         if (NS_SUCCEEDED(
174                 doc->Dispatch(TaskCategory::Network, flusher.forget()))) {
175           PROFILER_MARKER_UNTYPED("HighPrio blocking parser flushing(1)", DOM);
176           return NS_OK;
177         }
178       }
179       mExecutor->RunFlushLoop();
180     }
181     return NS_OK;
182   }
183 };
184 
185 class nsHtml5LoadFlusher : public Runnable {
186  private:
187   RefPtr<nsHtml5TreeOpExecutor> mExecutor;
188 
189  public:
nsHtml5LoadFlusher(nsHtml5TreeOpExecutor * aExecutor)190   explicit nsHtml5LoadFlusher(nsHtml5TreeOpExecutor* aExecutor)
191       : Runnable("nsHtml5LoadFlusher"), mExecutor(aExecutor) {}
Run()192   NS_IMETHOD Run() override {
193     mExecutor->FlushSpeculativeLoads();
194     return NS_OK;
195   }
196 };
197 
// Constructs the stream parser. Must be called on the main thread.
//
// aExecutor: the tree op executor; its stage is handed to the tree builder
//            (depending on aMode) and it is wrapped by the two flusher
//            runnables created here.
// aOwner:    stored in mOwner.
// aMode:     parser mode; selects the tree builder arguments, whether the
//            tokenizer is created in view-source-XML mode, and whether a
//            highlighter is installed.
//
// NOTE(review): the initializers for mTreeBuilder and the highlighter read
// mExecutor, and mFlushTimer reads mEventTarget, so this relies on the member
// declaration order in the header — confirm there before reordering members.
nsHtml5StreamParser::nsHtml5StreamParser(nsHtml5TreeOpExecutor* aExecutor,
                                         nsHtml5Parser* aOwner,
                                         eParserMode aMode)
    : mSniffingLength(0),
      mBomState(eBomState::BOM_SNIFFING_NOT_STARTED),
      mCharsetSource(kCharsetUninitialized),
      mEncoding(WINDOWS_1252_ENCODING),
      mFeedChardet(true),
      mGuessEncoding(true),
      mReparseForbidden(false),
      mChannelHadCharset(false),
      mLastBuffer(nullptr),  // Will be filled when starting
      mExecutor(aExecutor),
      mTreeBuilder(new nsHtml5TreeBuilder(
          (aMode == VIEW_SOURCE_HTML || aMode == VIEW_SOURCE_XML)
              ? nullptr
              : mExecutor->GetStage(),
          aMode == NORMAL ? mExecutor->GetStage() : nullptr)),
      mTokenizer(
          new nsHtml5Tokenizer(mTreeBuilder.get(), aMode == VIEW_SOURCE_XML)),
      mTokenizerMutex("nsHtml5StreamParser mTokenizerMutex"),
      mOwner(aOwner),
      mLastWasCR(false),
      mStreamState(eHtml5StreamState::STREAM_NOT_STARTED),
      mSpeculating(false),
      mAtEOF(false),
      mSpeculationMutex("nsHtml5StreamParser mSpeculationMutex"),
      mSpeculationFailureCount(0),
      mLocalFileBytesBuffered(0),
      mTerminated(false),
      mInterrupted(false),
      mTerminatedMutex("nsHtml5StreamParser mTerminatedMutex"),
      mEventTarget(nsHtml5Module::GetStreamParserThread()->SerialEventTarget()),
      mExecutorFlusher(new nsHtml5ExecutorFlusher(aExecutor)),
      mLoadFlusher(new nsHtml5LoadFlusher(aExecutor)),
      mInitialEncodingWasFromParentFrame(false),
      mHasHadErrors(false),
      mDetectorHasSeenNonAscii(false),
      mDetectorHadOnlySeenAsciiWhenFirstGuessing(false),
      mDecodingLocalFileWithoutTokenizing(false),
      mFlushTimer(NS_NewTimer(mEventTarget)),
      mFlushTimerMutex("nsHtml5StreamParser mFlushTimerMutex"),
      mFlushTimerArmed(false),
      mFlushTimerEverFired(false),
      mMode(aMode) {
  NS_ASSERTION(NS_IsMainThread(), "Wrong thread!");
#ifdef DEBUG
  // Debug-only: tell the atom table which event target lookups are permitted
  // from.
  mAtomTable.SetPermittedLookupEventTarget(mEventTarget);
#endif
  mTokenizer->setInterner(&mAtomTable);
  mTokenizer->setEncodingDeclarationHandler(this);

  if (aMode == VIEW_SOURCE_HTML || aMode == VIEW_SOURCE_XML) {
    // One highlighter is shared by the tokenizer and the tree builder; only
    // the tokenizer owns it.
    nsHtml5Highlighter* highlighter =
        new nsHtml5Highlighter(mExecutor->GetStage());
    mTokenizer->EnableViewSource(highlighter);    // takes ownership
    mTreeBuilder->EnableViewSource(highlighter);  // doesn't own
  }

  // There's a zeroing operator new for everything else
}
259 
// Destructor. Must run on the main thread. Ends the tokenizer; in DEBUG
// builds it additionally asserts that the flush timer has already been
// dropped and explicitly clears the listed members.
nsHtml5StreamParser::~nsHtml5StreamParser() {
  NS_ASSERTION(NS_IsMainThread(), "Wrong thread!");
  mTokenizer->end();
#ifdef DEBUG
  {
    // mFlushTimer is read under its mutex; the scope releases the lock
    // before the members below are cleared.
    mozilla::MutexAutoLock flushTimerLock(mFlushTimerMutex);
    MOZ_ASSERT(!mFlushTimer, "Flush timer was not dropped before dtor!");
  }
  mRequest = nullptr;
  mObserver = nullptr;
  mUnicodeDecoder = nullptr;
  mSniffingBuffer = nullptr;
  mMetaScanner = nullptr;
  mFirstBuffer = nullptr;
  mExecutor = nullptr;
  mTreeBuilder = nullptr;
  mTokenizer = nullptr;
  mOwner = nullptr;
#endif
}
280 
GetChannel(nsIChannel ** aChannel)281 nsresult nsHtml5StreamParser::GetChannel(nsIChannel** aChannel) {
282   NS_ASSERTION(NS_IsMainThread(), "Wrong thread!");
283   return mRequest ? CallQueryInterface(mRequest, aChannel)
284                   : NS_ERROR_NOT_AVAILABLE;
285 }
286 
MaybeRollBackSource(int32_t aSource)287 int32_t nsHtml5StreamParser::MaybeRollBackSource(int32_t aSource) {
288   if (aSource ==
289       kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8DependedOnTLD) {
290     return kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8DependedOnTLD;
291   }
292   if (aSource == kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8Generic) {
293     return kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8Generic;
294   }
295   if (aSource == kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8Content) {
296     return kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8Content;
297   }
298   if (aSource == kCharsetFromFinalAutoDetectionWouldHaveBeenUTF8 &&
299       !mDetectorHadOnlySeenAsciiWhenFirstGuessing) {
300     return kCharsetFromInitialAutoDetectionWouldHaveBeenUTF8;
301   }
302   if (aSource == kCharsetFromFinalUserForcedAutoDetection) {
303     aSource = kCharsetFromInitialUserForcedAutoDetection;
304   }
305   return aSource;
306 }
307 
// Asks the encoding detector for a guess and records it, or requests a charset
// switch if a decoder has already been committed to a different encoding.
//
// aEof:     only consulted when no decoder exists yet and the charset source
//           is still uninitialized; in that case the source is rolled back
//           to its "Initial" variant.
// aInitial: true for the initial guess, false for the final one. The final
//           guess clears mGuessEncoding.
void nsHtml5StreamParser::GuessEncoding(bool aEof, bool aInitial) {
  if (aInitial) {
    // Remember whether the first guess was made on ASCII-only input;
    // MaybeRollBackSource() consults this flag.
    if (!mDetectorHasSeenNonAscii) {
      mDetectorHadOnlySeenAsciiWhenFirstGuessing = true;
    }
  } else {
    mGuessEncoding = false;
  }
  // "forced" means the current source is a pending or initial user-forced
  // auto-detection.
  bool forced = (mCharsetSource == kCharsetFromPendingUserForcedAutoDetection ||
                 mCharsetSource == kCharsetFromInitialUserForcedAutoDetection);
  // None of the "Final" detection sources may already be in effect here.
  MOZ_ASSERT(
      mCharsetSource != kCharsetFromFinalUserForcedAutoDetection &&
      mCharsetSource != kCharsetFromFinalAutoDetectionWouldHaveBeenUTF8 &&
      mCharsetSource !=
          kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8Generic &&
      mCharsetSource !=
          kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8Content &&
      mCharsetSource !=
          kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8DependedOnTLD &&
      mCharsetSource != kCharsetFromFinalAutoDetectionFile);
  // The guess made with an empty TLD and `true` as the second argument is
  // always computed; it is the actual result when forced, and otherwise only
  // used for classifying the source below.
  auto ifHadBeenForced = mDetector->Guess(EmptyCString(), true);
  auto encoding =
      forced ? ifHadBeenForced
             : mDetector->Guess(mTLD, mDecodingLocalFileWithoutTokenizing);
  // Base classification of the source from (initial/final, forced, local
  // file); the two "Generic" values are refined right after.
  int32_t source =
      aInitial
          ? (forced
                 ? kCharsetFromInitialUserForcedAutoDetection
                 : kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8Generic)
          : (forced
                 ? kCharsetFromFinalUserForcedAutoDetection
                 : (mDecodingLocalFileWithoutTokenizing
                        ? kCharsetFromFinalAutoDetectionFile
                        : kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8Generic));
  // Refine the generic source. The Final and Initial branches below mirror
  // each other, except that the ASCII-only case uses the Initial constant in
  // both.
  if (source == kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8Generic) {
    if (encoding == ISO_2022_JP_ENCODING) {
      if (EncodingDetector::TldMayAffectGuess(mTLD)) {
        source = kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8Content;
      }
    } else if (!mDetectorHasSeenNonAscii) {
      source = kCharsetFromInitialAutoDetectionASCII;  // deliberately Initial
    } else if (ifHadBeenForced == UTF_8_ENCODING) {
      source = kCharsetFromFinalAutoDetectionWouldHaveBeenUTF8;
    } else if (encoding != ifHadBeenForced) {
      source = kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8DependedOnTLD;
    } else if (EncodingDetector::TldMayAffectGuess(mTLD)) {
      source = kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8Content;
    }
  } else if (source ==
             kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8Generic) {
    if (encoding == ISO_2022_JP_ENCODING) {
      if (EncodingDetector::TldMayAffectGuess(mTLD)) {
        source = kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8Content;
      }
    } else if (!mDetectorHasSeenNonAscii) {
      source = kCharsetFromInitialAutoDetectionASCII;
    } else if (ifHadBeenForced == UTF_8_ENCODING) {
      source = kCharsetFromInitialAutoDetectionWouldHaveBeenUTF8;
    } else if (encoding != ifHadBeenForced) {
      source =
          kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8DependedOnTLD;
    } else if (EncodingDetector::TldMayAffectGuess(mTLD)) {
      source = kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8Content;
    }
  }
  if (HasDecoder() && !mDecodingLocalFileWithoutTokenizing) {
    if (mEncoding == encoding) {
      MOZ_ASSERT(mCharsetSource == kCharsetFromInitialAutoDetectionASCII ||
                     mCharsetSource < source,
                 "Why are we running chardet at all?");
      // Source didn't actually change between initial and final, so roll it
      // back for telemetry purposes.
      mCharsetSource = MaybeRollBackSource(source);
      mTreeBuilder->SetDocumentCharset(mEncoding, mCharsetSource);
    } else {
      MOZ_ASSERT(mCharsetSource < kCharsetFromXmlDeclarationUtf16 || forced);
      // We've already committed to a decoder. Request a reload from the
      // docshell.
      mTreeBuilder->NeedsCharsetSwitchTo(encoding, source, 0);
      FlushTreeOpsAndDisarmTimer();
      Interrupt();
    }
  } else {
    // Got a confident answer from the sniffing buffer. That code will
    // take care of setting up the decoder.
    if (mCharsetSource == kCharsetUninitialized && aEof) {
      // The document is so short that the initial buffer is the last
      // buffer.
      source = MaybeRollBackSource(source);
    }
    mEncoding = encoding;
    mCharsetSource = source;
    mTreeBuilder->SetDocumentCharset(mEncoding, mCharsetSource);
  }
}
403 
FeedDetector(Span<const uint8_t> aBuffer,bool aLast)404 void nsHtml5StreamParser::FeedDetector(Span<const uint8_t> aBuffer,
405                                        bool aLast) {
406   mDetectorHasSeenNonAscii = mDetector->Feed(aBuffer, aLast);
407 }
408 
SetViewSourceTitle(nsIURI * aURL)409 void nsHtml5StreamParser::SetViewSourceTitle(nsIURI* aURL) {
410   MOZ_ASSERT(NS_IsMainThread());
411 
412   BrowsingContext* browsingContext =
413       mExecutor->GetDocument()->GetBrowsingContext();
414   if (browsingContext && browsingContext->WatchedByDevTools()) {
415     mURIToSendToDevtools = aURL;
416 
417     nsID uuid;
418     nsresult rv = nsContentUtils::GenerateUUIDInPlace(uuid);
419     if (!NS_FAILED(rv)) {
420       char buffer[NSID_LENGTH];
421       uuid.ToProvidedString(buffer);
422       mUUIDForDevtools = NS_ConvertASCIItoUTF16(buffer);
423     }
424   }
425 
426   if (aURL) {
427     nsCOMPtr<nsIURI> temp;
428     if (aURL->SchemeIs("view-source")) {
429       nsCOMPtr<nsINestedURI> nested = do_QueryInterface(aURL);
430       nested->GetInnerURI(getter_AddRefs(temp));
431     } else {
432       temp = aURL;
433     }
434     if (temp->SchemeIs("data")) {
435       // Avoid showing potentially huge data: URLs. The three last bytes are
436       // UTF-8 for an ellipsis.
437       mViewSourceTitle.AssignLiteral("data:\xE2\x80\xA6");
438     } else {
439       nsresult rv = temp->GetSpec(mViewSourceTitle);
440       if (NS_FAILED(rv)) {
441         mViewSourceTitle.AssignLiteral("\xE2\x80\xA6");
442       }
443     }
444   }
445 }
446 
447 nsresult
SetupDecodingAndWriteSniffingBufferAndCurrentSegment(Span<const uint8_t> aFromSegment)448 nsHtml5StreamParser::SetupDecodingAndWriteSniffingBufferAndCurrentSegment(
449     Span<const uint8_t> aFromSegment) {
450   NS_ASSERTION(IsParserThread(), "Wrong thread!");
451   nsresult rv = NS_OK;
452   if (mDecodingLocalFileWithoutTokenizing &&
453       mCharsetSource <= kCharsetFromFallback) {
454     MOZ_ASSERT(mEncoding != UTF_8_ENCODING);
455     mUnicodeDecoder = UTF_8_ENCODING->NewDecoderWithBOMRemoval();
456   } else {
457     if (mCharsetSource >= kCharsetFromFinalAutoDetectionWouldHaveBeenUTF8) {
458       if (!(mCharsetSource == kCharsetFromPendingUserForcedAutoDetection ||
459             mCharsetSource == kCharsetFromInitialUserForcedAutoDetection)) {
460         DontGuessEncoding();
461       }
462       mDecodingLocalFileWithoutTokenizing = false;
463     }
464     mUnicodeDecoder = mEncoding->NewDecoderWithBOMRemoval();
465   }
466   if (mSniffingBuffer) {
467     rv = WriteStreamBytes(Span(mSniffingBuffer.get(), mSniffingLength));
468     NS_ENSURE_SUCCESS(rv, rv);
469     mSniffingBuffer = nullptr;
470   }
471   mMetaScanner = nullptr;
472   return WriteStreamBytes(aFromSegment);
473 }
474 
// Commits to aEncoding with kCharsetFromByteOrderMark as the charset source:
// creates the decoder, stops encoding guessing, informs the tree builder,
// discards the sniffing buffer and meta scanner, and marks BOM sniffing as
// over. Parser thread only.
void nsHtml5StreamParser::SetupDecodingFromBom(
    NotNull<const Encoding*> aEncoding) {
  MOZ_ASSERT(IsParserThread(), "Wrong thread!");
  mEncoding = aEncoding;
  mDecodingLocalFileWithoutTokenizing = false;
  // NOTE(review): the decoder is created without BOM handling, presumably
  // because the caller has already consumed the BOM — confirm at call sites.
  mUnicodeDecoder = mEncoding->NewDecoderWithoutBOMHandling();
  mCharsetSource = kCharsetFromByteOrderMark;
  DontGuessEncoding();
  mTreeBuilder->SetDocumentCharset(mEncoding, mCharsetSource);
  mSniffingBuffer = nullptr;
  mMetaScanner = nullptr;
  mBomState = BOM_SNIFFING_OVER;
}
488 
// Commits to aEncoding with kCharsetFromXmlDeclarationUtf16 as the charset
// source. Performs the same setup steps as SetupDecodingFromBom() and then
// appends the three characters "<?x" to mLastBuffer. Parser thread only.
void nsHtml5StreamParser::SetupDecodingFromUtf16BogoXml(
    NotNull<const Encoding*> aEncoding) {
  MOZ_ASSERT(IsParserThread(), "Wrong thread!");
  mEncoding = aEncoding;
  mDecodingLocalFileWithoutTokenizing = false;
  mUnicodeDecoder = mEncoding->NewDecoderWithoutBOMHandling();
  mCharsetSource = kCharsetFromXmlDeclarationUtf16;
  DontGuessEncoding();
  mTreeBuilder->SetDocumentCharset(mEncoding, mCharsetSource);
  mSniffingBuffer = nullptr;
  mMetaScanner = nullptr;
  mBomState = BOM_SNIFFING_OVER;
  // NOTE(review): presumably these are the characters consumed while
  // recognizing the UTF-16 XML declaration, re-inserted directly into the
  // output buffer; this also assumes mLastBuffer is non-null with at least
  // three free units — confirm with the caller.
  auto dst = mLastBuffer->TailAsSpan(READ_BUFFER_SIZE);
  dst[0] = '<';
  dst[1] = '?';
  dst[2] = 'x';
  mLastBuffer->AdvanceEnd(3);
}
507 
SniffBOMlessUTF16BasicLatin(const uint8_t * aBuf,size_t aBufLen)508 void nsHtml5StreamParser::SniffBOMlessUTF16BasicLatin(const uint8_t* aBuf,
509                                                       size_t aBufLen) {
510   // Avoid underspecified heuristic craziness for XHR
511   if (mMode == LOAD_AS_DATA) {
512     return;
513   }
514   // Make sure there's enough data. Require room for "<title></title>"
515   if (aBufLen < 30) {
516     return;
517   }
518   // even-numbered bytes tracked at 0, odd-numbered bytes tracked at 1
519   bool byteZero[2] = {false, false};
520   bool byteNonZero[2] = {false, false};
521   uint32_t i = 0;
522   for (; i < aBufLen; ++i) {
523     if (aBuf[i]) {
524       if (byteNonZero[1 - (i % 2)]) {
525         return;
526       }
527       byteNonZero[i % 2] = true;
528     } else {
529       if (byteZero[1 - (i % 2)]) {
530         return;
531       }
532       byteZero[i % 2] = true;
533     }
534   }
535   if (byteNonZero[0]) {
536     mEncoding = UTF_16LE_ENCODING;
537   } else {
538     mEncoding = UTF_16BE_ENCODING;
539   }
540   mCharsetSource = kCharsetFromIrreversibleAutoDetection;
541   mTreeBuilder->SetDocumentCharset(mEncoding, mCharsetSource);
542   DontGuessEncoding();
543   mTreeBuilder->MaybeComplainAboutCharset("EncBomlessUtf16", true, 0);
544 }
545 
SetEncodingFromExpat(const char16_t * aEncoding)546 void nsHtml5StreamParser::SetEncodingFromExpat(const char16_t* aEncoding) {
547   if (aEncoding) {
548     nsDependentString utf16(aEncoding);
549     nsAutoCString utf8;
550     CopyUTF16toUTF8(utf16, utf8);
551     auto encoding = PreferredForInternalEncodingDecl(utf8);
552     if (encoding) {
553       mEncoding = WrapNotNull(encoding);
554       mCharsetSource = kCharsetFromMetaTag;  // closest for XML
555       mTreeBuilder->SetDocumentCharset(mEncoding, mCharsetSource);
556       return;
557     }
558     // else the page declared an encoding Gecko doesn't support and we'd
559     // end up defaulting to UTF-8 anyway. Might as well fall through here
560     // right away and let the encoding be set to UTF-8 which we'd default to
561     // anyway.
562   }
563   mEncoding = UTF_8_ENCODING;            // XML defaults to UTF-8 without a BOM
564   mCharsetSource = kCharsetFromMetaTag;  // means confident
565   mTreeBuilder->SetDocumentCharset(mEncoding, mCharsetSource);
566 }
567 
568 // A separate user data struct is used instead of passing the
569 // nsHtml5StreamParser instance as user data in order to avoid including
570 // expat.h in nsHtml5StreamParser.h. Doing that would cause naming conflicts.
571 // Using a separate user data struct also avoids bloating nsHtml5StreamParser
572 // by one pointer.
// User data handed to the expat handler callbacks in this file.
struct UserData {
  // The expat parser instance; the callbacks use it to stop parsing.
  XML_Parser mExpat;
  // The stream parser that receives the encoding from the XML declaration
  // handler.
  nsHtml5StreamParser* mStreamParser;
};
577 
// Using no-namespace handler callbacks to avoid including expat.h in
// nsHtml5StreamParser.h, since doing so would cause naming conflicts.
HandleXMLDeclaration(void * aUserData,const XML_Char * aVersion,const XML_Char * aEncoding,int aStandalone)580 static void HandleXMLDeclaration(void* aUserData, const XML_Char* aVersion,
581                                  const XML_Char* aEncoding, int aStandalone) {
582   UserData* ud = static_cast<UserData*>(aUserData);
583   ud->mStreamParser->SetEncodingFromExpat(
584       reinterpret_cast<const char16_t*>(aEncoding));
585   XML_StopParser(ud->mExpat, false);
586 }
587 
HandleStartElement(void * aUserData,const XML_Char * aName,const XML_Char ** aAtts)588 static void HandleStartElement(void* aUserData, const XML_Char* aName,
589                                const XML_Char** aAtts) {
590   UserData* ud = static_cast<UserData*>(aUserData);
591   XML_StopParser(ud->mExpat, false);
592 }
593 
HandleEndElement(void * aUserData,const XML_Char * aName)594 static void HandleEndElement(void* aUserData, const XML_Char* aName) {
595   UserData* ud = static_cast<UserData*>(aUserData);
596   XML_StopParser(ud->mExpat, false);
597 }
598 
HandleComment(void * aUserData,const XML_Char * aName)599 static void HandleComment(void* aUserData, const XML_Char* aName) {
600   UserData* ud = static_cast<UserData*>(aUserData);
601   XML_StopParser(ud->mExpat, false);
602 }
603 
HandleProcessingInstruction(void * aUserData,const XML_Char * aTarget,const XML_Char * aData)604 static void HandleProcessingInstruction(void* aUserData,
605                                         const XML_Char* aTarget,
606                                         const XML_Char* aData) {
607   UserData* ud = static_cast<UserData*>(aUserData);
608   XML_StopParser(ud->mExpat, false);
609 }
610 
FinalizeSniffingWithDetector(Span<const uint8_t> aFromSegment,uint32_t aCountToSniffingLimit,bool aEof)611 void nsHtml5StreamParser::FinalizeSniffingWithDetector(
612     Span<const uint8_t> aFromSegment, uint32_t aCountToSniffingLimit,
613     bool aEof) {
614   if (mFeedChardet && mSniffingBuffer) {
615     FeedDetector(Span(mSniffingBuffer.get(), mSniffingLength), false);
616   }
617   if (mFeedChardet && !aFromSegment.IsEmpty()) {
618     // Avoid buffer boundary-dependent behavior.
619     FeedDetector(aFromSegment.To(aCountToSniffingLimit), false);
620   }
621   bool guess = mFeedChardet;
622   if (mFeedChardet && aEof && aCountToSniffingLimit <= aFromSegment.Length()) {
623     FeedDetector(Span<const uint8_t>(), true);
624     mFeedChardet = false;
625   }
626   if (guess) {
627     GuessEncoding(aEof, (guess == mFeedChardet));
628   }
629   if (mReparseForbidden) {
630     DontGuessEncoding();
631   }
632   if (mFeedChardet && !aEof && aCountToSniffingLimit < aFromSegment.Length()) {
633     // Avoid buffer boundary-dependent behavior.
634     FeedDetector(aFromSegment.From(aCountToSniffingLimit), false);
635   }
636 }
637 
FinalizeSniffing(Span<const uint8_t> aFromSegment,uint32_t aCountToSniffingLimit,bool aEof)638 nsresult nsHtml5StreamParser::FinalizeSniffing(Span<const uint8_t> aFromSegment,
639                                                uint32_t aCountToSniffingLimit,
640                                                bool aEof) {
641   MOZ_ASSERT(IsParserThread(), "Wrong thread!");
642   MOZ_ASSERT(mCharsetSource < kCharsetFromXmlDeclarationUtf16,
643              "Should not finalize sniffing with strong decision already made.");
644   if (mMode == VIEW_SOURCE_XML) {
645     static const XML_Memory_Handling_Suite memsuite = {
646         (void* (*)(size_t))moz_xmalloc, (void* (*)(void*, size_t))moz_xrealloc,
647         free};
648 
649     static const char16_t kExpatSeparator[] = {0xFFFF, '\0'};
650 
651     static const char16_t kISO88591[] = {'I', 'S', 'O', '-', '8', '8',
652                                          '5', '9', '-', '1', '\0'};
653 
654     UserData ud;
655     ud.mStreamParser = this;
656 
657     // If we got this far, the stream didn't have a BOM. UTF-16-encoded XML
658     // documents MUST begin with a BOM. We don't support EBCDIC and such.
659     // Thus, at this point, what we have is garbage or something encoded using
660     // a rough ASCII superset. ISO-8859-1 allows us to decode ASCII bytes
661     // without throwing errors when bytes have the most significant bit set
662     // and without triggering expat's unknown encoding code paths. This is
663     // enough to be able to use expat to parse the XML declaration in order
664     // to extract the encoding name from it.
665     ud.mExpat = XML_ParserCreate_MM(kISO88591, &memsuite, kExpatSeparator);
666     XML_SetXmlDeclHandler(ud.mExpat, HandleXMLDeclaration);
667     XML_SetElementHandler(ud.mExpat, HandleStartElement, HandleEndElement);
668     XML_SetCommentHandler(ud.mExpat, HandleComment);
669     XML_SetProcessingInstructionHandler(ud.mExpat, HandleProcessingInstruction);
670     XML_SetUserData(ud.mExpat, static_cast<void*>(&ud));
671 
672     XML_Status status = XML_STATUS_OK;
673 
674     // aFromSegment points to the data obtained from the current network
675     // event. mSniffingBuffer (if it exists) contains the data obtained before
676     // the current event. Thus, mSniffingLenth bytes of mSniffingBuffer
677     // followed by aCountToSniffingLimit bytes from aFromSegment are the
678     // first 1024 bytes of the file (or the file as a whole if the file is
679     // 1024 bytes long or shorter). Thus, we parse both buffers, but if the
680     // first call succeeds already, we skip parsing the second buffer.
681     if (mSniffingBuffer) {
682       status = XML_Parse(ud.mExpat,
683                          reinterpret_cast<const char*>(mSniffingBuffer.get()),
684                          mSniffingLength, false);
685     }
686     if (status == XML_STATUS_OK && mCharsetSource < kCharsetFromMetaTag) {
687       mozilla::Unused << XML_Parse(
688           ud.mExpat, reinterpret_cast<const char*>(aFromSegment.Elements()),
689           aCountToSniffingLimit, false);
690     }
691     XML_ParserFree(ud.mExpat);
692 
693     if (mCharsetSource < kCharsetFromMetaTag) {
694       // Failed to get an encoding from the XML declaration. XML defaults
695       // confidently to UTF-8 in this case.
696       // It is also possible that the document has an XML declaration that is
697       // longer than 1024 bytes, but that case is not worth worrying about.
698       mEncoding = UTF_8_ENCODING;
699       mCharsetSource = kCharsetFromMetaTag;  // means confident
700       mTreeBuilder->SetDocumentCharset(mEncoding, mCharsetSource);
701     }
702 
703     return SetupDecodingAndWriteSniffingBufferAndCurrentSegment(aFromSegment);
704   }
705   bool forced = (mCharsetSource == kCharsetFromPendingUserForcedAutoDetection ||
706                  mCharsetSource == kCharsetFromInitialUserForcedAutoDetection ||
707                  mCharsetSource == kCharsetFromFinalUserForcedAutoDetection);
708   if (!mChannelHadCharset &&
709       (forced || mCharsetSource < kCharsetFromMetaPrescan) &&
710       (mMode == NORMAL || mMode == VIEW_SOURCE_HTML || mMode == LOAD_AS_DATA)) {
711     // Look for XML declaration in text/html.
712 
713     const uint8_t* buf;
714     size_t bufLen;
715     if (mSniffingLength) {
716       // Copy data to a contiguous buffer if we already have something buffered
717       // up.
718       memcpy(mSniffingBuffer.get() + mSniffingLength, aFromSegment.Elements(),
719              aCountToSniffingLimit);
720       mSniffingLength += aCountToSniffingLimit;
721       aFromSegment = aFromSegment.From(aCountToSniffingLimit);
722       aCountToSniffingLimit = 0;
723       buf = mSniffingBuffer.get();
724       bufLen = mSniffingLength;
725     } else {
726       buf = aFromSegment.Elements();
727       bufLen = aCountToSniffingLimit;
728     }
729     const Encoding* encoding = xmldecl_parse(buf, bufLen);
730     if (encoding) {
731       if (forced &&
732           (encoding->IsAsciiCompatible() || encoding == ISO_2022_JP_ENCODING)) {
733         // Honor override
734         if (mCharsetSource == kCharsetFromFinalUserForcedAutoDetection) {
735           DontGuessEncoding();
736         } else {
737           FinalizeSniffingWithDetector(aFromSegment, aCountToSniffingLimit,
738                                        false);
739         }
740         return SetupDecodingAndWriteSniffingBufferAndCurrentSegment(
741             aFromSegment);
742       }
743       DontGuessEncoding();
744       mEncoding = WrapNotNull(encoding);
745       mCharsetSource = kCharsetFromXmlDeclaration;
746       mTreeBuilder->SetDocumentCharset(mEncoding, mCharsetSource);
747     } else if (mCharsetSource < kCharsetFromIrreversibleAutoDetection) {
748       // meta scan and XML declaration check failed.
749       // Check for BOMless UTF-16 with Basic
750       // Latin content for compat with IE. See bug 631751.
751       SniffBOMlessUTF16BasicLatin(buf, bufLen);
752     }
753   }
754   if (forced && mCharsetSource != kCharsetFromIrreversibleAutoDetection) {
755     // neither meta nor XML declaration found, honor override
756     if (mCharsetSource == kCharsetFromFinalUserForcedAutoDetection) {
757       DontGuessEncoding();
758     } else {
759       FinalizeSniffingWithDetector(aFromSegment, aCountToSniffingLimit, false);
760     }
761     return SetupDecodingAndWriteSniffingBufferAndCurrentSegment(aFromSegment);
762   }
763 
764   // the charset may have been set now
765   // maybe try chardet now;
766   if (mFeedChardet) {
767     FinalizeSniffingWithDetector(aFromSegment, aCountToSniffingLimit, aEof);
768     // fall thru; charset may have changed
769   }
770   if (mCharsetSource == kCharsetUninitialized) {
771     // Hopefully this case is never needed, but dealing with it anyway
772     mEncoding = WINDOWS_1252_ENCODING;
773     mCharsetSource = kCharsetFromFallback;
774     mTreeBuilder->SetDocumentCharset(mEncoding, mCharsetSource);
775   } else if (mMode == LOAD_AS_DATA && mCharsetSource == kCharsetFromFallback) {
776     NS_ASSERTION(mReparseForbidden, "Reparse should be forbidden for XHR");
777     NS_ASSERTION(!mFeedChardet, "Should not feed chardet for XHR");
778     NS_ASSERTION(mEncoding == UTF_8_ENCODING, "XHR should default to UTF-8");
779     // Now mark charset source as non-weak to signal that we have a decision
780     mCharsetSource = kCharsetFromDocTypeDefault;
781     mTreeBuilder->SetDocumentCharset(mEncoding, mCharsetSource);
782   }
783   return SetupDecodingAndWriteSniffingBufferAndCurrentSegment(aFromSegment);
784 }
785 
SniffStreamBytes(Span<const uint8_t> aFromSegment)786 nsresult nsHtml5StreamParser::SniffStreamBytes(
787     Span<const uint8_t> aFromSegment) {
788   MOZ_ASSERT(IsParserThread(), "Wrong thread!");
789   // mEncoding and mCharsetSource potentially have come from channel or higher
790   // by now. If we find a BOM, SetupDecodingFromBom() will overwrite them.
791   // If we don't find a BOM, the previously set values of mEncoding and
792   // mCharsetSource are not modified by the BOM sniffing here.
793   for (uint32_t i = 0;
794        i < aFromSegment.Length() && mBomState != BOM_SNIFFING_OVER; i++) {
795     switch (mBomState) {
796       case BOM_SNIFFING_NOT_STARTED:
797         MOZ_ASSERT(i == 0, "Bad BOM sniffing state.");
798         switch (aFromSegment[0]) {
799           case 0xEF:
800             mBomState = SEEN_UTF_8_FIRST_BYTE;
801             break;
802           case 0xFF:
803             mBomState = SEEN_UTF_16_LE_FIRST_BYTE;
804             break;
805           case 0xFE:
806             mBomState = SEEN_UTF_16_BE_FIRST_BYTE;
807             break;
808           case 0x00:
809             if (mCharsetSource < kCharsetFromXmlDeclarationUtf16 &&
810                 !mChannelHadCharset) {
811               mBomState = SEEN_UTF_16_BE_XML_FIRST;
812             } else {
813               mBomState = BOM_SNIFFING_OVER;
814             }
815             break;
816           case 0x3C:
817             if (mCharsetSource < kCharsetFromXmlDeclarationUtf16 &&
818                 !mChannelHadCharset) {
819               mBomState = SEEN_UTF_16_LE_XML_FIRST;
820             } else {
821               mBomState = BOM_SNIFFING_OVER;
822             }
823             break;
824           default:
825             mBomState = BOM_SNIFFING_OVER;
826             break;
827         }
828         break;
829       case SEEN_UTF_16_LE_FIRST_BYTE:
830         if (aFromSegment[i] == 0xFE) {
831           SetupDecodingFromBom(UTF_16LE_ENCODING);
832           return WriteStreamBytes(aFromSegment.From(i + 1));
833         }
834         mBomState = BOM_SNIFFING_OVER;
835         break;
836       case SEEN_UTF_16_BE_FIRST_BYTE:
837         if (aFromSegment[i] == 0xFF) {
838           SetupDecodingFromBom(UTF_16BE_ENCODING);
839           return WriteStreamBytes(aFromSegment.From(i + 1));
840         }
841         mBomState = BOM_SNIFFING_OVER;
842         break;
843       case SEEN_UTF_8_FIRST_BYTE:
844         if (aFromSegment[i] == 0xBB) {
845           mBomState = SEEN_UTF_8_SECOND_BYTE;
846         } else {
847           mBomState = BOM_SNIFFING_OVER;
848         }
849         break;
850       case SEEN_UTF_8_SECOND_BYTE:
851         if (aFromSegment[i] == 0xBF) {
852           SetupDecodingFromBom(UTF_8_ENCODING);
853           return WriteStreamBytes(aFromSegment.From(i + 1));
854         }
855         mBomState = BOM_SNIFFING_OVER;
856         break;
857       case SEEN_UTF_16_BE_XML_FIRST:
858         if (aFromSegment[i] == 0x3C) {
859           mBomState = SEEN_UTF_16_BE_XML_SECOND;
860         } else {
861           mBomState = BOM_SNIFFING_OVER;
862         }
863         break;
864       case SEEN_UTF_16_BE_XML_SECOND:
865         if (aFromSegment[i] == 0x00) {
866           mBomState = SEEN_UTF_16_BE_XML_THIRD;
867         } else {
868           mBomState = BOM_SNIFFING_OVER;
869         }
870         break;
871       case SEEN_UTF_16_BE_XML_THIRD:
872         if (aFromSegment[i] == 0x3F) {
873           mBomState = SEEN_UTF_16_BE_XML_FOURTH;
874         } else {
875           mBomState = BOM_SNIFFING_OVER;
876         }
877         break;
878       case SEEN_UTF_16_BE_XML_FOURTH:
879         if (aFromSegment[i] == 0x00) {
880           mBomState = SEEN_UTF_16_BE_XML_FIFTH;
881         } else {
882           mBomState = BOM_SNIFFING_OVER;
883         }
884         break;
885       case SEEN_UTF_16_BE_XML_FIFTH:
886         if (aFromSegment[i] == 0x78) {
887           SetupDecodingFromUtf16BogoXml(UTF_16BE_ENCODING);
888           return WriteStreamBytes(aFromSegment.From(i + 1));
889         }
890         mBomState = BOM_SNIFFING_OVER;
891         break;
892       case SEEN_UTF_16_LE_XML_FIRST:
893         if (aFromSegment[i] == 0x00) {
894           mBomState = SEEN_UTF_16_LE_XML_SECOND;
895         } else {
896           mBomState = BOM_SNIFFING_OVER;
897         }
898         break;
899       case SEEN_UTF_16_LE_XML_SECOND:
900         if (aFromSegment[i] == 0x3F) {
901           mBomState = SEEN_UTF_16_LE_XML_THIRD;
902         } else {
903           mBomState = BOM_SNIFFING_OVER;
904         }
905         break;
906       case SEEN_UTF_16_LE_XML_THIRD:
907         if (aFromSegment[i] == 0x00) {
908           mBomState = SEEN_UTF_16_LE_XML_FOURTH;
909         } else {
910           mBomState = BOM_SNIFFING_OVER;
911         }
912         break;
913       case SEEN_UTF_16_LE_XML_FOURTH:
914         if (aFromSegment[i] == 0x78) {
915           mBomState = SEEN_UTF_16_LE_XML_FIFTH;
916         } else {
917           mBomState = BOM_SNIFFING_OVER;
918         }
919         break;
920       case SEEN_UTF_16_LE_XML_FIFTH:
921         if (aFromSegment[i] == 0x00) {
922           SetupDecodingFromUtf16BogoXml(UTF_16LE_ENCODING);
923           return WriteStreamBytes(aFromSegment.From(i + 1));
924         }
925         mBomState = BOM_SNIFFING_OVER;
926         break;
927       default:
928         mBomState = BOM_SNIFFING_OVER;
929         break;
930     }
931   }
932   // if we get here, there either was no BOM or the BOM sniffing isn't complete
933   // yet
934 
935   MOZ_ASSERT(mCharsetSource != kCharsetFromByteOrderMark,
936              "Should not come here if BOM was found.");
937   MOZ_ASSERT(mCharsetSource != kCharsetFromXmlDeclarationUtf16,
938              "Should not come here if UTF-16 bogo-XML declaration was found.");
939   MOZ_ASSERT(mCharsetSource != kCharsetFromOtherComponent,
940              "kCharsetFromOtherComponent is for XSLT.");
941 
942   if (mBomState == BOM_SNIFFING_OVER && mCharsetSource == kCharsetFromChannel) {
943     // There was no BOM and the charset came from channel. mEncoding
944     // still contains the charset from the channel as set by an
945     // earlier call to SetDocumentCharset(), since we didn't find a BOM and
946     // overwrite mEncoding. (Note that if the user has overridden the charset,
947     // we don't come here but check <meta> for XSS-dangerous charsets first.)
948     mTreeBuilder->SetDocumentCharset(mEncoding, mCharsetSource);
949     return SetupDecodingAndWriteSniffingBufferAndCurrentSegment(aFromSegment);
950   }
951 
952   if (!mChannelHadCharset && !mMetaScanner &&
953       (mMode == NORMAL || mMode == VIEW_SOURCE_HTML || mMode == LOAD_AS_DATA)) {
954     mMetaScanner = MakeUnique<nsHtml5MetaScanner>(mTreeBuilder.get());
955   }
956 
957   if (mSniffingLength + aFromSegment.Length() >= SNIFFING_BUFFER_SIZE) {
958     // this is the last buffer
959     uint32_t countToSniffingLimit = SNIFFING_BUFFER_SIZE - mSniffingLength;
960     bool forced =
961         (mCharsetSource == kCharsetFromPendingUserForcedAutoDetection ||
962          mCharsetSource == kCharsetFromInitialUserForcedAutoDetection ||
963          mCharsetSource == kCharsetFromFinalUserForcedAutoDetection);
964     if (!mChannelHadCharset && (mMode == NORMAL || mMode == VIEW_SOURCE_HTML ||
965                                 mMode == LOAD_AS_DATA)) {
966       nsHtml5ByteReadable readable(
967           aFromSegment.Elements(),
968           aFromSegment.Elements() + countToSniffingLimit);
969       nsAutoCString charset;
970       auto encoding = mMetaScanner->sniff(&readable);
971       // Due to the way nsHtml5Portability reports OOM, ask the tree buider
972       nsresult rv;
973       if (NS_FAILED((rv = mTreeBuilder->IsBroken()))) {
974         MarkAsBroken(rv);
975         return rv;
976       }
977 
978       if (encoding) {
979         // meta scan successful; honor overrides unless meta is XSS-dangerous
980         if (forced && (encoding->IsAsciiCompatible() ||
981                        encoding == ISO_2022_JP_ENCODING)) {
982           // Honor override
983           if (mCharsetSource == kCharsetFromFinalUserForcedAutoDetection) {
984             DontGuessEncoding();
985           } else {
986             FinalizeSniffingWithDetector(aFromSegment, countToSniffingLimit,
987                                          false);
988           }
989           return SetupDecodingAndWriteSniffingBufferAndCurrentSegment(
990               aFromSegment);
991         }
992         DontGuessEncoding();
993         mEncoding = WrapNotNull(encoding);
994         mCharsetSource = kCharsetFromMetaPrescan;
995         mTreeBuilder->SetDocumentCharset(mEncoding, mCharsetSource);
996         return SetupDecodingAndWriteSniffingBufferAndCurrentSegment(
997             aFromSegment);
998       }
999     }
1000     return FinalizeSniffing(aFromSegment, countToSniffingLimit, false);
1001   }
1002 
1003   // not the last buffer
1004   if (!mChannelHadCharset &&
1005       (mMode == NORMAL || mMode == VIEW_SOURCE_HTML || mMode == LOAD_AS_DATA)) {
1006     nsHtml5ByteReadable readable(
1007         aFromSegment.Elements(),
1008         aFromSegment.Elements() + aFromSegment.Length());
1009     auto encoding = mMetaScanner->sniff(&readable);
1010     // Due to the way nsHtml5Portability reports OOM, ask the tree buider
1011     nsresult rv;
1012     if (NS_FAILED((rv = mTreeBuilder->IsBroken()))) {
1013       MarkAsBroken(rv);
1014       return rv;
1015     }
1016     if (encoding) {
1017       // meta scan successful; honor overrides unless meta is XSS-dangerous
1018       if ((mCharsetSource == kCharsetFromFinalUserForcedAutoDetection) &&
1019           (encoding->IsAsciiCompatible() || encoding == ISO_2022_JP_ENCODING)) {
1020         // Honor override
1021         return SetupDecodingAndWriteSniffingBufferAndCurrentSegment(
1022             aFromSegment);
1023       }
1024       if ((mCharsetSource == kCharsetFromPendingUserForcedAutoDetection ||
1025            mCharsetSource == kCharsetFromInitialUserForcedAutoDetection) &&
1026           (encoding->IsAsciiCompatible() || encoding == ISO_2022_JP_ENCODING)) {
1027         FinalizeSniffingWithDetector(aFromSegment, aFromSegment.Length(),
1028                                      false);
1029       } else {
1030         DontGuessEncoding();
1031         mEncoding = WrapNotNull(encoding);
1032         mCharsetSource = kCharsetFromMetaPrescan;
1033         mTreeBuilder->SetDocumentCharset(mEncoding, mCharsetSource);
1034       }
1035       return SetupDecodingAndWriteSniffingBufferAndCurrentSegment(aFromSegment);
1036     }
1037   }
1038 
1039   if (!mSniffingBuffer) {
1040     mSniffingBuffer = MakeUniqueFallible<uint8_t[]>(SNIFFING_BUFFER_SIZE);
1041     if (!mSniffingBuffer) {
1042       return NS_ERROR_OUT_OF_MEMORY;
1043     }
1044   }
1045   memcpy(&mSniffingBuffer[mSniffingLength], aFromSegment.Elements(),
1046          aFromSegment.Length());
1047   mSniffingLength += aFromSegment.Length();
1048   return NS_OK;
1049 }
1050 
1051 class AddContentRunnable : public Runnable {
1052  public:
AddContentRunnable(const nsAString & aParserID,nsIURI * aURI,Span<const char16_t> aData,bool aComplete)1053   AddContentRunnable(const nsAString& aParserID, nsIURI* aURI,
1054                      Span<const char16_t> aData, bool aComplete)
1055       : Runnable("AddContent") {
1056     nsAutoCString spec;
1057     aURI->GetSpec(spec);
1058     mData.mUri.Construct(NS_ConvertUTF8toUTF16(spec));
1059     mData.mParserID.Construct(aParserID);
1060     mData.mContents.Construct(aData.Elements(), aData.Length());
1061     mData.mComplete.Construct(aComplete);
1062   }
1063 
Run()1064   NS_IMETHOD Run() override {
1065     nsAutoString json;
1066     if (!mData.ToJSON(json)) {
1067       return NS_ERROR_FAILURE;
1068     }
1069 
1070     nsCOMPtr<nsIObserverService> obsService = services::GetObserverService();
1071     if (obsService) {
1072       obsService->NotifyObservers(nullptr, "devtools-html-content",
1073                                   PromiseFlatString(json).get());
1074     }
1075 
1076     return NS_OK;
1077   }
1078 
1079   HTMLContent mData;
1080 };
1081 
OnNewContent(Span<const char16_t> aData)1082 inline void nsHtml5StreamParser::OnNewContent(Span<const char16_t> aData) {
1083   if (mURIToSendToDevtools) {
1084     NS_DispatchToMainThread(new AddContentRunnable(mUUIDForDevtools,
1085                                                    mURIToSendToDevtools, aData,
1086                                                    /* aComplete */ false));
1087   }
1088 }
1089 
OnContentComplete()1090 inline void nsHtml5StreamParser::OnContentComplete() {
1091   if (mURIToSendToDevtools) {
1092     NS_DispatchToMainThread(new AddContentRunnable(
1093         mUUIDForDevtools, mURIToSendToDevtools, Span<const char16_t>(),
1094         /* aComplete */ true));
1095     mURIToSendToDevtools = nullptr;
1096   }
1097 }
1098 
WriteStreamBytes(Span<const uint8_t> aFromSegment)1099 nsresult nsHtml5StreamParser::WriteStreamBytes(
1100     Span<const uint8_t> aFromSegment) {
1101   NS_ASSERTION(IsParserThread(), "Wrong thread!");
1102   // mLastBuffer should always point to a buffer of the size
1103   // READ_BUFFER_SIZE.
1104   if (!mLastBuffer) {
1105     NS_WARNING("mLastBuffer should not be null!");
1106     MarkAsBroken(NS_ERROR_NULL_POINTER);
1107     return NS_ERROR_NULL_POINTER;
1108   }
1109   size_t totalRead = 0;
1110   auto src = aFromSegment;
1111   for (;;) {
1112     auto dst = mLastBuffer->TailAsSpan(READ_BUFFER_SIZE);
1113     uint32_t result;
1114     size_t read;
1115     size_t written;
1116     bool hadErrors;
1117     Tie(result, read, written, hadErrors) =
1118         mUnicodeDecoder->DecodeToUTF16(src, dst, false);
1119     if (!mDecodingLocalFileWithoutTokenizing) {
1120       OnNewContent(dst.To(written));
1121     }
1122     if (hadErrors && !mHasHadErrors) {
1123       mHasHadErrors = true;
1124       if (mEncoding == UTF_8_ENCODING) {
1125         mTreeBuilder->TryToEnableEncodingMenu();
1126       }
1127     }
1128     src = src.From(read);
1129     totalRead += read;
1130     mLastBuffer->AdvanceEnd(written);
1131     if (result == kOutputFull) {
1132       RefPtr<nsHtml5OwningUTF16Buffer> newBuf =
1133           nsHtml5OwningUTF16Buffer::FalliblyCreate(READ_BUFFER_SIZE);
1134       if (!newBuf) {
1135         MarkAsBroken(NS_ERROR_OUT_OF_MEMORY);
1136         return NS_ERROR_OUT_OF_MEMORY;
1137       }
1138       mLastBuffer = (mLastBuffer->next = std::move(newBuf));
1139     } else {
1140       MOZ_ASSERT(totalRead == aFromSegment.Length(),
1141                  "The Unicode decoder consumed the wrong number of bytes.");
1142       if (mDecodingLocalFileWithoutTokenizing &&
1143           mLocalFileBytesBuffered == LOCAL_FILE_UTF_8_BUFFER_SIZE) {
1144         auto encoding = mEncoding;
1145         GuessEncoding(false, false);
1146         if (encoding == mEncoding) {
1147           CommitLocalFileToEncoding();
1148         } else {
1149           ReDecodeLocalFile();
1150         }
1151       }
1152       return NS_OK;
1153     }
1154   }
1155 }
1156 
ReDecodeLocalFile()1157 void nsHtml5StreamParser::ReDecodeLocalFile() {
1158   MOZ_ASSERT(mDecodingLocalFileWithoutTokenizing);
1159   mDecodingLocalFileWithoutTokenizing = false;
1160   mUnicodeDecoder = mEncoding->NewDecoderWithBOMRemoval();
1161   mHasHadErrors = false;
1162 
1163   DontGuessEncoding();
1164 
1165   // Throw away previous decoded data
1166   mLastBuffer = mFirstBuffer;
1167   mLastBuffer->next = nullptr;
1168   mLastBuffer->setStart(0);
1169   mLastBuffer->setEnd(0);
1170 
1171   // Decode again
1172   for (auto&& buffer : mBufferedLocalFileData) {
1173     DoDataAvailable(buffer);
1174   }
1175 }
1176 
CommitLocalFileToEncoding()1177 void nsHtml5StreamParser::CommitLocalFileToEncoding() {
1178   MOZ_ASSERT(mDecodingLocalFileWithoutTokenizing);
1179   mDecodingLocalFileWithoutTokenizing = false;
1180   mFeedChardet = false;
1181   mGuessEncoding = false;
1182 
1183   nsHtml5OwningUTF16Buffer* buffer = mFirstBuffer;
1184   while (buffer) {
1185     Span<const char16_t> data(buffer->getBuffer() + buffer->getStart(),
1186                               buffer->getLength());
1187     OnNewContent(data);
1188     buffer = buffer->next;
1189   }
1190 }
1191 
1192 class MaybeRunCollector : public Runnable {
1193  public:
MaybeRunCollector(nsIDocShell * aDocShell)1194   explicit MaybeRunCollector(nsIDocShell* aDocShell)
1195       : Runnable("MaybeRunCollector"), mDocShell(aDocShell) {}
1196 
Run()1197   NS_IMETHOD Run() override {
1198     nsJSContext::MaybeRunNextCollectorSlice(mDocShell,
1199                                             JS::GCReason::HTML_PARSER);
1200     return NS_OK;
1201   }
1202 
1203   nsCOMPtr<nsIDocShell> mDocShell;
1204 };
1205 
/**
 * Main-thread handler for the start of the request feeding this parser.
 *
 * Forwards the notification to mObserver (if any), stores the request, moves
 * the stream state to STREAM_BEING_READ and starts the tokenizer, tree
 * builder and executor. For non-srcdoc channels whose charset source is at
 * most kCharsetFromFallback it inspects the channel's URIs: a "resource"
 * original URI pins the encoding to UTF-8, a "file" innermost URI switches on
 * the decode-local-file-without-tokenizing mode, and any other URI has the
 * last label of its host recorded (lower-cased) in mTLD. It also allocates
 * the first decode buffer, decides whether reparsing is forbidden, tries to
 * retarget data delivery to mEventTarget and, when the charset source is
 * already at least kCharsetFromUtf8OnlyMime, instantiates the decoder right
 * away.
 *
 * @param aRequest the request that is starting; stored in mRequest.
 * @return the failure from mExecutor->WillBuildModel(), the result of
 *         mExecutor->MarkAsBroken() if the first buffer cannot be allocated,
 *         or NS_OK.
 */
nsresult nsHtml5StreamParser::OnStartRequest(nsIRequest* aRequest) {
  MOZ_RELEASE_ASSERT(STREAM_NOT_STARTED == mStreamState,
                     "Got OnStartRequest when the stream had already started.");
  MOZ_ASSERT(
      !mExecutor->HasStarted(),
      "Got OnStartRequest at the wrong stage in the executor life cycle.");
  MOZ_ASSERT(NS_IsMainThread(), "Wrong thread!");

  // To avoid the cost of instantiating the detector when it's not needed,
  // let's instantiate only if we make it out of this method with the
  // intent to use it.
  auto detectorCreator = MakeScopeExit([&] {
    if (mFeedChardet) {
      mDetector = mozilla::EncodingDetector::Create();
    }
  });

  if (mObserver) {
    mObserver->OnStartRequest(aRequest);
  }
  mRequest = aRequest;

  mStreamState = STREAM_BEING_READ;

  if (mMode == VIEW_SOURCE_HTML || mMode == VIEW_SOURCE_XML) {
    mTokenizer->StartViewSource(NS_ConvertUTF8toUTF16(mViewSourceTitle));
  }

  // For View Source, the parser should run with scripts "enabled" if a normal
  // load would have scripts enabled.
  bool scriptingEnabled =
      mMode == LOAD_AS_DATA ? false : mExecutor->IsScriptEnabled();
  mOwner->StartTokenizer(scriptingEnabled);

  MOZ_ASSERT(!mDecodingLocalFileWithoutTokenizing);
  bool isSrcdoc = false;
  nsCOMPtr<nsIChannel> channel;
  nsresult rv = GetChannel(getter_AddRefs(channel));
  if (NS_SUCCEEDED(rv)) {
    isSrcdoc = NS_IsSrcdocChannel(channel);
    // Only look at the URIs when nothing stronger than the fallback has
    // determined the charset so far.
    if (!isSrcdoc && mCharsetSource <= kCharsetFromFallback) {
      nsCOMPtr<nsIURI> originalURI;
      rv = channel->GetOriginalURI(getter_AddRefs(originalURI));
      if (NS_SUCCEEDED(rv)) {
        if (originalURI->SchemeIs("resource")) {
          // resource: URIs are treated as UTF-8 with a built-in source.
          mCharsetSource = kCharsetFromBuiltIn;
          mEncoding = UTF_8_ENCODING;
          mTreeBuilder->SetDocumentCharset(mEncoding, mCharsetSource);
        } else {
          nsCOMPtr<nsIURI> currentURI;
          rv = channel->GetURI(getter_AddRefs(currentURI));
          if (NS_SUCCEEDED(rv)) {
            nsCOMPtr<nsIURI> innermost = NS_GetInnermostURI(currentURI);
            if (innermost->SchemeIs("file")) {
              mDecodingLocalFileWithoutTokenizing = true;
            } else {
              // Record the last host label in mTLD.
              nsAutoCString host;
              innermost->GetAsciiHost(host);
              if (!host.IsEmpty()) {
                // First let's see if the host is DNS-absolute and ends with a
                // dot and get rid of that one.
                if (host.Last() == '.') {
                  host.SetLength(host.Length() - 1);
                }
                int32_t index = host.RFindChar('.');
                if (index != kNotFound) {
                  // We tolerate an IPv4 component as generic "TLD", so don't
                  // bother checking.
                  ToLowerCase(
                      Substring(host, index + 1, host.Length() - (index + 1)),
                      mTLD);
                }
              }
            }
          }
        }
      }
    }
  }
  mTreeBuilder->setIsSrcdocDocument(isSrcdoc);
  mTreeBuilder->setScriptingEnabled(scriptingEnabled);
  // Script execution is prevented unless this is a NORMAL load with scripting
  // enabled.
  mTreeBuilder->SetPreventScriptExecution(
      !((mMode == NORMAL) && scriptingEnabled));
  mTokenizer->start();
  mExecutor->Start();
  mExecutor->StartReadingFromStage();

  if (mMode == PLAIN_TEXT) {
    mTreeBuilder->StartPlainText();
    mTokenizer->StartPlainText();
  } else if (mMode == VIEW_SOURCE_PLAIN) {
    nsAutoString viewSourceTitle;
    CopyUTF8toUTF16(mViewSourceTitle, viewSourceTitle);
    mTreeBuilder->EnsureBufferSpace(viewSourceTitle.Length());
    mTreeBuilder->StartPlainTextViewSource(viewSourceTitle);
    mTokenizer->StartPlainText();
  }

  /*
   * If you move the following line, be very careful not to cause
   * WillBuildModel to be called before the document has had its
   * script global object set.
   */
  rv = mExecutor->WillBuildModel(eDTDMode_unknown);
  NS_ENSURE_SUCCESS(rv, rv);

  RefPtr<nsHtml5OwningUTF16Buffer> newBuf =
      nsHtml5OwningUTF16Buffer::FalliblyCreate(READ_BUFFER_SIZE);
  if (!newBuf) {
    // marks this stream parser as terminated,
    // which prevents entry to code paths that
    // would use mFirstBuffer or mLastBuffer.
    return mExecutor->MarkAsBroken(NS_ERROR_OUT_OF_MEMORY);
  }
  MOZ_ASSERT(!mFirstBuffer, "How come we have the first buffer set?");
  MOZ_ASSERT(!mLastBuffer, "How come we have the last buffer set?");
  mFirstBuffer = mLastBuffer = newBuf;

  rv = NS_OK;

  // The line below means that the encoding can end up being wrong if
  // a view-source URL is loaded without having the encoding hint from a
  // previous normal load in the history.
  mReparseForbidden = !(mMode == NORMAL || mMode == PLAIN_TEXT);

  mNetworkEventTarget =
      mExecutor->GetDocument()->EventTargetFor(TaskCategory::Network);

  // rv is overwritten here with the outcome of the QueryInterface.
  nsCOMPtr<nsIHttpChannel> httpChannel(do_QueryInterface(mRequest, &rv));
  if (NS_SUCCEEDED(rv)) {
    // Non-HTTP channels are bogus enough that we let them work with unlabeled
    // runnables for now. Asserting for HTTP channels only.
    MOZ_ASSERT(mNetworkEventTarget || mMode == LOAD_AS_DATA,
               "How come the network event target is still null?");

    nsAutoCString method;
    Unused << httpChannel->GetRequestMethod(method);
    // XXX does Necko have a way to renavigate POST, etc. without hitting
    // the network?
    if (!method.EqualsLiteral("GET")) {
      // This is the old Gecko behavior but the HTML5 spec disagrees.
      // Don't reparse on POST.
      mReparseForbidden = true;
    }
  }

  // Attempt to retarget delivery of data (via OnDataAvailable) to the parser
  // thread, rather than through the main thread.
  nsCOMPtr<nsIThreadRetargetableRequest> threadRetargetableRequest =
      do_QueryInterface(mRequest, &rv);
  if (threadRetargetableRequest) {
    rv = threadRetargetableRequest->RetargetDeliveryTo(mEventTarget);
    if (NS_SUCCEEDED(rv)) {
      // Parser thread should be now ready to get data from necko and parse it
      // and main thread might have a chance to process a collector slice.
      // We need to do this asynchronously so that necko may continue processing
      // the request.
      nsCOMPtr<nsIRunnable> runnable =
          new MaybeRunCollector(mExecutor->GetDocument()->GetDocShell());
      mozilla::SchedulerGroup::Dispatch(
          mozilla::TaskCategory::GarbageCollection, runnable.forget());
    }
  }

  // rv holds either the QueryInterface failure or the RetargetDeliveryTo()
  // result at this point; a failure is only warned about, not returned.
  if (NS_FAILED(rv)) {
    NS_WARNING("Failed to retarget HTML data delivery to the parser thread.");
  }

  if (mCharsetSource == kCharsetFromParentFrame) {
    // Remember this for error reporting.
    mInitialEncodingWasFromParentFrame = true;
  }

  // Unless the user forced auto-detection, a charset source at or above
  // kCharsetFromFinalAutoDetectionWouldHaveBeenUTF8 turns guessing off.
  if (!(mCharsetSource == kCharsetFromPendingUserForcedAutoDetection ||
        mCharsetSource == kCharsetFromInitialUserForcedAutoDetection ||
        mCharsetSource == kCharsetFromFinalUserForcedAutoDetection)) {
    if (mCharsetSource >= kCharsetFromFinalAutoDetectionWouldHaveBeenUTF8) {
      DontGuessEncoding();
    }
  }

  if (mCharsetSource < kCharsetFromUtf8OnlyMime) {
    // we aren't ready to commit to an encoding yet
    // leave converter uninstantiated for now
    return NS_OK;
  }

  // We are loading JSON/WebVTT/etc. into a browsing context.
  // There's no need to remove the BOM manually here, because
  // the UTF-8 decoder removes it.
  mReparseForbidden = true;
  DontGuessEncoding();

  // Instantiate the converter here to avoid BOM sniffing.
  mDecodingLocalFileWithoutTokenizing = false;
  mUnicodeDecoder = mEncoding->NewDecoderWithBOMRemoval();
  return NS_OK;
}
1404 
CheckListenerChain()1405 nsresult nsHtml5StreamParser::CheckListenerChain() {
1406   NS_ASSERTION(NS_IsMainThread(), "Should be on the main thread!");
1407   if (!mObserver) {
1408     return NS_OK;
1409   }
1410   nsresult rv;
1411   nsCOMPtr<nsIThreadRetargetableStreamListener> retargetable =
1412       do_QueryInterface(mObserver, &rv);
1413   if (NS_SUCCEEDED(rv) && retargetable) {
1414     rv = retargetable->CheckListenerChain();
1415   }
1416   return rv;
1417 }
1418 
DoStopRequest()1419 void nsHtml5StreamParser::DoStopRequest() {
1420   NS_ASSERTION(IsParserThread(), "Wrong thread!");
1421   MOZ_RELEASE_ASSERT(STREAM_BEING_READ == mStreamState,
1422                      "Stream ended without being open.");
1423   mTokenizerMutex.AssertCurrentThreadOwns();
1424 
1425   auto guard = MakeScopeExit([&] { OnContentComplete(); });
1426 
1427   if (IsTerminated()) {
1428     return;
1429   }
1430 
1431   if (!mUnicodeDecoder) {
1432     nsresult rv;
1433     Span<const uint8_t> empty;
1434     if (NS_FAILED(rv = FinalizeSniffing(empty, 0, true))) {
1435       MarkAsBroken(rv);
1436       return;
1437     }
1438   }
1439   if (mFeedChardet) {
1440     mFeedChardet = false;
1441     FeedDetector(Span<uint8_t>(), true);
1442   }
1443 
1444   MOZ_ASSERT(mUnicodeDecoder,
1445              "Should have a decoder after finalizing sniffing.");
1446 
1447   // mLastBuffer should always point to a buffer of the size
1448   // READ_BUFFER_SIZE.
1449   if (!mLastBuffer) {
1450     NS_WARNING("mLastBuffer should not be null!");
1451     MarkAsBroken(NS_ERROR_NULL_POINTER);
1452     return;
1453   }
1454 
1455   Span<uint8_t> src;  // empty span
1456   for (;;) {
1457     auto dst = mLastBuffer->TailAsSpan(READ_BUFFER_SIZE);
1458     uint32_t result;
1459     size_t read;
1460     size_t written;
1461     bool hadErrors;
1462     Tie(result, read, written, hadErrors) =
1463         mUnicodeDecoder->DecodeToUTF16(src, dst, true);
1464     if (!mDecodingLocalFileWithoutTokenizing) {
1465       OnNewContent(dst.To(written));
1466     }
1467     if (hadErrors && !mHasHadErrors) {
1468       mHasHadErrors = true;
1469       if (mEncoding == UTF_8_ENCODING) {
1470         mTreeBuilder->TryToEnableEncodingMenu();
1471       }
1472     }
1473     MOZ_ASSERT(read == 0, "How come an empty span was read form?");
1474     mLastBuffer->AdvanceEnd(written);
1475     if (result == kOutputFull) {
1476       RefPtr<nsHtml5OwningUTF16Buffer> newBuf =
1477           nsHtml5OwningUTF16Buffer::FalliblyCreate(READ_BUFFER_SIZE);
1478       if (!newBuf) {
1479         MarkAsBroken(NS_ERROR_OUT_OF_MEMORY);
1480         return;
1481       }
1482       mLastBuffer = (mLastBuffer->next = std::move(newBuf));
1483     } else {
1484       if (mDecodingLocalFileWithoutTokenizing) {
1485         MOZ_ASSERT(mLocalFileBytesBuffered < LOCAL_FILE_UTF_8_BUFFER_SIZE);
1486         MOZ_ASSERT(mGuessEncoding);
1487         auto encoding = mEncoding;
1488         GuessEncoding(true, false);
1489         if (encoding == mEncoding) {
1490           CommitLocalFileToEncoding();
1491         } else {
1492           ReDecodeLocalFile();
1493           DoStopRequest();
1494           return;
1495         }
1496       } else if (mGuessEncoding) {
1497         GuessEncoding(true, false);
1498       }
1499       break;
1500     }
1501   }
1502 
1503   mStreamState = STREAM_ENDED;
1504 
1505   if (IsTerminatedOrInterrupted()) {
1506     return;
1507   }
1508 
1509   ParseAvailableData();
1510 }
1511 
1512 class nsHtml5RequestStopper : public Runnable {
1513  private:
1514   nsHtml5StreamParserPtr mStreamParser;
1515 
1516  public:
nsHtml5RequestStopper(nsHtml5StreamParser * aStreamParser)1517   explicit nsHtml5RequestStopper(nsHtml5StreamParser* aStreamParser)
1518       : Runnable("nsHtml5RequestStopper"), mStreamParser(aStreamParser) {}
Run()1519   NS_IMETHOD Run() override {
1520     mozilla::MutexAutoLock autoLock(mStreamParser->mTokenizerMutex);
1521     mStreamParser->DoStopRequest();
1522     return NS_OK;
1523   }
1524 };
1525 
OnStopRequest(nsIRequest * aRequest,nsresult status)1526 nsresult nsHtml5StreamParser::OnStopRequest(nsIRequest* aRequest,
1527                                             nsresult status) {
1528   NS_ASSERTION(mRequest == aRequest, "Got Stop on wrong stream.");
1529   NS_ASSERTION(NS_IsMainThread(), "Wrong thread!");
1530   if (mObserver) {
1531     mObserver->OnStopRequest(aRequest, status);
1532   }
1533   nsCOMPtr<nsIRunnable> stopper = new nsHtml5RequestStopper(this);
1534   if (NS_FAILED(mEventTarget->Dispatch(stopper, nsIThread::DISPATCH_NORMAL))) {
1535     NS_WARNING("Dispatching StopRequest event failed.");
1536   }
1537   return NS_OK;
1538 }
1539 
DoDataAvailableBuffer(mozilla::Buffer<uint8_t> && aBuffer)1540 void nsHtml5StreamParser::DoDataAvailableBuffer(
1541     mozilla::Buffer<uint8_t>&& aBuffer) {
1542   if (MOZ_LIKELY(!mDecodingLocalFileWithoutTokenizing)) {
1543     DoDataAvailable(aBuffer);
1544     return;
1545   }
1546   CheckedInt<size_t> bufferedPlusLength(aBuffer.Length());
1547   bufferedPlusLength += mLocalFileBytesBuffered;
1548   if (!bufferedPlusLength.isValid()) {
1549     MarkAsBroken(NS_ERROR_OUT_OF_MEMORY);
1550     return;
1551   }
1552   // Ensure that WriteStreamBytes() sees a buffer ending
1553   // exactly at LOCAL_FILE_UTF_8_BUFFER_SIZE
1554   // if we are about to cross the threshold. This way,
1555   // Necko buffer boundaries don't affect user-visible
1556   // behavior.
1557   if (bufferedPlusLength.value() <= LOCAL_FILE_UTF_8_BUFFER_SIZE) {
1558     // Truncation OK, because we just checked the range.
1559     mLocalFileBytesBuffered = bufferedPlusLength.value();
1560     mBufferedLocalFileData.AppendElement(std::move(aBuffer));
1561     DoDataAvailable(mBufferedLocalFileData.LastElement());
1562   } else {
1563     // Truncation OK, because the constant is small enough.
1564     size_t overBoundary =
1565         bufferedPlusLength.value() - LOCAL_FILE_UTF_8_BUFFER_SIZE;
1566     MOZ_RELEASE_ASSERT(overBoundary < aBuffer.Length());
1567     size_t untilBoundary = aBuffer.Length() - overBoundary;
1568     auto span = aBuffer.AsSpan();
1569     auto head = span.To(untilBoundary);
1570     auto tail = span.From(untilBoundary);
1571     MOZ_RELEASE_ASSERT(mLocalFileBytesBuffered + untilBoundary ==
1572                        LOCAL_FILE_UTF_8_BUFFER_SIZE);
1573     // We make a theoretically useless copy here, because avoiding
1574     // the copy adds too much complexity.
1575     Maybe<Buffer<uint8_t>> maybe = Buffer<uint8_t>::CopyFrom(head);
1576     if (maybe.isNothing()) {
1577       MarkAsBroken(NS_ERROR_OUT_OF_MEMORY);
1578       return;
1579     }
1580     mLocalFileBytesBuffered = LOCAL_FILE_UTF_8_BUFFER_SIZE;
1581     mBufferedLocalFileData.AppendElement(std::move(*maybe));
1582 
1583     DoDataAvailable(head);
1584     // Re-decode may have happened here.
1585     DoDataAvailable(tail);
1586   }
1587   // Do this clean-up here to avoid use-after-free when
1588   // DoDataAvailable is passed a span pointing into an
1589   // element of mBufferedLocalFileData.
1590   if (!mDecodingLocalFileWithoutTokenizing) {
1591     mBufferedLocalFileData.Clear();
1592   }
1593 }
1594 
DoDataAvailable(Span<const uint8_t> aBuffer)1595 void nsHtml5StreamParser::DoDataAvailable(Span<const uint8_t> aBuffer) {
1596   NS_ASSERTION(IsParserThread(), "Wrong thread!");
1597   MOZ_RELEASE_ASSERT(STREAM_BEING_READ == mStreamState,
1598                      "DoDataAvailable called when stream not open.");
1599   mTokenizerMutex.AssertCurrentThreadOwns();
1600 
1601   if (IsTerminated()) {
1602     return;
1603   }
1604 
1605   nsresult rv;
1606   if (HasDecoder()) {
1607     if (mFeedChardet) {
1608       FeedDetector(aBuffer, false);
1609     }
1610     rv = WriteStreamBytes(aBuffer);
1611   } else {
1612     rv = SniffStreamBytes(aBuffer);
1613   }
1614   if (NS_FAILED(rv)) {
1615     MarkAsBroken(rv);
1616     return;
1617   }
1618 
1619   if (IsTerminatedOrInterrupted()) {
1620     return;
1621   }
1622 
1623   if (mDecodingLocalFileWithoutTokenizing) {
1624     return;
1625   }
1626 
1627   ParseAvailableData();
1628 
1629   if (mFlushTimerArmed || mSpeculating) {
1630     return;
1631   }
1632 
1633   {
1634     mozilla::MutexAutoLock flushTimerLock(mFlushTimerMutex);
1635     mFlushTimer->InitWithNamedFuncCallback(
1636         nsHtml5StreamParser::TimerCallback, static_cast<void*>(this),
1637         mFlushTimerEverFired ? StaticPrefs::html5_flushtimer_initialdelay()
1638                              : StaticPrefs::html5_flushtimer_subsequentdelay(),
1639         nsITimer::TYPE_ONE_SHOT, "nsHtml5StreamParser::DoDataAvailable");
1640   }
1641   mFlushTimerArmed = true;
1642 }
1643 
1644 class nsHtml5DataAvailable : public Runnable {
1645  private:
1646   nsHtml5StreamParserPtr mStreamParser;
1647   Buffer<uint8_t> mData;
1648 
1649  public:
nsHtml5DataAvailable(nsHtml5StreamParser * aStreamParser,Buffer<uint8_t> && aData)1650   nsHtml5DataAvailable(nsHtml5StreamParser* aStreamParser,
1651                        Buffer<uint8_t>&& aData)
1652       : Runnable("nsHtml5DataAvailable"),
1653         mStreamParser(aStreamParser),
1654         mData(std::move(aData)) {}
Run()1655   NS_IMETHOD Run() override {
1656     mozilla::MutexAutoLock autoLock(mStreamParser->mTokenizerMutex);
1657     mStreamParser->DoDataAvailableBuffer(std::move(mData));
1658     return NS_OK;
1659   }
1660 };
1661 
OnDataAvailable(nsIRequest * aRequest,nsIInputStream * aInStream,uint64_t aSourceOffset,uint32_t aLength)1662 nsresult nsHtml5StreamParser::OnDataAvailable(nsIRequest* aRequest,
1663                                               nsIInputStream* aInStream,
1664                                               uint64_t aSourceOffset,
1665                                               uint32_t aLength) {
1666   nsresult rv;
1667   if (NS_FAILED(rv = mExecutor->IsBroken())) {
1668     return rv;
1669   }
1670 
1671   MOZ_ASSERT(mRequest == aRequest, "Got data on wrong stream.");
1672   uint32_t totalRead;
1673   // Main thread to parser thread dispatch requires copying to buffer first.
1674   if (MOZ_UNLIKELY(NS_IsMainThread())) {
1675     Maybe<Buffer<uint8_t>> maybe = Buffer<uint8_t>::Alloc(aLength);
1676     if (maybe.isNothing()) {
1677       return mExecutor->MarkAsBroken(NS_ERROR_OUT_OF_MEMORY);
1678     }
1679     Buffer<uint8_t> data(std::move(*maybe));
1680     rv = aInStream->Read(reinterpret_cast<char*>(data.Elements()),
1681                          data.Length(), &totalRead);
1682     NS_ENSURE_SUCCESS(rv, rv);
1683     MOZ_ASSERT(totalRead == aLength);
1684 
1685     nsCOMPtr<nsIRunnable> dataAvailable =
1686         new nsHtml5DataAvailable(this, std::move(data));
1687     if (NS_FAILED(mEventTarget->Dispatch(dataAvailable,
1688                                          nsIThread::DISPATCH_NORMAL))) {
1689       NS_WARNING("Dispatching DataAvailable event failed.");
1690     }
1691     return rv;
1692   }
1693   MOZ_ASSERT(IsParserThread(), "Wrong thread!");
1694   mozilla::MutexAutoLock autoLock(mTokenizerMutex);
1695 
1696   if (MOZ_UNLIKELY(mDecodingLocalFileWithoutTokenizing)) {
1697     // It's a bit sad to potentially buffer the first 1024
1698     // bytes in two places, but it's a lot simpler than trying
1699     // to optitize out that copy. It only happens for local files
1700     // and not for the http(s) content anyway.
1701     Maybe<Buffer<uint8_t>> maybe = Buffer<uint8_t>::Alloc(aLength);
1702     if (maybe.isNothing()) {
1703       MarkAsBroken(NS_ERROR_OUT_OF_MEMORY);
1704       return NS_ERROR_OUT_OF_MEMORY;
1705     }
1706     Buffer<uint8_t> data(std::move(*maybe));
1707     rv = aInStream->Read(reinterpret_cast<char*>(data.Elements()),
1708                          data.Length(), &totalRead);
1709     NS_ENSURE_SUCCESS(rv, rv);
1710     MOZ_ASSERT(totalRead == aLength);
1711     DoDataAvailableBuffer(std::move(data));
1712     return rv;
1713   }
1714   // Read directly from response buffer.
1715   rv = aInStream->ReadSegments(CopySegmentsToParser, this, aLength, &totalRead);
1716   NS_ENSURE_SUCCESS(rv, rv);
1717   MOZ_ASSERT(totalRead == aLength);
1718   return rv;
1719 }
1720 
1721 /* static */
CopySegmentsToParser(nsIInputStream * aInStream,void * aClosure,const char * aFromSegment,uint32_t aToOffset,uint32_t aCount,uint32_t * aWriteCount)1722 nsresult nsHtml5StreamParser::CopySegmentsToParser(
1723     nsIInputStream* aInStream, void* aClosure, const char* aFromSegment,
1724     uint32_t aToOffset, uint32_t aCount, uint32_t* aWriteCount) {
1725   nsHtml5StreamParser* parser = static_cast<nsHtml5StreamParser*>(aClosure);
1726 
1727   parser->DoDataAvailable(AsBytes(Span(aFromSegment, aCount)));
1728   // Assume DoDataAvailable consumed all available bytes.
1729   *aWriteCount = aCount;
1730   return NS_OK;
1731 }
1732 
PreferredForInternalEncodingDecl(const nsACString & aEncoding)1733 const Encoding* nsHtml5StreamParser::PreferredForInternalEncodingDecl(
1734     const nsACString& aEncoding) {
1735   const Encoding* newEncoding = Encoding::ForLabel(aEncoding);
1736   if (!newEncoding) {
1737     // the encoding name is bogus
1738     mTreeBuilder->MaybeComplainAboutCharset("EncMetaUnsupported", true,
1739                                             mTokenizer->getLineNumber());
1740     return nullptr;
1741   }
1742 
1743   if (newEncoding == UTF_16BE_ENCODING || newEncoding == UTF_16LE_ENCODING) {
1744     mTreeBuilder->MaybeComplainAboutCharset("EncMetaUtf16", true,
1745                                             mTokenizer->getLineNumber());
1746     newEncoding = UTF_8_ENCODING;
1747   }
1748 
1749   if (newEncoding == X_USER_DEFINED_ENCODING) {
1750     // WebKit/Blink hack for Indian and Armenian legacy sites
1751     mTreeBuilder->MaybeComplainAboutCharset("EncMetaUserDefined", true,
1752                                             mTokenizer->getLineNumber());
1753     newEncoding = WINDOWS_1252_ENCODING;
1754   }
1755 
1756   if (newEncoding == mEncoding) {
1757     if (mCharsetSource < kCharsetFromMetaPrescan) {
1758       if (mInitialEncodingWasFromParentFrame) {
1759         mTreeBuilder->MaybeComplainAboutCharset("EncLateMetaFrame", false,
1760                                                 mTokenizer->getLineNumber());
1761       } else {
1762         mTreeBuilder->MaybeComplainAboutCharset("EncLateMeta", false,
1763                                                 mTokenizer->getLineNumber());
1764       }
1765     }
1766     mCharsetSource = kCharsetFromMetaTag;  // become confident
1767     mTreeBuilder->SetDocumentCharset(mEncoding, mCharsetSource);
1768     DontGuessEncoding();  // don't feed chardet when confident
1769     return nullptr;
1770   }
1771 
1772   return newEncoding;
1773 }
1774 
internalEncodingDeclaration(nsHtml5String aEncoding)1775 bool nsHtml5StreamParser::internalEncodingDeclaration(nsHtml5String aEncoding) {
1776   // This code needs to stay in sync with
1777   // nsHtml5MetaScanner::tryCharset. Unfortunately, the
1778   // trickery with member fields there leads to some copy-paste reuse. :-(
1779   NS_ASSERTION(IsParserThread(), "Wrong thread!");
1780   if (mCharsetSource >= kCharsetFromMetaTag) {  // this threshold corresponds to
1781                                                 // "confident" in the HTML5 spec
1782     return false;
1783   }
1784 
1785   nsString newEncoding16;  // Not Auto, because using it to hold nsStringBuffer*
1786   aEncoding.ToString(newEncoding16);
1787   nsAutoCString newEncoding;
1788   CopyUTF16toUTF8(newEncoding16, newEncoding);
1789 
1790   auto encoding = PreferredForInternalEncodingDecl(newEncoding);
1791   if (!encoding) {
1792     return false;
1793   }
1794 
1795   if (mReparseForbidden) {
1796     // This mReparseForbidden check happens after the call to
1797     // PreferredForInternalEncodingDecl so that if that method calls
1798     // MaybeComplainAboutCharset, its charset complaint wins over the one
1799     // below.
1800     mTreeBuilder->MaybeComplainAboutCharset("EncLateMetaTooLate", true,
1801                                             mTokenizer->getLineNumber());
1802     return false;  // not reparsing even if we wanted to
1803   }
1804 
1805   // Avoid having the chardet ask for another restart after this restart
1806   // request.
1807   DontGuessEncoding();
1808   mTreeBuilder->NeedsCharsetSwitchTo(WrapNotNull(encoding), kCharsetFromMetaTag,
1809                                      mTokenizer->getLineNumber());
1810   FlushTreeOpsAndDisarmTimer();
1811   Interrupt();
1812   // the tree op executor will cause the stream parser to terminate
1813   // if the charset switch request is accepted or it'll uninterrupt
1814   // if the request failed. Note that if the restart request fails,
1815   // we don't bother trying to make chardet resume. Might as well
1816   // assume that chardet-requested restarts would fail, too.
1817   return true;
1818 }
1819 
FlushTreeOpsAndDisarmTimer()1820 void nsHtml5StreamParser::FlushTreeOpsAndDisarmTimer() {
1821   NS_ASSERTION(IsParserThread(), "Wrong thread!");
1822   if (mFlushTimerArmed) {
1823     // avoid calling Cancel if the flush timer isn't armed to avoid acquiring
1824     // a mutex
1825     {
1826       mozilla::MutexAutoLock flushTimerLock(mFlushTimerMutex);
1827       mFlushTimer->Cancel();
1828     }
1829     mFlushTimerArmed = false;
1830   }
1831   if (mMode == VIEW_SOURCE_HTML || mMode == VIEW_SOURCE_XML) {
1832     mTokenizer->FlushViewSource();
1833   }
1834   mTreeBuilder->Flush();
1835   nsCOMPtr<nsIRunnable> runnable(mExecutorFlusher);
1836   if (NS_FAILED(DispatchToMain(runnable.forget()))) {
1837     NS_WARNING("failed to dispatch executor flush event");
1838   }
1839 }
1840 
/**
 * Tokenizes the decoded data held in the buffer chain that runs from
 * mFirstBuffer to mLastBuffer.
 *
 * Must be called on the parser thread with mTokenizerMutex held and while
 * mDecodingLocalFileWithoutTokenizing is false. Returns immediately when the
 * parser is terminated or interrupted, or when it is speculating although
 * speculation is not enabled.
 *
 * The loop ends when:
 *  - the buffers are exhausted while the stream is still being read (loads
 *    are flushed and mLoadFlusher is dispatched),
 *  - the buffers are exhausted and the stream has ended (EOF is processed at
 *    most once, guarded by mAtEOF),
 *  - the tokenizer's buffer space cannot be ensured or the tree builder
 *    reports being broken (the parser is marked as broken), or
 *  - the parser becomes terminated or interrupted.
 *
 * When the tree builder reports a script after tokenizing, a new speculation
 * is started.
 */
void nsHtml5StreamParser::ParseAvailableData() {
  MOZ_ASSERT(IsParserThread(), "Wrong thread!");
  mTokenizerMutex.AssertCurrentThreadOwns();
  MOZ_ASSERT(!mDecodingLocalFileWithoutTokenizing);

  if (IsTerminatedOrInterrupted()) {
    return;
  }

  if (mSpeculating && !IsSpeculationEnabled()) {
    return;
  }

  for (;;) {
    if (!mFirstBuffer->hasMore()) {
      // The current buffer is exhausted.
      if (mFirstBuffer == mLastBuffer) {
        // ...and there is no further buffer to move on to.
        switch (mStreamState) {
          case STREAM_BEING_READ:
            // never release the last buffer.
            if (!mSpeculating) {
              // reuse buffer space if not speculating
              mFirstBuffer->setStart(0);
              mFirstBuffer->setEnd(0);
            }
            mTreeBuilder->FlushLoads();
            {
              // Dispatch this runnable unconditionally, because the loads
              // that need flushing may have been flushed earlier even if the
              // flush right above here did nothing.
              nsCOMPtr<nsIRunnable> runnable(mLoadFlusher);
              if (NS_FAILED(DispatchToMain(runnable.forget()))) {
                NS_WARNING("failed to dispatch load flush event");
              }
            }
            return;  // no more data for now but expecting more
          case STREAM_ENDED:
            // Process EOF only once.
            if (mAtEOF) {
              return;
            }
            mAtEOF = true;
            if (mCharsetSource < kCharsetFromMetaTag) {
              if (mInitialEncodingWasFromParentFrame) {
                // Unfortunately, this check doesn't take effect for
                // cross-origin frames, so cross-origin ad frames that have
                // no text and only an image or a Flash embed get the more
                // severe message from the next if block. The message is
                // technically accurate, though.
                mTreeBuilder->MaybeComplainAboutCharset("EncNoDeclarationFrame",
                                                        false, 0);
              } else if (mMode == NORMAL) {
                mTreeBuilder->MaybeComplainAboutCharset("EncNoDeclaration",
                                                        true, 0);
              } else if (mMode == PLAIN_TEXT) {
                mTreeBuilder->MaybeComplainAboutCharset("EncNoDeclarationPlain",
                                                        true, 0);
              }
            }
            if (NS_SUCCEEDED(mTreeBuilder->IsBroken())) {
              mTokenizer->eof();
              nsresult rv;
              // Re-check: the tree builder may have become broken during
              // eof().
              if (NS_FAILED((rv = mTreeBuilder->IsBroken()))) {
                MarkAsBroken(rv);
              } else {
                mTreeBuilder->StreamEnded();
                if (mMode == VIEW_SOURCE_HTML || mMode == VIEW_SOURCE_XML) {
                  mTokenizer->EndViewSource();
                }
              }
            }
            FlushTreeOpsAndDisarmTimer();
            return;  // no more data and not expecting more
          default:
            MOZ_ASSERT_UNREACHABLE("It should be impossible to reach this.");
            return;
        }
      }
      // Advance to the next buffer in the chain and look again.
      mFirstBuffer = mFirstBuffer->next;
      continue;
    }

    // now we have a non-empty buffer
    mFirstBuffer->adjust(mLastWasCR);
    mLastWasCR = false;
    // adjust() may have left nothing to tokenize, so check again.
    if (mFirstBuffer->hasMore()) {
      if (!mTokenizer->EnsureBufferSpace(mFirstBuffer->getLength())) {
        MarkAsBroken(NS_ERROR_OUT_OF_MEMORY);
        return;
      }
      mLastWasCR = mTokenizer->tokenizeBuffer(mFirstBuffer);
      nsresult rv;
      if (NS_FAILED((rv = mTreeBuilder->IsBroken()))) {
        MarkAsBroken(rv);
        return;
      }
      // At this point, internalEncodingDeclaration() may have called
      // Terminate, but that never happens together with script.
      // Can't assert that here, though, because it's possible that the main
      // thread has called Terminate() while this thread was parsing.
      if (mTreeBuilder->HasScript()) {
        // HasScript() cannot return true if the tree builder is preventing
        // script execution.
        MOZ_ASSERT(mMode == NORMAL);
        mozilla::MutexAutoLock speculationAutoLock(mSpeculationMutex);
        // Start a new speculation at the current position: it records the
        // buffer, the start offset, the line number and a tree builder
        // snapshot.
        nsHtml5Speculation* speculation = new nsHtml5Speculation(
            mFirstBuffer, mFirstBuffer->getStart(), mTokenizer->getLineNumber(),
            mTreeBuilder->newSnapshot());
        mTreeBuilder->AddSnapshotToScript(speculation->GetSnapshot(),
                                          speculation->GetStartLineNumber());
        FlushTreeOpsAndDisarmTimer();
        // From here on, tree ops go into the speculation.
        mTreeBuilder->SetOpSink(speculation);
        mSpeculations.AppendElement(speculation);  // adopts the pointer
        mSpeculating = true;
      }
      if (IsTerminatedOrInterrupted()) {
        return;
      }
    }
  }
}
1960 
1961 class nsHtml5StreamParserContinuation : public Runnable {
1962  private:
1963   nsHtml5StreamParserPtr mStreamParser;
1964 
1965  public:
nsHtml5StreamParserContinuation(nsHtml5StreamParser * aStreamParser)1966   explicit nsHtml5StreamParserContinuation(nsHtml5StreamParser* aStreamParser)
1967       : Runnable("nsHtml5StreamParserContinuation"),
1968         mStreamParser(aStreamParser) {}
Run()1969   NS_IMETHOD Run() override {
1970     mozilla::MutexAutoLock autoLock(mStreamParser->mTokenizerMutex);
1971     mStreamParser->Uninterrupt();
1972     mStreamParser->ParseAvailableData();
1973     return NS_OK;
1974   }
1975 };
1976 
/**
 * Called on the main thread to resolve the oldest speculation and let the
 * parser thread continue.
 *
 * The oldest speculation counts as successful when aLastWasCR is false,
 * aTokenizer is in the data state and aTreeBuilder matches the speculation's
 * snapshot.
 *  - Success with further speculations pending: the speculation is flushed
 *    to the executor and removed; the parser thread is not involved.
 *  - Success with only one speculation: the parser is interrupted, the
 *    tokenizer mutex is taken, the speculation is flushed and removed, and
 *    if none remain, speculating stops.
 *  - Failure: the parser is interrupted, the stream is rewound to where the
 *    speculation started, all speculations and pending ops are discarded and
 *    the state of aTokenizer/aTreeBuilder/aLastWasCR is loaded.
 * In the latter two cases an nsHtml5StreamParserContinuation is dispatched
 * to mEventTarget.
 *
 * Does nothing if the executor is broken.
 *
 * @param aTokenizer   tokenizer whose state is compared and, on failure,
 *                     loaded into mTokenizer.
 * @param aTreeBuilder tree builder whose state is compared and, on failure,
 *                     loaded into mTreeBuilder.
 * @param aLastWasCR   compared and, on failure, stored in mLastWasCR.
 */
void nsHtml5StreamParser::ContinueAfterScripts(nsHtml5Tokenizer* aTokenizer,
                                               nsHtml5TreeBuilder* aTreeBuilder,
                                               bool aLastWasCR) {
  NS_ASSERTION(NS_IsMainThread(), "Wrong thread!");
  NS_ASSERTION(!(mMode == VIEW_SOURCE_HTML || mMode == VIEW_SOURCE_XML),
               "ContinueAfterScripts called in view source mode!");
  if (NS_FAILED(mExecutor->IsBroken())) {
    return;
  }
#ifdef DEBUG
  mExecutor->AssertStageEmpty();
#endif
  bool speculationFailed = false;
  {
    // First phase: decide the outcome while holding only the speculation
    // mutex.
    mozilla::MutexAutoLock speculationAutoLock(mSpeculationMutex);
    if (mSpeculations.IsEmpty()) {
      MOZ_ASSERT_UNREACHABLE(
          "ContinueAfterScripts called without "
          "speculations.");
      return;
    }

    const auto& speculation = mSpeculations.ElementAt(0);
    if (aLastWasCR || !aTokenizer->isInDataState() ||
        !aTreeBuilder->snapshotMatches(speculation->GetSnapshot())) {
      speculationFailed = true;
      // We've got a failed speculation :-(
      MaybeDisableFutureSpeculation();
      Interrupt();  // Make the parser thread release the tokenizer mutex sooner
      // now fall out of the speculationAutoLock into the tokenizerAutoLock
      // block
    } else {
      // We've got a successful speculation!
      if (mSpeculations.Length() > 1) {
        // the first speculation isn't the current speculation, so there's
        // no need to bother the parser thread.
        speculation->FlushToSink(mExecutor);
        NS_ASSERTION(!mExecutor->IsScriptExecuting(),
                     "ParseUntilBlocked() was supposed to ensure we don't come "
                     "here when scripts are executing.");
        NS_ASSERTION(
            mExecutor->IsInFlushLoop(),
            "How are we here if "
            "RunFlushLoop() didn't call ParseUntilBlocked() which is the "
            "only caller of this method?");
        mSpeculations.RemoveElementAt(0);
        return;
      }
      // else
      Interrupt();  // Make the parser thread release the tokenizer mutex sooner

      // now fall through
      // the first speculation is the current speculation. Need to
      // release the speculation mutex and acquire the tokenizer
      // mutex. (Just acquiring the other mutex here would deadlock)
    }
  }
  {
    // Second phase: act on the outcome while holding the tokenizer mutex.
    mozilla::MutexAutoLock tokenizerAutoLock(mTokenizerMutex);
#ifdef DEBUG
    {
      // Atom table lookups happen on the main thread inside this block.
      mAtomTable.SetPermittedLookupEventTarget(
          GetMainThreadSerialEventTarget());
    }
#endif
    // In principle, the speculation mutex should be acquired here,
    // but there's no point, because the parser thread only acquires it
    // when it has also acquired the tokenizer mutex and we are already
    // holding the tokenizer mutex.
    if (speculationFailed) {
      // Rewind the stream
      mAtEOF = false;
      const auto& speculation = mSpeculations.ElementAt(0);
      mFirstBuffer = speculation->GetBuffer();
      mFirstBuffer->setStart(speculation->GetStart());
      mTokenizer->setLineNumber(speculation->GetStartLineNumber());

      nsContentUtils::ReportToConsole(
          nsIScriptError::warningFlag, "DOM Events"_ns,
          mExecutor->GetDocument(), nsContentUtils::eDOM_PROPERTIES,
          "SpeculationFailed", nsTArray<nsString>(), nullptr, u""_ns,
          speculation->GetStartLineNumber());

      // All buffers after the rewound one are to be read again from their
      // beginning.
      nsHtml5OwningUTF16Buffer* buffer = mFirstBuffer->next;
      while (buffer) {
        buffer->setStart(0);
        buffer = buffer->next;
      }

      // Note: |speculation| above refers into mSpeculations and must not be
      // used after this point.
      mSpeculations.Clear();  // potentially a huge number of destructors
                              // run here synchronously on the main thread...

      mTreeBuilder->flushCharacters();  // empty the pending buffer
      mTreeBuilder->ClearOps();         // now get rid of the failed ops

      mTreeBuilder->SetOpSink(mExecutor->GetStage());
      mExecutor->StartReadingFromStage();
      mSpeculating = false;

      // Copy state over
      mLastWasCR = aLastWasCR;
      mTokenizer->loadState(aTokenizer);
      mTreeBuilder->loadState(aTreeBuilder);
    } else {
      // We've got a successful speculation and at least a moment ago it was
      // the current speculation
      mSpeculations.ElementAt(0)->FlushToSink(mExecutor);
      NS_ASSERTION(!mExecutor->IsScriptExecuting(),
                   "ParseUntilBlocked() was supposed to ensure we don't come "
                   "here when scripts are executing.");
      NS_ASSERTION(
          mExecutor->IsInFlushLoop(),
          "How are we here if "
          "RunFlushLoop() didn't call ParseUntilBlocked() which is the "
          "only caller of this method?");
      mSpeculations.RemoveElementAt(0);
      if (mSpeculations.IsEmpty()) {
        // yes, it was still the only speculation. Now stop speculating
        // However, before telling the executor to read from stage, flush
        // any pending ops straight to the executor, because otherwise
        // they remain unflushed until we get more data from the network.
        mTreeBuilder->SetOpSink(mExecutor);
        mTreeBuilder->Flush(true);
        mTreeBuilder->SetOpSink(mExecutor->GetStage());
        mExecutor->StartReadingFromStage();
        mSpeculating = false;
      }
    }
    // Ask the parser thread to uninterrupt and carry on parsing.
    nsCOMPtr<nsIRunnable> event = new nsHtml5StreamParserContinuation(this);
    if (NS_FAILED(mEventTarget->Dispatch(event, nsIThread::DISPATCH_NORMAL))) {
      NS_WARNING("Failed to dispatch nsHtml5StreamParserContinuation");
    }
// A stream event might run before this event runs, but that's harmless.
#ifdef DEBUG
    mAtomTable.SetPermittedLookupEventTarget(mEventTarget);
#endif
  }
}
2115 
ContinueAfterFailedCharsetSwitch()2116 void nsHtml5StreamParser::ContinueAfterFailedCharsetSwitch() {
2117   NS_ASSERTION(NS_IsMainThread(), "Wrong thread!");
2118   nsCOMPtr<nsIRunnable> event = new nsHtml5StreamParserContinuation(this);
2119   if (NS_FAILED(mEventTarget->Dispatch(event, nsIThread::DISPATCH_NORMAL))) {
2120     NS_WARNING("Failed to dispatch nsHtml5StreamParserContinuation");
2121   }
2122 }
2123 
2124 class nsHtml5TimerKungFu : public Runnable {
2125  private:
2126   nsHtml5StreamParserPtr mStreamParser;
2127 
2128  public:
nsHtml5TimerKungFu(nsHtml5StreamParser * aStreamParser)2129   explicit nsHtml5TimerKungFu(nsHtml5StreamParser* aStreamParser)
2130       : Runnable("nsHtml5TimerKungFu"), mStreamParser(aStreamParser) {}
Run()2131   NS_IMETHOD Run() override {
2132     mozilla::MutexAutoLock flushTimerLock(mStreamParser->mFlushTimerMutex);
2133     if (mStreamParser->mFlushTimer) {
2134       mStreamParser->mFlushTimer->Cancel();
2135       mStreamParser->mFlushTimer = nullptr;
2136     }
2137     return NS_OK;
2138   }
2139 };
2140 
DropTimer()2141 void nsHtml5StreamParser::DropTimer() {
2142   NS_ASSERTION(NS_IsMainThread(), "Wrong thread!");
2143   /*
2144    * Simply nulling out the timer wouldn't work, because if the timer is
2145    * armed, it needs to be canceled first. Simply canceling it first wouldn't
2146    * work, because nsTimerImpl::Cancel is not safe for calling from outside
2147    * the thread where nsTimerImpl::Fire would run. It's not safe to
2148    * dispatch a runnable to cancel the timer from the destructor of this
2149    * class, because the timer has a weak (void*) pointer back to this instance
2150    * of the stream parser and having the timer fire before the runnable
2151    * cancels it would make the timer access a deleted object.
2152    *
2153    * This DropTimer method addresses these issues. This method must be called
2154    * on the main thread before the destructor of this class is reached.
2155    * The nsHtml5TimerKungFu object has an nsHtml5StreamParserPtr that addrefs
2156    * this
2157    * stream parser object to keep it alive until the runnable is done.
2158    * The runnable cancels the timer on the parser thread, drops the timer
2159    * and lets nsHtml5StreamParserPtr send a runnable back to the main thread to
2160    * release the stream parser.
2161    */
2162   mozilla::MutexAutoLock flushTimerLock(mFlushTimerMutex);
2163   if (mFlushTimer) {
2164     nsCOMPtr<nsIRunnable> event = new nsHtml5TimerKungFu(this);
2165     if (NS_FAILED(mEventTarget->Dispatch(event, nsIThread::DISPATCH_NORMAL))) {
2166       NS_WARNING("Failed to dispatch TimerKungFu event");
2167     }
2168   }
2169 }
2170 
2171 // Using a static, because the method name Notify is taken by the chardet
2172 // callback.
TimerCallback(nsITimer * aTimer,void * aClosure)2173 void nsHtml5StreamParser::TimerCallback(nsITimer* aTimer, void* aClosure) {
2174   (static_cast<nsHtml5StreamParser*>(aClosure))->TimerFlush();
2175 }
2176 
TimerFlush()2177 void nsHtml5StreamParser::TimerFlush() {
2178   NS_ASSERTION(IsParserThread(), "Wrong thread!");
2179   mozilla::MutexAutoLock autoLock(mTokenizerMutex);
2180 
2181   NS_ASSERTION(!mSpeculating, "Flush timer fired while speculating.");
2182 
2183   // The timer fired if we got here. No need to cancel it. Mark it as
2184   // not armed, though.
2185   mFlushTimerArmed = false;
2186 
2187   mFlushTimerEverFired = true;
2188 
2189   if (IsTerminatedOrInterrupted()) {
2190     return;
2191   }
2192 
2193   if (mMode == VIEW_SOURCE_HTML || mMode == VIEW_SOURCE_XML) {
2194     mTreeBuilder->Flush();  // delete useless ops
2195     if (mTokenizer->FlushViewSource()) {
2196       nsCOMPtr<nsIRunnable> runnable(mExecutorFlusher);
2197       if (NS_FAILED(DispatchToMain(runnable.forget()))) {
2198         NS_WARNING("failed to dispatch executor flush event");
2199       }
2200     }
2201   } else {
2202     // we aren't speculating and we don't know when new data is
2203     // going to arrive. Send data to the main thread.
2204     if (mTreeBuilder->Flush(true)) {
2205       nsCOMPtr<nsIRunnable> runnable(mExecutorFlusher);
2206       if (NS_FAILED(DispatchToMain(runnable.forget()))) {
2207         NS_WARNING("failed to dispatch executor flush event");
2208       }
2209     }
2210   }
2211 }
2212 
MarkAsBroken(nsresult aRv)2213 void nsHtml5StreamParser::MarkAsBroken(nsresult aRv) {
2214   NS_ASSERTION(IsParserThread(), "Wrong thread!");
2215   mTokenizerMutex.AssertCurrentThreadOwns();
2216 
2217   Terminate();
2218   mTreeBuilder->MarkAsBroken(aRv);
2219   mozilla::DebugOnly<bool> hadOps = mTreeBuilder->Flush(false);
2220   NS_ASSERTION(hadOps, "Should have had the markAsBroken op!");
2221   nsCOMPtr<nsIRunnable> runnable(mExecutorFlusher);
2222   if (NS_FAILED(DispatchToMain(runnable.forget()))) {
2223     NS_WARNING("failed to dispatch executor flush event");
2224   }
2225 }
2226 
DispatchToMain(already_AddRefed<nsIRunnable> && aRunnable)2227 nsresult nsHtml5StreamParser::DispatchToMain(
2228     already_AddRefed<nsIRunnable>&& aRunnable) {
2229   if (mNetworkEventTarget) {
2230     return mNetworkEventTarget->Dispatch(std::move(aRunnable));
2231   }
2232   return SchedulerGroup::UnlabeledDispatch(TaskCategory::Network,
2233                                            std::move(aRunnable));
2234 }
2235