1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* vim: set sw=2 ts=2 et tw=80: */
3 /* This Source Code Form is subject to the terms of the Mozilla Public
4 * License, v. 2.0. If a copy of the MPL was not distributed with this
5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6
7 #include "nsHtml5StreamParser.h"
8
9 #include <stdlib.h>
10 #include <string.h>
11 #include <algorithm>
12 #include <new>
13 #include <type_traits>
14 #include <utility>
15 #include "GeckoProfiler.h"
16 #include "js/GCAPI.h"
17 #include "mozilla/ArrayIterator.h"
18 #include "mozilla/Buffer.h"
19 #include "mozilla/CheckedInt.h"
20 #include "mozilla/DebugOnly.h"
21 #include "mozilla/Encoding.h"
22 #include "mozilla/EncodingDetector.h"
23 #include "mozilla/Likely.h"
24 #include "mozilla/Maybe.h"
25 #include "mozilla/SchedulerGroup.h"
26 #include "mozilla/ScopeExit.h"
27 #include "mozilla/Services.h"
28 #include "mozilla/StaticPrefs_html5.h"
29 #include "mozilla/StaticPrefs_intl.h"
30 #include "mozilla/TaskCategory.h"
31 #include "mozilla/Tuple.h"
32 #include "mozilla/UniquePtrExtensions.h"
33 #include "mozilla/Unused.h"
34 #include "mozilla/dom/BindingDeclarations.h"
35 #include "mozilla/dom/BrowsingContext.h"
36 #include "mozilla/dom/DebuggerUtilsBinding.h"
37 #include "mozilla/dom/DocGroup.h"
38 #include "mozilla/dom/Document.h"
39 #include "mozilla/mozalloc.h"
40 #include "nsContentSink.h"
41 #include "nsContentUtils.h"
42 #include "nsCycleCollectionTraversalCallback.h"
43 #include "nsHtml5AtomTable.h"
44 #include "nsHtml5ByteReadable.h"
45 #include "nsHtml5Highlighter.h"
46 #include "nsHtml5MetaScanner.h"
47 #include "nsHtml5Module.h"
48 #include "nsHtml5OwningUTF16Buffer.h"
49 #include "nsHtml5Parser.h"
50 #include "nsHtml5Speculation.h"
51 #include "nsHtml5StreamParserPtr.h"
52 #include "nsHtml5Tokenizer.h"
53 #include "nsHtml5TreeBuilder.h"
54 #include "nsHtml5TreeOpExecutor.h"
55 #include "nsHtml5TreeOpStage.h"
56 #include "nsIChannel.h"
57 #include "nsIContentSink.h"
58 #include "nsID.h"
59 #include "nsIDTD.h"
60 #include "nsIDocShell.h"
61 #include "nsIEventTarget.h"
62 #include "nsIHttpChannel.h"
63 #include "nsIInputStream.h"
64 #include "nsINestedURI.h"
65 #include "nsIObserverService.h"
66 #include "nsIRequest.h"
67 #include "nsIRunnable.h"
68 #include "nsIScriptError.h"
69 #include "nsIThread.h"
70 #include "nsIThreadRetargetableRequest.h"
71 #include "nsIThreadRetargetableStreamListener.h"
72 #include "nsITimer.h"
73 #include "nsIURI.h"
74 #include "nsJSEnvironment.h"
75 #include "nsLiteralString.h"
76 #include "nsNetUtil.h"
77 #include "nsString.h"
78 #include "nsTPromiseFlatString.h"
79 #include "nsThreadUtils.h"
80 #include "nsXULAppAPI.h"
81
// Include expat after the others, since it defines XML_NS, which conflicts
// with our symbol names.
84 #include "expat_config.h"
85 #include "expat.h"
86
87 extern "C" {
88 // Defined in intl/encoding_glue/src/lib.rs
89 const mozilla::Encoding* xmldecl_parse(const uint8_t* buf, size_t buf_len);
90 };
91
92 using namespace mozilla;
93 using namespace mozilla::dom;
94
95 /*
96 * Note that nsHtml5StreamParser implements cycle collecting AddRef and
97 * Release. Therefore, nsHtml5StreamParser must never be refcounted from
98 * the parser thread!
99 *
 * To work around this limitation, runnables posted by the main thread to the
 * parser thread hold their reference to the stream parser in an
 * nsHtml5StreamParserPtr. Upon creation, nsHtml5StreamParserPtr addrefs the
 * object it holds just like a regular RefPtr. This is OK, since the creation
 * of the runnable and the nsHtml5StreamParserPtr happens on the main thread.
106 *
107 * When the runnable is done on the parser thread, the destructor of
108 * nsHtml5StreamParserPtr runs there. It doesn't call Release on the held object
109 * directly. Instead, it posts another runnable back to the main thread where
110 * that runnable calls Release on the wrapped object.
111 *
 * When posting runnables in the other direction, the runnables have to be
 * created on the main thread when nsHtml5StreamParser is instantiated and
 * held for the lifetime of the nsHtml5StreamParser. This works, because the
 * same runnable can be dispatched multiple times and currently runnables
 * posted from the parser thread to the main thread don't need to wrap any
 * runnable-specific data. (In the other direction, the runnables most notably
 * wrap the byte data of the stream.)
119 */
120 NS_IMPL_CYCLE_COLLECTING_ADDREF(nsHtml5StreamParser)
121 NS_IMPL_CYCLE_COLLECTING_RELEASE(nsHtml5StreamParser)
122
123 NS_INTERFACE_TABLE_HEAD(nsHtml5StreamParser)
124 NS_INTERFACE_TABLE(nsHtml5StreamParser, nsISupports)
125 NS_INTERFACE_TABLE_TO_MAP_SEGUE_CYCLE_COLLECTION(nsHtml5StreamParser)
126 NS_INTERFACE_MAP_END
127
128 NS_IMPL_CYCLE_COLLECTION_CLASS(nsHtml5StreamParser)
129
130 NS_IMPL_CYCLE_COLLECTION_UNLINK_BEGIN(nsHtml5StreamParser)
131 tmp->DropTimer();
132 NS_IMPL_CYCLE_COLLECTION_UNLINK(mObserver)
133 NS_IMPL_CYCLE_COLLECTION_UNLINK(mRequest)
134 NS_IMPL_CYCLE_COLLECTION_UNLINK(mOwner)
135 tmp->mExecutorFlusher = nullptr;
136 tmp->mLoadFlusher = nullptr;
137 tmp->mExecutor = nullptr;
138 NS_IMPL_CYCLE_COLLECTION_UNLINK_END
139
140 NS_IMPL_CYCLE_COLLECTION_TRAVERSE_BEGIN(nsHtml5StreamParser)
141 NS_IMPL_CYCLE_COLLECTION_TRAVERSE(mObserver)
142 NS_IMPL_CYCLE_COLLECTION_TRAVERSE(mRequest)
143 NS_IMPL_CYCLE_COLLECTION_TRAVERSE(mOwner)
144 // hack: count the strongly owned edge wrapped in the runnable
145 if (tmp->mExecutorFlusher) {
146 NS_CYCLE_COLLECTION_NOTE_EDGE_NAME(cb, "mExecutorFlusher->mExecutor");
147 cb.NoteXPCOMChild(static_cast<nsIContentSink*>(tmp->mExecutor));
148 }
149 // hack: count the strongly owned edge wrapped in the runnable
150 if (tmp->mLoadFlusher) {
151 NS_CYCLE_COLLECTION_NOTE_EDGE_NAME(cb, "mLoadFlusher->mExecutor");
152 cb.NoteXPCOMChild(static_cast<nsIContentSink*>(tmp->mExecutor));
153 }
154 NS_IMPL_CYCLE_COLLECTION_TRAVERSE_END
155
156 class nsHtml5ExecutorFlusher : public Runnable {
157 private:
158 RefPtr<nsHtml5TreeOpExecutor> mExecutor;
159
160 public:
nsHtml5ExecutorFlusher(nsHtml5TreeOpExecutor * aExecutor)161 explicit nsHtml5ExecutorFlusher(nsHtml5TreeOpExecutor* aExecutor)
162 : Runnable("nsHtml5ExecutorFlusher"), mExecutor(aExecutor) {}
Run()163 NS_IMETHOD Run() override {
164 if (!mExecutor->isInList()) {
165 Document* doc = mExecutor->GetDocument();
166 if (XRE_IsContentProcess() &&
167 nsContentUtils::
168 HighPriorityEventPendingForTopLevelDocumentBeforeContentfulPaint(
169 doc)) {
170 // Possible early paint pending, reuse the runnable and try to
171 // call RunFlushLoop later.
172 nsCOMPtr<nsIRunnable> flusher = this;
173 if (NS_SUCCEEDED(
174 doc->Dispatch(TaskCategory::Network, flusher.forget()))) {
175 PROFILER_MARKER_UNTYPED("HighPrio blocking parser flushing(1)", DOM);
176 return NS_OK;
177 }
178 }
179 mExecutor->RunFlushLoop();
180 }
181 return NS_OK;
182 }
183 };
184
185 class nsHtml5LoadFlusher : public Runnable {
186 private:
187 RefPtr<nsHtml5TreeOpExecutor> mExecutor;
188
189 public:
nsHtml5LoadFlusher(nsHtml5TreeOpExecutor * aExecutor)190 explicit nsHtml5LoadFlusher(nsHtml5TreeOpExecutor* aExecutor)
191 : Runnable("nsHtml5LoadFlusher"), mExecutor(aExecutor) {}
Run()192 NS_IMETHOD Run() override {
193 mExecutor->FlushSpeculativeLoads();
194 return NS_OK;
195 }
196 };
197
nsHtml5StreamParser(nsHtml5TreeOpExecutor * aExecutor,nsHtml5Parser * aOwner,eParserMode aMode)198 nsHtml5StreamParser::nsHtml5StreamParser(nsHtml5TreeOpExecutor* aExecutor,
199 nsHtml5Parser* aOwner,
200 eParserMode aMode)
201 : mSniffingLength(0),
202 mBomState(eBomState::BOM_SNIFFING_NOT_STARTED),
203 mCharsetSource(kCharsetUninitialized),
204 mEncoding(WINDOWS_1252_ENCODING),
205 mFeedChardet(true),
206 mGuessEncoding(true),
207 mReparseForbidden(false),
208 mChannelHadCharset(false),
209 mLastBuffer(nullptr), // Will be filled when starting
210 mExecutor(aExecutor),
211 mTreeBuilder(new nsHtml5TreeBuilder(
212 (aMode == VIEW_SOURCE_HTML || aMode == VIEW_SOURCE_XML)
213 ? nullptr
214 : mExecutor->GetStage(),
215 aMode == NORMAL ? mExecutor->GetStage() : nullptr)),
216 mTokenizer(
217 new nsHtml5Tokenizer(mTreeBuilder.get(), aMode == VIEW_SOURCE_XML)),
218 mTokenizerMutex("nsHtml5StreamParser mTokenizerMutex"),
219 mOwner(aOwner),
220 mLastWasCR(false),
221 mStreamState(eHtml5StreamState::STREAM_NOT_STARTED),
222 mSpeculating(false),
223 mAtEOF(false),
224 mSpeculationMutex("nsHtml5StreamParser mSpeculationMutex"),
225 mSpeculationFailureCount(0),
226 mLocalFileBytesBuffered(0),
227 mTerminated(false),
228 mInterrupted(false),
229 mTerminatedMutex("nsHtml5StreamParser mTerminatedMutex"),
230 mEventTarget(nsHtml5Module::GetStreamParserThread()->SerialEventTarget()),
231 mExecutorFlusher(new nsHtml5ExecutorFlusher(aExecutor)),
232 mLoadFlusher(new nsHtml5LoadFlusher(aExecutor)),
233 mInitialEncodingWasFromParentFrame(false),
234 mHasHadErrors(false),
235 mDetectorHasSeenNonAscii(false),
236 mDetectorHadOnlySeenAsciiWhenFirstGuessing(false),
237 mDecodingLocalFileWithoutTokenizing(false),
238 mFlushTimer(NS_NewTimer(mEventTarget)),
239 mFlushTimerMutex("nsHtml5StreamParser mFlushTimerMutex"),
240 mFlushTimerArmed(false),
241 mFlushTimerEverFired(false),
242 mMode(aMode) {
243 NS_ASSERTION(NS_IsMainThread(), "Wrong thread!");
244 #ifdef DEBUG
245 mAtomTable.SetPermittedLookupEventTarget(mEventTarget);
246 #endif
247 mTokenizer->setInterner(&mAtomTable);
248 mTokenizer->setEncodingDeclarationHandler(this);
249
250 if (aMode == VIEW_SOURCE_HTML || aMode == VIEW_SOURCE_XML) {
251 nsHtml5Highlighter* highlighter =
252 new nsHtml5Highlighter(mExecutor->GetStage());
253 mTokenizer->EnableViewSource(highlighter); // takes ownership
254 mTreeBuilder->EnableViewSource(highlighter); // doesn't own
255 }
256
257 // There's a zeroing operator new for everything else
258 }
259
~nsHtml5StreamParser()260 nsHtml5StreamParser::~nsHtml5StreamParser() {
261 NS_ASSERTION(NS_IsMainThread(), "Wrong thread!");
262 mTokenizer->end();
263 #ifdef DEBUG
264 {
265 mozilla::MutexAutoLock flushTimerLock(mFlushTimerMutex);
266 MOZ_ASSERT(!mFlushTimer, "Flush timer was not dropped before dtor!");
267 }
268 mRequest = nullptr;
269 mObserver = nullptr;
270 mUnicodeDecoder = nullptr;
271 mSniffingBuffer = nullptr;
272 mMetaScanner = nullptr;
273 mFirstBuffer = nullptr;
274 mExecutor = nullptr;
275 mTreeBuilder = nullptr;
276 mTokenizer = nullptr;
277 mOwner = nullptr;
278 #endif
279 }
280
GetChannel(nsIChannel ** aChannel)281 nsresult nsHtml5StreamParser::GetChannel(nsIChannel** aChannel) {
282 NS_ASSERTION(NS_IsMainThread(), "Wrong thread!");
283 return mRequest ? CallQueryInterface(mRequest, aChannel)
284 : NS_ERROR_NOT_AVAILABLE;
285 }
286
MaybeRollBackSource(int32_t aSource)287 int32_t nsHtml5StreamParser::MaybeRollBackSource(int32_t aSource) {
288 if (aSource ==
289 kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8DependedOnTLD) {
290 return kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8DependedOnTLD;
291 }
292 if (aSource == kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8Generic) {
293 return kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8Generic;
294 }
295 if (aSource == kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8Content) {
296 return kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8Content;
297 }
298 if (aSource == kCharsetFromFinalAutoDetectionWouldHaveBeenUTF8 &&
299 !mDetectorHadOnlySeenAsciiWhenFirstGuessing) {
300 return kCharsetFromInitialAutoDetectionWouldHaveBeenUTF8;
301 }
302 if (aSource == kCharsetFromFinalUserForcedAutoDetection) {
303 aSource = kCharsetFromInitialUserForcedAutoDetection;
304 }
305 return aSource;
306 }
307
GuessEncoding(bool aEof,bool aInitial)308 void nsHtml5StreamParser::GuessEncoding(bool aEof, bool aInitial) {
309 if (aInitial) {
310 if (!mDetectorHasSeenNonAscii) {
311 mDetectorHadOnlySeenAsciiWhenFirstGuessing = true;
312 }
313 } else {
314 mGuessEncoding = false;
315 }
316 bool forced = (mCharsetSource == kCharsetFromPendingUserForcedAutoDetection ||
317 mCharsetSource == kCharsetFromInitialUserForcedAutoDetection);
318 MOZ_ASSERT(
319 mCharsetSource != kCharsetFromFinalUserForcedAutoDetection &&
320 mCharsetSource != kCharsetFromFinalAutoDetectionWouldHaveBeenUTF8 &&
321 mCharsetSource !=
322 kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8Generic &&
323 mCharsetSource !=
324 kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8Content &&
325 mCharsetSource !=
326 kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8DependedOnTLD &&
327 mCharsetSource != kCharsetFromFinalAutoDetectionFile);
328 auto ifHadBeenForced = mDetector->Guess(EmptyCString(), true);
329 auto encoding =
330 forced ? ifHadBeenForced
331 : mDetector->Guess(mTLD, mDecodingLocalFileWithoutTokenizing);
332 int32_t source =
333 aInitial
334 ? (forced
335 ? kCharsetFromInitialUserForcedAutoDetection
336 : kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8Generic)
337 : (forced
338 ? kCharsetFromFinalUserForcedAutoDetection
339 : (mDecodingLocalFileWithoutTokenizing
340 ? kCharsetFromFinalAutoDetectionFile
341 : kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8Generic));
342 if (source == kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8Generic) {
343 if (encoding == ISO_2022_JP_ENCODING) {
344 if (EncodingDetector::TldMayAffectGuess(mTLD)) {
345 source = kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8Content;
346 }
347 } else if (!mDetectorHasSeenNonAscii) {
348 source = kCharsetFromInitialAutoDetectionASCII; // deliberately Initial
349 } else if (ifHadBeenForced == UTF_8_ENCODING) {
350 source = kCharsetFromFinalAutoDetectionWouldHaveBeenUTF8;
351 } else if (encoding != ifHadBeenForced) {
352 source = kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8DependedOnTLD;
353 } else if (EncodingDetector::TldMayAffectGuess(mTLD)) {
354 source = kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8Content;
355 }
356 } else if (source ==
357 kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8Generic) {
358 if (encoding == ISO_2022_JP_ENCODING) {
359 if (EncodingDetector::TldMayAffectGuess(mTLD)) {
360 source = kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8Content;
361 }
362 } else if (!mDetectorHasSeenNonAscii) {
363 source = kCharsetFromInitialAutoDetectionASCII;
364 } else if (ifHadBeenForced == UTF_8_ENCODING) {
365 source = kCharsetFromInitialAutoDetectionWouldHaveBeenUTF8;
366 } else if (encoding != ifHadBeenForced) {
367 source =
368 kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8DependedOnTLD;
369 } else if (EncodingDetector::TldMayAffectGuess(mTLD)) {
370 source = kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8Content;
371 }
372 }
373 if (HasDecoder() && !mDecodingLocalFileWithoutTokenizing) {
374 if (mEncoding == encoding) {
375 MOZ_ASSERT(mCharsetSource == kCharsetFromInitialAutoDetectionASCII ||
376 mCharsetSource < source,
377 "Why are we running chardet at all?");
378 // Source didn't actually change between initial and final, so roll it
379 // back for telemetry purposes.
380 mCharsetSource = MaybeRollBackSource(source);
381 mTreeBuilder->SetDocumentCharset(mEncoding, mCharsetSource);
382 } else {
383 MOZ_ASSERT(mCharsetSource < kCharsetFromXmlDeclarationUtf16 || forced);
384 // We've already committed to a decoder. Request a reload from the
385 // docshell.
386 mTreeBuilder->NeedsCharsetSwitchTo(encoding, source, 0);
387 FlushTreeOpsAndDisarmTimer();
388 Interrupt();
389 }
390 } else {
391 // Got a confident answer from the sniffing buffer. That code will
392 // take care of setting up the decoder.
393 if (mCharsetSource == kCharsetUninitialized && aEof) {
394 // The document is so short that the initial buffer is the last
395 // buffer.
396 source = MaybeRollBackSource(source);
397 }
398 mEncoding = encoding;
399 mCharsetSource = source;
400 mTreeBuilder->SetDocumentCharset(mEncoding, mCharsetSource);
401 }
402 }
403
FeedDetector(Span<const uint8_t> aBuffer,bool aLast)404 void nsHtml5StreamParser::FeedDetector(Span<const uint8_t> aBuffer,
405 bool aLast) {
406 mDetectorHasSeenNonAscii = mDetector->Feed(aBuffer, aLast);
407 }
408
SetViewSourceTitle(nsIURI * aURL)409 void nsHtml5StreamParser::SetViewSourceTitle(nsIURI* aURL) {
410 MOZ_ASSERT(NS_IsMainThread());
411
412 BrowsingContext* browsingContext =
413 mExecutor->GetDocument()->GetBrowsingContext();
414 if (browsingContext && browsingContext->WatchedByDevTools()) {
415 mURIToSendToDevtools = aURL;
416
417 nsID uuid;
418 nsresult rv = nsContentUtils::GenerateUUIDInPlace(uuid);
419 if (!NS_FAILED(rv)) {
420 char buffer[NSID_LENGTH];
421 uuid.ToProvidedString(buffer);
422 mUUIDForDevtools = NS_ConvertASCIItoUTF16(buffer);
423 }
424 }
425
426 if (aURL) {
427 nsCOMPtr<nsIURI> temp;
428 if (aURL->SchemeIs("view-source")) {
429 nsCOMPtr<nsINestedURI> nested = do_QueryInterface(aURL);
430 nested->GetInnerURI(getter_AddRefs(temp));
431 } else {
432 temp = aURL;
433 }
434 if (temp->SchemeIs("data")) {
435 // Avoid showing potentially huge data: URLs. The three last bytes are
436 // UTF-8 for an ellipsis.
437 mViewSourceTitle.AssignLiteral("data:\xE2\x80\xA6");
438 } else {
439 nsresult rv = temp->GetSpec(mViewSourceTitle);
440 if (NS_FAILED(rv)) {
441 mViewSourceTitle.AssignLiteral("\xE2\x80\xA6");
442 }
443 }
444 }
445 }
446
447 nsresult
SetupDecodingAndWriteSniffingBufferAndCurrentSegment(Span<const uint8_t> aFromSegment)448 nsHtml5StreamParser::SetupDecodingAndWriteSniffingBufferAndCurrentSegment(
449 Span<const uint8_t> aFromSegment) {
450 NS_ASSERTION(IsParserThread(), "Wrong thread!");
451 nsresult rv = NS_OK;
452 if (mDecodingLocalFileWithoutTokenizing &&
453 mCharsetSource <= kCharsetFromFallback) {
454 MOZ_ASSERT(mEncoding != UTF_8_ENCODING);
455 mUnicodeDecoder = UTF_8_ENCODING->NewDecoderWithBOMRemoval();
456 } else {
457 if (mCharsetSource >= kCharsetFromFinalAutoDetectionWouldHaveBeenUTF8) {
458 if (!(mCharsetSource == kCharsetFromPendingUserForcedAutoDetection ||
459 mCharsetSource == kCharsetFromInitialUserForcedAutoDetection)) {
460 DontGuessEncoding();
461 }
462 mDecodingLocalFileWithoutTokenizing = false;
463 }
464 mUnicodeDecoder = mEncoding->NewDecoderWithBOMRemoval();
465 }
466 if (mSniffingBuffer) {
467 rv = WriteStreamBytes(Span(mSniffingBuffer.get(), mSniffingLength));
468 NS_ENSURE_SUCCESS(rv, rv);
469 mSniffingBuffer = nullptr;
470 }
471 mMetaScanner = nullptr;
472 return WriteStreamBytes(aFromSegment);
473 }
474
SetupDecodingFromBom(NotNull<const Encoding * > aEncoding)475 void nsHtml5StreamParser::SetupDecodingFromBom(
476 NotNull<const Encoding*> aEncoding) {
477 MOZ_ASSERT(IsParserThread(), "Wrong thread!");
478 mEncoding = aEncoding;
479 mDecodingLocalFileWithoutTokenizing = false;
480 mUnicodeDecoder = mEncoding->NewDecoderWithoutBOMHandling();
481 mCharsetSource = kCharsetFromByteOrderMark;
482 DontGuessEncoding();
483 mTreeBuilder->SetDocumentCharset(mEncoding, mCharsetSource);
484 mSniffingBuffer = nullptr;
485 mMetaScanner = nullptr;
486 mBomState = BOM_SNIFFING_OVER;
487 }
488
SetupDecodingFromUtf16BogoXml(NotNull<const Encoding * > aEncoding)489 void nsHtml5StreamParser::SetupDecodingFromUtf16BogoXml(
490 NotNull<const Encoding*> aEncoding) {
491 MOZ_ASSERT(IsParserThread(), "Wrong thread!");
492 mEncoding = aEncoding;
493 mDecodingLocalFileWithoutTokenizing = false;
494 mUnicodeDecoder = mEncoding->NewDecoderWithoutBOMHandling();
495 mCharsetSource = kCharsetFromXmlDeclarationUtf16;
496 DontGuessEncoding();
497 mTreeBuilder->SetDocumentCharset(mEncoding, mCharsetSource);
498 mSniffingBuffer = nullptr;
499 mMetaScanner = nullptr;
500 mBomState = BOM_SNIFFING_OVER;
501 auto dst = mLastBuffer->TailAsSpan(READ_BUFFER_SIZE);
502 dst[0] = '<';
503 dst[1] = '?';
504 dst[2] = 'x';
505 mLastBuffer->AdvanceEnd(3);
506 }
507
SniffBOMlessUTF16BasicLatin(const uint8_t * aBuf,size_t aBufLen)508 void nsHtml5StreamParser::SniffBOMlessUTF16BasicLatin(const uint8_t* aBuf,
509 size_t aBufLen) {
510 // Avoid underspecified heuristic craziness for XHR
511 if (mMode == LOAD_AS_DATA) {
512 return;
513 }
514 // Make sure there's enough data. Require room for "<title></title>"
515 if (aBufLen < 30) {
516 return;
517 }
518 // even-numbered bytes tracked at 0, odd-numbered bytes tracked at 1
519 bool byteZero[2] = {false, false};
520 bool byteNonZero[2] = {false, false};
521 uint32_t i = 0;
522 for (; i < aBufLen; ++i) {
523 if (aBuf[i]) {
524 if (byteNonZero[1 - (i % 2)]) {
525 return;
526 }
527 byteNonZero[i % 2] = true;
528 } else {
529 if (byteZero[1 - (i % 2)]) {
530 return;
531 }
532 byteZero[i % 2] = true;
533 }
534 }
535 if (byteNonZero[0]) {
536 mEncoding = UTF_16LE_ENCODING;
537 } else {
538 mEncoding = UTF_16BE_ENCODING;
539 }
540 mCharsetSource = kCharsetFromIrreversibleAutoDetection;
541 mTreeBuilder->SetDocumentCharset(mEncoding, mCharsetSource);
542 DontGuessEncoding();
543 mTreeBuilder->MaybeComplainAboutCharset("EncBomlessUtf16", true, 0);
544 }
545
SetEncodingFromExpat(const char16_t * aEncoding)546 void nsHtml5StreamParser::SetEncodingFromExpat(const char16_t* aEncoding) {
547 if (aEncoding) {
548 nsDependentString utf16(aEncoding);
549 nsAutoCString utf8;
550 CopyUTF16toUTF8(utf16, utf8);
551 auto encoding = PreferredForInternalEncodingDecl(utf8);
552 if (encoding) {
553 mEncoding = WrapNotNull(encoding);
554 mCharsetSource = kCharsetFromMetaTag; // closest for XML
555 mTreeBuilder->SetDocumentCharset(mEncoding, mCharsetSource);
556 return;
557 }
558 // else the page declared an encoding Gecko doesn't support and we'd
559 // end up defaulting to UTF-8 anyway. Might as well fall through here
560 // right away and let the encoding be set to UTF-8 which we'd default to
561 // anyway.
562 }
563 mEncoding = UTF_8_ENCODING; // XML defaults to UTF-8 without a BOM
564 mCharsetSource = kCharsetFromMetaTag; // means confident
565 mTreeBuilder->SetDocumentCharset(mEncoding, mCharsetSource);
566 }
567
568 // A separate user data struct is used instead of passing the
569 // nsHtml5StreamParser instance as user data in order to avoid including
570 // expat.h in nsHtml5StreamParser.h. Doing that would cause naming conflicts.
571 // Using a separate user data struct also avoids bloating nsHtml5StreamParser
572 // by one pointer.
573 struct UserData {
574 XML_Parser mExpat;
575 nsHtml5StreamParser* mStreamParser;
576 };
577
578 // Using no-namespace handler callbacks to avoid including expat.h in
579 // nsHtml5StreamParser.h, since doing so would cause naming conclicts.
HandleXMLDeclaration(void * aUserData,const XML_Char * aVersion,const XML_Char * aEncoding,int aStandalone)580 static void HandleXMLDeclaration(void* aUserData, const XML_Char* aVersion,
581 const XML_Char* aEncoding, int aStandalone) {
582 UserData* ud = static_cast<UserData*>(aUserData);
583 ud->mStreamParser->SetEncodingFromExpat(
584 reinterpret_cast<const char16_t*>(aEncoding));
585 XML_StopParser(ud->mExpat, false);
586 }
587
HandleStartElement(void * aUserData,const XML_Char * aName,const XML_Char ** aAtts)588 static void HandleStartElement(void* aUserData, const XML_Char* aName,
589 const XML_Char** aAtts) {
590 UserData* ud = static_cast<UserData*>(aUserData);
591 XML_StopParser(ud->mExpat, false);
592 }
593
HandleEndElement(void * aUserData,const XML_Char * aName)594 static void HandleEndElement(void* aUserData, const XML_Char* aName) {
595 UserData* ud = static_cast<UserData*>(aUserData);
596 XML_StopParser(ud->mExpat, false);
597 }
598
HandleComment(void * aUserData,const XML_Char * aName)599 static void HandleComment(void* aUserData, const XML_Char* aName) {
600 UserData* ud = static_cast<UserData*>(aUserData);
601 XML_StopParser(ud->mExpat, false);
602 }
603
HandleProcessingInstruction(void * aUserData,const XML_Char * aTarget,const XML_Char * aData)604 static void HandleProcessingInstruction(void* aUserData,
605 const XML_Char* aTarget,
606 const XML_Char* aData) {
607 UserData* ud = static_cast<UserData*>(aUserData);
608 XML_StopParser(ud->mExpat, false);
609 }
610
FinalizeSniffingWithDetector(Span<const uint8_t> aFromSegment,uint32_t aCountToSniffingLimit,bool aEof)611 void nsHtml5StreamParser::FinalizeSniffingWithDetector(
612 Span<const uint8_t> aFromSegment, uint32_t aCountToSniffingLimit,
613 bool aEof) {
614 if (mFeedChardet && mSniffingBuffer) {
615 FeedDetector(Span(mSniffingBuffer.get(), mSniffingLength), false);
616 }
617 if (mFeedChardet && !aFromSegment.IsEmpty()) {
618 // Avoid buffer boundary-dependent behavior.
619 FeedDetector(aFromSegment.To(aCountToSniffingLimit), false);
620 }
621 bool guess = mFeedChardet;
622 if (mFeedChardet && aEof && aCountToSniffingLimit <= aFromSegment.Length()) {
623 FeedDetector(Span<const uint8_t>(), true);
624 mFeedChardet = false;
625 }
626 if (guess) {
627 GuessEncoding(aEof, (guess == mFeedChardet));
628 }
629 if (mReparseForbidden) {
630 DontGuessEncoding();
631 }
632 if (mFeedChardet && !aEof && aCountToSniffingLimit < aFromSegment.Length()) {
633 // Avoid buffer boundary-dependent behavior.
634 FeedDetector(aFromSegment.From(aCountToSniffingLimit), false);
635 }
636 }
637
FinalizeSniffing(Span<const uint8_t> aFromSegment,uint32_t aCountToSniffingLimit,bool aEof)638 nsresult nsHtml5StreamParser::FinalizeSniffing(Span<const uint8_t> aFromSegment,
639 uint32_t aCountToSniffingLimit,
640 bool aEof) {
641 MOZ_ASSERT(IsParserThread(), "Wrong thread!");
642 MOZ_ASSERT(mCharsetSource < kCharsetFromXmlDeclarationUtf16,
643 "Should not finalize sniffing with strong decision already made.");
644 if (mMode == VIEW_SOURCE_XML) {
645 static const XML_Memory_Handling_Suite memsuite = {
646 (void* (*)(size_t))moz_xmalloc, (void* (*)(void*, size_t))moz_xrealloc,
647 free};
648
649 static const char16_t kExpatSeparator[] = {0xFFFF, '\0'};
650
651 static const char16_t kISO88591[] = {'I', 'S', 'O', '-', '8', '8',
652 '5', '9', '-', '1', '\0'};
653
654 UserData ud;
655 ud.mStreamParser = this;
656
657 // If we got this far, the stream didn't have a BOM. UTF-16-encoded XML
658 // documents MUST begin with a BOM. We don't support EBCDIC and such.
659 // Thus, at this point, what we have is garbage or something encoded using
660 // a rough ASCII superset. ISO-8859-1 allows us to decode ASCII bytes
661 // without throwing errors when bytes have the most significant bit set
662 // and without triggering expat's unknown encoding code paths. This is
663 // enough to be able to use expat to parse the XML declaration in order
664 // to extract the encoding name from it.
665 ud.mExpat = XML_ParserCreate_MM(kISO88591, &memsuite, kExpatSeparator);
666 XML_SetXmlDeclHandler(ud.mExpat, HandleXMLDeclaration);
667 XML_SetElementHandler(ud.mExpat, HandleStartElement, HandleEndElement);
668 XML_SetCommentHandler(ud.mExpat, HandleComment);
669 XML_SetProcessingInstructionHandler(ud.mExpat, HandleProcessingInstruction);
670 XML_SetUserData(ud.mExpat, static_cast<void*>(&ud));
671
672 XML_Status status = XML_STATUS_OK;
673
674 // aFromSegment points to the data obtained from the current network
675 // event. mSniffingBuffer (if it exists) contains the data obtained before
676 // the current event. Thus, mSniffingLenth bytes of mSniffingBuffer
677 // followed by aCountToSniffingLimit bytes from aFromSegment are the
678 // first 1024 bytes of the file (or the file as a whole if the file is
679 // 1024 bytes long or shorter). Thus, we parse both buffers, but if the
680 // first call succeeds already, we skip parsing the second buffer.
681 if (mSniffingBuffer) {
682 status = XML_Parse(ud.mExpat,
683 reinterpret_cast<const char*>(mSniffingBuffer.get()),
684 mSniffingLength, false);
685 }
686 if (status == XML_STATUS_OK && mCharsetSource < kCharsetFromMetaTag) {
687 mozilla::Unused << XML_Parse(
688 ud.mExpat, reinterpret_cast<const char*>(aFromSegment.Elements()),
689 aCountToSniffingLimit, false);
690 }
691 XML_ParserFree(ud.mExpat);
692
693 if (mCharsetSource < kCharsetFromMetaTag) {
694 // Failed to get an encoding from the XML declaration. XML defaults
695 // confidently to UTF-8 in this case.
696 // It is also possible that the document has an XML declaration that is
697 // longer than 1024 bytes, but that case is not worth worrying about.
698 mEncoding = UTF_8_ENCODING;
699 mCharsetSource = kCharsetFromMetaTag; // means confident
700 mTreeBuilder->SetDocumentCharset(mEncoding, mCharsetSource);
701 }
702
703 return SetupDecodingAndWriteSniffingBufferAndCurrentSegment(aFromSegment);
704 }
705 bool forced = (mCharsetSource == kCharsetFromPendingUserForcedAutoDetection ||
706 mCharsetSource == kCharsetFromInitialUserForcedAutoDetection ||
707 mCharsetSource == kCharsetFromFinalUserForcedAutoDetection);
708 if (!mChannelHadCharset &&
709 (forced || mCharsetSource < kCharsetFromMetaPrescan) &&
710 (mMode == NORMAL || mMode == VIEW_SOURCE_HTML || mMode == LOAD_AS_DATA)) {
711 // Look for XML declaration in text/html.
712
713 const uint8_t* buf;
714 size_t bufLen;
715 if (mSniffingLength) {
716 // Copy data to a contiguous buffer if we already have something buffered
717 // up.
718 memcpy(mSniffingBuffer.get() + mSniffingLength, aFromSegment.Elements(),
719 aCountToSniffingLimit);
720 mSniffingLength += aCountToSniffingLimit;
721 aFromSegment = aFromSegment.From(aCountToSniffingLimit);
722 aCountToSniffingLimit = 0;
723 buf = mSniffingBuffer.get();
724 bufLen = mSniffingLength;
725 } else {
726 buf = aFromSegment.Elements();
727 bufLen = aCountToSniffingLimit;
728 }
729 const Encoding* encoding = xmldecl_parse(buf, bufLen);
730 if (encoding) {
731 if (forced &&
732 (encoding->IsAsciiCompatible() || encoding == ISO_2022_JP_ENCODING)) {
733 // Honor override
734 if (mCharsetSource == kCharsetFromFinalUserForcedAutoDetection) {
735 DontGuessEncoding();
736 } else {
737 FinalizeSniffingWithDetector(aFromSegment, aCountToSniffingLimit,
738 false);
739 }
740 return SetupDecodingAndWriteSniffingBufferAndCurrentSegment(
741 aFromSegment);
742 }
743 DontGuessEncoding();
744 mEncoding = WrapNotNull(encoding);
745 mCharsetSource = kCharsetFromXmlDeclaration;
746 mTreeBuilder->SetDocumentCharset(mEncoding, mCharsetSource);
747 } else if (mCharsetSource < kCharsetFromIrreversibleAutoDetection) {
748 // meta scan and XML declaration check failed.
749 // Check for BOMless UTF-16 with Basic
750 // Latin content for compat with IE. See bug 631751.
751 SniffBOMlessUTF16BasicLatin(buf, bufLen);
752 }
753 }
754 if (forced && mCharsetSource != kCharsetFromIrreversibleAutoDetection) {
755 // neither meta nor XML declaration found, honor override
756 if (mCharsetSource == kCharsetFromFinalUserForcedAutoDetection) {
757 DontGuessEncoding();
758 } else {
759 FinalizeSniffingWithDetector(aFromSegment, aCountToSniffingLimit, false);
760 }
761 return SetupDecodingAndWriteSniffingBufferAndCurrentSegment(aFromSegment);
762 }
763
764 // the charset may have been set now
765 // maybe try chardet now;
766 if (mFeedChardet) {
767 FinalizeSniffingWithDetector(aFromSegment, aCountToSniffingLimit, aEof);
768 // fall thru; charset may have changed
769 }
770 if (mCharsetSource == kCharsetUninitialized) {
771 // Hopefully this case is never needed, but dealing with it anyway
772 mEncoding = WINDOWS_1252_ENCODING;
773 mCharsetSource = kCharsetFromFallback;
774 mTreeBuilder->SetDocumentCharset(mEncoding, mCharsetSource);
775 } else if (mMode == LOAD_AS_DATA && mCharsetSource == kCharsetFromFallback) {
776 NS_ASSERTION(mReparseForbidden, "Reparse should be forbidden for XHR");
777 NS_ASSERTION(!mFeedChardet, "Should not feed chardet for XHR");
778 NS_ASSERTION(mEncoding == UTF_8_ENCODING, "XHR should default to UTF-8");
779 // Now mark charset source as non-weak to signal that we have a decision
780 mCharsetSource = kCharsetFromDocTypeDefault;
781 mTreeBuilder->SetDocumentCharset(mEncoding, mCharsetSource);
782 }
783 return SetupDecodingAndWriteSniffingBufferAndCurrentSegment(aFromSegment);
784 }
785
SniffStreamBytes(Span<const uint8_t> aFromSegment)786 nsresult nsHtml5StreamParser::SniffStreamBytes(
787 Span<const uint8_t> aFromSegment) {
788 MOZ_ASSERT(IsParserThread(), "Wrong thread!");
789 // mEncoding and mCharsetSource potentially have come from channel or higher
790 // by now. If we find a BOM, SetupDecodingFromBom() will overwrite them.
791 // If we don't find a BOM, the previously set values of mEncoding and
792 // mCharsetSource are not modified by the BOM sniffing here.
793 for (uint32_t i = 0;
794 i < aFromSegment.Length() && mBomState != BOM_SNIFFING_OVER; i++) {
795 switch (mBomState) {
796 case BOM_SNIFFING_NOT_STARTED:
797 MOZ_ASSERT(i == 0, "Bad BOM sniffing state.");
798 switch (aFromSegment[0]) {
799 case 0xEF:
800 mBomState = SEEN_UTF_8_FIRST_BYTE;
801 break;
802 case 0xFF:
803 mBomState = SEEN_UTF_16_LE_FIRST_BYTE;
804 break;
805 case 0xFE:
806 mBomState = SEEN_UTF_16_BE_FIRST_BYTE;
807 break;
808 case 0x00:
809 if (mCharsetSource < kCharsetFromXmlDeclarationUtf16 &&
810 !mChannelHadCharset) {
811 mBomState = SEEN_UTF_16_BE_XML_FIRST;
812 } else {
813 mBomState = BOM_SNIFFING_OVER;
814 }
815 break;
816 case 0x3C:
817 if (mCharsetSource < kCharsetFromXmlDeclarationUtf16 &&
818 !mChannelHadCharset) {
819 mBomState = SEEN_UTF_16_LE_XML_FIRST;
820 } else {
821 mBomState = BOM_SNIFFING_OVER;
822 }
823 break;
824 default:
825 mBomState = BOM_SNIFFING_OVER;
826 break;
827 }
828 break;
829 case SEEN_UTF_16_LE_FIRST_BYTE:
830 if (aFromSegment[i] == 0xFE) {
831 SetupDecodingFromBom(UTF_16LE_ENCODING);
832 return WriteStreamBytes(aFromSegment.From(i + 1));
833 }
834 mBomState = BOM_SNIFFING_OVER;
835 break;
836 case SEEN_UTF_16_BE_FIRST_BYTE:
837 if (aFromSegment[i] == 0xFF) {
838 SetupDecodingFromBom(UTF_16BE_ENCODING);
839 return WriteStreamBytes(aFromSegment.From(i + 1));
840 }
841 mBomState = BOM_SNIFFING_OVER;
842 break;
843 case SEEN_UTF_8_FIRST_BYTE:
844 if (aFromSegment[i] == 0xBB) {
845 mBomState = SEEN_UTF_8_SECOND_BYTE;
846 } else {
847 mBomState = BOM_SNIFFING_OVER;
848 }
849 break;
850 case SEEN_UTF_8_SECOND_BYTE:
851 if (aFromSegment[i] == 0xBF) {
852 SetupDecodingFromBom(UTF_8_ENCODING);
853 return WriteStreamBytes(aFromSegment.From(i + 1));
854 }
855 mBomState = BOM_SNIFFING_OVER;
856 break;
857 case SEEN_UTF_16_BE_XML_FIRST:
858 if (aFromSegment[i] == 0x3C) {
859 mBomState = SEEN_UTF_16_BE_XML_SECOND;
860 } else {
861 mBomState = BOM_SNIFFING_OVER;
862 }
863 break;
864 case SEEN_UTF_16_BE_XML_SECOND:
865 if (aFromSegment[i] == 0x00) {
866 mBomState = SEEN_UTF_16_BE_XML_THIRD;
867 } else {
868 mBomState = BOM_SNIFFING_OVER;
869 }
870 break;
871 case SEEN_UTF_16_BE_XML_THIRD:
872 if (aFromSegment[i] == 0x3F) {
873 mBomState = SEEN_UTF_16_BE_XML_FOURTH;
874 } else {
875 mBomState = BOM_SNIFFING_OVER;
876 }
877 break;
878 case SEEN_UTF_16_BE_XML_FOURTH:
879 if (aFromSegment[i] == 0x00) {
880 mBomState = SEEN_UTF_16_BE_XML_FIFTH;
881 } else {
882 mBomState = BOM_SNIFFING_OVER;
883 }
884 break;
885 case SEEN_UTF_16_BE_XML_FIFTH:
886 if (aFromSegment[i] == 0x78) {
887 SetupDecodingFromUtf16BogoXml(UTF_16BE_ENCODING);
888 return WriteStreamBytes(aFromSegment.From(i + 1));
889 }
890 mBomState = BOM_SNIFFING_OVER;
891 break;
892 case SEEN_UTF_16_LE_XML_FIRST:
893 if (aFromSegment[i] == 0x00) {
894 mBomState = SEEN_UTF_16_LE_XML_SECOND;
895 } else {
896 mBomState = BOM_SNIFFING_OVER;
897 }
898 break;
899 case SEEN_UTF_16_LE_XML_SECOND:
900 if (aFromSegment[i] == 0x3F) {
901 mBomState = SEEN_UTF_16_LE_XML_THIRD;
902 } else {
903 mBomState = BOM_SNIFFING_OVER;
904 }
905 break;
906 case SEEN_UTF_16_LE_XML_THIRD:
907 if (aFromSegment[i] == 0x00) {
908 mBomState = SEEN_UTF_16_LE_XML_FOURTH;
909 } else {
910 mBomState = BOM_SNIFFING_OVER;
911 }
912 break;
913 case SEEN_UTF_16_LE_XML_FOURTH:
914 if (aFromSegment[i] == 0x78) {
915 mBomState = SEEN_UTF_16_LE_XML_FIFTH;
916 } else {
917 mBomState = BOM_SNIFFING_OVER;
918 }
919 break;
920 case SEEN_UTF_16_LE_XML_FIFTH:
921 if (aFromSegment[i] == 0x00) {
922 SetupDecodingFromUtf16BogoXml(UTF_16LE_ENCODING);
923 return WriteStreamBytes(aFromSegment.From(i + 1));
924 }
925 mBomState = BOM_SNIFFING_OVER;
926 break;
927 default:
928 mBomState = BOM_SNIFFING_OVER;
929 break;
930 }
931 }
932 // if we get here, there either was no BOM or the BOM sniffing isn't complete
933 // yet
934
935 MOZ_ASSERT(mCharsetSource != kCharsetFromByteOrderMark,
936 "Should not come here if BOM was found.");
937 MOZ_ASSERT(mCharsetSource != kCharsetFromXmlDeclarationUtf16,
938 "Should not come here if UTF-16 bogo-XML declaration was found.");
939 MOZ_ASSERT(mCharsetSource != kCharsetFromOtherComponent,
940 "kCharsetFromOtherComponent is for XSLT.");
941
942 if (mBomState == BOM_SNIFFING_OVER && mCharsetSource == kCharsetFromChannel) {
943 // There was no BOM and the charset came from channel. mEncoding
944 // still contains the charset from the channel as set by an
945 // earlier call to SetDocumentCharset(), since we didn't find a BOM and
946 // overwrite mEncoding. (Note that if the user has overridden the charset,
947 // we don't come here but check <meta> for XSS-dangerous charsets first.)
948 mTreeBuilder->SetDocumentCharset(mEncoding, mCharsetSource);
949 return SetupDecodingAndWriteSniffingBufferAndCurrentSegment(aFromSegment);
950 }
951
952 if (!mChannelHadCharset && !mMetaScanner &&
953 (mMode == NORMAL || mMode == VIEW_SOURCE_HTML || mMode == LOAD_AS_DATA)) {
954 mMetaScanner = MakeUnique<nsHtml5MetaScanner>(mTreeBuilder.get());
955 }
956
957 if (mSniffingLength + aFromSegment.Length() >= SNIFFING_BUFFER_SIZE) {
958 // this is the last buffer
959 uint32_t countToSniffingLimit = SNIFFING_BUFFER_SIZE - mSniffingLength;
960 bool forced =
961 (mCharsetSource == kCharsetFromPendingUserForcedAutoDetection ||
962 mCharsetSource == kCharsetFromInitialUserForcedAutoDetection ||
963 mCharsetSource == kCharsetFromFinalUserForcedAutoDetection);
964 if (!mChannelHadCharset && (mMode == NORMAL || mMode == VIEW_SOURCE_HTML ||
965 mMode == LOAD_AS_DATA)) {
966 nsHtml5ByteReadable readable(
967 aFromSegment.Elements(),
968 aFromSegment.Elements() + countToSniffingLimit);
969 nsAutoCString charset;
970 auto encoding = mMetaScanner->sniff(&readable);
971 // Due to the way nsHtml5Portability reports OOM, ask the tree buider
972 nsresult rv;
973 if (NS_FAILED((rv = mTreeBuilder->IsBroken()))) {
974 MarkAsBroken(rv);
975 return rv;
976 }
977
978 if (encoding) {
979 // meta scan successful; honor overrides unless meta is XSS-dangerous
980 if (forced && (encoding->IsAsciiCompatible() ||
981 encoding == ISO_2022_JP_ENCODING)) {
982 // Honor override
983 if (mCharsetSource == kCharsetFromFinalUserForcedAutoDetection) {
984 DontGuessEncoding();
985 } else {
986 FinalizeSniffingWithDetector(aFromSegment, countToSniffingLimit,
987 false);
988 }
989 return SetupDecodingAndWriteSniffingBufferAndCurrentSegment(
990 aFromSegment);
991 }
992 DontGuessEncoding();
993 mEncoding = WrapNotNull(encoding);
994 mCharsetSource = kCharsetFromMetaPrescan;
995 mTreeBuilder->SetDocumentCharset(mEncoding, mCharsetSource);
996 return SetupDecodingAndWriteSniffingBufferAndCurrentSegment(
997 aFromSegment);
998 }
999 }
1000 return FinalizeSniffing(aFromSegment, countToSniffingLimit, false);
1001 }
1002
1003 // not the last buffer
1004 if (!mChannelHadCharset &&
1005 (mMode == NORMAL || mMode == VIEW_SOURCE_HTML || mMode == LOAD_AS_DATA)) {
1006 nsHtml5ByteReadable readable(
1007 aFromSegment.Elements(),
1008 aFromSegment.Elements() + aFromSegment.Length());
1009 auto encoding = mMetaScanner->sniff(&readable);
1010 // Due to the way nsHtml5Portability reports OOM, ask the tree buider
1011 nsresult rv;
1012 if (NS_FAILED((rv = mTreeBuilder->IsBroken()))) {
1013 MarkAsBroken(rv);
1014 return rv;
1015 }
1016 if (encoding) {
1017 // meta scan successful; honor overrides unless meta is XSS-dangerous
1018 if ((mCharsetSource == kCharsetFromFinalUserForcedAutoDetection) &&
1019 (encoding->IsAsciiCompatible() || encoding == ISO_2022_JP_ENCODING)) {
1020 // Honor override
1021 return SetupDecodingAndWriteSniffingBufferAndCurrentSegment(
1022 aFromSegment);
1023 }
1024 if ((mCharsetSource == kCharsetFromPendingUserForcedAutoDetection ||
1025 mCharsetSource == kCharsetFromInitialUserForcedAutoDetection) &&
1026 (encoding->IsAsciiCompatible() || encoding == ISO_2022_JP_ENCODING)) {
1027 FinalizeSniffingWithDetector(aFromSegment, aFromSegment.Length(),
1028 false);
1029 } else {
1030 DontGuessEncoding();
1031 mEncoding = WrapNotNull(encoding);
1032 mCharsetSource = kCharsetFromMetaPrescan;
1033 mTreeBuilder->SetDocumentCharset(mEncoding, mCharsetSource);
1034 }
1035 return SetupDecodingAndWriteSniffingBufferAndCurrentSegment(aFromSegment);
1036 }
1037 }
1038
1039 if (!mSniffingBuffer) {
1040 mSniffingBuffer = MakeUniqueFallible<uint8_t[]>(SNIFFING_BUFFER_SIZE);
1041 if (!mSniffingBuffer) {
1042 return NS_ERROR_OUT_OF_MEMORY;
1043 }
1044 }
1045 memcpy(&mSniffingBuffer[mSniffingLength], aFromSegment.Elements(),
1046 aFromSegment.Length());
1047 mSniffingLength += aFromSegment.Length();
1048 return NS_OK;
1049 }
1050
1051 class AddContentRunnable : public Runnable {
1052 public:
AddContentRunnable(const nsAString & aParserID,nsIURI * aURI,Span<const char16_t> aData,bool aComplete)1053 AddContentRunnable(const nsAString& aParserID, nsIURI* aURI,
1054 Span<const char16_t> aData, bool aComplete)
1055 : Runnable("AddContent") {
1056 nsAutoCString spec;
1057 aURI->GetSpec(spec);
1058 mData.mUri.Construct(NS_ConvertUTF8toUTF16(spec));
1059 mData.mParserID.Construct(aParserID);
1060 mData.mContents.Construct(aData.Elements(), aData.Length());
1061 mData.mComplete.Construct(aComplete);
1062 }
1063
Run()1064 NS_IMETHOD Run() override {
1065 nsAutoString json;
1066 if (!mData.ToJSON(json)) {
1067 return NS_ERROR_FAILURE;
1068 }
1069
1070 nsCOMPtr<nsIObserverService> obsService = services::GetObserverService();
1071 if (obsService) {
1072 obsService->NotifyObservers(nullptr, "devtools-html-content",
1073 PromiseFlatString(json).get());
1074 }
1075
1076 return NS_OK;
1077 }
1078
1079 HTMLContent mData;
1080 };
1081
OnNewContent(Span<const char16_t> aData)1082 inline void nsHtml5StreamParser::OnNewContent(Span<const char16_t> aData) {
1083 if (mURIToSendToDevtools) {
1084 NS_DispatchToMainThread(new AddContentRunnable(mUUIDForDevtools,
1085 mURIToSendToDevtools, aData,
1086 /* aComplete */ false));
1087 }
1088 }
1089
OnContentComplete()1090 inline void nsHtml5StreamParser::OnContentComplete() {
1091 if (mURIToSendToDevtools) {
1092 NS_DispatchToMainThread(new AddContentRunnable(
1093 mUUIDForDevtools, mURIToSendToDevtools, Span<const char16_t>(),
1094 /* aComplete */ true));
1095 mURIToSendToDevtools = nullptr;
1096 }
1097 }
1098
WriteStreamBytes(Span<const uint8_t> aFromSegment)1099 nsresult nsHtml5StreamParser::WriteStreamBytes(
1100 Span<const uint8_t> aFromSegment) {
1101 NS_ASSERTION(IsParserThread(), "Wrong thread!");
1102 // mLastBuffer should always point to a buffer of the size
1103 // READ_BUFFER_SIZE.
1104 if (!mLastBuffer) {
1105 NS_WARNING("mLastBuffer should not be null!");
1106 MarkAsBroken(NS_ERROR_NULL_POINTER);
1107 return NS_ERROR_NULL_POINTER;
1108 }
1109 size_t totalRead = 0;
1110 auto src = aFromSegment;
1111 for (;;) {
1112 auto dst = mLastBuffer->TailAsSpan(READ_BUFFER_SIZE);
1113 uint32_t result;
1114 size_t read;
1115 size_t written;
1116 bool hadErrors;
1117 Tie(result, read, written, hadErrors) =
1118 mUnicodeDecoder->DecodeToUTF16(src, dst, false);
1119 if (!mDecodingLocalFileWithoutTokenizing) {
1120 OnNewContent(dst.To(written));
1121 }
1122 if (hadErrors && !mHasHadErrors) {
1123 mHasHadErrors = true;
1124 if (mEncoding == UTF_8_ENCODING) {
1125 mTreeBuilder->TryToEnableEncodingMenu();
1126 }
1127 }
1128 src = src.From(read);
1129 totalRead += read;
1130 mLastBuffer->AdvanceEnd(written);
1131 if (result == kOutputFull) {
1132 RefPtr<nsHtml5OwningUTF16Buffer> newBuf =
1133 nsHtml5OwningUTF16Buffer::FalliblyCreate(READ_BUFFER_SIZE);
1134 if (!newBuf) {
1135 MarkAsBroken(NS_ERROR_OUT_OF_MEMORY);
1136 return NS_ERROR_OUT_OF_MEMORY;
1137 }
1138 mLastBuffer = (mLastBuffer->next = std::move(newBuf));
1139 } else {
1140 MOZ_ASSERT(totalRead == aFromSegment.Length(),
1141 "The Unicode decoder consumed the wrong number of bytes.");
1142 if (mDecodingLocalFileWithoutTokenizing &&
1143 mLocalFileBytesBuffered == LOCAL_FILE_UTF_8_BUFFER_SIZE) {
1144 auto encoding = mEncoding;
1145 GuessEncoding(false, false);
1146 if (encoding == mEncoding) {
1147 CommitLocalFileToEncoding();
1148 } else {
1149 ReDecodeLocalFile();
1150 }
1151 }
1152 return NS_OK;
1153 }
1154 }
1155 }
1156
ReDecodeLocalFile()1157 void nsHtml5StreamParser::ReDecodeLocalFile() {
1158 MOZ_ASSERT(mDecodingLocalFileWithoutTokenizing);
1159 mDecodingLocalFileWithoutTokenizing = false;
1160 mUnicodeDecoder = mEncoding->NewDecoderWithBOMRemoval();
1161 mHasHadErrors = false;
1162
1163 DontGuessEncoding();
1164
1165 // Throw away previous decoded data
1166 mLastBuffer = mFirstBuffer;
1167 mLastBuffer->next = nullptr;
1168 mLastBuffer->setStart(0);
1169 mLastBuffer->setEnd(0);
1170
1171 // Decode again
1172 for (auto&& buffer : mBufferedLocalFileData) {
1173 DoDataAvailable(buffer);
1174 }
1175 }
1176
CommitLocalFileToEncoding()1177 void nsHtml5StreamParser::CommitLocalFileToEncoding() {
1178 MOZ_ASSERT(mDecodingLocalFileWithoutTokenizing);
1179 mDecodingLocalFileWithoutTokenizing = false;
1180 mFeedChardet = false;
1181 mGuessEncoding = false;
1182
1183 nsHtml5OwningUTF16Buffer* buffer = mFirstBuffer;
1184 while (buffer) {
1185 Span<const char16_t> data(buffer->getBuffer() + buffer->getStart(),
1186 buffer->getLength());
1187 OnNewContent(data);
1188 buffer = buffer->next;
1189 }
1190 }
1191
1192 class MaybeRunCollector : public Runnable {
1193 public:
MaybeRunCollector(nsIDocShell * aDocShell)1194 explicit MaybeRunCollector(nsIDocShell* aDocShell)
1195 : Runnable("MaybeRunCollector"), mDocShell(aDocShell) {}
1196
Run()1197 NS_IMETHOD Run() override {
1198 nsJSContext::MaybeRunNextCollectorSlice(mDocShell,
1199 JS::GCReason::HTML_PARSER);
1200 return NS_OK;
1201 }
1202
1203 nsCOMPtr<nsIDocShell> mDocShell;
1204 };
1205
/**
 * Main-thread notification that the request has started.
 *
 * Starts the tokenizer, tree builder and executor, derives encoding-related
 * state from the channel's URIs (built-in UTF-8 for "resource" URIs, deferred
 * tokenizing for "file" URIs, otherwise remembering the host's last label in
 * mTLD), allocates the first output buffer, decides whether reparsing is
 * allowed, and tries to retarget data delivery to mEventTarget. If the
 * charset source is at least kCharsetFromUtf8OnlyMime, the decoder is
 * instantiated right away.
 *
 * @param aRequest the request that is starting; stored in mRequest.
 * @return NS_OK on success; the failure code of WillBuildModel() if that
 *         fails; the result of marking the executor as broken if the first
 *         buffer cannot be allocated.
 */
nsresult nsHtml5StreamParser::OnStartRequest(nsIRequest* aRequest) {
  MOZ_RELEASE_ASSERT(STREAM_NOT_STARTED == mStreamState,
                     "Got OnStartRequest when the stream had already started.");
  MOZ_ASSERT(
      !mExecutor->HasStarted(),
      "Got OnStartRequest at the wrong stage in the executor life cycle.");
  MOZ_ASSERT(NS_IsMainThread(), "Wrong thread!");

  // To avoid the cost of instantiating the detector when it's not needed,
  // let's instantiate only if we make it out of this method with the
  // intent to use it.
  auto detectorCreator = MakeScopeExit([&] {
    if (mFeedChardet) {
      mDetector = mozilla::EncodingDetector::Create();
    }
  });

  if (mObserver) {
    mObserver->OnStartRequest(aRequest);
  }
  mRequest = aRequest;

  mStreamState = STREAM_BEING_READ;

  if (mMode == VIEW_SOURCE_HTML || mMode == VIEW_SOURCE_XML) {
    mTokenizer->StartViewSource(NS_ConvertUTF8toUTF16(mViewSourceTitle));
  }

  // For View Source, the parser should run with scripts "enabled" if a normal
  // load would have scripts enabled.
  bool scriptingEnabled =
      mMode == LOAD_AS_DATA ? false : mExecutor->IsScriptEnabled();
  mOwner->StartTokenizer(scriptingEnabled);

  MOZ_ASSERT(!mDecodingLocalFileWithoutTokenizing);
  bool isSrcdoc = false;
  nsCOMPtr<nsIChannel> channel;
  nsresult rv = GetChannel(getter_AddRefs(channel));
  if (NS_SUCCEEDED(rv)) {
    isSrcdoc = NS_IsSrcdocChannel(channel);
    // Only look at the URIs when nothing stronger than the fallback has
    // supplied an encoding.
    if (!isSrcdoc && mCharsetSource <= kCharsetFromFallback) {
      nsCOMPtr<nsIURI> originalURI;
      rv = channel->GetOriginalURI(getter_AddRefs(originalURI));
      if (NS_SUCCEEDED(rv)) {
        if (originalURI->SchemeIs("resource")) {
          mCharsetSource = kCharsetFromBuiltIn;
          mEncoding = UTF_8_ENCODING;
          mTreeBuilder->SetDocumentCharset(mEncoding, mCharsetSource);
        } else {
          nsCOMPtr<nsIURI> currentURI;
          rv = channel->GetURI(getter_AddRefs(currentURI));
          if (NS_SUCCEEDED(rv)) {
            nsCOMPtr<nsIURI> innermost = NS_GetInnermostURI(currentURI);
            if (innermost->SchemeIs("file")) {
              mDecodingLocalFileWithoutTokenizing = true;
            } else {
              nsAutoCString host;
              innermost->GetAsciiHost(host);
              if (!host.IsEmpty()) {
                // First let's see if the host is DNS-absolute and ends with a
                // dot and get rid of that one.
                if (host.Last() == '.') {
                  host.SetLength(host.Length() - 1);
                }
                int32_t index = host.RFindChar('.');
                if (index != kNotFound) {
                  // We tolerate an IPv4 component as generic "TLD", so don't
                  // bother checking.
                  ToLowerCase(
                      Substring(host, index + 1, host.Length() - (index + 1)),
                      mTLD);
                }
              }
            }
          }
        }
      }
    }
  }
  mTreeBuilder->setIsSrcdocDocument(isSrcdoc);
  mTreeBuilder->setScriptingEnabled(scriptingEnabled);
  // Script execution is prevented unless this is a normal load with
  // scripting enabled.
  mTreeBuilder->SetPreventScriptExecution(
      !((mMode == NORMAL) && scriptingEnabled));
  mTokenizer->start();
  mExecutor->Start();
  mExecutor->StartReadingFromStage();

  if (mMode == PLAIN_TEXT) {
    mTreeBuilder->StartPlainText();
    mTokenizer->StartPlainText();
  } else if (mMode == VIEW_SOURCE_PLAIN) {
    nsAutoString viewSourceTitle;
    CopyUTF8toUTF16(mViewSourceTitle, viewSourceTitle);
    mTreeBuilder->EnsureBufferSpace(viewSourceTitle.Length());
    mTreeBuilder->StartPlainTextViewSource(viewSourceTitle);
    mTokenizer->StartPlainText();
  }

  /*
   * If you move the following line, be very careful not to cause
   * WillBuildModel to be called before the document has had its
   * script global object set.
   */
  rv = mExecutor->WillBuildModel(eDTDMode_unknown);
  NS_ENSURE_SUCCESS(rv, rv);

  RefPtr<nsHtml5OwningUTF16Buffer> newBuf =
      nsHtml5OwningUTF16Buffer::FalliblyCreate(READ_BUFFER_SIZE);
  if (!newBuf) {
    // marks this stream parser as terminated,
    // which prevents entry to code paths that
    // would use mFirstBuffer or mLastBuffer.
    return mExecutor->MarkAsBroken(NS_ERROR_OUT_OF_MEMORY);
  }
  MOZ_ASSERT(!mFirstBuffer, "How come we have the first buffer set?");
  MOZ_ASSERT(!mLastBuffer, "How come we have the last buffer set?");
  mFirstBuffer = mLastBuffer = newBuf;

  rv = NS_OK;

  // The line below means that the encoding can end up being wrong if
  // a view-source URL is loaded without having the encoding hint from a
  // previous normal load in the history.
  mReparseForbidden = !(mMode == NORMAL || mMode == PLAIN_TEXT);

  mNetworkEventTarget =
      mExecutor->GetDocument()->EventTargetFor(TaskCategory::Network);

  // rv tells whether the request is an HTTP channel.
  nsCOMPtr<nsIHttpChannel> httpChannel(do_QueryInterface(mRequest, &rv));
  if (NS_SUCCEEDED(rv)) {
    // Non-HTTP channels are bogus enough that we let them work with unlabeled
    // runnables for now. Asserting for HTTP channels only.
    MOZ_ASSERT(mNetworkEventTarget || mMode == LOAD_AS_DATA,
               "How come the network event target is still null?");

    nsAutoCString method;
    Unused << httpChannel->GetRequestMethod(method);
    // XXX does Necko have a way to renavigate POST, etc. without hitting
    // the network?
    if (!method.EqualsLiteral("GET")) {
      // This is the old Gecko behavior but the HTML5 spec disagrees.
      // Don't reparse on POST.
      mReparseForbidden = true;
    }
  }

  // Attempt to retarget delivery of data (via OnDataAvailable) to the parser
  // thread, rather than through the main thread.
  nsCOMPtr<nsIThreadRetargetableRequest> threadRetargetableRequest =
      do_QueryInterface(mRequest, &rv);
  if (threadRetargetableRequest) {
    rv = threadRetargetableRequest->RetargetDeliveryTo(mEventTarget);
    if (NS_SUCCEEDED(rv)) {
      // Parser thread should be now ready to get data from necko and parse it
      // and main thread might have a chance to process a collector slice.
      // We need to do this asynchronously so that necko may continue processing
      // the request.
      nsCOMPtr<nsIRunnable> runnable =
          new MaybeRunCollector(mExecutor->GetDocument()->GetDocShell());
      mozilla::SchedulerGroup::Dispatch(
          mozilla::TaskCategory::GarbageCollection, runnable.forget());
    }
  }

  // rv is a failure here if either the interface query or the retargeting
  // itself failed. This is only warned about; the method carries on.
  if (NS_FAILED(rv)) {
    NS_WARNING("Failed to retarget HTML data delivery to the parser thread.");
  }

  if (mCharsetSource == kCharsetFromParentFrame) {
    // Remember this for error reporting.
    mInitialEncodingWasFromParentFrame = true;
  }

  // Unless the user forced auto-detection, a sufficiently strong charset
  // source turns off encoding guessing.
  if (!(mCharsetSource == kCharsetFromPendingUserForcedAutoDetection ||
        mCharsetSource == kCharsetFromInitialUserForcedAutoDetection ||
        mCharsetSource == kCharsetFromFinalUserForcedAutoDetection)) {
    if (mCharsetSource >= kCharsetFromFinalAutoDetectionWouldHaveBeenUTF8) {
      DontGuessEncoding();
    }
  }

  if (mCharsetSource < kCharsetFromUtf8OnlyMime) {
    // we aren't ready to commit to an encoding yet
    // leave converter uninstantiated for now
    return NS_OK;
  }

  // We are loading JSON/WebVTT/etc. into a browsing context.
  // There's no need to remove the BOM manually here, because
  // the UTF-8 decoder removes it.
  mReparseForbidden = true;
  DontGuessEncoding();

  // Instantiate the converter here to avoid BOM sniffing.
  mDecodingLocalFileWithoutTokenizing = false;
  mUnicodeDecoder = mEncoding->NewDecoderWithBOMRemoval();
  return NS_OK;
}
1404
CheckListenerChain()1405 nsresult nsHtml5StreamParser::CheckListenerChain() {
1406 NS_ASSERTION(NS_IsMainThread(), "Should be on the main thread!");
1407 if (!mObserver) {
1408 return NS_OK;
1409 }
1410 nsresult rv;
1411 nsCOMPtr<nsIThreadRetargetableStreamListener> retargetable =
1412 do_QueryInterface(mObserver, &rv);
1413 if (NS_SUCCEEDED(rv) && retargetable) {
1414 rv = retargetable->CheckListenerChain();
1415 }
1416 return rv;
1417 }
1418
/**
 * Parser-thread handling of the end of the stream; the caller must hold
 * mTokenizerMutex.
 *
 * Finalizes sniffing if no decoder exists yet, gives the detector its final
 * (empty) feed, flushes the decoder with the "last" flag set, and then either
 * commits or re-decodes a local file that was being decoded without
 * tokenizing. Finally marks the stream as ended and parses what is available.
 * OnContentComplete() runs on every exit path via a scope-exit guard.
 */
void nsHtml5StreamParser::DoStopRequest() {
  NS_ASSERTION(IsParserThread(), "Wrong thread!");
  MOZ_RELEASE_ASSERT(STREAM_BEING_READ == mStreamState,
                     "Stream ended without being open.");
  mTokenizerMutex.AssertCurrentThreadOwns();

  // Runs OnContentComplete() whenever this function returns.
  auto guard = MakeScopeExit([&] { OnContentComplete(); });

  if (IsTerminated()) {
    return;
  }

  if (!mUnicodeDecoder) {
    // The stream ended before sniffing completed; finalize it with an empty
    // segment and the EOF flag set.
    nsresult rv;
    Span<const uint8_t> empty;
    if (NS_FAILED(rv = FinalizeSniffing(empty, 0, true))) {
      MarkAsBroken(rv);
      return;
    }
  }
  if (mFeedChardet) {
    mFeedChardet = false;
    FeedDetector(Span<uint8_t>(), true);
  }

  MOZ_ASSERT(mUnicodeDecoder,
             "Should have a decoder after finalizing sniffing.");

  // mLastBuffer should always point to a buffer of the size
  // READ_BUFFER_SIZE.
  if (!mLastBuffer) {
    NS_WARNING("mLastBuffer should not be null!");
    MarkAsBroken(NS_ERROR_NULL_POINTER);
    return;
  }

  Span<uint8_t> src;  // empty span
  // Flush the decoder: feed it no input with the "last" flag set until it
  // stops reporting a full output buffer.
  for (;;) {
    auto dst = mLastBuffer->TailAsSpan(READ_BUFFER_SIZE);
    uint32_t result;
    size_t read;
    size_t written;
    bool hadErrors;
    Tie(result, read, written, hadErrors) =
        mUnicodeDecoder->DecodeToUTF16(src, dst, true);
    if (!mDecodingLocalFileWithoutTokenizing) {
      OnNewContent(dst.To(written));
    }
    if (hadErrors && !mHasHadErrors) {
      mHasHadErrors = true;
      if (mEncoding == UTF_8_ENCODING) {
        mTreeBuilder->TryToEnableEncodingMenu();
      }
    }
    MOZ_ASSERT(read == 0, "How come an empty span was read form?");
    mLastBuffer->AdvanceEnd(written);
    if (result == kOutputFull) {
      // Out of output space: chain a new buffer and continue flushing.
      RefPtr<nsHtml5OwningUTF16Buffer> newBuf =
          nsHtml5OwningUTF16Buffer::FalliblyCreate(READ_BUFFER_SIZE);
      if (!newBuf) {
        MarkAsBroken(NS_ERROR_OUT_OF_MEMORY);
        return;
      }
      mLastBuffer = (mLastBuffer->next = std::move(newBuf));
    } else {
      if (mDecodingLocalFileWithoutTokenizing) {
        MOZ_ASSERT(mLocalFileBytesBuffered < LOCAL_FILE_UTF_8_BUFFER_SIZE);
        MOZ_ASSERT(mGuessEncoding);
        auto encoding = mEncoding;
        GuessEncoding(true, false);
        if (encoding == mEncoding) {
          CommitLocalFileToEncoding();
        } else {
          // The guess changed the encoding: decode the buffered bytes again
          // and redo end-of-stream handling. ReDecodeLocalFile() clears
          // mDecodingLocalFileWithoutTokenizing, so this branch is not
          // re-entered by the nested call.
          ReDecodeLocalFile();
          DoStopRequest();
          return;
        }
      } else if (mGuessEncoding) {
        GuessEncoding(true, false);
      }
      break;
    }
  }

  mStreamState = STREAM_ENDED;

  if (IsTerminatedOrInterrupted()) {
    return;
  }

  ParseAvailableData();
}
1511
1512 class nsHtml5RequestStopper : public Runnable {
1513 private:
1514 nsHtml5StreamParserPtr mStreamParser;
1515
1516 public:
nsHtml5RequestStopper(nsHtml5StreamParser * aStreamParser)1517 explicit nsHtml5RequestStopper(nsHtml5StreamParser* aStreamParser)
1518 : Runnable("nsHtml5RequestStopper"), mStreamParser(aStreamParser) {}
Run()1519 NS_IMETHOD Run() override {
1520 mozilla::MutexAutoLock autoLock(mStreamParser->mTokenizerMutex);
1521 mStreamParser->DoStopRequest();
1522 return NS_OK;
1523 }
1524 };
1525
OnStopRequest(nsIRequest * aRequest,nsresult status)1526 nsresult nsHtml5StreamParser::OnStopRequest(nsIRequest* aRequest,
1527 nsresult status) {
1528 NS_ASSERTION(mRequest == aRequest, "Got Stop on wrong stream.");
1529 NS_ASSERTION(NS_IsMainThread(), "Wrong thread!");
1530 if (mObserver) {
1531 mObserver->OnStopRequest(aRequest, status);
1532 }
1533 nsCOMPtr<nsIRunnable> stopper = new nsHtml5RequestStopper(this);
1534 if (NS_FAILED(mEventTarget->Dispatch(stopper, nsIThread::DISPATCH_NORMAL))) {
1535 NS_WARNING("Dispatching StopRequest event failed.");
1536 }
1537 return NS_OK;
1538 }
1539
DoDataAvailableBuffer(mozilla::Buffer<uint8_t> && aBuffer)1540 void nsHtml5StreamParser::DoDataAvailableBuffer(
1541 mozilla::Buffer<uint8_t>&& aBuffer) {
1542 if (MOZ_LIKELY(!mDecodingLocalFileWithoutTokenizing)) {
1543 DoDataAvailable(aBuffer);
1544 return;
1545 }
1546 CheckedInt<size_t> bufferedPlusLength(aBuffer.Length());
1547 bufferedPlusLength += mLocalFileBytesBuffered;
1548 if (!bufferedPlusLength.isValid()) {
1549 MarkAsBroken(NS_ERROR_OUT_OF_MEMORY);
1550 return;
1551 }
1552 // Ensure that WriteStreamBytes() sees a buffer ending
1553 // exactly at LOCAL_FILE_UTF_8_BUFFER_SIZE
1554 // if we are about to cross the threshold. This way,
1555 // Necko buffer boundaries don't affect user-visible
1556 // behavior.
1557 if (bufferedPlusLength.value() <= LOCAL_FILE_UTF_8_BUFFER_SIZE) {
1558 // Truncation OK, because we just checked the range.
1559 mLocalFileBytesBuffered = bufferedPlusLength.value();
1560 mBufferedLocalFileData.AppendElement(std::move(aBuffer));
1561 DoDataAvailable(mBufferedLocalFileData.LastElement());
1562 } else {
1563 // Truncation OK, because the constant is small enough.
1564 size_t overBoundary =
1565 bufferedPlusLength.value() - LOCAL_FILE_UTF_8_BUFFER_SIZE;
1566 MOZ_RELEASE_ASSERT(overBoundary < aBuffer.Length());
1567 size_t untilBoundary = aBuffer.Length() - overBoundary;
1568 auto span = aBuffer.AsSpan();
1569 auto head = span.To(untilBoundary);
1570 auto tail = span.From(untilBoundary);
1571 MOZ_RELEASE_ASSERT(mLocalFileBytesBuffered + untilBoundary ==
1572 LOCAL_FILE_UTF_8_BUFFER_SIZE);
1573 // We make a theoretically useless copy here, because avoiding
1574 // the copy adds too much complexity.
1575 Maybe<Buffer<uint8_t>> maybe = Buffer<uint8_t>::CopyFrom(head);
1576 if (maybe.isNothing()) {
1577 MarkAsBroken(NS_ERROR_OUT_OF_MEMORY);
1578 return;
1579 }
1580 mLocalFileBytesBuffered = LOCAL_FILE_UTF_8_BUFFER_SIZE;
1581 mBufferedLocalFileData.AppendElement(std::move(*maybe));
1582
1583 DoDataAvailable(head);
1584 // Re-decode may have happened here.
1585 DoDataAvailable(tail);
1586 }
1587 // Do this clean-up here to avoid use-after-free when
1588 // DoDataAvailable is passed a span pointing into an
1589 // element of mBufferedLocalFileData.
1590 if (!mDecodingLocalFileWithoutTokenizing) {
1591 mBufferedLocalFileData.Clear();
1592 }
1593 }
1594
DoDataAvailable(Span<const uint8_t> aBuffer)1595 void nsHtml5StreamParser::DoDataAvailable(Span<const uint8_t> aBuffer) {
1596 NS_ASSERTION(IsParserThread(), "Wrong thread!");
1597 MOZ_RELEASE_ASSERT(STREAM_BEING_READ == mStreamState,
1598 "DoDataAvailable called when stream not open.");
1599 mTokenizerMutex.AssertCurrentThreadOwns();
1600
1601 if (IsTerminated()) {
1602 return;
1603 }
1604
1605 nsresult rv;
1606 if (HasDecoder()) {
1607 if (mFeedChardet) {
1608 FeedDetector(aBuffer, false);
1609 }
1610 rv = WriteStreamBytes(aBuffer);
1611 } else {
1612 rv = SniffStreamBytes(aBuffer);
1613 }
1614 if (NS_FAILED(rv)) {
1615 MarkAsBroken(rv);
1616 return;
1617 }
1618
1619 if (IsTerminatedOrInterrupted()) {
1620 return;
1621 }
1622
1623 if (mDecodingLocalFileWithoutTokenizing) {
1624 return;
1625 }
1626
1627 ParseAvailableData();
1628
1629 if (mFlushTimerArmed || mSpeculating) {
1630 return;
1631 }
1632
1633 {
1634 mozilla::MutexAutoLock flushTimerLock(mFlushTimerMutex);
1635 mFlushTimer->InitWithNamedFuncCallback(
1636 nsHtml5StreamParser::TimerCallback, static_cast<void*>(this),
1637 mFlushTimerEverFired ? StaticPrefs::html5_flushtimer_initialdelay()
1638 : StaticPrefs::html5_flushtimer_subsequentdelay(),
1639 nsITimer::TYPE_ONE_SHOT, "nsHtml5StreamParser::DoDataAvailable");
1640 }
1641 mFlushTimerArmed = true;
1642 }
1643
1644 class nsHtml5DataAvailable : public Runnable {
1645 private:
1646 nsHtml5StreamParserPtr mStreamParser;
1647 Buffer<uint8_t> mData;
1648
1649 public:
nsHtml5DataAvailable(nsHtml5StreamParser * aStreamParser,Buffer<uint8_t> && aData)1650 nsHtml5DataAvailable(nsHtml5StreamParser* aStreamParser,
1651 Buffer<uint8_t>&& aData)
1652 : Runnable("nsHtml5DataAvailable"),
1653 mStreamParser(aStreamParser),
1654 mData(std::move(aData)) {}
Run()1655 NS_IMETHOD Run() override {
1656 mozilla::MutexAutoLock autoLock(mStreamParser->mTokenizerMutex);
1657 mStreamParser->DoDataAvailableBuffer(std::move(mData));
1658 return NS_OK;
1659 }
1660 };
1661
OnDataAvailable(nsIRequest * aRequest,nsIInputStream * aInStream,uint64_t aSourceOffset,uint32_t aLength)1662 nsresult nsHtml5StreamParser::OnDataAvailable(nsIRequest* aRequest,
1663 nsIInputStream* aInStream,
1664 uint64_t aSourceOffset,
1665 uint32_t aLength) {
1666 nsresult rv;
1667 if (NS_FAILED(rv = mExecutor->IsBroken())) {
1668 return rv;
1669 }
1670
1671 MOZ_ASSERT(mRequest == aRequest, "Got data on wrong stream.");
1672 uint32_t totalRead;
1673 // Main thread to parser thread dispatch requires copying to buffer first.
1674 if (MOZ_UNLIKELY(NS_IsMainThread())) {
1675 Maybe<Buffer<uint8_t>> maybe = Buffer<uint8_t>::Alloc(aLength);
1676 if (maybe.isNothing()) {
1677 return mExecutor->MarkAsBroken(NS_ERROR_OUT_OF_MEMORY);
1678 }
1679 Buffer<uint8_t> data(std::move(*maybe));
1680 rv = aInStream->Read(reinterpret_cast<char*>(data.Elements()),
1681 data.Length(), &totalRead);
1682 NS_ENSURE_SUCCESS(rv, rv);
1683 MOZ_ASSERT(totalRead == aLength);
1684
1685 nsCOMPtr<nsIRunnable> dataAvailable =
1686 new nsHtml5DataAvailable(this, std::move(data));
1687 if (NS_FAILED(mEventTarget->Dispatch(dataAvailable,
1688 nsIThread::DISPATCH_NORMAL))) {
1689 NS_WARNING("Dispatching DataAvailable event failed.");
1690 }
1691 return rv;
1692 }
1693 MOZ_ASSERT(IsParserThread(), "Wrong thread!");
1694 mozilla::MutexAutoLock autoLock(mTokenizerMutex);
1695
1696 if (MOZ_UNLIKELY(mDecodingLocalFileWithoutTokenizing)) {
1697 // It's a bit sad to potentially buffer the first 1024
1698 // bytes in two places, but it's a lot simpler than trying
1699 // to optitize out that copy. It only happens for local files
1700 // and not for the http(s) content anyway.
1701 Maybe<Buffer<uint8_t>> maybe = Buffer<uint8_t>::Alloc(aLength);
1702 if (maybe.isNothing()) {
1703 MarkAsBroken(NS_ERROR_OUT_OF_MEMORY);
1704 return NS_ERROR_OUT_OF_MEMORY;
1705 }
1706 Buffer<uint8_t> data(std::move(*maybe));
1707 rv = aInStream->Read(reinterpret_cast<char*>(data.Elements()),
1708 data.Length(), &totalRead);
1709 NS_ENSURE_SUCCESS(rv, rv);
1710 MOZ_ASSERT(totalRead == aLength);
1711 DoDataAvailableBuffer(std::move(data));
1712 return rv;
1713 }
1714 // Read directly from response buffer.
1715 rv = aInStream->ReadSegments(CopySegmentsToParser, this, aLength, &totalRead);
1716 NS_ENSURE_SUCCESS(rv, rv);
1717 MOZ_ASSERT(totalRead == aLength);
1718 return rv;
1719 }
1720
1721 /* static */
CopySegmentsToParser(nsIInputStream * aInStream,void * aClosure,const char * aFromSegment,uint32_t aToOffset,uint32_t aCount,uint32_t * aWriteCount)1722 nsresult nsHtml5StreamParser::CopySegmentsToParser(
1723 nsIInputStream* aInStream, void* aClosure, const char* aFromSegment,
1724 uint32_t aToOffset, uint32_t aCount, uint32_t* aWriteCount) {
1725 nsHtml5StreamParser* parser = static_cast<nsHtml5StreamParser*>(aClosure);
1726
1727 parser->DoDataAvailable(AsBytes(Span(aFromSegment, aCount)));
1728 // Assume DoDataAvailable consumed all available bytes.
1729 *aWriteCount = aCount;
1730 return NS_OK;
1731 }
1732
PreferredForInternalEncodingDecl(const nsACString & aEncoding)1733 const Encoding* nsHtml5StreamParser::PreferredForInternalEncodingDecl(
1734 const nsACString& aEncoding) {
1735 const Encoding* newEncoding = Encoding::ForLabel(aEncoding);
1736 if (!newEncoding) {
1737 // the encoding name is bogus
1738 mTreeBuilder->MaybeComplainAboutCharset("EncMetaUnsupported", true,
1739 mTokenizer->getLineNumber());
1740 return nullptr;
1741 }
1742
1743 if (newEncoding == UTF_16BE_ENCODING || newEncoding == UTF_16LE_ENCODING) {
1744 mTreeBuilder->MaybeComplainAboutCharset("EncMetaUtf16", true,
1745 mTokenizer->getLineNumber());
1746 newEncoding = UTF_8_ENCODING;
1747 }
1748
1749 if (newEncoding == X_USER_DEFINED_ENCODING) {
1750 // WebKit/Blink hack for Indian and Armenian legacy sites
1751 mTreeBuilder->MaybeComplainAboutCharset("EncMetaUserDefined", true,
1752 mTokenizer->getLineNumber());
1753 newEncoding = WINDOWS_1252_ENCODING;
1754 }
1755
1756 if (newEncoding == mEncoding) {
1757 if (mCharsetSource < kCharsetFromMetaPrescan) {
1758 if (mInitialEncodingWasFromParentFrame) {
1759 mTreeBuilder->MaybeComplainAboutCharset("EncLateMetaFrame", false,
1760 mTokenizer->getLineNumber());
1761 } else {
1762 mTreeBuilder->MaybeComplainAboutCharset("EncLateMeta", false,
1763 mTokenizer->getLineNumber());
1764 }
1765 }
1766 mCharsetSource = kCharsetFromMetaTag; // become confident
1767 mTreeBuilder->SetDocumentCharset(mEncoding, mCharsetSource);
1768 DontGuessEncoding(); // don't feed chardet when confident
1769 return nullptr;
1770 }
1771
1772 return newEncoding;
1773 }
1774
internalEncodingDeclaration(nsHtml5String aEncoding)1775 bool nsHtml5StreamParser::internalEncodingDeclaration(nsHtml5String aEncoding) {
1776 // This code needs to stay in sync with
1777 // nsHtml5MetaScanner::tryCharset. Unfortunately, the
1778 // trickery with member fields there leads to some copy-paste reuse. :-(
1779 NS_ASSERTION(IsParserThread(), "Wrong thread!");
1780 if (mCharsetSource >= kCharsetFromMetaTag) { // this threshold corresponds to
1781 // "confident" in the HTML5 spec
1782 return false;
1783 }
1784
1785 nsString newEncoding16; // Not Auto, because using it to hold nsStringBuffer*
1786 aEncoding.ToString(newEncoding16);
1787 nsAutoCString newEncoding;
1788 CopyUTF16toUTF8(newEncoding16, newEncoding);
1789
1790 auto encoding = PreferredForInternalEncodingDecl(newEncoding);
1791 if (!encoding) {
1792 return false;
1793 }
1794
1795 if (mReparseForbidden) {
1796 // This mReparseForbidden check happens after the call to
1797 // PreferredForInternalEncodingDecl so that if that method calls
1798 // MaybeComplainAboutCharset, its charset complaint wins over the one
1799 // below.
1800 mTreeBuilder->MaybeComplainAboutCharset("EncLateMetaTooLate", true,
1801 mTokenizer->getLineNumber());
1802 return false; // not reparsing even if we wanted to
1803 }
1804
1805 // Avoid having the chardet ask for another restart after this restart
1806 // request.
1807 DontGuessEncoding();
1808 mTreeBuilder->NeedsCharsetSwitchTo(WrapNotNull(encoding), kCharsetFromMetaTag,
1809 mTokenizer->getLineNumber());
1810 FlushTreeOpsAndDisarmTimer();
1811 Interrupt();
1812 // the tree op executor will cause the stream parser to terminate
1813 // if the charset switch request is accepted or it'll uninterrupt
1814 // if the request failed. Note that if the restart request fails,
1815 // we don't bother trying to make chardet resume. Might as well
1816 // assume that chardet-requested restarts would fail, too.
1817 return true;
1818 }
1819
FlushTreeOpsAndDisarmTimer()1820 void nsHtml5StreamParser::FlushTreeOpsAndDisarmTimer() {
1821 NS_ASSERTION(IsParserThread(), "Wrong thread!");
1822 if (mFlushTimerArmed) {
1823 // avoid calling Cancel if the flush timer isn't armed to avoid acquiring
1824 // a mutex
1825 {
1826 mozilla::MutexAutoLock flushTimerLock(mFlushTimerMutex);
1827 mFlushTimer->Cancel();
1828 }
1829 mFlushTimerArmed = false;
1830 }
1831 if (mMode == VIEW_SOURCE_HTML || mMode == VIEW_SOURCE_XML) {
1832 mTokenizer->FlushViewSource();
1833 }
1834 mTreeBuilder->Flush();
1835 nsCOMPtr<nsIRunnable> runnable(mExecutorFlusher);
1836 if (NS_FAILED(DispatchToMain(runnable.forget()))) {
1837 NS_WARNING("failed to dispatch executor flush event");
1838 }
1839 }
1840
ParseAvailableData()1841 void nsHtml5StreamParser::ParseAvailableData() {
1842 MOZ_ASSERT(IsParserThread(), "Wrong thread!");
1843 mTokenizerMutex.AssertCurrentThreadOwns();
1844 MOZ_ASSERT(!mDecodingLocalFileWithoutTokenizing);
1845
1846 if (IsTerminatedOrInterrupted()) {
1847 return;
1848 }
1849
1850 if (mSpeculating && !IsSpeculationEnabled()) {
1851 return;
1852 }
1853
1854 for (;;) {
1855 if (!mFirstBuffer->hasMore()) {
1856 if (mFirstBuffer == mLastBuffer) {
1857 switch (mStreamState) {
1858 case STREAM_BEING_READ:
1859 // never release the last buffer.
1860 if (!mSpeculating) {
1861 // reuse buffer space if not speculating
1862 mFirstBuffer->setStart(0);
1863 mFirstBuffer->setEnd(0);
1864 }
1865 mTreeBuilder->FlushLoads();
1866 {
1867 // Dispatch this runnable unconditionally, because the loads
1868 // that need flushing may have been flushed earlier even if the
1869 // flush right above here did nothing.
1870 nsCOMPtr<nsIRunnable> runnable(mLoadFlusher);
1871 if (NS_FAILED(DispatchToMain(runnable.forget()))) {
1872 NS_WARNING("failed to dispatch load flush event");
1873 }
1874 }
1875 return; // no more data for now but expecting more
1876 case STREAM_ENDED:
1877 if (mAtEOF) {
1878 return;
1879 }
1880 mAtEOF = true;
1881 if (mCharsetSource < kCharsetFromMetaTag) {
1882 if (mInitialEncodingWasFromParentFrame) {
1883 // Unfortunately, this check doesn't take effect for
1884 // cross-origin frames, so cross-origin ad frames that have
1885 // no text and only an image or a Flash embed get the more
1886 // severe message from the next if block. The message is
1887 // technically accurate, though.
1888 mTreeBuilder->MaybeComplainAboutCharset("EncNoDeclarationFrame",
1889 false, 0);
1890 } else if (mMode == NORMAL) {
1891 mTreeBuilder->MaybeComplainAboutCharset("EncNoDeclaration",
1892 true, 0);
1893 } else if (mMode == PLAIN_TEXT) {
1894 mTreeBuilder->MaybeComplainAboutCharset("EncNoDeclarationPlain",
1895 true, 0);
1896 }
1897 }
1898 if (NS_SUCCEEDED(mTreeBuilder->IsBroken())) {
1899 mTokenizer->eof();
1900 nsresult rv;
1901 if (NS_FAILED((rv = mTreeBuilder->IsBroken()))) {
1902 MarkAsBroken(rv);
1903 } else {
1904 mTreeBuilder->StreamEnded();
1905 if (mMode == VIEW_SOURCE_HTML || mMode == VIEW_SOURCE_XML) {
1906 mTokenizer->EndViewSource();
1907 }
1908 }
1909 }
1910 FlushTreeOpsAndDisarmTimer();
1911 return; // no more data and not expecting more
1912 default:
1913 MOZ_ASSERT_UNREACHABLE("It should be impossible to reach this.");
1914 return;
1915 }
1916 }
1917 mFirstBuffer = mFirstBuffer->next;
1918 continue;
1919 }
1920
1921 // now we have a non-empty buffer
1922 mFirstBuffer->adjust(mLastWasCR);
1923 mLastWasCR = false;
1924 if (mFirstBuffer->hasMore()) {
1925 if (!mTokenizer->EnsureBufferSpace(mFirstBuffer->getLength())) {
1926 MarkAsBroken(NS_ERROR_OUT_OF_MEMORY);
1927 return;
1928 }
1929 mLastWasCR = mTokenizer->tokenizeBuffer(mFirstBuffer);
1930 nsresult rv;
1931 if (NS_FAILED((rv = mTreeBuilder->IsBroken()))) {
1932 MarkAsBroken(rv);
1933 return;
1934 }
1935 // At this point, internalEncodingDeclaration() may have called
1936 // Terminate, but that never happens together with script.
1937 // Can't assert that here, though, because it's possible that the main
1938 // thread has called Terminate() while this thread was parsing.
1939 if (mTreeBuilder->HasScript()) {
1940 // HasScript() cannot return true if the tree builder is preventing
1941 // script execution.
1942 MOZ_ASSERT(mMode == NORMAL);
1943 mozilla::MutexAutoLock speculationAutoLock(mSpeculationMutex);
1944 nsHtml5Speculation* speculation = new nsHtml5Speculation(
1945 mFirstBuffer, mFirstBuffer->getStart(), mTokenizer->getLineNumber(),
1946 mTreeBuilder->newSnapshot());
1947 mTreeBuilder->AddSnapshotToScript(speculation->GetSnapshot(),
1948 speculation->GetStartLineNumber());
1949 FlushTreeOpsAndDisarmTimer();
1950 mTreeBuilder->SetOpSink(speculation);
1951 mSpeculations.AppendElement(speculation); // adopts the pointer
1952 mSpeculating = true;
1953 }
1954 if (IsTerminatedOrInterrupted()) {
1955 return;
1956 }
1957 }
1958 }
1959 }
1960
1961 class nsHtml5StreamParserContinuation : public Runnable {
1962 private:
1963 nsHtml5StreamParserPtr mStreamParser;
1964
1965 public:
nsHtml5StreamParserContinuation(nsHtml5StreamParser * aStreamParser)1966 explicit nsHtml5StreamParserContinuation(nsHtml5StreamParser* aStreamParser)
1967 : Runnable("nsHtml5StreamParserContinuation"),
1968 mStreamParser(aStreamParser) {}
Run()1969 NS_IMETHOD Run() override {
1970 mozilla::MutexAutoLock autoLock(mStreamParser->mTokenizerMutex);
1971 mStreamParser->Uninterrupt();
1972 mStreamParser->ParseAvailableData();
1973 return NS_OK;
1974 }
1975 };
1976
ContinueAfterScripts(nsHtml5Tokenizer * aTokenizer,nsHtml5TreeBuilder * aTreeBuilder,bool aLastWasCR)1977 void nsHtml5StreamParser::ContinueAfterScripts(nsHtml5Tokenizer* aTokenizer,
1978 nsHtml5TreeBuilder* aTreeBuilder,
1979 bool aLastWasCR) {
1980 NS_ASSERTION(NS_IsMainThread(), "Wrong thread!");
1981 NS_ASSERTION(!(mMode == VIEW_SOURCE_HTML || mMode == VIEW_SOURCE_XML),
1982 "ContinueAfterScripts called in view source mode!");
1983 if (NS_FAILED(mExecutor->IsBroken())) {
1984 return;
1985 }
1986 #ifdef DEBUG
1987 mExecutor->AssertStageEmpty();
1988 #endif
1989 bool speculationFailed = false;
1990 {
1991 mozilla::MutexAutoLock speculationAutoLock(mSpeculationMutex);
1992 if (mSpeculations.IsEmpty()) {
1993 MOZ_ASSERT_UNREACHABLE(
1994 "ContinueAfterScripts called without "
1995 "speculations.");
1996 return;
1997 }
1998
1999 const auto& speculation = mSpeculations.ElementAt(0);
2000 if (aLastWasCR || !aTokenizer->isInDataState() ||
2001 !aTreeBuilder->snapshotMatches(speculation->GetSnapshot())) {
2002 speculationFailed = true;
2003 // We've got a failed speculation :-(
2004 MaybeDisableFutureSpeculation();
2005 Interrupt(); // Make the parser thread release the tokenizer mutex sooner
2006 // now fall out of the speculationAutoLock into the tokenizerAutoLock
2007 // block
2008 } else {
2009 // We've got a successful speculation!
2010 if (mSpeculations.Length() > 1) {
2011 // the first speculation isn't the current speculation, so there's
2012 // no need to bother the parser thread.
2013 speculation->FlushToSink(mExecutor);
2014 NS_ASSERTION(!mExecutor->IsScriptExecuting(),
2015 "ParseUntilBlocked() was supposed to ensure we don't come "
2016 "here when scripts are executing.");
2017 NS_ASSERTION(
2018 mExecutor->IsInFlushLoop(),
2019 "How are we here if "
2020 "RunFlushLoop() didn't call ParseUntilBlocked() which is the "
2021 "only caller of this method?");
2022 mSpeculations.RemoveElementAt(0);
2023 return;
2024 }
2025 // else
2026 Interrupt(); // Make the parser thread release the tokenizer mutex sooner
2027
2028 // now fall through
2029 // the first speculation is the current speculation. Need to
2030 // release the the speculation mutex and acquire the tokenizer
2031 // mutex. (Just acquiring the other mutex here would deadlock)
2032 }
2033 }
2034 {
2035 mozilla::MutexAutoLock tokenizerAutoLock(mTokenizerMutex);
2036 #ifdef DEBUG
2037 {
2038 mAtomTable.SetPermittedLookupEventTarget(
2039 GetMainThreadSerialEventTarget());
2040 }
2041 #endif
2042 // In principle, the speculation mutex should be acquired here,
2043 // but there's no point, because the parser thread only acquires it
2044 // when it has also acquired the tokenizer mutex and we are already
2045 // holding the tokenizer mutex.
2046 if (speculationFailed) {
2047 // Rewind the stream
2048 mAtEOF = false;
2049 const auto& speculation = mSpeculations.ElementAt(0);
2050 mFirstBuffer = speculation->GetBuffer();
2051 mFirstBuffer->setStart(speculation->GetStart());
2052 mTokenizer->setLineNumber(speculation->GetStartLineNumber());
2053
2054 nsContentUtils::ReportToConsole(
2055 nsIScriptError::warningFlag, "DOM Events"_ns,
2056 mExecutor->GetDocument(), nsContentUtils::eDOM_PROPERTIES,
2057 "SpeculationFailed", nsTArray<nsString>(), nullptr, u""_ns,
2058 speculation->GetStartLineNumber());
2059
2060 nsHtml5OwningUTF16Buffer* buffer = mFirstBuffer->next;
2061 while (buffer) {
2062 buffer->setStart(0);
2063 buffer = buffer->next;
2064 }
2065
2066 mSpeculations.Clear(); // potentially a huge number of destructors
2067 // run here synchronously on the main thread...
2068
2069 mTreeBuilder->flushCharacters(); // empty the pending buffer
2070 mTreeBuilder->ClearOps(); // now get rid of the failed ops
2071
2072 mTreeBuilder->SetOpSink(mExecutor->GetStage());
2073 mExecutor->StartReadingFromStage();
2074 mSpeculating = false;
2075
2076 // Copy state over
2077 mLastWasCR = aLastWasCR;
2078 mTokenizer->loadState(aTokenizer);
2079 mTreeBuilder->loadState(aTreeBuilder);
2080 } else {
2081 // We've got a successful speculation and at least a moment ago it was
2082 // the current speculation
2083 mSpeculations.ElementAt(0)->FlushToSink(mExecutor);
2084 NS_ASSERTION(!mExecutor->IsScriptExecuting(),
2085 "ParseUntilBlocked() was supposed to ensure we don't come "
2086 "here when scripts are executing.");
2087 NS_ASSERTION(
2088 mExecutor->IsInFlushLoop(),
2089 "How are we here if "
2090 "RunFlushLoop() didn't call ParseUntilBlocked() which is the "
2091 "only caller of this method?");
2092 mSpeculations.RemoveElementAt(0);
2093 if (mSpeculations.IsEmpty()) {
2094 // yes, it was still the only speculation. Now stop speculating
2095 // However, before telling the executor to read from stage, flush
2096 // any pending ops straight to the executor, because otherwise
2097 // they remain unflushed until we get more data from the network.
2098 mTreeBuilder->SetOpSink(mExecutor);
2099 mTreeBuilder->Flush(true);
2100 mTreeBuilder->SetOpSink(mExecutor->GetStage());
2101 mExecutor->StartReadingFromStage();
2102 mSpeculating = false;
2103 }
2104 }
2105 nsCOMPtr<nsIRunnable> event = new nsHtml5StreamParserContinuation(this);
2106 if (NS_FAILED(mEventTarget->Dispatch(event, nsIThread::DISPATCH_NORMAL))) {
2107 NS_WARNING("Failed to dispatch nsHtml5StreamParserContinuation");
2108 }
2109 // A stream event might run before this event runs, but that's harmless.
2110 #ifdef DEBUG
2111 mAtomTable.SetPermittedLookupEventTarget(mEventTarget);
2112 #endif
2113 }
2114 }
2115
ContinueAfterFailedCharsetSwitch()2116 void nsHtml5StreamParser::ContinueAfterFailedCharsetSwitch() {
2117 NS_ASSERTION(NS_IsMainThread(), "Wrong thread!");
2118 nsCOMPtr<nsIRunnable> event = new nsHtml5StreamParserContinuation(this);
2119 if (NS_FAILED(mEventTarget->Dispatch(event, nsIThread::DISPATCH_NORMAL))) {
2120 NS_WARNING("Failed to dispatch nsHtml5StreamParserContinuation");
2121 }
2122 }
2123
2124 class nsHtml5TimerKungFu : public Runnable {
2125 private:
2126 nsHtml5StreamParserPtr mStreamParser;
2127
2128 public:
nsHtml5TimerKungFu(nsHtml5StreamParser * aStreamParser)2129 explicit nsHtml5TimerKungFu(nsHtml5StreamParser* aStreamParser)
2130 : Runnable("nsHtml5TimerKungFu"), mStreamParser(aStreamParser) {}
Run()2131 NS_IMETHOD Run() override {
2132 mozilla::MutexAutoLock flushTimerLock(mStreamParser->mFlushTimerMutex);
2133 if (mStreamParser->mFlushTimer) {
2134 mStreamParser->mFlushTimer->Cancel();
2135 mStreamParser->mFlushTimer = nullptr;
2136 }
2137 return NS_OK;
2138 }
2139 };
2140
DropTimer()2141 void nsHtml5StreamParser::DropTimer() {
2142 NS_ASSERTION(NS_IsMainThread(), "Wrong thread!");
2143 /*
2144 * Simply nulling out the timer wouldn't work, because if the timer is
2145 * armed, it needs to be canceled first. Simply canceling it first wouldn't
2146 * work, because nsTimerImpl::Cancel is not safe for calling from outside
2147 * the thread where nsTimerImpl::Fire would run. It's not safe to
2148 * dispatch a runnable to cancel the timer from the destructor of this
2149 * class, because the timer has a weak (void*) pointer back to this instance
2150 * of the stream parser and having the timer fire before the runnable
2151 * cancels it would make the timer access a deleted object.
2152 *
2153 * This DropTimer method addresses these issues. This method must be called
2154 * on the main thread before the destructor of this class is reached.
2155 * The nsHtml5TimerKungFu object has an nsHtml5StreamParserPtr that addrefs
2156 * this
2157 * stream parser object to keep it alive until the runnable is done.
2158 * The runnable cancels the timer on the parser thread, drops the timer
2159 * and lets nsHtml5StreamParserPtr send a runnable back to the main thread to
2160 * release the stream parser.
2161 */
2162 mozilla::MutexAutoLock flushTimerLock(mFlushTimerMutex);
2163 if (mFlushTimer) {
2164 nsCOMPtr<nsIRunnable> event = new nsHtml5TimerKungFu(this);
2165 if (NS_FAILED(mEventTarget->Dispatch(event, nsIThread::DISPATCH_NORMAL))) {
2166 NS_WARNING("Failed to dispatch TimerKungFu event");
2167 }
2168 }
2169 }
2170
2171 // Using a static, because the method name Notify is taken by the chardet
2172 // callback.
TimerCallback(nsITimer * aTimer,void * aClosure)2173 void nsHtml5StreamParser::TimerCallback(nsITimer* aTimer, void* aClosure) {
2174 (static_cast<nsHtml5StreamParser*>(aClosure))->TimerFlush();
2175 }
2176
TimerFlush()2177 void nsHtml5StreamParser::TimerFlush() {
2178 NS_ASSERTION(IsParserThread(), "Wrong thread!");
2179 mozilla::MutexAutoLock autoLock(mTokenizerMutex);
2180
2181 NS_ASSERTION(!mSpeculating, "Flush timer fired while speculating.");
2182
2183 // The timer fired if we got here. No need to cancel it. Mark it as
2184 // not armed, though.
2185 mFlushTimerArmed = false;
2186
2187 mFlushTimerEverFired = true;
2188
2189 if (IsTerminatedOrInterrupted()) {
2190 return;
2191 }
2192
2193 if (mMode == VIEW_SOURCE_HTML || mMode == VIEW_SOURCE_XML) {
2194 mTreeBuilder->Flush(); // delete useless ops
2195 if (mTokenizer->FlushViewSource()) {
2196 nsCOMPtr<nsIRunnable> runnable(mExecutorFlusher);
2197 if (NS_FAILED(DispatchToMain(runnable.forget()))) {
2198 NS_WARNING("failed to dispatch executor flush event");
2199 }
2200 }
2201 } else {
2202 // we aren't speculating and we don't know when new data is
2203 // going to arrive. Send data to the main thread.
2204 if (mTreeBuilder->Flush(true)) {
2205 nsCOMPtr<nsIRunnable> runnable(mExecutorFlusher);
2206 if (NS_FAILED(DispatchToMain(runnable.forget()))) {
2207 NS_WARNING("failed to dispatch executor flush event");
2208 }
2209 }
2210 }
2211 }
2212
MarkAsBroken(nsresult aRv)2213 void nsHtml5StreamParser::MarkAsBroken(nsresult aRv) {
2214 NS_ASSERTION(IsParserThread(), "Wrong thread!");
2215 mTokenizerMutex.AssertCurrentThreadOwns();
2216
2217 Terminate();
2218 mTreeBuilder->MarkAsBroken(aRv);
2219 mozilla::DebugOnly<bool> hadOps = mTreeBuilder->Flush(false);
2220 NS_ASSERTION(hadOps, "Should have had the markAsBroken op!");
2221 nsCOMPtr<nsIRunnable> runnable(mExecutorFlusher);
2222 if (NS_FAILED(DispatchToMain(runnable.forget()))) {
2223 NS_WARNING("failed to dispatch executor flush event");
2224 }
2225 }
2226
DispatchToMain(already_AddRefed<nsIRunnable> && aRunnable)2227 nsresult nsHtml5StreamParser::DispatchToMain(
2228 already_AddRefed<nsIRunnable>&& aRunnable) {
2229 if (mNetworkEventTarget) {
2230 return mNetworkEventTarget->Dispatch(std::move(aRunnable));
2231 }
2232 return SchedulerGroup::UnlabeledDispatch(TaskCategory::Network,
2233 std::move(aRunnable));
2234 }
2235