1 /*
2  * Copyright (C) 2013 Google, Inc. All Rights Reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY GOOGLE INC. ``AS IS'' AND ANY
14  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL GOOGLE INC. OR
17  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24  */
25 
26 #include "third_party/blink/renderer/core/html/parser/background_html_parser.h"
27 
28 #include <memory>
29 #include <utility>
30 
31 #include "base/single_thread_task_runner.h"
32 #include "third_party/blink/public/platform/platform.h"
33 #include "third_party/blink/renderer/core/html/parser/html_document_parser.h"
34 #include "third_party/blink/renderer/core/html/parser/text_resource_decoder.h"
35 #include "third_party/blink/renderer/core/html_names.h"
36 #include "third_party/blink/renderer/platform/instrumentation/tracing/trace_event.h"
37 #include "third_party/blink/renderer/platform/wtf/cross_thread_functional.h"
38 #include "third_party/blink/renderer/platform/wtf/functional.h"
39 #include "third_party/blink/renderer/platform/wtf/text/text_position.h"
40 
41 namespace blink {
42 
// On a network with high latency and high bandwidth, using a device with a fast
// CPU, we could end up speculatively tokenizing the whole document, well ahead
// of when the main thread actually needs it. This is a waste of memory (and
// potentially time if the speculation fails). So we limit our outstanding
// tokens arbitrarily to 10,000. Our maximal memory spent speculating will be
// approximately:
// (kOutstandingTokenLimit + kPendingTokenLimit) * sizeof(CompactHTMLToken)
//
// We use a separate low and high water mark to avoid constantly topping off
// the main thread's token buffer. At time of writing, this is
// (10000 + 1000) * 28 bytes = ~308 KB of memory. These numbers have not been
// tuned.
static constexpr size_t kOutstandingTokenLimit = 10000;

// We limit our chunks to 1000 tokens, to make sure the main thread is never
// waiting on the parser thread for tokens. This was tuned in
// https://bugs.webkit.org/show_bug.cgi?id=110408.
static constexpr size_t kPendingTokenLimit = 1000;

// The per-chunk limit has to stay below the outstanding limit, because the
// outstanding limit is only checked after a chunk has been enqueued.
static_assert(kPendingTokenLimit < kOutstandingTokenLimit,
              "Outstanding token limit is applied after pending token limit.");
64 
Create(std::unique_ptr<Configuration> config,scoped_refptr<base::SingleThreadTaskRunner> loading_task_runner)65 base::WeakPtr<BackgroundHTMLParser> BackgroundHTMLParser::Create(
66     std::unique_ptr<Configuration> config,
67     scoped_refptr<base::SingleThreadTaskRunner> loading_task_runner) {
68   auto* background_parser = new BackgroundHTMLParser(
69       std::move(config), std::move(loading_task_runner));
70   return background_parser->weak_factory_.GetWeakPtr();
71 }
72 
Init(const KURL & document_url,std::unique_ptr<CachedDocumentParameters> cached_document_parameters,const MediaValuesCached::MediaValuesCachedData & media_values_cached_data,bool priority_hints_origin_trial_enabled)73 void BackgroundHTMLParser::Init(
74     const KURL& document_url,
75     std::unique_ptr<CachedDocumentParameters> cached_document_parameters,
76     const MediaValuesCached::MediaValuesCachedData& media_values_cached_data,
77     bool priority_hints_origin_trial_enabled) {
78   TRACE_EVENT1("loading", "BackgroundHTMLParser::Init", "url",
79                document_url.GetString().Utf8());
80   preload_scanner_.reset(new TokenPreloadScanner(
81       document_url, std::move(cached_document_parameters),
82       media_values_cached_data, TokenPreloadScanner::ScannerType::kMainDocument,
83       priority_hints_origin_trial_enabled));
84 }
85 
Configuration()86 BackgroundHTMLParser::Configuration::Configuration() {}
87 
// Builds the parser from |config| and |loading_task_runner|.
//
// |config| is dereferenced unconditionally, so it must be non-null. Its parser
// options are copied into the tokenizer, the tree builder simulator and
// |options_|; its parser handle is copied; its decoder is moved out. The CSP
// meta token index starts at kNoPendingToken and no script-starting chunk is
// pending.
BackgroundHTMLParser::BackgroundHTMLParser(
    std::unique_ptr<Configuration> config,
    scoped_refptr<base::SingleThreadTaskRunner> loading_task_runner)
    : token_(std::make_unique<HTMLToken>()),
      tokenizer_(std::make_unique<HTMLTokenizer>(config->options)),
      tree_builder_simulator_(config->options),
      options_(config->options),
      parser_(config->parser),
      // Only the decoder member is moved from; the other |config| fields are
      // left intact.
      decoder_(std::move(config->decoder)),
      loading_task_runner_(std::move(loading_task_runner)),
      pending_csp_meta_token_index_(
          HTMLDocumentParser::TokenizedChunk::kNoPendingToken),
      starting_script_(false) {}
101 
// Compiler-generated destruction: nothing is released by hand here.
BackgroundHTMLParser::~BackgroundHTMLParser() = default;
103 
AppendRawBytesFromMainThread(std::unique_ptr<Vector<char>> buffer)104 void BackgroundHTMLParser::AppendRawBytesFromMainThread(
105     std::unique_ptr<Vector<char>> buffer) {
106   TRACE_EVENT0("loading", "BackgroundHTMLParser::AppendRawBytesFromMainThread");
107   DCHECK(decoder_);
108   UpdateDocument(decoder_->Decode(buffer->data(), buffer->size()));
109 }
110 
AppendDecodedBytes(const String & input)111 void BackgroundHTMLParser::AppendDecodedBytes(const String& input) {
112   DCHECK(!input_.Current().IsClosed());
113   input_.Append(input);
114   PumpTokenizer();
115 }
116 
SetDecoder(std::unique_ptr<TextResourceDecoder> decoder)117 void BackgroundHTMLParser::SetDecoder(
118     std::unique_ptr<TextResourceDecoder> decoder) {
119   DCHECK(decoder);
120   decoder_ = std::move(decoder);
121 }
122 
Flush()123 void BackgroundHTMLParser::Flush() {
124   DCHECK(decoder_);
125   UpdateDocument(decoder_->Flush());
126 }
127 
UpdateDocument(const String & decoded_data)128 void BackgroundHTMLParser::UpdateDocument(const String& decoded_data) {
129   DocumentEncodingData encoding_data(*decoder_.get());
130   if (encoding_data != last_seen_encoding_data_) {
131     last_seen_encoding_data_ = encoding_data;
132     if (parser_)
133       parser_->DidReceiveEncodingDataFromBackgroundParser(encoding_data);
134   }
135   if (decoded_data.IsEmpty())
136     return;
137 
138   AppendDecodedBytes(decoded_data);
139 }
140 
ResumeFrom(std::unique_ptr<Checkpoint> checkpoint)141 void BackgroundHTMLParser::ResumeFrom(std::unique_ptr<Checkpoint> checkpoint) {
142   parser_ = checkpoint->parser;
143   token_ = std::move(checkpoint->token);
144   tokenizer_ = std::move(checkpoint->tokenizer);
145   tree_builder_simulator_.SetState(checkpoint->tree_builder_state);
146   input_.RewindTo(checkpoint->input_checkpoint, checkpoint->unparsed_input);
147   preload_scanner_->RewindTo(checkpoint->preload_scanner_checkpoint);
148   starting_script_ = false;
149   PumpTokenizer();
150 }
151 
StartedChunkWithCheckpoint(HTMLInputCheckpoint input_checkpoint)152 void BackgroundHTMLParser::StartedChunkWithCheckpoint(
153     HTMLInputCheckpoint input_checkpoint) {
154   // Note, we should not have to worry about the index being invalid as messages
155   // from the main thread will be processed in FIFO order.
156   input_.InvalidateCheckpointsBefore(input_checkpoint);
157   PumpTokenizer();
158 }
159 
Finish()160 void BackgroundHTMLParser::Finish() {
161   MarkEndOfFile();
162   PumpTokenizer();
163 }
164 
Stop()165 void BackgroundHTMLParser::Stop() {
166   ClearParser();
167   delete this;
168 }
169 
ForcePlaintextForTextDocument()170 void BackgroundHTMLParser::ForcePlaintextForTextDocument() {
171   // This is only used by the TextDocumentParser (a subclass of
172   // HTMLDocumentParser) to force us into the PLAINTEXT state w/o using a
173   // <plaintext> tag. The TextDocumentParser uses a <pre> tag for historical /
174   // compatibility reasons.
175   tokenizer_->SetState(HTMLTokenizer::kPLAINTEXTState);
176 }
177 
ClearParser()178 void BackgroundHTMLParser::ClearParser() {
179   parser_.Clear();
180 }
181 
MarkEndOfFile()182 void BackgroundHTMLParser::MarkEndOfFile() {
183   DCHECK(!input_.Current().IsClosed());
184   input_.Append(String(&kEndOfFileMarker, 1));
185   input_.Close();
186 }
187 
PumpTokenizer()188 void BackgroundHTMLParser::PumpTokenizer() {
189   TRACE_EVENT0("loading", "BackgroundHTMLParser::pumpTokenizer");
190   HTMLTreeBuilderSimulator::SimulatedToken simulated_token =
191       HTMLTreeBuilderSimulator::kOtherToken;
192 
193   // No need to start speculating until the main thread has almost caught up.
194   if (input_.TotalCheckpointTokenCount() > kOutstandingTokenLimit)
195     return;
196 
197   while (tokenizer_->NextToken(input_.Current(), *token_)) {
198     {
199       TextPosition position = TextPosition(input_.Current().CurrentLine(),
200                                            input_.Current().CurrentColumn());
201 
202       CompactHTMLToken token(token_.get(), position);
203       bool is_csp_meta_tag = false;
204       preload_scanner_->Scan(token, input_.Current(), pending_preloads_,
205                              &viewport_description_, &is_csp_meta_tag);
206 
207       simulated_token =
208           tree_builder_simulator_.Simulate(token, tokenizer_.get());
209 
210       // Break chunks before a script tag is inserted and flag the chunk as
211       // starting a script so the main parser can decide if it should yield
212       // before processing the chunk.
213       if (simulated_token == HTMLTreeBuilderSimulator::kValidScriptStart) {
214         EnqueueTokenizedChunk();
215         starting_script_ = true;
216       }
217 
218       pending_tokens_.push_back(token);
219       if (is_csp_meta_tag) {
220         pending_csp_meta_token_index_ = pending_tokens_.size() - 1;
221       }
222     }
223 
224     token_->Clear();
225 
226     if (simulated_token == HTMLTreeBuilderSimulator::kScriptEnd ||
227         simulated_token == HTMLTreeBuilderSimulator::kStyleEnd ||
228         simulated_token == HTMLTreeBuilderSimulator::kLink ||
229         simulated_token == HTMLTreeBuilderSimulator::kCustomElementBegin ||
230         pending_tokens_.size() >= kPendingTokenLimit) {
231       EnqueueTokenizedChunk();
232 
233       // If we're far ahead of the main thread, yield for a bit to avoid
234       // consuming too much memory.
235       if (input_.TotalCheckpointTokenCount() > kOutstandingTokenLimit)
236         break;
237     }
238   }
239 
240   EnqueueTokenizedChunk();
241 }
242 
EnqueueTokenizedChunk()243 void BackgroundHTMLParser::EnqueueTokenizedChunk() {
244   if (pending_tokens_.IsEmpty())
245     return;
246 
247   auto chunk = std::make_unique<HTMLDocumentParser::TokenizedChunk>();
248   TRACE_EVENT_WITH_FLOW0("blink,loading",
249                          "BackgroundHTMLParser::sendTokensToMainThread",
250                          chunk.get(), TRACE_EVENT_FLAG_FLOW_OUT);
251 
252   chunk->preloads.swap(pending_preloads_);
253   if (viewport_description_.has_value())
254     chunk->viewport = viewport_description_;
255   chunk->tokenizer_state = tokenizer_->GetState();
256   chunk->tree_builder_state = tree_builder_simulator_.GetState();
257   chunk->input_checkpoint = input_.CreateCheckpoint(pending_tokens_.size());
258   chunk->preload_scanner_checkpoint = preload_scanner_->CreateCheckpoint();
259   chunk->tokens.swap(pending_tokens_);
260   chunk->starting_script = starting_script_;
261   chunk->pending_csp_meta_token_index = pending_csp_meta_token_index_;
262   starting_script_ = false;
263   pending_csp_meta_token_index_ =
264       HTMLDocumentParser::TokenizedChunk::kNoPendingToken;
265 
266   if (parser_)
267     parser_->EnqueueTokenizedChunk(std::move(chunk));
268 }
269 
270 }  // namespace blink
271