1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "content/browser/download/mhtml_generation_manager.h"
6 
7 #include <utility>
8 
9 #include "base/bind.h"
10 #include "base/containers/queue.h"
11 #include "base/files/file.h"
12 #include "base/guid.h"
13 #include "base/macros.h"
14 #include "base/memory/ptr_util.h"
15 #include "base/metrics/histogram_macros.h"
16 #include "base/scoped_observer.h"
17 #include "base/stl_util.h"
18 #include "base/strings/string_util.h"
19 #include "base/strings/stringprintf.h"
20 #include "base/task_runner_util.h"
21 #include "base/time/time.h"
22 #include "base/trace_event/trace_event.h"
23 #include "components/download/public/common/download_task_runner.h"
24 #include "content/browser/bad_message.h"
25 #include "content/browser/download/mhtml_extra_parts_impl.h"
26 #include "content/browser/frame_host/frame_tree_node.h"
27 #include "content/browser/frame_host/render_frame_host_impl.h"
28 #include "content/common/download/mhtml_file_writer.mojom.h"
29 #include "content/public/browser/browser_thread.h"
30 #include "content/public/browser/mhtml_extra_parts.h"
31 #include "content/public/browser/mhtml_generation_result.h"
32 #include "content/public/browser/render_frame_host.h"
33 #include "content/public/browser/render_process_host.h"
34 #include "content/public/browser/web_contents.h"
35 #include "content/public/common/mhtml_generation_params.h"
36 #include "crypto/secure_hash.h"
37 #include "crypto/sha2.h"
38 #include "mojo/core/embedder/embedder.h"
39 #include "mojo/public/cpp/bindings/associated_remote.h"
40 #include "net/base/mime_util.h"
41 #include "third_party/blink/public/common/associated_interfaces/associated_interface_provider.h"
42 
43 namespace {
44 
// Callback used to tell the UI thread that streaming MHTML data into the file
// has finished; the argument is the resulting save status.
using MHTMLWriteCompleteCallback =
    base::RepeatingCallback<void(content::mojom::MhtmlSaveStatus)>;
48 
// MIME header field prefixes (each includes the trailing ": " separator).
const char kContentLocation[] = "Content-Location: ";
const char kContentType[] = "Content-Type: ";

// Sentinel file size. It is negative so it can never be mistaken for a real
// size, and constexpr so that no code can accidentally reassign it.
// NOTE(review): presumably reported when the file could not be closed
// successfully -- confirm against the finalization code.
constexpr int kInvalidFileSize = -1;
52 
53 // CloseFileResult holds the result of closing the generated file using the
54 // status of the operation, a file size and a pointer to a file digest. It
55 // stores the values of the status and size directly, and makes a copy of the
56 // digest if present.
57 struct CloseFileResult {
CloseFileResult__anon30b841df0111::CloseFileResult58   CloseFileResult(content::mojom::MhtmlSaveStatus status,
59                   int64_t size,
60                   std::string* digest)
61       : save_status(status), file_size(size) {
62     if (digest)
63       file_digest = base::Optional<std::string>(*digest);
64   }
65 
66   content::mojom::MhtmlSaveStatus save_status;
67   int64_t file_size;
68   base::Optional<std::string> file_digest;
69 
toMHTMLGenerationResult__anon30b841df0111::CloseFileResult70   content::MHTMLGenerationResult toMHTMLGenerationResult() const {
71     return content::MHTMLGenerationResult(file_size,
72                                           base::OptionalOrNullptr(file_digest));
73   }
74 };
75 
// Creates (or truncates) the file at |file_path| and returns a write-only
// handle to it. Must run on the download task runner. The returned file is
// invalid when creation failed; callers are expected to check IsValid().
base::File CreateMHTMLFile(const base::FilePath& file_path) {
  DCHECK(download::GetDownloadTaskRunner()->RunsTasksInCurrentSequence());

  // SECURITY NOTE: A file descriptor to the file created below will be passed
  // to multiple renderer processes which (in out-of-process iframes mode) can
  // act on behalf of separate web principals.  Therefore the file is opened
  // for writing only: read access would let one renderer read content that was
  // generated by other renderers / other web principals.
  base::File browser_file(
      file_path, base::File::FLAG_CREATE_ALWAYS | base::File::FLAG_WRITE);
  if (browser_file.IsValid())
    return browser_file;

  DLOG(ERROR) << "Failed to create file to save MHTML at: "
              << file_path.value();
  return browser_file;
}
94 
95 }  // namespace
96 
97 namespace content {
98 
// The class and all of its members live on the UI thread.  Only static methods
// are executed on other threads.
// NOTE(review): BeginWatchingHandle(), WriteMHTMLToDisk() and OnWriteComplete()
// are non-static yet DCHECK that they run on the download task runner, so the
// statement above does not hold for them -- confirm the intended threading.
// Job instances are created in MHTMLGenerationManager::Job::StartNewJob(),
// proceeding with the MHTML saving process unmanaged. Every instance is
// self-owned and responsible for deleting itself upon invoking OnFinished.
// With self-ownership lifetime concerns, we make the following precautions:
// - SerializeAsMHTMLResponse() always proceeds with finalizing upon detecting
//   Job completion/cancellation.
// - Jobs are prematurely finalized and deleted upon detecting a connection
//   error with the message pipe during serialization.
// - Any pending callbacks after deletion are invalidated using weak pointers.
class MHTMLGenerationManager::Job {
 public:
  // Creates and registers a new job.
  static void StartNewJob(
      WebContents* web_contents,
      const MHTMLGenerationParams& params,
      MHTMLGenerationResult::GenerateMHTMLCallback callback);

 private:
  // The constructor immediately calls initializeJob(), so constructing a Job
  // starts the save operation.
  Job(WebContents* web_contents,
      const MHTMLGenerationParams& params,
      MHTMLGenerationResult::GenerateMHTMLCallback callback);
  ~Job();

  // Begins queuing frames from web_contents, creates a new MHTML file and
  // begins page serialization to created file.
  void initializeJob(WebContents* web_contents);

  // Writes the string |to_write| to the file. If successful, updates hash and
  // returns true, otherwise, returns false. Does not take ownership of |file|
  // nor |secure_hash|.
  static bool WriteToFileAndUpdateHash(base::File* file,
                                       crypto::SecureHash* secure_hash,
                                       std::string to_write);

  // Writes the MHTML footer to the file and closes it. It also receives the
  // SimpleWatcher instance used to watch the data pipe and the current hash
  // state for safe destruction on the IO thread.
  //
  // Note: The same |boundary| marker must be used for all "boundaries" -- in
  // the header, parts and footer -- that belong to the same MHTML document (see
  // also rfc1341, section 7.2.1, "boundary" description).
  static CloseFileResult FinalizeOnFileThread(
      mojom::MhtmlSaveStatus save_status,
      const std::string& boundary,
      base::File file,
      const std::vector<MHTMLExtraDataPart>& extra_data_parts,
      std::unique_ptr<mojo::SimpleWatcher> watcher,
      std::unique_ptr<crypto::SecureHash> secure_hash);

  // Appends the frame tree node id of |render_frame_host| to
  // |pending_frame_tree_node_ids_|.
  void AddFrame(RenderFrameHost* render_frame_host);

  // Creates a string that encompasses any remaining extra data parts to write
  // to the file.
  static std::string CreateExtraDataParts(
      const std::string& boundary,
      const std::vector<MHTMLExtraDataPart>& extra_data_parts);

  // Creates a string with the contents of the MHTML file footer.
  static std::string CreateFooter(const std::string& boundary);

  // Called on the UI thread when the file that should hold the MHTML data has
  // been created.
  void OnFileAvailable(base::File browser_file);

  // Called on the UI thread after the file got finalized and we have its size,
  // or an error occurred while creating a new file. Runs |callback_| and then
  // deletes the Job.
  void OnFinished(const CloseFileResult& result);

  // Starts watching a handle on the file thread. Instantiates a new instance
  // of |watcher_| upon call.
  void BeginWatchingHandle(MHTMLWriteCompleteCallback callback);

  // Writes data from the consumer handle to the new MHTML file. Only used
  // when the contents hash is computed on the fly.
  // Bound to the data pipe watcher and called upon notification of write
  // completion to producer pipe sent to the Renderer.
  // TODO(https://crbug.com/915966): Eventually simplify this implementation
  // with a DataPipeDrainer once error signalling is implemented there.
  void WriteMHTMLToDisk(MHTMLWriteCompleteCallback callback,
                        MojoResult result,
                        const mojo::HandleSignalsState& state);

  // Destroys |watcher_| instance and notifies UI thread of write completion.
  void OnWriteComplete(MHTMLWriteCompleteCallback callback,
                       mojom::MhtmlSaveStatus save_status);

  // Notifies Job of frame write completion and sends request to next render
  // frame if the response was blocked by the write operation. Does nothing
  // once the Job has been marked as finished.
  void DoneWritingToDisk(mojom::MhtmlSaveStatus save_status);

  // Called when the message pipe to the renderer is disconnected.
  void OnConnectionError();

  // Handler for the Mojo interface callback (a notification from the
  // renderer that the MHTML generation for previous frame has finished).
  void SerializeAsMHTMLResponse(
      mojom::MhtmlSaveStatus save_status,
      const std::vector<std::string>& digests_of_uris_of_serialized_resources,
      base::TimeDelta renderer_main_thread_time);

  // Records newly serialized resource digests into
  // |digests_of_already_serialized_uris_|. Also closes out the wait-time
  // measurement started when the request was sent to the renderer.
  void RecordDigests(
      const std::vector<std::string>& digests_of_uris_of_serialized_resources);

  // Continues sending serialization requests to the next frame if ready and
  // there are more frames to be serialized.
  void MaybeSendToNextRenderFrame(mojom::MhtmlSaveStatus save_status);

  // Packs up the current status of the MHTML file save operation into a Mojo
  // struct to send to the renderer process.
  mojom::SerializeAsMHTMLParamsPtr CreateMojoParams();

  // Sends Mojo interface call to the renderer, asking for MHTML
  // generation of the next frame. Returns MhtmlSaveStatus::kSuccess or a
  // specific error status.
  mojom::MhtmlSaveStatus SendToNextRenderFrame();

  // Indicates if the writing operation on the IO thread is complete, and
  // we have received a response from the Renderer.
  // This check is necessary to provide synchronization between file writing
  // operations and MHTML serialization.
  bool CurrentFrameDone() const;

  // Called on the UI thread when a job has been finished.
  void Finalize(mojom::MhtmlSaveStatus save_status);

  // Write the MHTML footer and close the file on the file thread and respond
  // back on the UI thread with the updated status and file size (which will be
  // negative in case of errors).
  void CloseFile(mojom::MhtmlSaveStatus save_status);

  // Marks the Job as completed, preventing any further notifications from the
  // Renderer. This prevents the race/crash from https://crbug.com/612098.
  void MarkAsFinished();

  // Adds |renderer_main_thread_time| to the running total and tracks the
  // longest single value seen so far.
  void ReportRendererMainThreadTime(base::TimeDelta renderer_main_thread_time);

  // Close the MHTML file if it looks good, setting the size param.  Returns
  // false for failure.
  static bool CloseFileIfValid(base::File& file, int64_t* file_size);

  // Time tracking for performance metrics reporting.
  const base::TimeTicks creation_time_;
  base::TimeTicks wait_on_renderer_start_time_;
  base::TimeDelta all_renderers_wait_time_;
  base::TimeDelta all_renderers_main_thread_time_;
  base::TimeDelta longest_renderer_main_thread_time_;

  // User-configurable parameters. Includes the file location, binary encoding
  // choices.
  MHTMLGenerationParams params_;

  // The IDs of frames that still need to be processed.
  base::queue<int> pending_frame_tree_node_ids_;

  // Identifies a frame to which we've sent through
  // MhtmlFileWriter::SerializeAsMHTML but for which we didn't yet process
  // the response via SerializeAsMHTMLResponse.
  int frame_tree_node_id_of_busy_frame_;

  // The handle to the file the MHTML is saved to for the browser process.
  base::File browser_file_;

  // MIME multipart boundary to use in the MHTML doc.
  const std::string mhtml_boundary_marker_;

  // Digests of URIs of already generated MHTML parts.
  std::set<std::string> digests_of_already_serialized_uris_;
  // Salt sent to the renderer together with the digests above; generated once
  // per Job.
  std::string salt_;

  // The callback to call once generation is complete.
  MHTMLGenerationResult::GenerateMHTMLCallback callback_;

  // Whether the job is finished (set to true only for the short duration of
  // time between MHTMLGenerationManager::Job::Finalize is called and the job is
  // destroyed by MHTMLGenerationManager::Job::OnFinished).
  bool is_finished_;

  // Any extra data parts that should be emitted into the output MHTML.
  std::vector<MHTMLExtraDataPart> extra_data_parts_;

  // MHTMLFileWriter instance for the frame being currently serialized.
  mojo::AssociatedRemote<mojom::MhtmlFileWriter> writer_;

  // Watcher to detect new data written to |mhtml_data_consumer_|.
  // This is instantiated and destroyed in the download sequence for each frame.
  std::unique_ptr<mojo::SimpleWatcher> watcher_;

  // Consumer handle for data pipe streaming.
  mojo::ScopedDataPipeConsumerHandle mhtml_data_consumer_;

  // Indicates whether there is currently data being streamed from the Renderer.
  // Not used when the renderer is writing directly to file.
  bool waiting_on_data_streaming_;

  // Current state of contents hash computation.
  // This is updated upon every successful file write and finalized in the
  // download sequence.
  std::unique_ptr<crypto::SecureHash> secure_hash_;

  // Produces the weak pointers bound into asynchronous callbacks.
  base::WeakPtrFactory<Job> weak_factory_{this};

  DISALLOW_COPY_AND_ASSIGN(Job);
};
306 
Job(WebContents * web_contents,const MHTMLGenerationParams & params,MHTMLGenerationResult::GenerateMHTMLCallback callback)307 MHTMLGenerationManager::Job::Job(
308     WebContents* web_contents,
309     const MHTMLGenerationParams& params,
310     MHTMLGenerationResult::GenerateMHTMLCallback callback)
311     : creation_time_(base::TimeTicks::Now()),
312       params_(params),
313       frame_tree_node_id_of_busy_frame_(FrameTreeNode::kFrameTreeNodeInvalidId),
314       mhtml_boundary_marker_(net::GenerateMimeMultipartBoundary()),
315       salt_(base::GenerateGUID()),
316       callback_(std::move(callback)),
317       is_finished_(false),
318       waiting_on_data_streaming_(false) {
319   initializeJob(web_contents);
320 }
321 
// Jobs delete themselves from OnFinished(), which runs on the UI thread. By
// then |watcher_| must already be gone: it is either reset in
// OnWriteComplete() or moved out of the Job in CloseFile().
MHTMLGenerationManager::Job::~Job() {
  DCHECK_CURRENTLY_ON(BrowserThread::UI);
  DCHECK(!watcher_);
}
326 
initializeJob(WebContents * web_contents)327 void MHTMLGenerationManager::Job::initializeJob(WebContents* web_contents) {
328   DCHECK_CURRENTLY_ON(BrowserThread::UI);
329 
330   TRACE_EVENT_NESTABLE_ASYNC_BEGIN2(
331       "page-serialization", "SavingMhtmlJob", this, "url",
332       web_contents->GetLastCommittedURL().possibly_invalid_spec(), "file",
333       params_.file_path.AsUTF8Unsafe());
334 
335   web_contents->ForEachFrame(base::BindRepeating(
336       &MHTMLGenerationManager::Job::AddFrame,
337       base::Unretained(this)));  // Safe because ForEachFrame() is synchronous.
338 
339   // Main frame needs to be processed first.
340   DCHECK(!pending_frame_tree_node_ids_.empty());
341   DCHECK(FrameTreeNode::GloballyFindByID(pending_frame_tree_node_ids_.front())
342              ->parent() == nullptr);
343 
344   // Save off any extra data.
345   auto* extra_parts = static_cast<MHTMLExtraPartsImpl*>(
346       MHTMLExtraParts::FromWebContents(web_contents));
347   if (extra_parts)
348     extra_data_parts_ = extra_parts->parts();
349 
350   base::PostTaskAndReplyWithResult(
351       download::GetDownloadTaskRunner().get(), FROM_HERE,
352       base::BindOnce(&CreateMHTMLFile, params_.file_path),
353       base::BindOnce(&Job::OnFileAvailable, weak_factory_.GetWeakPtr()));
354 }
355 
356 mojom::SerializeAsMHTMLParamsPtr
CreateMojoParams()357 MHTMLGenerationManager::Job::CreateMojoParams() {
358   mojom::SerializeAsMHTMLParamsPtr mojo_params =
359       mojom::SerializeAsMHTMLParams::New();
360   mojo_params->mhtml_boundary_marker = mhtml_boundary_marker_;
361   mojo_params->mhtml_binary_encoding = params_.use_binary_encoding;
362   mojo_params->mhtml_popup_overlay_removal = params_.remove_popup_overlay;
363   mojo_params->mhtml_problem_detection = params_.use_page_problem_detectors;
364 
365   // Tell the renderer to skip (= deduplicate) already covered MHTML parts.
366   mojo_params->salt = salt_;
367   mojo_params->digests_of_uris_to_skip.assign(
368       digests_of_already_serialized_uris_.begin(),
369       digests_of_already_serialized_uris_.end());
370 
371   return mojo_params;
372 }
373 
SendToNextRenderFrame()374 mojom::MhtmlSaveStatus MHTMLGenerationManager::Job::SendToNextRenderFrame() {
375   DCHECK(browser_file_.IsValid());
376   DCHECK(!pending_frame_tree_node_ids_.empty());
377 
378   int frame_tree_node_id = pending_frame_tree_node_ids_.front();
379   pending_frame_tree_node_ids_.pop();
380 
381   FrameTreeNode* ftn = FrameTreeNode::GloballyFindByID(frame_tree_node_id);
382   if (!ftn)  // The contents went away.
383     return mojom::MhtmlSaveStatus::kFrameNoLongerExists;
384   RenderFrameHost* rfh = ftn->current_frame_host();
385 
386   if (writer_) {
387     // If we reached here, means the work for previous frame is done, so it is
388     // safe to cut the connection to the previous frame.
389     writer_.reset();
390   }
391 
392   // Bind Mojo interface to the RenderFrame
393   rfh->GetRemoteAssociatedInterfaces()->GetInterface(&writer_);
394 
395   // Safe, as |writer_| is owned by this Job instance.
396   auto error_callback =
397       base::BindOnce(&Job::OnConnectionError, base::Unretained(this));
398   writer_.set_disconnect_handler(std::move(error_callback));
399 
400   mojom::SerializeAsMHTMLParamsPtr params(CreateMojoParams());
401 
402   // Initialize method of file writing depending on |compute_contents_hash|
403   // flag.
404   params->output_handle = mojom::MhtmlOutputHandle::New();
405   if (params_.compute_contents_hash) {
406     // Create and set up the data pipe.
407     mojo::ScopedDataPipeProducerHandle producer;
408     if (mojo::CreateDataPipe(nullptr, &producer, &mhtml_data_consumer_) !=
409         MOJO_RESULT_OK) {
410       DLOG(ERROR) << "Failed to create Mojo Data Pipe.";
411       return mojom::MhtmlSaveStatus::kStreamingError;
412     }
413     MHTMLWriteCompleteCallback write_complete_callback = base::BindRepeating(
414         &Job::DoneWritingToDisk, weak_factory_.GetWeakPtr());
415     download::GetDownloadTaskRunner().get()->PostTask(
416         FROM_HERE,
417         base::BindOnce(&Job::BeginWatchingHandle, base::Unretained(this),
418                        std::move(write_complete_callback)));
419     waiting_on_data_streaming_ = true;
420     params->output_handle->set_producer_handle(std::move(producer));
421   } else {
422     // File::Duplicate() creates a reference to this file for use in the
423     // Renderer.
424     params->output_handle->set_file_handle(browser_file_.Duplicate());
425   }
426 
427   // Send a Mojo request to Renderer to serialize its frame.
428   DCHECK_EQ(FrameTreeNode::kFrameTreeNodeInvalidId,
429             frame_tree_node_id_of_busy_frame_);
430   frame_tree_node_id_of_busy_frame_ = frame_tree_node_id;
431 
432   auto response_callback = base::BindOnce(&Job::SerializeAsMHTMLResponse,
433                                           weak_factory_.GetWeakPtr());
434   writer_->SerializeAsMHTML(std::move(params), std::move(response_callback));
435 
436   TRACE_EVENT_NESTABLE_ASYNC_BEGIN1("page-serialization", "WaitingOnRenderer",
437                                     this, "frame tree node id",
438                                     frame_tree_node_id_of_busy_frame_);
439   DCHECK(wait_on_renderer_start_time_.is_null());
440   wait_on_renderer_start_time_ = base::TimeTicks::Now();
441   return mojom::MhtmlSaveStatus::kSuccess;
442 }
443 
BeginWatchingHandle(MHTMLWriteCompleteCallback callback)444 void MHTMLGenerationManager::Job::BeginWatchingHandle(
445     MHTMLWriteCompleteCallback callback) {
446   DCHECK(download::GetDownloadTaskRunner()->RunsTasksInCurrentSequence());
447 
448   DCHECK(!watcher_);
449   watcher_ = std::make_unique<mojo::SimpleWatcher>(
450       FROM_HERE, mojo::SimpleWatcher::ArmingPolicy::AUTOMATIC,
451       download::GetDownloadTaskRunner());
452   // It is entirely possible for BeginWatchingHandle to get bound multiple times
453   // if we have to serialize multiple render frames, but we will only ever want
454   // one secure hash instance created.
455   if (params_.compute_contents_hash && !secure_hash_) {
456     secure_hash_ =
457         crypto::SecureHash::Create(crypto::SecureHash::Algorithm::SHA256);
458   }
459 
460   // base::Unretained is safe, as |this| owns |mhtml_data_consumer_|, which
461   // is responsible for invoking |watcher_| callbacks.
462   if (watcher_->Watch(
463           mhtml_data_consumer_.get(),
464           MOJO_HANDLE_SIGNAL_NEW_DATA_READABLE | MOJO_HANDLE_SIGNAL_PEER_CLOSED,
465           MOJO_WATCH_CONDITION_SATISFIED,
466           base::BindRepeating(&Job::WriteMHTMLToDisk, base::Unretained(this),
467                               callback)) != MOJO_RESULT_OK) {
468     DLOG(ERROR) << "Failed to strap watcher to consumer handle.";
469     OnWriteComplete(callback, mojom::MhtmlSaveStatus::kStreamingError);
470   }
471 }
472 
WriteMHTMLToDisk(MHTMLWriteCompleteCallback callback,MojoResult result,const mojo::HandleSignalsState & state)473 void MHTMLGenerationManager::Job::WriteMHTMLToDisk(
474     MHTMLWriteCompleteCallback callback,
475     MojoResult result,
476     const mojo::HandleSignalsState& state) {
477   DCHECK(download::GetDownloadTaskRunner()->RunsTasksInCurrentSequence());
478   DCHECK_NE(result, MOJO_RESULT_FAILED_PRECONDITION);
479   // Begin consumer data pipe handle read and file write loop.
480   char buffer[1024];
481   uint32_t num_bytes = sizeof(buffer);
482   while (result == MOJO_RESULT_OK && state.readable()) {
483     result = mhtml_data_consumer_->ReadData(&buffer, &num_bytes,
484                                             MOJO_READ_DATA_FLAG_NONE);
485     if (result == MOJO_RESULT_OK) {
486       if (secure_hash_)
487         secure_hash_->Update(&buffer, num_bytes);
488       if (browser_file_.WriteAtCurrentPos(buffer, num_bytes) < 0) {
489         DLOG(ERROR) << "Error writing to file handle.";
490         OnWriteComplete(std::move(callback),
491                         mojom::MhtmlSaveStatus::kFileWritingError);
492         return;
493       }
494     }
495   }
496 
497   if (result != MOJO_RESULT_OK && result != MOJO_RESULT_FAILED_PRECONDITION &&
498       result != MOJO_RESULT_SHOULD_WAIT) {
499     DLOG(ERROR) << "Error streaming MHTML data to the Browser.";
500     OnWriteComplete(std::move(callback),
501                     mojom::MhtmlSaveStatus::kStreamingError);
502     return;
503   }
504 
505   // Only notify successful write completion if peer handle is closed without
506   // any errors.
507   if (state.peer_closed())
508     OnWriteComplete(std::move(callback), mojom::MhtmlSaveStatus::kSuccess);
509 }
510 
OnWriteComplete(MHTMLWriteCompleteCallback callback,mojom::MhtmlSaveStatus save_status)511 void MHTMLGenerationManager::Job::OnWriteComplete(
512     MHTMLWriteCompleteCallback callback,
513     mojom::MhtmlSaveStatus save_status) {
514   DCHECK(download::GetDownloadTaskRunner()->RunsTasksInCurrentSequence());
515 
516   watcher_.reset();
517   base::PostTask(FROM_HERE, {BrowserThread::UI},
518                  base::BindOnce(std::move(callback), save_status));
519 }
520 
DoneWritingToDisk(mojom::MhtmlSaveStatus save_status)521 void MHTMLGenerationManager::Job::DoneWritingToDisk(
522     mojom::MhtmlSaveStatus save_status) {
523   DCHECK_CURRENTLY_ON(BrowserThread::UI);
524 
525   // If the Job has prematurely finalized and marked as finished, make this
526   // response no-op.
527   if (is_finished_)
528     return;
529 
530   waiting_on_data_streaming_ = false;
531   MaybeSendToNextRenderFrame(save_status);
532 }
533 
// Disconnect handler installed on |writer_| by SendToNextRenderFrame().
// Finalizes the job with kRenderProcessExited.
void MHTMLGenerationManager::Job::OnConnectionError() {
  DCHECK_CURRENTLY_ON(BrowserThread::UI);
  // The message pipe closing while a response is still expected is treated as
  // an unexpected renderer crash.
  DLOG(ERROR) << "Message pipe to renderer closed while expecting response";
  Finalize(mojom::MhtmlSaveStatus::kRenderProcessExited);
}
540 
OnFileAvailable(base::File browser_file)541 void MHTMLGenerationManager::Job::OnFileAvailable(base::File browser_file) {
542   DCHECK_CURRENTLY_ON(BrowserThread::UI);
543 
544   if (!browser_file.IsValid()) {
545     DLOG(ERROR) << "Failed to create file";
546     Finalize(mojom::MhtmlSaveStatus::kFileCreationError);
547     return;
548   }
549 
550   browser_file_ = std::move(browser_file);
551 
552   mojom::MhtmlSaveStatus save_status = SendToNextRenderFrame();
553   if (save_status != mojom::MhtmlSaveStatus::kSuccess)
554     Finalize(save_status);
555 }
556 
OnFinished(const CloseFileResult & close_file_result)557 void MHTMLGenerationManager::Job::OnFinished(
558     const CloseFileResult& close_file_result) {
559   DCHECK_CURRENTLY_ON(BrowserThread::UI);
560   mojom::MhtmlSaveStatus save_status = close_file_result.save_status;
561   int64_t file_size = close_file_result.file_size;
562 
563   TRACE_EVENT_NESTABLE_ASYNC_END2("page-serialization", "SavingMhtmlJob", this,
564                                   "job save status", save_status, "file size",
565                                   file_size);
566   UMA_HISTOGRAM_TIMES("PageSerialization.MhtmlGeneration.FullPageSavingTime",
567                       base::TimeTicks::Now() - creation_time_);
568   UMA_HISTOGRAM_ENUMERATION("PageSerialization.MhtmlGeneration.FinalSaveStatus",
569                             save_status);
570 
571   std::move(callback_).Run(close_file_result.toMHTMLGenerationResult());
572 
573   delete this;  // This is the last time the Job is referenced.
574 }
575 
MarkAsFinished()576 void MHTMLGenerationManager::Job::MarkAsFinished() {
577   // MarkAsFinished() may be called twice only in the case which
578   // writer_.reset() does not correctly stop OnConnectionError
579   // notifications for the case described in https://crbug.com/612098.
580   if (is_finished_) {
581     NOTREACHED();
582     return;
583   }
584   is_finished_ = true;
585   writer_.reset();
586 
587   // Additionally, |watcher_| may also invoke DoneWritingToDisk() from
588   // the download sequence, potentially calling this twice. We cannot disable
589   // |watcher_| notifications similar to |writer_|, since it exists in
590   // the download sequence, so we handle the case in DoneWritingToDisk().
591 
592   TRACE_EVENT_NESTABLE_ASYNC_INSTANT0("page-serialization", "JobFinished",
593                                       this);
594 
595   // End of job timing reports.
596   if (!wait_on_renderer_start_time_.is_null()) {
597     base::TimeDelta renderer_wait_time =
598         base::TimeTicks::Now() - wait_on_renderer_start_time_;
599     UMA_HISTOGRAM_TIMES(
600         "PageSerialization.MhtmlGeneration.BrowserWaitForRendererTime."
601         "SingleFrame",
602         renderer_wait_time);
603     all_renderers_wait_time_ += renderer_wait_time;
604   }
605   if (!all_renderers_wait_time_.is_zero()) {
606     UMA_HISTOGRAM_TIMES(
607         "PageSerialization.MhtmlGeneration.BrowserWaitForRendererTime."
608         "FrameTree",
609         all_renderers_wait_time_);
610   }
611   if (!all_renderers_main_thread_time_.is_zero()) {
612     UMA_HISTOGRAM_TIMES(
613         "PageSerialization.MhtmlGeneration.RendererMainThreadTime.FrameTree",
614         all_renderers_main_thread_time_);
615   }
616   if (!longest_renderer_main_thread_time_.is_zero()) {
617     UMA_HISTOGRAM_TIMES(
618         "PageSerialization.MhtmlGeneration.RendererMainThreadTime.SlowestFrame",
619         longest_renderer_main_thread_time_);
620   }
621 }
622 
ReportRendererMainThreadTime(base::TimeDelta renderer_main_thread_time)623 void MHTMLGenerationManager::Job::ReportRendererMainThreadTime(
624     base::TimeDelta renderer_main_thread_time) {
625   DCHECK(renderer_main_thread_time > base::TimeDelta());
626   if (renderer_main_thread_time > base::TimeDelta())
627     all_renderers_main_thread_time_ += renderer_main_thread_time;
628   if (renderer_main_thread_time > longest_renderer_main_thread_time_)
629     longest_renderer_main_thread_time_ = renderer_main_thread_time;
630 }
631 
AddFrame(RenderFrameHost * render_frame_host)632 void MHTMLGenerationManager::Job::AddFrame(RenderFrameHost* render_frame_host) {
633   auto* rfhi = static_cast<RenderFrameHostImpl*>(render_frame_host);
634   int frame_tree_node_id = rfhi->frame_tree_node()->frame_tree_node_id();
635   pending_frame_tree_node_ids_.push(frame_tree_node_id);
636 }
637 
CloseFile(mojom::MhtmlSaveStatus save_status)638 void MHTMLGenerationManager::Job::CloseFile(
639     mojom::MhtmlSaveStatus save_status) {
640   DCHECK_CURRENTLY_ON(BrowserThread::UI);
641   DCHECK(!mhtml_boundary_marker_.empty());
642 
643   // Only update the status if that won't hide an earlier error.
644   if (!browser_file_.IsValid() &&
645       save_status == mojom::MhtmlSaveStatus::kSuccess)
646     save_status = mojom::MhtmlSaveStatus::kFileWritingError;
647 
648   // If no previous error occurred the boundary should be sent.
649   base::PostTaskAndReplyWithResult(
650       download::GetDownloadTaskRunner().get(), FROM_HERE,
651       base::BindOnce(&MHTMLGenerationManager::Job::FinalizeOnFileThread,
652                      save_status, mhtml_boundary_marker_,
653                      std::move(browser_file_), std::move(extra_data_parts_),
654                      std::move(watcher_), std::move(secure_hash_)),
655       base::BindOnce(&Job::OnFinished, weak_factory_.GetWeakPtr()));
656 }
657 
SerializeAsMHTMLResponse(mojom::MhtmlSaveStatus save_status,const std::vector<std::string> & digests_of_uris_of_serialized_resources,base::TimeDelta renderer_main_thread_time)658 void MHTMLGenerationManager::Job::SerializeAsMHTMLResponse(
659     mojom::MhtmlSaveStatus save_status,
660     const std::vector<std::string>& digests_of_uris_of_serialized_resources,
661     base::TimeDelta renderer_main_thread_time) {
662   DCHECK_CURRENTLY_ON(BrowserThread::UI);
663 
664   TRACE_EVENT_NESTABLE_ASYNC_END0("page-serialization", "WaitingOnRenderer",
665                                   this);
666   ReportRendererMainThreadTime(renderer_main_thread_time);
667 
668   frame_tree_node_id_of_busy_frame_ = FrameTreeNode::kFrameTreeNodeInvalidId;
669 
670   // If the renderer succeeded, update the resource digests.
671   if (save_status == mojom::MhtmlSaveStatus::kSuccess)
672     RecordDigests(digests_of_uris_of_serialized_resources);
673 
674   MaybeSendToNextRenderFrame(save_status);
675 }
676 
RecordDigests(const std::vector<std::string> & digests_of_uris_of_serialized_resources)677 void MHTMLGenerationManager::Job::RecordDigests(
678     const std::vector<std::string>& digests_of_uris_of_serialized_resources) {
679   DCHECK(!wait_on_renderer_start_time_.is_null());
680   base::TimeDelta renderer_wait_time =
681       base::TimeTicks::Now() - wait_on_renderer_start_time_;
682   UMA_HISTOGRAM_TIMES(
683       "PageSerialization.MhtmlGeneration.BrowserWaitForRendererTime."
684       "SingleFrame",
685       renderer_wait_time);
686   all_renderers_wait_time_ += renderer_wait_time;
687   wait_on_renderer_start_time_ = base::TimeTicks();
688 
689   // Renderer should be deduping resources with the same uris.
690   DCHECK_EQ(0u, base::STLSetIntersection<std::set<std::string>>(
691                     digests_of_already_serialized_uris_,
692                     std::set<std::string>(
693                         digests_of_uris_of_serialized_resources.begin(),
694                         digests_of_uris_of_serialized_resources.end()))
695                     .size());
696   digests_of_already_serialized_uris_.insert(
697       digests_of_uris_of_serialized_resources.begin(),
698       digests_of_uris_of_serialized_resources.end());
699 }
700 
MaybeSendToNextRenderFrame(mojom::MhtmlSaveStatus save_status)701 void MHTMLGenerationManager::Job::MaybeSendToNextRenderFrame(
702     mojom::MhtmlSaveStatus save_status) {
703   // If current operation is successful and there are more frames to process,
704   // let save status depend on the result of sending the next request.
705   if (save_status == mojom::MhtmlSaveStatus::kSuccess &&
706       !pending_frame_tree_node_ids_.empty() && CurrentFrameDone()) {
707     save_status = SendToNextRenderFrame();
708   }
709 
710   // If there was a failure (either from the renderer or from the job) then
711   // terminate the job and return.
712   if (save_status != mojom::MhtmlSaveStatus::kSuccess) {
713     Finalize(save_status);
714     return;
715   }
716 
717   // Otherwise report completion if there are no more frames to process
718   // and Job is done processing the current frame.
719   if (pending_frame_tree_node_ids_.empty() && CurrentFrameDone())
720     Finalize(mojom::MhtmlSaveStatus::kSuccess);
721 }
722 
CurrentFrameDone() const723 bool MHTMLGenerationManager::Job::CurrentFrameDone() const {
724   bool waiting_for_response_from_renderer =
725       frame_tree_node_id_of_busy_frame_ !=
726       FrameTreeNode::kFrameTreeNodeInvalidId;
727   return !waiting_for_response_from_renderer && !waiting_on_data_streaming_;
728 }
729 
Finalize(mojom::MhtmlSaveStatus save_status)730 void MHTMLGenerationManager::Job::Finalize(mojom::MhtmlSaveStatus save_status) {
731   DCHECK_CURRENTLY_ON(BrowserThread::UI);
732   MarkAsFinished();
733   CloseFile(save_status);
734 }
735 
736 // static
StartNewJob(WebContents * web_contents,const MHTMLGenerationParams & params,MHTMLGenerationResult::GenerateMHTMLCallback callback)737 void MHTMLGenerationManager::Job::StartNewJob(
738     WebContents* web_contents,
739     const MHTMLGenerationParams& params,
740     MHTMLGenerationResult::GenerateMHTMLCallback callback) {
741   // Creates a new Job.
742   // The constructor starts the serialization process and it will delete
743   // itself upon finishing.
744   new Job(web_contents, params, std::move(callback));
745 }
746 
747 // static
WriteToFileAndUpdateHash(base::File * file,crypto::SecureHash * secure_hash,std::string to_write)748 bool MHTMLGenerationManager::Job::WriteToFileAndUpdateHash(
749     base::File* file,
750     crypto::SecureHash* secure_hash,
751     std::string to_write) {
752   bool result = file->WriteAtCurrentPos(to_write.data(), to_write.size()) >= 0;
753   if (result && secure_hash) {
754     secure_hash->Update(to_write.data(), to_write.size());
755   }
756   return result;
757 }
758 
759 // static
FinalizeOnFileThread(mojom::MhtmlSaveStatus save_status,const std::string & boundary,base::File file,const std::vector<MHTMLExtraDataPart> & extra_data_parts,std::unique_ptr<mojo::SimpleWatcher> watcher,std::unique_ptr<crypto::SecureHash> secure_hash)760 CloseFileResult MHTMLGenerationManager::Job::FinalizeOnFileThread(
761     mojom::MhtmlSaveStatus save_status,
762     const std::string& boundary,
763     base::File file,
764     const std::vector<MHTMLExtraDataPart>& extra_data_parts,
765     std::unique_ptr<mojo::SimpleWatcher> watcher,
766     std::unique_ptr<crypto::SecureHash> secure_hash) {
767   DCHECK(download::GetDownloadTaskRunner()->RunsTasksInCurrentSequence());
768 
769   watcher.reset();
770   DCHECK(!boundary.empty());
771 
772   if (save_status == mojom::MhtmlSaveStatus::kSuccess) {
773     TRACE_EVENT0("page-serialization",
774                  "MHTMLGenerationManager::Job MHTML footer writing");
775 
776     // Write the extra data into a part of its own, if we have any.
777     std::string serialized_extra_data_parts =
778         CreateExtraDataParts(boundary, extra_data_parts);
779     // Short circuit to prevent file IO if nothing to write.
780     if (!serialized_extra_data_parts.empty() &&
781         !WriteToFileAndUpdateHash(&file, secure_hash.get(),
782                                   serialized_extra_data_parts)) {
783       save_status = mojom::MhtmlSaveStatus::kFileWritingError;
784     }
785 
786     // Write out the footer at the bottom of the file.
787     std::string footer = CreateFooter(boundary);
788     if (save_status == mojom::MhtmlSaveStatus::kSuccess &&
789         !WriteToFileAndUpdateHash(&file, secure_hash.get(), footer)) {
790       save_status = mojom::MhtmlSaveStatus::kFileWritingError;
791     }
792   }
793 
794   // If the file is still valid try to close it. Only update the status if that
795   // won't hide an earlier error.
796   int64_t file_size;
797   if (!CloseFileIfValid(file, &file_size) &&
798       save_status == mojom::MhtmlSaveStatus::kSuccess) {
799     save_status = mojom::MhtmlSaveStatus::kFileClosingError;
800   }
801 
802   file_size = save_status == mojom::MhtmlSaveStatus::kSuccess
803                   ? file_size
804                   : kInvalidFileSize;
805   // If we do not have a pending hash or the file is invalid, finalize operation
806   // with an empty digest result.
807   if (!secure_hash || file_size == kInvalidFileSize)
808     return CloseFileResult(save_status, file_size, nullptr);
809 
810   // Record hash and finish operation.
811   std::string file_digest = std::string(secure_hash->GetHashLength(), 0);
812   secure_hash->Finish(&(file_digest[0]), file_digest.size());
813   secure_hash.reset();
814   return CloseFileResult(save_status, file_size, &file_digest);
815 }
816 
817 // static
CreateExtraDataParts(const std::string & boundary,const std::vector<MHTMLExtraDataPart> & extra_data_parts)818 std::string MHTMLGenerationManager::Job::CreateExtraDataParts(
819     const std::string& boundary,
820     const std::vector<MHTMLExtraDataPart>& extra_data_parts) {
821   DCHECK(download::GetDownloadTaskRunner()->RunsTasksInCurrentSequence());
822   std::string serialized_extra_data_parts;
823 
824   // Don't write an extra data part if there is none.
825   if (extra_data_parts.empty())
826     return serialized_extra_data_parts;
827 
828   // For each extra part, serialize that part and add to our accumulator
829   // string.
830   for (const auto& part : extra_data_parts) {
831     // Write a newline, then a boundary, a newline, then the content
832     // location, a newline, the content type, a newline, extra_headers,
833     // two newlines, the body, and end with a newline.
834     std::string serialized_extra_data_part = base::StringPrintf(
835         "\r\n--%s\r\n%s%s\r\n%s%s\r\n%s\r\n\r\n%s\r\n", boundary.c_str(),
836         kContentLocation, part.content_location.c_str(), kContentType,
837         part.content_type.c_str(), part.extra_headers.c_str(),
838         part.body.c_str());
839     DCHECK(base::IsStringASCII(serialized_extra_data_part));
840 
841     serialized_extra_data_parts += serialized_extra_data_part;
842   }
843   return serialized_extra_data_parts;
844 }
845 
846 // static
CreateFooter(const std::string & boundary)847 std::string MHTMLGenerationManager::Job::CreateFooter(
848     const std::string& boundary) {
849   DCHECK(download::GetDownloadTaskRunner()->RunsTasksInCurrentSequence());
850   // Per the spec, the boundary must occur at the beginning of a line.
851   std::string footer = base::StringPrintf("\r\n--%s--\r\n", boundary.c_str());
852   DCHECK(base::IsStringASCII(footer));
853   return footer;
854 }
855 
856 // static
CloseFileIfValid(base::File & file,int64_t * file_size)857 bool MHTMLGenerationManager::Job::CloseFileIfValid(base::File& file,
858                                                    int64_t* file_size) {
859   DCHECK(download::GetDownloadTaskRunner()->RunsTasksInCurrentSequence());
860   DCHECK(file_size);
861   if (file.IsValid()) {
862     *file_size = file.GetLength();
863     file.Close();
864     return true;
865   }
866 
867   return false;
868 }
869 
GetInstance()870 MHTMLGenerationManager* MHTMLGenerationManager::GetInstance() {
871   return base::Singleton<MHTMLGenerationManager>::get();
872 }
873 
// Defaulted constructor; the definition here performs no work of its own.
MHTMLGenerationManager::MHTMLGenerationManager() = default;

// Defaulted destructor.
MHTMLGenerationManager::~MHTMLGenerationManager() = default;
877 
SaveMHTML(WebContents * web_contents,const MHTMLGenerationParams & params,MHTMLGenerationResult::GenerateMHTMLCallback callback)878 void MHTMLGenerationManager::SaveMHTML(
879     WebContents* web_contents,
880     const MHTMLGenerationParams& params,
881     MHTMLGenerationResult::GenerateMHTMLCallback callback) {
882   DCHECK_CURRENTLY_ON(BrowserThread::UI);
883   Job::StartNewJob(web_contents, params, std::move(callback));
884 }
885 
886 }  // namespace content
887