1 /*
2  * Copyright (C) 2011 Google Inc. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions are
6  * met:
7  *
8  *     * Redistributions of source code must retain the above copyright
9  * notice, this list of conditions and the following disclaimer.
10  *     * Redistributions in binary form must reproduce the above
11  * copyright notice, this list of conditions and the following disclaimer
12  * in the documentation and/or other materials provided with the
13  * distribution.
14  *     * Neither the name of Google Inc. nor the names of its
15  * contributors may be used to endorse or promote products derived from
16  * this software without specific prior written permission.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29  */
30 
31 #include "third_party/blink/renderer/platform/mhtml/mhtml_archive.h"
32 
33 #include <stddef.h>
34 #include "base/metrics/histogram_macros.h"
35 #include "build/build_config.h"
36 #include "third_party/blink/public/mojom/loader/mhtml_load_result.mojom-blink.h"
37 #include "third_party/blink/renderer/platform/mhtml/archive_resource.h"
38 #include "third_party/blink/renderer/platform/mhtml/mhtml_parser.h"
39 #include "third_party/blink/renderer/platform/mhtml/serialized_resource.h"
40 #include "third_party/blink/renderer/platform/network/mime/mime_type_registry.h"
41 #include "third_party/blink/renderer/platform/text/date_components.h"
42 #include "third_party/blink/renderer/platform/weborigin/scheme_registry.h"
43 #include "third_party/blink/renderer/platform/wtf/assertions.h"
44 #include "third_party/blink/renderer/platform/wtf/date_math.h"
45 #include "third_party/blink/renderer/platform/wtf/shared_buffer.h"
46 #include "third_party/blink/renderer/platform/wtf/text/ascii_ctype.h"
47 #include "third_party/blink/renderer/platform/wtf/text/base64.h"
48 #include "third_party/blink/renderer/platform/wtf/text/string_builder.h"
49 #include "third_party/blink/renderer/platform/wtf/vector.h"
50 
51 namespace blink {
52 
53 namespace {
54 
55 using blink::mojom::MHTMLLoadResult;
56 
57 const wtf_size_t kMaximumLineLength = 76;
58 
59 const char kRFC2047EncodingPrefix[] = "=?utf-8?Q?";
60 const size_t kRFC2047EncodingPrefixLength = 10;
61 const char kRFC2047EncodingSuffix[] = "?=";
62 const size_t kRFC2047EncodingSuffixLength = 2;
63 
64 const char kQuotedPrintable[] = "quoted-printable";
65 const char kBase64[] = "base64";
66 const char kBinary[] = "binary";
67 
68 // Returns the length of a line-ending if one is present starting at
69 // |input[index]| or zero if no line-ending is present at the given |index|.
LengthOfLineEndingAtIndex(const char * input,size_t input_length,size_t index)70 size_t LengthOfLineEndingAtIndex(const char* input,
71                                  size_t input_length,
72                                  size_t index) {
73   SECURITY_DCHECK(index < input_length);
74   if (input[index] == '\n')
75     return 1;  // Single LF.
76 
77   if (input[index] == '\r') {
78     if ((index + 1) == input_length || input[index + 1] != '\n')
79       return 1;  // Single CR (Classic Mac OS).
80     return 2;    // CR-LF.
81   }
82 
83   return 0;
84 }
85 
86 // Performs quoted-printable encoding characters, per RFC 2047.
QuotedPrintableEncode(const char * input,wtf_size_t input_length,bool is_header,Vector<char> & out)87 void QuotedPrintableEncode(const char* input,
88                            wtf_size_t input_length,
89                            bool is_header,
90                            Vector<char>& out) {
91   out.clear();
92   out.ReserveCapacity(input_length);
93   if (is_header)
94     out.Append(kRFC2047EncodingPrefix, kRFC2047EncodingPrefixLength);
95   size_t current_line_length = 0;
96   for (size_t i = 0; i < input_length; ++i) {
97     bool is_last_character = (i == input_length - 1);
98     char current_character = input[i];
99     bool requires_encoding = false;
100     // All non-printable ASCII characters and = require encoding.
101     if ((current_character < ' ' || current_character > '~' ||
102          current_character == '=') &&
103         current_character != '\t')
104       requires_encoding = true;
105 
106     // Decide if space and tab characters need to be encoded.
107     if (!requires_encoding &&
108         (current_character == '\t' || current_character == ' ')) {
109       if (is_header) {
110         // White space characters should always be encoded if they appear
111         // anywhere in the header.
112         requires_encoding = true;
113       } else {
114         bool end_of_line = is_last_character || LengthOfLineEndingAtIndex(
115                                                     input, input_length, i + 1);
116         requires_encoding = end_of_line;
117       }
118     }
119 
120     // End of line should be converted to CR-LF sequences.
121     if (!is_last_character) {
122       size_t length_of_line_ending =
123           LengthOfLineEndingAtIndex(input, input_length, i);
124       if (length_of_line_ending) {
125         out.Append("\r\n", 2);
126         current_line_length = 0;
127         i += (length_of_line_ending -
128               1);  // -1 because we'll ++ in the for() above.
129         continue;
130       }
131     }
132 
133     size_t length_of_encoded_character = 1;
134     if (requires_encoding)
135       length_of_encoded_character += 2;
136     if (!is_last_character)
137       length_of_encoded_character += 1;  // + 1 for the = (soft line break).
138 
139     // Insert a soft line break if necessary.
140     size_t max_line_length_for_encoded_content = kMaximumLineLength;
141     if (is_header) {
142       max_line_length_for_encoded_content -= kRFC2047EncodingPrefixLength;
143       max_line_length_for_encoded_content -= kRFC2047EncodingSuffixLength;
144     }
145 
146     if (current_line_length + length_of_encoded_character >
147         max_line_length_for_encoded_content) {
148       if (is_header) {
149         out.Append(kRFC2047EncodingSuffix, kRFC2047EncodingSuffixLength);
150         out.Append("\r\n", 2);
151         out.push_back(' ');
152       } else {
153         out.push_back('=');
154         out.Append("\r\n", 2);
155       }
156       current_line_length = 0;
157       if (is_header)
158         out.Append(kRFC2047EncodingPrefix, kRFC2047EncodingPrefixLength);
159     }
160 
161     // Finally, insert the actual character(s).
162     if (requires_encoding) {
163       out.push_back('=');
164       out.push_back(UpperNibbleToASCIIHexDigit(current_character));
165       out.push_back(LowerNibbleToASCIIHexDigit(current_character));
166       current_line_length += 3;
167     } else {
168       out.push_back(current_character);
169       current_line_length++;
170     }
171   }
172   if (is_header)
173     out.Append(kRFC2047EncodingSuffix, kRFC2047EncodingSuffixLength);
174 }
175 
// Returns |text| unchanged when every character in it is printable ASCII.
// Otherwise returns the UTF-8 bytes of |text| encoded as RFC 2047
// (https://tools.ietf.org/html/rfc2047) encoded-words of the form
//   =?utf-8?Q?encoded_text?=
// where "utf-8" names the charset and "Q" selects the quoted-printable style
// encoding that yields 7-bit printable ASCII.
String ConvertToPrintableCharacters(const String& text) {
  // Skip over the leading run of printable characters; reaching the end
  // means nothing has to be encoded.
  wtf_size_t position = 0;
  while (position < text.length() && IsASCIIPrintable(text[position]))
    ++position;
  if (position == text.length())
    return text;

  const std::string utf8_bytes = text.Utf8();
  Vector<char> encoded;
  QuotedPrintableEncode(utf8_bytes.c_str(), utf8_bytes.length(),
                        true /* is_header */, encoded);
  return String(encoded.data(), encoded.size());
}
199 
200 }  // namespace
201 
MHTMLArchive()202 MHTMLArchive::MHTMLArchive() : load_result_(MHTMLLoadResult::kInvalidArchive) {}
203 
204 // static
ReportLoadResult(MHTMLLoadResult result)205 void MHTMLArchive::ReportLoadResult(MHTMLLoadResult result) {
206   UMA_HISTOGRAM_ENUMERATION("PageSerialization.MhtmlLoading.LoadResult",
207                             result);
208 }
209 
210 // static
Create(const KURL & url,scoped_refptr<const SharedBuffer> data)211 MHTMLArchive* MHTMLArchive::Create(const KURL& url,
212                                    scoped_refptr<const SharedBuffer> data) {
213   MHTMLArchive* archive = CreateArchive(url, data);
214   ReportLoadResult(archive->LoadResult());
215   return archive;
216 }
217 
218 // static
CreateArchive(const KURL & url,scoped_refptr<const SharedBuffer> data)219 MHTMLArchive* MHTMLArchive::CreateArchive(
220     const KURL& url,
221     scoped_refptr<const SharedBuffer> data) {
222   MHTMLArchive* archive = MakeGarbageCollected<MHTMLArchive>();
223 
224   // |data| may be null if archive file is empty.
225   if (!data || data->IsEmpty()) {
226     archive->load_result_ = MHTMLLoadResult::kEmptyFile;
227     return archive;
228   }
229 
230   // MHTML pages can only be loaded from local URLs, http/https URLs, and
231   // content URLs(Android specific).  The latter is now allowed due to full
232   // sandboxing enforcement on MHTML pages.
233   if (!CanLoadArchive(url)) {
234     archive->load_result_ = MHTMLLoadResult::kUrlSchemeNotAllowed;
235     return archive;
236   }
237 
238   MHTMLParser parser(std::move(data));
239   HeapVector<Member<ArchiveResource>> resources = parser.ParseArchive();
240   if (resources.IsEmpty()) {
241     archive->load_result_ = MHTMLLoadResult::kInvalidArchive;
242     return archive;
243   }
244 
245   archive->date_ = parser.CreationDate();
246 
247   size_t resources_count = resources.size();
248   // The first document suitable resource is the main resource of the top frame.
249   for (ArchiveResource* resource : resources) {
250     if (archive->MainResource()) {
251       archive->AddSubresource(resource);
252       continue;
253     }
254 
255     const AtomicString& mime_type = resource->MimeType();
256     bool is_mime_type_suitable_for_main_resource =
257         MIMETypeRegistry::IsSupportedNonImageMIMEType(mime_type);
258     // Want to allow image-only MHTML archives, but retain behavior for other
259     // documents that have already been created expecting the first HTML page to
260     // be considered the main resource.
261     if (resources_count == 1 &&
262         MIMETypeRegistry::IsSupportedImageResourceMIMEType(mime_type)) {
263       is_mime_type_suitable_for_main_resource = true;
264     }
265     // explicitly disallow JS and CSS as the main resource.
266     if (MIMETypeRegistry::IsSupportedJavaScriptMIMEType(mime_type) ||
267         MIMETypeRegistry::IsSupportedStyleSheetMIMEType(mime_type))
268       is_mime_type_suitable_for_main_resource = false;
269 
270     if (is_mime_type_suitable_for_main_resource)
271       archive->SetMainResource(resource);
272     else
273       archive->AddSubresource(resource);
274   }
275   if (archive->MainResource())
276     archive->load_result_ = MHTMLLoadResult::kSuccess;
277   else
278     archive->load_result_ = MHTMLLoadResult::kMissingMainResource;
279 
280   return archive;
281 }
282 
CanLoadArchive(const KURL & url)283 bool MHTMLArchive::CanLoadArchive(const KURL& url) {
284   // MHTML pages can only be loaded from local URLs, http/https URLs, and
285   // content URLs(Android specific).  The latter is now allowed due to full
286   // sandboxing enforcement on MHTML pages.
287   if (SchemeRegistry::ShouldTreatURLSchemeAsLocal(url.Protocol()))
288     return true;
289   if (url.ProtocolIsInHTTPFamily())
290     return true;
291 #if defined(OS_ANDROID)
292   if (url.ProtocolIs("content"))
293     return true;
294 #endif
295   return false;
296 }
297 
GenerateMHTMLHeader(const String & boundary,const KURL & url,const String & title,const String & mime_type,base::Time date,Vector<char> & output_buffer)298 void MHTMLArchive::GenerateMHTMLHeader(const String& boundary,
299                                        const KURL& url,
300                                        const String& title,
301                                        const String& mime_type,
302                                        base::Time date,
303                                        Vector<char>& output_buffer) {
304   DCHECK(!boundary.IsEmpty());
305   DCHECK(!mime_type.IsEmpty());
306 
307   String date_string = MakeRFC2822DateString(date, 0);
308 
309   StringBuilder string_builder;
310   string_builder.Append("From: <Saved by Blink>\r\n");
311 
312   // Add the document URL in the MHTML headers in order to avoid complicated
313   // parsing to locate it in the multipart body headers.
314   string_builder.Append("Snapshot-Content-Location: ");
315   string_builder.Append(url.GetString());
316 
317   string_builder.Append("\r\nSubject: ");
318   string_builder.Append(ConvertToPrintableCharacters(title));
319   string_builder.Append("\r\nDate: ");
320   string_builder.Append(date_string);
321   string_builder.Append("\r\nMIME-Version: 1.0\r\n");
322   string_builder.Append("Content-Type: multipart/related;\r\n");
323   string_builder.Append("\ttype=\"");
324   string_builder.Append(mime_type);
325   string_builder.Append("\";\r\n");
326   string_builder.Append("\tboundary=\"");
327   string_builder.Append(boundary);
328   string_builder.Append("\"\r\n\r\n");
329 
330   // We use utf8() below instead of ascii() as ascii() replaces CRLFs with ??
331   // (we still only have put ASCII characters in it).
332   DCHECK(string_builder.ToString().ContainsOnlyASCIIOrEmpty());
333   std::string utf8_string = string_builder.ToString().Utf8();
334 
335   output_buffer.Append(utf8_string.c_str(), utf8_string.length());
336 }
337 
GenerateMHTMLPart(const String & boundary,const String & content_id,EncodingPolicy encoding_policy,const SerializedResource & resource,Vector<char> & output_buffer)338 void MHTMLArchive::GenerateMHTMLPart(const String& boundary,
339                                      const String& content_id,
340                                      EncodingPolicy encoding_policy,
341                                      const SerializedResource& resource,
342                                      Vector<char>& output_buffer) {
343   DCHECK(!boundary.IsEmpty());
344   DCHECK(content_id.IsEmpty() || content_id[0] == '<');
345 
346   StringBuilder string_builder;
347   // Per the spec, the boundary must occur at the beginning of a line.
348   string_builder.Append("\r\n--");
349   string_builder.Append(boundary);
350   string_builder.Append("\r\n");
351 
352   string_builder.Append("Content-Type: ");
353   string_builder.Append(resource.mime_type);
354   string_builder.Append("\r\n");
355 
356   if (!content_id.IsEmpty()) {
357     string_builder.Append("Content-ID: ");
358     string_builder.Append(content_id);
359     string_builder.Append("\r\n");
360   }
361 
362   const char* content_encoding = nullptr;
363   if (encoding_policy == kUseBinaryEncoding)
364     content_encoding = kBinary;
365   else if (MIMETypeRegistry::IsSupportedJavaScriptMIMEType(
366                resource.mime_type) ||
367            MIMETypeRegistry::IsSupportedNonImageMIMEType(resource.mime_type))
368     content_encoding = kQuotedPrintable;
369   else
370     content_encoding = kBase64;
371 
372   string_builder.Append("Content-Transfer-Encoding: ");
373   string_builder.Append(content_encoding);
374   string_builder.Append("\r\n");
375 
376   if (!resource.url.ProtocolIsAbout()) {
377     string_builder.Append("Content-Location: ");
378     string_builder.Append(resource.url.GetString());
379     string_builder.Append("\r\n");
380   }
381 
382   string_builder.Append("\r\n");
383 
384   std::string utf8_string = string_builder.ToString().Utf8();
385   output_buffer.Append(utf8_string.data(), utf8_string.length());
386 
387   if (!strcmp(content_encoding, kBinary)) {
388     for (const auto& span : *resource.data)
389       output_buffer.Append(span.data(), SafeCast<wtf_size_t>(span.size()));
390   } else {
391     // FIXME: ideally we would encode the content as a stream without having to
392     // fetch it all.
393     const SharedBuffer::DeprecatedFlatData flat_data(resource.data);
394     const char* data = flat_data.Data();
395     wtf_size_t data_length = SafeCast<wtf_size_t>(flat_data.size());
396     Vector<char> encoded_data;
397     if (!strcmp(content_encoding, kQuotedPrintable)) {
398       QuotedPrintableEncode(data, data_length, false /* is_header */,
399                             encoded_data);
400       output_buffer.Append(encoded_data.data(), encoded_data.size());
401     } else {
402       DCHECK(!strcmp(content_encoding, kBase64));
403       // We are not specifying insertLFs = true below as it would cut the lines
404       // with LFs and MHTML requires CRLFs.
405       Base64Encode(base::as_bytes(base::make_span(data, data_length)),
406                    encoded_data);
407       wtf_size_t index = 0;
408       wtf_size_t encoded_data_length = encoded_data.size();
409       do {
410         wtf_size_t line_length =
411             std::min(encoded_data_length - index, kMaximumLineLength);
412         output_buffer.Append(encoded_data.data() + index, line_length);
413         output_buffer.Append("\r\n", 2u);
414         index += kMaximumLineLength;
415       } while (index < encoded_data_length);
416     }
417   }
418 }
419 
GenerateMHTMLFooterForTesting(const String & boundary,Vector<char> & output_buffer)420 void MHTMLArchive::GenerateMHTMLFooterForTesting(const String& boundary,
421                                                  Vector<char>& output_buffer) {
422   DCHECK(!boundary.IsEmpty());
423   std::string utf8_string = String("\r\n--" + boundary + "--\r\n").Utf8();
424   output_buffer.Append(utf8_string.c_str(), utf8_string.length());
425 }
426 
SetMainResource(ArchiveResource * main_resource)427 void MHTMLArchive::SetMainResource(ArchiveResource* main_resource) {
428   main_resource_ = main_resource;
429 }
430 
AddSubresource(ArchiveResource * resource)431 void MHTMLArchive::AddSubresource(ArchiveResource* resource) {
432   const KURL& url = resource->Url();
433   subresources_.Set(url, resource);
434   KURL cid_uri = MHTMLParser::ConvertContentIDToURI(resource->ContentID());
435   if (cid_uri.IsValid())
436     subresources_.Set(cid_uri, resource);
437 }
438 
SubresourceForURL(const KURL & url) const439 ArchiveResource* MHTMLArchive::SubresourceForURL(const KURL& url) const {
440   return subresources_.at(url.GetString());
441 }
442 
Trace(Visitor * visitor) const443 void MHTMLArchive::Trace(Visitor* visitor) const {
444   visitor->Trace(main_resource_);
445   visitor->Trace(subresources_);
446 }
447 
448 }  // namespace blink
449