1 /*
2 * Copyright (C) 2011 Google Inc. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are
6 * met:
7 *
8 * * Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above
11 * copyright notice, this list of conditions and the following disclaimer
12 * in the documentation and/or other materials provided with the
13 * distribution.
14 * * Neither the name of Google Inc. nor the names of its
15 * contributors may be used to endorse or promote products derived from
16 * this software without specific prior written permission.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30
31 #include "third_party/blink/renderer/platform/mhtml/mhtml_archive.h"
32
33 #include <stddef.h>
34 #include "base/metrics/histogram_macros.h"
35 #include "build/build_config.h"
36 #include "third_party/blink/public/mojom/loader/mhtml_load_result.mojom-blink.h"
37 #include "third_party/blink/renderer/platform/mhtml/archive_resource.h"
38 #include "third_party/blink/renderer/platform/mhtml/mhtml_parser.h"
39 #include "third_party/blink/renderer/platform/mhtml/serialized_resource.h"
40 #include "third_party/blink/renderer/platform/network/mime/mime_type_registry.h"
41 #include "third_party/blink/renderer/platform/text/date_components.h"
42 #include "third_party/blink/renderer/platform/weborigin/scheme_registry.h"
43 #include "third_party/blink/renderer/platform/wtf/assertions.h"
44 #include "third_party/blink/renderer/platform/wtf/date_math.h"
45 #include "third_party/blink/renderer/platform/wtf/shared_buffer.h"
46 #include "third_party/blink/renderer/platform/wtf/text/ascii_ctype.h"
47 #include "third_party/blink/renderer/platform/wtf/text/base64.h"
48 #include "third_party/blink/renderer/platform/wtf/text/string_builder.h"
49 #include "third_party/blink/renderer/platform/wtf/vector.h"
50
51 namespace blink {
52
53 namespace {
54
55 using blink::mojom::MHTMLLoadResult;
56
57 const wtf_size_t kMaximumLineLength = 76;
58
// Delimiters that open and close an RFC 2047 "encoded-word" using the UTF-8
// charset and the "Q" encoding. The lengths are derived from the literals
// (excluding the terminating NUL) so they cannot drift out of sync if a
// literal is ever edited.
constexpr char kRFC2047EncodingPrefix[] = "=?utf-8?Q?";
constexpr size_t kRFC2047EncodingPrefixLength =
    sizeof(kRFC2047EncodingPrefix) - 1;
constexpr char kRFC2047EncodingSuffix[] = "?=";
constexpr size_t kRFC2047EncodingSuffixLength =
    sizeof(kRFC2047EncodingSuffix) - 1;

// Values written to the Content-Transfer-Encoding header of an MHTML part.
constexpr char kQuotedPrintable[] = "quoted-printable";
constexpr char kBase64[] = "base64";
constexpr char kBinary[] = "binary";
67
68 // Returns the length of a line-ending if one is present starting at
69 // |input[index]| or zero if no line-ending is present at the given |index|.
LengthOfLineEndingAtIndex(const char * input,size_t input_length,size_t index)70 size_t LengthOfLineEndingAtIndex(const char* input,
71 size_t input_length,
72 size_t index) {
73 SECURITY_DCHECK(index < input_length);
74 if (input[index] == '\n')
75 return 1; // Single LF.
76
77 if (input[index] == '\r') {
78 if ((index + 1) == input_length || input[index + 1] != '\n')
79 return 1; // Single CR (Classic Mac OS).
80 return 2; // CR-LF.
81 }
82
83 return 0;
84 }
85
86 // Performs quoted-printable encoding characters, per RFC 2047.
QuotedPrintableEncode(const char * input,wtf_size_t input_length,bool is_header,Vector<char> & out)87 void QuotedPrintableEncode(const char* input,
88 wtf_size_t input_length,
89 bool is_header,
90 Vector<char>& out) {
91 out.clear();
92 out.ReserveCapacity(input_length);
93 if (is_header)
94 out.Append(kRFC2047EncodingPrefix, kRFC2047EncodingPrefixLength);
95 size_t current_line_length = 0;
96 for (size_t i = 0; i < input_length; ++i) {
97 bool is_last_character = (i == input_length - 1);
98 char current_character = input[i];
99 bool requires_encoding = false;
100 // All non-printable ASCII characters and = require encoding.
101 if ((current_character < ' ' || current_character > '~' ||
102 current_character == '=') &&
103 current_character != '\t')
104 requires_encoding = true;
105
106 // Decide if space and tab characters need to be encoded.
107 if (!requires_encoding &&
108 (current_character == '\t' || current_character == ' ')) {
109 if (is_header) {
110 // White space characters should always be encoded if they appear
111 // anywhere in the header.
112 requires_encoding = true;
113 } else {
114 bool end_of_line = is_last_character || LengthOfLineEndingAtIndex(
115 input, input_length, i + 1);
116 requires_encoding = end_of_line;
117 }
118 }
119
120 // End of line should be converted to CR-LF sequences.
121 if (!is_last_character) {
122 size_t length_of_line_ending =
123 LengthOfLineEndingAtIndex(input, input_length, i);
124 if (length_of_line_ending) {
125 out.Append("\r\n", 2);
126 current_line_length = 0;
127 i += (length_of_line_ending -
128 1); // -1 because we'll ++ in the for() above.
129 continue;
130 }
131 }
132
133 size_t length_of_encoded_character = 1;
134 if (requires_encoding)
135 length_of_encoded_character += 2;
136 if (!is_last_character)
137 length_of_encoded_character += 1; // + 1 for the = (soft line break).
138
139 // Insert a soft line break if necessary.
140 size_t max_line_length_for_encoded_content = kMaximumLineLength;
141 if (is_header) {
142 max_line_length_for_encoded_content -= kRFC2047EncodingPrefixLength;
143 max_line_length_for_encoded_content -= kRFC2047EncodingSuffixLength;
144 }
145
146 if (current_line_length + length_of_encoded_character >
147 max_line_length_for_encoded_content) {
148 if (is_header) {
149 out.Append(kRFC2047EncodingSuffix, kRFC2047EncodingSuffixLength);
150 out.Append("\r\n", 2);
151 out.push_back(' ');
152 } else {
153 out.push_back('=');
154 out.Append("\r\n", 2);
155 }
156 current_line_length = 0;
157 if (is_header)
158 out.Append(kRFC2047EncodingPrefix, kRFC2047EncodingPrefixLength);
159 }
160
161 // Finally, insert the actual character(s).
162 if (requires_encoding) {
163 out.push_back('=');
164 out.push_back(UpperNibbleToASCIIHexDigit(current_character));
165 out.push_back(LowerNibbleToASCIIHexDigit(current_character));
166 current_line_length += 3;
167 } else {
168 out.push_back(current_character);
169 current_line_length++;
170 }
171 }
172 if (is_header)
173 out.Append(kRFC2047EncodingSuffix, kRFC2047EncodingSuffixLength);
174 }
175
// Returns |text| unchanged when it consists solely of printable ASCII
// characters. Otherwise returns its UTF-8 bytes wrapped as RFC 2047
// encoded-words (https://tools.ietf.org/html/rfc2047), i.e.
//   =?utf-8?Q?encoded_text?=
// where "utf-8" names the charset and "Q" selects the quoted-printable-like
// encoding that yields 7-bit printable ASCII.
String ConvertToPrintableCharacters(const String& text) {
  // Scan for the first character that is not printable ASCII.
  wtf_size_t position = 0;
  while (position < text.length() && IsASCIIPrintable(text[position]))
    ++position;
  if (position == text.length())
    return text;  // Nothing needs encoding.

  const std::string utf8_bytes = text.Utf8();
  Vector<char> encoded;
  QuotedPrintableEncode(utf8_bytes.c_str(), utf8_bytes.length(),
                        true /* is_header */, encoded);
  return String(encoded.data(), encoded.size());
}
199
200 } // namespace
201
MHTMLArchive()202 MHTMLArchive::MHTMLArchive() : load_result_(MHTMLLoadResult::kInvalidArchive) {}
203
204 // static
ReportLoadResult(MHTMLLoadResult result)205 void MHTMLArchive::ReportLoadResult(MHTMLLoadResult result) {
206 UMA_HISTOGRAM_ENUMERATION("PageSerialization.MhtmlLoading.LoadResult",
207 result);
208 }
209
210 // static
Create(const KURL & url,scoped_refptr<const SharedBuffer> data)211 MHTMLArchive* MHTMLArchive::Create(const KURL& url,
212 scoped_refptr<const SharedBuffer> data) {
213 MHTMLArchive* archive = CreateArchive(url, data);
214 ReportLoadResult(archive->LoadResult());
215 return archive;
216 }
217
218 // static
CreateArchive(const KURL & url,scoped_refptr<const SharedBuffer> data)219 MHTMLArchive* MHTMLArchive::CreateArchive(
220 const KURL& url,
221 scoped_refptr<const SharedBuffer> data) {
222 MHTMLArchive* archive = MakeGarbageCollected<MHTMLArchive>();
223
224 // |data| may be null if archive file is empty.
225 if (!data || data->IsEmpty()) {
226 archive->load_result_ = MHTMLLoadResult::kEmptyFile;
227 return archive;
228 }
229
230 // MHTML pages can only be loaded from local URLs, http/https URLs, and
231 // content URLs(Android specific). The latter is now allowed due to full
232 // sandboxing enforcement on MHTML pages.
233 if (!CanLoadArchive(url)) {
234 archive->load_result_ = MHTMLLoadResult::kUrlSchemeNotAllowed;
235 return archive;
236 }
237
238 MHTMLParser parser(std::move(data));
239 HeapVector<Member<ArchiveResource>> resources = parser.ParseArchive();
240 if (resources.IsEmpty()) {
241 archive->load_result_ = MHTMLLoadResult::kInvalidArchive;
242 return archive;
243 }
244
245 archive->date_ = parser.CreationDate();
246
247 size_t resources_count = resources.size();
248 // The first document suitable resource is the main resource of the top frame.
249 for (ArchiveResource* resource : resources) {
250 if (archive->MainResource()) {
251 archive->AddSubresource(resource);
252 continue;
253 }
254
255 const AtomicString& mime_type = resource->MimeType();
256 bool is_mime_type_suitable_for_main_resource =
257 MIMETypeRegistry::IsSupportedNonImageMIMEType(mime_type);
258 // Want to allow image-only MHTML archives, but retain behavior for other
259 // documents that have already been created expecting the first HTML page to
260 // be considered the main resource.
261 if (resources_count == 1 &&
262 MIMETypeRegistry::IsSupportedImageResourceMIMEType(mime_type)) {
263 is_mime_type_suitable_for_main_resource = true;
264 }
265 // explicitly disallow JS and CSS as the main resource.
266 if (MIMETypeRegistry::IsSupportedJavaScriptMIMEType(mime_type) ||
267 MIMETypeRegistry::IsSupportedStyleSheetMIMEType(mime_type))
268 is_mime_type_suitable_for_main_resource = false;
269
270 if (is_mime_type_suitable_for_main_resource)
271 archive->SetMainResource(resource);
272 else
273 archive->AddSubresource(resource);
274 }
275 if (archive->MainResource())
276 archive->load_result_ = MHTMLLoadResult::kSuccess;
277 else
278 archive->load_result_ = MHTMLLoadResult::kMissingMainResource;
279
280 return archive;
281 }
282
CanLoadArchive(const KURL & url)283 bool MHTMLArchive::CanLoadArchive(const KURL& url) {
284 // MHTML pages can only be loaded from local URLs, http/https URLs, and
285 // content URLs(Android specific). The latter is now allowed due to full
286 // sandboxing enforcement on MHTML pages.
287 if (SchemeRegistry::ShouldTreatURLSchemeAsLocal(url.Protocol()))
288 return true;
289 if (url.ProtocolIsInHTTPFamily())
290 return true;
291 #if defined(OS_ANDROID)
292 if (url.ProtocolIs("content"))
293 return true;
294 #endif
295 return false;
296 }
297
GenerateMHTMLHeader(const String & boundary,const KURL & url,const String & title,const String & mime_type,base::Time date,Vector<char> & output_buffer)298 void MHTMLArchive::GenerateMHTMLHeader(const String& boundary,
299 const KURL& url,
300 const String& title,
301 const String& mime_type,
302 base::Time date,
303 Vector<char>& output_buffer) {
304 DCHECK(!boundary.IsEmpty());
305 DCHECK(!mime_type.IsEmpty());
306
307 String date_string = MakeRFC2822DateString(date, 0);
308
309 StringBuilder string_builder;
310 string_builder.Append("From: <Saved by Blink>\r\n");
311
312 // Add the document URL in the MHTML headers in order to avoid complicated
313 // parsing to locate it in the multipart body headers.
314 string_builder.Append("Snapshot-Content-Location: ");
315 string_builder.Append(url.GetString());
316
317 string_builder.Append("\r\nSubject: ");
318 string_builder.Append(ConvertToPrintableCharacters(title));
319 string_builder.Append("\r\nDate: ");
320 string_builder.Append(date_string);
321 string_builder.Append("\r\nMIME-Version: 1.0\r\n");
322 string_builder.Append("Content-Type: multipart/related;\r\n");
323 string_builder.Append("\ttype=\"");
324 string_builder.Append(mime_type);
325 string_builder.Append("\";\r\n");
326 string_builder.Append("\tboundary=\"");
327 string_builder.Append(boundary);
328 string_builder.Append("\"\r\n\r\n");
329
330 // We use utf8() below instead of ascii() as ascii() replaces CRLFs with ??
331 // (we still only have put ASCII characters in it).
332 DCHECK(string_builder.ToString().ContainsOnlyASCIIOrEmpty());
333 std::string utf8_string = string_builder.ToString().Utf8();
334
335 output_buffer.Append(utf8_string.c_str(), utf8_string.length());
336 }
337
GenerateMHTMLPart(const String & boundary,const String & content_id,EncodingPolicy encoding_policy,const SerializedResource & resource,Vector<char> & output_buffer)338 void MHTMLArchive::GenerateMHTMLPart(const String& boundary,
339 const String& content_id,
340 EncodingPolicy encoding_policy,
341 const SerializedResource& resource,
342 Vector<char>& output_buffer) {
343 DCHECK(!boundary.IsEmpty());
344 DCHECK(content_id.IsEmpty() || content_id[0] == '<');
345
346 StringBuilder string_builder;
347 // Per the spec, the boundary must occur at the beginning of a line.
348 string_builder.Append("\r\n--");
349 string_builder.Append(boundary);
350 string_builder.Append("\r\n");
351
352 string_builder.Append("Content-Type: ");
353 string_builder.Append(resource.mime_type);
354 string_builder.Append("\r\n");
355
356 if (!content_id.IsEmpty()) {
357 string_builder.Append("Content-ID: ");
358 string_builder.Append(content_id);
359 string_builder.Append("\r\n");
360 }
361
362 const char* content_encoding = nullptr;
363 if (encoding_policy == kUseBinaryEncoding)
364 content_encoding = kBinary;
365 else if (MIMETypeRegistry::IsSupportedJavaScriptMIMEType(
366 resource.mime_type) ||
367 MIMETypeRegistry::IsSupportedNonImageMIMEType(resource.mime_type))
368 content_encoding = kQuotedPrintable;
369 else
370 content_encoding = kBase64;
371
372 string_builder.Append("Content-Transfer-Encoding: ");
373 string_builder.Append(content_encoding);
374 string_builder.Append("\r\n");
375
376 if (!resource.url.ProtocolIsAbout()) {
377 string_builder.Append("Content-Location: ");
378 string_builder.Append(resource.url.GetString());
379 string_builder.Append("\r\n");
380 }
381
382 string_builder.Append("\r\n");
383
384 std::string utf8_string = string_builder.ToString().Utf8();
385 output_buffer.Append(utf8_string.data(), utf8_string.length());
386
387 if (!strcmp(content_encoding, kBinary)) {
388 for (const auto& span : *resource.data)
389 output_buffer.Append(span.data(), SafeCast<wtf_size_t>(span.size()));
390 } else {
391 // FIXME: ideally we would encode the content as a stream without having to
392 // fetch it all.
393 const SharedBuffer::DeprecatedFlatData flat_data(resource.data);
394 const char* data = flat_data.Data();
395 wtf_size_t data_length = SafeCast<wtf_size_t>(flat_data.size());
396 Vector<char> encoded_data;
397 if (!strcmp(content_encoding, kQuotedPrintable)) {
398 QuotedPrintableEncode(data, data_length, false /* is_header */,
399 encoded_data);
400 output_buffer.Append(encoded_data.data(), encoded_data.size());
401 } else {
402 DCHECK(!strcmp(content_encoding, kBase64));
403 // We are not specifying insertLFs = true below as it would cut the lines
404 // with LFs and MHTML requires CRLFs.
405 Base64Encode(base::as_bytes(base::make_span(data, data_length)),
406 encoded_data);
407 wtf_size_t index = 0;
408 wtf_size_t encoded_data_length = encoded_data.size();
409 do {
410 wtf_size_t line_length =
411 std::min(encoded_data_length - index, kMaximumLineLength);
412 output_buffer.Append(encoded_data.data() + index, line_length);
413 output_buffer.Append("\r\n", 2u);
414 index += kMaximumLineLength;
415 } while (index < encoded_data_length);
416 }
417 }
418 }
419
GenerateMHTMLFooterForTesting(const String & boundary,Vector<char> & output_buffer)420 void MHTMLArchive::GenerateMHTMLFooterForTesting(const String& boundary,
421 Vector<char>& output_buffer) {
422 DCHECK(!boundary.IsEmpty());
423 std::string utf8_string = String("\r\n--" + boundary + "--\r\n").Utf8();
424 output_buffer.Append(utf8_string.c_str(), utf8_string.length());
425 }
426
SetMainResource(ArchiveResource * main_resource)427 void MHTMLArchive::SetMainResource(ArchiveResource* main_resource) {
428 main_resource_ = main_resource;
429 }
430
AddSubresource(ArchiveResource * resource)431 void MHTMLArchive::AddSubresource(ArchiveResource* resource) {
432 const KURL& url = resource->Url();
433 subresources_.Set(url, resource);
434 KURL cid_uri = MHTMLParser::ConvertContentIDToURI(resource->ContentID());
435 if (cid_uri.IsValid())
436 subresources_.Set(cid_uri, resource);
437 }
438
SubresourceForURL(const KURL & url) const439 ArchiveResource* MHTMLArchive::SubresourceForURL(const KURL& url) const {
440 return subresources_.at(url.GetString());
441 }
442
Trace(Visitor * visitor) const443 void MHTMLArchive::Trace(Visitor* visitor) const {
444 visitor->Trace(main_resource_);
445 visitor->Trace(subresources_);
446 }
447
448 } // namespace blink
449