1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 // Detecting mime types is a tricky business because we need to balance
6 // compatibility concerns with security issues. Here is a survey of how other
7 // browsers behave and then a description of how we intend to behave.
8 //
9 // HTML payload, no Content-Type header:
10 // * IE 7: Render as HTML
11 // * Firefox 2: Render as HTML
12 // * Safari 3: Render as HTML
13 // * Opera 9: Render as HTML
14 //
15 // Here the choice seems clear:
16 // => Chrome: Render as HTML
17 //
18 // HTML payload, Content-Type: "text/plain":
19 // * IE 7: Render as HTML
20 // * Firefox 2: Render as text
21 // * Safari 3: Render as text (Note: Safari will Render as HTML if the URL
22 // has an HTML extension)
23 // * Opera 9: Render as text
24 //
25 // Here we choose to follow the majority (and break some compatibility with IE).
26 // Many folks dislike IE's behavior here.
27 // => Chrome: Render as text
28 // We generalize this as follows. If the Content-Type header is text/plain
29 // we won't detect dangerous mime types (those that can execute script).
30 //
31 // HTML payload, Content-Type: "application/octet-stream":
32 // * IE 7: Render as HTML
33 // * Firefox 2: Download as application/octet-stream
34 // * Safari 3: Render as HTML
35 // * Opera 9: Render as HTML
36 //
37 // We follow Firefox.
38 // => Chrome: Download as application/octet-stream
39 // One factor in this decision is that IIS 4 and 5 will send
40 // application/octet-stream for .xhtml files (because they don't recognize
41 // the extension). We did some experiments and it looks like this doesn't occur
42 // very often on the web. We choose the more secure option.
43 //
44 // GIF payload, no Content-Type header:
45 // * IE 7: Render as GIF
46 // * Firefox 2: Render as GIF
47 // * Safari 3: Download as Unknown (Note: Safari will Render as GIF if the
//   URL has a GIF extension)
49 // * Opera 9: Render as GIF
50 //
51 // The choice is clear.
52 // => Chrome: Render as GIF
53 // Once we decide to render HTML without a Content-Type header, there isn't much
54 // reason not to render GIFs.
55 //
56 // GIF payload, Content-Type: "text/plain":
57 // * IE 7: Render as GIF
// * Firefox 2: Download as application/octet-stream (Note: Firefox will
//   Download as GIF if the URL has a GIF extension)
// * Safari 3: Download as Unknown (Note: Safari will Render as GIF if the
//   URL has a GIF extension)
62 // * Opera 9: Render as GIF
63 //
64 // Displaying as text/plain makes little sense as the content will look like
65 // gibberish. Here, we could change our minds and download.
66 // => Chrome: Render as GIF
67 //
68 // GIF payload, Content-Type: "application/octet-stream":
69 // * IE 7: Render as GIF
// * Firefox 2: Download as application/octet-stream (Note: Firefox will
//   Download as GIF if the URL has a GIF extension)
// * Safari 3: Download as Unknown (Note: Safari will Render as GIF if the
//   URL has a GIF extension)
74 // * Opera 9: Render as GIF
75 //
76 // We used to render as GIF here, but the problem is that some sites want to
77 // trigger downloads by sending application/octet-stream (even though they
78 // should be sending Content-Disposition: attachment). Although it is safe
79 // to render as GIF from a security perspective, we actually get better
// compatibility if we don't sniff from application/octet-stream at all.
81 // => Chrome: Download as application/octet-stream
82 //
83 // Note that our definition of HTML payload is much stricter than IE's
84 // definition and roughly the same as Firefox's definition.
85
86 #include <stdint.h>
87 #include <string>
88
89 #include "net/base/mime_sniffer.h"
90
91 #include "base/containers/span.h"
92 #include "base/logging.h"
93 #include "base/stl_util.h"
94 #include "base/strings/string_util.h"
95 #include "url/gurl.h"
96
97 namespace net {
98
// Upper bound on the length of any magic number in this file, i.e. how many
// leading content bytes are needed before every table entry can be tested.
// MatchMagicNumber() DCHECKs each entry against this value, so raise it when
// adding a longer magic number.
static constexpr size_t kBytesRequiredForMagic = 42;
102
// One row of a sniffing table: a byte pattern and the type reported when
// content starts with that pattern.
struct MagicNumber {
  // Value stored in the result string when the pattern matches.
  const char* const mime_type;

  // The pattern bytes. May contain embedded '\0' bytes, so the length is
  // carried separately in |magic_len|.
  const char* const magic;

  // Number of bytes of |magic| to compare.
  size_t magic_len;

  // True if |magic| is text that is matched case-insensitively.
  bool is_string;

  // Optional per-byte mask; if set, must have same length as |magic|.
  const char* const mask;
};
110
// Builds a MagicNumber entry that is compared byte-for-byte, with '.' in
// |magic| matching any single byte (see MagicCmp()). |magic| must be a string
// literal so that sizeof() yields its size; the literal's terminating '\0' is
// not part of the pattern. No mask is attached.
#define MAGIC_NUMBER(mime_type, magic) \
  { (mime_type), (magic), sizeof(magic) - 1, false, nullptr }
113
// Compile-time guard: instantiating this class with two different sizes fails
// the build. When the sizes agree, the shared value is available as SIZES.
template <int MagicSize, int MaskSize>
class VerifySizes {
  static_assert(MagicSize == MaskSize, "sizes must be equal");

 public:
  enum { SIZES = MagicSize };
};

// Yields sizeof(magic), but only compiles if |mask| has exactly the same size.
#define verified_sizeof(magic, mask) \
  VerifySizes<sizeof(magic), sizeof(mask)>::SIZES
124
// Builds a MagicNumber entry whose content bytes are ANDed with |mask| before
// being compared with |magic| (see MagicMaskCmp()). Both arguments must be
// string literals of equal size; a mismatch is rejected at compile time.
#define MAGIC_MASK(mime_type, magic, mask) \
  { (mime_type), (magic), verified_sizeof(magic, mask) - 1, false, (mask) }
127
// Builds a MagicNumber entry that is matched as text. Magic strings are case
// insensitive and must not include '\0' characters. |magic| must be a string
// literal so that sizeof() yields its size; the terminating '\0' is excluded
// from the pattern length.
#define MAGIC_STRING(mime_type, magic) \
  { (mime_type), (magic), sizeof(magic) - 1, true, nullptr }
131
132 static const MagicNumber kMagicNumbers[] = {
133 // Source: HTML 5 specification
134 MAGIC_NUMBER("application/pdf", "%PDF-"),
135 MAGIC_NUMBER("application/postscript", "%!PS-Adobe-"),
136 MAGIC_NUMBER("image/gif", "GIF87a"),
137 MAGIC_NUMBER("image/gif", "GIF89a"),
138 MAGIC_NUMBER("image/png", "\x89" "PNG\x0D\x0A\x1A\x0A"),
139 MAGIC_NUMBER("image/jpeg", "\xFF\xD8\xFF"),
140 MAGIC_NUMBER("image/bmp", "BM"),
141 // Source: Mozilla
142 MAGIC_NUMBER("text/plain", "#!"), // Script
143 MAGIC_NUMBER("text/plain", "%!"), // Script, similar to PS
144 MAGIC_NUMBER("text/plain", "From"),
145 MAGIC_NUMBER("text/plain", ">From"),
146 // Chrome specific
147 MAGIC_NUMBER("application/x-gzip", "\x1F\x8B\x08"),
148 MAGIC_NUMBER("audio/x-pn-realaudio", "\x2E\x52\x4D\x46"),
149 MAGIC_NUMBER("video/x-ms-asf",
150 "\x30\x26\xB2\x75\x8E\x66\xCF\x11\xA6\xD9\x00\xAA\x00\x62\xCE\x6C"),
151 MAGIC_NUMBER("image/tiff", "I I"),
152 MAGIC_NUMBER("image/tiff", "II*"),
153 MAGIC_NUMBER("image/tiff", "MM\x00*"),
154 MAGIC_NUMBER("audio/mpeg", "ID3"),
155 MAGIC_NUMBER("image/webp", "RIFF....WEBPVP"),
156 MAGIC_NUMBER("video/webm", "\x1A\x45\xDF\xA3"),
157 MAGIC_NUMBER("application/zip", "PK\x03\x04"),
158 MAGIC_NUMBER("application/x-rar-compressed", "Rar!\x1A\x07\x00"),
159 MAGIC_NUMBER("application/x-msmetafile", "\xD7\xCD\xC6\x9A"),
160 MAGIC_NUMBER("application/octet-stream", "MZ"), // EXE
161 // Sniffing for Flash:
162 //
163 // MAGIC_NUMBER("application/x-shockwave-flash", "CWS"),
164 // MAGIC_NUMBER("application/x-shockwave-flash", "FLV"),
165 // MAGIC_NUMBER("application/x-shockwave-flash", "FWS"),
166 //
167 // Including these magic number for Flash is a trade off.
168 //
169 // Pros:
170 // * Flash is an important and popular file format
171 //
172 // Cons:
173 // * These patterns are fairly weak
174 // * If we mistakenly decide something is Flash, we will execute it
175 // in the origin of an unsuspecting site. This could be a security
176 // vulnerability if the site allows users to upload content.
177 //
178 // On balance, we do not include these patterns.
179 };
180
// How many leading content bytes are needed before every entry of
// kOfficeMagicNumbers can be tested.
static constexpr size_t kBytesRequiredForOfficeMagic = 8;
184
185 static const MagicNumber kOfficeMagicNumbers[] = {
186 MAGIC_NUMBER("CFB", "\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1"),
187 MAGIC_NUMBER("OOXML", "PK\x03\x04"),
188 };
189
// Kind of Office document implied by a URL's file extension.
enum OfficeDocType {
  DOC_TYPE_WORD,
  DOC_TYPE_EXCEL,
  DOC_TYPE_POWERPOINT,
  DOC_TYPE_NONE  // No recognized Office extension.
};
196
197 struct OfficeExtensionType {
198 OfficeDocType doc_type;
199 const char* const extension;
200 size_t extension_len;
201 };
202
// Builds an OfficeExtensionType entry. |extension| must be a string literal so
// that sizeof() yields its size; the terminating '\0' is not counted.
#define OFFICE_EXTENSION(type, extension) \
  { (type), (extension), sizeof(extension) - 1 }
205
206 static const OfficeExtensionType kOfficeExtensionTypes[] = {
207 OFFICE_EXTENSION(DOC_TYPE_WORD, ".doc"),
208 OFFICE_EXTENSION(DOC_TYPE_EXCEL, ".xls"),
209 OFFICE_EXTENSION(DOC_TYPE_POWERPOINT, ".ppt"),
210 OFFICE_EXTENSION(DOC_TYPE_WORD, ".docx"),
211 OFFICE_EXTENSION(DOC_TYPE_EXCEL, ".xlsx"),
212 OFFICE_EXTENSION(DOC_TYPE_POWERPOINT, ".pptx"),
213 };
214
215 static const MagicNumber kExtraMagicNumbers[] = {
216 MAGIC_NUMBER("image/x-xbitmap", "#define"),
217 MAGIC_NUMBER("image/x-icon", "\x00\x00\x01\x00"),
218 MAGIC_NUMBER("audio/wav", "RIFF....WAVEfmt "),
219 MAGIC_NUMBER("video/avi", "RIFF....AVI LIST"),
220 MAGIC_NUMBER("audio/ogg", "OggS\0"),
221 MAGIC_MASK("video/mpeg", "\x00\x00\x01\xB0", "\xFF\xFF\xFF\xF0"),
222 MAGIC_MASK("audio/mpeg", "\xFF\xE0", "\xFF\xE0"),
223 MAGIC_NUMBER("video/3gpp", "....ftyp3g"),
224 MAGIC_NUMBER("video/3gpp", "....ftypavcl"),
225 MAGIC_NUMBER("video/mp4", "....ftyp"),
226 MAGIC_NUMBER("video/quicktime", "....moov"),
227 MAGIC_NUMBER("application/x-shockwave-flash", "CWS"),
228 MAGIC_NUMBER("application/x-shockwave-flash", "FWS"),
229 MAGIC_NUMBER("video/x-flv", "FLV"),
230 MAGIC_NUMBER("audio/x-flac", "fLaC"),
231 // Per https://tools.ietf.org/html/rfc3267#section-8.1
232 MAGIC_NUMBER("audio/amr", "#!AMR\n"),
233
234 // RAW image types.
235 MAGIC_NUMBER("image/x-canon-cr2", "II\x2a\x00\x10\x00\x00\x00CR"),
236 MAGIC_NUMBER("image/x-canon-crw", "II\x1a\x00\x00\x00HEAPCCDR"),
237 MAGIC_NUMBER("image/x-minolta-mrw", "\x00MRM"),
238 MAGIC_NUMBER("image/x-olympus-orf", "MMOR"), // big-endian
239 MAGIC_NUMBER("image/x-olympus-orf", "IIRO"), // little-endian
240 MAGIC_NUMBER("image/x-olympus-orf", "IIRS"), // little-endian
241 MAGIC_NUMBER("image/x-fuji-raf", "FUJIFILMCCD-RAW "),
242 MAGIC_NUMBER("image/x-panasonic-raw",
243 "IIU\x00\x08\x00\x00\x00"), // Panasonic .raw
244 MAGIC_NUMBER("image/x-panasonic-raw",
245 "IIU\x00\x18\x00\x00\x00"), // Panasonic .rw2
246 MAGIC_NUMBER("image/x-phaseone-raw", "MMMMRaw"),
247 MAGIC_NUMBER("image/x-x3f", "FOVb"),
248 };
249
// Our HTML sniffer differs slightly from Mozilla. For example, Mozilla will
// decide that a document that begins "<!DOCTYPE SOAP-ENV:Envelope PUBLIC " is
// HTML, but we will not.

// Builds a case-insensitive "text/html" entry for content that starts with
// "<" immediately followed by |tag|. |tag| must be a string literal.
#define MAGIC_HTML_TAG(tag) \
  MAGIC_STRING("text/html", "<" tag)
256
257 static const MagicNumber kSniffableTags[] = {
258 // XML processing directive. Although this is not an HTML mime type, we sniff
259 // for this in the HTML phase because text/xml is just as powerful as HTML and
260 // we want to leverage our white space skipping technology.
261 MAGIC_NUMBER("text/xml", "<?xml"), // Mozilla
262 // DOCTYPEs
263 MAGIC_HTML_TAG("!DOCTYPE html"), // HTML5 spec
264 // Sniffable tags, ordered by how often they occur in sniffable documents.
265 MAGIC_HTML_TAG("script"), // HTML5 spec, Mozilla
266 MAGIC_HTML_TAG("html"), // HTML5 spec, Mozilla
267 MAGIC_HTML_TAG("!--"),
268 MAGIC_HTML_TAG("head"), // HTML5 spec, Mozilla
269 MAGIC_HTML_TAG("iframe"), // Mozilla
270 MAGIC_HTML_TAG("h1"), // Mozilla
271 MAGIC_HTML_TAG("div"), // Mozilla
272 MAGIC_HTML_TAG("font"), // Mozilla
273 MAGIC_HTML_TAG("table"), // Mozilla
274 MAGIC_HTML_TAG("a"), // Mozilla
275 MAGIC_HTML_TAG("style"), // Mozilla
276 MAGIC_HTML_TAG("title"), // Mozilla
277 MAGIC_HTML_TAG("b"), // Mozilla
278 MAGIC_HTML_TAG("body"), // Mozilla
279 MAGIC_HTML_TAG("br"),
280 MAGIC_HTML_TAG("p"), // Mozilla
281 };
282
// Returns true if the first |len| bytes of |content| match |magic_entry|.
// A '.' in |magic_entry| is a wildcard that accepts any single content byte,
// which lets a pattern skip over bytes it does not care about. Both buffers
// must hold at least |len| bytes.
static bool MagicCmp(const char* magic_entry, const char* content, size_t len) {
  for (size_t i = 0; i < len; ++i) {
    const char expected = magic_entry[i];
    if (expected == '.')
      continue;  // Wildcard position.
    if (expected != content[i])
      return false;
  }
  return true;
}
295
// Like MagicCmp(), except that every content byte is ANDed with the
// corresponding byte of |mask| before being compared, so that bits the
// pattern does not care about are ignored. |magic_entry|, |content| and |mask|
// must each hold at least |len| bytes. A '.' in |magic_entry| still accepts
// any byte.
static bool MagicMaskCmp(const char* magic_entry,
                         const char* content,
                         size_t len,
                         const char* mask) {
  for (size_t i = 0; i < len; ++i) {
    if (magic_entry[i] == '.')
      continue;  // Wildcard position.
    const int masked_content = mask[i] & content[i];
    if (magic_entry[i] != masked_content)
      return false;
  }
  return true;
}
312
MatchMagicNumber(const char * content,size_t size,const MagicNumber & magic_entry,std::string * result)313 static bool MatchMagicNumber(const char* content,
314 size_t size,
315 const MagicNumber& magic_entry,
316 std::string* result) {
317 const size_t len = magic_entry.magic_len;
318
319 // Keep kBytesRequiredForMagic honest.
320 DCHECK_LE(len, kBytesRequiredForMagic);
321
322 // To compare with magic strings, we need to compute strlen(content), but
323 // content might not actually have a null terminator. In that case, we
324 // pretend the length is content_size.
325 const char* end = static_cast<const char*>(memchr(content, '\0', size));
326 const size_t content_strlen =
327 (end != nullptr) ? static_cast<size_t>(end - content) : size;
328
329 bool match = false;
330 if (magic_entry.is_string) {
331 if (content_strlen >= len) {
332 // Do a case-insensitive prefix comparison.
333 DCHECK_EQ(strlen(magic_entry.magic), len);
334 match = base::EqualsCaseInsensitiveASCII(magic_entry.magic,
335 base::StringPiece(content, len));
336 }
337 } else {
338 if (size >= len) {
339 if (!magic_entry.mask) {
340 match = MagicCmp(magic_entry.magic, content, len);
341 } else {
342 match = MagicMaskCmp(magic_entry.magic, content, len, magic_entry.mask);
343 }
344 }
345 }
346
347 if (match) {
348 result->assign(magic_entry.mime_type);
349 return true;
350 }
351 return false;
352 }
353
CheckForMagicNumbers(const char * content,size_t size,base::span<const MagicNumber> magic_numbers,std::string * result)354 static bool CheckForMagicNumbers(const char* content,
355 size_t size,
356 base::span<const MagicNumber> magic_numbers,
357 std::string* result) {
358 for (const MagicNumber& magic : magic_numbers) {
359 if (MatchMagicNumber(content, size, magic, result))
360 return true;
361 }
362 return false;
363 }
364
365 // Truncates |size| to |max_size| and returns true if |size| is at least
366 // |max_size|.
TruncateSize(const size_t max_size,size_t * size)367 static bool TruncateSize(const size_t max_size, size_t* size) {
368 // Keep kMaxBytesToSniff honest.
369 DCHECK_LE(static_cast<int>(max_size), kMaxBytesToSniff);
370
371 if (*size >= max_size) {
372 *size = max_size;
373 return true;
374 }
375 return false;
376 }
377
378 // Returns true and sets result if the content appears to be HTML.
379 // Clears have_enough_content if more data could possibly change the result.
SniffForHTML(const char * content,size_t size,bool * have_enough_content,std::string * result)380 static bool SniffForHTML(const char* content,
381 size_t size,
382 bool* have_enough_content,
383 std::string* result) {
384 // For HTML, we are willing to consider up to 512 bytes. This may be overly
385 // conservative as IE only considers 256.
386 *have_enough_content &= TruncateSize(512, &size);
387
388 // We adopt a strategy similar to that used by Mozilla to sniff HTML tags,
389 // but with some modifications to better match the HTML5 spec.
390 const char* const end = content + size;
391 const char* pos;
392 for (pos = content; pos < end; ++pos) {
393 if (!base::IsAsciiWhitespace(*pos))
394 break;
395 }
396 // |pos| now points to first non-whitespace character (or at end).
397 return CheckForMagicNumbers(pos, end - pos, kSniffableTags, result);
398 }
399
400 // Returns true and sets result if the content matches any of kMagicNumbers.
401 // Clears have_enough_content if more data could possibly change the result.
SniffForMagicNumbers(const char * content,size_t size,bool * have_enough_content,std::string * result)402 static bool SniffForMagicNumbers(const char* content,
403 size_t size,
404 bool* have_enough_content,
405 std::string* result) {
406 *have_enough_content &= TruncateSize(kBytesRequiredForMagic, &size);
407
408 // Check our big table of Magic Numbers
409 return CheckForMagicNumbers(content, size, kMagicNumbers, result);
410 }
411
412 // Returns true and sets result if the content matches any of
413 // kOfficeMagicNumbers, and the URL has the proper extension.
414 // Clears |have_enough_content| if more data could possibly change the result.
SniffForOfficeDocs(const char * content,size_t size,const GURL & url,bool * have_enough_content,std::string * result)415 static bool SniffForOfficeDocs(const char* content,
416 size_t size,
417 const GURL& url,
418 bool* have_enough_content,
419 std::string* result) {
420 *have_enough_content &= TruncateSize(kBytesRequiredForOfficeMagic, &size);
421
422 // Check our table of magic numbers for Office file types.
423 std::string office_version;
424 if (!CheckForMagicNumbers(content, size, kOfficeMagicNumbers,
425 &office_version))
426 return false;
427
428 OfficeDocType type = DOC_TYPE_NONE;
429 base::StringPiece url_path = url.path_piece();
430 for (const auto& office_extension : kOfficeExtensionTypes) {
431 if (url_path.length() < office_extension.extension_len)
432 continue;
433
434 base::StringPiece extension =
435 url_path.substr(url_path.length() - office_extension.extension_len);
436 if (base::EqualsCaseInsensitiveASCII(
437 extension, base::StringPiece(office_extension.extension,
438 office_extension.extension_len))) {
439 type = office_extension.doc_type;
440 break;
441 }
442 }
443
444 if (type == DOC_TYPE_NONE)
445 return false;
446
447 if (office_version == "CFB") {
448 switch (type) {
449 case DOC_TYPE_WORD:
450 *result = "application/msword";
451 return true;
452 case DOC_TYPE_EXCEL:
453 *result = "application/vnd.ms-excel";
454 return true;
455 case DOC_TYPE_POWERPOINT:
456 *result = "application/vnd.ms-powerpoint";
457 return true;
458 case DOC_TYPE_NONE:
459 NOTREACHED();
460 return false;
461 }
462 } else if (office_version == "OOXML") {
463 switch (type) {
464 case DOC_TYPE_WORD:
465 *result = "application/vnd.openxmlformats-officedocument."
466 "wordprocessingml.document";
467 return true;
468 case DOC_TYPE_EXCEL:
469 *result = "application/vnd.openxmlformats-officedocument."
470 "spreadsheetml.sheet";
471 return true;
472 case DOC_TYPE_POWERPOINT:
473 *result = "application/vnd.openxmlformats-officedocument."
474 "presentationml.presentation";
475 return true;
476 case DOC_TYPE_NONE:
477 NOTREACHED();
478 return false;
479 }
480 }
481
482 NOTREACHED();
483 return false;
484 }
485
// Returns true if |type_hint| is exactly one of the Microsoft Office MIME
// types (official or commonly seen variants) listed below. The comparison is
// case-sensitive.
static bool IsOfficeType(const std::string& type_hint) {
  static const char* const kOfficeMimeTypes[] = {
      "application/msword",
      "application/vnd.ms-excel",
      "application/vnd.ms-powerpoint",
      "application/vnd.openxmlformats-officedocument."
      "wordprocessingml.document",
      "application/vnd.openxmlformats-officedocument."
      "spreadsheetml.sheet",
      "application/vnd.openxmlformats-officedocument."
      "presentationml.presentation",
      "application/vnd.ms-excel.sheet.macroenabled.12",
      "application/vnd.ms-word.document.macroenabled.12",
      "application/vnd.ms-powerpoint.presentation."
      "macroenabled.12",
      "application/mspowerpoint",
      "application/msexcel",
      "application/vnd.ms-word",
      "application/vnd.ms-word.document.12",
      "application/vnd.msword",
  };
  for (const char* const office_type : kOfficeMimeTypes) {
    if (type_hint == office_type)
      return true;
  }
  return false;
}
506
507 // This function checks for files that have a Microsoft Office MIME type
508 // set, but are not actually Office files.
509 //
510 // If this is not actually an Office file, |*result| is set to
511 // "application/octet-stream", otherwise it is not modified.
512 //
513 // Returns false if additional data is required to determine the file type, or
514 // true if there is enough data to make a decision.
SniffForInvalidOfficeDocs(const char * content,size_t size,const GURL & url,std::string * result)515 static bool SniffForInvalidOfficeDocs(const char* content,
516 size_t size,
517 const GURL& url,
518 std::string* result) {
519 if (!TruncateSize(kBytesRequiredForOfficeMagic, &size))
520 return false;
521
522 // Check our table of magic numbers for Office file types. If it does not
523 // match one, the MIME type was invalid. Set it instead to a safe value.
524 std::string office_version;
525 if (!CheckForMagicNumbers(content, size, kOfficeMagicNumbers,
526 &office_version)) {
527 *result = "application/octet-stream";
528 }
529
530 // We have enough information to determine if this was a Microsoft Office
531 // document or not, so sniffing is completed.
532 return true;
533 }
534
535 // Byte order marks
536 static const MagicNumber kMagicXML[] = {
537 MAGIC_STRING("application/atom+xml", "<feed"),
538 MAGIC_STRING("application/rss+xml", "<rss"), // UTF-8
539 };
540
541 // Returns true and sets result if the content appears to contain XHTML or a
542 // feed.
543 // Clears have_enough_content if more data could possibly change the result.
544 //
545 // TODO(evanm): this is similar but more conservative than what Safari does,
546 // while HTML5 has a different recommendation -- what should we do?
547 // TODO(evanm): this is incorrect for documents whose encoding isn't a superset
548 // of ASCII -- do we care?
SniffXML(const char * content,size_t size,bool * have_enough_content,std::string * result)549 static bool SniffXML(const char* content,
550 size_t size,
551 bool* have_enough_content,
552 std::string* result) {
553 // We allow at most 300 bytes of content before we expect the opening tag.
554 *have_enough_content &= TruncateSize(300, &size);
555 const char* pos = content;
556 const char* const end = content + size;
557
558 // This loop iterates through tag-looking offsets in the file.
559 // We want to skip XML processing instructions (of the form "<?xml ...")
560 // and stop at the first "plain" tag, then make a decision on the mime-type
561 // based on the name (or possibly attributes) of that tag.
562 const int kMaxTagIterations = 5;
563 for (int i = 0; i < kMaxTagIterations && pos < end; ++i) {
564 pos = reinterpret_cast<const char*>(memchr(pos, '<', end - pos));
565 if (!pos)
566 return false;
567
568 static constexpr base::StringPiece kXmlPrefix("<?xml");
569 static constexpr base::StringPiece kDocTypePrefix("<!DOCTYPE");
570
571 base::StringPiece current(pos, end - pos);
572 if (base::EqualsCaseInsensitiveASCII(current.substr(0, kXmlPrefix.size()),
573 kXmlPrefix)) {
574 // Skip XML declarations.
575 ++pos;
576 continue;
577 }
578
579 if (base::EqualsCaseInsensitiveASCII(
580 current.substr(0, kDocTypePrefix.size()), kDocTypePrefix)) {
581 // Skip DOCTYPE declarations.
582 ++pos;
583 continue;
584 }
585
586 if (CheckForMagicNumbers(pos, end - pos, kMagicXML, result))
587 return true;
588
589 // TODO(evanm): handle RSS 1.0, which is an RDF format and more difficult
590 // to identify.
591
592 // If we get here, we've hit an initial tag that hasn't matched one of the
593 // above tests. Abort.
594 return true;
595 }
596
597 // We iterated too far without finding a start tag.
598 // If we have more content to look at, we aren't going to change our mind by
599 // seeing more bytes from the network.
600 return pos < end;
601 }
602
603 // Byte order marks
604 static const MagicNumber kByteOrderMark[] = {
605 MAGIC_NUMBER("text/plain", "\xFE\xFF"), // UTF-16BE
606 MAGIC_NUMBER("text/plain", "\xFF\xFE"), // UTF-16LE
607 MAGIC_NUMBER("text/plain", "\xEF\xBB\xBF"), // UTF-8
608 };
609
610 // Returns true and sets result to "application/octet-stream" if the content
611 // appears to be binary data. Otherwise, returns false and sets "text/plain".
612 // Clears have_enough_content if more data could possibly change the result.
SniffBinary(const char * content,size_t size,bool * have_enough_content,std::string * result)613 static bool SniffBinary(const char* content,
614 size_t size,
615 bool* have_enough_content,
616 std::string* result) {
617 // There is no concensus about exactly how to sniff for binary content.
618 // * IE 7: Don't sniff for binary looking bytes, but trust the file extension.
619 // * Firefox 3.5: Sniff first 4096 bytes for a binary looking byte.
620 // Here, we side with FF, but with a smaller buffer. This size was chosen
621 // because it is small enough to comfortably fit into a single packet (after
622 // allowing for headers) and yet large enough to account for binary formats
623 // that have a significant amount of ASCII at the beginning (crbug.com/15314).
624 const bool is_truncated = TruncateSize(kMaxBytesToSniff, &size);
625
626 // First, we look for a BOM.
627 std::string unused;
628 if (CheckForMagicNumbers(content, size, kByteOrderMark, &unused)) {
629 // If there is BOM, we think the buffer is not binary.
630 result->assign("text/plain");
631 return false;
632 }
633
634 // Next we look to see if any of the bytes "look binary."
635 if (LooksLikeBinary(content, size)) {
636 result->assign("application/octet-stream");
637 return true;
638 }
639
640 // No evidence either way. Default to non-binary and, if truncated, clear
641 // have_enough_content because there could be a binary looking byte in the
642 // truncated data.
643 *have_enough_content &= is_truncated;
644 result->assign("text/plain");
645 return false;
646 }
647
// Returns true if |mime_type| carries no usable information about the content:
// it has no slash at all (which includes the empty string), or it is exactly
// one of a few well-known placeholder types. The comparison is case-sensitive.
static bool IsUnknownMimeType(const std::string& mime_type) {
  // TODO(tc): Maybe reuse some code in net/http/http_response_headers.* here.
  // If we do, please be careful not to alter the semantics at all.

  // Firefox rejects a mime type if it does not contain a slash. Empty mime
  // types, which are as unknown as they get, fall under this rule too.
  if (mime_type.find('/') == std::string::npos)
    return true;

  // The unknown/unknown type is popular and uninformative.
  if (mime_type == "unknown/unknown")
    return true;

  // The second most popular unknown mime type is application/unknown.
  if (mime_type == "application/unknown")
    return true;

  // Firefox rejects a mime type if it is exactly */*.
  return mime_type == "*/*";
}
671
672 // Returns true and sets result if the content appears to be a crx (Chrome
673 // extension) file.
674 // Clears have_enough_content if more data could possibly change the result.
SniffCRX(const char * content,size_t size,const GURL & url,const std::string & type_hint,bool * have_enough_content,std::string * result)675 static bool SniffCRX(const char* content,
676 size_t size,
677 const GURL& url,
678 const std::string& type_hint,
679 bool* have_enough_content,
680 std::string* result) {
681 // Technically, the crx magic number is just Cr24, but the bytes after that
682 // are a version number which changes infrequently. Including it in the
683 // sniffing gives us less room for error. If the version number ever changes,
684 // we can just add an entry to this list.
685 static const struct MagicNumber kCRXMagicNumbers[] = {
686 MAGIC_NUMBER("application/x-chrome-extension", "Cr24\x02\x00\x00\x00"),
687 MAGIC_NUMBER("application/x-chrome-extension", "Cr24\x03\x00\x00\x00")};
688
689 // Only consider files that have the extension ".crx".
690 if (!base::EndsWith(url.path_piece(), ".crx", base::CompareCase::SENSITIVE))
691 return false;
692
693 *have_enough_content &= TruncateSize(kBytesRequiredForMagic, &size);
694 return CheckForMagicNumbers(content, size, kCRXMagicNumbers, result);
695 }
696
ShouldSniffMimeType(const GURL & url,const std::string & mime_type)697 bool ShouldSniffMimeType(const GURL& url, const std::string& mime_type) {
698 bool sniffable_scheme = url.is_empty() ||
699 url.SchemeIsHTTPOrHTTPS() ||
700 #if defined(OS_ANDROID)
701 url.SchemeIs("content") ||
702 #endif
703 url.SchemeIsFile() ||
704 url.SchemeIsFileSystem();
705 if (!sniffable_scheme)
706 return false;
707
708 static const char* const kSniffableTypes[] = {
709 // Many web servers are misconfigured to send text/plain for many
710 // different types of content.
711 "text/plain",
712 // We want to sniff application/octet-stream for
713 // application/x-chrome-extension, but nothing else.
714 "application/octet-stream",
715 // XHTML and Atom/RSS feeds are often served as plain xml instead of
716 // their more specific mime types.
717 "text/xml",
718 "application/xml",
719 // Check for false Microsoft Office MIME types.
720 "application/msword",
721 "application/vnd.ms-excel",
722 "application/vnd.ms-powerpoint",
723 "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
724 "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
725 "application/vnd.openxmlformats-officedocument.presentationml.presentation",
726 "application/vnd.ms-excel.sheet.macroenabled.12",
727 "application/vnd.ms-word.document.macroenabled.12",
728 "application/vnd.ms-powerpoint.presentation.macroenabled.12",
729 "application/mspowerpoint",
730 "application/msexcel",
731 "application/vnd.ms-word",
732 "application/vnd.ms-word.document.12",
733 "application/vnd.msword",
734 };
735 for (const char* const sniffable_type : kSniffableTypes) {
736 if (mime_type == sniffable_type)
737 return true;
738 }
739 if (IsUnknownMimeType(mime_type)) {
740 // The web server didn't specify a content type or specified a mime
741 // type that we ignore.
742 return true;
743 }
744 return false;
745 }
746
SniffMimeType(const char * content,size_t content_size,const GURL & url,const std::string & type_hint,ForceSniffFileUrlsForHtml force_sniff_file_url_for_html,std::string * result)747 bool SniffMimeType(const char* content,
748 size_t content_size,
749 const GURL& url,
750 const std::string& type_hint,
751 ForceSniffFileUrlsForHtml force_sniff_file_url_for_html,
752 std::string* result) {
753 DCHECK_LT(content_size, 1000000U); // sanity check
754 DCHECK(content);
755 DCHECK(result);
756
757 // By default, we assume we have enough content.
758 // Each sniff routine may unset this if it wasn't provided enough content.
759 bool have_enough_content = true;
760
761 // By default, we'll return the type hint.
762 // Each sniff routine may modify this if it has a better guess..
763 result->assign(type_hint);
764
765 // If the file has a Microsoft Office MIME type, we should only check that it
766 // is a valid Office file. Because this is the only reason we sniff files
767 // with a Microsoft Office MIME type, we can return early.
768 if (IsOfficeType(type_hint))
769 return SniffForInvalidOfficeDocs(content, content_size, url, result);
770
771 // Cache information about the type_hint
772 bool hint_is_unknown_mime_type = IsUnknownMimeType(type_hint);
773
774 // First check for HTML, unless it's a file URL and
775 // |allow_sniffing_files_urls_as_html| is false.
776 if (hint_is_unknown_mime_type &&
777 (!url.SchemeIsFile() ||
778 force_sniff_file_url_for_html == ForceSniffFileUrlsForHtml::kEnabled)) {
779 // We're only willing to sniff HTML if the server has not supplied a mime
780 // type, or if the type it did supply indicates that it doesn't know what
781 // the type should be.
782 if (SniffForHTML(content, content_size, &have_enough_content, result))
783 return true; // We succeeded in sniffing HTML. No more content needed.
784 }
785
786 // We're only willing to sniff for binary in 3 cases:
787 // 1. The server has not supplied a mime type.
788 // 2. The type it did supply indicates that it doesn't know what the type
789 // should be.
790 // 3. The type is "text/plain" which is the default on some web servers and
791 // could be indicative of a mis-configuration that we shield the user from.
792 const bool hint_is_text_plain = (type_hint == "text/plain");
793 if (hint_is_unknown_mime_type || hint_is_text_plain) {
794 if (!SniffBinary(content, content_size, &have_enough_content, result)) {
795 // If the server said the content was text/plain and it doesn't appear
796 // to be binary, then we trust it.
797 if (hint_is_text_plain) {
798 return have_enough_content;
799 }
800 }
801 }
802
803 // If we have plain XML, sniff XML subtypes.
804 if (type_hint == "text/xml" || type_hint == "application/xml") {
805 // We're not interested in sniffing these types for images and the like.
806 // Instead, we're looking explicitly for a feed. If we don't find one
807 // we're done and return early.
808 if (SniffXML(content, content_size, &have_enough_content, result))
809 return true;
810 return have_enough_content;
811 }
812
813 // CRX files (Chrome extensions) have a special sniffing algorithm. It is
814 // tighter than the others because we don't have to match legacy behavior.
815 if (SniffCRX(content, content_size, url, type_hint,
816 &have_enough_content, result))
817 return true;
818
819 // Check the file extension and magic numbers to see if this is an Office
820 // document. This needs to be checked before the general magic numbers
821 // because zip files and Office documents (OOXML) have the same magic number.
822 if (SniffForOfficeDocs(content, content_size, url,
823 &have_enough_content, result))
824 return true; // We've matched a magic number. No more content needed.
825
826 // We're not interested in sniffing for magic numbers when the type_hint
827 // is application/octet-stream. Time to bail out.
828 if (type_hint == "application/octet-stream")
829 return have_enough_content;
830
831 // Now we look in our large table of magic numbers to see if we can find
832 // anything that matches the content.
833 if (SniffForMagicNumbers(content, content_size,
834 &have_enough_content, result))
835 return true; // We've matched a magic number. No more content needed.
836
837 return have_enough_content;
838 }
839
SniffMimeTypeFromLocalData(const char * content,size_t size,std::string * result)840 bool SniffMimeTypeFromLocalData(const char* content,
841 size_t size,
842 std::string* result) {
843 // First check the extra table.
844 if (CheckForMagicNumbers(content, size, kExtraMagicNumbers, result))
845 return true;
846 // Finally check the original table.
847 return CheckForMagicNumbers(content, size, kMagicNumbers, result);
848 }
849
// Returns true if any of the first |size| bytes of |content| is a "binary"
// byte: a value below 0x20 other than tab, line feed, form feed, carriage
// return and escape. Returns false for empty input.
bool LooksLikeBinary(const char* content, size_t size) {
  // The definition of "binary bytes" is from the spec at
  // https://mimesniff.spec.whatwg.org/#binary-data-byte
  //
  // All candidates are < 0x20, so the set fits in one 32-bit word with one bit
  // per byte value: bit 0 stands for byte 0x00 and bit 31 for byte 0x1F. A set
  // bit marks a control character that is acceptable in text.
  const uint32_t kTextControlBits = (1u << '\t') | (1u << '\n') |
                                    (1u << '\r') | (1u << '\f') |
                                    (1u << '\x1b');

  const char* const end = content + size;
  for (const char* cursor = content; cursor != end; ++cursor) {
    const uint8_t byte = static_cast<uint8_t>(*cursor);
    if (byte >= 0x20)
      continue;  // Never counts as binary.
    if (!(kTextControlBits & (1u << byte)))
      return true;
  }
  return false;
}
867
868 } // namespace net
869