# -*- coding: utf-8 -*-
"""
Mimetypes-related utilities

# TODO: reexport stdlib mimetypes?
"""
import collections
import io
import logging
import re
import zipfile

__all__ = ['guess_mimetype']

_logger = logging.getLogger(__name__)

# We define our own guess_mimetype implementation and if magic is available we
# use it instead.

# discriminants for zip-based file formats: top-level package directory ->
# mimetype of the corresponding OOXML document kind
_ooxml_dirs = {
    'word/': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
    # PresentationML parts live under "ppt/" (e.g. ppt/presentation.xml)
    'ppt/': 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
    'xl/': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
}


def _check_ooxml(data):
    """ Discriminant for zip archives: detects Office Open XML documents.

    :param bytes data: full content of a zip archive
    :returns: the OOXML mimetype matching the archive content, ``False`` if
              the archive does not look like an OOXML document
    :raises zipfile.BadZipfile: if ``data`` is not a readable zip archive
    """
    with io.BytesIO(data) as f, zipfile.ZipFile(f) as z:
        filenames = z.namelist()
        # OOXML documents should have a [Content_Types].xml file for early
        # check that we're interested in this thing at all
        if '[Content_Types].xml' not in filenames:
            return False

        # then there is a directory whose name denotes the type of the file:
        # word, ppt (powerpoint) or xl (excel)
        for dirname, mime in _ooxml_dirs.items():
            if any(entry.startswith(dirname) for entry in filenames):
                return mime

        return False


# checks that a string looks kinda sorta like a mimetype
# (only anchored at the start: trailing content is not rejected)
_mime_validator = re.compile(r"""
    [\w-]+ # type-name
    / # subtype separator
    [\w-]+ # registration facet or subtype
    (?:\.[\w-]+)* # optional faceted name
    (?:\+[\w-]+)? # optional structured syntax specifier
""", re.VERBOSE)


def _check_open_container_format(data):
    """ Discriminant for zip archives: detects OpenDocument-style packages,
    which advertise their own type through a ``mimetype`` member.

    :param bytes data: full content of a zip archive
    :returns: the mimetype declared by the package, ``False`` if there is
              none or if it does not look like a mimetype
    :raises zipfile.BadZipfile: if ``data`` is not a readable zip archive
    :raises UnicodeDecodeError: if the ``mimetype`` member is not ASCII
    """
    # Open Document Format for Office Applications (OpenDocument) Version 1.2
    #
    # Part 3: Packages
    # 3 Packages
    # 3.3 MIME Media Type
    with io.BytesIO(data) as f, zipfile.ZipFile(f) as z:
        # If a MIME media type for a document exists, then an OpenDocument
        # package should contain a file with name "mimetype".
        if 'mimetype' not in z.namelist():
            return False

        # The content of this file shall be the ASCII encoded MIME media type
        # associated with the document.
        marcel = z.read('mimetype').decode('ascii')
        # check that it's not too long (RFC6838 § 4.2 restricts type and
        # subtype to 127 characters each + separator, strongly recommends
        # limiting them to 64 but does not require it) and that it looks a lot
        # like a valid mime type
        if len(marcel) < 256 and _mime_validator.match(marcel):
            return marcel

        return False


# NOTE: these patterns are compiled in VERBOSE mode, so they must be *raw*
# bytes literals: with a regular literal, "\x09" and "\x20" become actual TAB
# and SPACE bytes which VERBOSE mode silently discards as insignificant
# whitespace. In a raw literal the regex engine interprets the escapes itself.
_xls_pattern = re.compile(br"""
    \x09\x08\x10\x00\x00\x06\x05\x00
  | \xFD\xFF\xFF\xFF(\x10|\x1F|\x20|"|\#|\(|\))
""", re.VERBOSE)
_ppt_pattern = re.compile(br"""
    \x00\x6E\x1E\xF0
  | \x0F\x00\xE8\x03
  | \xA0\x46\x1D\xF0
  | \xFD\xFF\xFF\xFF(\x0E|\x1C|\x43)\x00\x00\x00
""", re.VERBOSE)


def _check_olecf(data):
    """ Pre-OOXML Office formats are OLE Compound Files which all use the same
    file signature ("magic bytes") and should have a subheader at offset 512
    (0x200).

    Subheaders taken from http://www.garykessler.net/library/file_sigs.html
    according to which Mac office files *may* have different subheaders. We'll
    ignore that.

    :param bytes data: full content of the file
    :returns: the mimetype of the recognised legacy Office format, ``False``
              if none was recognised
    """
    offset = 0x200
    if data.startswith(b'\xEC\xA5\xC1\x00', offset):
        return 'application/msword'
    # the _xls_pattern stuff doesn't seem to work correctly (the test file
    # only has a bunch of \xf* at offset 0x200), that apparently works
    elif b'Microsoft Excel' in data:
        return 'application/vnd.ms-excel'
    elif _ppt_pattern.match(data, offset):
        return 'application/vnd.ms-powerpoint'
    return False


def _check_svg(data):
    """This simply checks the existence of the opening and ending SVG tags

    :param bytes data: full content of the file
    :returns: ``'image/svg+xml'`` if both tags are present, ``False``
              otherwise
    """
    if b'<svg' in data and b'/svg>' in data:
        return 'image/svg+xml'
    return False


# for "master" formats with many subformats, discriminants is a list of
# functions, tried in order and the first non-falsy value returned is the
# selected mime type. If all functions return falsy values, the master
# mimetype is returned.
_Entry = collections.namedtuple('_Entry', ['mimetype', 'signatures', 'discriminants'])
_mime_mappings = (
    # pdf
    _Entry('application/pdf', [b'%PDF'], []),
    # jpg, jpeg, png, gif, bmp
    _Entry('image/jpeg', [b'\xFF\xD8\xFF\xE0', b'\xFF\xD8\xFF\xE2', b'\xFF\xD8\xFF\xE3', b'\xFF\xD8\xFF\xE1'], []),
    _Entry('image/png', [b'\x89PNG\r\n\x1A\n'], []),
    _Entry('image/gif', [b'GIF87a', b'GIF89a'], []),
    _Entry('image/bmp', [b'BM'], []),
    _Entry('image/svg+xml', [b'<'], [
        _check_svg,
    ]),
    _Entry('image/x-icon', [b'\x00\x00\x01\x00'], []),
    # OLECF files in general (Word, Excel, PPT, default to word because why not?)
    _Entry('application/msword', [b'\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1', b'\x0D\x44\x4F\x43'], [
        _check_olecf
    ]),
    # zip, but will include jar, odt, ods, odp, docx, xlsx, pptx, apk
    _Entry('application/zip', [b'PK\x03\x04'], [_check_ooxml, _check_open_container_format]),
)


def guess_mimetype(bin_data, default='application/octet-stream'):
    """ Attempts to guess the mime type of the provided binary data, similar
    to but significantly more limited than libmagic

    :param bytes bin_data: binary data to try and guess a mime type for
    :param str default: value returned when no known signature matches
    :returns: matched mimetype or ``default`` if none matched
    """
    # by default, guess the type using the magic number of file hex signature (like magic, but more limited)
    # see http://www.filesignatures.net/ for file signatures
    for entry in _mime_mappings:
        # startswith() accepts a tuple: true as soon as one signature matches
        if not bin_data.startswith(tuple(entry.signatures)):
            continue

        for discriminant in entry.discriminants:
            try:
                guess = discriminant(bin_data)
            except Exception:
                # log-and-next: a broken sub-checker must not prevent
                # falling back to the next one or to the master mimetype
                _logger.getChild('guess_mimetype').warning(
                    "Sub-checker '%s' of type '%s' failed",
                    discriminant.__name__, entry.mimetype,
                    exc_info=True
                )
            else:
                if guess:
                    return guess
        # if no discriminant or no discriminant matches, return
        # primary mime type
        return entry.mimetype
    return default


try:
    import magic
except ImportError:
    magic = None
else:
    # There are 2 python libs named 'magic' with incompatible api.

    # magic from pypi https://pypi.python.org/pypi/python-magic/
    if hasattr(magic, 'from_buffer'):
        def guess_mimetype(bin_data, default=None):
            """ Guess the mimetype of ``bin_data`` through python-magic.

            ``default`` is accepted for signature compatibility only and is
            ignored.
            """
            return magic.from_buffer(bin_data, mime=True)
    # magic from file(1) https://packages.debian.org/squeeze/python-magic
    elif hasattr(magic, 'open'):
        ms = magic.open(magic.MAGIC_MIME_TYPE)
        ms.load()

        def guess_mimetype(bin_data, default=None):
            """ Guess the mimetype of ``bin_data`` through file(1)'s magic
            bindings.

            ``default`` is accepted for signature compatibility only and is
            ignored.
            """
            return ms.buffer(bin_data)


def neuter_mimetype(mimetype, user):
    """ Replace markup-like mimetypes by ``text/plain`` for non-system users.

    :param str mimetype: mimetype to check
    :param user: object exposing an ``_is_system()`` method
    :returns: ``'text/plain'`` if ``mimetype`` contains ``ht``, ``xml`` or
              ``svg`` and ``user._is_system()`` is falsy, otherwise
              ``mimetype`` unchanged
    """
    wrong_type = 'ht' in mimetype or 'xml' in mimetype or 'svg' in mimetype
    if wrong_type and not user._is_system():
        return 'text/plain'
    return mimetype