1# -*- coding: utf-8 -*-
2"""
3Mimetypes-related utilities
4
5# TODO: reexport stdlib mimetypes?
6"""
7import collections
8import io
9import logging
10import re
11import zipfile
12
13__all__ = ['guess_mimetype']
14
15_logger = logging.getLogger(__name__)
16
17# We define our own guess_mimetype implementation and if magic is available we
18# use it instead.
19
20# discriminants for zip-based file formats
# Top-level directory of an OOXML package -> mimetype of the document kind.
# PresentationML (PowerPoint) packages keep their parts under ``ppt/``.
_ooxml_dirs = {
    'word/': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
    'ppt/': 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
    'xl/': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
}


def _check_ooxml(data):
    """ Tries to identify ``data`` as an Office Open XML document (docx,
    pptx, xlsx).

    :param bytes data: full content of a zip archive
    :returns: the OOXML mimetype matching the archive's content, or
              ``False`` if the archive does not look like an OOXML document
    :raises zipfile.BadZipFile: if ``data`` can not be read as a zip archive
    """
    with io.BytesIO(data) as f, zipfile.ZipFile(f) as z:
        filenames = z.namelist()
        # OOXML documents should have a [Content_Types].xml file for early
        # check that we're interested in this thing at all
        if '[Content_Types].xml' not in filenames:
            return False

        # then there is a directory whose name denotes the type of the file:
        # word, ppt (powerpoint) or xl (excel)
        for dirname, mime in _ooxml_dirs.items():
            if any(entry.startswith(dirname) for entry in filenames):
                return mime

        return False
41
42
43# checks that a string looks kinda sorta like a mimetype
_mime_validator = re.compile(r"""
    [\w-]+ # type-name
    / # subtype separator
    [\w-]+ # registration facet or subtype
    (?:\.[\w-]+)* # optional faceted name
    (?:\+[\w-]+)? # optional structured syntax specifier
    \Z # and nothing else: reject any trailing content
""", re.VERBOSE)


def _check_open_container_format(data):
    """ Tries to identify ``data`` as an OpenDocument-style package, i.e. a
    zip archive declaring its own type in a ``mimetype`` entry.

    :param bytes data: full content of a zip archive
    :returns: the content of the archive's ``mimetype`` entry if it exists
              and looks like a mimetype, ``False`` otherwise
    :raises zipfile.BadZipFile: if ``data`` can not be read as a zip archive
    :raises UnicodeDecodeError: if the ``mimetype`` entry is not ASCII
    """
    # Open Document Format for Office Applications (OpenDocument) Version 1.2
    #
    # Part 3: Packages
    # 3 Packages
    # 3.3 MIME Media Type
    with io.BytesIO(data) as f, zipfile.ZipFile(f) as z:
        # If a MIME media type for a document exists, then an OpenDocument
        # package should contain a file with name "mimetype".
        if 'mimetype' not in z.namelist():
            return False

        # The content of this file shall be the ASCII encoded MIME media type
        # associated with the document.
        declared = z.read('mimetype').decode('ascii')
        # check that it's not too long (RFC6838 § 4.2 restricts type and
        # subtype to 127 characters each + separator, strongly recommends
        # limiting them to 64 but does not require it) and that the *whole*
        # content is a valid-looking mime type: the validator is anchored at
        # its end so a valid prefix followed by arbitrary data is rejected
        if len(declared) < 256 and _mime_validator.match(declared):
            return declared

        return False
74
# NOTE: both patterns are *raw* byte strings on purpose. With re.VERBOSE,
# literal whitespace in a pattern is discarded, so in a non-raw literal the
# bytes 0x09 (tab) and 0x20 (space) would silently vanish from the signature.
# In a raw literal the ``\xNN`` escapes are resolved by the regex engine
# itself and are kept.
_xls_pattern = re.compile(br"""
    \x09\x08\x10\x00\x00\x06\x05\x00
  | \xFD\xFF\xFF\xFF(\x10|\x1F|\x20|\x22|\x23|\x28|\x29)
""", re.VERBOSE)
_ppt_pattern = re.compile(br"""
    \x00\x6E\x1E\xF0
  | \x0F\x00\xE8\x03
  | \xA0\x46\x1D\xF0
  | \xFD\xFF\xFF\xFF(\x0E|\x1C|\x43)\x00\x00\x00
""", re.VERBOSE)


def _check_olecf(data):
    """ Pre-OOXML Office formats are OLE Compound Files which all use the same
    file signature ("magic bytes") and should have a subheader at offset 512
    (0x200).

    Subheaders taken from http://www.garykessler.net/library/file_sigs.html
    according to which Mac office files *may* have different subheaders. We'll
    ignore that.

    :param bytes data: full content of the OLE compound file
    :returns: the mimetype of the Word, Excel or PowerPoint document
              recognised in ``data``, or ``False`` if none was recognised
    """
    offset = 0x200
    if data.startswith(b'\xEC\xA5\xC1\x00', offset):
        return 'application/msword'
    # the _xls_pattern stuff doesn't seem to work correctly (the test file
    # only has a bunch of \xf* at offset 0x200), so _xls_pattern is not used
    # here: looking for the application name anywhere in the file apparently
    # works
    if b'Microsoft Excel' in data:
        return 'application/vnd.ms-excel'
    if _ppt_pattern.match(data, offset):
        return 'application/vnd.ms-powerpoint'
    return False
104
105
def _check_svg(data):
    """ Reports ``data`` as SVG when it contains both an opening ``<svg`` and
    a closing ``/svg>`` marker, anywhere and in any order.

    :param bytes data: content to inspect
    :returns: ``'image/svg+xml'`` if both markers are present, ``None``
              otherwise
    """
    has_opening_tag = b'<svg' in data
    if not has_opening_tag:
        return None
    has_closing_tag = b'/svg>' in data
    if not has_closing_tag:
        return None
    return 'image/svg+xml'
110
111
112# for "master" formats with many subformats, discriminants is a list of
113# functions, tried in order and the first non-falsy value returned is the
114# selected mime type. If all functions return falsy values, the master
115# mimetype is returned.
# mimetype: master mime type, returned when no discriminant gives an answer
# signatures: byte prefixes, any of which selects the entry
# discriminants: callables taking the data and returning a more specific
#                mime type or a falsy value
_Entry = collections.namedtuple('_Entry', ['mimetype', 'signatures', 'discriminants'])
_mime_mappings = (
    # pdf
    _Entry('application/pdf', [b'%PDF'], []),
    # jpg, jpeg (JFIF, EXIF and raw variants), png, gif, bmp
    _Entry('image/jpeg', [
        b'\xFF\xD8\xFF\xE0',
        b'\xFF\xD8\xFF\xE2',
        b'\xFF\xD8\xFF\xE3',
        b'\xFF\xD8\xFF\xE1',
        b'\xFF\xD8\xFF\xDB',
        b'\xFF\xD8\xFF\xEE',
    ], []),
    _Entry('image/png', [b'\x89PNG\r\n\x1A\n'], []),
    _Entry('image/gif', [b'GIF87a', b'GIF89a'], []),
    _Entry('image/bmp', [b'BM'], []),
    # anything starting with a tag: only reported as SVG when the SVG check
    # succeeds. The master type must not be SVG itself, otherwise the check
    # would be pointless and every markup document would be reported as an
    # SVG image.
    _Entry('application/xml', [b'<'], [
        _check_svg,
    ]),
    _Entry('image/x-icon', [b'\x00\x00\x01\x00'], []),
    # OLECF files in general (Word, Excel, PPT, default to word because why not?)
    _Entry('application/msword', [b'\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1', b'\x0D\x44\x4F\x43'], [
        _check_olecf,
    ]),
    # zip, but will include jar, odt, ods, odp, docx, xlsx, pptx, apk
    _Entry('application/zip', [b'PK\x03\x04'], [_check_ooxml, _check_open_container_format]),
)
def guess_mimetype(bin_data, default='application/octet-stream'):
    """ Attempts to guess the mime type of the provided binary data, similar
    to but significantly more limited than libmagic

    :param bytes bin_data: binary data to try and guess a mime type for
    :param str default: value returned when no known signature matches
    :returns: matched mimetype or ``default`` if none matched
    """
    # by default, guess the type using the magic number of file hex signature (like magic, but more limited)
    # see http://www.filesignatures.net/ for file signatures
    for entry in _mime_mappings:
        # startswith() accepts a tuple of candidate prefixes
        if not bin_data.startswith(tuple(entry.signatures)):
            continue

        for discriminant in entry.discriminants:
            try:
                guess = discriminant(bin_data)
            except Exception:
                # log-and-next: a broken sub-checker must not prevent the
                # following ones (or the master type) from being used
                _logger.getChild('guess_mimetype').warning(
                    "Sub-checker '%s' of type '%s' failed",
                    discriminant.__name__, entry.mimetype,
                    exc_info=True
                )
                continue
            if guess:
                return guess

        # if no discriminant or no discriminant matches, return
        # primary mime type
        return entry.mimetype
    return default
163
164
try:
    import magic
except ImportError:
    magic = None
else:
    # Two unrelated Python packages are importable as 'magic' and their APIs
    # are incompatible, so look at what the imported module provides.

    if hasattr(magic, 'from_buffer'):
        # magic from pypi https://pypi.python.org/pypi/python-magic/
        def guess_mimetype(bin_data, default=None):
            """ Guesses the mime type of ``bin_data`` using python-magic,
            ``default`` is accepted for compatibility and ignored.
            """
            return magic.from_buffer(bin_data, mime=True)

    elif hasattr(magic, 'open'):
        # magic from file(1) https://packages.debian.org/squeeze/python-magic
        ms = magic.open(magic.MAGIC_MIME_TYPE)
        ms.load()

        def guess_mimetype(bin_data, default=None):
            """ Guesses the mime type of ``bin_data`` using the file(1)
            bindings, ``default`` is accepted for compatibility and ignored.
            """
            return ms.buffer(bin_data)
180
def neuter_mimetype(mimetype, user):
    """ Downgrades markup-like mimetypes to ``text/plain`` for users which
    are not the system user.

    A mimetype is considered markup-like when it contains ``ht``, ``xml`` or
    ``svg``. ``user`` is only consulted for such mimetypes.

    :param str mimetype: mimetype to sanitize
    :param user: object exposing an ``_is_system()`` method
    :returns: ``'text/plain'`` for a markup-like mimetype and a non-system
              user, ``mimetype`` unchanged otherwise
    """
    markers = ('ht', 'xml', 'svg')
    if not any(marker in mimetype for marker in markers):
        return mimetype
    if user._is_system():
        return mimetype
    return 'text/plain'
186