1"""Guess the MIME type of a file.
2
3This module defines two useful functions:
4
5guess_type(url, strict=True) -- guess the MIME type and encoding of a URL.
6
7guess_extension(type, strict=True) -- guess the extension for a given MIME type.
8
9It also contains the following, for tuning the behavior:
10
11Data:
12
13knownfiles -- list of files to parse
14inited -- flag set when init() has been called
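suffix_map -- dictionary mapping shorthand suffixes to their expanded
  form (e.g. '.tgz' to '.tar.gz')
encodings_map -- dictionary mapping suffixes to encodings
types_map -- dictionary mapping suffixes to (standard) types
common_types -- dictionary mapping suffixes to non-standard but commonly
  found types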
18
19Functions:
20
21init([files]) -- parse a list of files, default knownfiles (on Windows, the
22  default values are taken from the registry)
23read_mime_types(file) -- parse one file, return a dictionary or None
24"""
25
26import os
27import sys
28import posixpath
29import urllib.parse
30
31try:
32    from _winapi import _mimetypes_read_windows_registry
33except ImportError:
34    _mimetypes_read_windows_registry = None
35
36try:
37    import winreg as _winreg
38except ImportError:
39    _winreg = None
40
# Public names of this module.
__all__ = [
    "knownfiles", "inited", "MimeTypes",
    "guess_type", "guess_all_extensions", "guess_extension",
    "add_type", "init", "read_mime_types",
    "suffix_map", "encodings_map", "types_map", "common_types"
]

# Candidate mime.types files.  init() reads every entry that exists as a
# regular file, in list order.
# NOTE(review): "/usr/local/etc/httpd/conf/mime.types" appears twice in this
# list, so that file is parsed twice when present -- confirm whether the
# duplicate is intentional before removing either entry.
knownfiles = [
    "/etc/mime.types",
    "/etc/httpd/mime.types",                    # Mac OS X
    "/etc/httpd/conf/mime.types",               # Apache
    "/etc/apache/mime.types",                   # Apache 1
    "/etc/apache2/mime.types",                  # Apache 2
    "/usr/local/etc/httpd/conf/mime.types",
    "/usr/local/lib/netscape/mime.types",
    "/usr/local/etc/httpd/conf/mime.types",     # Apache 1.2
    "/usr/local/etc/mime.types",                # Apache 1.3
    ]

# Set to True by init(); MimeTypes.__init__() checks it to decide whether
# init() still has to run.
inited = False
# Shared MimeTypes instance behind the module-level functions; assigned by
# init() once the database is fully populated.
_db = None
62
63
class MimeTypes:
    """MIME-types datastore.

    This datastore can handle information from mime.types-style files
    and supports basic determination of MIME type from a filename or
    URL, and can guess a reasonable extension given a MIME type.

    Instance attributes:

    encodings_map -- dictionary mapping suffixes to encodings
    suffix_map -- dictionary mapping suffixes to suffixes
    types_map -- pair of dictionaries (non-strict, strict) mapping
        suffixes to types; index it with a bool
    types_map_inv -- pair of dictionaries (non-strict, strict) mapping
        types to lists of suffixes
    """

    def __init__(self, filenames=(), strict=True):
        """Create a datastore seeded with the module's default tables.

        If the module has not been initialized yet, init() is called
        first.  Each file named in `filenames' is then read with
        read(); `strict' is passed through to read().
        """
        if not inited:
            init()
        self.encodings_map = _encodings_map_default.copy()
        self.suffix_map = _suffix_map_default.copy()
        self.types_map = ({}, {}) # dict for (non-strict, strict)
        self.types_map_inv = ({}, {})
        for ext, mime_type in _types_map_default.items():
            self.add_type(mime_type, ext, True)
        for ext, mime_type in _common_types_default.items():
            self.add_type(mime_type, ext, False)
        for name in filenames:
            self.read(name, strict)

    def add_type(self, type, ext, strict=True):
        """Add a mapping between a type and an extension.

        When the extension is already known, the new
        type will replace the old one. When the type
        is already known the extension will be added
        to the list of known extensions.

        If strict is true, information will be added to
        list of standard types, else to the list of non-standard
        types.
        """
        # `strict' selects one of the two dictionaries in each pair.
        self.types_map[strict][ext] = type
        exts = self.types_map_inv[strict].setdefault(type, [])
        if ext not in exts:
            exts.append(ext)

    def guess_type(self, url, strict=True):
        """Guess the type of a file which is either a URL or a path-like object.

        Return value is a tuple (type, encoding) where type is None if
        the type can't be guessed (no or unknown suffix) or a string
        of the form type/subtype, usable for a MIME Content-type
        header; and encoding is None for no encoding or the name of
        the program used to encode (e.g. compress or gzip).  The
        mappings are table driven.  Encoding suffixes are case
        sensitive; type suffixes are first tried case sensitive, then
        case insensitive.

        The suffixes .tgz, .taz and .tz (case sensitive!) are all
        mapped to '.tar.gz'.  (This is table-driven too, using the
        dictionary suffix_map.)

        Optional `strict' argument when False adds a bunch of commonly found,
        but non-standard types.
        """
        url = os.fspath(url)
        scheme, url = urllib.parse._splittype(url)
        if scheme == 'data':
            # syntax of data URLs:
            # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
            # mediatype := [ type "/" subtype ] *( ";" parameter )
            # data      := *urlchar
            # parameter := attribute "=" value
            # type/subtype defaults to "text/plain"
            comma = url.find(',')
            if comma < 0:
                # bad data URL
                return None, None
            semi = url.find(';', 0, comma)
            if semi >= 0:
                mime_type = url[:semi]
            else:
                mime_type = url[:comma]
            if '=' in mime_type or '/' not in mime_type:
                mime_type = 'text/plain'
            return mime_type, None      # never compressed, so encoding is None
        base, ext = posixpath.splitext(url)
        # Expand shorthand suffixes (e.g. '.tgz' -> '.tar.gz') until the
        # remaining suffix is no longer a key of suffix_map.
        while ext in self.suffix_map:
            base, ext = posixpath.splitext(base + self.suffix_map[ext])
        if ext in self.encodings_map:
            encoding = self.encodings_map[ext]
            # Strip the encoding suffix; the type comes from the one before.
            base, ext = posixpath.splitext(base)
        else:
            encoding = None
        ext_lower = ext.lower()
        # Standard types first: exact-case match, then lower-cased match.
        types_map = self.types_map[True]
        if ext in types_map:
            return types_map[ext], encoding
        if ext_lower in types_map:
            return types_map[ext_lower], encoding
        if strict:
            return None, encoding
        # Non-standard types are consulted only when strict is false.
        types_map = self.types_map[False]
        if ext in types_map:
            return types_map[ext], encoding
        if ext_lower in types_map:
            return types_map[ext_lower], encoding
        return None, encoding

    def guess_all_extensions(self, type, strict=True):
        """Guess the extensions for a file based on its MIME type.

        Return value is a list of strings giving the possible filename
        extensions, including the leading dot ('.').  The extension is not
        guaranteed to have been associated with any particular data stream,
        but would be mapped to the MIME type `type' by guess_type().
        The list is empty when no extension is known for `type'.

        Optional `strict' argument when false adds a bunch of commonly found,
        but non-standard types.
        """
        mime_type = type.lower()
        # Copy so that callers cannot mutate the stored list.
        extensions = list(self.types_map_inv[True].get(mime_type, []))
        if not strict:
            for ext in self.types_map_inv[False].get(mime_type, []):
                if ext not in extensions:
                    extensions.append(ext)
        return extensions

    def guess_extension(self, type, strict=True):
        """Guess the extension for a file based on its MIME type.

        Return value is a string giving a filename extension,
        including the leading dot ('.').  The extension is not
        guaranteed to have been associated with any particular data
        stream, but would be mapped to the MIME type `type' by
        guess_type().  If no extension can be guessed for `type', None
        is returned.

        Optional `strict' argument when false adds a bunch of commonly found,
        but non-standard types.
        """
        extensions = self.guess_all_extensions(type, strict)
        if not extensions:
            return None
        return extensions[0]

    def read(self, filename, strict=True):
        """
        Read a single mime.types-format file, specified by pathname.

        The file is opened as UTF-8 text and handed to readfp().

        If strict is true, information will be added to
        list of standard types, else to the list of non-standard
        types.
        """
        with open(filename, encoding='utf-8') as fp:
            self.readfp(fp, strict)

    def readfp(self, fp, strict=True):
        """
        Read a single mime.types-format file.

        `fp' must provide readline().  On each line, everything from the
        first word starting with '#' onwards is ignored; the first
        remaining word is the type and the following words are suffixes
        (written without the leading dot).

        If strict is true, information will be added to
        list of standard types, else to the list of non-standard
        types.
        """
        while True:
            line = fp.readline()
            if not line:
                break
            words = line.split()
            # Drop a trailing comment: the first '#'-word and all after it.
            for index, word in enumerate(words):
                if word[0] == '#':
                    del words[index:]
                    break
            if not words:
                continue
            mime_type, suffixes = words[0], words[1:]
            for suff in suffixes:
                self.add_type(mime_type, '.' + suff, strict)

    def read_windows_registry(self, strict=True):
        """
        Load the MIME types database from Windows registry.

        Does nothing when neither the accelerated reader nor the winreg
        module could be imported.

        If strict is true, information will be added to
        list of standard types, else to the list of non-standard
        types.
        """

        if not _mimetypes_read_windows_registry and not _winreg:
            return

        # The registry readers invoke the callback with (type, ext) only, so
        # `strict' has to be bound here.  Passing self.add_type directly would
        # fall back to its strict=True default and ignore strict=False.
        def add_type(type, ext):
            self.add_type(type, ext, strict)

        # Accelerated function if it is available
        if _mimetypes_read_windows_registry:
            _mimetypes_read_windows_registry(add_type)
        elif _winreg:
            self._read_windows_registry(add_type)

    @classmethod
    def _read_windows_registry(cls, add_type):
        """Call add_type(mimetype, extension) for registry file extensions.

        Walks the subkeys of HKEY_CLASSES_ROOT whose names start with '.'
        and reports those that carry a REG_SZ 'Content Type' value.
        """
        def enum_types(mimedb):
            # Yield subkey names until EnumKey signals the end with OSError,
            # skipping names that contain a NUL character.
            i = 0
            while True:
                try:
                    ctype = _winreg.EnumKey(mimedb, i)
                except OSError:
                    break
                else:
                    if '\0' not in ctype:
                        yield ctype
                i += 1

        with _winreg.OpenKey(_winreg.HKEY_CLASSES_ROOT, '') as hkcr:
            for subkeyname in enum_types(hkcr):
                # Only check file extensions; test the name before opening
                # the key so other subkeys are never opened at all.
                if not subkeyname.startswith("."):
                    continue
                try:
                    with _winreg.OpenKey(hkcr, subkeyname) as subkey:
                        # raises OSError if no 'Content Type' value
                        mimetype, datatype = _winreg.QueryValueEx(
                            subkey, 'Content Type')
                        if datatype != _winreg.REG_SZ:
                            continue
                        add_type(mimetype, subkeyname)
                except OSError:
                    continue
288
def guess_type(url, strict=True):
    """Guess the type of a file based on its URL.

    The result is a tuple (type, encoding).  `type' is a string of the
    form type/subtype, usable for a MIME Content-type header, or None
    when it can't be guessed (no or unknown suffix).  `encoding' is the
    name of the program used to encode (e.g. compress or gzip), or None
    for no encoding.  The mappings are table driven.  Encoding suffixes
    are case sensitive; type suffixes are first tried case sensitive,
    then case insensitive.

    The suffixes .tgz, .taz and .tz (case sensitive!) are all mapped
    to ".tar.gz".  (This is table-driven too, using the dictionary
    suffix_map).

    Optional `strict' argument when false adds a bunch of commonly found, but
    non-standard types.
    """
    db = _db
    if db is None:
        # First use: build the shared database, then pick it up.
        init()
        db = _db
    return db.guess_type(url, strict)
310
311
def guess_all_extensions(type, strict=True):
    """Guess the extensions for a file based on its MIME type.

    The result is a list of strings giving the possible filename
    extensions, each including the leading dot ('.').  An extension is
    not guaranteed to have been associated with any particular data
    stream, but would be mapped to the MIME type `type' by
    guess_type().  The list is empty when no extension is known for
    `type'.

    Optional `strict' argument when false adds a bunch of commonly found,
    but non-standard types.
    """
    db = _db
    if db is None:
        # First use: build the shared database, then pick it up.
        init()
        db = _db
    return db.guess_all_extensions(type, strict)
328
def guess_extension(type, strict=True):
    """Guess the extension for a file based on its MIME type.

    The result is a string giving a filename extension, including the
    leading dot ('.'), or None if no extension can be guessed for
    `type'.  The extension is not guaranteed to have been associated
    with any particular data stream, but would be mapped to the MIME
    type `type' by guess_type().

    Optional `strict' argument when false adds a bunch of commonly found,
    but non-standard types.
    """
    db = _db
    if db is None:
        # First use: build the shared database, then pick it up.
        init()
        db = _db
    return db.guess_extension(type, strict)
344
def add_type(type, ext, strict=True):
    """Add a mapping between a type and an extension.

    A type registered for an already known extension replaces the
    previous one.  An extension registered for an already known type is
    appended to that type's list of known extensions.

    If strict is true, information will be added to
    list of standard types, else to the list of non-standard
    types.
    """
    db = _db
    if db is None:
        # First use: build the shared database, then pick it up.
        init()
        db = _db
    return db.add_type(type, ext, strict)
360
361
def init(files=None):
    """(Re)build the module-level MIME database and publish its tables.

    With no shared database yet, or with `files' omitted, a fresh
    MimeTypes instance is created, the Windows registry is consulted,
    and the files in knownfiles (followed by `files', if given) are
    read.  Otherwise the existing shared database is reused and only
    `files' are read into it.  Entries that are not regular files are
    skipped.
    """
    global suffix_map, types_map, encodings_map, common_types
    global inited, _db
    inited = True    # so that MimeTypes.__init__() doesn't call us again

    if files is not None and _db is not None:
        # Extend the database that is already in place.
        db = _db
    else:
        db = MimeTypes()
        # Quick return if not supported
        db.read_windows_registry()
        files = knownfiles if files is None else knownfiles + list(files)

    for path in files:
        if os.path.isfile(path):
            db.read(path)

    # Re-export the instance's tables under the module-level names.
    suffix_map = db.suffix_map
    encodings_map = db.encodings_map
    common_types = db.types_map[False]
    types_map = db.types_map[True]
    # Make the DB a global variable now that it is fully initialized
    _db = db
388
389
def read_mime_types(file):
    """Parse one mime.types-format file and return its suffix-to-type map.

    `file' is opened as UTF-8 text.  Returns None if it cannot be
    opened (OSError); otherwise returns the strict types dictionary of
    a new MimeTypes instance after the file has been read into it.
    """
    try:
        fp = open(file, encoding='utf-8')
    except OSError:
        return None
    with fp:
        store = MimeTypes()
        store.readfp(fp, True)
        strict_types = store.types_map[True]
    return strict_types
399
400
def _default_mime_types():
    """(Re)create the built-in default tables.

    Binds fresh dictionaries to the public module globals suffix_map,
    encodings_map, types_map and common_types, and to the matching
    private ``_*_default`` names.  Each public name initially refers to
    the same dictionary object as its ``_default`` counterpart.
    """
    global suffix_map, _suffix_map_default
    global encodings_map, _encodings_map_default
    global types_map, _types_map_default
    global common_types, _common_types_default

    suffix_map = _suffix_map_default = {
        '.svgz': '.svg.gz',
        '.tgz': '.tar.gz',
        '.taz': '.tar.gz',
        '.tz': '.tar.gz',
        '.tbz2': '.tar.bz2',
        '.txz': '.tar.xz',
        }

    encodings_map = _encodings_map_default = {
        '.gz': 'gzip',
        '.Z': 'compress',
        '.bz2': 'bzip2',
        '.xz': 'xz',
        '.br': 'br',
        }

    # Before adding new types, make sure they are either registered with IANA,
    # at http://www.iana.org/assignments/media-types
    # or extensions, i.e. using the x- prefix

    # If you add to these, please keep them sorted by mime type.
    # Make sure the entry with the preferred file extension for a particular mime type
    # appears before any others of the same mimetype.
    # Every extension must appear at most once: a repeated key in a dict
    # literal silently discards the earlier value.
    types_map = _types_map_default = {
        '.js'     : 'application/javascript',
        '.mjs'    : 'application/javascript',
        '.json'   : 'application/json',
        '.webmanifest': 'application/manifest+json',
        '.doc'    : 'application/msword',
        '.dot'    : 'application/msword',
        '.wiz'    : 'application/msword',
        '.bin'    : 'application/octet-stream',
        '.a'      : 'application/octet-stream',
        '.dll'    : 'application/octet-stream',
        '.exe'    : 'application/octet-stream',
        '.o'      : 'application/octet-stream',
        '.obj'    : 'application/octet-stream',
        '.so'     : 'application/octet-stream',
        '.oda'    : 'application/oda',
        '.pdf'    : 'application/pdf',
        '.p7c'    : 'application/pkcs7-mime',
        '.ps'     : 'application/postscript',
        '.ai'     : 'application/postscript',
        '.eps'    : 'application/postscript',
        '.m3u'    : 'application/vnd.apple.mpegurl',
        '.m3u8'   : 'application/vnd.apple.mpegurl',
        '.xls'    : 'application/vnd.ms-excel',
        '.xlb'    : 'application/vnd.ms-excel',
        '.ppt'    : 'application/vnd.ms-powerpoint',
        '.pot'    : 'application/vnd.ms-powerpoint',
        '.ppa'    : 'application/vnd.ms-powerpoint',
        '.pps'    : 'application/vnd.ms-powerpoint',
        '.pwz'    : 'application/vnd.ms-powerpoint',
        '.wasm'   : 'application/wasm',
        '.bcpio'  : 'application/x-bcpio',
        '.cpio'   : 'application/x-cpio',
        '.csh'    : 'application/x-csh',
        '.dvi'    : 'application/x-dvi',
        '.gtar'   : 'application/x-gtar',
        '.hdf'    : 'application/x-hdf',
        '.h5'     : 'application/x-hdf5',
        '.latex'  : 'application/x-latex',
        '.mif'    : 'application/x-mif',
        '.cdf'    : 'application/x-netcdf',
        '.nc'     : 'application/x-netcdf',
        '.p12'    : 'application/x-pkcs12',
        '.pfx'    : 'application/x-pkcs12',
        '.ram'    : 'application/x-pn-realaudio',
        '.pyc'    : 'application/x-python-code',
        '.pyo'    : 'application/x-python-code',
        '.sh'     : 'application/x-sh',
        '.shar'   : 'application/x-shar',
        '.swf'    : 'application/x-shockwave-flash',
        '.sv4cpio': 'application/x-sv4cpio',
        '.sv4crc' : 'application/x-sv4crc',
        '.tar'    : 'application/x-tar',
        '.tcl'    : 'application/x-tcl',
        '.tex'    : 'application/x-tex',
        '.texi'   : 'application/x-texinfo',
        '.texinfo': 'application/x-texinfo',
        '.roff'   : 'application/x-troff',
        '.t'      : 'application/x-troff',
        '.tr'     : 'application/x-troff',
        '.man'    : 'application/x-troff-man',
        '.me'     : 'application/x-troff-me',
        '.ms'     : 'application/x-troff-ms',
        '.ustar'  : 'application/x-ustar',
        '.src'    : 'application/x-wais-source',
        '.xsl'    : 'application/xml',
        '.rdf'    : 'application/xml',
        '.wsdl'   : 'application/xml',
        '.xpdl'   : 'application/xml',
        '.zip'    : 'application/zip',
        '.3gp'    : 'audio/3gpp',
        '.3gpp'   : 'audio/3gpp',
        '.3g2'    : 'audio/3gpp2',
        '.3gpp2'  : 'audio/3gpp2',
        '.aac'    : 'audio/aac',
        '.adts'   : 'audio/aac',
        '.loas'   : 'audio/aac',
        '.ass'    : 'audio/aac',
        '.au'     : 'audio/basic',
        '.snd'    : 'audio/basic',
        '.mp3'    : 'audio/mpeg',
        '.mp2'    : 'audio/mpeg',
        '.opus'   : 'audio/opus',
        '.aif'    : 'audio/x-aiff',
        '.aifc'   : 'audio/x-aiff',
        '.aiff'   : 'audio/x-aiff',
        '.ra'     : 'audio/x-pn-realaudio',
        '.wav'    : 'audio/x-wav',
        '.bmp'    : 'image/bmp',
        '.gif'    : 'image/gif',
        '.ief'    : 'image/ief',
        '.jpg'    : 'image/jpeg',
        '.jpe'    : 'image/jpeg',
        '.jpeg'   : 'image/jpeg',
        '.heic'   : 'image/heic',
        '.heif'   : 'image/heif',
        '.png'    : 'image/png',
        '.svg'    : 'image/svg+xml',
        '.tiff'   : 'image/tiff',
        '.tif'    : 'image/tiff',
        '.ico'    : 'image/vnd.microsoft.icon',
        '.ras'    : 'image/x-cmu-raster',
        '.pnm'    : 'image/x-portable-anymap',
        '.pbm'    : 'image/x-portable-bitmap',
        '.pgm'    : 'image/x-portable-graymap',
        '.ppm'    : 'image/x-portable-pixmap',
        '.rgb'    : 'image/x-rgb',
        '.xbm'    : 'image/x-xbitmap',
        '.xpm'    : 'image/x-xpixmap',
        '.xwd'    : 'image/x-xwindowdump',
        '.eml'    : 'message/rfc822',
        '.mht'    : 'message/rfc822',
        '.mhtml'  : 'message/rfc822',
        '.nws'    : 'message/rfc822',
        '.css'    : 'text/css',
        '.csv'    : 'text/csv',
        '.html'   : 'text/html',
        '.htm'    : 'text/html',
        '.txt'    : 'text/plain',
        '.bat'    : 'text/plain',
        '.c'      : 'text/plain',
        '.h'      : 'text/plain',
        '.ksh'    : 'text/plain',
        '.pl'     : 'text/plain',
        '.rtx'    : 'text/richtext',
        '.tsv'    : 'text/tab-separated-values',
        '.py'     : 'text/x-python',
        '.etx'    : 'text/x-setext',
        '.sgm'    : 'text/x-sgml',
        '.sgml'   : 'text/x-sgml',
        '.vcf'    : 'text/x-vcard',
        '.xml'    : 'text/xml',
        '.mp4'    : 'video/mp4',
        '.mpeg'   : 'video/mpeg',
        '.m1v'    : 'video/mpeg',
        '.mpa'    : 'video/mpeg',
        '.mpe'    : 'video/mpeg',
        '.mpg'    : 'video/mpeg',
        '.mov'    : 'video/quicktime',
        '.qt'     : 'video/quicktime',
        '.webm'   : 'video/webm',
        '.avi'    : 'video/x-msvideo',
        '.movie'  : 'video/x-sgi-movie',
        }

    # These are non-standard types, commonly found in the wild.  They will
    # only match if strict=0 flag is given to the API methods.

    # Please sort these too
    common_types = _common_types_default = {
        '.rtf' : 'application/rtf',
        '.midi': 'audio/midi',
        '.mid' : 'audio/midi',
        '.jpg' : 'image/jpg',
        '.pict': 'image/pict',
        '.pct' : 'image/pict',
        '.pic' : 'image/pict',
        # Legacy name for BMP; kept here so that non-strict reverse lookups
        # of 'image/x-ms-bmp' still find '.bmp'.
        '.bmp' : 'image/x-ms-bmp',
        '.xul' : 'text/xul',
        }
591
592
# Populate the module-level default tables when the module is imported.
_default_mime_types()
594
595
def _main():
    """Command-line driver: guess a type or an extension per argument.

    Options are parsed from sys.argv with getopt.  For every positional
    argument the guess (or a "don't know" message) is printed.  The
    process exits through sys.exit() for --help and on option errors.
    """
    import getopt

    USAGE = """\
Usage: mimetypes.py [options] type

Options:
    --help / -h       -- print this message and exit
    --lenient / -l    -- additionally search of some common, but non-standard
                         types.
    --extension / -e  -- guess extension instead of type

More than one type argument may be given.
"""

    def usage(code, msg=''):
        # Print the help text, an optional message, then exit with `code'.
        print(USAGE)
        if msg:
            print(msg)
        sys.exit(code)

    short_opts = 'hle'
    long_opts = ['help', 'lenient', 'extension']
    try:
        opts, args = getopt.getopt(sys.argv[1:], short_opts, long_opts)
    except getopt.error as msg:
        usage(1, msg)

    strict = 1
    extension = 0
    for opt, _ in opts:
        if opt in ('-h', '--help'):
            usage(0)
        elif opt in ('-l', '--lenient'):
            strict = 0
        elif opt in ('-e', '--extension'):
            extension = 1

    for gtype in args:
        if extension:
            guess = guess_extension(gtype, strict)
            if guess:
                print(guess)
            else:
                print("I don't know anything about type", gtype)
        else:
            guess, encoding = guess_type(gtype, strict)
            if guess:
                print('type:', guess, 'encoding:', encoding)
            else:
                print("I don't know anything about type", gtype)
640
641
642if __name__ == '__main__':
643    _main()
644