1"""Guess the MIME type of a file.
2
3This module defines two useful functions:
4
5guess_type(url, strict=True) -- guess the MIME type and encoding of a URL.
6
7guess_extension(type, strict=True) -- guess the extension for a given MIME type.
8
9It also contains the following, for tuning the behavior:
10
11Data:
12
13knownfiles -- list of files to parse
14inited -- flag set when init() has been called
15suffix_map -- dictionary mapping suffixes to suffixes
16encodings_map -- dictionary mapping suffixes to encodings
17types_map -- dictionary mapping suffixes to types
18
19Functions:
20
21init([files]) -- parse a list of files, default knownfiles (on Windows, the
22  default values are taken from the registry)
23read_mime_types(file) -- parse one file, return a dictionary or None
24"""
25
26import os
27import sys
28import posixpath
29import urllib.parse
30try:
31    import winreg as _winreg
32except ImportError:
33    _winreg = None
34
# Public API of this module: the names bound by ``from mimetypes import *``.
# It covers the module-level data tables, the MimeTypes class, and the
# convenience functions that operate on the shared database.
__all__ = [
    "knownfiles", "inited", "MimeTypes",
    "guess_type", "guess_all_extensions", "guess_extension",
    "add_type", "init", "read_mime_types",
    "suffix_map", "encodings_map", "types_map", "common_types"
]
41
# Candidate locations of mime.types files.  init() reads each entry that
# exists on disk, in the order given here.
knownfiles = [
    "/etc/mime.types",
    # Mac OS X
    "/etc/httpd/mime.types",
    # Apache
    "/etc/httpd/conf/mime.types",
    # Apache 1
    "/etc/apache/mime.types",
    # Apache 2
    "/etc/apache2/mime.types",
    "/usr/local/etc/httpd/conf/mime.types",
    "/usr/local/lib/netscape/mime.types",
    # Apache 1.2 (same path as the entry two positions above)
    "/usr/local/etc/httpd/conf/mime.types",
    # Apache 1.3
    "/usr/local/etc/mime.types",
]

# Becomes True once init() has been entered; MimeTypes.__init__() checks it
# so that constructing an instance does not call init() a second time.
inited = False

# The shared MimeTypes instance behind the module-level functions.  It stays
# None until init() has finished building it.
_db = None
56
57
class MimeTypes:
    """MIME-types datastore.

    This datastore can handle information from mime.types-style files
    and supports basic determination of MIME type from a filename or
    URL, and can guess a reasonable extension given a MIME type.

    Instance attributes:

    encodings_map -- dictionary mapping suffixes to encodings
    suffix_map -- dictionary mapping suffixes to suffixes
    types_map -- pair of dictionaries (non-strict, strict) mapping
                 suffixes to types
    types_map_inv -- pair of dictionaries (non-strict, strict) mapping
                     types to lists of suffixes
    """

    def __init__(self, filenames=(), strict=True):
        """Create a datastore seeded with the module's default tables.

        Each name in `filenames' is then loaded with read(); `strict'
        is passed on to read() and selects which table those files
        extend.
        """
        # The default tables must exist before they can be copied.
        if not inited:
            init()
        self.encodings_map = _encodings_map_default.copy()
        self.suffix_map = _suffix_map_default.copy()
        self.types_map = ({}, {}) # dict for (non-strict, strict)
        self.types_map_inv = ({}, {})
        for (ext, type) in _types_map_default.items():
            self.add_type(type, ext, True)
        for (ext, type) in _common_types_default.items():
            self.add_type(type, ext, False)
        for name in filenames:
            self.read(name, strict)

    def add_type(self, type, ext, strict=True):
        """Add a mapping between a type and an extension.

        When the extension is already known, the new
        type will replace the old one. When the type
        is already known the extension will be added
        to the list of known extensions.

        If strict is true, information will be added to
        list of standard types, else to the list of non-standard
        types.
        """
        # `strict' doubles as the index into the (non-strict, strict) pairs.
        self.types_map[strict][ext] = type
        exts = self.types_map_inv[strict].setdefault(type, [])
        if ext not in exts:
            exts.append(ext)

    def guess_type(self, url, strict=True):
        """Guess the type of a file which is either a URL or a path-like object.

        Return value is a tuple (type, encoding) where type is None if
        the type can't be guessed (no or unknown suffix) or a string
        of the form type/subtype, usable for a MIME Content-type
        header; and encoding is None for no encoding or the name of
        the program used to encode (e.g. compress or gzip).  The
        mappings are table driven.  Encoding suffixes are case
        sensitive; type suffixes are first tried case sensitive, then
        case insensitive.

        The suffixes .tgz, .taz and .tz (case sensitive!) are all
        mapped to '.tar.gz'.  (This is table-driven too, using the
        dictionary suffix_map.)

        Optional `strict' argument when False adds a bunch of commonly found,
        but non-standard types.
        """
        url = os.fspath(url)
        scheme, url = urllib.parse._splittype(url)
        if scheme == 'data':
            # syntax of data URLs:
            # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
            # mediatype := [ type "/" subtype ] *( ";" parameter )
            # data      := *urlchar
            # parameter := attribute "=" value
            # type/subtype defaults to "text/plain"
            comma = url.find(',')
            if comma < 0:
                # bad data URL
                return None, None
            semi = url.find(';', 0, comma)
            if semi >= 0:
                type = url[:semi]
            else:
                type = url[:comma]
            if '=' in type or '/' not in type:
                type = 'text/plain'
            return type, None           # never compressed, so encoding is None
        base, ext = posixpath.splitext(url)
        # Expand shorthand suffixes (e.g. '.tgz' -> '.tar.gz') until the
        # trailing suffix is no longer an alias.
        while ext in self.suffix_map:
            base, ext = posixpath.splitext(base + self.suffix_map[ext])
        # Peel off an encoding suffix so the type suffix underneath is used.
        if ext in self.encodings_map:
            encoding = self.encodings_map[ext]
            base, ext = posixpath.splitext(base)
        else:
            encoding = None
        # The standard table is always consulted first; the non-standard
        # table only when the caller asked for a lenient guess.  Within a
        # table the exact-case suffix wins over the lower-cased one.
        ext_lower = ext.lower()
        tables = [self.types_map[True]]
        if not strict:
            tables.append(self.types_map[False])
        for types_map in tables:
            if ext in types_map:
                return types_map[ext], encoding
            if ext_lower in types_map:
                return types_map[ext_lower], encoding
        return None, encoding

    def guess_all_extensions(self, type, strict=True):
        """Guess the extensions for a file based on its MIME type.

        Return value is a list of strings giving the possible filename
        extensions, including the leading dot ('.').  The extension is not
        guaranteed to have been associated with any particular data stream,
        but would be mapped to the MIME type `type' by guess_type().
        The list is empty when no extension is known for `type'.

        Optional `strict' argument when false adds a bunch of commonly found,
        but non-standard types.
        """
        type = type.lower()
        # Copy, so the caller cannot mutate the inverse table through the
        # returned list.
        extensions = list(self.types_map_inv[True].get(type, []))
        if not strict:
            for ext in self.types_map_inv[False].get(type, []):
                if ext not in extensions:
                    extensions.append(ext)
        return extensions

    def guess_extension(self, type, strict=True):
        """Guess the extension for a file based on its MIME type.

        Return value is a string giving a filename extension,
        including the leading dot ('.').  The extension is not
        guaranteed to have been associated with any particular data
        stream, but would be mapped to the MIME type `type' by
        guess_type().  If no extension can be guessed for `type', None
        is returned.

        Optional `strict' argument when false adds a bunch of commonly found,
        but non-standard types.
        """
        extensions = self.guess_all_extensions(type, strict)
        if not extensions:
            return None
        return extensions[0]

    def read(self, filename, strict=True):
        """
        Read a single mime.types-format file, specified by pathname.

        The file is opened as UTF-8 text and handed to readfp().

        If strict is true, information will be added to
        list of standard types, else to the list of non-standard
        types.
        """
        with open(filename, encoding='utf-8') as fp:
            self.readfp(fp, strict)

    def readfp(self, fp, strict=True):
        """
        Read a single mime.types-format file.

        `fp' only needs a readline() method.  Each line holds a type
        followed by zero or more suffixes (written without the leading
        dot); a word starting with '#' begins a comment that runs to
        the end of the line.

        If strict is true, information will be added to
        list of standard types, else to the list of non-standard
        types.
        """
        while True:
            line = fp.readline()
            if not line:
                break
            # Keep the words that precede the first comment marker.
            words = []
            for word in line.split():
                if word[0] == '#':
                    break
                words.append(word)
            if not words:
                continue
            type, suffixes = words[0], words[1:]
            for suff in suffixes:
                self.add_type(type, '.' + suff, strict)

    def read_windows_registry(self, strict=True):
        """
        Load the MIME types database from Windows registry.

        Does nothing when the winreg module is not available.

        If strict is true, information will be added to
        list of standard types, else to the list of non-standard
        types.
        """

        # Windows only
        if not _winreg:
            return

        def enum_types(mimedb):
            # Yield subkey names until EnumKey reports the end with OSError;
            # names containing a NUL character are skipped.
            i = 0
            while True:
                try:
                    ctype = _winreg.EnumKey(mimedb, i)
                except OSError:
                    break
                else:
                    if '\0' not in ctype:
                        yield ctype
                i += 1

        with _winreg.OpenKey(_winreg.HKEY_CLASSES_ROOT, '') as hkcr:
            for subkeyname in enum_types(hkcr):
                # Only check file extensions.  Testing the name first avoids
                # opening every other key under HKEY_CLASSES_ROOT.
                if not subkeyname.startswith("."):
                    continue
                try:
                    with _winreg.OpenKey(hkcr, subkeyname) as subkey:
                        # raises OSError if no 'Content Type' value
                        mimetype, datatype = _winreg.QueryValueEx(
                            subkey, 'Content Type')
                        if datatype != _winreg.REG_SZ:
                            continue
                        self.add_type(mimetype, subkeyname, strict)
                except OSError:
                    continue
271
def guess_type(url, strict=True):
    """Guess the type of a file based on its URL.

    Return value is a tuple (type, encoding) where type is None if the
    type can't be guessed (no or unknown suffix) or a string of the
    form type/subtype, usable for a MIME Content-type header; and
    encoding is None for no encoding or the name of the program used
    to encode (e.g. compress or gzip).  The mappings are table
    driven.  Encoding suffixes are case sensitive; type suffixes are
    first tried case sensitive, then case insensitive.

    The suffixes .tgz, .taz and .tz (case sensitive!) are all mapped
    to ".tar.gz".  (This is table-driven too, using the dictionary
    suffix_map).

    Optional `strict' argument when false adds a bunch of commonly found, but
    non-standard types.
    """
    # Build the shared database on first use, then delegate to it.
    if _db is None:
        init()
    return _db.guess_type(url, strict)
293
294
def guess_all_extensions(type, strict=True):
    """Guess the extensions for a file based on its MIME type.

    Return value is a list of strings giving the possible filename
    extensions, including the leading dot ('.').  The extension is not
    guaranteed to have been associated with any particular data
    stream, but would be mapped to the MIME type `type' by
    guess_type().  If no extension can be guessed for `type', the
    list is empty.

    Optional `strict' argument when false adds a bunch of commonly found,
    but non-standard types.
    """
    # Build the shared database on first use, then delegate to it.
    if _db is None:
        init()
    return _db.guess_all_extensions(type, strict)
311
def guess_extension(type, strict=True):
    """Guess the extension for a file based on its MIME type.

    Return value is a string giving a filename extension, including the
    leading dot ('.').  The extension is not guaranteed to have been
    associated with any particular data stream, but would be mapped to the
    MIME type `type' by guess_type().  If no extension can be guessed for
    `type', None is returned.

    Optional `strict' argument when false adds a bunch of commonly found,
    but non-standard types.
    """
    # Build the shared database on first use, then delegate to it.
    if _db is None:
        init()
    return _db.guess_extension(type, strict)
327
def add_type(type, ext, strict=True):
    """Add a mapping between a type and an extension.

    When the extension is already known, the new
    type will replace the old one. When the type
    is already known the extension will be added
    to the list of known extensions.

    If strict is true, information will be added to
    list of standard types, else to the list of non-standard
    types.

    The mapping is stored in the shared database used by the other
    module-level functions.
    """
    # Build the shared database on first use, then delegate to it.
    if _db is None:
        init()
    return _db.add_type(type, ext, strict)
343
344
def init(files=None):
    """Initialize (or extend) the shared database and the module tables.

    With `files' omitted, or when no shared database exists yet, a fresh
    MimeTypes instance is built, the Windows registry is read into it
    when winreg is available, and the files in knownfiles are loaded,
    followed by any entries of `files'.  When `files' is given and a
    shared database already exists, that database is kept and only
    `files' is loaded into it.

    Entries that are not existing regular files are skipped.  Afterwards
    the module-level names encodings_map, suffix_map, types_map and
    common_types refer to the database's own dictionaries.
    """
    global suffix_map, types_map, encodings_map, common_types
    global inited, _db
    inited = True    # so that MimeTypes.__init__() doesn't call us again

    if files is None or _db is None:
        # Start over from the built-in defaults.
        db = MimeTypes()
        if _winreg:
            db.read_windows_registry()

        if files is None:
            files = knownfiles
        else:
            # Caller-supplied files are read after the well-known ones.
            files = knownfiles + list(files)
    else:
        # Extend the database that is already in place.
        db = _db

    for file in files:
        if os.path.isfile(file):
            db.read(file)
    # Rebind the public tables to the database's dictionaries (not copies).
    encodings_map = db.encodings_map
    suffix_map = db.suffix_map
    types_map = db.types_map[True]
    common_types = db.types_map[False]
    # Make the DB a global variable now that it is fully initialized
    _db = db
371
372
def read_mime_types(file):
    """Parse one mime.types-format file, return a dictionary or None.

    `file' is a pathname, opened as UTF-8 text.  If it cannot be opened,
    None is returned.  Otherwise the result is the standard
    suffix-to-type dictionary of a new MimeTypes instance after the
    file's entries have been loaded into it.
    """
    try:
        stream = open(file, encoding='utf-8')
    except OSError:
        return None
    with stream:
        store = MimeTypes()
        store.readfp(stream, True)
    return store.types_map[True]
382
383
def _default_mime_types():
    """Install the built-in default tables.

    Binds the public names suffix_map, encodings_map, types_map and
    common_types, and their private `_..._default' counterparts, to
    freshly built dictionaries.  Each public name and its private
    counterpart initially refer to the same dictionary object.
    """
    global suffix_map, _suffix_map_default
    global encodings_map, _encodings_map_default
    global types_map, _types_map_default
    global common_types, _common_types_default

    suffix_map = _suffix_map_default = {
        '.svgz': '.svg.gz',
        '.tgz': '.tar.gz',
        '.taz': '.tar.gz',
        '.tz': '.tar.gz',
        '.tbz2': '.tar.bz2',
        '.txz': '.tar.xz',
        }

    encodings_map = _encodings_map_default = {
        '.gz': 'gzip',
        '.Z': 'compress',
        '.bz2': 'bzip2',
        '.xz': 'xz',
        '.br': 'br',
        }

    # Before adding new types, make sure they are either registered with IANA,
    # at http://www.iana.org/assignments/media-types
    # or extensions, i.e. using the x- prefix

    # If you add to these, please keep them sorted by mime type.
    # Make sure the entry with the preferred file extension for a particular mime type
    # appears before any others of the same mimetype.
    # Each suffix may appear only once: in a dict literal a repeated key
    # silently keeps the last value.
    types_map = _types_map_default = {
        '.js'     : 'application/javascript',
        '.mjs'    : 'application/javascript',
        '.json'   : 'application/json',
        '.webmanifest': 'application/manifest+json',
        '.doc'    : 'application/msword',
        '.dot'    : 'application/msword',
        '.wiz'    : 'application/msword',
        '.bin'    : 'application/octet-stream',
        '.a'      : 'application/octet-stream',
        '.dll'    : 'application/octet-stream',
        '.exe'    : 'application/octet-stream',
        '.o'      : 'application/octet-stream',
        '.obj'    : 'application/octet-stream',
        '.so'     : 'application/octet-stream',
        '.oda'    : 'application/oda',
        '.pdf'    : 'application/pdf',
        '.p7c'    : 'application/pkcs7-mime',
        '.ps'     : 'application/postscript',
        '.ai'     : 'application/postscript',
        '.eps'    : 'application/postscript',
        '.m3u'    : 'application/vnd.apple.mpegurl',
        '.m3u8'   : 'application/vnd.apple.mpegurl',
        '.xls'    : 'application/vnd.ms-excel',
        '.xlb'    : 'application/vnd.ms-excel',
        '.ppt'    : 'application/vnd.ms-powerpoint',
        '.pot'    : 'application/vnd.ms-powerpoint',
        '.ppa'    : 'application/vnd.ms-powerpoint',
        '.pps'    : 'application/vnd.ms-powerpoint',
        '.pwz'    : 'application/vnd.ms-powerpoint',
        '.wasm'   : 'application/wasm',
        '.bcpio'  : 'application/x-bcpio',
        '.cpio'   : 'application/x-cpio',
        '.csh'    : 'application/x-csh',
        '.dvi'    : 'application/x-dvi',
        '.gtar'   : 'application/x-gtar',
        '.hdf'    : 'application/x-hdf',
        '.latex'  : 'application/x-latex',
        '.mif'    : 'application/x-mif',
        '.cdf'    : 'application/x-netcdf',
        '.nc'     : 'application/x-netcdf',
        '.p12'    : 'application/x-pkcs12',
        '.pfx'    : 'application/x-pkcs12',
        '.ram'    : 'application/x-pn-realaudio',
        '.pyc'    : 'application/x-python-code',
        '.pyo'    : 'application/x-python-code',
        '.sh'     : 'application/x-sh',
        '.shar'   : 'application/x-shar',
        '.swf'    : 'application/x-shockwave-flash',
        '.sv4cpio': 'application/x-sv4cpio',
        '.sv4crc' : 'application/x-sv4crc',
        '.tar'    : 'application/x-tar',
        '.tcl'    : 'application/x-tcl',
        '.tex'    : 'application/x-tex',
        '.texi'   : 'application/x-texinfo',
        '.texinfo': 'application/x-texinfo',
        '.roff'   : 'application/x-troff',
        '.t'      : 'application/x-troff',
        '.tr'     : 'application/x-troff',
        '.man'    : 'application/x-troff-man',
        '.me'     : 'application/x-troff-me',
        '.ms'     : 'application/x-troff-ms',
        '.ustar'  : 'application/x-ustar',
        '.src'    : 'application/x-wais-source',
        '.xsl'    : 'application/xml',
        '.rdf'    : 'application/xml',
        '.wsdl'   : 'application/xml',
        '.xpdl'   : 'application/xml',
        '.zip'    : 'application/zip',
        '.au'     : 'audio/basic',
        '.snd'    : 'audio/basic',
        '.mp3'    : 'audio/mpeg',
        '.mp2'    : 'audio/mpeg',
        '.aif'    : 'audio/x-aiff',
        '.aifc'   : 'audio/x-aiff',
        '.aiff'   : 'audio/x-aiff',
        '.ra'     : 'audio/x-pn-realaudio',
        '.wav'    : 'audio/x-wav',
        '.bmp'    : 'image/bmp',
        '.gif'    : 'image/gif',
        '.ief'    : 'image/ief',
        '.jpg'    : 'image/jpeg',
        '.jpe'    : 'image/jpeg',
        '.jpeg'   : 'image/jpeg',
        '.png'    : 'image/png',
        '.svg'    : 'image/svg+xml',
        '.tiff'   : 'image/tiff',
        '.tif'    : 'image/tiff',
        '.ico'    : 'image/vnd.microsoft.icon',
        '.ras'    : 'image/x-cmu-raster',
        '.pnm'    : 'image/x-portable-anymap',
        '.pbm'    : 'image/x-portable-bitmap',
        '.pgm'    : 'image/x-portable-graymap',
        '.ppm'    : 'image/x-portable-pixmap',
        '.rgb'    : 'image/x-rgb',
        '.xbm'    : 'image/x-xbitmap',
        '.xpm'    : 'image/x-xpixmap',
        '.xwd'    : 'image/x-xwindowdump',
        '.eml'    : 'message/rfc822',
        '.mht'    : 'message/rfc822',
        '.mhtml'  : 'message/rfc822',
        '.nws'    : 'message/rfc822',
        '.css'    : 'text/css',
        '.csv'    : 'text/csv',
        '.html'   : 'text/html',
        '.htm'    : 'text/html',
        '.txt'    : 'text/plain',
        '.bat'    : 'text/plain',
        '.c'      : 'text/plain',
        '.h'      : 'text/plain',
        '.ksh'    : 'text/plain',
        '.pl'     : 'text/plain',
        '.rtx'    : 'text/richtext',
        '.tsv'    : 'text/tab-separated-values',
        '.py'     : 'text/x-python',
        '.etx'    : 'text/x-setext',
        '.sgm'    : 'text/x-sgml',
        '.sgml'   : 'text/x-sgml',
        '.vcf'    : 'text/x-vcard',
        '.xml'    : 'text/xml',
        '.mp4'    : 'video/mp4',
        '.mpeg'   : 'video/mpeg',
        '.m1v'    : 'video/mpeg',
        '.mpa'    : 'video/mpeg',
        '.mpe'    : 'video/mpeg',
        '.mpg'    : 'video/mpeg',
        '.mov'    : 'video/quicktime',
        '.qt'     : 'video/quicktime',
        '.webm'   : 'video/webm',
        '.avi'    : 'video/x-msvideo',
        '.movie'  : 'video/x-sgi-movie',
        }

    # These are non-standard types, commonly found in the wild.  They will
    # only match if strict=0 flag is given to the API methods.

    # Please sort these too
    common_types = _common_types_default = {
        '.rtf' : 'application/rtf',
        '.midi': 'audio/midi',
        '.mid' : 'audio/midi',
        '.jpg' : 'image/jpg',
        '.pict': 'image/pict',
        '.pct' : 'image/pict',
        '.pic' : 'image/pict',
        # Legacy name for BMP.  It is kept here, not in the standard table,
        # so that '.bmp' resolves to 'image/bmp' while a lenient reverse
        # lookup of the old name still finds the extension.
        '.bmp' : 'image/x-ms-bmp',
        '.xul' : 'text/xul',
        }


_default_mime_types()
565
566
def _main():
    """Command-line driver: print a guess for every positional argument.

    By default each argument is passed to guess_type(); with -e /
    --extension it is passed to guess_extension() instead.  -l /
    --lenient turns off strict matching.  -h / --help prints the usage
    text and exits with status 0; an option parsing error prints the
    usage text plus the error and exits with status 1.
    """
    import getopt

    USAGE = """\
Usage: mimetypes.py [options] type

Options:
    --help / -h       -- print this message and exit
    --lenient / -l    -- additionally search of some common, but non-standard
                         types.
    --extension / -e  -- guess extension instead of type

More than one type argument may be given.
"""

    def usage(code, msg=''):
        # Show the help text, then an optional message, and terminate.
        print(USAGE)
        if msg:
            print(msg)
        sys.exit(code)

    try:
        options, targets = getopt.getopt(
            sys.argv[1:], 'hle', ['help', 'lenient', 'extension'])
    except getopt.error as err:
        usage(1, err)

    strict = 1
    want_extension = 0
    for flag, _value in options:
        if flag in ('-h', '--help'):
            usage(0)
        elif flag in ('-l', '--lenient'):
            strict = 0
        elif flag in ('-e', '--extension'):
            want_extension = 1

    for target in targets:
        if want_extension:
            result = guess_extension(target, strict)
            if result:
                print(result)
            else:
                print("I don't know anything about type", target)
        else:
            result, encoding = guess_type(target, strict)
            if result:
                print('type:', result, 'encoding:', encoding)
            else:
                print("I don't know anything about type", target)


if __name__ == '__main__':
    _main()
615