1"""Guess the MIME type of a file.
2
3This module defines two useful functions:
4
5guess_type(url, strict=1) -- guess the MIME type and encoding of a URL.
6
7guess_extension(type, strict=1) -- guess the extension for a given MIME type.
8
9It also contains the following, for tuning the behavior:
10
11Data:
12
13knownfiles -- list of files to parse
14inited -- flag set when init() has been called
15suffix_map -- dictionary mapping suffixes to suffixes
16encodings_map -- dictionary mapping suffixes to encodings
17types_map -- dictionary mapping suffixes to types
18
19Functions:
20
21init([files]) -- parse a list of files, default knownfiles (on Windows, the
22  default values are taken from the registry)
23read_mime_types(file) -- parse one file, return a dictionary or None
24"""
25
26import os
27import sys
28import posixpath
29import urllib
30try:
31    import _winreg
32except ImportError:
33    _winreg = None
34
# Public API exported by a star-import of this module.
__all__ = [
    "guess_type",
    "guess_extension",
    "guess_all_extensions",
    "add_type",
    "read_mime_types",
    "init",
]

# Candidate mime.types files.  init() walks this list in order and reads
# every entry that exists as a regular file.
knownfiles = [
    "/etc/mime.types",
    "/etc/httpd/mime.types",                    # Mac OS X
    "/etc/httpd/conf/mime.types",               # Apache
    "/etc/apache/mime.types",                   # Apache 1
    "/etc/apache2/mime.types",                  # Apache 2
    "/usr/local/etc/httpd/conf/mime.types",
    "/usr/local/lib/netscape/mime.types",
    "/usr/local/etc/httpd/conf/mime.types",     # Apache 1.2
    "/usr/local/etc/mime.types",                # Apache 1.3
]

# Becomes True once init() has run; MimeTypes.__init__() calls init()
# itself while this is still False.
inited = False

# Shared MimeTypes instance behind the module-level helper functions.
# It stays None until init() has finished building it.
_db = None
54
55
class MimeTypes:
    """MIME-types datastore.

    This datastore can handle information from mime.types-style files
    and supports basic determination of MIME type from a filename or
    URL, and can guess a reasonable extension given a MIME type.

    Instance attributes:

    encodings_map -- dictionary mapping suffixes to encodings
    suffix_map -- dictionary mapping suffixes to suffixes
    types_map -- pair of dictionaries (non-strict, strict) mapping
      suffixes to types
    types_map_inv -- pair of dictionaries (non-strict, strict) mapping
      types to lists of suffixes
    """

    def __init__(self, filenames=(), strict=True):
        """Create a datastore seeded from the module-level default maps.

        Every file named in `filenames' is then read with read(); `strict'
        selects whether their entries go to the standard or the
        non-standard table.
        """
        if not inited:
            init()
        # Copy the module-level maps so later changes to this instance do
        # not leak back into the module defaults.
        self.encodings_map = encodings_map.copy()
        self.suffix_map = suffix_map.copy()
        self.types_map = ({}, {}) # dict for (non-strict, strict)
        self.types_map_inv = ({}, {})
        for (ext, type) in types_map.items():
            self.add_type(type, ext, True)
        for (ext, type) in common_types.items():
            self.add_type(type, ext, False)
        for name in filenames:
            self.read(name, strict)

    def add_type(self, type, ext, strict=True):
        """Add a mapping between a type and an extension.

        When the extension is already known, the new
        type will replace the old one. When the type
        is already known the extension will be added
        to the list of known extensions.

        If strict is true, information will be added to
        list of standard types, else to the list of non-standard
        types.
        """
        # `strict' doubles as the tuple index: False -> 0, True -> 1.
        self.types_map[strict][ext] = type
        exts = self.types_map_inv[strict].setdefault(type, [])
        if ext not in exts:
            exts.append(ext)

    def guess_type(self, url, strict=True):
        """Guess the type of a file based on its URL.

        Return value is a tuple (type, encoding) where type is None if
        the type can't be guessed (no or unknown suffix) or a string
        of the form type/subtype, usable for a MIME Content-type
        header; and encoding is None for no encoding or the name of
        the program used to encode (e.g. compress or gzip).  The
        mappings are table driven.  Encoding suffixes are case
        sensitive; type suffixes are first tried case sensitive, then
        case insensitive.

        The suffixes .tgz, .taz and .tz (case sensitive!) are all
        mapped to '.tar.gz'.  (This is table-driven too, using the
        dictionary suffix_map.)

        Optional `strict' argument when False adds a bunch of commonly found,
        but non-standard types.
        """
        scheme, url = urllib.splittype(url)
        if scheme == 'data':
            # syntax of data URLs:
            # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
            # mediatype := [ type "/" subtype ] *( ";" parameter )
            # data      := *urlchar
            # parameter := attribute "=" value
            # type/subtype defaults to "text/plain"
            comma = url.find(',')
            if comma < 0:
                # bad data URL
                return None, None
            semi = url.find(';', 0, comma)
            if semi >= 0:
                type = url[:semi]
            else:
                type = url[:comma]
            if '=' in type or '/' not in type:
                type = 'text/plain'
            return type, None           # never compressed, so encoding is None
        base, ext = posixpath.splitext(url)
        # Expand shorthand suffixes (e.g. '.tgz' -> '.tar.gz') until the
        # trailing suffix is no longer a key of suffix_map.
        while ext in self.suffix_map:
            base, ext = posixpath.splitext(base + self.suffix_map[ext])
        if ext in self.encodings_map:
            # Peel off the encoding suffix; the type comes from what is left.
            encoding = self.encodings_map[ext]
            base, ext = posixpath.splitext(base)
        else:
            encoding = None
        # Standard types first: exact case, then lower-cased.
        types_map = self.types_map[True]
        if ext in types_map:
            return types_map[ext], encoding
        elif ext.lower() in types_map:
            return types_map[ext.lower()], encoding
        elif strict:
            return None, encoding
        # Non-strict lookups fall back to the non-standard table.
        types_map = self.types_map[False]
        if ext in types_map:
            return types_map[ext], encoding
        elif ext.lower() in types_map:
            return types_map[ext.lower()], encoding
        else:
            return None, encoding

    def guess_all_extensions(self, type, strict=True):
        """Guess the extensions for a file based on its MIME type.

        Return value is a list of strings giving the possible filename
        extensions, including the leading dot ('.').  The extension is not
        guaranteed to have been associated with any particular data stream,
        but would be mapped to the MIME type `type' by guess_type().  The
        list is empty when no extension is known for `type'.

        The returned list is a fresh copy; modifying it does not affect
        the datastore.

        Optional `strict' argument when false adds a bunch of commonly found,
        but non-standard types.
        """
        type = type.lower()
        # Copy the stored list: appending the non-strict extensions to the
        # original would permanently add them to the strict table.
        extensions = list(self.types_map_inv[True].get(type, []))
        if not strict:
            for ext in self.types_map_inv[False].get(type, []):
                if ext not in extensions:
                    extensions.append(ext)
        return extensions

    def guess_extension(self, type, strict=True):
        """Guess the extension for a file based on its MIME type.

        Return value is a string giving a filename extension,
        including the leading dot ('.').  The extension is not
        guaranteed to have been associated with any particular data
        stream, but would be mapped to the MIME type `type' by
        guess_type().  If no extension can be guessed for `type', None
        is returned.

        Optional `strict' argument when false adds a bunch of commonly found,
        but non-standard types.
        """
        extensions = self.guess_all_extensions(type, strict)
        if not extensions:
            return None
        return extensions[0]

    def read(self, filename, strict=True):
        """
        Read a single mime.types-format file, specified by pathname.

        If strict is true, information will be added to
        list of standard types, else to the list of non-standard
        types.
        """
        with open(filename) as fp:
            self.readfp(fp, strict)

    def readfp(self, fp, strict=True):
        """
        Read a single mime.types-format file.

        `fp' is read line by line with readline() until it returns an
        empty string.  On each line the first word is the type and the
        remaining words are suffixes (without the leading dot); a word
        starting with '#' begins a comment that runs to the end of the line.

        If strict is true, information will be added to
        list of standard types, else to the list of non-standard
        types.
        """
        while True:
            line = fp.readline()
            if not line:
                break
            words = line.split()
            # Drop the first word starting with '#' and everything after it.
            for i, word in enumerate(words):
                if word[0] == '#':
                    del words[i:]
                    break
            if not words:
                continue
            type, suffixes = words[0], words[1:]
            for suff in suffixes:
                self.add_type(type, '.' + suff, strict)

    def read_windows_registry(self, strict=True):
        """
        Load the MIME types database from Windows registry.

        Does nothing when the _winreg module is not available.

        If strict is true, information will be added to
        list of standard types, else to the list of non-standard
        types.
        """

        # Windows only
        if not _winreg:
            return

        def enum_types(mimedb):
            # Yield the name of each subkey of `mimedb' until EnumKey
            # raises EnvironmentError; names containing a NUL character
            # are skipped.
            i = 0
            while True:
                try:
                    ctype = _winreg.EnumKey(mimedb, i)
                except EnvironmentError:
                    break
                else:
                    if '\0' not in ctype:
                        yield ctype
                i += 1

        default_encoding = sys.getdefaultencoding()
        with _winreg.OpenKey(_winreg.HKEY_CLASSES_ROOT, '') as hkcr:
            for subkeyname in enum_types(hkcr):
                try:
                    with _winreg.OpenKey(hkcr, subkeyname) as subkey:
                        # Only check file extensions
                        if not subkeyname.startswith("."):
                            continue
                        # raises EnvironmentError if no 'Content Type' value
                        mimetype, datatype = _winreg.QueryValueEx(
                            subkey, 'Content Type')
                        if datatype != _winreg.REG_SZ:
                            continue
                        # Skip types that cannot be encoded with the
                        # default encoding.
                        try:
                            mimetype = mimetype.encode(default_encoding)
                        except UnicodeEncodeError:
                            continue
                        self.add_type(mimetype, subkeyname, strict)
                except EnvironmentError:
                    continue
273
def guess_type(url, strict=True):
    """Guess the MIME type and encoding of a file from its URL.

    The lookup is delegated to the shared module-level database, which is
    created by init() on first use.

    The result is a (type, encoding) tuple.  `type' is a string of the
    form type/subtype, usable for a MIME Content-type header, or None
    when the suffix is missing or unknown.  `encoding' is the name of the
    program used to encode the file (e.g. compress or gzip), or None for
    no encoding.  All mappings are table driven.  Encoding suffixes are
    matched case sensitively; type suffixes are tried case sensitively
    first and then case insensitively.

    The suffixes .tgz, .taz and .tz (case sensitive!) are all treated as
    ".tar.gz"; this too is table driven, through the dictionary
    suffix_map.

    When the optional `strict' argument is false, a number of commonly
    found but non-standard types are considered as well.
    """
    if _db is None:
        init()
    return _db.guess_type(url, strict)
295
296
def guess_all_extensions(type, strict=True):
    """Guess every extension for a file based on its MIME type.

    The lookup is delegated to the shared module-level database, which is
    created by init() on first use.

    The result is a list of strings, each a possible filename extension
    including the leading dot ('.').  An extension is not guaranteed to
    have been associated with any particular data stream, but it would be
    mapped to the MIME type `type' by guess_type().  The list is empty
    when no extension is known for `type'.

    When the optional `strict' argument is false, a number of commonly
    found but non-standard types are considered as well.
    """
    if _db is None:
        init()
    return _db.guess_all_extensions(type, strict)
313
def guess_extension(type, strict=True):
    """Guess a single extension for a file based on its MIME type.

    The lookup is delegated to the shared module-level database, which is
    created by init() on first use.

    The result is a string giving a filename extension, including the
    leading dot ('.'), or None when no extension can be guessed for
    `type'.  The extension is not guaranteed to have been associated with
    any particular data stream, but it would be mapped to the MIME type
    `type' by guess_type().

    When the optional `strict' argument is false, a number of commonly
    found but non-standard types are considered as well.
    """
    if _db is None:
        init()
    return _db.guess_extension(type, strict)
329
def add_type(type, ext, strict=True):
    """Register a mapping between a MIME type and an extension.

    The mapping is added to the shared module-level database, which is
    created by init() on first use.

    If the extension is already known, the new type replaces the old
    one.  If the type is already known, the extension is added to its
    list of known extensions.

    When `strict' is true the mapping goes into the list of standard
    types; otherwise it goes into the list of non-standard types.
    """
    if _db is None:
        init()
    return _db.add_type(type, ext, strict)
345
346
def init(files=None):
    """Build the shared MIME database and publish it at module level.

    A new MimeTypes instance is created.  When `files' is None it is
    filled from the Windows registry (only if _winreg is available) and
    then from knownfiles; otherwise from the given list.  In both cases
    only entries that exist as regular files are read.

    Afterwards the module-level suffix_map, encodings_map, types_map and
    common_types are rebound to the new instance's tables, and the
    instance becomes the database used by the module-level functions.
    """
    global suffix_map, types_map, encodings_map, common_types
    global inited, _db
    # Set the flag before constructing MimeTypes: its __init__() calls
    # init() whenever the flag is still false.
    inited = True
    fresh = MimeTypes()
    if files is None:
        if _winreg:
            fresh.read_windows_registry()
        files = knownfiles
    for path in files:
        if os.path.isfile(path):
            fresh.read(path)
    suffix_map = fresh.suffix_map
    encodings_map = fresh.encodings_map
    types_map = fresh.types_map[True]
    common_types = fresh.types_map[False]
    # Expose the database only now that it is completely populated.
    _db = fresh
365
366
def read_mime_types(file):
    """Parse one mime.types-format file, given by pathname.

    Returns the strict suffix-to-type dictionary of a new MimeTypes
    instance after the file has been read into it, or None when the file
    cannot be opened.
    """
    try:
        fp = open(file)
    except IOError:
        return None
    with fp:
        datastore = MimeTypes()
        datastore.readfp(fp, True)
    return datastore.types_map[True]
376
377
def _default_mime_types():
    """(Re)set the module-level default tables.

    Rebinds four module globals to freshly built dictionaries:

    suffix_map -- shorthand suffix -> expanded suffix
    encodings_map -- suffix -> encoding name
    types_map -- suffix -> standard MIME type
    common_types -- suffix -> non-standard but commonly seen MIME type
    """
    global suffix_map
    global encodings_map
    global types_map
    global common_types

    suffix_map = {
        '.svgz': '.svg.gz',
        '.tgz': '.tar.gz',
        '.taz': '.tar.gz',
        '.tz': '.tar.gz',
        '.tbz2': '.tar.bz2',
        '.txz': '.tar.xz',
        }

    encodings_map = {
        '.gz': 'gzip',
        '.Z': 'compress',
        '.bz2': 'bzip2',
        '.xz': 'xz',
        }

    # Before adding new types, make sure they are either registered with IANA,
    # at https://www.iana.org/assignments/media-types/
    # or extensions, i.e. using the x- prefix

    # If you add to these, please keep them sorted!
    #
    # A dictionary holds one value per key, so every suffix is listed once.
    # Two alternative types that used to be listed as duplicate keys were
    # always overridden by the entries kept below:
    #   '.cdf' : 'application/x-cdf'
    #   '.xls' : 'application/excel'
    types_map = {
        '.a'      : 'application/octet-stream',
        '.ai'     : 'application/postscript',
        '.aif'    : 'audio/x-aiff',
        '.aifc'   : 'audio/x-aiff',
        '.aiff'   : 'audio/x-aiff',
        '.au'     : 'audio/basic',
        '.avi'    : 'video/x-msvideo',
        '.bat'    : 'text/plain',
        '.bcpio'  : 'application/x-bcpio',
        '.bin'    : 'application/octet-stream',
        '.bmp'    : 'image/x-ms-bmp',
        '.c'      : 'text/plain',
        '.cdf'    : 'application/x-netcdf',
        '.cpio'   : 'application/x-cpio',
        '.csh'    : 'application/x-csh',
        '.css'    : 'text/css',
        '.csv'    : 'text/csv',
        '.dll'    : 'application/octet-stream',
        '.doc'    : 'application/msword',
        '.dot'    : 'application/msword',
        '.dvi'    : 'application/x-dvi',
        '.eml'    : 'message/rfc822',
        '.eps'    : 'application/postscript',
        '.etx'    : 'text/x-setext',
        '.exe'    : 'application/octet-stream',
        '.gif'    : 'image/gif',
        '.gtar'   : 'application/x-gtar',
        '.h'      : 'text/plain',
        '.hdf'    : 'application/x-hdf',
        '.htm'    : 'text/html',
        '.html'   : 'text/html',
        '.ico'    : 'image/vnd.microsoft.icon',
        '.ief'    : 'image/ief',
        '.jpe'    : 'image/jpeg',
        '.jpeg'   : 'image/jpeg',
        '.jpg'    : 'image/jpeg',
        '.js'     : 'application/javascript',
        '.json'   : 'application/json',
        '.ksh'    : 'text/plain',
        '.latex'  : 'application/x-latex',
        '.m1v'    : 'video/mpeg',
        '.man'    : 'application/x-troff-man',
        '.me'     : 'application/x-troff-me',
        '.mht'    : 'message/rfc822',
        '.mhtml'  : 'message/rfc822',
        '.mif'    : 'application/x-mif',
        '.mjs'    : 'application/javascript',
        '.mov'    : 'video/quicktime',
        '.movie'  : 'video/x-sgi-movie',
        '.mp2'    : 'audio/mpeg',
        '.mp3'    : 'audio/mpeg',
        '.mp4'    : 'video/mp4',
        '.mpa'    : 'video/mpeg',
        '.mpe'    : 'video/mpeg',
        '.mpeg'   : 'video/mpeg',
        '.mpg'    : 'video/mpeg',
        '.ms'     : 'application/x-troff-ms',
        '.nc'     : 'application/x-netcdf',
        '.nws'    : 'message/rfc822',
        '.o'      : 'application/octet-stream',
        '.obj'    : 'application/octet-stream',
        '.oda'    : 'application/oda',
        '.p12'    : 'application/x-pkcs12',
        '.p7c'    : 'application/pkcs7-mime',
        '.pbm'    : 'image/x-portable-bitmap',
        '.pdf'    : 'application/pdf',
        '.pfx'    : 'application/x-pkcs12',
        '.pgm'    : 'image/x-portable-graymap',
        '.pl'     : 'text/plain',
        '.png'    : 'image/png',
        '.pnm'    : 'image/x-portable-anymap',
        '.pot'    : 'application/vnd.ms-powerpoint',
        '.ppa'    : 'application/vnd.ms-powerpoint',
        '.ppm'    : 'image/x-portable-pixmap',
        '.pps'    : 'application/vnd.ms-powerpoint',
        '.ppt'    : 'application/vnd.ms-powerpoint',
        '.ps'     : 'application/postscript',
        '.pwz'    : 'application/vnd.ms-powerpoint',
        '.py'     : 'text/x-python',
        '.pyc'    : 'application/x-python-code',
        '.pyo'    : 'application/x-python-code',
        '.qt'     : 'video/quicktime',
        '.ra'     : 'audio/x-pn-realaudio',
        '.ram'    : 'application/x-pn-realaudio',
        '.ras'    : 'image/x-cmu-raster',
        '.rdf'    : 'application/xml',
        '.rgb'    : 'image/x-rgb',
        '.roff'   : 'application/x-troff',
        '.rtx'    : 'text/richtext',
        '.sgm'    : 'text/x-sgml',
        '.sgml'   : 'text/x-sgml',
        '.sh'     : 'application/x-sh',
        '.shar'   : 'application/x-shar',
        '.snd'    : 'audio/basic',
        '.so'     : 'application/octet-stream',
        '.src'    : 'application/x-wais-source',
        '.sv4cpio': 'application/x-sv4cpio',
        '.sv4crc' : 'application/x-sv4crc',
        '.svg'    : 'image/svg+xml',
        '.swf'    : 'application/x-shockwave-flash',
        '.t'      : 'application/x-troff',
        '.tar'    : 'application/x-tar',
        '.tcl'    : 'application/x-tcl',
        '.tex'    : 'application/x-tex',
        '.texi'   : 'application/x-texinfo',
        '.texinfo': 'application/x-texinfo',
        '.tif'    : 'image/tiff',
        '.tiff'   : 'image/tiff',
        '.tr'     : 'application/x-troff',
        '.tsv'    : 'text/tab-separated-values',
        '.txt'    : 'text/plain',
        '.ustar'  : 'application/x-ustar',
        '.vcf'    : 'text/x-vcard',
        '.wav'    : 'audio/x-wav',
        '.webm'   : 'video/webm',
        '.wiz'    : 'application/msword',
        '.wsdl'   : 'application/xml',
        '.xbm'    : 'image/x-xbitmap',
        '.xlb'    : 'application/vnd.ms-excel',
        '.xls'    : 'application/vnd.ms-excel',
        '.xml'    : 'text/xml',
        '.xpdl'   : 'application/xml',
        '.xpm'    : 'image/x-xpixmap',
        '.xsl'    : 'application/xml',
        '.xwd'    : 'image/x-xwindowdump',
        '.zip'    : 'application/zip',
        }

    # These are non-standard types, commonly found in the wild.  They will
    # only match if strict=0 flag is given to the API methods.

    # Please sort these too
    common_types = {
        '.jpg' : 'image/jpg',
        '.mid' : 'audio/midi',
        '.midi': 'audio/midi',
        '.pct' : 'image/pict',
        '.pic' : 'image/pict',
        '.pict': 'image/pict',
        '.rtf' : 'application/rtf',
        '.xul' : 'text/xul'
        }


# Populate the default tables as soon as the module is loaded.
_default_mime_types()
555
556
if __name__ == '__main__':
    # Command-line front end: guess a type for each argument, or an
    # extension for each argument when -e/--extension is given.
    import getopt

    USAGE = """\
Usage: mimetypes.py [options] type

Options:
    --help / -h       -- print this message and exit
    --lenient / -l    -- additionally search of some common, but non-standard
                         types.
    --extension / -e  -- guess extension instead of type

More than one type argument may be given.
"""

    def usage(code, msg=''):
        # Print the usage text, then the optional message, and exit with
        # `code' as the process status.
        print(USAGE)
        if msg:
            print(msg)
        sys.exit(code)

    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hle',
                                   ['help', 'lenient', 'extension'])
    except getopt.error as msg:
        usage(1, msg)

    strict = 1
    extension = 0
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage(0)
        elif opt in ('-l', '--lenient'):
            strict = 0
        elif opt in ('-e', '--extension'):
            extension = 1
    for gtype in args:
        if extension:
            guess = guess_extension(gtype, strict)
            if guess:
                print(guess)
            else:
                print("I don't know anything about type %s" % gtype)
        else:
            guess, encoding = guess_type(gtype, strict)
            if guess:
                print('type: %s encoding: %s' % (guess, encoding))
            else:
                print("I don't know anything about type %s" % gtype)
601