1"""Guess the MIME type of a file.
2
3This module defines two useful functions:
4
5guess_type(url, strict=1) -- guess the MIME type and encoding of a URL.
6
7guess_extension(type, strict=1) -- guess the extension for a given MIME type.
8
9It also contains the following, for tuning the behavior:
10
11Data:
12
13knownfiles -- list of files to parse
14inited -- flag set when init() has been called
15suffix_map -- dictionary mapping suffixes to suffixes
16encodings_map -- dictionary mapping suffixes to encodings
17types_map -- dictionary mapping suffixes to types
18
19Functions:
20
21init([files]) -- parse a list of files, default knownfiles (on Windows, the
22  default values are taken from the registry)
23read_mime_types(file) -- parse one file, return a dictionary or None
24"""
25
26import os
27import sys
28import posixpath
29import urllib
30try:
31    import _winreg
32except ImportError:
33    _winreg = None
34
# Public API of this module (the names exported by ``from mimetypes import *``).
__all__ = [
    "guess_type",
    "guess_extension",
    "guess_all_extensions",
    "add_type",
    "read_mime_types",
    "init",
]
39
# mime.types-format files consulted by init() when no explicit list is
# given.  They are read in this order; files that do not exist are skipped.
knownfiles = [
    "/etc/mime.types",
    "/etc/httpd/mime.types",                    # Mac OS X
    "/etc/httpd/conf/mime.types",               # Apache
    "/etc/apache/mime.types",                   # Apache 1
    "/etc/apache2/mime.types",                  # Apache 2
    "/usr/local/etc/httpd/conf/mime.types",
    "/usr/local/lib/netscape/mime.types",
    "/usr/local/etc/httpd/conf/mime.types",     # Apache 1.2
    "/usr/local/etc/mime.types",                # Apache 1.3
]

# Set to True by init() as soon as it starts running.
inited = False
# Shared MimeTypes instance behind the module-level functions; created by
# init().
_db = None
54
55
class MimeTypes:
    """MIME-types datastore.

    This datastore can handle information from mime.types-style files
    and supports basic determination of MIME type from a filename or
    URL, and can guess a reasonable extension given a MIME type.

    Instance attributes:

    encodings_map -- dictionary mapping suffixes to encodings
    suffix_map -- dictionary mapping suffixes to suffixes
    types_map -- pair of dictionaries (non-strict, strict) mapping
      suffixes to types
    types_map_inv -- pair of dictionaries (non-strict, strict) mapping
      types to lists of suffixes
    """

    def __init__(self, filenames=(), strict=True):
        """Create a datastore seeded with the module-level default tables.

        Each name in `filenames' is then read as a mime.types-format
        file; `strict' selects which table (standard or non-standard)
        the entries from those files are added to.
        """
        if not inited:
            init()
        self.encodings_map = encodings_map.copy()
        self.suffix_map = suffix_map.copy()
        self.types_map = ({}, {}) # dict for (non-strict, strict)
        self.types_map_inv = ({}, {})
        for (ext, type) in types_map.items():
            self.add_type(type, ext, True)
        for (ext, type) in common_types.items():
            self.add_type(type, ext, False)
        for name in filenames:
            self.read(name, strict)

    def add_type(self, type, ext, strict=True):
        """Add a mapping between a type and an extension.

        When the extension is already known, the new
        type will replace the old one. When the type
        is already known the extension will be added
        to the list of known extensions.

        If strict is true, information will be added to
        list of standard types, else to the list of non-standard
        types.
        """
        # `strict' doubles as the tuple index: 1/True selects the standard
        # tables, 0/False the non-standard ones.
        self.types_map[strict][ext] = type
        exts = self.types_map_inv[strict].setdefault(type, [])
        if ext not in exts:
            exts.append(ext)

    def guess_type(self, url, strict=True):
        """Guess the type of a file based on its URL.

        Return value is a tuple (type, encoding) where type is None if
        the type can't be guessed (no or unknown suffix) or a string
        of the form type/subtype, usable for a MIME Content-type
        header; and encoding is None for no encoding or the name of
        the program used to encode (e.g. compress or gzip).  The
        mappings are table driven.  Encoding suffixes are case
        sensitive; type suffixes are first tried case sensitive, then
        case insensitive.

        The suffixes .tgz, .taz and .tz (case sensitive!) are all
        mapped to '.tar.gz'.  (This is table-driven too, using the
        dictionary suffix_map.)

        Optional `strict' argument when False adds a bunch of commonly found,
        but non-standard types.
        """
        scheme, url = urllib.splittype(url)
        if scheme == 'data':
            # syntax of data URLs:
            # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
            # mediatype := [ type "/" subtype ] *( ";" parameter )
            # data      := *urlchar
            # parameter := attribute "=" value
            # type/subtype defaults to "text/plain"
            comma = url.find(',')
            if comma < 0:
                # bad data URL
                return None, None
            semi = url.find(';', 0, comma)
            if semi >= 0:
                type = url[:semi]
            else:
                type = url[:comma]
            if '=' in type or '/' not in type:
                type = 'text/plain'
            return type, None           # never compressed, so encoding is None
        base, ext = posixpath.splitext(url)
        # Expand shorthand suffixes (e.g. '.tgz' -> '.tar.gz') until the
        # suffix is no longer a key of suffix_map.
        while ext in self.suffix_map:
            base, ext = posixpath.splitext(base + self.suffix_map[ext])
        if ext in self.encodings_map:
            # Peel off the encoding suffix; the type comes from the next one.
            encoding = self.encodings_map[ext]
            base, ext = posixpath.splitext(base)
        else:
            encoding = None
        # The standard table is always consulted first; the non-standard
        # one only when the caller asked for a lenient lookup.
        if strict:
            tables = (self.types_map[True],)
        else:
            tables = (self.types_map[True], self.types_map[False])
        ext_lower = ext.lower()
        for table in tables:
            if ext in table:
                return table[ext], encoding
            if ext_lower in table:
                return table[ext_lower], encoding
        return None, encoding

    def guess_all_extensions(self, type, strict=True):
        """Guess the extensions for a file based on its MIME type.

        Return value is a list of strings giving the possible filename
        extensions, including the leading dot ('.').  The extension is not
        guaranteed to have been associated with any particular data stream,
        but would be mapped to the MIME type `type' by guess_type().
        An empty list is returned when the type is unknown.

        Optional `strict' argument when false adds a bunch of commonly found,
        but non-standard types.
        """
        type = type.lower()
        # Work on a copy: appending to (or handing out) the stored list
        # would leak non-standard extensions into the standard table and
        # let callers corrupt the datastore.
        extensions = list(self.types_map_inv[True].get(type, []))
        if not strict:
            for ext in self.types_map_inv[False].get(type, []):
                if ext not in extensions:
                    extensions.append(ext)
        return extensions

    def guess_extension(self, type, strict=True):
        """Guess the extension for a file based on its MIME type.

        Return value is a string giving a filename extension,
        including the leading dot ('.').  The extension is not
        guaranteed to have been associated with any particular data
        stream, but would be mapped to the MIME type `type' by
        guess_type().  If no extension can be guessed for `type', None
        is returned.

        Optional `strict' argument when false adds a bunch of commonly found,
        but non-standard types.
        """
        extensions = self.guess_all_extensions(type, strict)
        if not extensions:
            return None
        return extensions[0]

    def read(self, filename, strict=True):
        """
        Read a single mime.types-format file, specified by pathname.

        If strict is true, information will be added to
        list of standard types, else to the list of non-standard
        types.
        """
        with open(filename) as fp:
            self.readfp(fp, strict)

    def readfp(self, fp, strict=True):
        """
        Read a single mime.types-format file.

        `fp' only needs a readline() method.  Each line holds a MIME
        type followed by whitespace-separated suffixes (without the
        leading dot); a word starting with '#' begins a comment that
        runs to the end of the line.

        If strict is true, information will be added to
        list of standard types, else to the list of non-standard
        types.
        """
        while True:
            line = fp.readline()
            if not line:
                break
            words = line.split()
            # Drop the first word that starts a comment and everything
            # after it.
            for i, word in enumerate(words):
                if word[0] == '#':
                    del words[i:]
                    break
            if not words:
                continue
            type, suffixes = words[0], words[1:]
            for suff in suffixes:
                self.add_type(type, '.' + suff, strict)

    def read_windows_registry(self, strict=True):
        """
        Load the MIME types database from Windows registry.

        Does nothing when the _winreg module is not available.

        If strict is true, information will be added to
        list of standard types, else to the list of non-standard
        types.
        """

        # Windows only
        if not _winreg:
            return

        default_encoding = sys.getdefaultencoding()

        def enum_types(mimedb):
            # Yield the name of every subkey of `mimedb', skipping names
            # that cannot be encoded in the default encoding.
            i = 0
            while True:
                try:
                    ctype = _winreg.EnumKey(mimedb, i)
                except EnvironmentError:
                    # Raised once the index runs past the last subkey.
                    break
                try:
                    ctype = ctype.encode(default_encoding) # omit in 3.x!
                except UnicodeEncodeError:
                    pass
                else:
                    yield ctype
                i += 1

        with _winreg.OpenKey(_winreg.HKEY_CLASSES_ROOT,
                             r'MIME\Database\Content Type') as mimedb:
            for ctype in enum_types(mimedb):
                try:
                    with _winreg.OpenKey(mimedb, ctype) as key:
                        suffix, datatype = _winreg.QueryValueEx(key,
                                                                'Extension')
                except EnvironmentError:
                    # No readable 'Extension' value for this content type.
                    continue
                if datatype != _winreg.REG_SZ:
                    continue
                try:
                    suffix = suffix.encode(default_encoding) # omit in 3.x!
                except UnicodeEncodeError:
                    continue
                self.add_type(ctype, suffix, strict)
273
274
def guess_type(url, strict=True):
    """Guess the MIME type and encoding of a file from its URL.

    Returns a (type, encoding) tuple.  `type' is a 'type/subtype'
    string suitable for a MIME Content-type header, or None when the
    suffix is missing or unknown.  `encoding' is the name of the
    program used to encode the file (e.g. compress or gzip), or None
    when there is no encoding.  All mappings are table driven.
    Encoding suffixes are matched case sensitively; type suffixes are
    tried case sensitively first and then case insensitively.

    The suffixes .tgz, .taz and .tz (case sensitive!) are all mapped
    to ".tar.gz" through the suffix_map dictionary.

    When the optional `strict' argument is false, a set of commonly
    found but non-standard types is consulted as well.
    """
    if _db is None:
        # First use: build the shared database.
        init()
    lookup = _db.guess_type
    return lookup(url, strict)
296
297
def guess_all_extensions(type, strict=True):
    """Guess every extension for a file based on its MIME type.

    Returns a list of filename extensions, each including the leading
    dot ('.').  The extensions are not guaranteed to have been
    associated with any particular data stream, but would be mapped to
    the MIME type `type' by guess_type().  The result is whatever the
    shared database's guess_all_extensions() method returns.

    When the optional `strict' argument is false, a set of commonly
    found but non-standard types is consulted as well.
    """
    if _db is None:
        # First use: build the shared database.
        init()
    lookup = _db.guess_all_extensions
    return lookup(type, strict)
314
def guess_extension(type, strict=True):
    """Guess one extension for a file based on its MIME type.

    Returns a filename extension string including the leading dot
    ('.'), or None when no extension can be guessed for `type'.  The
    extension is not guaranteed to have been associated with any
    particular data stream, but would be mapped to the MIME type
    `type' by guess_type().

    When the optional `strict' argument is false, a set of commonly
    found but non-standard types is consulted as well.
    """
    if _db is None:
        # First use: build the shared database.
        init()
    lookup = _db.guess_extension
    return lookup(type, strict)
330
def add_type(type, ext, strict=True):
    """Register a mapping between a MIME type and an extension.

    If the extension is already known, its type is replaced by the
    new one.  If the type is already known, the extension is added to
    that type's list of known extensions.

    A true `strict' adds the mapping to the standard types; a false
    one adds it to the non-standard types.
    """
    if _db is None:
        # First use: build the shared database.
        init()
    register = _db.add_type
    return register(type, ext, strict)
346
347
def init(files=None):
    """(Re)build the module-level MIME database.

    A fresh MimeTypes instance is created and fed every existing file
    named in `files'.  When `files' is None, the Windows registry is
    read first (if _winreg is available) and then the files listed in
    knownfiles.  Finally the module globals encodings_map, suffix_map,
    types_map and common_types are rebound to the new instance's
    tables, and the instance becomes the database used by the
    module-level functions.
    """
    global suffix_map, types_map, encodings_map, common_types
    global inited, _db
    # Must be set before MimeTypes() is constructed, otherwise
    # MimeTypes.__init__() would call init() again.
    inited = True
    database = MimeTypes()
    if files is None:
        if _winreg:
            database.read_windows_registry()
        files = knownfiles
    for path in files:
        if not os.path.isfile(path):
            continue
        database.read(path)
    encodings_map = database.encodings_map
    suffix_map = database.suffix_map
    types_map = database.types_map[True]
    common_types = database.types_map[False]
    # Publish the database only now that it is fully initialized.
    _db = database
366
367
def read_mime_types(file):
    """Parse one mime.types-format file and return its mappings.

    `file' is the pathname of the file to read.  Returns the strict
    extension-to-type dictionary of a fresh MimeTypes instance after the
    file has been read into it, or None when the file cannot be opened.
    """
    try:
        f = open(file)
    except IOError:
        return None
    # Close the file even if parsing raises; it used to be left open.
    with f:
        db = MimeTypes()
        db.readfp(f, True)
        return db.types_map[True]
376
377
def _default_mime_types():
    """Install the built-in default tables as module globals.

    Sets suffix_map, encodings_map, types_map (standard types) and
    common_types (non-standard types that only match when strict is
    false).
    """
    global suffix_map
    global encodings_map
    global types_map
    global common_types

    suffix_map = {
        '.tgz': '.tar.gz',
        '.taz': '.tar.gz',
        '.tz': '.tar.gz',
        '.tbz2': '.tar.bz2',
        }

    encodings_map = {
        '.gz': 'gzip',
        '.Z': 'compress',
        '.bz2': 'bzip2',
        }

    # Before adding new types, make sure they are either registered with IANA,
    # at https://www.iana.org/assignments/media-types/
    # or extensions, i.e. using the x- prefix

    # If you add to these, please keep them sorted!
    types_map = {
        '.a'      : 'application/octet-stream',
        '.ai'     : 'application/postscript',
        '.aif'    : 'audio/x-aiff',
        '.aifc'   : 'audio/x-aiff',
        '.aiff'   : 'audio/x-aiff',
        '.au'     : 'audio/basic',
        '.avi'    : 'video/x-msvideo',
        '.bat'    : 'text/plain',
        '.bcpio'  : 'application/x-bcpio',
        '.bin'    : 'application/octet-stream',
        '.bmp'    : 'image/x-ms-bmp',
        '.c'      : 'text/plain',
        # '.cdf' is also used for 'application/x-cdf'; a dictionary can
        # hold only one type per suffix, so only this one is listed.
        '.cdf'    : 'application/x-netcdf',
        '.cpio'   : 'application/x-cpio',
        '.csh'    : 'application/x-csh',
        '.css'    : 'text/css',
        '.dll'    : 'application/octet-stream',
        '.doc'    : 'application/msword',
        '.dot'    : 'application/msword',
        '.dvi'    : 'application/x-dvi',
        '.eml'    : 'message/rfc822',
        '.eps'    : 'application/postscript',
        '.etx'    : 'text/x-setext',
        '.exe'    : 'application/octet-stream',
        '.gif'    : 'image/gif',
        '.gtar'   : 'application/x-gtar',
        '.h'      : 'text/plain',
        '.hdf'    : 'application/x-hdf',
        '.htm'    : 'text/html',
        '.html'   : 'text/html',
        '.ico'    : 'image/vnd.microsoft.icon',
        '.ief'    : 'image/ief',
        '.jpe'    : 'image/jpeg',
        '.jpeg'   : 'image/jpeg',
        '.jpg'    : 'image/jpeg',
        '.js'     : 'application/javascript',
        '.ksh'    : 'text/plain',
        '.latex'  : 'application/x-latex',
        '.m1v'    : 'video/mpeg',
        '.man'    : 'application/x-troff-man',
        '.me'     : 'application/x-troff-me',
        '.mht'    : 'message/rfc822',
        '.mhtml'  : 'message/rfc822',
        '.mif'    : 'application/x-mif',
        '.mov'    : 'video/quicktime',
        '.movie'  : 'video/x-sgi-movie',
        '.mp2'    : 'audio/mpeg',
        '.mp3'    : 'audio/mpeg',
        '.mp4'    : 'video/mp4',
        '.mpa'    : 'video/mpeg',
        '.mpe'    : 'video/mpeg',
        '.mpeg'   : 'video/mpeg',
        '.mpg'    : 'video/mpeg',
        '.ms'     : 'application/x-troff-ms',
        '.nc'     : 'application/x-netcdf',
        '.nws'    : 'message/rfc822',
        '.o'      : 'application/octet-stream',
        '.obj'    : 'application/octet-stream',
        '.oda'    : 'application/oda',
        '.p12'    : 'application/x-pkcs12',
        '.p7c'    : 'application/pkcs7-mime',
        '.pbm'    : 'image/x-portable-bitmap',
        '.pdf'    : 'application/pdf',
        '.pfx'    : 'application/x-pkcs12',
        '.pgm'    : 'image/x-portable-graymap',
        '.pl'     : 'text/plain',
        '.png'    : 'image/png',
        '.pnm'    : 'image/x-portable-anymap',
        '.pot'    : 'application/vnd.ms-powerpoint',
        '.ppa'    : 'application/vnd.ms-powerpoint',
        '.ppm'    : 'image/x-portable-pixmap',
        '.pps'    : 'application/vnd.ms-powerpoint',
        '.ppt'    : 'application/vnd.ms-powerpoint',
        '.ps'     : 'application/postscript',
        '.pwz'    : 'application/vnd.ms-powerpoint',
        '.py'     : 'text/x-python',
        '.pyc'    : 'application/x-python-code',
        '.pyo'    : 'application/x-python-code',
        '.qt'     : 'video/quicktime',
        '.ra'     : 'audio/x-pn-realaudio',
        '.ram'    : 'application/x-pn-realaudio',
        '.ras'    : 'image/x-cmu-raster',
        '.rdf'    : 'application/xml',
        '.rgb'    : 'image/x-rgb',
        '.roff'   : 'application/x-troff',
        '.rtx'    : 'text/richtext',
        '.sgm'    : 'text/x-sgml',
        '.sgml'   : 'text/x-sgml',
        '.sh'     : 'application/x-sh',
        '.shar'   : 'application/x-shar',
        '.snd'    : 'audio/basic',
        '.so'     : 'application/octet-stream',
        '.src'    : 'application/x-wais-source',
        '.sv4cpio': 'application/x-sv4cpio',
        '.sv4crc' : 'application/x-sv4crc',
        '.swf'    : 'application/x-shockwave-flash',
        '.t'      : 'application/x-troff',
        '.tar'    : 'application/x-tar',
        '.tcl'    : 'application/x-tcl',
        '.tex'    : 'application/x-tex',
        '.texi'   : 'application/x-texinfo',
        '.texinfo': 'application/x-texinfo',
        '.tif'    : 'image/tiff',
        '.tiff'   : 'image/tiff',
        '.tr'     : 'application/x-troff',
        '.tsv'    : 'text/tab-separated-values',
        '.txt'    : 'text/plain',
        '.ustar'  : 'application/x-ustar',
        '.vcf'    : 'text/x-vcard',
        '.wav'    : 'audio/x-wav',
        '.wiz'    : 'application/msword',
        '.wsdl'   : 'application/xml',
        '.xbm'    : 'image/x-xbitmap',
        '.xlb'    : 'application/vnd.ms-excel',
        # '.xls' is also used for 'application/excel'; a dictionary can
        # hold only one type per suffix, so only this one is listed.
        '.xls'    : 'application/vnd.ms-excel',
        '.xml'    : 'text/xml',
        '.xpdl'   : 'application/xml',
        '.xpm'    : 'image/x-xpixmap',
        '.xsl'    : 'application/xml',
        '.xwd'    : 'image/x-xwindowdump',
        '.zip'    : 'application/zip',
        }

    # These are non-standard types, commonly found in the wild.  They will
    # only match if strict=0 flag is given to the API methods.

    # Please sort these too
    common_types = {
        '.jpg' : 'image/jpg',
        '.mid' : 'audio/midi',
        '.midi': 'audio/midi',
        '.pct' : 'image/pict',
        '.pic' : 'image/pict',
        '.pict': 'image/pict',
        '.rtf' : 'application/rtf',
        '.xul' : 'text/xul'
        }


# Populate the default tables at import time.
_default_mime_types()
547
548
549if __name__ == '__main__':
550    import getopt
551
552    USAGE = """\
553Usage: mimetypes.py [options] type
554
555Options:
556    --help / -h       -- print this message and exit
557    --lenient / -l    -- additionally search of some common, but non-standard
558                         types.
559    --extension / -e  -- guess extension instead of type
560
561More than one type argument may be given.
562"""
563
564    def usage(code, msg=''):
565        print USAGE
566        if msg: print msg
567        sys.exit(code)
568
569    try:
570        opts, args = getopt.getopt(sys.argv[1:], 'hle',
571                                   ['help', 'lenient', 'extension'])
572    except getopt.error, msg:
573        usage(1, msg)
574
575    strict = 1
576    extension = 0
577    for opt, arg in opts:
578        if opt in ('-h', '--help'):
579            usage(0)
580        elif opt in ('-l', '--lenient'):
581            strict = 0
582        elif opt in ('-e', '--extension'):
583            extension = 1
584    for gtype in args:
585        if extension:
586            guess = guess_extension(gtype, strict)
587            if not guess: print "I don't know anything about type", gtype
588            else: print guess
589        else:
590            guess, encoding = guess_type(gtype, strict)
591            if not guess: print "I don't know anything about type", gtype
592            else: print 'type:', guess, 'encoding:', encoding
593