1import struct
2import time
3import io
4
5import cherrypy
6from cherrypy._cpcompat import text_or_bytes
7from cherrypy.lib import file_generator
8from cherrypy.lib import is_closable_iterator
9from cherrypy.lib import set_vary_header
10
11
# Compression levels that compress() reports through a dedicated XFL byte
# in the gzip header (RFC 1952, section 2.3.1); any other level is written
# as XFL = 0.
_COMPRESSION_LEVEL_FAST = 1  # written as XFL = 4 (fastest algorithm)
_COMPRESSION_LEVEL_BEST = 9  # written as XFL = 2 (maximum compression)
14
15
def decode(encoding=None, default_encoding='utf-8'):
    """Configure the charsets tried when decoding the request entity.

    Both arguments accept either one charset name or a list of names.

    encoding
        When not None, the request body is decoded using only these
        charsets; they *replace* the current list, regardless of the
        charset announced in the Content-Type request header.

    default_encoding
        Consulted only when 'encoding' is None. When truthy, these
        charsets are *appended* to the list of charsets already attempted
        for the request body.

    """
    request_body = cherrypy.request.body

    if encoding is not None:
        # Forced charsets: overwrite whatever was configured before.
        forced = encoding if isinstance(encoding, list) else [encoding]
        request_body.attempt_charsets = forced
        return

    if default_encoding:
        # Fallback charsets: keep the existing ones and add ours at the end.
        extras = (default_encoding if isinstance(default_encoding, list)
                  else [default_encoding])
        request_body.attempt_charsets = request_body.attempt_charsets + extras
41
42
class UTF8StreamEncoder:
    """Iterator wrapper that emits text chunks as UTF-8 encoded bytes.

    Chunks that are not ``str`` instances are passed through untouched.
    Non-dunder attributes that the wrapper does not define are looked up
    on the wrapped iterator.
    """

    def __init__(self, iterator):
        self._iterator = iterator

    def __iter__(self):
        return self

    def next(self):
        # Python 2 style spelling of the iterator protocol.
        return self.__next__()

    def __next__(self):
        chunk = next(self._iterator)
        return chunk.encode('utf-8') if isinstance(chunk, str) else chunk

    def close(self):
        """Close the wrapped iterator when it is a closable iterator."""
        if not is_closable_iterator(self._iterator):
            return
        self._iterator.close()

    def __getattr__(self, attr):
        # Only reached for names not found on the wrapper itself; dunder
        # names are never delegated to the wrapped iterator.
        if not attr.startswith('__'):
            return getattr(self._iterator, attr)
        raise AttributeError(self, attr)
67
68
class ResponseEncoder:
    """Request handler wrapper that encodes the response body.

    Instantiating the class replaces ``cherrypy.serving.request.handler``
    with the new instance (when a handler is set) and keeps the previous
    handler as ``oldhandler``. Calling the instance runs that handler and,
    for eligible Content-Types, encodes the returned body with a charset
    picked by ``find_acceptable_charset``.

    Every keyword argument passed to the constructor is set as an instance
    attribute, overriding the class-level default of the same name.
    """

    # Charset tried when the client accepts anything (no Accept-Charset
    # header, or a "*" element).
    default_encoding = 'utf-8'
    # Template of the HTTP 500 message raised when default_encoding fails.
    failmsg = 'Response body could not be encoded with %r.'
    # When not None, the only charset that is attempted.
    encoding = None
    # Error handler name passed to str.encode().
    errors = 'strict'
    # When True, only "text/*" Content-Types are encoded.
    text_only = True
    # When False, __call__ performs no charset negotiation at all.
    add_charset = True
    # When True, each negotiation step is logged under 'TOOLS.ENCODE'.
    debug = False

    def __init__(self, **kwargs):
        for k, v in kwargs.items():
            setattr(self, k, v)

        # Charsets already tried for this response; each is tried once.
        self.attempted_charsets = set()
        request = cherrypy.serving.request
        if request.handler is not None:
            # Replace request.handler with self
            if self.debug:
                cherrypy.log('Replacing request.handler', 'TOOLS.ENCODE')
            self.oldhandler = request.handler
            request.handler = self

    def encode_stream(self, encoding):
        """Encode a streaming response body.

        Use a generator wrapper, and just pray it works as the stream is
        being written out.

        Returns True when the body was wrapped, False when 'encoding' was
        already attempted or does not name a known codec (in which case
        ``self.body`` is left untouched).
        """
        import codecs

        if encoding in self.attempted_charsets:
            return False
        self.attempted_charsets.add(encoding)

        # Chunks are encoded lazily, so without this check an unknown
        # charset would be accepted here and only blow up with LookupError
        # once the response is already being written out.
        try:
            codecs.lookup(encoding)
        except (LookupError, ValueError):
            return False

        def encoder(body):
            for chunk in body:
                if isinstance(chunk, str):
                    chunk = chunk.encode(encoding, self.errors)
                yield chunk
        self.body = encoder(self.body)
        return True

    def encode_string(self, encoding):
        """Encode a buffered response body.

        Returns True and replaces ``self.body`` with a list of encoded
        chunks on success. Returns False when 'encoding' was already
        attempted or when any text chunk cannot be encoded with it.
        """
        if encoding in self.attempted_charsets:
            return False
        self.attempted_charsets.add(encoding)

        # Materialize the body before encoding: if it is a one-shot
        # iterator (e.g. a generator returned by the handler), a failed
        # attempt must not eat chunks that the next charset still needs.
        chunks = self.body if isinstance(self.body, list) else list(self.body)
        self.body = chunks

        encoded = []
        for chunk in chunks:
            if isinstance(chunk, str):
                try:
                    chunk = chunk.encode(encoding, self.errors)
                except (LookupError, UnicodeError):
                    return False
            encoded.append(chunk)
        self.body = encoded
        return True

    def find_acceptable_charset(self):
        """Encode ``self.body`` and return the name of the charset used.

        Candidates are tried in this order: the forced ``self.encoding``
        (if set and allowed by Accept-Charset); otherwise
        ``default_encoding`` when there is no Accept-Charset header;
        otherwise each Accept-Charset element with a qvalue > 0, followed
        by ISO-8859-1 when neither it nor "*" is listed.

        For non-streamed responses the Content-Length response header is
        removed, since the encoded length may differ.

        Raises cherrypy.HTTPError 500 when no Accept-Charset header was
        sent, no encoding is forced and ``default_encoding`` fails;
        raises cherrypy.HTTPError 406 when no candidate succeeds.
        """
        request = cherrypy.serving.request
        response = cherrypy.serving.response

        if self.debug:
            cherrypy.log('response.stream %r' %
                         response.stream, 'TOOLS.ENCODE')
        if response.stream:
            encoder = self.encode_stream
        else:
            encoder = self.encode_string
            if 'Content-Length' in response.headers:
                # Delete Content-Length header so finalize() recalcs it.
                # Encoded strings may be of different lengths from their
                # unicode equivalents, and even from each other. For example:
                # >>> t = u"\u7007\u3040"
                # >>> len(t)
                # 2
                # >>> len(t.encode("UTF-8"))
                # 6
                # >>> len(t.encode("utf7"))
                # 8
                del response.headers['Content-Length']

        # Parse the Accept-Charset request header, and try to provide one
        # of the requested charsets (in order of user preference).
        encs = request.headers.elements('Accept-Charset')
        charsets = [enc.value.lower() for enc in encs]
        if self.debug:
            cherrypy.log('charsets %s' % repr(charsets), 'TOOLS.ENCODE')

        if self.encoding is not None:
            # If specified, force this encoding to be used, or fail.
            encoding = self.encoding.lower()
            if self.debug:
                cherrypy.log('Specified encoding %r' %
                             encoding, 'TOOLS.ENCODE')
            if (not charsets) or '*' in charsets or encoding in charsets:
                if self.debug:
                    cherrypy.log('Attempting encoding %r' %
                                 encoding, 'TOOLS.ENCODE')
                if encoder(encoding):
                    return encoding
        else:
            if not encs:
                if self.debug:
                    cherrypy.log('Attempting default encoding %r' %
                                 self.default_encoding, 'TOOLS.ENCODE')
                # Any character-set is acceptable.
                if encoder(self.default_encoding):
                    return self.default_encoding
                else:
                    raise cherrypy.HTTPError(500, self.failmsg %
                                             self.default_encoding)
            else:
                for element in encs:
                    if element.qvalue > 0:
                        if element.value == '*':
                            # Matches any charset. Try our default.
                            if self.debug:
                                cherrypy.log('Attempting default encoding due '
                                             'to %r' % element, 'TOOLS.ENCODE')
                            if encoder(self.default_encoding):
                                return self.default_encoding
                        else:
                            encoding = element.value
                            if self.debug:
                                cherrypy.log('Attempting encoding %s (qvalue >'
                                             '0)' % element, 'TOOLS.ENCODE')
                            if encoder(encoding):
                                return encoding

                if '*' not in charsets:
                    # If no "*" is present in an Accept-Charset field, then all
                    # character sets not explicitly mentioned get a quality
                    # value of 0, except for ISO-8859-1, which gets a quality
                    # value of 1 if not explicitly mentioned.
                    iso = 'iso-8859-1'
                    if iso not in charsets:
                        if self.debug:
                            cherrypy.log('Attempting ISO-8859-1 encoding',
                                         'TOOLS.ENCODE')
                        if encoder(iso):
                            return iso

        # No suitable encoding found.
        ac = request.headers.get('Accept-Charset')
        if ac is None:
            msg = 'Your client did not send an Accept-Charset header.'
        else:
            msg = 'Your client sent this Accept-Charset header: %s.' % ac
        _charsets = ', '.join(sorted(self.attempted_charsets))
        msg += ' We tried these charsets: %s.' % (_charsets,)
        raise cherrypy.HTTPError(406, msg)

    def __call__(self, *args, **kwargs):
        """Run the wrapped handler and return its (possibly encoded) body.

        The body is encoded, and the "charset" parameter of the response
        Content-Type header is set, only when a Content-Type header is
        present, ``add_charset`` is true, and either ``text_only`` is
        false or the Content-Type starts with "text/".
        """
        response = cherrypy.serving.response
        self.body = self.oldhandler(*args, **kwargs)

        self.body = prepare_iter(self.body)

        ct = response.headers.elements('Content-Type')
        if self.debug:
            cherrypy.log('Content-Type: %r' % [str(h)
                         for h in ct], 'TOOLS.ENCODE')
        if ct and self.add_charset:
            # Only the first Content-Type element is considered.
            ct = ct[0]
            if self.text_only:
                if ct.value.lower().startswith('text/'):
                    if self.debug:
                        cherrypy.log(
                            'Content-Type %s starts with "text/"' % ct,
                            'TOOLS.ENCODE')
                    do_find = True
                else:
                    if self.debug:
                        cherrypy.log('Not finding because Content-Type %s '
                                     'does not start with "text/"' % ct,
                                     'TOOLS.ENCODE')
                    do_find = False
            else:
                if self.debug:
                    cherrypy.log('Finding because not text_only',
                                 'TOOLS.ENCODE')
                do_find = True

            if do_find:
                # Set "charset=..." param on response Content-Type header
                ct.params['charset'] = self.find_acceptable_charset()
                if self.debug:
                    cherrypy.log('Setting Content-Type %s' % ct,
                                 'TOOLS.ENCODE')
                response.headers['Content-Type'] = str(ct)

        return self.body
261
262
def prepare_iter(value):
    """
    Turn a handler's return value into an iterable response body.

    The result is falsy when there is nothing to send: None and empty
    strings both become an empty list.
    """
    if value is None:
        return []

    if isinstance(value, text_or_bytes):
        # A one-item list iterates far faster than a long string walked
        # character by character. An empty string becomes [] rather than
        # [''], because [''] would not evaluate to False.
        return [value] if value else []

    # Duck-type file-like objects instead of using isinstance: checking
    # against io.IOBase (an ABC) takes about 1000 times as long as, say,
    # isinstance(value, str).
    if hasattr(value, 'read'):
        return file_generator(value)

    return value
283
284
285# GZIP
286
287
def compress(body, compress_level):
    """Yield 'body' as a gzip stream deflated at compress_level.

    'body' is an iterable of byte chunks. The generator emits the gzip
    member header piece by piece, then one deflate output per input
    chunk, the final flush, and the CRC32 / ISIZE trailer.
    """
    import zlib

    mask32 = 0xFFFFFFFF  # keeps a value within an unsigned 32-bit field

    # Header layout: see https://tools.ietf.org/html/rfc1952
    yield b'\x1f\x8b'       # ID1 and ID2: gzip marker
    yield b'\x08'           # CM: compression method
    yield b'\x00'           # FLG: none set
    # MTIME: 4 bytes
    yield struct.pack('<L', int(time.time()) & mask32)

    # RFC 1952, section 2.3.1:
    #
    # XFL (eXtra FLags)
    #    These flags are available for use by specific compression
    #    methods.  The "deflate" method (CM = 8) sets these flags as
    #    follows:
    #
    #       XFL = 2 - compressor used maximum compression,
    #                 slowest algorithm
    #       XFL = 4 - compressor used fastest algorithm
    if compress_level == _COMPRESSION_LEVEL_BEST:
        extra_flags = b'\x02'   # max compression, slowest algo
    elif compress_level == _COMPRESSION_LEVEL_FAST:
        extra_flags = b'\x04'   # min compression, fastest algo
    else:
        extra_flags = b'\x00'   # compression unset/tradeoff
    yield extra_flags
    yield b'\xff'           # OS: unknown

    checksum = zlib.crc32(b'')
    total_size = 0
    # Negative wbits: raw deflate data, the gzip framing is written here.
    deflater = zlib.compressobj(
        compress_level,
        zlib.DEFLATED,
        -zlib.MAX_WBITS,
        zlib.DEF_MEM_LEVEL,
        0,
    )
    for piece in body:
        total_size += len(piece)
        checksum = zlib.crc32(piece, checksum)
        yield deflater.compress(piece)
    yield deflater.flush()

    # CRC32: 4 bytes
    yield struct.pack('<L', checksum & mask32)
    # ISIZE: 4 bytes
    yield struct.pack('<L', total_size & mask32)
332
333
def decompress(body):
    """Return the decompressed content of the gzip-encoded bytes 'body'."""
    import gzip

    # GzipFile reads from a file object, so stage the payload in memory
    # and rewind it to the start.
    buffer = io.BytesIO()
    buffer.write(body)
    buffer.seek(0)

    with gzip.GzipFile(mode='rb', fileobj=buffer) as reader:
        return reader.read()
344
345
def _gzip_mime_type_allowed(ct, mime_types):
    """Tell whether the content type 'ct' is covered by 'mime_types'.

    An entry matches when it equals 'ct', when it is ``type/*`` with the
    same type, or when it is ``type/*+suffix`` with the same type and
    'ct' carries the same ``+suffix``. Parameters are not supported.
    """
    if ct in mime_types:
        return True

    # partition()/rpartition() never raise, unlike unpacking the result of
    # split(), which fails with ValueError on values such as 'a/b/c' or
    # 'application/vnd.x+y+xml'.
    ct_media_type, slash, ct_sub_type = ct.partition('/')
    if not slash:
        return False
    # The structured syntax suffix is whatever follows the last '+'.
    ct_suffix = ct_sub_type.rpartition('+')[2] if '+' in ct_sub_type else None

    for mime_type in mime_types:
        media_type, slash, sub_type = mime_type.partition('/')
        if not slash or media_type != ct_media_type:
            continue
        if sub_type == '*':
            return True
        if ct_suffix is not None and '+' in sub_type:
            left, _, right = sub_type.rpartition('+')
            if left == '*' and right == ct_suffix:
                return True
    return False


def gzip(compress_level=5, mime_types=['text/html', 'text/plain'],
         debug=False):
    """Try to gzip the response body if Content-Type in mime_types.

    cherrypy.response.headers['Content-Type'] must be set to one of the
    values in the mime_types arg before calling this function.

    The provided list of mime-types must be of one of the following form:
        * `type/subtype`
        * `type/*`
        * `type/*+subtype`

    No compression is performed if any of the following hold:
        * The client sends no Accept-Encoding request header
        * No 'gzip' or 'x-gzip' is present in the Accept-Encoding header
        * No 'gzip' or 'x-gzip' with a qvalue > 0 is present
        * The 'identity' value is given with a qvalue > 0.

    compress_level
        Compression level handed to compress().

    mime_types
        Content types eligible for compression. The default list is only
        read, never modified.

    debug
        When true, each decision is logged under the 'TOOLS.GZIP' context.

    When Accept-Encoding is present but offers neither a usable gzip
    coding nor identity, the response is turned into a 406 error.

    """
    request = cherrypy.serving.request
    response = cherrypy.serving.response

    # The outcome depends on Accept-Encoding, so always advertise it.
    set_vary_header(response, 'Accept-Encoding')

    if not response.body:
        # Response body is empty (might be a 304 for instance)
        if debug:
            cherrypy.log('No response body', context='TOOLS.GZIP')
        return

    # If returning cached content (which should already have been gzipped),
    # don't re-zip.
    if getattr(request, 'cached', False):
        if debug:
            cherrypy.log('Not gzipping cached response', context='TOOLS.GZIP')
        return

    acceptable = request.headers.elements('Accept-Encoding')
    if not acceptable:
        # If no Accept-Encoding field is present in a request,
        # the server MAY assume that the client will accept any
        # content coding. In this case, if "identity" is one of
        # the available content-codings, then the server SHOULD use
        # the "identity" content-coding, unless it has additional
        # information that a different content-coding is meaningful
        # to the client.
        if debug:
            cherrypy.log('No Accept-Encoding', context='TOOLS.GZIP')
        return

    # Drop any parameters, plus the optional whitespace that may precede
    # the ';' separator, so 'text/html ; charset=x' still matches.
    ct = response.headers.get('Content-Type', '').split(';')[0].strip()
    for coding in acceptable:
        if coding.value == 'identity' and coding.qvalue != 0:
            if debug:
                cherrypy.log('Non-zero identity qvalue: %s' % coding,
                             context='TOOLS.GZIP')
            return
        if coding.value in ('gzip', 'x-gzip'):
            if coding.qvalue == 0:
                if debug:
                    cherrypy.log('Zero gzip qvalue: %s' % coding,
                                 context='TOOLS.GZIP')
                return

            # mime_types may contain tokens such as 'text/*' or
            # 'application/*+xml'; the helper handles exact entries as
            # well as those patterns.
            if not _gzip_mime_type_allowed(ct, mime_types):
                if debug:
                    cherrypy.log('Content-Type %s not in mime_types %r' %
                                 (ct, mime_types), context='TOOLS.GZIP')
                return

            if debug:
                cherrypy.log('Gzipping', context='TOOLS.GZIP')
            # Return a generator that compresses the page
            response.headers['Content-Encoding'] = 'gzip'
            response.body = compress(response.body, compress_level)
            if 'Content-Length' in response.headers:
                # Delete Content-Length header so finalize() recalcs it.
                del response.headers['Content-Length']

            return

    if debug:
        cherrypy.log('No acceptable encoding found.', context='TOOLS.GZIP')
    cherrypy.HTTPError(406, 'identity, gzip').set_response()
455