1 /*
2  * Copyright (c) 2012-2018, Steeve Morin, Jonathan Underwood
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  *
8  * 1. Redistributions of source code must retain the above copyright notice,
9  *    this list of conditions and the following disclaimer.
10  *
11  * 2. Redistributions in binary form must reproduce the above copyright notice,
12  *    this list of conditions and the following disclaimer in the documentation
13  *    and/or other materials provided with the distribution.
14  *
15  * 3. Neither the name of Steeve Morin nor the names of its contributors may be
16  *    used to endorse or promote products derived from this software without
17  *    specific prior written permission.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
23  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
/*
 * Portability shim for the `inline` keyword: MSVC gets its `__inline`
 * spelling, while the Sun, HP-UX and AIX compilers get it defined away.
 * The macro is removed again further down, once the helpers are defined.
 */
#if defined(_WIN32) && defined(_MSC_VER)
#  define inline __inline
#elif defined(__SUNPRO_C) || defined(__hpux) || defined(_AIX)
#  define inline
#endif
37 
38 #include <py3c.h>
39 #include <py3c/capsulethunk.h>
40 
41 #include <stdlib.h>
42 #include <math.h>
43 #include <lz4.h>
44 #include <lz4hc.h>
45 
/*
 * Fallback definition of Py_UNUSED for Python headers that lack it (it is
 * already defined for Python 3.4 onwards).  The parameter is renamed to
 * `_unused_<name>` and, under GCC-compatible compilers, tagged with the
 * `unused` attribute.
 */
#if !defined(Py_UNUSED)
#  if defined(__GNUC__)
#    define Py_UNUSED(name) _unused_ ## name __attribute__((unused))
#  else
#    define Py_UNUSED(name) _unused_ ## name
#  endif
#endif
53 
/*
 * Fixed-width integer types.
 *
 * MSVC releases older than Visual Studio 2010 (_MSC_VER < 1600) are given
 * hand-written typedefs instead of <stdint.h>.  Every other compiler
 * includes <stdint.h> explicitly, so that uint8_t/uint32_t used below do
 * not depend on another header happening to pull it in.
 */
#if defined(_WIN32) && defined(_MSC_VER) && (_MSC_VER < 1600)
typedef signed char int8_t;
typedef signed short int16_t;
typedef signed int int32_t;
typedef unsigned char uint8_t;
typedef unsigned short uint16_t;
typedef unsigned int uint32_t;
#else
#include <stdint.h>
#endif
66 
/*
 * Write the 32-bit value x into the four bytes starting at c, least
 * significant byte first (little-endian), regardless of host byte order.
 */
static inline void
store_le32 (char *c, uint32_t x)
{
  int byte;

  for (byte = 0; byte < 4; byte++)
    {
      c[byte] = (x >> (8 * byte)) & 0xff;
    }
}
75 
/*
 * Read four bytes starting at c as a little-endian unsigned 32-bit value.
 *
 * Each byte is widened to uint32_t before it is shifted: shifting the
 * int-promoted byte left by 24 would move a set top bit into the sign bit
 * of a signed int, which is undefined behavior.
 */
static inline uint32_t
load_le32 (const char *c)
{
  const uint8_t *d = (const uint8_t *) c;

  return (uint32_t) d[0]
    | ((uint32_t) d[1] << 8)
    | ((uint32_t) d[2] << 16)
    | ((uint32_t) d[3] << 24);
}
82 
/* Size in bytes of the optional header holding the uncompressed length,
   as written by store_le32 and read back by load_le32. */
static const size_t hdr_size = sizeof (uint32_t);
84 
/* Compression strategies selectable through the `mode` argument of
   compress(). */
typedef enum
{
  DEFAULT = 0,          /* mode "default" */
  FAST = 1,             /* mode "fast" */
  HIGH_COMPRESSION = 2  /* mode "high_compression" */
} compression_type;
91 
92 static PyObject * LZ4BlockError;
93 
94 static inline int
lz4_compress_generic(int comp,char * source,char * dest,int source_size,int dest_size,char * dict,int dict_size,int acceleration,int compression)95 lz4_compress_generic (int comp, char* source, char* dest, int source_size, int dest_size,
96                       char* dict, int dict_size, int acceleration, int compression)
97 {
98   if (comp != HIGH_COMPRESSION)
99     {
100       LZ4_stream_t lz4_state;
101       LZ4_resetStream (&lz4_state);
102       if (dict)
103         {
104           LZ4_loadDict (&lz4_state, dict, dict_size);
105         }
106       if (comp != FAST)
107         {
108           acceleration = 1;
109         }
110       return LZ4_compress_fast_continue (&lz4_state, source, dest, source_size, dest_size, acceleration);
111     }
112   else
113     {
114       LZ4_streamHC_t lz4_state;
115       LZ4_resetStreamHC (&lz4_state, compression);
116       if (dict)
117         {
118           LZ4_loadDictHC (&lz4_state, dict, dict_size);
119         }
120       return LZ4_compress_HC_continue (&lz4_state, source, dest, source_size, dest_size);
121     }
122 }
123 
/* Drop any `inline` macro defined at the top of the file; #undef on a name
   that is not currently a macro is a no-op, so no guard is needed. */
#undef inline
127 
128 static PyObject *
compress(PyObject * Py_UNUSED (self),PyObject * args,PyObject * kwargs)129 compress (PyObject * Py_UNUSED (self), PyObject * args, PyObject * kwargs)
130 {
131   const char *mode = "default";
132   size_t dest_size, total_size;
133   int acceleration = 1;
134   int compression = 9;
135   int store_size = 1;
136   PyObject *py_dest;
137   char *dest, *dest_start;
138   compression_type comp;
139   int output_size;
140   Py_buffer source;
141   int source_size;
142   int return_bytearray = 0;
143   Py_buffer dict = {0};
144   static char *argnames[] = {
145     "source",
146     "mode",
147     "store_size",
148     "acceleration",
149     "compression",
150     "return_bytearray",
151     "dict",
152     NULL
153   };
154 
155 
156 #if IS_PY3
157   if (!PyArg_ParseTupleAndKeywords (args, kwargs, "y*|spiipz*", argnames,
158                                     &source,
159                                     &mode, &store_size, &acceleration, &compression,
160                                     &return_bytearray, &dict))
161     {
162       return NULL;
163     }
164 #else
165   if (!PyArg_ParseTupleAndKeywords (args, kwargs, "s*|siiiiz*", argnames,
166                                     &source,
167                                     &mode, &store_size, &acceleration, &compression,
168                                     &return_bytearray, &dict))
169     {
170       return NULL;
171     }
172 #endif
173 
174   if (source.len > INT_MAX)
175     {
176       PyBuffer_Release(&source);
177       PyBuffer_Release(&dict);
178       PyErr_Format(PyExc_OverflowError,
179                    "Input too large for LZ4 API");
180       return NULL;
181     }
182 
183   if (dict.len > INT_MAX)
184     {
185       PyBuffer_Release(&source);
186       PyBuffer_Release(&dict);
187       PyErr_Format(PyExc_OverflowError,
188                    "Dictionary too large for LZ4 API");
189       return NULL;
190     }
191 
192   source_size = (int) source.len;
193 
194   if (!strncmp (mode, "default", sizeof ("default")))
195     {
196       comp = DEFAULT;
197     }
198   else if (!strncmp (mode, "fast", sizeof ("fast")))
199     {
200       comp = FAST;
201     }
202   else if (!strncmp (mode, "high_compression", sizeof ("high_compression")))
203     {
204       comp = HIGH_COMPRESSION;
205     }
206   else
207     {
208       PyBuffer_Release(&source);
209       PyBuffer_Release(&dict);
210       PyErr_Format (PyExc_ValueError,
211                     "Invalid mode argument: %s. Must be one of: standard, fast, high_compression",
212                     mode);
213       return NULL;
214     }
215 
216   dest_size = LZ4_compressBound (source_size);
217 
218   if (store_size)
219     {
220       total_size = dest_size + hdr_size;
221     }
222   else
223     {
224       total_size = dest_size;
225     }
226 
227   dest = PyMem_Malloc (total_size * sizeof * dest);
228   if (dest == NULL)
229     {
230       return PyErr_NoMemory();
231     }
232 
233   Py_BEGIN_ALLOW_THREADS
234 
235   if (store_size)
236     {
237       store_le32 (dest, source_size);
238       dest_start = dest + hdr_size;
239     }
240   else
241     {
242       dest_start = dest;
243     }
244 
245   output_size = lz4_compress_generic (comp, source.buf, dest_start, source_size,
246                                       (int) dest_size, dict.buf, (int) dict.len,
247                                       acceleration, compression);
248 
249   Py_END_ALLOW_THREADS
250 
251   PyBuffer_Release(&source);
252   PyBuffer_Release(&dict);
253 
254   if (output_size <= 0)
255     {
256       PyErr_SetString (LZ4BlockError, "Compression failed");
257       PyMem_Free (dest);
258       return NULL;
259     }
260 
261   if (store_size)
262     {
263       output_size += (int) hdr_size;
264     }
265 
266   if (return_bytearray)
267     {
268       py_dest = PyByteArray_FromStringAndSize (dest, (Py_ssize_t) output_size);
269     }
270   else
271     {
272       py_dest = PyBytes_FromStringAndSize (dest, (Py_ssize_t) output_size);
273     }
274 
275   PyMem_Free (dest);
276 
277   if (py_dest == NULL)
278     {
279       return PyErr_NoMemory ();
280     }
281 
282   return py_dest;
283 }
284 
285 static PyObject *
decompress(PyObject * Py_UNUSED (self),PyObject * args,PyObject * kwargs)286 decompress (PyObject * Py_UNUSED (self), PyObject * args, PyObject * kwargs)
287 {
288   Py_buffer source;
289   const char * source_start;
290   size_t source_size;
291   PyObject *py_dest;
292   char *dest;
293   int output_size;
294   size_t dest_size;
295   int uncompressed_size = -1;
296   int return_bytearray = 0;
297   Py_buffer dict = {0};
298   static char *argnames[] = {
299     "source",
300     "uncompressed_size",
301     "return_bytearray",
302     "dict",
303     NULL
304   };
305 
306 #if IS_PY3
307   if (!PyArg_ParseTupleAndKeywords (args, kwargs, "y*|ipz*", argnames,
308                                     &source, &uncompressed_size,
309                                     &return_bytearray, &dict))
310     {
311       return NULL;
312     }
313 #else
314   if (!PyArg_ParseTupleAndKeywords (args, kwargs, "s*|iiz*", argnames,
315                                     &source, &uncompressed_size,
316                                     &return_bytearray, &dict))
317     {
318       return NULL;
319     }
320 #endif
321 
322   if (source.len > INT_MAX)
323     {
324       PyBuffer_Release(&source);
325       PyBuffer_Release(&dict);
326       PyErr_Format(PyExc_OverflowError,
327                    "Input too large for LZ4 API");
328       return NULL;
329     }
330 
331   if (dict.len > INT_MAX)
332     {
333       PyBuffer_Release(&source);
334       PyBuffer_Release(&dict);
335       PyErr_Format(PyExc_OverflowError,
336                    "Dictionary too large for LZ4 API");
337       return NULL;
338     }
339 
340   source_start = (const char *) source.buf;
341   source_size = (int) source.len;
342 
343   if (uncompressed_size >= 0)
344     {
345       dest_size = uncompressed_size;
346     }
347   else
348     {
349       if (source_size < hdr_size)
350         {
351           PyBuffer_Release(&source);
352           PyBuffer_Release(&dict);
353           PyErr_SetString (PyExc_ValueError, "Input source data size too small");
354           return NULL;
355         }
356       dest_size = load_le32 (source_start);
357       source_start += hdr_size;
358       source_size -= hdr_size;
359     }
360 
361   if (dest_size > INT_MAX)
362     {
363       PyBuffer_Release(&source);
364       PyBuffer_Release(&dict);
365       PyErr_Format (PyExc_ValueError, "Invalid size: 0x%zu",
366                     dest_size);
367       return NULL;
368     }
369 
370   dest = PyMem_Malloc (dest_size * sizeof * dest);
371   if (dest == NULL)
372     {
373       return PyErr_NoMemory();
374     }
375 
376   Py_BEGIN_ALLOW_THREADS
377 
378   output_size =
379     LZ4_decompress_safe_usingDict (source_start, dest, source_size, (int) dest_size,
380                                    dict.buf, (int) dict.len);
381 
382   Py_END_ALLOW_THREADS
383 
384   PyBuffer_Release(&source);
385   PyBuffer_Release(&dict);
386 
387   if (output_size < 0)
388     {
389       PyErr_Format (LZ4BlockError,
390                     "Decompression failed: corrupt input or insufficient space in destination buffer. Error code: %u",
391                     -output_size);
392       PyMem_Free (dest);
393       return NULL;
394     }
395   else if (((size_t)output_size != dest_size) && (uncompressed_size < 0))
396     {
397       PyErr_Format (LZ4BlockError,
398                     "Decompressor wrote %u bytes, but %zu bytes expected from header",
399                     output_size, dest_size);
400       PyMem_Free (dest);
401       return NULL;
402     }
403 
404   if (return_bytearray)
405     {
406       py_dest = PyByteArray_FromStringAndSize (dest, (Py_ssize_t) output_size);
407     }
408   else
409     {
410       py_dest = PyBytes_FromStringAndSize (dest, (Py_ssize_t) output_size);
411     }
412 
413   PyMem_Free (dest);
414 
415   if (py_dest == NULL)
416     {
417       return PyErr_NoMemory ();
418     }
419 
420   return py_dest;
421 }
422 
423 PyDoc_STRVAR(compress__doc,
424              "compress(source, mode='default', acceleration=1, compression=0, return_bytearray=False)\n\n" \
425              "Compress source, returning the compressed data as a string.\n" \
426              "Raises an exception if any error occurs.\n"               \
427              "\n"                                                       \
428              "Args:\n"                                                  \
429              "    source (str, bytes or buffer-compatible object): Data to compress\n" \
430              "\n"                                                       \
431              "Keyword Args:\n"                                          \
432              "    mode (str): If ``'default'`` or unspecified use the default LZ4\n" \
433              "        compression mode. Set to ``'fast'`` to use the fast compression\n" \
434              "        LZ4 mode at the expense of compression. Set to\n" \
435              "        ``'high_compression'`` to use the LZ4 high-compression mode at\n" \
436              "        the exepense of speed.\n"                         \
437              "    acceleration (int): When mode is set to ``'fast'`` this argument\n" \
438              "        specifies the acceleration. The larger the acceleration, the\n" \
439              "        faster the but the lower the compression. The default\n" \
440              "        compression corresponds to a value of ``1``.\n"       \
441              "    compression (int): When mode is set to ``high_compression`` this\n" \
442              "        argument specifies the compression. Valid values are between\n" \
443              "        ``1`` and ``12``. Values between ``4-9`` are recommended, and\n" \
444              "        ``9`` is the default.\n"
445              "    store_size (bool): If ``True`` (the default) then the size of the\n" \
446              "        uncompressed data is stored at the start of the compressed\n" \
447              "        block.\n"                                         \
448              "    return_bytearray (bool): If ``False`` (the default) then the function\n" \
449              "        will return a bytes object. If ``True``, then the function will\n" \
450              "        return a bytearray object.\n\n"                   \
451              "    dict (str, bytes or buffer-compatible object): If specified, perform\n" \
452              "        compression using this initial dictionary.\n"     \
453              "Returns:\n"                                               \
454              "    bytes or bytearray: Compressed data.\n");
455 
456 PyDoc_STRVAR(decompress__doc,
457              "decompress(source, uncompressed_size=-1, return_bytearray=False)\n\n" \
458              "Decompress source, returning the uncompressed data as a string.\n" \
459              "Raises an exception if any error occurs.\n"               \
460              "\n"                                                       \
461              "Args:\n"                                                  \
462              "    source (str, bytes or buffer-compatible object): Data to decompress.\n" \
463              "\n"                                                       \
464              "Keyword Args:\n"                                          \
465              "    uncompressed_size (int): If not specified or negative, the uncompressed\n" \
466              "        data size is read from the start of the source block. If specified,\n" \
467              "        it is assumed that the full source data is compressed data. If this\n" \
468              "        argument is specified, it is considered to be a maximum possible size\n" \
469              "        for the buffer used to hold the uncompressed data, and so less data\n" \
470              "        may be returned. If `uncompressed_size` is too small, `LZ4BlockError`\n" \
471              "        will be raised. By catching `LZ4BlockError` it is possible to increase\n" \
472              "        `uncompressed_size` and try again.\n"             \
473              "    return_bytearray (bool): If ``False`` (the default) then the function\n" \
474              "        will return a bytes object. If ``True``, then the function will\n" \
475              "        return a bytearray object.\n\n" \
476              "    dict (str, bytes or buffer-compatible object): If specified, perform\n" \
477              "        decompression using this initial dictionary.\n"   \
478              "\n"                                                       \
479              "Returns:\n"                                               \
480              "    bytes or bytearray: Decompressed data.\n"             \
481              "\n"                                                       \
482              "Raises:\n"                                                \
483              "    LZ4BlockError: raised if the call to the LZ4 library fails. This can be\n" \
484              "        caused by `uncompressed_size` being too small, or invalid data.\n");
485 
486 PyDoc_STRVAR(lz4block__doc,
487              "A Python wrapper for the LZ4 block protocol"
488              );
489 
490 static PyMethodDef module_methods[] = {
491   {
492     "compress",
493     (PyCFunction) compress,
494     METH_VARARGS | METH_KEYWORDS,
495     compress__doc
496   },
497   {
498     "decompress",
499     (PyCFunction) decompress,
500     METH_VARARGS | METH_KEYWORDS,
501     decompress__doc
502   },
503   {
504     /* Sentinel */
505     NULL,
506     NULL,
507     0,
508     NULL
509   }
510 };
511 
512 static struct PyModuleDef moduledef =
513 {
514   PyModuleDef_HEAD_INIT,
515   "_block",
516   lz4block__doc,
517   -1,
518   module_methods
519 };
520 
MODULE_INIT_FUNC(_block)521 MODULE_INIT_FUNC (_block)
522 {
523   PyObject *module = PyModule_Create (&moduledef);
524 
525   if (module == NULL)
526     return NULL;
527 
528   PyModule_AddIntConstant (module, "HC_LEVEL_MIN", LZ4HC_CLEVEL_MIN);
529   PyModule_AddIntConstant (module, "HC_LEVEL_DEFAULT", LZ4HC_CLEVEL_DEFAULT);
530   PyModule_AddIntConstant (module, "HC_LEVEL_OPT_MIN", LZ4HC_CLEVEL_OPT_MIN);
531   PyModule_AddIntConstant (module, "HC_LEVEL_MAX", LZ4HC_CLEVEL_MAX);
532 
533   LZ4BlockError = PyErr_NewExceptionWithDoc("_block.LZ4BlockError", "Call to LZ4 library failed.", NULL, NULL);
534   if (LZ4BlockError == NULL)
535     {
536       return NULL;
537     }
538   Py_INCREF(LZ4BlockError);
539   PyModule_AddObject(module, "LZ4BlockError", LZ4BlockError);
540 
541   return module;
542 }
543