1 /*-
2  * Public Domain 2014-2018 MongoDB, Inc.
3  * Public Domain 2008-2014 WiredTiger, Inc.
4  *
5  * This is free and unencumbered software released into the public domain.
6  *
7  * Anyone is free to copy, modify, publish, use, compile, sell, or
8  * distribute this software, either in source code form or as a compiled
9  * binary, for any purpose, commercial or non-commercial, and by any
10  * means.
11  *
12  * In jurisdictions that recognize copyright laws, the author or authors
13  * of this software dedicate any and all copyright interest in the
14  * software to the public domain. We make this dedication for the benefit
15  * of the public at large and to the detriment of our heirs and
16  * successors. We intend this dedication to be an overt act of
17  * relinquishment in perpetuity of all present and future rights to this
18  * software under copyright law.
19  *
20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
23  * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
24  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
25  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
26  * OTHER DEALINGS IN THE SOFTWARE.
27  */
28 
29 #include <lz4.h>
30 #include <errno.h>
31 #include <stdlib.h>
32 #include <string.h>
33 
34 /*
35  * We need to include the configuration file to detect whether this extension
36  * is being built into the WiredTiger library; application-loaded compression
37  * functions won't need it.
38  */
39 #include <wiredtiger_config.h>
40 
41 #include <wiredtiger.h>
42 #include <wiredtiger_ext.h>
43 
44 #ifdef _MSC_VER
45 #define	inline	__inline
46 #endif
47 
48 /* Local compressor structure. */
49 typedef struct {
50 	WT_COMPRESSOR compressor;		/* Must come first */
51 
52 	WT_EXTENSION_API *wt_api;		/* Extension API */
53 } LZ4_COMPRESSOR;
54 
/*
 * LZ4_PREFIX --
 *	Fixed-size header written at the start of every compressed block.
 *
 * LZ4 decompression needs the exact compressed byte count returned by
 * LZ4_compress_default and LZ4_compress_destSize. WiredTiger doesn't track
 * that value, so it is stored in the destination buffer.
 *
 * LZ4_compress_destSize may also stop compressing in the middle of a record.
 * After decompression the value returned to the caller is the length up to the
 * last record that was completely compressed, not the number of bytes that
 * were decompressed, so that length is stored in the destination buffer too.
 *
 * All fields are fixed-size, 4B values (WiredTiger never writes buffers larger
 * than 4GB).
 */
typedef struct {
	/* True compressed length. */
	uint32_t compressed_len;

	/* True uncompressed source length. */
	uint32_t uncompressed_len;

	/* Decompression return value. */
	uint32_t useful_len;

	/*
	 * Guaranteed to be 0; available for a mode flag if one is needed in
	 * the future.
	 */
	uint32_t unused;
} LZ4_PREFIX;
76 
77 #ifdef WORDS_BIGENDIAN
/*
 * lz4_bswap32 --
 *	Reverse the byte order of a 32-bit unsigned value, converting between
 * little-endian and big-endian representations.
 */
static inline uint32_t
lz4_bswap32(uint32_t v)
{
	uint32_t swapped;

	/* Move each byte to its mirrored position, lowest result byte first. */
	swapped = (v >> 24) & 0x000000ff;
	swapped |= (v >> 8) & 0x0000ff00;
	swapped |= (v << 8) & 0x00ff0000;
	swapped |= (v << 24) & 0xff000000;

	return (swapped);
}
92 
93 /*
94  * lz4_prefix_swap --
95  *	The additional information is written in little-endian format, handle
96  * the conversion.
97  */
98 static inline void
lz4_prefix_swap(LZ4_PREFIX * prefix)99 lz4_prefix_swap(LZ4_PREFIX *prefix)
100 {
101 	prefix->compressed_len = lz4_bswap32(prefix->compressed_len);
102 	prefix->uncompressed_len = lz4_bswap32(prefix->uncompressed_len);
103 	prefix->useful_len = lz4_bswap32(prefix->useful_len);
104 	prefix->unused = lz4_bswap32(prefix->unused);
105 }
106 #endif
107 
108 /*
109  * lz4_error --
110  *	Output an error message, and return a standard error code.
111  */
112 static int
lz4_error(WT_COMPRESSOR * compressor,WT_SESSION * session,const char * call,int error)113 lz4_error(
114     WT_COMPRESSOR *compressor, WT_SESSION *session, const char *call, int error)
115 {
116 	WT_EXTENSION_API *wt_api;
117 
118 	wt_api = ((LZ4_COMPRESSOR *)compressor)->wt_api;
119 
120 	(void)wt_api->err_printf(wt_api,
121 	    session, "lz4 error: %s: %d", call, error);
122 	return (WT_ERROR);
123 }
124 
125 /*
126  *  lz4_compress --
127  *	WiredTiger LZ4 compression.
128  */
129 static int
lz4_compress(WT_COMPRESSOR * compressor,WT_SESSION * session,uint8_t * src,size_t src_len,uint8_t * dst,size_t dst_len,size_t * result_lenp,int * compression_failed)130 lz4_compress(WT_COMPRESSOR *compressor, WT_SESSION *session,
131     uint8_t *src, size_t src_len,
132     uint8_t *dst, size_t dst_len,
133     size_t *result_lenp, int *compression_failed)
134 {
135 	LZ4_PREFIX prefix;
136 	int lz4_len;
137 
138 	(void)compressor;				/* Unused parameters */
139 	(void)session;
140 
141 	/* Compress, starting after the prefix bytes. */
142 	lz4_len = LZ4_compress_default((const char *)src,
143 	    (char *)dst + sizeof(LZ4_PREFIX), (int)src_len, (int)dst_len);
144 
145 	/*
146 	 * If compression succeeded and the compressed length is smaller than
147 	 * the original size, return success.
148 	 */
149 	if (lz4_len != 0 && (size_t)lz4_len + sizeof(LZ4_PREFIX) < src_len) {
150 		prefix.compressed_len = (uint32_t)lz4_len;
151 		prefix.uncompressed_len = (uint32_t)src_len;
152 		prefix.useful_len = (uint32_t)src_len;
153 		prefix.unused = 0;
154 #ifdef WORDS_BIGENDIAN
155 		lz4_prefix_swap(&prefix);
156 #endif
157 		memcpy(dst, &prefix, sizeof(LZ4_PREFIX));
158 
159 		*result_lenp = (size_t)lz4_len + sizeof(LZ4_PREFIX);
160 		*compression_failed = 0;
161 		return (0);
162 	}
163 
164 	*compression_failed = 1;
165 	return (0);
166 }
167 
168 /*
169  * lz4_decompress --
170  *	WiredTiger LZ4 decompression.
171  */
172 static int
lz4_decompress(WT_COMPRESSOR * compressor,WT_SESSION * session,uint8_t * src,size_t src_len,uint8_t * dst,size_t dst_len,size_t * result_lenp)173 lz4_decompress(WT_COMPRESSOR *compressor, WT_SESSION *session,
174     uint8_t *src, size_t src_len,
175     uint8_t *dst, size_t dst_len,
176     size_t *result_lenp)
177 {
178 	WT_EXTENSION_API *wt_api;
179 	LZ4_PREFIX prefix;
180 	int decoded;
181 	uint8_t *dst_tmp;
182 
183 	wt_api = ((LZ4_COMPRESSOR *)compressor)->wt_api;
184 
185 	/*
186 	 * Retrieve the true length of the compressed block and source and the
187 	 * decompressed bytes to return from the start of the source buffer.
188 	 */
189 	memcpy(&prefix, src, sizeof(LZ4_PREFIX));
190 #ifdef WORDS_BIGENDIAN
191 	lz4_prefix_swap(&prefix);
192 #endif
193 	if (prefix.compressed_len + sizeof(LZ4_PREFIX) > src_len) {
194 		(void)wt_api->err_printf(wt_api,
195 		    session,
196 		    "WT_COMPRESSOR.decompress: stored size exceeds source "
197 		    "size");
198 		return (WT_ERROR);
199 	}
200 
201 	/*
202 	 * Decompress, starting after the prefix bytes. Use safe decompression:
203 	 * we rely on decompression to detect corruption.
204 	 *
205 	 * Two code paths, one with and one without a bounce buffer. When doing
206 	 * raw compression, we compress to a target size irrespective of row
207 	 * boundaries, and return to our caller a "useful" compression length
208 	 * based on the last complete row that was compressed. Our caller stores
209 	 * that length, not the length of bytes actually compressed by LZ4. In
210 	 * other words, our caller doesn't know how many bytes will result from
211 	 * decompression, likely hasn't provided us a large enough buffer, and
212 	 * we have to allocate a scratch buffer.
213 	 */
214 	if (dst_len < prefix.uncompressed_len) {
215 		if ((dst_tmp = wt_api->scr_alloc(
216 		    wt_api, session, (size_t)prefix.uncompressed_len)) == NULL)
217 			return (ENOMEM);
218 
219 		decoded = LZ4_decompress_safe(
220 		    (const char *)src + sizeof(LZ4_PREFIX), (char *)dst_tmp,
221 		    (int)prefix.compressed_len, (int)prefix.uncompressed_len);
222 
223 		if (decoded >= 0)
224 			memcpy(dst, dst_tmp, dst_len);
225 		wt_api->scr_free(wt_api, session, dst_tmp);
226 	} else
227 		decoded = LZ4_decompress_safe(
228 		    (const char *)src + sizeof(LZ4_PREFIX),
229 		    (char *)dst, (int)prefix.compressed_len, (int)dst_len);
230 
231 	if (decoded >= 0) {
232 		*result_lenp = prefix.useful_len;
233 		return (0);
234 	}
235 
236 	return (
237 	    lz4_error(compressor, session, "LZ4 decompress error", decoded));
238 }
239 
/*
 * lz4_find_slot --
 *	Find the slot containing the target offset (binary search).
 *
 * The offsets array has slots + 1 entries: offsets[i] is the starting byte of
 * slot i and offsets[slots] is the total length. The return value is the
 * largest index whose offset is less than or equal to the target, that is, the
 * number of slots that lie completely within the first target_arg bytes. A
 * target at or beyond offsets[slots] returns slots.
 */
static inline uint32_t
lz4_find_slot(int target_arg, uint32_t *offsets, uint32_t slots)
{
	uint32_t base, indx, limit, target;

	target = (uint32_t)target_arg;			/* Type conversion */

	/* Fast check if we consumed it all, it's a likely result. */
	if (target >= offsets[slots])
		return (slots);

	/*
	 * Figure out which slot we got to: binary search. Note the test of
	 * offset (slot + 1), that's (end-byte + 1) for slot: if the target
	 * reaches it, the probed slot is complete and the answer lies above.
	 */
	for (base = 0, limit = slots; limit != 0; limit >>= 1) {
		indx = base + (limit >> 1);
		if (target >= offsets[indx + 1]) {
			base = indx + 1;
			--limit;
		}
	}

	/*
	 * The answer is the search base, not the last slot probed: the last
	 * probe may have been a slot found to be complete.
	 */
	return (base);
}
271 
272 /*
273  * lz4_compress_raw --
274  *	Pack records into a specified on-disk page size.
275  */
276 static int
lz4_compress_raw(WT_COMPRESSOR * compressor,WT_SESSION * session,size_t page_max,int split_pct,size_t extra,uint8_t * src,uint32_t * offsets,uint32_t slots,uint8_t * dst,size_t dst_len,int final,size_t * result_lenp,uint32_t * result_slotsp)277 lz4_compress_raw(WT_COMPRESSOR *compressor, WT_SESSION *session,
278     size_t page_max, int split_pct, size_t extra,
279     uint8_t *src, uint32_t *offsets, uint32_t slots,
280     uint8_t *dst, size_t dst_len, int final,
281     size_t *result_lenp, uint32_t *result_slotsp)
282 {
283 	LZ4_PREFIX prefix;
284 	uint32_t slot;
285 	int lz4_len, sourceSize, targetDestSize;
286 
287 	(void)compressor;				/* Unused parameters */
288 	(void)session;
289 	(void)split_pct;
290 	(void)final;
291 
292 	/*
293 	 * Set the source and target sizes. The target size is complicated: we
294 	 * don't want to exceed the smaller of the maximum page size or the
295 	 * destination buffer length, and in both cases we have to take into
296 	 * account the space for our overhead and the extra bytes required by
297 	 * our caller.
298 	 */
299 	sourceSize = (int)offsets[slots];
300 	targetDestSize = (int)(page_max < dst_len ? page_max : dst_len);
301 	targetDestSize -= (int)(sizeof(LZ4_PREFIX) + extra);
302 
303 	/* Compress, starting after the prefix bytes. */
304 	lz4_len = LZ4_compress_destSize((const char *)src,
305 	    (char *)dst + sizeof(LZ4_PREFIX), &sourceSize, targetDestSize);
306 
307 	/*
308 	 * If compression succeeded and the compressed length is smaller than
309 	 * the original size, return success.
310 	 */
311 	if (lz4_len != 0) {
312 		/* Find the first slot we didn't compress. */
313 		slot = lz4_find_slot(sourceSize, offsets, slots);
314 
315 		if ((size_t)lz4_len + sizeof(LZ4_PREFIX) < offsets[slot]) {
316 			prefix.compressed_len = (uint32_t)lz4_len;
317 			prefix.uncompressed_len = (uint32_t)sourceSize;
318 			prefix.useful_len = offsets[slot];
319 			prefix.unused = 0;
320 #ifdef WORDS_BIGENDIAN
321 			lz4_prefix_swap(&prefix);
322 #endif
323 			memcpy(dst, &prefix, sizeof(LZ4_PREFIX));
324 
325 			*result_slotsp = slot;
326 			*result_lenp = (size_t)lz4_len + sizeof(LZ4_PREFIX);
327 			return (0);
328 		}
329 	}
330 
331 	*result_slotsp = 0;
332 	*result_lenp = 1;
333 	return (0);
334 }
335 
336 /*
337  * lz4_pre_size --
338  *	WiredTiger LZ4 destination buffer sizing for compression.
339  */
340 static int
lz4_pre_size(WT_COMPRESSOR * compressor,WT_SESSION * session,uint8_t * src,size_t src_len,size_t * result_lenp)341 lz4_pre_size(WT_COMPRESSOR *compressor, WT_SESSION *session,
342     uint8_t *src, size_t src_len, size_t *result_lenp)
343 {
344 	(void)compressor;				/* Unused parameters */
345 	(void)session;
346 	(void)src;
347 
348 	/*
349 	 * In block mode, LZ4 can use more space than the input data size, use
350 	 * the library calculation of that overhead (plus our overhead) to be
351 	 * safe.
352 	 */
353 	*result_lenp = LZ4_COMPRESSBOUND(src_len) + sizeof(LZ4_PREFIX);
354 	return (0);
355 }
356 
357 /*
358  * lz4_terminate --
359  *	WiredTiger LZ4 compression termination.
360  */
361 static int
lz4_terminate(WT_COMPRESSOR * compressor,WT_SESSION * session)362 lz4_terminate(WT_COMPRESSOR *compressor, WT_SESSION *session)
363 {
364 	(void)session;					/* Unused parameters */
365 
366 	free(compressor);
367 	return (0);
368 }
369 
370 /*
371  * lz4_add_compressor --
372  *	Add a LZ4 compressor.
373  */
374 static int
lz_add_compressor(WT_CONNECTION * connection,bool raw,const char * name)375 lz_add_compressor(WT_CONNECTION *connection, bool raw, const char *name)
376 {
377 	LZ4_COMPRESSOR *lz4_compressor;
378 	int ret;
379 
380 	/*
381 	 * There are two almost identical LZ4 compressors: one using raw
382 	 * compression to target a specific block size, and one without.
383 	 */
384 	if ((lz4_compressor = calloc(1, sizeof(LZ4_COMPRESSOR))) == NULL)
385 		return (errno);
386 
387 	lz4_compressor->compressor.compress = lz4_compress;
388 	lz4_compressor->compressor.compress_raw = raw ? lz4_compress_raw : NULL;
389 	lz4_compressor->compressor.decompress = lz4_decompress;
390 	lz4_compressor->compressor.pre_size = lz4_pre_size;
391 	lz4_compressor->compressor.terminate = lz4_terminate;
392 
393 	lz4_compressor->wt_api = connection->get_extension_api(connection);
394 
395 	/* Load the compressor */
396 	if ((ret = connection->add_compressor(
397 	    connection, name, (WT_COMPRESSOR *)lz4_compressor, NULL)) == 0)
398 		return (0);
399 
400 	free(lz4_compressor);
401 	return (ret);
402 }
403 
404 int lz4_extension_init(WT_CONNECTION *, WT_CONFIG_ARG *);
405 
406 /*
407  * lz4_extension_init --
408  *	WiredTiger LZ4 compression extension - called directly when LZ4 support
409  * is built in, or via wiredtiger_extension_init when LZ4 support is included
410  * via extension loading.
411  */
412 int
lz4_extension_init(WT_CONNECTION * connection,WT_CONFIG_ARG * config)413 lz4_extension_init(WT_CONNECTION *connection, WT_CONFIG_ARG *config)
414 {
415 	int ret;
416 
417 	(void)config;    				/* Unused parameters */
418 
419 	if ((ret = lz_add_compressor(connection, true, "lz4")) != 0)
420 		return (ret);
421 	if ((ret = lz_add_compressor(connection, false, "lz4-noraw")) != 0)
422 		return (ret);
423 	return (0);
424 }
425 
426 /*
427  * We have to remove this symbol when building as a builtin extension otherwise
428  * it will conflict with other builtin libraries.
429  */
430 #ifndef	HAVE_BUILTIN_EXTENSION_LZ4
431 /*
432  * wiredtiger_extension_init --
433  *	WiredTiger LZ4 compression extension.
434  */
435 int
wiredtiger_extension_init(WT_CONNECTION * connection,WT_CONFIG_ARG * config)436 wiredtiger_extension_init(WT_CONNECTION *connection, WT_CONFIG_ARG *config)
437 {
438 	return (lz4_extension_init(connection, config));
439 }
440 #endif
441