/*********************************************************************
  Blosc - Blocked Shuffling and Compression Library

  Author: Francesc Alted <francesc@blosc.org>
  Creation date: 2009-05-20

  See LICENSE.txt for details about copyright and rights to use.
**********************************************************************/

/*********************************************************************
  The code in this file is heavily based on FastLZ, a lightning-fast
  lossless compression library.  See LICENSES/FASTLZ.txt for details.
**********************************************************************/


#include <stdio.h>
#include <stdbool.h>
#include "blosclz.h"
#include "fastcopy.h"
#include "blosc-common.h"


/*
 * Give hints to the compiler for branch prediction optimization.
 */
#if defined(__GNUC__) && (__GNUC__ > 2)
#define BLOSCLZ_LIKELY(c) (__builtin_expect((c), 1))
#define BLOSCLZ_UNLIKELY(c) (__builtin_expect((c), 0))
#else
#define BLOSCLZ_LIKELY(c) (c)
#define BLOSCLZ_UNLIKELY(c) (c)
#endif

/*
 * Use inlined functions for supported systems.
 */
#if defined(_MSC_VER) && !defined(__cplusplus)  /* Visual Studio */
#define inline __inline  /* Visual C is not C99, but supports some kind of inline */
#endif

#define MAX_COPY 32U
#define MAX_DISTANCE 8191
#define MAX_FARDISTANCE (65535 + MAX_DISTANCE - 1)

#ifdef BLOSC_STRICT_ALIGN
#define BLOSCLZ_READU16(p) ((p)[0] | (p)[1]<<8)
#define BLOSCLZ_READU32(p) ((p)[0] | (p)[1]<<8 | (p)[2]<<16 | (p)[3]<<24)
#else
#define BLOSCLZ_READU16(p) *((const uint16_t*)(p))
#define BLOSCLZ_READU32(p) *((const uint32_t*)(p))
#endif

#define HASH_LOG (12U)

// This is used in LZ4 and seems to work pretty well here too
#define HASH_FUNCTION(v, s, h) {      \
  v = (s * 2654435761U) >> (32U - h); \
}
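
/* Note: 2654435761 is the 32-bit Fibonacci (Knuth multiplicative) hashing
 * constant that LZ4 uses as well; the top `h` bits of the product select the
 * hash table bucket. */

/* Roughly speaking, the get_run* helpers below scan how far a run of the
 * single byte ip[-1] keeps matching, while the get_match* helpers scan how far
 * the bytes at ref keep matching the input at ip; both families return a
 * pointer to the first input byte that differs.  The variants only differ in
 * using AVX2, SSE2 or plain 64-bit loads. */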


#if defined(__AVX2__)
static uint8_t *get_run_32(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
  uint8_t x = ip[-1];
  /* safe because of the outer check against the ip limit */
  if (ip < (ip_bound - sizeof(int64_t))) {
    int64_t value, value2;
    /* Broadcast the value for every byte in a 64-bit register */
    memset(&value, x, 8);
#if defined(BLOSC_STRICT_ALIGN)
    memcpy(&value2, ref, 8);
#else
    value2 = ((int64_t*)ref)[0];
#endif
    if (value != value2) {
      /* Return the byte that starts to differ */
      while (*ref++ == x) ip++;
      return ip;
    }
    else {
      ip += 8;
      ref += 8;
    }
  }
  if (ip < (ip_bound - sizeof(__m128i))) {
    __m128i value, value2, cmp;
    /* Broadcast the value for every byte in a 128-bit register */
    memset(&value, x, sizeof(__m128i));
    value2 = _mm_loadu_si128((__m128i *) ref);
    cmp = _mm_cmpeq_epi32(value, value2);
    if (_mm_movemask_epi8(cmp) != 0xFFFF) {
      /* Return the byte that starts to differ */
      while (*ref++ == x) ip++;
      return ip;
    } else {
      ip += sizeof(__m128i);
      ref += sizeof(__m128i);
    }
  }
  while (ip < (ip_bound - (sizeof(__m256i)))) {
    __m256i value, value2, cmp;
    /* Broadcast the value for every byte in a 256-bit register */
    memset(&value, x, sizeof(__m256i));
    value2 = _mm256_loadu_si256((__m256i *)ref);
    cmp = _mm256_cmpeq_epi64(value, value2);
    if ((unsigned)_mm256_movemask_epi8(cmp) != 0xFFFFFFFF) {
      /* Return the byte that starts to differ */
      while (*ref++ == x) ip++;
      return ip;
    }
    else {
      ip += sizeof(__m256i);
      ref += sizeof(__m256i);
    }
  }
  /* Look into the remainder */
  while ((ip < ip_bound) && (*ref++ == x)) ip++;
  return ip;
}

#elif defined(__SSE2__)

static uint8_t *get_run_16(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
  uint8_t x = ip[-1];

  if (ip < (ip_bound - sizeof(int64_t))) {
    int64_t value, value2;
    /* Broadcast the value for every byte in a 64-bit register */
    memset(&value, x, 8);
#if defined(BLOSC_STRICT_ALIGN)
    memcpy(&value2, ref, 8);
#else
    value2 = ((int64_t*)ref)[0];
#endif
    if (value != value2) {
      /* Return the byte that starts to differ */
      while (*ref++ == x) ip++;
      return ip;
    }
    else {
      ip += 8;
      ref += 8;
    }
  }
  /* safe because of the outer check against the ip limit */
  while (ip < (ip_bound - sizeof(__m128i))) {
    __m128i value, value2, cmp;
    /* Broadcast the value for every byte in a 128-bit register */
    memset(&value, x, sizeof(__m128i));
    value2 = _mm_loadu_si128((__m128i *)ref);
    cmp = _mm_cmpeq_epi32(value, value2);
    if (_mm_movemask_epi8(cmp) != 0xFFFF) {
      /* Return the byte that starts to differ */
      while (*ref++ == x) ip++;
      return ip;
    }
    else {
      ip += sizeof(__m128i);
      ref += sizeof(__m128i);
    }
  }
  /* Look into the remainder */
  while ((ip < ip_bound) && (*ref++ == x)) ip++;
  return ip;
}

#else

static uint8_t *get_run(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
  uint8_t x = ip[-1];
  int64_t value, value2;
  /* Broadcast the value for every byte in a 64-bit register */
  memset(&value, x, 8);
  /* safe because of the outer check against the ip limit */
  while (ip < (ip_bound - sizeof(int64_t))) {
#if defined(BLOSC_STRICT_ALIGN)
    memcpy(&value2, ref, 8);
#else
    value2 = ((int64_t*)ref)[0];
#endif
    if (value != value2) {
      /* Return the byte that starts to differ */
      while (*ref++ == x) ip++;
      return ip;
    }
    else {
      ip += 8;
      ref += 8;
    }
  }
  /* Look into the remainder */
  while ((ip < ip_bound) && (*ref++ == x)) ip++;
  return ip;
}

#endif


/* Return the byte that starts to differ */
static uint8_t *get_match(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
#if !defined(BLOSC_STRICT_ALIGN)
  while (ip < (ip_bound - sizeof(int64_t))) {
    if (*(int64_t*)ref != *(int64_t*)ip) {
      /* Return the byte that starts to differ */
      while (*ref++ == *ip++) {}
      return ip;
    }
    else {
      ip += sizeof(int64_t);
      ref += sizeof(int64_t);
    }
  }
#endif
  /* Look into the remainder */
  while ((ip < ip_bound) && (*ref++ == *ip++)) {}
  return ip;
}


#if defined(__SSE2__)
static uint8_t *get_match_16(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
  __m128i value, value2, cmp;

  if (ip < (ip_bound - sizeof(int64_t))) {
    if (*(int64_t *) ref != *(int64_t *) ip) {
      /* Return the byte that starts to differ */
      while (*ref++ == *ip++) {}
      return ip;
    } else {
      ip += sizeof(int64_t);
      ref += sizeof(int64_t);
    }
  }
  while (ip < (ip_bound - sizeof(__m128i))) {
    value = _mm_loadu_si128((__m128i *) ip);
    value2 = _mm_loadu_si128((__m128i *) ref);
    cmp = _mm_cmpeq_epi32(value, value2);
    if (_mm_movemask_epi8(cmp) != 0xFFFF) {
      /* Return the byte that starts to differ */
      return get_match(ip, ip_bound, ref);
    }
    else {
      ip += sizeof(__m128i);
      ref += sizeof(__m128i);
    }
  }
  /* Look into the remainder */
  while ((ip < ip_bound) && (*ref++ == *ip++)) {}
  return ip;
}
#endif


#if defined(__AVX2__)
static uint8_t *get_match_32(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {

  if (ip < (ip_bound - sizeof(int64_t))) {
    if (*(int64_t *) ref != *(int64_t *) ip) {
      /* Return the byte that starts to differ */
      while (*ref++ == *ip++) {}
      return ip;
    } else {
      ip += sizeof(int64_t);
      ref += sizeof(int64_t);
    }
  }
  if (ip < (ip_bound - sizeof(__m128i))) {
    __m128i value, value2, cmp;
    value = _mm_loadu_si128((__m128i *) ip);
    value2 = _mm_loadu_si128((__m128i *) ref);
    cmp = _mm_cmpeq_epi32(value, value2);
    if (_mm_movemask_epi8(cmp) != 0xFFFF) {
      /* Return the byte that starts to differ */
      return get_match_16(ip, ip_bound, ref);
    }
    else {
      ip += sizeof(__m128i);
      ref += sizeof(__m128i);
    }
  }
  while (ip < (ip_bound - sizeof(__m256i))) {
    __m256i value, value2, cmp;
    value = _mm256_loadu_si256((__m256i *) ip);
    value2 = _mm256_loadu_si256((__m256i *)ref);
    cmp = _mm256_cmpeq_epi64(value, value2);
    if ((unsigned)_mm256_movemask_epi8(cmp) != 0xFFFFFFFF) {
      /* Return the byte that starts to differ */
      while (*ref++ == *ip++) {}
      return ip;
    }
    else {
      ip += sizeof(__m256i);
      ref += sizeof(__m256i);
    }
  }
  /* Look into the remainder */
  while ((ip < ip_bound) && (*ref++ == *ip++)) {}
  return ip;
}
#endif


static uint8_t* get_run_or_match(uint8_t* ip, uint8_t* ip_bound, const uint8_t* ref, bool run) {
  if (BLOSCLZ_UNLIKELY(run)) {
#if defined(__AVX2__)
    ip = get_run_32(ip, ip_bound, ref);
#elif defined(__SSE2__)
    ip = get_run_16(ip, ip_bound, ref);
#else
    ip = get_run(ip, ip_bound, ref);
#endif
  }
  else {
#if defined(__AVX2__)
    ip = get_match_32(ip, ip_bound, ref);
#elif defined(__SSE2__)
    ip = get_match_16(ip, ip_bound, ref);
#else
    ip = get_match(ip, ip_bound, ref);
#endif
  }

  return ip;
}


#define LITERAL(ip, op, op_limit, anchor, copy) { \
  if (BLOSCLZ_UNLIKELY(op + 2 > op_limit))        \
    goto out;                                     \
  *op++ = *anchor++;                              \
  ip = anchor;                                    \
  copy++;                                         \
  if (BLOSCLZ_UNLIKELY(copy == MAX_COPY)) {       \
    copy = 0;                                     \
    *op++ = MAX_COPY-1;                           \
  }                                               \
}
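
/* Like LITERAL, but only accounts for the number of output bytes (oc) instead
 * of emitting them; used by get_csize() below for entropy probing. */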
#define LITERAL2(ip, oc, anchor, copy) {    \
  oc++; anchor++;                           \
  ip = anchor;                              \
  copy++;                                   \
  if (BLOSCLZ_UNLIKELY(copy == MAX_COPY)) { \
    copy = 0;                               \
    oc++;                                   \
  }                                         \
}

#define DISTANCE_SHORT(op, op_limit, len, distance) {     \
  if (BLOSCLZ_UNLIKELY(op + 2 > op_limit))                \
    goto out;                                             \
  *op++ = (uint8_t)((len << 5U) + (distance >> 8U));      \
  *op++ = (uint8_t)((distance & 255U));                   \
}

#define DISTANCE_LONG(op, op_limit, len, distance) {      \
  if (BLOSCLZ_UNLIKELY(op + 1 > op_limit))                \
    goto out;                                             \
  *op++ = (uint8_t)((7U << 5U) + (distance >> 8U));       \
  for (len -= 7; len >= 255; len -= 255) {                \
    if (BLOSCLZ_UNLIKELY(op + 1 > op_limit))              \
      goto out;                                           \
    *op++ = 255;                                          \
  }                                                       \
  if (BLOSCLZ_UNLIKELY(op + 2 > op_limit))                \
    goto out;                                             \
  *op++ = (uint8_t)len;                                   \
  *op++ = (uint8_t)((distance & 255U));                   \
}

#define DISTANCE_SHORT_FAR(op, op_limit, len, distance) { \
  if (BLOSCLZ_UNLIKELY(op + 4 > op_limit))                \
    goto out;                                             \
  *op++ = (uint8_t)((len << 5U) + 31);                    \
  *op++ = 255;                                            \
  *op++ = (uint8_t)(distance >> 8U);                      \
  *op++ = (uint8_t)(distance & 255U);                     \
}

#define DISTANCE_LONG_FAR(op, op_limit, len, distance) {  \
  if (BLOSCLZ_UNLIKELY(op + 1 > op_limit))                \
    goto out;                                             \
  *op++ = (7U << 5U) + 31;                                \
  for (len -= 7; len >= 255; len -= 255) {                \
    if (BLOSCLZ_UNLIKELY(op + 1 > op_limit))              \
      goto out;                                           \
    *op++ = 255;                                          \
  }                                                       \
  if (BLOSCLZ_UNLIKELY(op + 4 > op_limit))                \
    goto out;                                             \
  *op++ = (uint8_t)len;                                   \
  *op++ = 255;                                            \
  *op++ = (uint8_t)(distance >> 8U);                      \
  *op++ = (uint8_t)(distance & 255U);                     \
}
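
/* Token layout emitted by the macros above and read back by
 * blosclz_decompress() below (a sketch derived from this file, not a formal
 * spec):
 *   literal run : one byte holding <copy count - 1> (0..MAX_COPY-1), followed
 *                 by that many literal bytes;
 *   short match : <len:3|distance_hi:5> <distance_lo:8>, with the length
 *                 field in 1..6;
 *   long match  : length field set to 7, then (len - 7) emitted as 255-valued
 *                 bytes plus a final remainder byte, then <distance_lo:8>;
 *   far match   : distance_hi saturated to 31 plus a 255 marker byte, then a
 *                 16-bit distance with MAX_DISTANCE already subtracted. */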


// Get the compressed size of a buffer.  Useful for testing compression ratios for high clevels.
static int get_csize(uint8_t* ibase, int maxlen, bool force_3b_shift) {
  uint8_t* ip = ibase;
  int32_t oc = 0;
  uint8_t* ip_bound = ibase + maxlen - 1;
  uint8_t* ip_limit = ibase + maxlen - 12;
  uint32_t htab[1U << (uint8_t)HASH_LOG];
  uint32_t hval;
  uint32_t seq;
  uint8_t copy;

  // Initialize the hash table to distances of 0
  for (unsigned i = 0; i < (1U << HASH_LOG); i++) {
    htab[i] = 0;
  }

  /* we start with literal copy */
  copy = 4;
  oc += 5;

  /* main loop */
  while (BLOSCLZ_LIKELY(ip < ip_limit)) {
    const uint8_t* ref;
    unsigned distance;
    uint8_t* anchor = ip;  /* comparison starting-point */

    /* find potential match */
    seq = BLOSCLZ_READU32(ip);
    HASH_FUNCTION(hval, seq, HASH_LOG)
    ref = ibase + htab[hval];

    /* calculate distance to the match */
    distance = anchor - ref;

    /* update hash table */
    htab[hval] = (uint32_t) (anchor - ibase);

    if (distance == 0 || (distance >= MAX_FARDISTANCE)) {
      LITERAL2(ip, oc, anchor, copy)
      continue;
    }

    /* is this a match? check the first 4 bytes */
    if (BLOSCLZ_UNLIKELY(BLOSCLZ_READU32(ref) == BLOSCLZ_READU32(ip))) {
      ref += 4;
    }
    else {
      /* no luck, copy as a literal */
      LITERAL2(ip, oc, anchor, copy)
      continue;
    }

    /* last matched byte */
    ip = anchor + 4;

    /* distance is biased */
    distance--;

    /* get runs or matches; zero distance means a run */
    ip = get_run_or_match(ip, ip_bound, ref, !distance);

    ip -= force_3b_shift ? 3 : 4;
    unsigned len = (int)(ip - anchor);
    // If the match is close, reduce the minimum length needed to encode it
    unsigned minlen = (distance < MAX_DISTANCE) ? 3 : 4;
    // Encoding short lengths is expensive during decompression
    if (len < minlen) {
      LITERAL2(ip, oc, anchor, copy)
      continue;
    }

    /* if we haven't copied anything, adjust the output counter */
    if (!copy)
      oc--;
    /* reset literal counter */
    copy = 0;

    /* encode the match */
    if (distance < MAX_DISTANCE) {
      if (len >= 7) {
        oc += ((len - 7) / 255) + 1;
      }
      oc += 2;
    }
    else {
      /* far away, but not yet in another galaxy... */
      if (len >= 7) {
        oc += ((len - 7) / 255) + 1;
      }
      oc += 4;
    }

    /* update the hash at match boundary */
    seq = BLOSCLZ_READU32(ip);
    HASH_FUNCTION(hval, seq, HASH_LOG)
    htab[hval] = (uint32_t) (ip++ - ibase);
    seq >>= 8U;
    HASH_FUNCTION(hval, seq, HASH_LOG)
    htab[hval] = (uint32_t) (ip++ - ibase);
    /* assuming literal copy */
    oc++;

  }

  /* if nothing was copied since the last token, drop the reserved token byte */
  if (!copy)
    oc--;

  return (int)oc;
}


int blosclz_compress(const int clevel, const void* input, int length,
                     void* output, int maxout) {
  uint8_t* ibase = (uint8_t*)input;
  uint8_t* ip = ibase;
  uint8_t* ip_bound = ibase + length - 1;
  uint8_t* ip_limit = ibase + length - 12;
  uint8_t* op = (uint8_t*)output;
  uint8_t* op_limit;
  uint32_t htab[1U << (uint8_t)HASH_LOG];
  uint32_t hval;
  uint32_t seq;
  uint8_t copy;

  op_limit = op + maxout;

  // Minimum lengths for encoding
  unsigned minlen_[10] = {0, 12, 12, 11, 10, 9, 8, 7, 6, 5};

  // Minimum compression ratios needed to initiate encoding
  double cratio_[10] = {0, 2, 2, 2, 2, 1.8, 1.6, 1.4, 1.2, 1.1};

  uint8_t hashlog_[10] = {0, HASH_LOG - 2, HASH_LOG - 1, HASH_LOG, HASH_LOG,
                          HASH_LOG, HASH_LOG, HASH_LOG, HASH_LOG, HASH_LOG};
  uint8_t hashlog = hashlog_[clevel];
  // Initialize the hash table to distances of 0
  for (unsigned i = 0; i < (1U << hashlog); i++) {
    htab[i] = 0;
  }

  /* the input and output buffers cannot be smaller than 16 and 66 bytes respectively, or we can get into trouble */
  if (length < 16 || maxout < 66) {
    return 0;
  }

  /* When we go back in a match (shift), we obtain quite different compression properties.
   * It looks like 4 is more useful in combination with bitshuffle and small typesizes
   * (it compresses better and faster in e.g. `b2bench blosclz bitshuffle single 6 6291456 1 19`).
   * We fall back to 4 because it provides more consistent results for small itemsizes.
   *
   * In this block we also check the cratios at the beginning of the buffers and
   * eventually discard those that are too small (they take too long to decompress).
   * This process is called _entropy probing_.
   */
  int ipshift = 4;
  int maxlen;  // maximum length for entropy probing
  int csize_3b;
  int csize_4b;
  double cratio = 0;
  switch (clevel) {
    case 1:
    case 2:
    case 3:
      maxlen = length / 8;
      csize_4b = get_csize(ibase, maxlen, false);
      cratio = (double)maxlen / csize_4b;
      break;
    case 4:
    case 5:
    case 6:
    case 7:
    case 8:
      maxlen = length / 8;
      csize_4b = get_csize(ibase, maxlen, false);
      cratio = (double)maxlen / csize_4b;
      break;
    case 9:
      // case 9 is special: we need to assess the optimal shift
      maxlen = length / 8;
      csize_3b = get_csize(ibase, maxlen, true);
      csize_4b = get_csize(ibase, maxlen, false);
      ipshift = (csize_3b < csize_4b) ? 3 : 4;
      cratio = (csize_3b < csize_4b) ? ((double)maxlen / csize_3b) : ((double)maxlen / csize_4b);
      break;
    default:
      break;
  }
  // discard probes with small compression ratios (too expensive)
  if (cratio < cratio_[clevel]) {
    goto out;
  }

  /* we start with literal copy */
  copy = 4;
  *op++ = MAX_COPY - 1;
  *op++ = *ip++;
  *op++ = *ip++;
  *op++ = *ip++;
  *op++ = *ip++;

  /* main loop */
  while (BLOSCLZ_LIKELY(ip < ip_limit)) {
    const uint8_t* ref;
    unsigned distance;
    uint8_t* anchor = ip;  /* comparison starting-point */

    /* find potential match */
    seq = BLOSCLZ_READU32(ip);
    HASH_FUNCTION(hval, seq, hashlog)
    ref = ibase + htab[hval];

    /* calculate distance to the match */
    distance = anchor - ref;

    /* update hash table */
    htab[hval] = (uint32_t) (anchor - ibase);

    if (distance == 0 || (distance >= MAX_FARDISTANCE)) {
      LITERAL(ip, op, op_limit, anchor, copy)
      continue;
    }

    /* is this a match? check the first 4 bytes */
    if (BLOSCLZ_UNLIKELY(BLOSCLZ_READU32(ref) == BLOSCLZ_READU32(ip))) {
      ref += 4;
    } else {
      /* no luck, copy as a literal */
      LITERAL(ip, op, op_limit, anchor, copy)
      continue;
    }

    /* last matched byte */
    ip = anchor + 4;

    /* distance is biased */
    distance--;

    /* get runs or matches; zero distance means a run */
    ip = get_run_or_match(ip, ip_bound, ref, !distance);

    /* length is biased, '1' means a match of 3 bytes */
    ip -= ipshift;

    unsigned len = (int)(ip - anchor);
    // The minimum length to encode depends on the compression level
    unsigned minlen = (clevel == 9) ? ipshift : minlen_[clevel];

    // Encoding short lengths is expensive during decompression
    // Encode only reasonable lengths (extensive experiments done)
    if (len < minlen || (len <= 5 && distance >= MAX_DISTANCE)) {
      LITERAL(ip, op, op_limit, anchor, copy)
      continue;
    }

    /* if we have copied something, adjust the copy count */
    if (copy)
      /* copy is biased, '0' means 1 byte copy */
      *(op - copy - 1) = (uint8_t)(copy - 1);
    else
      /* back, to overwrite the copy count */
      op--;
    /* reset literal counter */
    copy = 0;

    /* encode the match */
    if (distance < MAX_DISTANCE) {
      if (len < 7) {
        DISTANCE_SHORT(op, op_limit, len, distance)
      } else {
        DISTANCE_LONG(op, op_limit, len, distance)
      }
    } else {
      /* far away, but not yet in another galaxy... */
      distance -= MAX_DISTANCE;
      if (len < 7) {
        DISTANCE_SHORT_FAR(op, op_limit, len, distance)
      } else {
        DISTANCE_LONG_FAR(op, op_limit, len, distance)
      }
    }

    /* update the hash at match boundary */
    seq = BLOSCLZ_READU32(ip);
    HASH_FUNCTION(hval, seq, hashlog)
    htab[hval] = (uint32_t) (ip++ - ibase);
    seq >>= 8U;
    HASH_FUNCTION(hval, seq, hashlog)
    htab[hval] = (uint32_t) (ip++ - ibase);
    /* assuming literal copy */

    if (BLOSCLZ_UNLIKELY(op + 1 > op_limit))
      goto out;
    *op++ = MAX_COPY - 1;
  }

  /* left-over as literal copy */
  while (BLOSCLZ_UNLIKELY(ip <= ip_bound)) {
    if (BLOSCLZ_UNLIKELY(op + 2 > op_limit)) goto out;
    *op++ = *ip++;
    copy++;
    if (BLOSCLZ_UNLIKELY(copy == MAX_COPY)) {
      copy = 0;
      *op++ = MAX_COPY - 1;
    }
  }

  /* if we have copied something, adjust the copy length */
  if (copy)
    *(op - copy - 1) = (uint8_t)(copy - 1);
  else
    op--;

  /* marker for blosclz */
  *(uint8_t*)output |= (1U << 5U);

  return (int)(op - (uint8_t*)output);

  out:
  return 0;
}
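
/*
 * Minimal usage sketch (illustrative only: the buffer sizes, the clevel and
 * the zero-filled input are arbitrary assumptions, not part of the library):
 *
 *   uint8_t src[4096] = {0};   // highly compressible input (>= 16 bytes)
 *   uint8_t comp[4096];        // maxout must be >= 66 bytes
 *   uint8_t dest[4096];
 *   int csize = blosclz_compress(9, src, (int)sizeof(src), comp, (int)sizeof(comp));
 *   if (csize > 0) {           // 0 means "could not compress within maxout"
 *     int dsize = blosclz_decompress(comp, csize, dest, (int)sizeof(dest));
 *     // dsize should equal sizeof(src) on success
 *   }
 */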

// See https://habr.com/en/company/yandex/blog/457612/
#ifdef __AVX2__

#if defined(_MSC_VER)
#define ALIGNED_(x) __declspec(align(x))
#else
#if defined(__GNUC__)
#define ALIGNED_(x) __attribute__ ((aligned(x)))
#endif
#endif
#define ALIGNED_TYPE_(t, x) t ALIGNED_(x)

static unsigned char* copy_match_16(unsigned char *op, const unsigned char *match, int32_t len)
{
  size_t offset = op - match;
  while (len >= 16) {

    static const ALIGNED_TYPE_(uint8_t, 16) masks[] =
    {
            0,  1,  2,  1,  4,  1,  4,  2,  8,  7,  6,  5,  4,  3,  2,  1, // offset = 0, not used as mask, but for shift
            0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, // offset = 1
            0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,
            0,  1,  2,  0,  1,  2,  0,  1,  2,  0,  1,  2,  0,  1,  2,  0,
            0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3,
            0,  1,  2,  3,  4,  0,  1,  2,  3,  4,  0,  1,  2,  3,  4,  0,
            0,  1,  2,  3,  4,  5,  0,  1,  2,  3,  4,  5,  0,  1,  2,  3,
            0,  1,  2,  3,  4,  5,  6,  0,  1,  2,  3,  4,  5,  6,  0,  1,
            0,  1,  2,  3,  4,  5,  6,  7,  0,  1,  2,  3,  4,  5,  6,  7,
            0,  1,  2,  3,  4,  5,  6,  7,  8,  0,  1,  2,  3,  4,  5,  6,
            0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  0,  1,  2,  3,  4,  5,
            0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10,  0,  1,  2,  3,  4,
            0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11,  0,  1,  2,  3,
            0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12,  0,  1,  2,
            0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13,  0,  1,
            0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,  0,
            0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, // offset = 16
    };

    _mm_storeu_si128((__m128i *)(op),
                     _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(match)),
                                      _mm_load_si128((const __m128i *)(masks) + offset)));

    match += masks[offset];

    op += 16;
    len -= 16;
  }
  // Deal with remainders
  for (; len > 0; len--) {
    *op++ = *match++;
  }
  return op;
}
#endif

// LZ4 wildCopy which can reach excellent copy bandwidth (even if insecure)
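// Note: it copies in 8-byte chunks and may overrun `end` by up to 7 bytes; the
// call site below only uses it when op - ref >= 8 and op_limit - op >= len + 8.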
static inline void wild_copy(uint8_t *out, const uint8_t* from, uint8_t* end) {
  uint8_t* d = out;
  const uint8_t* s = from;
  uint8_t* const e = end;

  do { memcpy(d,s,8); d+=8; s+=8; } while (d<e);
}

int blosclz_decompress(const void* input, int length, void* output, int maxout) {
  const uint8_t* ip = (const uint8_t*)input;
  const uint8_t* ip_limit = ip + length;
  uint8_t* op = (uint8_t*)output;
  uint32_t ctrl;
  uint8_t* op_limit = op + maxout;
  if (BLOSCLZ_UNLIKELY(length == 0)) {
    return 0;
  }
  ctrl = (*ip++) & 31U;

  while (1) {
    if (ctrl >= 32) {
      // match
      int32_t len = (ctrl >> 5U) - 1;
      int32_t ofs = (ctrl & 31U) << 8U;
      uint8_t code;
      const uint8_t* ref = op - ofs;

      if (len == 7 - 1) {
        do {
          if (BLOSCLZ_UNLIKELY(ip + 1 >= ip_limit)) {
            return 0;
          }
          code = *ip++;
          len += code;
        } while (code == 255);
      }
      else {
        if (BLOSCLZ_UNLIKELY(ip + 1 >= ip_limit)) {
          return 0;
        }
      }
      code = *ip++;
      len += 3;
      ref -= code;

      /* match from 16-bit distance */
      if (BLOSCLZ_UNLIKELY(code == 255)) {
        if (ofs == (31U << 8U)) {
          if (ip + 1 >= ip_limit) {
            return 0;
          }
          ofs = (*ip++) << 8U;
          ofs += *ip++;
          ref = op - ofs - MAX_DISTANCE;
        }
      }

      if (BLOSCLZ_UNLIKELY(op + len > op_limit)) {
        return 0;
      }

      if (BLOSCLZ_UNLIKELY(ref - 1 < (uint8_t*)output)) {
        return 0;
      }

      if (BLOSCLZ_UNLIKELY(ip >= ip_limit)) break;
      ctrl = *ip++;

      ref--;
      if (ref == op - 1) {
        /* optimized copy for a run */
        memset(op, *ref, len);
        op += len;
      }
      else if ((op - ref >= 8) && (op_limit - op >= len + 8)) {
        // copy with an overlap not larger than 8
        wild_copy(op, ref, op + len);
        op += len;
      }
      else {
        // general copy with any overlap
#ifdef __AVX2__
        if (op - ref <= 16) {
          // This is not faster across different compilers (clang, gcc, icc) or
          // machines, but it is not slower either.  Keep it enabled here for
          // experimentation.
          op = copy_match_16(op, ref, len);
        }
        else {
#endif
          op = copy_match(op, ref, (unsigned) len);
#ifdef __AVX2__
        }
#endif
      }
    }
    else {
      // literal
      ctrl++;
      if (BLOSCLZ_UNLIKELY(op + ctrl > op_limit)) {
        return 0;
      }
      if (BLOSCLZ_UNLIKELY(ip + ctrl > ip_limit)) {
        return 0;
      }

      memcpy(op, ip, ctrl); op += ctrl; ip += ctrl;
      // On GCC-6, fastcopy is still faster than plain memcpy.
      // However, with recent CLANG/LLVM 9.0, there is almost no difference
      // in performance.
      // And starting with CLANG/LLVM 10 and GCC 9, memcpy is generally faster.
      // op = fastcopy(op, ip, (unsigned) ctrl); ip += ctrl;

      if (BLOSCLZ_UNLIKELY(ip >= ip_limit)) break;
      ctrl = *ip++;
    }
  }

  return (int)(op - (uint8_t*)output);
}