/*********************************************************************
  Blosc - Blocked Shuffling and Compression Library

  Copyright (C) 2021  The Blosc Developers <blosc@blosc.org>
  https://blosc.org
  License: BSD 3-Clause (see LICENSE.txt)

  See LICENSE.txt for details about copyright and rights to use.
**********************************************************************/

/*********************************************************************
  The code in this file is heavily based on FastLZ, a lightning-fast
  lossless compression library.  See LICENSES/FASTLZ.txt for details.
**********************************************************************/


#include <stdio.h>
#include <stdbool.h>
#include <string.h>  /* memset(), memcpy() */
#include "blosclz.h"
#include "fastcopy.h"
#include "blosc2/blosc2-common.h"


/*
 * Give hints to the compiler for branch prediction optimization.
 */
#if defined(__GNUC__) && (__GNUC__ > 2)
#define BLOSCLZ_LIKELY(c) (__builtin_expect((c), 1))
#define BLOSCLZ_UNLIKELY(c) (__builtin_expect((c), 0))
#else
#define BLOSCLZ_LIKELY(c) (c)
#define BLOSCLZ_UNLIKELY(c) (c)
#endif

/*
 * Use inlined functions for supported systems.
 */
#if defined(_MSC_VER) && !defined(__cplusplus)  /* Visual Studio */
#define inline __inline  /* Visual C is not C99, but supports some kind of inline */
#endif

#define MAX_COPY 32U
#define MAX_DISTANCE 8191
#define MAX_FARDISTANCE (65535 + MAX_DISTANCE - 1)
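/* A short match token encodes the (biased) distance in 5 bits of the control
 * byte plus one extra byte, so MAX_DISTANCE = (31 << 8) + 255 = 8191 (13 bits).
 * Far matches add a 16-bit offset stored relative to MAX_DISTANCE, which gives
 * MAX_FARDISTANCE (see the MATCH_*_FAR macros and blosclz_decompress below). */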

#ifdef BLOSC_STRICT_ALIGN
#define BLOSCLZ_READU16(p) ((p)[0] | (p)[1]<<8)
#define BLOSCLZ_READU32(p) ((p)[0] | (p)[1]<<8 | (p)[2]<<16 | (p)[3]<<24)
#else
#define BLOSCLZ_READU16(p) *((const uint16_t*)(p))
#define BLOSCLZ_READU32(p) *((const uint32_t*)(p))
#endif

#define HASH_LOG (14U)
#define HASH_LOG2 (12U)

// This is used in LZ4 and seems to work pretty well here too
#define HASH_FUNCTION(v, s, h) {                 \
  v = (s * 2654435761U) >> (32U - h);            \
}
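// 2654435761 is the Knuth/Fibonacci multiplicative-hashing constant
// (about 2^32 / golden ratio): multiplying by it and keeping only the top
// h bits spreads the 4-byte sequence s evenly over a 2^h-entry table.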


#if defined(__AVX2__)
static uint8_t *get_run_32(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
  uint8_t x = ip[-1];

  while (ip < (ip_bound - (sizeof(__m256i)))) {
    __m256i value, value2, cmp;
    /* Broadcast the value for every byte in a 256-bit register */
    memset(&value, x, sizeof(__m256i));
    value2 = _mm256_loadu_si256((__m256i *)ref);
    cmp = _mm256_cmpeq_epi64(value, value2);
    if ((unsigned)_mm256_movemask_epi8(cmp) != 0xFFFFFFFF) {
      /* Return the byte that starts to differ */
      while (*ref++ == x) ip++;
      return ip;
    }
    else {
      ip += sizeof(__m256i);
      ref += sizeof(__m256i);
    }
  }
  /* Look into the remainder */
  while ((ip < ip_bound) && (*ref++ == x)) ip++;
  return ip;
}
#endif

#if defined(__SSE2__)
static uint8_t *get_run_16(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
  uint8_t x = ip[-1];

  while (ip < (ip_bound - sizeof(__m128i))) {
    __m128i value, value2, cmp;
    /* Broadcast the value for every byte in a 128-bit register */
    memset(&value, x, sizeof(__m128i));
    value2 = _mm_loadu_si128((__m128i *)ref);
    cmp = _mm_cmpeq_epi32(value, value2);
    if (_mm_movemask_epi8(cmp) != 0xFFFF) {
      /* Return the byte that starts to differ */
      while (*ref++ == x) ip++;
      return ip;
    }
    else {
      ip += sizeof(__m128i);
      ref += sizeof(__m128i);
    }
  }
  /* Look into the remainder */
  while ((ip < ip_bound) && (*ref++ == x)) ip++;
  return ip;
}

#endif


static uint8_t *get_run(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
  uint8_t x = ip[-1];
  int64_t value, value2;
  /* Broadcast the value for every byte in a 64-bit register */
  memset(&value, x, 8);
  /* safe because of the outer check against the ip limit */
  while (ip < (ip_bound - sizeof(int64_t))) {
#if defined(BLOSC_STRICT_ALIGN)
    memcpy(&value2, ref, 8);
#else
    value2 = ((int64_t*)ref)[0];
#endif
    if (value != value2) {
      /* Return the byte that starts to differ */
      while (*ref++ == x) ip++;
      return ip;
    }
    else {
      ip += 8;
      ref += 8;
    }
  }
  /* Look into the remainder */
  while ((ip < ip_bound) && (*ref++ == x)) ip++;
  return ip;
}


/* Return the byte that starts to differ */
static uint8_t *get_match(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
#if !defined(BLOSC_STRICT_ALIGN)
  while (ip < (ip_bound - sizeof(int64_t))) {
    if (*(int64_t*)ref != *(int64_t*)ip) {
      /* Return the byte that starts to differ */
      while (*ref++ == *ip++) {}
      return ip;
    }
    else {
      ip += sizeof(int64_t);
      ref += sizeof(int64_t);
    }
  }
#endif
  /* Look into the remainder */
  while ((ip < ip_bound) && (*ref++ == *ip++)) {}
  return ip;
}


#if defined(__SSE2__)
static uint8_t *get_match_16(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
  __m128i value, value2, cmp;

  while (ip < (ip_bound - sizeof(__m128i))) {
    value = _mm_loadu_si128((__m128i *) ip);
    value2 = _mm_loadu_si128((__m128i *) ref);
    cmp = _mm_cmpeq_epi32(value, value2);
    if (_mm_movemask_epi8(cmp) != 0xFFFF) {
      /* Return the byte that starts to differ */
      while (*ref++ == *ip++) {}
      return ip;
    }
    else {
      ip += sizeof(__m128i);
      ref += sizeof(__m128i);
    }
  }
  /* Look into the remainder */
  while ((ip < ip_bound) && (*ref++ == *ip++)) {}
  return ip;
}
#endif


#if defined(__AVX2__)
static uint8_t *get_match_32(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {

  while (ip < (ip_bound - sizeof(__m256i))) {
    __m256i value, value2, cmp;
    value = _mm256_loadu_si256((__m256i *) ip);
    value2 = _mm256_loadu_si256((__m256i *)ref);
    cmp = _mm256_cmpeq_epi64(value, value2);
    if ((unsigned)_mm256_movemask_epi8(cmp) != 0xFFFFFFFF) {
      /* Return the byte that starts to differ */
      while (*ref++ == *ip++) {}
      return ip;
    }
    else {
      ip += sizeof(__m256i);
      ref += sizeof(__m256i);
    }
  }
  /* Look into the remainder */
  while ((ip < ip_bound) && (*ref++ == *ip++)) {}
  return ip;
}
#endif


static uint8_t* get_run_or_match(uint8_t* ip, uint8_t* ip_bound, const uint8_t* ref, bool run) {
  if (BLOSCLZ_UNLIKELY(run)) {
#if defined(__AVX2__)
    // Extensive experiments on AMD Ryzen3 say that regular get_run is faster
    // ip = get_run_32(ip, ip_bound, ref);
    ip = get_run(ip, ip_bound, ref);
#elif defined(__SSE2__)
    // Extensive experiments on AMD Ryzen3 say that regular get_run is faster
    // ip = get_run_16(ip, ip_bound, ref);
    ip = get_run(ip, ip_bound, ref);
#else
    ip = get_run(ip, ip_bound, ref);
#endif
  }
  else {
#if defined(__AVX2__)
    // Extensive experiments on AMD Ryzen3 say that regular get_match_16 is faster
    // ip = get_match_32(ip, ip_bound, ref);
    ip = get_match_16(ip, ip_bound, ref);
#elif defined(__SSE2__)
    ip = get_match_16(ip, ip_bound, ref);
#else
    ip = get_match(ip, ip_bound, ref);
#endif
  }

  return ip;
}
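
/* A "run" is the special case of a match whose biased distance is zero, i.e.
 * the match overlaps itself and just repeats the previous byte (plain RLE),
 * which is why the run detectors above compare against ip[-1].  The SSE2/AVX2
 * variants are kept for experimentation, but as the comments above note,
 * benchmarks favored the plain 64-bit versions for runs. */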


#define LITERAL(ip, op, op_limit, anchor, copy) {    \
  if (BLOSCLZ_UNLIKELY(op + 2 > op_limit))           \
    goto out;                                        \
  *op++ = *anchor++;                                 \
  ip = anchor;                                       \
  copy++;                                            \
  if (BLOSCLZ_UNLIKELY(copy == MAX_COPY)) {          \
    copy = 0;                                        \
    *op++ = MAX_COPY-1;                              \
  }                                                  \
}

#define LITERAL2(ip, anchor, copy) {                 \
  oc++; anchor++;                                    \
  ip = anchor;                                       \
  copy++;                                            \
  if (BLOSCLZ_UNLIKELY(copy == MAX_COPY)) {          \
    copy = 0;                                        \
    oc++;                                            \
  }                                                  \
}
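
/* LITERAL emits one literal byte and, when the literal run reaches MAX_COPY,
 * opens a fresh run token.  LITERAL2 is its dry-run twin for get_cratio()
 * below: it only advances the output-byte counter (oc) without writing
 * anything, so the two must be kept in sync. */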

#define MATCH_SHORT(op, op_limit, len, distance) {   \
  if (BLOSCLZ_UNLIKELY(op + 2 > op_limit))           \
    goto out;                                        \
  *op++ = (uint8_t)((len << 5U) + (distance >> 8U)); \
  *op++ = (uint8_t)((distance & 255U));              \
}

#define MATCH_LONG(op, op_limit, len, distance) {    \
  if (BLOSCLZ_UNLIKELY(op + 1 > op_limit))           \
    goto out;                                        \
  *op++ = (uint8_t)((7U << 5U) + (distance >> 8U));  \
  for (len -= 7; len >= 255; len -= 255) {           \
    if (BLOSCLZ_UNLIKELY(op + 1 > op_limit))         \
      goto out;                                      \
    *op++ = 255;                                     \
  }                                                  \
  if (BLOSCLZ_UNLIKELY(op + 2 > op_limit))           \
    goto out;                                        \
  *op++ = (uint8_t)len;                              \
  *op++ = (uint8_t)((distance & 255U));              \
}

#define MATCH_SHORT_FAR(op, op_limit, len, distance) { \
  if (BLOSCLZ_UNLIKELY(op + 4 > op_limit))           \
    goto out;                                        \
  *op++ = (uint8_t)((len << 5U) + 31);               \
  *op++ = 255;                                       \
  *op++ = (uint8_t)(distance >> 8U);                 \
  *op++ = (uint8_t)(distance & 255U);                \
}

#define MATCH_LONG_FAR(op, op_limit, len, distance) { \
  if (BLOSCLZ_UNLIKELY(op + 1 > op_limit))           \
    goto out;                                        \
  *op++ = (7U << 5U) + 31;                           \
  for (len -= 7; len >= 255; len -= 255) {           \
    if (BLOSCLZ_UNLIKELY(op + 1 > op_limit))         \
      goto out;                                      \
    *op++ = 255;                                     \
  }                                                  \
  if (BLOSCLZ_UNLIKELY(op + 4 > op_limit))           \
    goto out;                                        \
  *op++ = (uint8_t)len;                              \
  *op++ = 255;                                       \
  *op++ = (uint8_t)(distance >> 8U);                 \
  *op++ = (uint8_t)(distance & 255U);                \
}
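
/* A sketch of the token layout, as implied by the macros above and by
 * blosclz_decompress() below (the first byte of a token is the control byte;
 * e.g. a short match copies LLL + 2 bytes):
 *
 *   literal run      [000ccccc]                      c + 1 literal bytes follow
 *   short match      [LLLddddd][dddddddd]            LLL in 1..6, 13-bit distance
 *   long match       [111ddddd][255]*[n][dddddddd]   length continues in 255-byte
 *                                                    steps plus a remainder n
 *   short far match  [LLL11111][255][dhigh][dlow]    extra 16-bit distance
 *   long far match   [11111111][255]*[n][255][dhigh][dlow]
 *
 * Distances are stored biased by 1, and far distances relative to
 * MAX_DISTANCE. */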


// Get a quick estimate of the compression ratio of a buffer
static double get_cratio(uint8_t* ibase, int maxlen, int minlen, int ipshift) {
  uint8_t* ip = ibase;
  int32_t oc = 0;
  const uint16_t hashlen = (1U << (uint8_t)HASH_LOG2);
  uint16_t htab[1U << (uint8_t)HASH_LOG2];
  uint32_t hval;
  uint32_t seq;
  uint8_t copy;
  // Make a tradeoff between testing too much and too little
  uint16_t limit = (maxlen > hashlen) ? hashlen : maxlen;
  uint8_t* ip_bound = ibase + limit - 1;
  uint8_t* ip_limit = ibase + limit - 12;

  // Initialize the hash table to distances of 0
  memset(htab, 0, hashlen * sizeof(uint16_t));

  /* we start with literal copy */
  copy = 4;
  oc += 5;

  /* main loop */
  while (BLOSCLZ_LIKELY(ip < ip_limit)) {
    const uint8_t* ref;
    unsigned distance;
    uint8_t* anchor = ip;  /* comparison starting-point */

    /* find potential match */
    seq = BLOSCLZ_READU32(ip);
    HASH_FUNCTION(hval, seq, HASH_LOG2)
    ref = ibase + htab[hval];

    /* calculate distance to the match */
    distance = (unsigned int)(anchor - ref);

    /* update hash table */
    htab[hval] = (uint16_t) (anchor - ibase);

    if (distance == 0 || (distance >= MAX_FARDISTANCE)) {
      LITERAL2(ip, anchor, copy)
      continue;
    }

    /* is this a match? check the first 4 bytes */
    if (BLOSCLZ_READU32(ref) == BLOSCLZ_READU32(ip)) {
      ref += 4;
    }
    else {
      /* no luck, copy as a literal */
      LITERAL2(ip, anchor, copy)
      continue;
    }

    /* last matched byte */
    ip = anchor + 4;

    /* distance is biased */
    distance--;

    /* get runs or matches; zero distance means a run */
    ip = get_run_or_match(ip, ip_bound, ref, !distance);

    ip -= ipshift;
    unsigned len = (unsigned)(ip - anchor);
    if (len < (unsigned)minlen) {
      LITERAL2(ip, anchor, copy)
      continue;
    }

    /* if we haven't copied anything, adjust the output counter */
    if (!copy)
      oc--;
    /* reset literal counter */
    copy = 0;

    /* encode the match */
    if (distance < MAX_DISTANCE) {
      if (len >= 7) {
        oc += ((len - 7) / 255) + 1;
      }
      oc += 2;
    }
    else {
      /* far away, but not yet in another galaxy... */
      if (len >= 7) {
        oc += ((len - 7) / 255) + 1;
      }
      oc += 4;
    }

    /* update the hash at match boundary */
    seq = BLOSCLZ_READU32(ip);
    HASH_FUNCTION(hval, seq, HASH_LOG2)
    htab[hval] = (uint16_t)(ip++ - ibase);
    ip++;
    /* assuming literal copy */
    oc++;
  }

  double ic;
out:
  ic = (double)(ip - ibase);
  return ic / (double)oc;
}


int blosclz_compress(const int clevel, const void* input, int length,
                     void* output, int maxout, blosc2_context* ctx) {
  uint8_t* ibase = (uint8_t*)input;

  // Experiments say that checking just 1/4 of the buffer is enough to estimate the cratio
  int maxlen = length / 4;
  // Start probing somewhere inside the buffer
  int shift = length - maxlen;
  // Actual entropy probing!
  double cratio = get_cratio(ibase + shift, maxlen, 3, 3);
  // Discard probes with small compression ratios (they are too expensive to be worth it)
  double cratio_[10] = {0, 2, 1.5, 1.2, 1.2, 1.2, 1.2, 1.15, 1.1, 1.0};
  if (cratio < cratio_[clevel]) {
    goto out;
  }
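
  // For example, with clevel 5 the probed region must show an estimated
  // cratio of at least 1.2; below that we give up and return 0 (meaning
  // "uncompressible"), which is far cheaper than compressing in vain.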

  /* When we go back in a match (shift), we obtain quite different compression properties.
   * It looks like 4 is more useful in combination with bitshuffle and small typesizes.
   * We fall back to 4 because it provides more consistent results for large cratios.
   *
   * In this block we also check the cratio for the beginning of the buffer and
   * eventually discard those that are too small (they take too long to decompress).
   * This process is called _entropy probing_.
   */
  unsigned ipshift = 4;
  // Compute the optimal shift and minimum match lengths for encoding.
  // Use 4 by default, except for low-entropy data, where we should do a best effort.
  unsigned minlen = 4;
  // BloscLZ works better with splits mostly, so when data is not split, do a best effort
  const int split_block = !((ctx->header_flags & 0x10) >> 4);
  // The cratio < 4 threshold is based on experiments with low- and high-entropy data
  if (!split_block || cratio < 4) {
    ipshift = 3;
    minlen = 3;
  }
  else {
    minlen = 4;
  }

  uint8_t hashlog_[10] = {0, HASH_LOG - 2, HASH_LOG - 1, HASH_LOG, HASH_LOG,
                          HASH_LOG, HASH_LOG, HASH_LOG, HASH_LOG, HASH_LOG};
  uint8_t hashlog = hashlog_[clevel];

  uint8_t* ip = ibase;
  uint8_t* ip_bound = ibase + length - 1;
  uint8_t* ip_limit = ibase + length - 12;
  uint8_t* op = (uint8_t*)output;
  const uint8_t* op_limit = op + maxout;
  uint32_t seq;
  uint8_t copy;
  uint32_t hval;

  /* the input and output buffers must be at least 16 and 66 bytes long, or we can get into trouble */
  if (length < 16 || maxout < 66) {
    return 0;
  }

  // Initialize the hash table
  uint32_t htab[1U << (uint8_t)HASH_LOG];
  memset(htab, 0, (1U << hashlog) * sizeof(uint32_t));

  /* we start with literal copy */
  copy = 4;
  *op++ = MAX_COPY - 1;
  *op++ = *ip++;
  *op++ = *ip++;
  *op++ = *ip++;
  *op++ = *ip++;

  /* main loop */
  while (BLOSCLZ_LIKELY(ip < ip_limit)) {
    const uint8_t* ref;
    unsigned distance;
    uint8_t* anchor = ip;  /* comparison starting-point */

    /* find potential match */
    seq = BLOSCLZ_READU32(ip);
    HASH_FUNCTION(hval, seq, hashlog)
    ref = ibase + htab[hval];

    /* calculate distance to the match */
    distance = (unsigned int)(anchor - ref);

    /* update hash table */
    htab[hval] = (uint32_t) (anchor - ibase);

    if (distance == 0 || (distance >= MAX_FARDISTANCE)) {
      LITERAL(ip, op, op_limit, anchor, copy)
      continue;
    }

    /* is this a match? check the first 4 bytes */
    if (BLOSCLZ_UNLIKELY(BLOSCLZ_READU32(ref) == BLOSCLZ_READU32(ip))) {
      ref += 4;
    } else {
      /* no luck, copy as a literal */
      LITERAL(ip, op, op_limit, anchor, copy)
      continue;
    }

    /* last matched byte */
    ip = anchor + 4;

    /* distance is biased */
    distance--;

    /* get runs or matches; zero distance means a run */
    ip = get_run_or_match(ip, ip_bound, ref, !distance);

    /* length is biased, '1' means a match of 3 bytes */
    ip -= ipshift;

    unsigned len = (unsigned)(ip - anchor);

    // Encoding short lengths is expensive during decompression
    if (len < minlen || (len <= 5 && distance >= MAX_DISTANCE)) {
      LITERAL(ip, op, op_limit, anchor, copy)
      continue;
    }

    /* if we have copied something, adjust the copy count */
    if (copy)
      /* copy is biased, '0' means 1 byte copy */
      *(op - copy - 1) = (uint8_t)(copy - 1);
    else
      /* back, to overwrite the copy count */
      op--;
    /* reset literal counter */
    copy = 0;

    /* encode the match */
    if (distance < MAX_DISTANCE) {
      if (len < 7) {
        MATCH_SHORT(op, op_limit, len, distance)
      } else {
        MATCH_LONG(op, op_limit, len, distance)
      }
    } else {
      /* far away, but not yet in another galaxy... */
      distance -= MAX_DISTANCE;
      if (len < 7) {
        MATCH_SHORT_FAR(op, op_limit, len, distance)
      } else {
        MATCH_LONG_FAR(op, op_limit, len, distance)
      }
    }

    /* update the hash at match boundary */
    seq = BLOSCLZ_READU32(ip);
    HASH_FUNCTION(hval, seq, hashlog)
    htab[hval] = (uint32_t) (ip++ - ibase);
    if (ctx->clevel == 9) {
      // In some situations, including a second hash proves to be useful,
      // but not in others.  Activate it here for the maximum clevel only.
      seq >>= 8U;
      HASH_FUNCTION(hval, seq, hashlog)
      htab[hval] = (uint32_t) (ip++ - ibase);
    }
    else {
      ip++;
    }

    if (BLOSCLZ_UNLIKELY(op + 1 > op_limit))
      goto out;

    /* assuming literal copy */
    *op++ = MAX_COPY - 1;
  }

  /* left-over as literal copy */
  while (BLOSCLZ_UNLIKELY(ip <= ip_bound)) {
    if (BLOSCLZ_UNLIKELY(op + 2 > op_limit)) goto out;
    *op++ = *ip++;
    copy++;
    if (BLOSCLZ_UNLIKELY(copy == MAX_COPY)) {
      copy = 0;
      *op++ = MAX_COPY - 1;
    }
  }

  /* if we have copied something, adjust the copy length */
  if (copy)
    *(op - copy - 1) = (uint8_t)(copy - 1);
  else
    op--;

  /* marker for blosclz */
  *(uint8_t*)output |= (1U << 5U);

  return (int)(op - (uint8_t*)output);

out:
  return 0;
}

// See https://habr.com/en/company/yandex/blog/457612/
#if defined(__AVX2__)

#if defined(_MSC_VER)
#define ALIGNED_(x) __declspec(align(x))
#else
#if defined(__GNUC__)
#define ALIGNED_(x) __attribute__ ((aligned(x)))
#endif
#endif
#define ALIGNED_TYPE_(t, x) t ALIGNED_(x)

static unsigned char* copy_match_16(unsigned char *op, const unsigned char *match, int32_t len)
{
  size_t offset = op - match;
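
  // The PSHUFB trick (see the habr.com link above): each 16-byte mask row
  // repeats the source indices with period `offset`, so one shuffle turns
  // the first `offset` bytes of `match` into 16 pattern bytes.  Row 0 is not
  // a mask: masks[offset] holds how far to advance `match` so that the next
  // chunk continues the pattern in phase (the bytes it will read have already
  // been written by previous iterations, since the copy overlaps).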
  while (len >= 16) {

    static const ALIGNED_TYPE_(uint8_t, 16) masks[] =
      {
        0,  1,  2,  1,  4,  1,  4,  2,  8,  7,  6,  5,  4,  3,  2,  1, // offset = 0, not used as mask, but for shift
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, // offset = 1
        0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,
        0,  1,  2,  0,  1,  2,  0,  1,  2,  0,  1,  2,  0,  1,  2,  0,
        0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3,
        0,  1,  2,  3,  4,  0,  1,  2,  3,  4,  0,  1,  2,  3,  4,  0,
        0,  1,  2,  3,  4,  5,  0,  1,  2,  3,  4,  5,  0,  1,  2,  3,
        0,  1,  2,  3,  4,  5,  6,  0,  1,  2,  3,  4,  5,  6,  0,  1,
        0,  1,  2,  3,  4,  5,  6,  7,  0,  1,  2,  3,  4,  5,  6,  7,
        0,  1,  2,  3,  4,  5,  6,  7,  8,  0,  1,  2,  3,  4,  5,  6,
        0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  0,  1,  2,  3,  4,  5,
        0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10,  0,  1,  2,  3,  4,
        0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11,  0,  1,  2,  3,
        0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12,  0,  1,  2,
        0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13,  0,  1,
        0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,  0,
        0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, // offset = 16
      };

    _mm_storeu_si128((__m128i *)(op),
                     _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(match)),
                                      _mm_load_si128((const __m128i *)(masks) + offset)));

    match += masks[offset];

    op += 16;
    len -= 16;
  }
  // Deal with remainders
  for (; len > 0; len--) {
    *op++ = *match++;
  }
  return op;
}
#endif

// LZ4 wildCopy which can reach excellent copy bandwidth (even if insecure)
static inline void wild_copy(uint8_t *out, const uint8_t* from, uint8_t* end) {
  uint8_t* d = out;
  const uint8_t* s = from;
  uint8_t* const e = end;

  do { memcpy(d, s, 8); d += 8; s += 8; } while (d < e);
}
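
// Note that wild_copy() copies in 8-byte chunks, so it may write up to 7
// bytes past `end`; callers must leave that much slack (hence the
// `op_limit - op >= len + 8` check in blosclz_decompress() below).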

int blosclz_decompress(const void* input, int length, void* output, int maxout) {
  const uint8_t* ip = (const uint8_t*)input;
  const uint8_t* ip_limit = ip + length;
  uint8_t* op = (uint8_t*)output;
  uint32_t ctrl;
  uint8_t* op_limit = op + maxout;
  if (BLOSCLZ_UNLIKELY(length == 0)) {
    return 0;
  }
  ctrl = (*ip++) & 31U;

  while (1) {
    if (ctrl >= 32) {
      // match
      int32_t len = (ctrl >> 5U) - 1;
      int32_t ofs = (ctrl & 31U) << 8U;
      uint8_t code;
      const uint8_t* ref = op - ofs;

      if (len == 7 - 1) {
        do {
          if (BLOSCLZ_UNLIKELY(ip + 1 >= ip_limit)) {
            return 0;
          }
          code = *ip++;
          len += code;
        } while (code == 255);
      }
      else {
        if (BLOSCLZ_UNLIKELY(ip + 1 >= ip_limit)) {
          return 0;
        }
      }
      code = *ip++;
      len += 3;
      ref -= code;

      /* match from 16-bit distance */
      if (BLOSCLZ_UNLIKELY(code == 255)) {
        if (ofs == (31U << 8U)) {
          if (ip + 1 >= ip_limit) {
            return 0;
          }
          ofs = (*ip++) << 8U;
          ofs += *ip++;
          ref = op - ofs - MAX_DISTANCE;
        }
      }

      if (BLOSCLZ_UNLIKELY(op + len > op_limit)) {
        return 0;
      }

      if (BLOSCLZ_UNLIKELY(ref - 1 < (uint8_t*)output)) {
        return 0;
      }

      if (BLOSCLZ_UNLIKELY(ip >= ip_limit)) break;
      ctrl = *ip++;

      ref--;
      if (ref == op - 1) {
        /* optimized copy for a run */
        memset(op, *ref, len);
        op += len;
      }
      else if ((op - ref >= 8) && (op_limit - op >= len + 8)) {
        // copy with an overlap not larger than 8
        wild_copy(op, ref, op + len);
        op += len;
      }
      else {
        // general copy with any overlap
#if defined(__AVX2__)
        if (op - ref <= 16) {
          // This is not faster on a combination of compilers (clang, gcc, icc) or machines, but
          // it is not slower either.  Let's activate it here for experimentation.
          op = copy_match_16(op, ref, len);
        }
        else {
#endif
          op = copy_match(op, ref, (unsigned) len);
#if defined(__AVX2__)
        }
#endif
      }
    }
    else {
      // literal
      ctrl++;
      if (BLOSCLZ_UNLIKELY(op + ctrl > op_limit)) {
        return 0;
      }
      if (BLOSCLZ_UNLIKELY(ip + ctrl > ip_limit)) {
        return 0;
      }

      memcpy(op, ip, ctrl); op += ctrl; ip += ctrl;
      // On GCC 6, fastcopy is still faster than plain memcpy.
      // However, with recent CLANG/LLVM 9.0 there is almost no difference
      // in performance.
      // And starting with CLANG/LLVM 10 and GCC 9, memcpy is generally faster.
      // op = fastcopy(op, ip, (unsigned) ctrl); ip += ctrl;

      if (BLOSCLZ_UNLIKELY(ip >= ip_limit)) break;
      ctrl = *ip++;
    }
  }

  return (int)(op - (uint8_t*)output);
}