/*********************************************************************
  Blosc - Blocked Shuffling and Compression Library

  Copyright (C) 2021  The Blosc Developers <blosc@blosc.org>
  https://blosc.org
  License: BSD 3-Clause (see LICENSE.txt)

  See LICENSE.txt for details about copyright and rights to use.
**********************************************************************/

/*********************************************************************
  The code in this file is heavily based on FastLZ, a lightning-fast
  lossless compression library.  See LICENSES/FASTLZ.txt for details.
**********************************************************************/


#include <stdio.h>
#include <stdbool.h>
#include "blosclz.h"
#include "fastcopy.h"
#include "blosc2/blosc2-common.h"


/*
 * Give hints to the compiler for branch prediction optimization.
 */
#if defined(__GNUC__) && (__GNUC__ > 2)
#define BLOSCLZ_LIKELY(c)    (__builtin_expect((c), 1))
#define BLOSCLZ_UNLIKELY(c)  (__builtin_expect((c), 0))
#else
#define BLOSCLZ_LIKELY(c)    (c)
#define BLOSCLZ_UNLIKELY(c)  (c)
#endif

/*
 * Use inlined functions for supported systems.
 */
#if defined(_MSC_VER) && !defined(__cplusplus)   /* Visual Studio */
#define inline __inline  /* Visual C is not C99, but supports some kind of inline */
#endif

#define MAX_COPY 32U
#define MAX_DISTANCE 8191
#define MAX_FARDISTANCE (65535 + MAX_DISTANCE - 1)
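
/*
 * Distance encoding is two-tiered (see the MATCH_* macros below): distances
 * up to MAX_DISTANCE (8191 = 2^13 - 1) fit in the 5 spare bits of a token
 * byte plus one extra byte, whereas "far" distances are re-biased by
 * MAX_DISTANCE and carry a full 16-bit offset in two additional bytes,
 * for a total reach of 8190 + 65535 = MAX_FARDISTANCE bytes.
 */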

#ifdef BLOSC_STRICT_ALIGN
  #define BLOSCLZ_READU16(p) ((p)[0] | (p)[1]<<8)
  #define BLOSCLZ_READU32(p) ((p)[0] | (p)[1]<<8 | (p)[2]<<16 | (p)[3]<<24)
#else
  #define BLOSCLZ_READU16(p) *((const uint16_t*)(p))
  #define BLOSCLZ_READU32(p) *((const uint32_t*)(p))
#endif
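
/*
 * The strict-align variants assemble values bytewise (little-endian and free
 * of unaligned accesses), while the default variants rely on the platform
 * tolerating unaligned 16/32-bit loads.
 */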

#define HASH_LOG (14U)
#define HASH_LOG2 (12U)

// This is used in LZ4 and seems to work pretty well here too
#define HASH_FUNCTION(v, s, h) {      \
  v = (s * 2654435761U) >> (32U - h); \
}
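
/*
 * This is multiplicative (Fibonacci) hashing: 2654435761 is a prime close to
 * 2^32 / phi (the golden ratio), and keeping only the top h bits of the
 * 32-bit product spreads nearby 4-byte sequences well across the table.
 * E.g., with h = HASH_LOG2 (12) the result indexes a 4096-entry table.
 */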


#if defined(__AVX2__)
static uint8_t *get_run_32(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
    uint8_t x = ip[-1];

    while (ip < (ip_bound - (sizeof(__m256i)))) {
        __m256i value, value2, cmp;
        /* Broadcast the value for every byte in a 256-bit register */
        memset(&value, x, sizeof(__m256i));
        value2 = _mm256_loadu_si256((__m256i *)ref);
        cmp = _mm256_cmpeq_epi64(value, value2);
        if ((unsigned)_mm256_movemask_epi8(cmp) != 0xFFFFFFFF) {
            /* Return the byte that starts to differ */
            while (*ref++ == x) ip++;
            return ip;
        }
        else {
            ip += sizeof(__m256i);
            ref += sizeof(__m256i);
        }
    }
    /* Look into the remainder */
    while ((ip < ip_bound) && (*ref++ == x)) ip++;
    return ip;
}
#endif

#if defined(__SSE2__)
static uint8_t *get_run_16(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
  uint8_t x = ip[-1];

  while (ip < (ip_bound - sizeof(__m128i))) {
    __m128i value, value2, cmp;
    /* Broadcast the value for every byte in a 128-bit register */
    memset(&value, x, sizeof(__m128i));
    value2 = _mm_loadu_si128((__m128i *)ref);
    cmp = _mm_cmpeq_epi32(value, value2);
    if (_mm_movemask_epi8(cmp) != 0xFFFF) {
      /* Return the byte that starts to differ */
      while (*ref++ == x) ip++;
      return ip;
    }
    else {
      ip += sizeof(__m128i);
      ref += sizeof(__m128i);
    }
  }
  /* Look into the remainder */
  while ((ip < ip_bound) && (*ref++ == x)) ip++;
  return ip;
}

#endif


static uint8_t *get_run(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
  uint8_t x = ip[-1];
  int64_t value, value2;
  /* Broadcast the value for every byte in a 64-bit register */
  memset(&value, x, 8);
  /* safe because of the outer check against the ip limit */
  while (ip < (ip_bound - sizeof(int64_t))) {
#if defined(BLOSC_STRICT_ALIGN)
    memcpy(&value2, ref, 8);
#else
    value2 = ((int64_t*)ref)[0];
#endif
    if (value != value2) {
      /* Return the byte that starts to differ */
      while (*ref++ == x) ip++;
      return ip;
    }
    else {
      ip += 8;
      ref += 8;
    }
  }
  /* Look into the remainder */
  while ((ip < ip_bound) && (*ref++ == x)) ip++;
  return ip;
}


/* Return the byte that starts to differ */
static uint8_t *get_match(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
#if !defined(BLOSC_STRICT_ALIGN)
  while (ip < (ip_bound - sizeof(int64_t))) {
    if (*(int64_t*)ref != *(int64_t*)ip) {
      /* Return the byte that starts to differ */
      while (*ref++ == *ip++) {}
      return ip;
    }
    else {
      ip += sizeof(int64_t);
      ref += sizeof(int64_t);
    }
  }
#endif
  /* Look into the remainder */
  while ((ip < ip_bound) && (*ref++ == *ip++)) {}
  return ip;
}


#if defined(__SSE2__)
static uint8_t *get_match_16(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
  __m128i value, value2, cmp;

  while (ip < (ip_bound - sizeof(__m128i))) {
    value = _mm_loadu_si128((__m128i *) ip);
    value2 = _mm_loadu_si128((__m128i *) ref);
    cmp = _mm_cmpeq_epi32(value, value2);
    if (_mm_movemask_epi8(cmp) != 0xFFFF) {
      /* Return the byte that starts to differ */
      while (*ref++ == *ip++) {}
      return ip;
    }
    else {
      ip += sizeof(__m128i);
      ref += sizeof(__m128i);
    }
  }
  /* Look into the remainder */
  while ((ip < ip_bound) && (*ref++ == *ip++)) {}
  return ip;
}
#endif


#if defined(__AVX2__)
static uint8_t *get_match_32(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {

  while (ip < (ip_bound - sizeof(__m256i))) {
    __m256i value, value2, cmp;
    value = _mm256_loadu_si256((__m256i *) ip);
    value2 = _mm256_loadu_si256((__m256i *)ref);
    cmp = _mm256_cmpeq_epi64(value, value2);
    if ((unsigned)_mm256_movemask_epi8(cmp) != 0xFFFFFFFF) {
      /* Return the byte that starts to differ */
      while (*ref++ == *ip++) {}
      return ip;
    }
    else {
      ip += sizeof(__m256i);
      ref += sizeof(__m256i);
    }
  }
  /* Look into the remainder */
  while ((ip < ip_bound) && (*ref++ == *ip++)) {}
  return ip;
}
#endif


static uint8_t* get_run_or_match(uint8_t* ip, uint8_t* ip_bound, const uint8_t* ref, bool run) {
  if (BLOSCLZ_UNLIKELY(run)) {
#if defined(__AVX2__)
    // Extensive experiments on AMD Ryzen3 say that regular get_run is faster
    // ip = get_run_32(ip, ip_bound, ref);
    ip = get_run(ip, ip_bound, ref);
#elif defined(__SSE2__)
    // Extensive experiments on AMD Ryzen3 say that regular get_run is faster
    // ip = get_run_16(ip, ip_bound, ref);
    ip = get_run(ip, ip_bound, ref);
#else
    ip = get_run(ip, ip_bound, ref);
#endif
  }
  else {
#if defined(__AVX2__)
    // Extensive experiments on AMD Ryzen3 say that regular get_match_16 is faster
    // ip = get_match_32(ip, ip_bound, ref);
    ip = get_match_16(ip, ip_bound, ref);
#elif defined(__SSE2__)
    ip = get_match_16(ip, ip_bound, ref);
#else
    ip = get_match(ip, ip_bound, ref);
#endif
  }

  return ip;
}
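
/*
 * A "run" is the special case of distance == 1 (0 after biasing): every
 * matched byte repeats the byte just before the match, so the get_run*
 * variants can compare against a single broadcast byte (ip[-1]) instead of
 * a sliding reference.  This is why callers pass run = !distance after
 * biasing the distance.
 */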


#define LITERAL(ip, op, op_limit, anchor, copy) {       \
  if (BLOSCLZ_UNLIKELY(op + 2 > op_limit))              \
    goto out;                                           \
  *op++ = *anchor++;                                    \
  ip = anchor;                                          \
  copy++;                                               \
  if (BLOSCLZ_UNLIKELY(copy == MAX_COPY)) {             \
    copy = 0;                                           \
    *op++ = MAX_COPY-1;                                 \
  }                                                     \
}
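
/*
 * Literal runs are encoded as a control byte holding (count - 1) in its low
 * 5 bits, followed by the literals themselves; after MAX_COPY (32) literals
 * a fresh control byte is emitted and the counter restarts.  E.g., 70
 * literals are emitted as chunks of 32 + 32 + 6.
 */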

#define LITERAL2(ip, anchor, copy) {                    \
  oc++; anchor++;                                       \
  ip = anchor;                                          \
  copy++;                                               \
  if (BLOSCLZ_UNLIKELY(copy == MAX_COPY)) {             \
    copy = 0;                                           \
    oc++;                                               \
  }                                                     \
}

#define MATCH_SHORT(op, op_limit, len, distance) {      \
  if (BLOSCLZ_UNLIKELY(op + 2 > op_limit))              \
    goto out;                                           \
  *op++ = (uint8_t)((len << 5U) + (distance >> 8U));    \
  *op++ = (uint8_t)((distance & 255U));                 \
}

#define MATCH_LONG(op, op_limit, len, distance) {       \
  if (BLOSCLZ_UNLIKELY(op + 1 > op_limit))              \
    goto out;                                           \
  *op++ = (uint8_t)((7U << 5U) + (distance >> 8U));     \
  for (len -= 7; len >= 255; len -= 255) {              \
    if (BLOSCLZ_UNLIKELY(op + 1 > op_limit))            \
      goto out;                                         \
    *op++ = 255;                                        \
  }                                                     \
  if (BLOSCLZ_UNLIKELY(op + 2 > op_limit))              \
    goto out;                                           \
  *op++ = (uint8_t)len;                                 \
  *op++ = (uint8_t)((distance & 255U));                 \
}

#define MATCH_SHORT_FAR(op, op_limit, len, distance) {      \
  if (BLOSCLZ_UNLIKELY(op + 4 > op_limit))                  \
    goto out;                                               \
  *op++ = (uint8_t)((len << 5U) + 31);                      \
  *op++ = 255;                                              \
  *op++ = (uint8_t)(distance >> 8U);                        \
  *op++ = (uint8_t)(distance & 255U);                       \
}

#define MATCH_LONG_FAR(op, op_limit, len, distance) {       \
  if (BLOSCLZ_UNLIKELY(op + 1 > op_limit))                  \
    goto out;                                               \
  *op++ = (7U << 5U) + 31;                                  \
  for (len -= 7; len >= 255; len -= 255) {                  \
    if (BLOSCLZ_UNLIKELY(op + 1 > op_limit))                \
      goto out;                                             \
    *op++ = 255;                                            \
  }                                                         \
  if (BLOSCLZ_UNLIKELY(op + 4 > op_limit))                  \
    goto out;                                               \
  *op++ = (uint8_t)len;                                     \
  *op++ = 255;                                              \
  *op++ = (uint8_t)(distance >> 8U);                        \
  *op++ = (uint8_t)(distance & 255U);                       \
}
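
/*
 * Match token layout, as emitted by the macros above (len is the biased
 * match length; see the compressor comments below):
 *
 *   short match:     [lll ddddd] [dddddddd]                  len field in 1..6
 *   long match:      [111 ddddd] [255]... [len] [dddddddd]   len >= 7
 *   short far match: [lll 11111] [255] [dist_hi] [dist_lo]
 *   long far match:  [111 11111] [255]... [len] [255] [dist_hi] [dist_lo]
 *
 * A 5-bit distance field of all ones (31) followed by a 255 distance byte
 * is the escape telling the decompressor a full 16-bit far distance follows.
 */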


// Get a guess for the compressed size of a buffer
static double get_cratio(uint8_t* ibase, int maxlen, int minlen, int ipshift) {
  uint8_t* ip = ibase;
  int32_t oc = 0;
  const uint16_t hashlen = (1U << (uint8_t)HASH_LOG2);
  uint16_t htab[1U << (uint8_t)HASH_LOG2];
  uint32_t hval;
  uint32_t seq;
  uint8_t copy;
  // Make a tradeoff between testing too much and too little
  uint16_t limit = (maxlen > hashlen) ? hashlen : maxlen;
  uint8_t* ip_bound = ibase + limit - 1;
  uint8_t* ip_limit = ibase + limit - 12;

  // Initialize the hash table to distances of 0
  memset(htab, 0, hashlen * sizeof(uint16_t));

  /* we start with literal copy */
  copy = 4;
  oc += 5;

  /* main loop */
  while (BLOSCLZ_LIKELY(ip < ip_limit)) {
    const uint8_t* ref;
    unsigned distance;
    uint8_t* anchor = ip;    /* comparison starting-point */

    /* find potential match */
    seq = BLOSCLZ_READU32(ip);
    HASH_FUNCTION(hval, seq, HASH_LOG2)
    ref = ibase + htab[hval];

    /* calculate distance to the match */
    distance = (unsigned int)(anchor - ref);

    /* update hash table */
    htab[hval] = (uint16_t) (anchor - ibase);

    if (distance == 0 || (distance >= MAX_FARDISTANCE)) {
      LITERAL2(ip, anchor, copy)
      continue;
    }

    /* is this a match? check the first 4 bytes */
    if (BLOSCLZ_READU32(ref) == BLOSCLZ_READU32(ip)) {
      ref += 4;
    }
    else {
      /* no luck, copy as a literal */
      LITERAL2(ip, anchor, copy)
      continue;
    }

    /* last matched byte */
    ip = anchor + 4;

    /* distance is biased */
    distance--;

    /* get runs or matches; zero distance means a run */
    ip = get_run_or_match(ip, ip_bound, ref, !distance);

    ip -= ipshift;
    unsigned len = (int)(ip - anchor);
    if (len < minlen) {
      LITERAL2(ip, anchor, copy)
      continue;
    }

    /* if we haven't copied anything, adjust the output counter */
    if (!copy)
      oc--;
    /* reset literal counter */
    copy = 0;

    /* encode the match */
    if (distance < MAX_DISTANCE) {
      if (len >= 7) {
        oc += ((len - 7) / 255) + 1;
      }
      oc += 2;
    }
    else {
      /* far away, but not yet in another galaxy... */
      if (len >= 7) {
        oc += ((len - 7) / 255) + 1;
      }
      oc += 4;
    }

    /* update the hash at match boundary */
    seq = BLOSCLZ_READU32(ip);
    HASH_FUNCTION(hval, seq, HASH_LOG2)
    htab[hval] = (uint16_t)(ip++ - ibase);
    ip++;
    /* assuming literal copy */
    oc++;
  }

  double ic;
out:
  ic = (double)(ip - ibase);
  return ic / (double)oc;
}
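
/*
 * Note that the value returned above is (bytes scanned) / (estimated
 * compressed bytes), so e.g. 1.0 means the probed region looks
 * incompressible under this model.  At most 4096 bytes (1 << HASH_LOG2)
 * are examined per probe.
 */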


int blosclz_compress(const int clevel, const void* input, int length,
                     void* output, int maxout, blosc2_context* ctx) {
  uint8_t* ibase = (uint8_t*)input;

  // Experiments say that checking 1/4 of the buffer is enough to figure out the approximate cratio
  int maxlen = length / 4;
  // Start probing somewhere inside the buffer
  int shift = length - maxlen;
  // Actual entropy probing!
  double cratio = get_cratio(ibase + shift, maxlen, 3, 3);
  // discard probes with small compression ratios (too expensive)
  double cratio_[10] = {0, 2, 1.5, 1.2, 1.2, 1.2, 1.2, 1.15, 1.1, 1.0};
  if (cratio < cratio_[clevel]) {
      goto out;
  }

  /* When we go back in a match (shift), we obtain quite different compression properties.
   * It looks like 4 is more useful in combination with bitshuffle and small typesizes.
   * Fall back to 4 because it provides more consistent results for large cratios.
   *
   * In this block we also check cratios for the beginning of the buffers and
   * eventually discard those that are small (they take too long to decompress).
   * This process is called _entropy probing_.
   */
  unsigned ipshift = 4;
  // Compute the optimal shift and minimum lengths for encoding.
  // Use 4 by default, except for low entropy data, where we should do a best effort.
  unsigned minlen = 4;
  // BloscLZ works better with splits mostly, so when data is not split, do a best effort
  const int split_block = !((ctx->header_flags & 0x10) >> 4);
  // Using cratio < 4 here is based on experiments with low- and high-entropy data
  if (!split_block || cratio < 4) {
    ipshift = 3;
    minlen = 3;
  }
  else {
    minlen = 4;
  }

  uint8_t hashlog_[10] = {0, HASH_LOG - 2, HASH_LOG - 1, HASH_LOG, HASH_LOG,
                          HASH_LOG, HASH_LOG, HASH_LOG, HASH_LOG, HASH_LOG};
  uint8_t hashlog = hashlog_[clevel];

  uint8_t* ip = ibase;
  uint8_t* ip_bound = ibase + length - 1;
  uint8_t* ip_limit = ibase + length - 12;
  uint8_t* op = (uint8_t*)output;
  const uint8_t* op_limit = op + maxout;
  uint32_t seq;
  uint8_t copy;
  uint32_t hval;

  /* the input and output buffers cannot be smaller than 16 and 66 bytes, respectively, or we can get into trouble */
  if (length < 16 || maxout < 66) {
    return 0;
  }

  // Initialize the hash table
  uint32_t htab[1U << (uint8_t)HASH_LOG];
  memset(htab, 0, (1U << hashlog) * sizeof(uint32_t));

  /* we start with literal copy */
  copy = 4;
  *op++ = MAX_COPY - 1;
  *op++ = *ip++;
  *op++ = *ip++;
  *op++ = *ip++;
  *op++ = *ip++;

  /* main loop */
  while (BLOSCLZ_LIKELY(ip < ip_limit)) {
    const uint8_t* ref;
    unsigned distance;
    uint8_t* anchor = ip;    /* comparison starting-point */

    /* find potential match */
    seq = BLOSCLZ_READU32(ip);
    HASH_FUNCTION(hval, seq, hashlog)
    ref = ibase + htab[hval];

    /* calculate distance to the match */
    distance = (unsigned int)(anchor - ref);

    /* update hash table */
    htab[hval] = (uint32_t) (anchor - ibase);

    if (distance == 0 || (distance >= MAX_FARDISTANCE)) {
      LITERAL(ip, op, op_limit, anchor, copy)
      continue;
    }

    /* is this a match? check the first 4 bytes */
    if (BLOSCLZ_UNLIKELY(BLOSCLZ_READU32(ref) == BLOSCLZ_READU32(ip))) {
      ref += 4;
    } else {
      /* no luck, copy as a literal */
      LITERAL(ip, op, op_limit, anchor, copy)
      continue;
    }

    /* last matched byte */
    ip = anchor + 4;

    /* distance is biased */
    distance--;

    /* get runs or matches; zero distance means a run */
    ip = get_run_or_match(ip, ip_bound, ref, !distance);

    /* length is biased, '1' means a match of 3 bytes */
    ip -= ipshift;

    unsigned len = (int)(ip - anchor);

    // Encoding short lengths is expensive during decompression
    if (len < minlen || (len <= 5 && distance >= MAX_DISTANCE)) {
      LITERAL(ip, op, op_limit, anchor, copy)
      continue;
    }

    /* if we have copied something, adjust the copy count */
    if (copy)
      /* copy is biased, '0' means 1 byte copy */
      *(op - copy - 1) = (uint8_t)(copy - 1);
    else
      /* back, to overwrite the copy count */
      op--;
    /* reset literal counter */
    copy = 0;

    /* encode the match */
    if (distance < MAX_DISTANCE) {
      if (len < 7) {
        MATCH_SHORT(op, op_limit, len, distance)
      } else {
        MATCH_LONG(op, op_limit, len, distance)
      }
    } else {
      /* far away, but not yet in another galaxy... */
      distance -= MAX_DISTANCE;
      if (len < 7) {
        MATCH_SHORT_FAR(op, op_limit, len, distance)
      } else {
        MATCH_LONG_FAR(op, op_limit, len, distance)
      }
    }

    /* update the hash at match boundary */
    seq = BLOSCLZ_READU32(ip);
    HASH_FUNCTION(hval, seq, hashlog)
    htab[hval] = (uint32_t) (ip++ - ibase);
    if (ctx->clevel == 9) {
      // In some situations, including a second hash proves to be useful,
      // but not in others.  Activating here in max clevel only.
      seq >>= 8U;
      HASH_FUNCTION(hval, seq, hashlog)
      htab[hval] = (uint32_t) (ip++ - ibase);
    }
    else {
      ip++;
    }

    if (BLOSCLZ_UNLIKELY(op + 1 > op_limit))
      goto out;

    /* assuming literal copy */
    *op++ = MAX_COPY - 1;
  }

  /* left-over as literal copy */
  while (BLOSCLZ_UNLIKELY(ip <= ip_bound)) {
    if (BLOSCLZ_UNLIKELY(op + 2 > op_limit)) goto out;
    *op++ = *ip++;
    copy++;
    if (BLOSCLZ_UNLIKELY(copy == MAX_COPY)) {
      copy = 0;
      *op++ = MAX_COPY - 1;
    }
  }

  /* if we have copied something, adjust the copy length */
  if (copy)
    *(op - copy - 1) = (uint8_t)(copy - 1);
  else
    op--;

  /* marker for blosclz */
  *(uint8_t*)output |= (1U << 5U);

  return (int)(op - (uint8_t*)output);

  out:
  return 0;
}
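
/*
 * blosclz_compress() and blosclz_decompress() are internal codec entry
 * points, normally driven by the Blosc2 machinery rather than called
 * directly.  A minimal round-trip sketch (hypothetical sizes; a real
 * blosc2_context would come from the Blosc2 core):
 *
 *   uint8_t in[1024];          // data to compress
 *   uint8_t out[1024 + 66];    // remember: maxout must be >= 66
 *   uint8_t back[1024];
 *   int csize = blosclz_compress(5, in, sizeof(in), out, sizeof(out), ctx);
 *   if (csize > 0)
 *     blosclz_decompress(out, csize, back, sizeof(back));
 *
 * A return value of 0 means "give up" (the estimated cratio was too low, or
 * the output did not fit in maxout) for the compressor, and a corrupted or
 * overflowing stream for the decompressor.
 */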

// See https://habr.com/en/company/yandex/blog/457612/
#if defined(__AVX2__)

#if defined(_MSC_VER)
#define ALIGNED_(x) __declspec(align(x))
#else
#if defined(__GNUC__)
#define ALIGNED_(x) __attribute__ ((aligned(x)))
#endif
#endif
#define ALIGNED_TYPE_(t, x) t ALIGNED_(x)
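
/*
 * copy_match_16() below implements the overlap-aware SIMD copy described in
 * the Yandex post linked above: for each op - match offset up to 16, a
 * per-offset pshufb mask replicates the available bytes across a 16-byte
 * register (e.g. the offset-1 row is all zeros, broadcasting one byte), and
 * row 0 of the table doubles as the amount to advance `match` after each
 * 16-byte store.
 */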

static unsigned char* copy_match_16(unsigned char *op, const unsigned char *match, int32_t len)
{
  size_t offset = op - match;
  while (len >= 16) {

    static const ALIGNED_TYPE_(uint8_t, 16) masks[] =
      {
                0,  1,  2,  1,  4,  1,  4,  2,  8,  7,  6,  5,  4,  3,  2,  1, // offset = 0, not used as mask, but for shift
                0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, // offset = 1
                0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,
                0,  1,  2,  0,  1,  2,  0,  1,  2,  0,  1,  2,  0,  1,  2,  0,
                0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3,
                0,  1,  2,  3,  4,  0,  1,  2,  3,  4,  0,  1,  2,  3,  4,  0,
                0,  1,  2,  3,  4,  5,  0,  1,  2,  3,  4,  5,  0,  1,  2,  3,
                0,  1,  2,  3,  4,  5,  6,  0,  1,  2,  3,  4,  5,  6,  0,  1,
                0,  1,  2,  3,  4,  5,  6,  7,  0,  1,  2,  3,  4,  5,  6,  7,
                0,  1,  2,  3,  4,  5,  6,  7,  8,  0,  1,  2,  3,  4,  5,  6,
                0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  0,  1,  2,  3,  4,  5,
                0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10,  0,  1,  2,  3,  4,
                0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11,  0,  1,  2,  3,
                0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12,  0,  1,  2,
                0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13,  0,  1,
                0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,  0,
                0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,  15, // offset = 16
      };

    _mm_storeu_si128((__m128i *)(op),
                     _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(match)),
                                      _mm_load_si128((const __m128i *)(masks) + offset)));

    match += masks[offset];

    op += 16;
    len -= 16;
  }
  // Deal with remainders
  for (; len > 0; len--) {
    *op++ = *match++;
  }
  return op;
}
#endif

// LZ4 wildCopy which can reach excellent copy bandwidth (even if insecure)
static inline void wild_copy(uint8_t *out, const uint8_t* from, uint8_t* end) {
  uint8_t* d = out;
  const uint8_t* s = from;
  uint8_t* const e = end;

  do { memcpy(d,s,8); d+=8; s+=8; } while (d<e);
}
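
/*
 * Beware: wild_copy() writes in whole 8-byte chunks, so it can overrun
 * `end` by up to 7 bytes; the caller below only uses it when
 * op_limit - op >= len + 8, which keeps that overrun inside the output
 * buffer.
 */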

int blosclz_decompress(const void* input, int length, void* output, int maxout) {
  const uint8_t* ip = (const uint8_t*)input;
  const uint8_t* ip_limit = ip + length;
  uint8_t* op = (uint8_t*)output;
  uint32_t ctrl;
  uint8_t* op_limit = op + maxout;
  if (BLOSCLZ_UNLIKELY(length == 0)) {
    return 0;
  }
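  /* The high bits of the first byte carry the blosclz marker stored at the
     end of blosclz_compress() (1U << 5U), so mask with 31U to recover only
     the initial literal-run control. */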
  ctrl = (*ip++) & 31U;

  while (1) {
    if (ctrl >= 32) {
      // match
      int32_t len = (ctrl >> 5U) - 1;
      int32_t ofs = (ctrl & 31U) << 8U;
      uint8_t code;
      const uint8_t* ref = op - ofs;

      if (len == 7 - 1) {
        do {
          if (BLOSCLZ_UNLIKELY(ip + 1 >= ip_limit)) {
            return 0;
          }
          code = *ip++;
          len += code;
        } while (code == 255);
      }
      else {
        if (BLOSCLZ_UNLIKELY(ip + 1 >= ip_limit)) {
          return 0;
        }
      }
      code = *ip++;
      len += 3;
      ref -= code;

      /* match from 16-bit distance */
      if (BLOSCLZ_UNLIKELY(code == 255)) {
        if (ofs == (31U << 8U)) {
          if (ip + 1 >= ip_limit) {
            return 0;
          }
          ofs = (*ip++) << 8U;
          ofs += *ip++;
          ref = op - ofs - MAX_DISTANCE;
        }
      }

      if (BLOSCLZ_UNLIKELY(op + len > op_limit)) {
        return 0;
      }

      if (BLOSCLZ_UNLIKELY(ref - 1 < (uint8_t*)output)) {
        return 0;
      }

      if (BLOSCLZ_UNLIKELY(ip >= ip_limit)) break;
      ctrl = *ip++;

      ref--;
      if (ref == op - 1) {
        /* optimized copy for a run */
        memset(op, *ref, len);
        op += len;
      }
      else if ((op - ref >= 8) && (op_limit - op >= len + 8)) {
        // copy with an overlap not larger than 8
        wild_copy(op, ref, op + len);
        op += len;
      }
      else {
        // general copy with any overlap
#if defined(__AVX2__)
        if (op - ref <= 16) {
          // This is not faster on a combination of compilers (clang, gcc, icc) or machines, but
          // it is not slower either.  Let's activate here for experimentation.
          op = copy_match_16(op, ref, len);
        }
        else {
#endif
          op = copy_match(op, ref, (unsigned) len);
#if defined(__AVX2__)
        }
#endif
      }
    }
    else {
      // literal
      ctrl++;
      if (BLOSCLZ_UNLIKELY(op + ctrl > op_limit)) {
        return 0;
      }
      if (BLOSCLZ_UNLIKELY(ip + ctrl > ip_limit)) {
        return 0;
      }

      memcpy(op, ip, ctrl); op += ctrl; ip += ctrl;
      // On GCC 6, fastcopy is still faster than plain memcpy.
      // However, with recent CLANG/LLVM 9.0 there is almost no difference
      // in performance, and starting with CLANG/LLVM 10 and GCC 9, memcpy
      // is generally faster.
      // op = fastcopy(op, ip, (unsigned) ctrl); ip += ctrl;

      if (BLOSCLZ_UNLIKELY(ip >= ip_limit)) break;
      ctrl = *ip++;
    }
  }

  return (int)(op - (uint8_t*)output);
}