1 /*********************************************************************
2   Blosc - Blocked Shuffling and Compression Library
3 
4   Author: Francesc Alted <francesc@blosc.org>
5   Creation date: 2009-05-20
6 
7   See LICENSE.txt for details about copyright and rights to use.
8 **********************************************************************/
9 
10 /*********************************************************************
11   The code in this file is heavily based on FastLZ, a lightning-fast
12   lossless compression library.  See LICENSES/FASTLZ.txt for details.
13 **********************************************************************/
14 
15 
16 #include <stdio.h>
17 #include <stdbool.h>
18 #include "blosclz.h"
19 #include "fastcopy.h"
20 #include "blosc-common.h"
21 
22 
23 /*
24  * Give hints to the compiler for branch prediction optimization.
25  */
26 #if defined(__GNUC__) && (__GNUC__ > 2)
27 #define BLOSCLZ_LIKELY(c)    (__builtin_expect((c), 1))
28 #define BLOSCLZ_UNLIKELY(c)  (__builtin_expect((c), 0))
29 #else
30 #define BLOSCLZ_LIKELY(c)    (c)
31 #define BLOSCLZ_UNLIKELY(c)  (c)
32 #endif
33 
34 /*
35  * Use inlined functions for supported systems.
36  */
37 #if defined(_MSC_VER) && !defined(__cplusplus)   /* Visual Studio */
38 #define inline __inline  /* Visual C is not C99, but supports some kind of inline */
39 #endif
40 
/* Maximum number of literals per copy token (the count is stored biased,
 * so the 5-bit field tops out at MAX_COPY - 1). */
#define MAX_COPY 32U
/* Largest match distance encodable with the short (13-bit) form. */
#define MAX_DISTANCE 8191
/* Largest distance encodable at all (far form: 16 extra distance bits,
 * biased by MAX_DISTANCE). */
#define MAX_FARDISTANCE (65535 + MAX_DISTANCE - 1)

/* Unaligned little-endian reads: byte-by-byte on strict-alignment targets,
 * a single (possibly unaligned) load otherwise. */
#ifdef BLOSC_STRICT_ALIGN
  #define BLOSCLZ_READU16(p) ((p)[0] | (p)[1]<<8)
  #define BLOSCLZ_READU32(p) ((p)[0] | (p)[1]<<8 | (p)[2]<<16 | (p)[3]<<24)
#else
  #define BLOSCLZ_READU16(p) *((const uint16_t*)(p))
  #define BLOSCLZ_READU32(p) *((const uint32_t*)(p))
#endif

/* log2 of the number of hash-table slots. */
#define HASH_LOG (12U)

// This is used in LZ4 and seems to work pretty well here too
/* Multiplicative (Knuth-style) hash of the 4-byte sequence `s` into `h`
 * bits, stored into `v`.  Braced rather than do/while on purpose: call
 * sites invoke it without a trailing semicolon. */
#define HASH_FUNCTION(v, s, h) {      \
  v = (s * 2654435761U) >> (32U - h); \
}
59 
60 
61 #if defined(__AVX2__)
/* AVX2 run scanner: count how many consecutive bytes at `ref` equal the
 * run byte ip[-1], advancing `ip` one position per matching byte and never
 * moving it past `ip_bound`.  Probes 8, then 16 bytes before settling into
 * a 32-byte main loop.  Returns the advanced `ip`. */
static uint8_t *get_run_32(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
    uint8_t x = ip[-1];
    /* safe because the outer check against ip limit */
    if (ip < (ip_bound - sizeof(int64_t))) {
        int64_t value, value2;
        /* Broadcast the value for every byte in a 64-bit register */
        memset(&value, x, 8);
#if defined(BLOSC_STRICT_ALIGN)
        memcpy(&value2, ref, 8);
#else
        value2 = ((int64_t*)ref)[0];
#endif
        if (value != value2) {
            /* Return the byte that starts to differ; the mismatch is inside
             * this 8-byte window, so the byte loop cannot overrun */
            while (*ref++ == x) ip++;
            return ip;
        }
        else {
            ip += 8;
            ref += 8;
        }
    }
    if (ip < (ip_bound - sizeof(__m128i))) {
        __m128i value, value2, cmp;
        /* Broadcast the value for every byte in a 128-bit register */
        memset(&value, x, sizeof(__m128i));
        value2 = _mm_loadu_si128((__m128i *) ref);
        /* 32-bit lane equality suffices: a lane is equal iff all its bytes
         * are, and movemask == 0xFFFF means every lane matched */
        cmp = _mm_cmpeq_epi32(value, value2);
        if (_mm_movemask_epi8(cmp) != 0xFFFF) {
            /* Return the byte that starts to differ */
            while (*ref++ == x) ip++;
            return ip;
        } else {
            ip += sizeof(__m128i);
            ref += sizeof(__m128i);
        }
    }
    while (ip < (ip_bound - (sizeof(__m256i)))) {
        __m256i value, value2, cmp;
        /* Broadcast the value for every byte in a 256-bit register */
        memset(&value, x, sizeof(__m256i));
        value2 = _mm256_loadu_si256((__m256i *)ref);
        cmp = _mm256_cmpeq_epi64(value, value2);
        if ((unsigned)_mm256_movemask_epi8(cmp) != 0xFFFFFFFF) {
            /* Return the byte that starts to differ */
            while (*ref++ == x) ip++;
            return ip;
        }
        else {
            ip += sizeof(__m256i);
            ref += sizeof(__m256i);
        }
    }
    /* Look into the remainder */
    while ((ip < ip_bound) && (*ref++ == x)) ip++;
    return ip;
}
119 
120 #elif defined(__SSE2__)
121 
/* SSE2 run scanner: count how many consecutive bytes at `ref` equal the
 * run byte ip[-1], advancing `ip` per matching byte, never past `ip_bound`.
 * Probes 8 bytes, then loops in 16-byte chunks.  Returns the advanced ip. */
static uint8_t *get_run_16(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
  uint8_t x = ip[-1];

  if (ip < (ip_bound - sizeof(int64_t))) {
    int64_t value, value2;
    /* Broadcast the value for every byte in a 64-bit register */
    memset(&value, x, 8);
#if defined(BLOSC_STRICT_ALIGN)
    memcpy(&value2, ref, 8);
#else
    value2 = ((int64_t*)ref)[0];
#endif
    if (value != value2) {
      /* Return the byte that starts to differ; the mismatch is inside this
       * 8-byte window, so the byte loop cannot overrun */
      while (*ref++ == x) ip++;
      return ip;
    }
    else {
      ip += 8;
      ref += 8;
    }
  }
  /* safe because the outer check against ip limit */
  while (ip < (ip_bound - sizeof(__m128i))) {
    __m128i value, value2, cmp;
    /* Broadcast the value for every byte in a 128-bit register */
    memset(&value, x, sizeof(__m128i));
    value2 = _mm_loadu_si128((__m128i *)ref);
    /* 32-bit lane equality suffices for a whole-register equality test */
    cmp = _mm_cmpeq_epi32(value, value2);
    if (_mm_movemask_epi8(cmp) != 0xFFFF) {
      /* Return the byte that starts to differ */
      while (*ref++ == x) ip++;
      return ip;
    }
    else {
      ip += sizeof(__m128i);
      ref += sizeof(__m128i);
    }
  }
  /* Look into the remainder */
  while ((ip < ip_bound) && (*ref++ == x)) ip++;
  return ip;
}
165 
166 #else
167 
/* Portable run scanner: count how many consecutive bytes at `ref` equal
 * the run byte ip[-1], advancing `ip` one position per matching byte and
 * never moving it past `ip_bound`.  On a mismatch inside an 8-byte window
 * the returned `ip` is one past the last matching position (callers
 * account for the bias).  Returns the advanced `ip`. */
static uint8_t *get_run(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
  uint8_t x = ip[-1];
  int64_t value, value2;
  /* Broadcast the run byte into every byte of a 64-bit word */
  memset(&value, x, 8);
  /* 8-byte strides; safe because the outer check against the ip limit */
  while (ip < (ip_bound - sizeof(int64_t))) {
    /* memcpy avoids the strict-aliasing/unaligned UB of a raw int64_t*
     * load and is lowered to a single 8-byte load by compilers, so it is
     * also safe (and fast) without BLOSC_STRICT_ALIGN special-casing. */
    memcpy(&value2, ref, 8);
    if (value != value2) {
      /* Mismatch inside this window: step byte-by-byte up to it */
      while (*ref++ == x) ip++;
      return ip;
    }
    ip += 8;
    ref += 8;
  }
  /* Look into the remainder */
  while ((ip < ip_bound) && (*ref++ == x)) ip++;
  return ip;
}
194 
195 #endif
196 
197 
198 /* Return the byte that starts to differ */
/* Portable match scanner: advance `ip` while the bytes at `ip` and `ref`
 * agree, never past `ip_bound`.  On a mismatch inside an 8-byte window the
 * returned `ip` is one past the first differing byte (callers re-bias via
 * ipshift).  Returns the advanced `ip`. */
static uint8_t *get_match(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
  int64_t vip, vref;
  /* 8-byte strides.  memcpy avoids the strict-aliasing/unaligned UB of the
   * previous `*(int64_t*)p` loads, compiles to single 8-byte loads, and is
   * valid on strict-alignment targets too (so no #ifdef is needed; the
   * byte-wise remainder loop below yields identical results either way). */
  while (ip < (ip_bound - sizeof(int64_t))) {
    memcpy(&vip, ip, 8);
    memcpy(&vref, ref, 8);
    if (vip != vref) {
      /* Mismatch inside this window: locate it byte-by-byte */
      while (*ref++ == *ip++) {}
      return ip;
    }
    ip += sizeof(int64_t);
    ref += sizeof(int64_t);
  }
  /* Look into the remainder */
  while ((ip < ip_bound) && (*ref++ == *ip++)) {}
  return ip;
}
217 
218 
219 #if defined(__SSE2__)
/* SSE2 match scanner: advance `ip` while the bytes at `ip` and `ref`
 * agree, never past `ip_bound`.  Probes 8 bytes, then loops in 16-byte
 * chunks; on a chunk mismatch it defers to get_match() to find the exact
 * differing byte.  Returns the advanced `ip` (one past the differing byte
 * on a byte-level mismatch; callers re-bias via ipshift). */
static uint8_t *get_match_16(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
  __m128i value, value2, cmp;

  if (ip < (ip_bound - sizeof(int64_t))) {
    /* NOTE(review): raw unaligned int64_t* loads; relies on x86 tolerating
     * unaligned access (implied by __SSE2__) */
    if (*(int64_t *) ref != *(int64_t *) ip) {
      /* Return the byte that starts to differ */
      while (*ref++ == *ip++) {}
      return ip;
    } else {
      ip += sizeof(int64_t);
      ref += sizeof(int64_t);
    }
  }
  while (ip < (ip_bound - sizeof(__m128i))) {
    value = _mm_loadu_si128((__m128i *) ip);
    value2 = _mm_loadu_si128((__m128i *) ref);
    /* 32-bit lane equality suffices for a whole-register equality test */
    cmp = _mm_cmpeq_epi32(value, value2);
    if (_mm_movemask_epi8(cmp) != 0xFFFF) {
      /* Return the byte that starts to differ */
      return get_match(ip, ip_bound, ref);
    }
    else {
      ip += sizeof(__m128i);
      ref += sizeof(__m128i);
    }
  }
  /* Look into the remainder */
  while ((ip < ip_bound) && (*ref++ == *ip++)) {}
  return ip;
}
250 #endif
251 
252 
253 #if defined(__AVX2__)
/* AVX2 match scanner: advance `ip` while the bytes at `ip` and `ref`
 * agree, never past `ip_bound`.  Widens the probe from 8 to 16 to 32
 * bytes; a 16-byte chunk mismatch defers to get_match_16() to locate the
 * exact byte.  Returns the advanced `ip` (one past the differing byte on
 * a byte-level mismatch; callers re-bias via ipshift). */
static uint8_t *get_match_32(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {

  if (ip < (ip_bound - sizeof(int64_t))) {
    /* NOTE(review): raw unaligned int64_t* loads; relies on x86 tolerating
     * unaligned access (implied by __AVX2__) */
    if (*(int64_t *) ref != *(int64_t *) ip) {
      /* Return the byte that starts to differ */
      while (*ref++ == *ip++) {}
      return ip;
    } else {
      ip += sizeof(int64_t);
      ref += sizeof(int64_t);
    }
  }
  if (ip < (ip_bound - sizeof(__m128i))) {
    __m128i value, value2, cmp;
    value = _mm_loadu_si128((__m128i *) ip);
    value2 = _mm_loadu_si128((__m128i *) ref);
    cmp = _mm_cmpeq_epi32(value, value2);
    if (_mm_movemask_epi8(cmp) != 0xFFFF) {
      /* Return the byte that starts to differ */
      return get_match_16(ip, ip_bound, ref);
    }
    else {
      ip += sizeof(__m128i);
      ref += sizeof(__m128i);
    }
  }
  while (ip < (ip_bound - sizeof(__m256i))) {
    __m256i value, value2, cmp;
    value = _mm256_loadu_si256((__m256i *) ip);
    value2 = _mm256_loadu_si256((__m256i *)ref);
    cmp = _mm256_cmpeq_epi64(value, value2);
    if ((unsigned)_mm256_movemask_epi8(cmp) != 0xFFFFFFFF) {
      /* Return the byte that starts to differ; the mismatch is inside this
       * 32-byte window, so the byte loop cannot overrun */
      while (*ref++ == *ip++) {}
      return ip;
    }
    else {
      ip += sizeof(__m256i);
      ref += sizeof(__m256i);
    }
  }
  /* Look into the remainder */
  while ((ip < ip_bound) && (*ref++ == *ip++)) {}
  return ip;
}
299 #endif
300 
301 
get_run_or_match(uint8_t * ip,uint8_t * ip_bound,const uint8_t * ref,bool run)302 static uint8_t* get_run_or_match(uint8_t* ip, uint8_t* ip_bound, const uint8_t* ref, bool run) {
303   if (BLOSCLZ_UNLIKELY(run)) {
304 #if defined(__AVX2__)
305     ip = get_run_32(ip, ip_bound, ref);
306 #elif defined(__SSE2__)
307     ip = get_run_16(ip, ip_bound, ref);
308 #else
309     ip = get_run(ip, ip_bound, ref);
310 #endif
311   }
312   else {
313 #if defined(__AVX2__)
314     ip = get_match_32(ip, ip_bound, ref);
315 #elif defined(__SSE2__)
316     ip = get_match_16(ip, ip_bound, ref);
317 #else
318     ip = get_match(ip, ip_bound, ref);
319 #endif
320   }
321 
322   return ip;
323 }
324 
325 
/* Emit one literal byte: copy *anchor to the output and bump the literal
 * run counter; when the run reaches MAX_COPY, close the token and start a
 * a new one (its count byte is patched later by the caller).  The `op + 2`
 * check covers both the literal and a possible new token byte. */
#define LITERAL(ip, op, op_limit, anchor, copy) {       \
  if (BLOSCLZ_UNLIKELY(op + 2 > op_limit))              \
    goto out;                                           \
  *op++ = *anchor++;                                    \
  ip = anchor;                                          \
  copy++;                                               \
  if (BLOSCLZ_UNLIKELY(copy == MAX_COPY)) {             \
    copy = 0;                                           \
    *op++ = MAX_COPY-1;                                 \
  }                                                     \
}

/* Counting-only twin of LITERAL used by get_csize(): advances the same
 * state but only accumulates the would-be output size in `oc`. */
#define LITERAL2(ip, oc, anchor, copy) {                \
  oc++; anchor++;                                       \
  ip = anchor;                                          \
  copy++;                                               \
  if (BLOSCLZ_UNLIKELY(copy == MAX_COPY)) {             \
    copy = 0;                                           \
    oc++;                                               \
  }                                                     \
}

/* Short match token: 3-bit length (1..6) in the high bits, 13-bit
 * distance split as 5 high bits + 1 low byte. */
#define DISTANCE_SHORT(op, op_limit, len, distance) {   \
  if (BLOSCLZ_UNLIKELY(op + 2 > op_limit))              \
    goto out;                                           \
  *op++ = (uint8_t)((len << 5U) + (distance >> 8U));    \
  *op++ = (uint8_t)((distance & 255U));                 \
}

/* Long match token: length field 7 escapes to extension bytes -- a run of
 * 255s plus a remainder byte -- followed by the low distance byte. */
#define DISTANCE_LONG(op, op_limit, len, distance) {    \
  if (BLOSCLZ_UNLIKELY(op + 1 > op_limit))              \
    goto out;                                           \
  *op++ = (uint8_t)((7U << 5U) + (distance >> 8U));     \
  for (len -= 7; len >= 255; len -= 255) {              \
    if (BLOSCLZ_UNLIKELY(op + 1 > op_limit))            \
      goto out;                                         \
    *op++ = 255;                                        \
  }                                                     \
  if (BLOSCLZ_UNLIKELY(op + 2 > op_limit))              \
    goto out;                                           \
  *op++ = (uint8_t)len;                                 \
  *op++ = (uint8_t)((distance & 255U));                 \
}

/* Far short match: distance-high field 31 plus a 255 marker escape to a
 * full 16-bit distance (biased by MAX_DISTANCE on decode). */
#define DISTANCE_SHORT_FAR(op, op_limit, len, distance) {   \
  if (BLOSCLZ_UNLIKELY(op + 4 > op_limit))                  \
    goto out;                                               \
  *op++ = (uint8_t)((len << 5U) + 31);                      \
  *op++ = 255;                                              \
  *op++ = (uint8_t)(distance >> 8U);                        \
  *op++ = (uint8_t)(distance & 255U);                       \
}

/* Far long match: combines the length-extension scheme of DISTANCE_LONG
 * with the 16-bit far-distance escape of DISTANCE_SHORT_FAR. */
#define DISTANCE_LONG_FAR(op, op_limit, len, distance) {    \
  if (BLOSCLZ_UNLIKELY(op + 1 > op_limit))                  \
    goto out;                                               \
  *op++ = (7U << 5U) + 31;                                  \
  for (len -= 7; len >= 255; len -= 255) {                  \
    if (BLOSCLZ_UNLIKELY(op + 1 > op_limit))                \
      goto out;                                             \
    *op++ = 255;                                            \
  }                                                         \
  if (BLOSCLZ_UNLIKELY(op + 4 > op_limit))                  \
    goto out;                                               \
  *op++ = (uint8_t)len;                                     \
  *op++ = 255;                                              \
  *op++ = (uint8_t)(distance >> 8U);                        \
  *op++ = (uint8_t)(distance & 255U);                       \
}
395 
396 
397 // Get the compressed size of a buffer.  Useful for testing compression ratios for high clevels.
/* Dry-run compressor: walk `maxlen` bytes of `ibase` with the same match
 * finder as blosclz_compress and return how many output bytes the real
 * encoder would emit, without writing anything.  Used for entropy probing.
 * `force_3b_shift` mimics an ipshift of 3 (clevel 9 probing). */
static int get_csize(uint8_t* ibase, int maxlen, bool force_3b_shift) {
  uint8_t* ip = ibase;
  int32_t oc = 0;  /* simulated output byte counter */
  uint8_t* ip_bound = ibase + maxlen - 1;
  /* Stop early to leave lookahead for matching.
   * NOTE(review): for maxlen < 12 this points before ibase (the loop then
   * never runs) -- confirm callers accept this idiom. */
  uint8_t* ip_limit = ibase + maxlen - 12;
  uint32_t htab[1U << (uint8_t)HASH_LOG];
  uint32_t hval;
  uint32_t seq;
  uint8_t copy;  /* literals accumulated in the current token */

  // Initialize the hash table to distances of 0
  for (unsigned i = 0; i < (1U << HASH_LOG); i++) {
    htab[i] = 0;
  }

  /* we start with literal copy: 1 token byte + 4 literals */
  copy = 4;
  oc += 5;

  /* main loop */
  while (BLOSCLZ_LIKELY(ip < ip_limit)) {
    const uint8_t* ref;
    unsigned distance;
    uint8_t* anchor = ip;    /* comparison starting-point */

    /* find potential match */
    seq = BLOSCLZ_READU32(ip);
    HASH_FUNCTION(hval, seq, HASH_LOG)
    ref = ibase + htab[hval];

    /* calculate distance to the match */
    distance = anchor - ref;

    /* update hash table */
    htab[hval] = (uint32_t) (anchor - ibase);

    /* a zero distance is a self-match; >= MAX_FARDISTANCE is not encodable */
    if (distance == 0 || (distance >= MAX_FARDISTANCE)) {
      LITERAL2(ip, oc, anchor, copy)
      continue;
    }

    /* is this a match? check the first 4 bytes */
    if (BLOSCLZ_UNLIKELY(BLOSCLZ_READU32(ref) == BLOSCLZ_READU32(ip))) {
      ref += 4;
    }
    else {
      /* no luck, copy as a literal */
      LITERAL2(ip, oc, anchor, copy)
      continue;
    }

    /* last matched byte */
    ip = anchor + 4;

    /* distance is biased */
    distance--;

    /* get runs or matches; zero (biased) distance means a run */
    ip = get_run_or_match(ip, ip_bound, ref, !distance);

    /* back off the shift, mirroring ipshift in blosclz_compress() */
    ip -= force_3b_shift ? 3 : 4;
    unsigned len = (int)(ip - anchor);
    // If match is close, let's reduce the minimum length to encode it
    unsigned minlen = (distance < MAX_DISTANCE) ? 3 : 4;
    // Encoding short lengths is expensive during decompression
    if (len < minlen) {
      LITERAL2(ip, oc, anchor, copy)
      continue;
    }

    /* if we haven't copied anything, remove the speculative token byte */
    if (!copy)
      oc--;
    /* reset literal counter */
    copy = 0;

    /* encode the match: cost mirrors DISTANCE_SHORT/DISTANCE_LONG */
    if (distance < MAX_DISTANCE) {
      if (len >= 7) {
        oc += ((len - 7) / 255) + 1;
      }
      oc += 2;
    }
    else {
      /* far away, but not yet in the another galaxy...
       * cost mirrors DISTANCE_SHORT_FAR/DISTANCE_LONG_FAR */
      if (len >= 7) {
        oc += ((len - 7) / 255) + 1;
      }
      oc += 4;
    }

    /* update the hash at match boundary (two consecutive positions) */
    seq = BLOSCLZ_READU32(ip);
    HASH_FUNCTION(hval, seq, HASH_LOG)
    htab[hval] = (uint32_t) (ip++ - ibase);
    seq >>= 8U;
    HASH_FUNCTION(hval, seq, HASH_LOG)
    htab[hval] = (uint32_t) (ip++ - ibase);
    /* assuming literal copy: reserve the next token byte */
    oc++;

  }

  /* if no literals are pending, drop the speculative token byte */
  if (!copy)
    oc--;

  return (int)oc;
}
507 
508 
/* Compress `length` bytes of `input` into `output` (capacity `maxout`).
 * Returns the compressed size in bytes, or 0 when the data is not
 * compressible enough or a buffer limit is hit.  Higher `clevel` values
 * lower the minimum match length and the entropy-probe ratio threshold.
 * NOTE(review): clevel is used to index 10-element tables below with no
 * range check -- assumes 0 <= clevel <= 9; confirm callers guarantee it. */
int blosclz_compress(const int clevel, const void* input, int length,
                     void* output, int maxout) {
  uint8_t* ibase = (uint8_t*)input;
  uint8_t* ip = ibase;
  uint8_t* ip_bound = ibase + length - 1;
  uint8_t* ip_limit = ibase + length - 12;  /* leave lookahead for matching */
  uint8_t* op = (uint8_t*)output;
  uint8_t* op_limit;
  uint32_t htab[1U << (uint8_t)HASH_LOG];
  uint32_t hval;
  uint32_t seq;
  uint8_t copy;  /* literals accumulated in the current token */

  op_limit = op + maxout;

  // Minimum lengths for encoding (one entry per clevel)
  unsigned minlen_[10] = {0, 12, 12, 11, 10, 9, 8, 7, 6, 5};

  // Minimum compression ratios for initiate encoding (one per clevel)
  double cratio_[10] = {0, 2, 2, 2, 2, 1.8, 1.6, 1.4, 1.2, 1.1};

  // Hash-table size (log2) per clevel: smaller tables for faster levels
  uint8_t hashlog_[10] = {0, HASH_LOG - 2, HASH_LOG - 1, HASH_LOG, HASH_LOG,
                          HASH_LOG, HASH_LOG, HASH_LOG, HASH_LOG, HASH_LOG};
  uint8_t hashlog = hashlog_[clevel];
  // Initialize the hash table to distances of 0
  // NOTE(review): this init runs before the small-buffer guard below;
  // harmless but wasted work on tiny inputs -- consider reordering.
  for (unsigned i = 0; i < (1U << hashlog); i++) {
    htab[i] = 0;
  }

  /* input and output buffer cannot be less than 16 and 66 bytes or we can get into trouble */
  if (length < 16 || maxout < 66) {
    return 0;
  }

  /* When we go back in a match (shift), we obtain quite different compression properties.
   * It looks like 4 is more useful in combination with bitshuffle and small typesizes
   * (compress better and faster in e.g. `b2bench blosclz bitshuffle single 6 6291456 1 19`).
   * Fallback to 4 because it provides more consistent results on small itemsizes.
   *
   * In this block we also check cratios for the beginning of the buffers and
   * eventually discard those that are small (take too long to decompress).
   * This process is called _entropy probing_.
   */
  int ipshift = 4;
  int maxlen;  // maximum length for entropy probing
  int csize_3b;
  int csize_4b;
  double cratio = 0;
  switch (clevel) {
    case 1:
    case 2:
    case 3:
      maxlen = length / 8;
      csize_4b = get_csize(ibase, maxlen, false);
      cratio = (double)maxlen / csize_4b;
      break;
    case 4:
    case 5:
    case 6:
    case 7:
    case 8:
      maxlen = length / 8;
      csize_4b = get_csize(ibase, maxlen, false);
      cratio = (double)maxlen / csize_4b;
      break;
    case 9:
      // case 9 is special.  we need to assess the optimal shift
      maxlen = length / 8;
      csize_3b = get_csize(ibase, maxlen, true);
      csize_4b = get_csize(ibase, maxlen, false);
      ipshift = (csize_3b < csize_4b) ? 3 : 4;
      cratio = (csize_3b < csize_4b) ? ((double)maxlen / csize_3b) : ((double)maxlen / csize_4b);
      break;
    default:
      break;
  }
  // discard probes with small compression ratios (too expensive)
  if (cratio < cratio_ [clevel]) {
    goto out;
  }

  /* we start with literal copy: token byte (count patched later) + 4 literals */
  copy = 4;
  *op++ = MAX_COPY - 1;
  *op++ = *ip++;
  *op++ = *ip++;
  *op++ = *ip++;
  *op++ = *ip++;

  /* main loop */
  while (BLOSCLZ_LIKELY(ip < ip_limit)) {
    const uint8_t* ref;
    unsigned distance;
    uint8_t* anchor = ip;    /* comparison starting-point */

    /* find potential match */
    seq = BLOSCLZ_READU32(ip);
    HASH_FUNCTION(hval, seq, hashlog)
    ref = ibase + htab[hval];

    /* calculate distance to the match */
    distance = anchor - ref;

    /* update hash table */
    htab[hval] = (uint32_t) (anchor - ibase);

    /* a zero distance is a self-match; >= MAX_FARDISTANCE is not encodable */
    if (distance == 0 || (distance >= MAX_FARDISTANCE)) {
      LITERAL(ip, op, op_limit, anchor, copy)
      continue;
    }

    /* is this a match? check the first 4 bytes */
    if (BLOSCLZ_UNLIKELY(BLOSCLZ_READU32(ref) == BLOSCLZ_READU32(ip))) {
      ref += 4;
    } else {
      /* no luck, copy as a literal */
      LITERAL(ip, op, op_limit, anchor, copy)
      continue;
    }

    /* last matched byte */
    ip = anchor + 4;

    /* distance is biased */
    distance--;

    /* get runs or matches; zero (biased) distance means a run */
    ip = get_run_or_match(ip, ip_bound, ref, !distance);

    /* length is biased, '1' means a match of 3 bytes */
    ip -= ipshift;

    unsigned len = (int)(ip - anchor);
    // If match is close, let's reduce the minimum length to encode it
    unsigned minlen = (clevel == 9) ? ipshift : minlen_[clevel];

    // Encoding short lengths is expensive during decompression
    // Encode only for reasonable lengths (extensive experiments done)
    if (len < minlen || (len <= 5 && distance >= MAX_DISTANCE)) {
      LITERAL(ip, op, op_limit, anchor, copy)
      continue;
    }

    /* if we have copied something, adjust the copy count */
    if (copy)
      /* copy is biased, '0' means 1 byte copy */
      *(op - copy - 1) = (uint8_t)(copy - 1);
    else
      /* back, to overwrite the copy count */
      op--;
    /* reset literal counter */
    copy = 0;

    /* encode the match */
    if (distance < MAX_DISTANCE) {
      if (len < 7) {
        DISTANCE_SHORT(op, op_limit, len, distance)
      } else {
        DISTANCE_LONG(op, op_limit, len, distance)
      }
    } else {
      /* far away, but not yet in the another galaxy... */
      distance -= MAX_DISTANCE;
      if (len < 7) {
        DISTANCE_SHORT_FAR(op, op_limit, len, distance)
      } else {
        DISTANCE_LONG_FAR(op, op_limit, len, distance)
      }
    }

    /* update the hash at match boundary (two consecutive positions) */
    seq = BLOSCLZ_READU32(ip);
    HASH_FUNCTION(hval, seq, hashlog)
    htab[hval] = (uint32_t) (ip++ - ibase);
    seq >>= 8U;
    HASH_FUNCTION(hval, seq, hashlog)
    htab[hval] = (uint32_t) (ip++ - ibase);
    /* assuming literal copy: reserve the next token byte */

    if (BLOSCLZ_UNLIKELY(op + 1 > op_limit))
      goto out;
    *op++ = MAX_COPY - 1;
  }

  /* left-over as literal copy */
  while (BLOSCLZ_UNLIKELY(ip <= ip_bound)) {
    if (BLOSCLZ_UNLIKELY(op + 2 > op_limit)) goto out;
    *op++ = *ip++;
    copy++;
    if (BLOSCLZ_UNLIKELY(copy == MAX_COPY)) {
      copy = 0;
      *op++ = MAX_COPY - 1;
    }
  }

  /* if we have copied something, adjust the copy length */
  if (copy)
    *(op - copy - 1) = (uint8_t)(copy - 1);
  else
    op--;

  /* marker for blosclz: bit 5 of the first byte (masked off on decode) */
  *(uint8_t*)output |= (1U << 5U);

  return (int)(op - (uint8_t*)output);

  out:
  return 0;
}
718 
719 // See https://habr.com/en/company/yandex/blog/457612/
720 #ifdef __AVX2__
721 
722 #if defined(_MSC_VER)
723 #define ALIGNED_(x) __declspec(align(x))
724 #else
725 #if defined(__GNUC__)
726 #define ALIGNED_(x) __attribute__ ((aligned(x)))
727 #endif
728 #endif
729 #define ALIGNED_TYPE_(t, x) t ALIGNED_(x)
730 
/* Copy `len` bytes from `match` to `op` where the regions may overlap by
 * as little as 1 byte (offset = op - match < 16).  Uses PSHUFB with a
 * per-offset mask so each 16-byte store replicates the short source
 * pattern, as described in the Yandex article referenced above.
 * Returns `op` advanced past the copied region. */
static unsigned char* copy_match_16(unsigned char *op, const unsigned char *match, int32_t len)
{
  size_t offset = op - match;
  while (len >= 16) {

    static const ALIGNED_TYPE_(uint8_t, 16) masks[] =
      {
                0,  1,  2,  1,  4,  1,  4,  2,  8,  7,  6,  5,  4,  3,  2,  1, // offset = 0, not used as mask, but for shift
                0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, // offset = 1
                0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,
                0,  1,  2,  0,  1,  2,  0,  1,  2,  0,  1,  2,  0,  1,  2,  0,
                0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3,
                0,  1,  2,  3,  4,  0,  1,  2,  3,  4,  0,  1,  2,  3,  4,  0,
                0,  1,  2,  3,  4,  5,  0,  1,  2,  3,  4,  5,  0,  1,  2,  3,
                0,  1,  2,  3,  4,  5,  6,  0,  1,  2,  3,  4,  5,  6,  0,  1,
                0,  1,  2,  3,  4,  5,  6,  7,  0,  1,  2,  3,  4,  5,  6,  7,
                0,  1,  2,  3,  4,  5,  6,  7,  8,  0,  1,  2,  3,  4,  5,  6,
                0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  0,  1,  2,  3,  4,  5,
                0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10,  0,  1,  2,  3,  4,
                0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11,  0,  1,  2,  3,
                0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12,  0,  1,  2,
                0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13,  0,  1,
                0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,  0,
                0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,  15, // offset = 16
      };

    /* Shuffle the 16 source bytes so the store repeats the pattern of
     * period `offset`; masks row `offset` gives the byte indices. */
    _mm_storeu_si128((__m128i *)(op),
                     _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(match)),
                                      _mm_load_si128((const __m128i *)(masks) + offset)));

    /* Row 0 of `masks` doubles as the per-offset source advance: it keeps
     * the pattern phase correct for the next 16-byte chunk (e.g. 1 for
     * offset 3 since 16 == 5*3 + 1). */
    match += masks[offset];

    op += 16;
    len -= 16;
  }
  // Deal with remainders
  for (; len > 0; len--) {
    *op++ = *match++;
  }
  return op;
}
772 #endif
773 
774 // LZ4 wildCopy which can reach excellent copy bandwidth (even if insecure)
/* Copy from `from` to `out` in unconditional 8-byte chunks until `end` is
 * reached.  Always copies at least one chunk and may write up to 7 bytes
 * past `end`, so the caller must guarantee that slack exists. */
static inline void wild_copy(uint8_t *out, const uint8_t* from, uint8_t* end) {
  do {
    memcpy(out, from, 8);
    out += 8;
    from += 8;
  } while (out < end);
}
782 
/* Decompress a blosclz stream of `length` bytes at `input` into `output`
 * (capacity `maxout`).  Returns the number of bytes written, or 0 on a
 * truncated/corrupt stream or when `output` would overflow. */
int blosclz_decompress(const void* input, int length, void* output, int maxout) {
  const uint8_t* ip = (const uint8_t*)input;
  const uint8_t* ip_limit = ip + length;
  uint8_t* op = (uint8_t*)output;
  uint32_t ctrl;
  uint8_t* op_limit = op + maxout;
  if (BLOSCLZ_UNLIKELY(length == 0)) {
    return 0;
  }
  /* First token is a literal count; mask off the blosclz marker bit (set
   * by the compressor in the first output byte) and the upper bits. */
  ctrl = (*ip++) & 31U;

  while (1) {
    if (ctrl >= 32) {
      // match token: high 3 bits = biased length, low 5 bits = distance high
      int32_t len = (ctrl >> 5U) - 1 ;
      int32_t ofs = (ctrl & 31U) << 8U;
      uint8_t code;
      const uint8_t* ref = op - ofs;

      /* length field of 7 escapes to extension bytes (255-runs + remainder) */
      if (len == 7 - 1) {
        do {
          if (BLOSCLZ_UNLIKELY(ip + 1 >= ip_limit)) {
            return 0;
          }
          code = *ip++;
          len += code;
        } while (code == 255);
      }
      else {
        if (BLOSCLZ_UNLIKELY(ip + 1 >= ip_limit)) {
          return 0;
        }
      }
      /* low distance byte; undo the length bias (stored 0 = 3-byte match) */
      code = *ip++;
      len += 3;
      ref -= code;

      /* match from 16-bit distance: distance-high 31 + low byte 255 escape
       * to an explicit big-endian 16-bit distance biased by MAX_DISTANCE */
      if (BLOSCLZ_UNLIKELY(code == 255)) {
        if (ofs == (31U << 8U)) {
          if (ip + 1 >= ip_limit) {
            return 0;
          }
          ofs = (*ip++) << 8U;
          ofs += *ip++;
          ref = op - ofs - MAX_DISTANCE;
        }
      }

      /* output overflow check */
      if (BLOSCLZ_UNLIKELY(op + len > op_limit)) {
        return 0;
      }

      /* reject references before the start of the output buffer
       * (ref is decremented below, hence the - 1) */
      if (BLOSCLZ_UNLIKELY(ref - 1 < (uint8_t*)output)) {
        return 0;
      }

      /* fetch the next token before doing the copy */
      if (BLOSCLZ_UNLIKELY(ip >= ip_limit)) break;
      ctrl = *ip++;

      /* undo the distance bias applied by the compressor */
      ref--;
      if (ref == op - 1) {
        /* optimized copy for a run (distance 1: repeat one byte) */
        memset(op, *ref, len);
        op += len;
      }
      else if ((op - ref >= 8) && (op_limit - op >= len + 8)) {
        // copy with an overlap not larger than 8; wild_copy may write up
        // to 8 bytes beyond len, hence the extra slack requirement
        wild_copy(op, ref, op + len);
        op += len;
      }
      else {
        // general copy with any overlap
#ifdef __AVX2__
        if (op - ref <= 16) {
          // This is not faster on a combination of compilers (clang, gcc, icc) or machines, but
          // it is not slower either.  Let's activate here for experimentation.
          op = copy_match_16(op, ref, len);
        }
        else {
#endif
          op = copy_match(op, ref, (unsigned) len);
#ifdef __AVX2__
        }
#endif
      }
    }
    else {
      // literal token: ctrl is the biased literal count
      ctrl++;
      if (BLOSCLZ_UNLIKELY(op + ctrl > op_limit)) {
        return 0;
      }
      if (BLOSCLZ_UNLIKELY(ip + ctrl > ip_limit)) {
        return 0;
      }

      memcpy(op, ip, ctrl); op += ctrl; ip += ctrl;
      // On GCC-6, fastcopy this is still faster than plain memcpy
      // However, using recent CLANG/LLVM 9.0, there is almost no difference
      // in performance.
      // And starting on CLANG/LLVM 10 and GCC 9, memcpy is generally faster.
      // op = fastcopy(op, ip, (unsigned) ctrl); ip += ctrl;

      if (BLOSCLZ_UNLIKELY(ip >= ip_limit)) break;
      ctrl = *ip++;
    }
  }

  return (int)(op - (uint8_t*)output);
}
894