1 /*********************************************************************
2   Blosc - Blocked Shuffling and Compression Library
3 
4   Author: Francesc Alted <francesc@blosc.org>
5   Creation date: 2009-05-20
6 
7   See LICENSES/BLOSC.txt for details about copyright and rights to use.
8 **********************************************************************/
9 
10 #include "shuffle.h"
11 #include "blosc-common.h"
12 #include "shuffle-generic.h"
13 #include "bitshuffle-generic.h"
14 #include "blosc-comp-features.h"
15 #include <stdio.h>
16 
17 #if defined(_WIN32)
18 #include "win32/pthread.h"
19 #else
20 #include <pthread.h>
21 #endif
22 
/* Provide `bool`.
   C99 and later compilers ship <stdbool.h>, so use it instead of declaring
   the type by hand: once <stdbool.h> has been included `bool` is a macro,
   and from C23 on it is a keyword, and in both cases a hand-written
   `typedef _Bool bool;` no longer compiles.
   Visual Studio < 2013 does not have stdbool.h, so pre-C99 compilers keep a
   plain typedef as the replacement (skipped if a stdbool.h was already
   included). */
#if defined __STDC__ && defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L
/* have a C99 compiler */
#include <stdbool.h>
#elif !defined(__bool_true_false_are_defined)
/* do not have a C99 compiler */
typedef unsigned char bool;
#endif
31 
32 
/* HAVE_CPU_FEAT_INTRIN marks compilers on which the __builtin_cpu_supports()
   based detection could be used: real GCC (not clang) 4.8 or newer.
   The version test is parenthesized as a unit because `&&` binds tighter
   than `||`; without the parentheses the "4.x with x >= 8" alternative
   would bypass the !defined(__clang__) and defined(__GNUC__) guards. */
#if !defined(__clang__) && defined(__GNUC__) && defined(__GNUC_MINOR__) && \
    (__GNUC__ >= 5 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8))
#define HAVE_CPU_FEAT_INTRIN
#endif
37 
38 /*  Include hardware-accelerated shuffle/unshuffle routines based on
39     the target architecture. Note that a target architecture may support
40     more than one type of acceleration!*/
41 #if defined(SHUFFLE_AVX2_ENABLED)
42   #include "shuffle-avx2.h"
43   #include "bitshuffle-avx2.h"
44 #endif  /* defined(SHUFFLE_AVX2_ENABLED) */
45 
46 #if defined(SHUFFLE_SSE2_ENABLED)
47   #include "shuffle-sse2.h"
48   #include "bitshuffle-sse2.h"
49 #endif  /* defined(SHUFFLE_SSE2_ENABLED) */
50 
51 
/*  Function pointer types for the byte-wise shuffle/unshuffle routines and
    for the bit-wise bitshuffle/bitunshuffle routines.
    The parameter names are documentation only; they follow the order in
    which this file passes its arguments through these pointers. */
typedef void (*shuffle_func)(const size_t bytesoftype, const size_t blocksize,
                             const uint8_t* src, const uint8_t* dest);
typedef void (*unshuffle_func)(const size_t bytesoftype, const size_t blocksize,
                               const uint8_t* src, const uint8_t* dest);
typedef int64_t (*bitshuffle_func)(void* src, void* dest, const size_t nelems,
                                   const size_t bytesoftype, void* tmp);
typedef int64_t (*bitunshuffle_func)(void* src, void* dest, const size_t nelems,
                                     const size_t bytesoftype, void* tmp);
57 
58 /* An implementation of shuffle/unshuffle routines. */
59 typedef struct shuffle_implementation {
60   /* Name of this implementation. */
61   const char* name;
62   /* Function pointer to the shuffle routine for this implementation. */
63   shuffle_func shuffle;
64   /* Function pointer to the unshuffle routine for this implementation. */
65   unshuffle_func unshuffle;
66   /* Function pointer to the bitshuffle routine for this implementation. */
67   bitshuffle_func bitshuffle;
68   /* Function pointer to the bitunshuffle routine for this implementation. */
69   bitunshuffle_func bitunshuffle;
70 } shuffle_implementation_t;
71 
/* Bit flags naming the accelerated instruction sets the host can use.
   Each flag occupies its own bit so flags can be combined with `|` and
   tested with `&`. */
typedef enum {
  BLOSC_HAVE_NOTHING = 0,
  BLOSC_HAVE_SSE2 = 1 << 0,
  BLOSC_HAVE_AVX2 = 1 << 1
} blosc_cpu_features;
77 
78 /*  Detect hardware and set function pointers to the best shuffle/unshuffle
79     implementations supported by the host processor. */
80 #if defined(SHUFFLE_AVX2_ENABLED) || defined(SHUFFLE_SSE2_ENABLED)    /* Intel/i686 */
81 
/*  The __builtin_cpu_supports() based detection is disabled because it has
    issues with newer versions of gcc (e.g. 5.3.1 in the then-forthcoming
    ubuntu/xenial):
      "undefined symbol: __cpu_model"
    For a similar report, see:
    https://lists.fedoraproject.org/archives/list/devel@lists.fedoraproject.org/thread/ZM2L65WIZEEQHHLFERZYD5FAG7QY2OGB/
*/
88 #if defined(HAVE_CPU_FEAT_INTRIN) && 0
blosc_get_cpu_features(void)89 static blosc_cpu_features blosc_get_cpu_features(void) {
90   blosc_cpu_features cpu_features = BLOSC_HAVE_NOTHING;
91   if (__builtin_cpu_supports("sse2")) {
92     cpu_features |= BLOSC_HAVE_SSE2;
93   }
94   if (__builtin_cpu_supports("avx2")) {
95     cpu_features |= BLOSC_HAVE_AVX2;
96   }
97   return cpu_features;
98 }
99 #else
100 
101 #if defined(_MSC_VER) && !defined(__clang__)
102   #include <intrin.h>     /* Needed for __cpuid */
103 
104 /*  _xgetbv is only supported by VS2010 SP1 and newer versions of VS. */
105 #if _MSC_FULL_VER >= 160040219
106   #include <immintrin.h>  /* Needed for _xgetbv */
107   #define blosc_internal_xgetbv _xgetbv
108 #elif defined(_M_IX86)
109 
110 /*  Implement _xgetbv for VS2008 and VS2010 RTM with 32-bit (x86) targets. */
111 
/*  Read the extended control register selected by `xcr`.
    The instruction is emitted as the raw bytes 0x0f 0x01 0xd0 (xgetbv)
    rather than as a mnemonic.  The register index is loaded into ECX first;
    afterwards EAX and EDX are copied out and combined below into one 64-bit
    value, EDX supplying the upper 32 bits. */
static uint64_t blosc_internal_xgetbv(uint32_t xcr) {
    uint32_t xcr0, xcr1;   /* lower (EAX) and upper (EDX) halves of the result */
    __asm {
        mov        ecx, xcr
        _asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0
        mov        xcr0, eax
        mov        xcr1, edx
    }
    return ((uint64_t)xcr1 << 32) | xcr0;
}
122 
123 #elif defined(_M_X64)
124 
125 /*  Implement _xgetbv for VS2008 and VS2010 RTM with 64-bit (x64) targets.
126     These compilers don't support any of the newer acceleration ISAs
127     (e.g., AVX2) supported by blosc, and all x64 hardware supports SSE2
128     which means we can get away with returning a hard-coded value from
129     this implementation of _xgetbv. */
130 
/*  Hard-coded stand-in for _xgetbv: register 0 reports only bit 1
    (XMM state) as set; every other register index reads as zero. */
static __inline uint64_t blosc_internal_xgetbv(uint32_t xcr) {
    if (xcr != 0) {
        return 0UL;
    }
    /* A 64-bit OS must have XMM save support. */
    return 1UL << 1;
}
135 
136 #else
137 
138 /* Hardware detection for any other MSVC targets (e.g., ARM)
139    isn't implemented at this time. */
140 #error This version of c-blosc only supports x86 and x64 targets with MSVC.
141 
142 #endif /* _MSC_FULL_VER >= 160040219 */
143 
144 #define blosc_internal_cpuid __cpuid
145 
146 #else
147 
/*  Implement the __cpuid and __cpuidex intrinsics for GCC, Clang,
    and others using inline assembly.

    Executes `cpuid` with EAX = function_id and ECX = subfunction_id and
    stores the resulting EAX, EBX, ECX, EDX in cpuInfo[0..3]. */
__attribute__((always_inline))
static inline void
blosc_internal_cpuidex(int32_t cpuInfo[4], int32_t function_id, int32_t subfunction_id) {
  int32_t reg_a, reg_b, reg_c, reg_d;

  __asm__ __volatile__ (
#if defined(__i386__) && defined(__PIC__)
  /*  Can't clobber ebx with PIC running under 32-bit, so it needs to be manually restored.
      The EBX result is therefore handed back through EDI.
      https://software.intel.com/en-us/articles/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family
  */
    "movl %%ebx, %%edi\n\t"
    "cpuid\n\t"
    "xchgl %%ebx, %%edi"
    : "=a" (reg_a), "=D" (reg_b), "=c" (reg_c), "=d" (reg_d)
#else
    "cpuid"
    : "=a" (reg_a), "=b" (reg_b), "=c" (reg_c), "=d" (reg_d)
#endif  /* defined(__i386__) && defined(__PIC__) */
    : "a" (function_id), "c" (subfunction_id)
  );

  cpuInfo[0] = reg_a;
  cpuInfo[1] = reg_b;
  cpuInfo[2] = reg_c;
  cpuInfo[3] = reg_d;
}
172 
173 #define blosc_internal_cpuid(cpuInfo, function_id) blosc_internal_cpuidex(cpuInfo, function_id, 0)
174 
175 #define _XCR_XFEATURE_ENABLED_MASK 0
176 
177 #if !(defined(_IMMINTRIN_H_INCLUDED) && (BLOSC_GCC_VERSION >= 900))
178 
179 /* Reads the content of an extended control register.
180    https://software.intel.com/en-us/articles/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family
181 */
static inline uint64_t
blosc_internal_xgetbv(uint32_t xcr) {
  uint32_t low_bits;
  uint32_t high_bits;
  uint64_t value;

  /* "xgetbv": `xcr` goes in through ECX, the result comes back in EDX:EAX.
     This is specified as raw instruction bytes due to some older compilers
     having issues with the mnemonic form. */
  __asm__ __volatile__ (
    ".byte 0x0f, 0x01, 0xd0"
    : "=a" (low_bits), "=d" (high_bits)
    : "c" (xcr)
  );

  /* Reassemble the 64-bit register from its two 32-bit halves. */
  value = high_bits;
  value <<= 32;
  value |= low_bits;
  return value;
}
197 
198 #else
199 
200 #define blosc_internal_xgetbv _xgetbv
201 
202 #endif  // !(defined(_IMMINTRIN_H_INCLUDED) && (BLOSC_GCC_VERSION >= 900))
#endif  /* defined(_MSC_VER) && !defined(__clang__) */
204 
205 #ifndef _XCR_XFEATURE_ENABLED_MASK
206 #define _XCR_XFEATURE_ENABLED_MASK 0x0
207 #endif
208 
blosc_get_cpu_features(void)209 static blosc_cpu_features blosc_get_cpu_features(void) {
210   blosc_cpu_features result = BLOSC_HAVE_NOTHING;
211   int32_t max_basic_function_id;
212   /* Holds the values of eax, ebx, ecx, edx set by the `cpuid` instruction */
213   int32_t cpu_info[4];
214   int sse2_available;
215   int sse3_available;
216   int ssse3_available;
217   int sse41_available;
218   int sse42_available;
219   int xsave_available;
220   int xsave_enabled_by_os;
221   int avx2_available = 0;
222   int avx512bw_available = 0;
223   int xmm_state_enabled = 0;
224   int ymm_state_enabled = 0;
225   int zmm_state_enabled = 0;
226   uint64_t xcr0_contents;
227   char* envvar;
228 
229   /* Get the number of basic functions available. */
230   blosc_internal_cpuid(cpu_info, 0);
231   max_basic_function_id = cpu_info[0];
232 
233   /* Check for SSE-based features and required OS support */
234   blosc_internal_cpuid(cpu_info, 1);
235   sse2_available = (cpu_info[3] & (1 << 26)) != 0;
236   sse3_available = (cpu_info[2] & (1 << 0)) != 0;
237   ssse3_available = (cpu_info[2] & (1 << 9)) != 0;
238   sse41_available = (cpu_info[2] & (1 << 19)) != 0;
239   sse42_available = (cpu_info[2] & (1 << 20)) != 0;
240 
241   xsave_available = (cpu_info[2] & (1 << 26)) != 0;
242   xsave_enabled_by_os = (cpu_info[2] & (1 << 27)) != 0;
243 
244   /* Check for AVX-based features, if the processor supports extended features. */
245   if (max_basic_function_id >= 7) {
246     blosc_internal_cpuid(cpu_info, 7);
247     avx2_available = (cpu_info[1] & (1 << 5)) != 0;
248     avx512bw_available = (cpu_info[1] & (1 << 30)) != 0;
249   }
250 
251   /*  Even if certain features are supported by the CPU, they may not be supported
252       by the OS (in which case using them would crash the process or system).
253       If xsave is available and enabled by the OS, check the contents of the
254       extended control register XCR0 to see if the CPU features are enabled. */
255 #if defined(_XCR_XFEATURE_ENABLED_MASK)
256   if (xsave_available && xsave_enabled_by_os && (
257       sse2_available || sse3_available || ssse3_available
258       || sse41_available || sse42_available
259       || avx2_available || avx512bw_available)) {
260     /* Determine which register states can be restored by the OS. */
261     xcr0_contents = blosc_internal_xgetbv(_XCR_XFEATURE_ENABLED_MASK);
262 
263     xmm_state_enabled = (xcr0_contents & (1UL << 1)) != 0;
264     ymm_state_enabled = (xcr0_contents & (1UL << 2)) != 0;
265 
266     /*  Require support for both the upper 256-bits of zmm0-zmm15 to be
267         restored as well as all of zmm16-zmm31 and the opmask registers. */
268     zmm_state_enabled = (xcr0_contents & 0x70) == 0x70;
269   }
270 #endif /* defined(_XCR_XFEATURE_ENABLED_MASK) */
271 
272   envvar = getenv("BLOSC_PRINT_SHUFFLE_ACCEL");
273   if (envvar != NULL) {
274     printf("Shuffle CPU Information:\n");
275     printf("SSE2 available: %s\n", sse2_available ? "True" : "False");
276     printf("SSE3 available: %s\n", sse3_available ? "True" : "False");
277     printf("SSSE3 available: %s\n", ssse3_available ? "True" : "False");
278     printf("SSE4.1 available: %s\n", sse41_available ? "True" : "False");
279     printf("SSE4.2 available: %s\n", sse42_available ? "True" : "False");
280     printf("AVX2 available: %s\n", avx2_available ? "True" : "False");
281     printf("AVX512BW available: %s\n", avx512bw_available ? "True" : "False");
282     printf("XSAVE available: %s\n", xsave_available ? "True" : "False");
283     printf("XSAVE enabled: %s\n", xsave_enabled_by_os ? "True" : "False");
284     printf("XMM state enabled: %s\n", xmm_state_enabled ? "True" : "False");
285     printf("YMM state enabled: %s\n", ymm_state_enabled ? "True" : "False");
286     printf("ZMM state enabled: %s\n", zmm_state_enabled ? "True" : "False");
287   }
288 
289   /* Using the gathered CPU information, determine which implementation to use. */
290   /* technically could fail on sse2 cpu on os without xmm support, but that
291    * shouldn't exist anymore */
292   if (sse2_available) {
293     result |= BLOSC_HAVE_SSE2;
294   }
295   if (xmm_state_enabled && ymm_state_enabled && avx2_available) {
296     result |= BLOSC_HAVE_AVX2;
297   }
298   return result;
299 }
300 #endif
301 
302 #else   /* No hardware acceleration supported for the target architecture. */
303   #if defined(_MSC_VER)
304   #pragma message("Hardware-acceleration detection not implemented for the target architecture. Only the generic shuffle/unshuffle routines will be available.")
305   #else
306   #warning Hardware-acceleration detection not implemented for the target architecture. Only the generic shuffle/unshuffle routines will be available.
307   #endif
308 
blosc_get_cpu_features(void)309 static blosc_cpu_features blosc_get_cpu_features(void) {
310   return BLOSC_HAVE_NOTHING;
311 }
312 
313 #endif
314 
get_shuffle_implementation(void)315 static shuffle_implementation_t get_shuffle_implementation(void) {
316   blosc_cpu_features cpu_features = blosc_get_cpu_features();
317   shuffle_implementation_t impl_generic;
318 
319 #if defined(SHUFFLE_AVX2_ENABLED)
320   if (cpu_features & BLOSC_HAVE_AVX2) {
321     shuffle_implementation_t impl_avx2;
322     impl_avx2.name = "avx2";
323     impl_avx2.shuffle = (shuffle_func)blosc_internal_shuffle_avx2;
324     impl_avx2.unshuffle = (unshuffle_func)blosc_internal_unshuffle_avx2;
325     impl_avx2.bitshuffle = (bitshuffle_func)blosc_internal_bshuf_trans_bit_elem_avx2;
326     impl_avx2.bitunshuffle = (bitunshuffle_func)blosc_internal_bshuf_untrans_bit_elem_avx2;
327     return impl_avx2;
328   }
329 #endif  /* defined(SHUFFLE_AVX2_ENABLED) */
330 
331 #if defined(SHUFFLE_SSE2_ENABLED)
332   if (cpu_features & BLOSC_HAVE_SSE2) {
333     shuffle_implementation_t impl_sse2;
334     impl_sse2.name = "sse2";
335     impl_sse2.shuffle = (shuffle_func)blosc_internal_shuffle_sse2;
336     impl_sse2.unshuffle = (unshuffle_func)blosc_internal_unshuffle_sse2;
337     impl_sse2.bitshuffle = (bitshuffle_func)blosc_internal_bshuf_trans_bit_elem_sse2;
338     impl_sse2.bitunshuffle = (bitunshuffle_func)blosc_internal_bshuf_untrans_bit_elem_sse2;
339     return impl_sse2;
340   }
341 #endif  /* defined(SHUFFLE_SSE2_ENABLED) */
342 
343   /*  Processor doesn't support any of the hardware-accelerated implementations,
344       so use the generic implementation. */
345   impl_generic.name = "generic";
346   impl_generic.shuffle = (shuffle_func)blosc_internal_shuffle_generic;
347   impl_generic.unshuffle = (unshuffle_func)blosc_internal_unshuffle_generic;
348   impl_generic.bitshuffle = (bitshuffle_func)blosc_internal_bshuf_trans_bit_elem_scal;
349   impl_generic.bitunshuffle = (bitunshuffle_func)blosc_internal_bshuf_untrans_bit_elem_scal;
350   return impl_generic;
351 }
352 
353 
354 /*  Flag indicating whether the implementation has been initialized. */
355 static pthread_once_t implementation_initialized = PTHREAD_ONCE_INIT;
356 
357 /*  The dynamically-chosen shuffle/unshuffle implementation.
358     This is only safe to use once `implementation_initialized` is set. */
359 static shuffle_implementation_t host_implementation;
360 
set_host_implementation(void)361 static void set_host_implementation(void) {
362   host_implementation = get_shuffle_implementation();
363 }
364 
365 /*  Initialize the shuffle implementation, if necessary. */
366 #if defined(__GNUC__) || defined(__clang__)
367 __attribute__((always_inline))
368 #endif
369 static
370 #if defined(_MSC_VER)
371 __forceinline
372 #else
373 BLOSC_INLINE
374 #endif
init_shuffle_implementation(void)375 void init_shuffle_implementation(void) {
376   pthread_once(&implementation_initialized, &set_host_implementation);
377 }
378 
379 /*  Shuffle a block by dynamically dispatching to the appropriate
380     hardware-accelerated routine at run-time. */
381 void
blosc_internal_shuffle(const size_t bytesoftype,const size_t blocksize,const uint8_t * _src,const uint8_t * _dest)382 blosc_internal_shuffle(const size_t bytesoftype, const size_t blocksize,
383                        const uint8_t* _src, const uint8_t* _dest) {
384   /* Initialize the shuffle implementation if necessary. */
385   init_shuffle_implementation();
386 
387   /*  The implementation is initialized.
388       Dispatch to it's shuffle routine. */
389   (host_implementation.shuffle)(bytesoftype, blocksize, _src, _dest);
390 }
391 
392 /*  Unshuffle a block by dynamically dispatching to the appropriate
393     hardware-accelerated routine at run-time. */
394 void
blosc_internal_unshuffle(const size_t bytesoftype,const size_t blocksize,const uint8_t * _src,const uint8_t * _dest)395 blosc_internal_unshuffle(const size_t bytesoftype, const size_t blocksize,
396                          const uint8_t* _src, const uint8_t* _dest) {
397   /* Initialize the shuffle implementation if necessary. */
398   init_shuffle_implementation();
399 
400   /*  The implementation is initialized.
401       Dispatch to it's unshuffle routine. */
402   (host_implementation.unshuffle)(bytesoftype, blocksize, _src, _dest);
403 }
404 
405 /*  Bit-shuffle a block by dynamically dispatching to the appropriate
406     hardware-accelerated routine at run-time. */
407 int
blosc_internal_bitshuffle(const size_t bytesoftype,const size_t blocksize,const uint8_t * const _src,const uint8_t * _dest,const uint8_t * _tmp)408 blosc_internal_bitshuffle(const size_t bytesoftype, const size_t blocksize,
409                           const uint8_t* const _src, const uint8_t* _dest,
410                           const uint8_t* _tmp) {
411   int size = blocksize / bytesoftype;
412   /* Initialize the shuffle implementation if necessary. */
413   init_shuffle_implementation();
414 
415   if ((size % 8) == 0) {
416     /* The number of elems is a multiple of 8 which is supported by
417        bitshuffle. */
418     int ret = (int)(host_implementation.bitshuffle)((void *) _src, (void *) _dest,
419                                                     blocksize / bytesoftype,
420                                                     bytesoftype, (void *) _tmp);
421     /* Copy the leftovers */
422     size_t offset = size * bytesoftype;
423     memcpy((void *) (_dest + offset), (void *) (_src + offset), blocksize - offset);
424     return ret;
425   }
426   else {
427     memcpy((void *) _dest, (void *) _src, blocksize);
428   }
429   return size;
430 }
431 
432 /*  Bit-unshuffle a block by dynamically dispatching to the appropriate
433     hardware-accelerated routine at run-time. */
434 int
blosc_internal_bitunshuffle(const size_t bytesoftype,const size_t blocksize,const uint8_t * const _src,const uint8_t * _dest,const uint8_t * _tmp)435 blosc_internal_bitunshuffle(const size_t bytesoftype, const size_t blocksize,
436                             const uint8_t* const _src, const uint8_t* _dest,
437                             const uint8_t* _tmp) {
438   int size = blocksize / bytesoftype;
439   /* Initialize the shuffle implementation if necessary. */
440   init_shuffle_implementation();
441 
442   if ((size % 8) == 0) {
443     /* The number of elems is a multiple of 8 which is supported by
444        bitshuffle. */
445     int ret = (int) (host_implementation.bitunshuffle)((void *) _src, (void *) _dest,
446                                                        blocksize / bytesoftype,
447                                                        bytesoftype, (void *) _tmp);
448     /* Copy the leftovers */
449     size_t offset = size * bytesoftype;
450     memcpy((void *) (_dest + offset), (void *) (_src + offset), blocksize - offset);
451     return ret;
452   }
453   else {
454     memcpy((void *) _dest, (void *) _src, blocksize);
455   }
456   return size;
457 }
458