1 /*********************************************************************
2 Blosc - Blocked Shuffling and Compression Library
3
4 Author: Francesc Alted <francesc@blosc.org>
5 Creation date: 2009-05-20
6
7 See LICENSES/BLOSC.txt for details about copyright and rights to use.
8 **********************************************************************/
9
10 #include "shuffle.h"
11 #include "blosc-common.h"
12 #include "shuffle-generic.h"
13 #include "bitshuffle-generic.h"
14 #include "blosc-comp-features.h"
15 #include <stdio.h>
16
17 #if defined(_WIN32)
18 #include "win32/pthread.h"
19 #else
20 #include <pthread.h>
21 #endif
22
/* Visual Studio releases before 2013 ship no <stdbool.h>, so a `bool` type
   is provided here instead of including that header. */
#if defined(__STDC__) && defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
typedef _Bool bool;          /* C99 or newer: alias the built-in boolean type */
#else
typedef unsigned char bool;  /* pre-C99 compiler: fall back to a byte */
#endif
31
32
/* Define HAVE_CPU_FEAT_INTRIN for real GCC (not Clang) at version 4.8 or
   newer. The version test is parenthesised as a whole: `&&` binds tighter
   than `||`, so without the parentheses the `4.8+` alternative would be
   accepted without the !__clang__ / __GNUC__ guards being applied to it. */
#if !defined(__clang__) && defined(__GNUC__) && defined(__GNUC_MINOR__) && \
    (__GNUC__ >= 5 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8))
#define HAVE_CPU_FEAT_INTRIN
#endif
37
38 /* Include hardware-accelerated shuffle/unshuffle routines based on
39 the target architecture. Note that a target architecture may support
40 more than one type of acceleration!*/
41 #if defined(SHUFFLE_AVX2_ENABLED)
42 #include "shuffle-avx2.h"
43 #include "bitshuffle-avx2.h"
44 #endif /* defined(SHUFFLE_AVX2_ENABLED) */
45
46 #if defined(SHUFFLE_SSE2_ENABLED)
47 #include "shuffle-sse2.h"
48 #include "bitshuffle-sse2.h"
49 #endif /* defined(SHUFFLE_SSE2_ENABLED) */
50
51
/* Function pointer types for the four routines an implementation supplies.
   Parameter names mirror the arguments passed by the dispatch functions at
   the end of this file. */
typedef void (*shuffle_func)(const size_t bytesoftype, const size_t blocksize,
                             const uint8_t* src, const uint8_t* dest);
typedef void (*unshuffle_func)(const size_t bytesoftype, const size_t blocksize,
                               const uint8_t* src, const uint8_t* dest);
typedef int64_t (*bitshuffle_func)(void* src, void* dest, const size_t nelems,
                                   const size_t bytesoftype, void* tmp);
typedef int64_t (*bitunshuffle_func)(void* src, void* dest, const size_t nelems,
                                     const size_t bytesoftype, void* tmp);
57
58 /* An implementation of shuffle/unshuffle routines. */
59 typedef struct shuffle_implementation {
60 /* Name of this implementation. */
61 const char* name;
62 /* Function pointer to the shuffle routine for this implementation. */
63 shuffle_func shuffle;
64 /* Function pointer to the unshuffle routine for this implementation. */
65 unshuffle_func unshuffle;
66 /* Function pointer to the bitshuffle routine for this implementation. */
67 bitshuffle_func bitshuffle;
68 /* Function pointer to the bitunshuffle routine for this implementation. */
69 bitunshuffle_func bitunshuffle;
70 } shuffle_implementation_t;
71
/* Bit flags describing which accelerated instruction sets may be used.
   The values are distinct bits so they can be OR-ed together. */
typedef enum {
  BLOSC_HAVE_NOTHING = 0,
  BLOSC_HAVE_SSE2 = 1 << 0,
  BLOSC_HAVE_AVX2 = 1 << 1
} blosc_cpu_features;
77
78 /* Detect hardware and set function pointers to the best shuffle/unshuffle
79 implementations supported by the host processor. */
80 #if defined(SHUFFLE_AVX2_ENABLED) || defined(SHUFFLE_SSE2_ENABLED) /* Intel/i686 */
81
/* The __builtin_cpu_supports() code path is disabled because it has issues
   with newer versions of gcc (like 5.3.1 in the then-forthcoming
   ubuntu/xenial), failing with:
     "undefined symbol: __cpu_model"
   For a similar report, see:
   https://lists.fedoraproject.org/archives/list/devel@lists.fedoraproject.org/thread/ZM2L65WIZEEQHHLFERZYD5FAG7QY2OGB/
*/
88 #if defined(HAVE_CPU_FEAT_INTRIN) && 0
blosc_get_cpu_features(void)89 static blosc_cpu_features blosc_get_cpu_features(void) {
90 blosc_cpu_features cpu_features = BLOSC_HAVE_NOTHING;
91 if (__builtin_cpu_supports("sse2")) {
92 cpu_features |= BLOSC_HAVE_SSE2;
93 }
94 if (__builtin_cpu_supports("avx2")) {
95 cpu_features |= BLOSC_HAVE_AVX2;
96 }
97 return cpu_features;
98 }
99 #else
100
101 #if defined(_MSC_VER) && !defined(__clang__)
102 #include <intrin.h> /* Needed for __cpuid */
103
104 /* _xgetbv is only supported by VS2010 SP1 and newer versions of VS. */
105 #if _MSC_FULL_VER >= 160040219
106 #include <immintrin.h> /* Needed for _xgetbv */
107 #define blosc_internal_xgetbv _xgetbv
108 #elif defined(_M_IX86)
109
110 /* Implement _xgetbv for VS2008 and VS2010 RTM with 32-bit (x86) targets. */
111
blosc_internal_xgetbv(uint32_t xcr)112 static uint64_t blosc_internal_xgetbv(uint32_t xcr) {
113 uint32_t xcr0, xcr1;
114 __asm {
115 mov ecx, xcr
116 _asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0
117 mov xcr0, eax
118 mov xcr1, edx
119 }
120 return ((uint64_t)xcr1 << 32) | xcr0;
121 }
122
123 #elif defined(_M_X64)
124
/* _xgetbv stand-in for VS2008 and VS2010 RTM on 64-bit (x64) targets.
   Those compilers support none of the newer acceleration ISAs (e.g., AVX2)
   that blosc can use, and every x64 CPU has SSE2, so a hard-coded answer is
   sufficient: register 0 reports only bit 1 (XMM state) as set, any other
   register reports 0. */

static __inline uint64_t blosc_internal_xgetbv(uint32_t xcr) {
  if (xcr != 0) {
    return 0UL;
  }
  /* A 64-bit OS must have XMM save support. */
  return (1UL << 1);
}
135
136 #else
137
138 /* Hardware detection for any other MSVC targets (e.g., ARM)
139 isn't implemented at this time. */
140 #error This version of c-blosc only supports x86 and x64 targets with MSVC.
141
142 #endif /* _MSC_FULL_VER >= 160040219 */
143
144 #define blosc_internal_cpuid __cpuid
145
146 #else
147
/* Inline-assembly counterpart of the __cpuidex intrinsic for GCC, Clang and
   others: runs `cpuid` with eax = function_id and ecx = subfunction_id and
   stores the resulting eax, ebx, ecx, edx in cpuInfo[0..3], in that order. */
__attribute__((always_inline))
static inline void
blosc_internal_cpuidex(int32_t cpuInfo[4], int32_t function_id, int32_t subfunction_id) {
#if defined(__i386__) && defined(__PIC__)
  /* Can't clobber ebx with PIC running under 32-bit, so it needs to be
     manually restored: ebx is parked in edi around the cpuid instruction and
     the ebx result is read back from edi.
     https://software.intel.com/en-us/articles/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family
  */
  __asm__ __volatile__ (
    "movl %%ebx, %%edi\n\t"
    "cpuid\n\t"
    "xchgl %%ebx, %%edi"
    : "=D" (cpuInfo[1]),
      "=a" (cpuInfo[0]),
      "=c" (cpuInfo[2]),
      "=d" (cpuInfo[3])
    : "a" (function_id), "c" (subfunction_id)
  );
#else
  __asm__ __volatile__ (
    "cpuid"
    : "=b" (cpuInfo[1]),
      "=a" (cpuInfo[0]),
      "=c" (cpuInfo[2]),
      "=d" (cpuInfo[3])
    : "a" (function_id), "c" (subfunction_id)
  );
#endif  /* defined(__i386__) && defined(__PIC__) */
}
172
173 #define blosc_internal_cpuid(cpuInfo, function_id) blosc_internal_cpuidex(cpuInfo, function_id, 0)
174
175 #define _XCR_XFEATURE_ENABLED_MASK 0
176
177 #if !(defined(_IMMINTRIN_H_INCLUDED) && (BLOSC_GCC_VERSION >= 900))
178
/* Read extended control register `xcr` and return its 64-bit contents,
   combining edx (upper 32 bits) with eax (lower 32 bits).
   https://software.intel.com/en-us/articles/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family
*/
static inline uint64_t
blosc_internal_xgetbv(uint32_t xcr) {
  uint32_t low_half, high_half;
  __asm__ __volatile__ (
    /* "xgetbv", given as raw instruction bytes because some older
       compilers have issues with the mnemonic form. */
    ".byte 0x0f, 0x01, 0xd0"
    : "=a" (low_half),
      "=d" (high_half)
    : "c" (xcr)
  );
  return ((uint64_t)high_half << 32) | low_half;
}
197
198 #else
199
200 #define blosc_internal_xgetbv _xgetbv
201
202 #endif // !(defined(_IMMINTRIN_H_INCLUDED) && (BLOSC_GCC_VERSION >= 900))
#endif /* defined(_MSC_VER) && !defined(__clang__) */
204
205 #ifndef _XCR_XFEATURE_ENABLED_MASK
206 #define _XCR_XFEATURE_ENABLED_MASK 0x0
207 #endif
208
blosc_get_cpu_features(void)209 static blosc_cpu_features blosc_get_cpu_features(void) {
210 blosc_cpu_features result = BLOSC_HAVE_NOTHING;
211 int32_t max_basic_function_id;
212 /* Holds the values of eax, ebx, ecx, edx set by the `cpuid` instruction */
213 int32_t cpu_info[4];
214 int sse2_available;
215 int sse3_available;
216 int ssse3_available;
217 int sse41_available;
218 int sse42_available;
219 int xsave_available;
220 int xsave_enabled_by_os;
221 int avx2_available = 0;
222 int avx512bw_available = 0;
223 int xmm_state_enabled = 0;
224 int ymm_state_enabled = 0;
225 int zmm_state_enabled = 0;
226 uint64_t xcr0_contents;
227 char* envvar;
228
229 /* Get the number of basic functions available. */
230 blosc_internal_cpuid(cpu_info, 0);
231 max_basic_function_id = cpu_info[0];
232
233 /* Check for SSE-based features and required OS support */
234 blosc_internal_cpuid(cpu_info, 1);
235 sse2_available = (cpu_info[3] & (1 << 26)) != 0;
236 sse3_available = (cpu_info[2] & (1 << 0)) != 0;
237 ssse3_available = (cpu_info[2] & (1 << 9)) != 0;
238 sse41_available = (cpu_info[2] & (1 << 19)) != 0;
239 sse42_available = (cpu_info[2] & (1 << 20)) != 0;
240
241 xsave_available = (cpu_info[2] & (1 << 26)) != 0;
242 xsave_enabled_by_os = (cpu_info[2] & (1 << 27)) != 0;
243
244 /* Check for AVX-based features, if the processor supports extended features. */
245 if (max_basic_function_id >= 7) {
246 blosc_internal_cpuid(cpu_info, 7);
247 avx2_available = (cpu_info[1] & (1 << 5)) != 0;
248 avx512bw_available = (cpu_info[1] & (1 << 30)) != 0;
249 }
250
251 /* Even if certain features are supported by the CPU, they may not be supported
252 by the OS (in which case using them would crash the process or system).
253 If xsave is available and enabled by the OS, check the contents of the
254 extended control register XCR0 to see if the CPU features are enabled. */
255 #if defined(_XCR_XFEATURE_ENABLED_MASK)
256 if (xsave_available && xsave_enabled_by_os && (
257 sse2_available || sse3_available || ssse3_available
258 || sse41_available || sse42_available
259 || avx2_available || avx512bw_available)) {
260 /* Determine which register states can be restored by the OS. */
261 xcr0_contents = blosc_internal_xgetbv(_XCR_XFEATURE_ENABLED_MASK);
262
263 xmm_state_enabled = (xcr0_contents & (1UL << 1)) != 0;
264 ymm_state_enabled = (xcr0_contents & (1UL << 2)) != 0;
265
266 /* Require support for both the upper 256-bits of zmm0-zmm15 to be
267 restored as well as all of zmm16-zmm31 and the opmask registers. */
268 zmm_state_enabled = (xcr0_contents & 0x70) == 0x70;
269 }
270 #endif /* defined(_XCR_XFEATURE_ENABLED_MASK) */
271
272 envvar = getenv("BLOSC_PRINT_SHUFFLE_ACCEL");
273 if (envvar != NULL) {
274 printf("Shuffle CPU Information:\n");
275 printf("SSE2 available: %s\n", sse2_available ? "True" : "False");
276 printf("SSE3 available: %s\n", sse3_available ? "True" : "False");
277 printf("SSSE3 available: %s\n", ssse3_available ? "True" : "False");
278 printf("SSE4.1 available: %s\n", sse41_available ? "True" : "False");
279 printf("SSE4.2 available: %s\n", sse42_available ? "True" : "False");
280 printf("AVX2 available: %s\n", avx2_available ? "True" : "False");
281 printf("AVX512BW available: %s\n", avx512bw_available ? "True" : "False");
282 printf("XSAVE available: %s\n", xsave_available ? "True" : "False");
283 printf("XSAVE enabled: %s\n", xsave_enabled_by_os ? "True" : "False");
284 printf("XMM state enabled: %s\n", xmm_state_enabled ? "True" : "False");
285 printf("YMM state enabled: %s\n", ymm_state_enabled ? "True" : "False");
286 printf("ZMM state enabled: %s\n", zmm_state_enabled ? "True" : "False");
287 }
288
289 /* Using the gathered CPU information, determine which implementation to use. */
290 /* technically could fail on sse2 cpu on os without xmm support, but that
291 * shouldn't exist anymore */
292 if (sse2_available) {
293 result |= BLOSC_HAVE_SSE2;
294 }
295 if (xmm_state_enabled && ymm_state_enabled && avx2_available) {
296 result |= BLOSC_HAVE_AVX2;
297 }
298 return result;
299 }
300 #endif
301
302 #else /* No hardware acceleration supported for the target architecture. */
303 #if defined(_MSC_VER)
304 #pragma message("Hardware-acceleration detection not implemented for the target architecture. Only the generic shuffle/unshuffle routines will be available.")
305 #else
306 #warning Hardware-acceleration detection not implemented for the target architecture. Only the generic shuffle/unshuffle routines will be available.
307 #endif
308
blosc_get_cpu_features(void)309 static blosc_cpu_features blosc_get_cpu_features(void) {
310 return BLOSC_HAVE_NOTHING;
311 }
312
313 #endif
314
get_shuffle_implementation(void)315 static shuffle_implementation_t get_shuffle_implementation(void) {
316 blosc_cpu_features cpu_features = blosc_get_cpu_features();
317 shuffle_implementation_t impl_generic;
318
319 #if defined(SHUFFLE_AVX2_ENABLED)
320 if (cpu_features & BLOSC_HAVE_AVX2) {
321 shuffle_implementation_t impl_avx2;
322 impl_avx2.name = "avx2";
323 impl_avx2.shuffle = (shuffle_func)blosc_internal_shuffle_avx2;
324 impl_avx2.unshuffle = (unshuffle_func)blosc_internal_unshuffle_avx2;
325 impl_avx2.bitshuffle = (bitshuffle_func)blosc_internal_bshuf_trans_bit_elem_avx2;
326 impl_avx2.bitunshuffle = (bitunshuffle_func)blosc_internal_bshuf_untrans_bit_elem_avx2;
327 return impl_avx2;
328 }
329 #endif /* defined(SHUFFLE_AVX2_ENABLED) */
330
331 #if defined(SHUFFLE_SSE2_ENABLED)
332 if (cpu_features & BLOSC_HAVE_SSE2) {
333 shuffle_implementation_t impl_sse2;
334 impl_sse2.name = "sse2";
335 impl_sse2.shuffle = (shuffle_func)blosc_internal_shuffle_sse2;
336 impl_sse2.unshuffle = (unshuffle_func)blosc_internal_unshuffle_sse2;
337 impl_sse2.bitshuffle = (bitshuffle_func)blosc_internal_bshuf_trans_bit_elem_sse2;
338 impl_sse2.bitunshuffle = (bitunshuffle_func)blosc_internal_bshuf_untrans_bit_elem_sse2;
339 return impl_sse2;
340 }
341 #endif /* defined(SHUFFLE_SSE2_ENABLED) */
342
343 /* Processor doesn't support any of the hardware-accelerated implementations,
344 so use the generic implementation. */
345 impl_generic.name = "generic";
346 impl_generic.shuffle = (shuffle_func)blosc_internal_shuffle_generic;
347 impl_generic.unshuffle = (unshuffle_func)blosc_internal_unshuffle_generic;
348 impl_generic.bitshuffle = (bitshuffle_func)blosc_internal_bshuf_trans_bit_elem_scal;
349 impl_generic.bitunshuffle = (bitunshuffle_func)blosc_internal_bshuf_untrans_bit_elem_scal;
350 return impl_generic;
351 }
352
353
354 /* Flag indicating whether the implementation has been initialized. */
355 static pthread_once_t implementation_initialized = PTHREAD_ONCE_INIT;
356
357 /* The dynamically-chosen shuffle/unshuffle implementation.
358 This is only safe to use once `implementation_initialized` is set. */
359 static shuffle_implementation_t host_implementation;
360
set_host_implementation(void)361 static void set_host_implementation(void) {
362 host_implementation = get_shuffle_implementation();
363 }
364
365 /* Initialize the shuffle implementation, if necessary. */
366 #if defined(__GNUC__) || defined(__clang__)
367 __attribute__((always_inline))
368 #endif
369 static
370 #if defined(_MSC_VER)
371 __forceinline
372 #else
373 BLOSC_INLINE
374 #endif
init_shuffle_implementation(void)375 void init_shuffle_implementation(void) {
376 pthread_once(&implementation_initialized, &set_host_implementation);
377 }
378
379 /* Shuffle a block by dynamically dispatching to the appropriate
380 hardware-accelerated routine at run-time. */
381 void
blosc_internal_shuffle(const size_t bytesoftype,const size_t blocksize,const uint8_t * _src,const uint8_t * _dest)382 blosc_internal_shuffle(const size_t bytesoftype, const size_t blocksize,
383 const uint8_t* _src, const uint8_t* _dest) {
384 /* Initialize the shuffle implementation if necessary. */
385 init_shuffle_implementation();
386
387 /* The implementation is initialized.
388 Dispatch to it's shuffle routine. */
389 (host_implementation.shuffle)(bytesoftype, blocksize, _src, _dest);
390 }
391
392 /* Unshuffle a block by dynamically dispatching to the appropriate
393 hardware-accelerated routine at run-time. */
394 void
blosc_internal_unshuffle(const size_t bytesoftype,const size_t blocksize,const uint8_t * _src,const uint8_t * _dest)395 blosc_internal_unshuffle(const size_t bytesoftype, const size_t blocksize,
396 const uint8_t* _src, const uint8_t* _dest) {
397 /* Initialize the shuffle implementation if necessary. */
398 init_shuffle_implementation();
399
400 /* The implementation is initialized.
401 Dispatch to it's unshuffle routine. */
402 (host_implementation.unshuffle)(bytesoftype, blocksize, _src, _dest);
403 }
404
405 /* Bit-shuffle a block by dynamically dispatching to the appropriate
406 hardware-accelerated routine at run-time. */
407 int
blosc_internal_bitshuffle(const size_t bytesoftype,const size_t blocksize,const uint8_t * const _src,const uint8_t * _dest,const uint8_t * _tmp)408 blosc_internal_bitshuffle(const size_t bytesoftype, const size_t blocksize,
409 const uint8_t* const _src, const uint8_t* _dest,
410 const uint8_t* _tmp) {
411 int size = blocksize / bytesoftype;
412 /* Initialize the shuffle implementation if necessary. */
413 init_shuffle_implementation();
414
415 if ((size % 8) == 0) {
416 /* The number of elems is a multiple of 8 which is supported by
417 bitshuffle. */
418 int ret = (int)(host_implementation.bitshuffle)((void *) _src, (void *) _dest,
419 blocksize / bytesoftype,
420 bytesoftype, (void *) _tmp);
421 /* Copy the leftovers */
422 size_t offset = size * bytesoftype;
423 memcpy((void *) (_dest + offset), (void *) (_src + offset), blocksize - offset);
424 return ret;
425 }
426 else {
427 memcpy((void *) _dest, (void *) _src, blocksize);
428 }
429 return size;
430 }
431
432 /* Bit-unshuffle a block by dynamically dispatching to the appropriate
433 hardware-accelerated routine at run-time. */
434 int
blosc_internal_bitunshuffle(const size_t bytesoftype,const size_t blocksize,const uint8_t * const _src,const uint8_t * _dest,const uint8_t * _tmp)435 blosc_internal_bitunshuffle(const size_t bytesoftype, const size_t blocksize,
436 const uint8_t* const _src, const uint8_t* _dest,
437 const uint8_t* _tmp) {
438 int size = blocksize / bytesoftype;
439 /* Initialize the shuffle implementation if necessary. */
440 init_shuffle_implementation();
441
442 if ((size % 8) == 0) {
443 /* The number of elems is a multiple of 8 which is supported by
444 bitshuffle. */
445 int ret = (int) (host_implementation.bitunshuffle)((void *) _src, (void *) _dest,
446 blocksize / bytesoftype,
447 bytesoftype, (void *) _tmp);
448 /* Copy the leftovers */
449 size_t offset = size * bytesoftype;
450 memcpy((void *) (_dest + offset), (void *) (_src + offset), blocksize - offset);
451 return ret;
452 }
453 else {
454 memcpy((void *) _dest, (void *) _src, blocksize);
455 }
456 return size;
457 }
458