/*********************************************************************
  Blosc - Blocked Shuffling and Compression Library

  Author: Francesc Alted <francesc@blosc.org>

  See LICENSES/BLOSC.txt for details about copyright and rights to use.
**********************************************************************/

#include "shuffle-generic.h"
#include "shuffle-avx2.h"

/* Make sure AVX2 is available for the compilation target and compiler. */
#if !defined(__AVX2__)
  #error AVX2 is not supported by the target architecture/platform and/or this compiler.
#endif

#include <immintrin.h>


/* The following is useful for debugging purposes. */
#if 0
#include <stdio.h>
#include <string.h>

static void printymm(__m256i ymm0)
{
  uint8_t buf[32];

  ((__m256i *)buf)[0] = ymm0;
  printf("%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,"
         "%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x\n",
          buf[0], buf[1], buf[2], buf[3],
          buf[4], buf[5], buf[6], buf[7],
          buf[8], buf[9], buf[10], buf[11],
          buf[12], buf[13], buf[14], buf[15],
          buf[16], buf[17], buf[18], buf[19],
          buf[20], buf[21], buf[22], buf[23],
          buf[24], buf[25], buf[26], buf[27],
          buf[28], buf[29], buf[30], buf[31]);
}
#endif
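
/* A scalar sketch of the transposition the vectorized routines below perform,
   kept only as non-compiled reference: byte 'b' of element 'e' ends up at
   dest[b * total_elements + e].  The helper name is ours, not part of the
   library API. */
#if 0
static void shuffle_scalar_reference(uint8_t* const dest, const uint8_t* const src,
  const size_t total_elements, const size_t bytesoftype)
{
  size_t e, b;
  for (e = 0; e < total_elements; e++) {
    for (b = 0; b < bytesoftype; b++) {
      dest[b * total_elements + e] = src[e * bytesoftype + b];
    }
  }
}
#endif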

/* GCC doesn't include the split load/store intrinsics
   needed for the tiled shuffle, so define them here. */
#if defined(__GNUC__) && !defined(__clang__) && !defined(__ICC)
static inline __m256i
__attribute__((__always_inline__))
_mm256_loadu2_m128i(const __m128i* const hiaddr, const __m128i* const loaddr)
{
  return _mm256_inserti128_si256(
    _mm256_castsi128_si256(_mm_loadu_si128(loaddr)), _mm_loadu_si128(hiaddr), 1);
}

static inline void
__attribute__((__always_inline__))
_mm256_storeu2_m128i(__m128i* const hiaddr, __m128i* const loaddr, const __m256i a)
{
  _mm_storeu_si128(loaddr, _mm256_castsi256_si128(a));
  _mm_storeu_si128(hiaddr, _mm256_extracti128_si256(a, 1));
}
#endif  /* defined(__GNUC__) */
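
/* For reference: when the two 16-byte halves happen to be contiguous in memory,
   _mm256_loadu2_m128i(hi, lo) with hi == lo + 1 yields the same 256-bit value
   as a single _mm256_loadu_si256() from 'lo'; the split forms are only needed
   when the halves live at unrelated addresses, as in the tiled routines below. */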

/* Routine optimized for shuffling a buffer for a type size of 2 bytes. */
static void
shuffle2_avx2(uint8_t* const dest, const uint8_t* const src,
  const size_t vectorizable_elements, const size_t total_elements)
{
  static const size_t bytesoftype = 2;
  size_t j;
  int k;
  __m256i ymm0[2], ymm1[2];

  /* Create the shuffle mask.
     NOTE: The XMM/YMM 'set' intrinsics require the arguments to be ordered from
     most to least significant (i.e., their order is reversed when compared to
     loading the mask from an array). */
  const __m256i shmask = _mm256_set_epi8(
    0x0f, 0x0d, 0x0b, 0x09, 0x07, 0x05, 0x03, 0x01,
    0x0e, 0x0c, 0x0a, 0x08, 0x06, 0x04, 0x02, 0x00,
    0x0f, 0x0d, 0x0b, 0x09, 0x07, 0x05, 0x03, 0x01,
    0x0e, 0x0c, 0x0a, 0x08, 0x06, 0x04, 0x02, 0x00);
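  /* Within each 128-bit lane, this mask gathers the even (low) bytes of the
     2-byte elements into the lower 8 bytes of the lane and the odd (high)
     bytes into the upper 8 bytes. */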

  for (j = 0; j < vectorizable_elements; j += sizeof(__m256i)) {
    /* Fetch 32 elements (64 bytes) then transpose bytes, words and double words. */
    for (k = 0; k < 2; k++) {
      ymm0[k] = _mm256_loadu_si256((__m256i*)(src + (j * bytesoftype) + (k * sizeof(__m256i))));
      ymm1[k] = _mm256_shuffle_epi8(ymm0[k], shmask);
    }

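    /* Cross-lane fix-up: permute and blend the 64-bit halves so that ymm1[0]
       ends up holding byte 0 of all 32 elements and ymm1[1] byte 1. */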
    ymm0[0] = _mm256_permute4x64_epi64(ymm1[0], 0xd8);
    ymm0[1] = _mm256_permute4x64_epi64(ymm1[1], 0x8d);

    ymm1[0] = _mm256_blend_epi32(ymm0[0], ymm0[1], 0xf0);
    ymm0[1] = _mm256_blend_epi32(ymm0[0], ymm0[1], 0x0f);
    ymm1[1] = _mm256_permute4x64_epi64(ymm0[1], 0x4e);

    /* Store the result vectors */
    uint8_t* const dest_for_jth_element = dest + j;
    for (k = 0; k < 2; k++) {
      _mm256_storeu_si256((__m256i*)(dest_for_jth_element + (k * total_elements)), ymm1[k]);
    }
  }
}

/* Routine optimized for shuffling a buffer for a type size of 4 bytes. */
static void
shuffle4_avx2(uint8_t* const dest, const uint8_t* const src,
  const size_t vectorizable_elements, const size_t total_elements)
{
  static const size_t bytesoftype = 4;
  size_t i;
  int j;
  __m256i ymm0[4], ymm1[4];

  /* Create the shuffle mask.
     NOTE: The XMM/YMM 'set' intrinsics require the arguments to be ordered from
     most to least significant (i.e., their order is reversed when compared to
     loading the mask from an array). */
  const __m256i mask = _mm256_set_epi32(
    0x07, 0x03, 0x06, 0x02, 0x05, 0x01, 0x04, 0x00);
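  /* Used with _mm256_permutevar8x32_epi32 below, this gathers double words in
     the order 0, 4, 1, 5, 2, 6, 3, 7, i.e. it interleaves the low and high
     128-bit halves to undo the in-lane behaviour of the unpack instructions. */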

  for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) {
    /* Fetch 32 elements (128 bytes) then transpose bytes and words. */
    for (j = 0; j < 4; j++) {
      ymm0[j] = _mm256_loadu_si256((__m256i*)(src + (i * bytesoftype) + (j * sizeof(__m256i))));
      ymm1[j] = _mm256_shuffle_epi32(ymm0[j], 0xd8);
      ymm0[j] = _mm256_shuffle_epi32(ymm0[j], 0x8d);
      ymm0[j] = _mm256_unpacklo_epi8(ymm1[j], ymm0[j]);
      ymm1[j] = _mm256_shuffle_epi32(ymm0[j], 0x04e);
      ymm0[j] = _mm256_unpacklo_epi16(ymm0[j], ymm1[j]);
    }
    /* Transpose double words */
    for (j = 0; j < 2; j++) {
      ymm1[j*2] = _mm256_unpacklo_epi32(ymm0[j*2], ymm0[j*2+1]);
      ymm1[j*2+1] = _mm256_unpackhi_epi32(ymm0[j*2], ymm0[j*2+1]);
    }
    /* Transpose quad words */
    for (j = 0; j < 2; j++) {
      ymm0[j*2] = _mm256_unpacklo_epi64(ymm1[j], ymm1[j+2]);
      ymm0[j*2+1] = _mm256_unpackhi_epi64(ymm1[j], ymm1[j+2]);
    }
    for (j = 0; j < 4; j++) {
      ymm0[j] = _mm256_permutevar8x32_epi32(ymm0[j], mask);
    }
    /* Store the result vectors */
    uint8_t* const dest_for_ith_element = dest + i;
    for (j = 0; j < 4; j++) {
      _mm256_storeu_si256((__m256i*)(dest_for_ith_element + (j * total_elements)), ymm0[j]);
    }
  }
}

/* Routine optimized for shuffling a buffer for a type size of 8 bytes. */
static void
shuffle8_avx2(uint8_t* const dest, const uint8_t* const src,
  const size_t vectorizable_elements, const size_t total_elements)
{
  static const size_t bytesoftype = 8;
  size_t j;
  int k, l;
  __m256i ymm0[8], ymm1[8];

  for (j = 0; j < vectorizable_elements; j += sizeof(__m256i)) {
    /* Fetch 32 elements (256 bytes) then transpose bytes. */
    for (k = 0; k < 8; k++) {
      ymm0[k] = _mm256_loadu_si256((__m256i*)(src + (j * bytesoftype) + (k * sizeof(__m256i))));
      ymm1[k] = _mm256_shuffle_epi32(ymm0[k], 0x4e);
      ymm1[k] = _mm256_unpacklo_epi8(ymm0[k], ymm1[k]);
    }
    /* Transpose words */
    for (k = 0, l = 0; k < 4; k++, l += 2) {
      ymm0[k*2] = _mm256_unpacklo_epi16(ymm1[l], ymm1[l+1]);
      ymm0[k*2+1] = _mm256_unpackhi_epi16(ymm1[l], ymm1[l+1]);
    }
    /* Transpose double words */
    for (k = 0, l = 0; k < 4; k++, l++) {
      if (k == 2) l += 2;
      ymm1[k*2] = _mm256_unpacklo_epi32(ymm0[l], ymm0[l+2]);
      ymm1[k*2+1] = _mm256_unpackhi_epi32(ymm0[l], ymm0[l+2]);
    }
    /* Transpose quad words */
    for (k = 0; k < 4; k++) {
      ymm0[k*2] = _mm256_unpacklo_epi64(ymm1[k], ymm1[k+4]);
      ymm0[k*2+1] = _mm256_unpackhi_epi64(ymm1[k], ymm1[k+4]);
    }
    for (k = 0; k < 8; k++) {
      ymm1[k] = _mm256_permute4x64_epi64(ymm0[k], 0x72);
      ymm0[k] = _mm256_permute4x64_epi64(ymm0[k], 0xd8);
      ymm0[k] = _mm256_unpacklo_epi16(ymm0[k], ymm1[k]);
    }
    /* Store the result vectors */
    uint8_t* const dest_for_jth_element = dest + j;
    for (k = 0; k < 8; k++) {
      _mm256_storeu_si256((__m256i*)(dest_for_jth_element + (k * total_elements)), ymm0[k]);
    }
  }
}

/* Routine optimized for shuffling a buffer for a type size of 16 bytes. */
static void
shuffle16_avx2(uint8_t* const dest, const uint8_t* const src,
  const size_t vectorizable_elements, const size_t total_elements)
{
  static const size_t bytesoftype = 16;
  size_t j;
  int k, l;
  __m256i ymm0[16], ymm1[16];

  /* Create the shuffle mask.
     NOTE: The XMM/YMM 'set' intrinsics require the arguments to be ordered from
     most to least significant (i.e., their order is reversed when compared to
     loading the mask from an array). */
  const __m256i shmask = _mm256_set_epi8(
    0x0f, 0x07, 0x0e, 0x06, 0x0d, 0x05, 0x0c, 0x04,
    0x0b, 0x03, 0x0a, 0x02, 0x09, 0x01, 0x08, 0x00,
    0x0f, 0x07, 0x0e, 0x06, 0x0d, 0x05, 0x0c, 0x04,
    0x0b, 0x03, 0x0a, 0x02, 0x09, 0x01, 0x08, 0x00);
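  /* Within each 128-bit lane, this mask interleaves the bytes of the lower and
     upper 8-byte halves (0, 8, 1, 9, ..., 7, 15); together with the preceding
     64-bit permute it finishes the byte transposition. */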

  for (j = 0; j < vectorizable_elements; j += sizeof(__m256i)) {
    /* Fetch 32 elements (512 bytes) into 16 YMM registers. */
    for (k = 0; k < 16; k++) {
      ymm0[k] = _mm256_loadu_si256((__m256i*)(src + (j * bytesoftype) + (k * sizeof(__m256i))));
    }
    /* Transpose bytes */
    for (k = 0, l = 0; k < 8; k++, l += 2) {
      ymm1[k*2] = _mm256_unpacklo_epi8(ymm0[l], ymm0[l+1]);
      ymm1[k*2+1] = _mm256_unpackhi_epi8(ymm0[l], ymm0[l+1]);
    }
    /* Transpose words */
    for (k = 0, l = -2; k < 8; k++, l++) {
      if ((k%2) == 0) l += 2;
      ymm0[k*2] = _mm256_unpacklo_epi16(ymm1[l], ymm1[l+2]);
      ymm0[k*2+1] = _mm256_unpackhi_epi16(ymm1[l], ymm1[l+2]);
    }
    /* Transpose double words */
    for (k = 0, l = -4; k < 8; k++, l++) {
      if ((k%4) == 0) l += 4;
      ymm1[k*2] = _mm256_unpacklo_epi32(ymm0[l], ymm0[l+4]);
      ymm1[k*2+1] = _mm256_unpackhi_epi32(ymm0[l], ymm0[l+4]);
    }
    /* Transpose quad words */
    for (k = 0; k < 8; k++) {
      ymm0[k*2] = _mm256_unpacklo_epi64(ymm1[k], ymm1[k+8]);
      ymm0[k*2+1] = _mm256_unpackhi_epi64(ymm1[k], ymm1[k+8]);
    }
    for (k = 0; k < 16; k++) {
      ymm0[k] = _mm256_permute4x64_epi64(ymm0[k], 0xd8);
      ymm0[k] = _mm256_shuffle_epi8(ymm0[k], shmask);
    }
    /* Store the result vectors */
    uint8_t* const dest_for_jth_element = dest + j;
    for (k = 0; k < 16; k++) {
      _mm256_storeu_si256((__m256i*)(dest_for_jth_element + (k * total_elements)), ymm0[k]);
    }
  }
}

/* Routine optimized for shuffling a buffer for a type size larger than 16 bytes. */
static void
shuffle16_tiled_avx2(uint8_t* const dest, const uint8_t* const src,
  const size_t vectorizable_elements, const size_t total_elements, const size_t bytesoftype)
{
  size_t j;
  int k, l;
  __m256i ymm0[16], ymm1[16];

  const lldiv_t vecs_per_el = lldiv(bytesoftype, sizeof(__m128i));

  /* Create the shuffle mask.
     NOTE: The XMM/YMM 'set' intrinsics require the arguments to be ordered from
     most to least significant (i.e., their order is reversed when compared to
     loading the mask from an array). */
  const __m256i shmask = _mm256_set_epi8(
    0x0f, 0x07, 0x0e, 0x06, 0x0d, 0x05, 0x0c, 0x04,
    0x0b, 0x03, 0x0a, 0x02, 0x09, 0x01, 0x08, 0x00,
    0x0f, 0x07, 0x0e, 0x06, 0x0d, 0x05, 0x0c, 0x04,
    0x0b, 0x03, 0x0a, 0x02, 0x09, 0x01, 0x08, 0x00);

  for (j = 0; j < vectorizable_elements; j += sizeof(__m256i)) {
    /* Advance the offset into the type by the vector size (in bytes), unless this is
       the initial iteration and the type size is not a multiple of the vector size.
       In that case, only advance by the number of bytes necessary so that the number
       of remaining bytes in the type will be a multiple of the vector size. */
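    /* For example (illustrative sizes only): with bytesoftype == 24,
       vecs_per_el.rem == 8, so the offsets visited are 0 and then 8; the two
       16-byte tiles cover bytes 0-15 and 8-23 of each element, and the rows
       they both touch are simply rewritten with the same values. */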
    size_t offset_into_type;
    for (offset_into_type = 0; offset_into_type < bytesoftype;
      offset_into_type += (offset_into_type == 0 && vecs_per_el.rem > 0 ? vecs_per_el.rem : sizeof(__m128i))) {

      /* Fetch elements in groups of 512 bytes */
      const uint8_t* const src_with_offset = src + offset_into_type;
      for (k = 0; k < 16; k++) {
        ymm0[k] = _mm256_loadu2_m128i(
          (__m128i*)(src_with_offset + (j + (2 * k) + 1) * bytesoftype),
          (__m128i*)(src_with_offset + (j + (2 * k)) * bytesoftype));
      }
      /* Transpose bytes */
      for (k = 0, l = 0; k < 8; k++, l += 2) {
        ymm1[k*2] = _mm256_unpacklo_epi8(ymm0[l], ymm0[l+1]);
        ymm1[k*2+1] = _mm256_unpackhi_epi8(ymm0[l], ymm0[l+1]);
      }
      /* Transpose words */
      for (k = 0, l = -2; k < 8; k++, l++) {
        if ((k%2) == 0) l += 2;
        ymm0[k*2] = _mm256_unpacklo_epi16(ymm1[l], ymm1[l+2]);
        ymm0[k*2+1] = _mm256_unpackhi_epi16(ymm1[l], ymm1[l+2]);
      }
      /* Transpose double words */
      for (k = 0, l = -4; k < 8; k++, l++) {
        if ((k%4) == 0) l += 4;
        ymm1[k*2] = _mm256_unpacklo_epi32(ymm0[l], ymm0[l+4]);
        ymm1[k*2+1] = _mm256_unpackhi_epi32(ymm0[l], ymm0[l+4]);
      }
      /* Transpose quad words */
      for (k = 0; k < 8; k++) {
        ymm0[k*2] = _mm256_unpacklo_epi64(ymm1[k], ymm1[k+8]);
        ymm0[k*2+1] = _mm256_unpackhi_epi64(ymm1[k], ymm1[k+8]);
      }
      for (k = 0; k < 16; k++) {
        ymm0[k] = _mm256_permute4x64_epi64(ymm0[k], 0xd8);
        ymm0[k] = _mm256_shuffle_epi8(ymm0[k], shmask);
      }
      /* Store the result vectors */
      uint8_t* const dest_for_jth_element = dest + j;
      for (k = 0; k < 16; k++) {
        _mm256_storeu_si256((__m256i*)(dest_for_jth_element + (total_elements * (offset_into_type + k))), ymm0[k]);
      }
    }
  }
}

/* Routine optimized for unshuffling a buffer for a type size of 2 bytes. */
static void
unshuffle2_avx2(uint8_t* const dest, const uint8_t* const src,
  const size_t vectorizable_elements, const size_t total_elements)
{
  static const size_t bytesoftype = 2;
  size_t i;
  int j;
  __m256i ymm0[2], ymm1[2];

  for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) {
    /* Load 32 elements (64 bytes) into 2 YMM registers. */
    const uint8_t* const src_for_ith_element = src + i;
    for (j = 0; j < 2; j++) {
      ymm0[j] = _mm256_loadu_si256((__m256i*)(src_for_ith_element + (j * total_elements)));
    }
    /* Shuffle bytes */
    for (j = 0; j < 2; j++) {
      ymm0[j] = _mm256_permute4x64_epi64(ymm0[j], 0xd8);
    }
    /* Compute the low 64 bytes */
    ymm1[0] = _mm256_unpacklo_epi8(ymm0[0], ymm0[1]);
    /* Compute the hi 64 bytes */
    ymm1[1] = _mm256_unpackhi_epi8(ymm0[0], ymm0[1]);
    /* Store the result vectors in proper order */
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (0 * sizeof(__m256i))), ymm1[0]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (1 * sizeof(__m256i))), ymm1[1]);
  }
}

/* Routine optimized for unshuffling a buffer for a type size of 4 bytes. */
static void
unshuffle4_avx2(uint8_t* const dest, const uint8_t* const src,
  const size_t vectorizable_elements, const size_t total_elements)
{
  static const size_t bytesoftype = 4;
  size_t i;
  int j;
  __m256i ymm0[4], ymm1[4];

  for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) {
    /* Load 32 elements (128 bytes) into 4 YMM registers. */
    const uint8_t* const src_for_ith_element = src + i;
    for (j = 0; j < 4; j++) {
      ymm0[j] = _mm256_loadu_si256((__m256i*)(src_for_ith_element + (j * total_elements)));
    }
    /* Shuffle bytes */
    for (j = 0; j < 2; j++) {
      /* Compute the low 64 bytes */
      ymm1[j] = _mm256_unpacklo_epi8(ymm0[j*2], ymm0[j*2+1]);
      /* Compute the hi 64 bytes */
      ymm1[2+j] = _mm256_unpackhi_epi8(ymm0[j*2], ymm0[j*2+1]);
    }
    /* Shuffle 2-byte words */
    for (j = 0; j < 2; j++) {
      /* Compute the low 64 bytes */
      ymm0[j] = _mm256_unpacklo_epi16(ymm1[j*2], ymm1[j*2+1]);
      /* Compute the hi 64 bytes */
      ymm0[2+j] = _mm256_unpackhi_epi16(ymm1[j*2], ymm1[j*2+1]);
    }
    ymm1[0] = _mm256_permute2x128_si256(ymm0[0], ymm0[2], 0x20);
    ymm1[1] = _mm256_permute2x128_si256(ymm0[1], ymm0[3], 0x20);
    ymm1[2] = _mm256_permute2x128_si256(ymm0[0], ymm0[2], 0x31);
    ymm1[3] = _mm256_permute2x128_si256(ymm0[1], ymm0[3], 0x31);

    /* Store the result vectors in proper order */
    for (j = 0; j < 4; j++) {
      _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (j * sizeof(__m256i))), ymm1[j]);
    }
  }
}

/* Routine optimized for unshuffling a buffer for a type size of 8 bytes. */
static void
unshuffle8_avx2(uint8_t* const dest, const uint8_t* const src,
  const size_t vectorizable_elements, const size_t total_elements)
{
  static const size_t bytesoftype = 8;
  size_t i;
  int j;
  __m256i ymm0[8], ymm1[8];

  for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) {
    /* Fetch 32 elements (256 bytes) into 8 YMM registers. */
    const uint8_t* const src_for_ith_element = src + i;
    for (j = 0; j < 8; j++) {
      ymm0[j] = _mm256_loadu_si256((__m256i*)(src_for_ith_element + (j * total_elements)));
    }
    /* Shuffle bytes */
    for (j = 0; j < 4; j++) {
      /* Compute the low 32 bytes */
      ymm1[j] = _mm256_unpacklo_epi8(ymm0[j*2], ymm0[j*2+1]);
      /* Compute the hi 32 bytes */
      ymm1[4+j] = _mm256_unpackhi_epi8(ymm0[j*2], ymm0[j*2+1]);
    }
    /* Shuffle words */
    for (j = 0; j < 4; j++) {
      /* Compute the low 32 bytes */
      ymm0[j] = _mm256_unpacklo_epi16(ymm1[j*2], ymm1[j*2+1]);
      /* Compute the hi 32 bytes */
      ymm0[4+j] = _mm256_unpackhi_epi16(ymm1[j*2], ymm1[j*2+1]);
    }
    for (j = 0; j < 8; j++) {
      ymm0[j] = _mm256_permute4x64_epi64(ymm0[j], 0xd8);
    }

    /* Shuffle 4-byte dwords */
    for (j = 0; j < 4; j++) {
      /* Compute the low 32 bytes */
      ymm1[j] = _mm256_unpacklo_epi32(ymm0[j*2], ymm0[j*2+1]);
      /* Compute the hi 32 bytes */
      ymm1[4+j] = _mm256_unpackhi_epi32(ymm0[j*2], ymm0[j*2+1]);
    }

    /* Store the result vectors in proper order */
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (0 * sizeof(__m256i))), ymm1[0]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (1 * sizeof(__m256i))), ymm1[2]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (2 * sizeof(__m256i))), ymm1[1]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (3 * sizeof(__m256i))), ymm1[3]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (4 * sizeof(__m256i))), ymm1[4]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (5 * sizeof(__m256i))), ymm1[6]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (6 * sizeof(__m256i))), ymm1[5]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (7 * sizeof(__m256i))), ymm1[7]);
  }
}

/* Routine optimized for unshuffling a buffer for a type size of 16 bytes. */
static void
unshuffle16_avx2(uint8_t* const dest, const uint8_t* const src,
  const size_t vectorizable_elements, const size_t total_elements)
{
  static const size_t bytesoftype = 16;
  size_t i;
  int j;
  __m256i ymm0[16], ymm1[16];

  for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) {
    /* Fetch 32 elements (512 bytes) into 16 YMM registers. */
    const uint8_t* const src_for_ith_element = src + i;
    for (j = 0; j < 16; j++) {
      ymm0[j] = _mm256_loadu_si256((__m256i*)(src_for_ith_element + (j * total_elements)));
    }

    /* Shuffle bytes */
    for (j = 0; j < 8; j++) {
      /* Compute the low 32 bytes */
      ymm1[j] = _mm256_unpacklo_epi8(ymm0[j*2], ymm0[j*2+1]);
      /* Compute the hi 32 bytes */
      ymm1[8+j] = _mm256_unpackhi_epi8(ymm0[j*2], ymm0[j*2+1]);
    }
    /* Shuffle 2-byte words */
    for (j = 0; j < 8; j++) {
      /* Compute the low 32 bytes */
      ymm0[j] = _mm256_unpacklo_epi16(ymm1[j*2], ymm1[j*2+1]);
      /* Compute the hi 32 bytes */
      ymm0[8+j] = _mm256_unpackhi_epi16(ymm1[j*2], ymm1[j*2+1]);
    }
    /* Shuffle 4-byte dwords */
    for (j = 0; j < 8; j++) {
      /* Compute the low 32 bytes */
      ymm1[j] = _mm256_unpacklo_epi32(ymm0[j*2], ymm0[j*2+1]);
      /* Compute the hi 32 bytes */
      ymm1[8+j] = _mm256_unpackhi_epi32(ymm0[j*2], ymm0[j*2+1]);
    }

    /* Shuffle 8-byte qwords */
    for (j = 0; j < 8; j++) {
      /* Compute the low 32 bytes */
      ymm0[j] = _mm256_unpacklo_epi64(ymm1[j*2], ymm1[j*2+1]);
      /* Compute the hi 32 bytes */
      ymm0[8+j] = _mm256_unpackhi_epi64(ymm1[j*2], ymm1[j*2+1]);
    }

    for (j = 0; j < 8; j++) {
      ymm1[j] = _mm256_permute2x128_si256(ymm0[j], ymm0[j+8], 0x20);
      ymm1[j+8] = _mm256_permute2x128_si256(ymm0[j], ymm0[j+8], 0x31);
    }

    /* Store the result vectors in proper order */
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (0 * sizeof(__m256i))), ymm1[0]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (1 * sizeof(__m256i))), ymm1[4]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (2 * sizeof(__m256i))), ymm1[2]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (3 * sizeof(__m256i))), ymm1[6]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (4 * sizeof(__m256i))), ymm1[1]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (5 * sizeof(__m256i))), ymm1[5]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (6 * sizeof(__m256i))), ymm1[3]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (7 * sizeof(__m256i))), ymm1[7]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (8 * sizeof(__m256i))), ymm1[8]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (9 * sizeof(__m256i))), ymm1[12]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (10 * sizeof(__m256i))), ymm1[10]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (11 * sizeof(__m256i))), ymm1[14]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (12 * sizeof(__m256i))), ymm1[9]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (13 * sizeof(__m256i))), ymm1[13]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (14 * sizeof(__m256i))), ymm1[11]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (15 * sizeof(__m256i))), ymm1[15]);
  }
}

/* Routine optimized for unshuffling a buffer for a type size larger than 16 bytes. */
static void
unshuffle16_tiled_avx2(uint8_t* const dest, const uint8_t* const src,
  const size_t vectorizable_elements, const size_t total_elements, const size_t bytesoftype)
{
  size_t i;
  int j;
  __m256i ymm0[16], ymm1[16];

  const lldiv_t vecs_per_el = lldiv(bytesoftype, sizeof(__m128i));

  /* The unshuffle loops are inverted (compared to shuffle16_tiled_avx2)
     to optimize cache utilization. */
  size_t offset_into_type;
  for (offset_into_type = 0; offset_into_type < bytesoftype;
    offset_into_type += (offset_into_type == 0 && vecs_per_el.rem > 0 ? vecs_per_el.rem : sizeof(__m128i))) {
    for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) {
      /* Load the first 16 bytes of 32 adjacent elements (512 bytes) into 16 YMM registers */
      const uint8_t* const src_for_ith_element = src + i;
      for (j = 0; j < 16; j++) {
        ymm0[j] = _mm256_loadu_si256((__m256i*)(src_for_ith_element + (total_elements * (offset_into_type + j))));
      }

      /* Shuffle bytes */
      for (j = 0; j < 8; j++) {
        /* Compute the low 32 bytes */
        ymm1[j] = _mm256_unpacklo_epi8(ymm0[j*2], ymm0[j*2+1]);
        /* Compute the hi 32 bytes */
        ymm1[8+j] = _mm256_unpackhi_epi8(ymm0[j*2], ymm0[j*2+1]);
      }
      /* Shuffle 2-byte words */
      for (j = 0; j < 8; j++) {
        /* Compute the low 32 bytes */
        ymm0[j] = _mm256_unpacklo_epi16(ymm1[j*2], ymm1[j*2+1]);
        /* Compute the hi 32 bytes */
        ymm0[8+j] = _mm256_unpackhi_epi16(ymm1[j*2], ymm1[j*2+1]);
      }
      /* Shuffle 4-byte dwords */
      for (j = 0; j < 8; j++) {
        /* Compute the low 32 bytes */
        ymm1[j] = _mm256_unpacklo_epi32(ymm0[j*2], ymm0[j*2+1]);
        /* Compute the hi 32 bytes */
        ymm1[8+j] = _mm256_unpackhi_epi32(ymm0[j*2], ymm0[j*2+1]);
      }

      /* Shuffle 8-byte qwords */
      for (j = 0; j < 8; j++) {
        /* Compute the low 32 bytes */
        ymm0[j] = _mm256_unpacklo_epi64(ymm1[j*2], ymm1[j*2+1]);
        /* Compute the hi 32 bytes */
        ymm0[8+j] = _mm256_unpackhi_epi64(ymm1[j*2], ymm1[j*2+1]);
      }

      for (j = 0; j < 8; j++) {
        ymm1[j] = _mm256_permute2x128_si256(ymm0[j], ymm0[j+8], 0x20);
        ymm1[j+8] = _mm256_permute2x128_si256(ymm0[j], ymm0[j+8], 0x31);
      }

      /* Store the result vectors in proper order */
      uint8_t* const dest_with_offset = dest + offset_into_type;
      _mm256_storeu2_m128i(
        (__m128i*)(dest_with_offset + (i + 0x01) * bytesoftype),
        (__m128i*)(dest_with_offset + (i + 0x00) * bytesoftype), ymm1[0]);
      _mm256_storeu2_m128i(
        (__m128i*)(dest_with_offset + (i + 0x03) * bytesoftype),
        (__m128i*)(dest_with_offset + (i + 0x02) * bytesoftype), ymm1[4]);
      _mm256_storeu2_m128i(
        (__m128i*)(dest_with_offset + (i + 0x05) * bytesoftype),
        (__m128i*)(dest_with_offset + (i + 0x04) * bytesoftype), ymm1[2]);
      _mm256_storeu2_m128i(
        (__m128i*)(dest_with_offset + (i + 0x07) * bytesoftype),
        (__m128i*)(dest_with_offset + (i + 0x06) * bytesoftype), ymm1[6]);
      _mm256_storeu2_m128i(
        (__m128i*)(dest_with_offset + (i + 0x09) * bytesoftype),
        (__m128i*)(dest_with_offset + (i + 0x08) * bytesoftype), ymm1[1]);
      _mm256_storeu2_m128i(
        (__m128i*)(dest_with_offset + (i + 0x0b) * bytesoftype),
        (__m128i*)(dest_with_offset + (i + 0x0a) * bytesoftype), ymm1[5]);
      _mm256_storeu2_m128i(
        (__m128i*)(dest_with_offset + (i + 0x0d) * bytesoftype),
        (__m128i*)(dest_with_offset + (i + 0x0c) * bytesoftype), ymm1[3]);
      _mm256_storeu2_m128i(
        (__m128i*)(dest_with_offset + (i + 0x0f) * bytesoftype),
        (__m128i*)(dest_with_offset + (i + 0x0e) * bytesoftype), ymm1[7]);
      _mm256_storeu2_m128i(
        (__m128i*)(dest_with_offset + (i + 0x11) * bytesoftype),
        (__m128i*)(dest_with_offset + (i + 0x10) * bytesoftype), ymm1[8]);
      _mm256_storeu2_m128i(
        (__m128i*)(dest_with_offset + (i + 0x13) * bytesoftype),
        (__m128i*)(dest_with_offset + (i + 0x12) * bytesoftype), ymm1[12]);
      _mm256_storeu2_m128i(
        (__m128i*)(dest_with_offset + (i + 0x15) * bytesoftype),
        (__m128i*)(dest_with_offset + (i + 0x14) * bytesoftype), ymm1[10]);
      _mm256_storeu2_m128i(
        (__m128i*)(dest_with_offset + (i + 0x17) * bytesoftype),
        (__m128i*)(dest_with_offset + (i + 0x16) * bytesoftype), ymm1[14]);
      _mm256_storeu2_m128i(
        (__m128i*)(dest_with_offset + (i + 0x19) * bytesoftype),
        (__m128i*)(dest_with_offset + (i + 0x18) * bytesoftype), ymm1[9]);
      _mm256_storeu2_m128i(
        (__m128i*)(dest_with_offset + (i + 0x1b) * bytesoftype),
        (__m128i*)(dest_with_offset + (i + 0x1a) * bytesoftype), ymm1[13]);
      _mm256_storeu2_m128i(
        (__m128i*)(dest_with_offset + (i + 0x1d) * bytesoftype),
        (__m128i*)(dest_with_offset + (i + 0x1c) * bytesoftype), ymm1[11]);
      _mm256_storeu2_m128i(
        (__m128i*)(dest_with_offset + (i + 0x1f) * bytesoftype),
        (__m128i*)(dest_with_offset + (i + 0x1e) * bytesoftype), ymm1[15]);
    }
  }
}

/* Shuffle a block.  This can never fail. */
void
blosc_internal_shuffle_avx2(const size_t bytesoftype, const size_t blocksize,
                            const uint8_t* const _src, uint8_t* const _dest) {
  const size_t vectorized_chunk_size = bytesoftype * sizeof(__m256i);

  /* If the block size is too small to be vectorized,
     use the generic implementation. */
  if (blocksize < vectorized_chunk_size) {
    blosc_internal_shuffle_generic(bytesoftype, blocksize, _src, _dest);
    return;
  }

  /* If the blocksize is not a multiple of both the typesize and
     the vector size, round the blocksize down to the next value
     which is a multiple of both. The vectorized shuffle can be
     used for that portion of the data, and the naive implementation
     can be used for the remaining portion. */
  const size_t vectorizable_bytes = blocksize - (blocksize % vectorized_chunk_size);
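  /* Worked example (illustrative numbers only): bytesoftype == 4 and
     blocksize == 300 give vectorized_chunk_size == 128, so vectorizable_bytes
     == 256 and the trailing 44 bytes are left to shuffle_generic_inline(). */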

  const size_t vectorizable_elements = vectorizable_bytes / bytesoftype;
  const size_t total_elements = blocksize / bytesoftype;

  /* Optimized shuffle implementations */
  switch (bytesoftype)
  {
  case 2:
    shuffle2_avx2(_dest, _src, vectorizable_elements, total_elements);
    break;
  case 4:
    shuffle4_avx2(_dest, _src, vectorizable_elements, total_elements);
    break;
  case 8:
    shuffle8_avx2(_dest, _src, vectorizable_elements, total_elements);
    break;
  case 16:
    shuffle16_avx2(_dest, _src, vectorizable_elements, total_elements);
    break;
  default:
    /* For types larger than 16 bytes, use the AVX2 tiled shuffle. */
    if (bytesoftype > sizeof(__m128i)) {
      shuffle16_tiled_avx2(_dest, _src, vectorizable_elements, total_elements, bytesoftype);
    }
    else {
      /* Non-optimized shuffle */
      blosc_internal_shuffle_generic(bytesoftype, blocksize, _src, _dest);
      /* The non-optimized function covers the whole buffer,
         so we're done processing here. */
      return;
    }
  }

  /* If the buffer had any bytes at the end which couldn't be handled
     by the vectorized implementations, use the non-optimized version
     to finish them up. */
  if (vectorizable_bytes < blocksize) {
    shuffle_generic_inline(bytesoftype, vectorizable_bytes, blocksize, _src, _dest);
  }
}

/* Unshuffle a block.  This can never fail. */
void
blosc_internal_unshuffle_avx2(const size_t bytesoftype, const size_t blocksize,
                              const uint8_t* const _src, uint8_t* const _dest) {
  const size_t vectorized_chunk_size = bytesoftype * sizeof(__m256i);

  /* If the block size is too small to be vectorized,
     use the generic implementation. */
  if (blocksize < vectorized_chunk_size) {
    blosc_internal_unshuffle_generic(bytesoftype, blocksize, _src, _dest);
    return;
  }

  /* If the blocksize is not a multiple of both the typesize and
     the vector size, round the blocksize down to the next value
     which is a multiple of both. The vectorized unshuffle can be
     used for that portion of the data, and the naive implementation
     can be used for the remaining portion. */
  const size_t vectorizable_bytes = blocksize - (blocksize % vectorized_chunk_size);

  const size_t vectorizable_elements = vectorizable_bytes / bytesoftype;
  const size_t total_elements = blocksize / bytesoftype;

  /* Optimized unshuffle implementations */
  switch (bytesoftype)
  {
  case 2:
    unshuffle2_avx2(_dest, _src, vectorizable_elements, total_elements);
    break;
  case 4:
    unshuffle4_avx2(_dest, _src, vectorizable_elements, total_elements);
    break;
  case 8:
    unshuffle8_avx2(_dest, _src, vectorizable_elements, total_elements);
    break;
  case 16:
    unshuffle16_avx2(_dest, _src, vectorizable_elements, total_elements);
    break;
  default:
    /* For types larger than 16 bytes, use the AVX2 tiled unshuffle. */
    if (bytesoftype > sizeof(__m128i)) {
      unshuffle16_tiled_avx2(_dest, _src, vectorizable_elements, total_elements, bytesoftype);
    }
    else {
      /* Non-optimized unshuffle */
      blosc_internal_unshuffle_generic(bytesoftype, blocksize, _src, _dest);
      /* The non-optimized function covers the whole buffer,
         so we're done processing here. */
      return;
    }
  }

  /* If the buffer had any bytes at the end which couldn't be handled
     by the vectorized implementations, use the non-optimized version
     to finish them up. */
  if (vectorizable_bytes < blocksize) {
    unshuffle_generic_inline(bytesoftype, vectorizable_bytes, blocksize, _src, _dest);
  }
}