/*********************************************************************
  Blosc - Blocked Shuffling and Compression Library

  Copyright (C) 2021  The Blosc Developers <blosc@blosc.org>
  https://blosc.org
  License: BSD 3-Clause (see LICENSE.txt)

  See LICENSE.txt for details about copyright and rights to use.
**********************************************************************/

#include "shuffle-generic.h"
#include "shuffle-avx2.h"

/* Make sure AVX2 is available for the compilation target and compiler. */
#if !defined(__AVX2__)
  #error AVX2 is not supported by the target architecture/platform and/or this compiler.
#endif

#include <immintrin.h>


/* The next is useful for debugging purposes */
#if 0
#include <stdio.h>
#include <string.h>

static void printymm(__m256i ymm0)
{
  uint8_t buf[32];

  ((__m256i *)buf)[0] = ymm0;
  printf("%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,"
         "%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x\n",
          buf[0], buf[1], buf[2], buf[3],
          buf[4], buf[5], buf[6], buf[7],
          buf[8], buf[9], buf[10], buf[11],
          buf[12], buf[13], buf[14], buf[15],
          buf[16], buf[17], buf[18], buf[19],
          buf[20], buf[21], buf[22], buf[23],
          buf[24], buf[25], buf[26], buf[27],
          buf[28], buf[29], buf[30], buf[31]);
}
#endif

/* GCC doesn't include the split load/store intrinsics
   needed for the tiled shuffle, so define them here. */
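/* NOTE: these definitions are meant to match the semantics of the intrinsics
   of the same name provided by clang/ICC: the low 128 bits are loaded from /
   stored to 'loaddr' and the high 128 bits from / to 'hiaddr'. */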
#if defined(__GNUC__) && !defined(__clang__) && !defined(__ICC)
static inline __m256i
__attribute__((__always_inline__))
_mm256_loadu2_m128i(const __m128i* const hiaddr, const __m128i* const loaddr) {
  return _mm256_inserti128_si256(
      _mm256_castsi128_si256(_mm_loadu_si128(loaddr)), _mm_loadu_si128(hiaddr), 1);
}

static inline void
__attribute__((__always_inline__))
_mm256_storeu2_m128i(__m128i* const hiaddr, __m128i* const loaddr, const __m256i a) {
  _mm_storeu_si128(loaddr, _mm256_castsi256_si128(a));
  _mm_storeu_si128(hiaddr, _mm256_extracti128_si256(a, 1));
}
#endif  /* defined(__GNUC__) */

/* Routine optimized for shuffling a buffer for a type size of 2 bytes. */
static void
shuffle2_avx2(uint8_t* const dest, const uint8_t* const src,
              const int32_t vectorizable_elements, const int32_t total_elements) {
  static const int32_t bytesoftype = 2;
  int32_t j;
  int k;
  __m256i ymm0[2], ymm1[2];

  /* Create the shuffle mask.
     NOTE: The XMM/YMM 'set' intrinsics require the arguments to be ordered from
     most to least significant (i.e., their order is reversed when compared to
     loading the mask from an array). */
  const __m256i shmask = _mm256_set_epi8(
      0x0f, 0x0d, 0x0b, 0x09, 0x07, 0x05, 0x03, 0x01,
      0x0e, 0x0c, 0x0a, 0x08, 0x06, 0x04, 0x02, 0x00,
      0x0f, 0x0d, 0x0b, 0x09, 0x07, 0x05, 0x03, 0x01,
      0x0e, 0x0c, 0x0a, 0x08, 0x06, 0x04, 0x02, 0x00);
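  /* Within each 128-bit lane, shuffling with this mask gathers the even
     (first-of-pair) bytes into the low 8 bytes of the lane and the odd
     (second-of-pair) bytes into the high 8 bytes. */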

  for (j = 0; j < vectorizable_elements; j += sizeof(__m256i)) {
    /* Fetch 32 elements (64 bytes) then transpose bytes, words and double words. */
    for (k = 0; k < 2; k++) {
      ymm0[k] = _mm256_loadu_si256((__m256i*)(src + (j * bytesoftype) + (k * sizeof(__m256i))));
      ymm1[k] = _mm256_shuffle_epi8(ymm0[k], shmask);
    }

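    /* Recombine across the 128-bit lanes: 0xd8 selects qwords {0,2,1,3},
       0x8d selects {1,3,0,2} and 0x4e swaps the two 128-bit halves, so after
       the blends ymm1[0] holds the 32 first bytes and ymm1[1] the 32 second
       bytes of the input elements. */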
    ymm0[0] = _mm256_permute4x64_epi64(ymm1[0], 0xd8);
    ymm0[1] = _mm256_permute4x64_epi64(ymm1[1], 0x8d);

    ymm1[0] = _mm256_blend_epi32(ymm0[0], ymm0[1], 0xf0);
    ymm0[1] = _mm256_blend_epi32(ymm0[0], ymm0[1], 0x0f);
    ymm1[1] = _mm256_permute4x64_epi64(ymm0[1], 0x4e);

    /* Store the result vectors */
    uint8_t* const dest_for_jth_element = dest + j;
    for (k = 0; k < 2; k++) {
      _mm256_storeu_si256((__m256i*)(dest_for_jth_element + (k * total_elements)), ymm1[k]);
    }
  }
}

/* Routine optimized for shuffling a buffer for a type size of 4 bytes. */
static void
shuffle4_avx2(uint8_t* const dest, const uint8_t* const src,
              const int32_t vectorizable_elements, const int32_t total_elements) {
  static const int32_t bytesoftype = 4;
  int32_t i;
  int j;
  __m256i ymm0[4], ymm1[4];

  /* Create the shuffle mask.
     NOTE: The XMM/YMM 'set' intrinsics require the arguments to be ordered from
     most to least significant (i.e., their order is reversed when compared to
     loading the mask from an array). */
  const __m256i mask = _mm256_set_epi32(
      0x07, 0x03, 0x06, 0x02, 0x05, 0x01, 0x04, 0x00);
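  /* permutevar8x32 with this mask reorders dwords as {0,4,1,5,2,6,3,7}, i.e.
     it interleaves the dwords of the two 128-bit lanes (the only cross-lane
     step in this routine). */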

  for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) {
    /* Fetch 32 elements (128 bytes) then transpose bytes and words. */
    for (j = 0; j < 4; j++) {
      ymm0[j] = _mm256_loadu_si256((__m256i*)(src + (i * bytesoftype) + (j * sizeof(__m256i))));
      ymm1[j] = _mm256_shuffle_epi32(ymm0[j], 0xd8);
      ymm0[j] = _mm256_shuffle_epi32(ymm0[j], 0x8d);
      ymm0[j] = _mm256_unpacklo_epi8(ymm1[j], ymm0[j]);
      ymm1[j] = _mm256_shuffle_epi32(ymm0[j], 0x04e);
      ymm0[j] = _mm256_unpacklo_epi16(ymm0[j], ymm1[j]);
    }
    /* Transpose double words */
    for (j = 0; j < 2; j++) {
      ymm1[j * 2] = _mm256_unpacklo_epi32(ymm0[j * 2], ymm0[j * 2 + 1]);
      ymm1[j * 2 + 1] = _mm256_unpackhi_epi32(ymm0[j * 2], ymm0[j * 2 + 1]);
    }
    /* Transpose quad words */
    for (j = 0; j < 2; j++) {
      ymm0[j * 2] = _mm256_unpacklo_epi64(ymm1[j], ymm1[j + 2]);
      ymm0[j * 2 + 1] = _mm256_unpackhi_epi64(ymm1[j], ymm1[j + 2]);
    }
    for (j = 0; j < 4; j++) {
      ymm0[j] = _mm256_permutevar8x32_epi32(ymm0[j], mask);
    }
    /* Store the result vectors */
    uint8_t* const dest_for_ith_element = dest + i;
    for (j = 0; j < 4; j++) {
      _mm256_storeu_si256((__m256i*)(dest_for_ith_element + (j * total_elements)), ymm0[j]);
    }
  }
}

/* Routine optimized for shuffling a buffer for a type size of 8 bytes. */
static void
shuffle8_avx2(uint8_t* const dest, const uint8_t* const src,
              const int32_t vectorizable_elements, const int32_t total_elements) {
  static const int32_t bytesoftype = 8;
  int32_t j;
  int k, l;
  __m256i ymm0[8], ymm1[8];

  for (j = 0; j < vectorizable_elements; j += sizeof(__m256i)) {
    /* Fetch 32 elements (256 bytes) then transpose bytes. */
    for (k = 0; k < 8; k++) {
      ymm0[k] = _mm256_loadu_si256((__m256i*)(src + (j * bytesoftype) + (k * sizeof(__m256i))));
      ymm1[k] = _mm256_shuffle_epi32(ymm0[k], 0x4e);
      ymm1[k] = _mm256_unpacklo_epi8(ymm0[k], ymm1[k]);
    }
    /* Transpose words */
    for (k = 0, l = 0; k < 4; k++, l += 2) {
      ymm0[k * 2] = _mm256_unpacklo_epi16(ymm1[l], ymm1[l + 1]);
      ymm0[k * 2 + 1] = _mm256_unpackhi_epi16(ymm1[l], ymm1[l + 1]);
    }
    /* Transpose double words */
    for (k = 0, l = 0; k < 4; k++, l++) {
      if (k == 2) l += 2;
      ymm1[k * 2] = _mm256_unpacklo_epi32(ymm0[l], ymm0[l + 2]);
      ymm1[k * 2 + 1] = _mm256_unpackhi_epi32(ymm0[l], ymm0[l + 2]);
    }
    /* Transpose quad words */
    for (k = 0; k < 4; k++) {
      ymm0[k * 2] = _mm256_unpacklo_epi64(ymm1[k], ymm1[k + 4]);
      ymm0[k * 2 + 1] = _mm256_unpackhi_epi64(ymm1[k], ymm1[k + 4]);
    }
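    /* Final cross-lane fix-up: 0x72 selects qwords {2,0,3,1} and 0xD8 selects
       {0,2,1,3}; the following unpack leaves each register holding one byte
       plane of the 32 elements, ready to be stored contiguously. */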
    for (k = 0; k < 8; k++) {
      ymm1[k] = _mm256_permute4x64_epi64(ymm0[k], 0x72);
      ymm0[k] = _mm256_permute4x64_epi64(ymm0[k], 0xD8);
      ymm0[k] = _mm256_unpacklo_epi16(ymm0[k], ymm1[k]);
    }
    /* Store the result vectors */
    uint8_t* const dest_for_jth_element = dest + j;
    for (k = 0; k < 8; k++) {
      _mm256_storeu_si256((__m256i*)(dest_for_jth_element + (k * total_elements)), ymm0[k]);
    }
  }
}

/* Routine optimized for shuffling a buffer for a type size of 16 bytes. */
static void
shuffle16_avx2(uint8_t* const dest, const uint8_t* const src,
               const int32_t vectorizable_elements, const int32_t total_elements) {
  static const int32_t bytesoftype = 16;
  int32_t j;
  int k, l;
  __m256i ymm0[16], ymm1[16];

  /* Create the shuffle mask.
     NOTE: The XMM/YMM 'set' intrinsics require the arguments to be ordered from
     most to least significant (i.e., their order is reversed when compared to
     loading the mask from an array). */
  const __m256i shmask = _mm256_set_epi8(
      0x0f, 0x07, 0x0e, 0x06, 0x0d, 0x05, 0x0c, 0x04,
      0x0b, 0x03, 0x0a, 0x02, 0x09, 0x01, 0x08, 0x00,
      0x0f, 0x07, 0x0e, 0x06, 0x0d, 0x05, 0x0c, 0x04,
      0x0b, 0x03, 0x0a, 0x02, 0x09, 0x01, 0x08, 0x00);
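  /* Within each 128-bit lane, shuffling with this mask interleaves the low and
     high 8-byte halves byte by byte ({0,8,1,9,...,7,15}). */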

  for (j = 0; j < vectorizable_elements; j += sizeof(__m256i)) {
    /* Fetch 32 elements (512 bytes) into 16 YMM registers. */
    for (k = 0; k < 16; k++) {
      ymm0[k] = _mm256_loadu_si256((__m256i*)(src + (j * bytesoftype) + (k * sizeof(__m256i))));
    }
    /* Transpose bytes */
    for (k = 0, l = 0; k < 8; k++, l += 2) {
      ymm1[k * 2] = _mm256_unpacklo_epi8(ymm0[l], ymm0[l + 1]);
      ymm1[k * 2 + 1] = _mm256_unpackhi_epi8(ymm0[l], ymm0[l + 1]);
    }
    /* Transpose words */
    for (k = 0, l = -2; k < 8; k++, l++) {
      if ((k % 2) == 0) l += 2;
      ymm0[k * 2] = _mm256_unpacklo_epi16(ymm1[l], ymm1[l + 2]);
      ymm0[k * 2 + 1] = _mm256_unpackhi_epi16(ymm1[l], ymm1[l + 2]);
    }
    /* Transpose double words */
    for (k = 0, l = -4; k < 8; k++, l++) {
      if ((k % 4) == 0) l += 4;
      ymm1[k * 2] = _mm256_unpacklo_epi32(ymm0[l], ymm0[l + 4]);
      ymm1[k * 2 + 1] = _mm256_unpackhi_epi32(ymm0[l], ymm0[l + 4]);
    }
    /* Transpose quad words */
    for (k = 0; k < 8; k++) {
      ymm0[k * 2] = _mm256_unpacklo_epi64(ymm1[k], ymm1[k + 8]);
      ymm0[k * 2 + 1] = _mm256_unpackhi_epi64(ymm1[k], ymm1[k + 8]);
    }
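    /* Cross the 128-bit lanes (0xd8 selects qwords {0,2,1,3}) and reorder bytes
       within each lane so that every register ends up holding one byte plane of
       the 32 elements. */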
    for (k = 0; k < 16; k++) {
      ymm0[k] = _mm256_permute4x64_epi64(ymm0[k], 0xd8);
      ymm0[k] = _mm256_shuffle_epi8(ymm0[k], shmask);
    }
    /* Store the result vectors */
    uint8_t* const dest_for_jth_element = dest + j;
    for (k = 0; k < 16; k++) {
      _mm256_storeu_si256((__m256i*)(dest_for_jth_element + (k * total_elements)), ymm0[k]);
    }
  }
}

/* Routine optimized for shuffling a buffer for a type size larger than 16 bytes. */
static void
shuffle16_tiled_avx2(uint8_t* const dest, const uint8_t* const src,
                     const int32_t vectorizable_elements, const int32_t total_elements, const int32_t bytesoftype) {
  int32_t j;
  int k, l;
  __m256i ymm0[16], ymm1[16];

  const lldiv_t vecs_per_el = lldiv(bytesoftype, sizeof(__m128i));
  const int32_t vecs_rem = (int32_t)vecs_per_el.rem;
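  /* vecs_per_el.quot is the number of whole 16-byte vectors per element;
     vecs_rem is the number of leftover bytes (non-zero when the type size is
     not a multiple of 16).  Only the remainder is used below. */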

  /* Create the shuffle mask.
     NOTE: The XMM/YMM 'set' intrinsics require the arguments to be ordered from
     most to least significant (i.e., their order is reversed when compared to
     loading the mask from an array). */
  const __m256i shmask = _mm256_set_epi8(
      0x0f, 0x07, 0x0e, 0x06, 0x0d, 0x05, 0x0c, 0x04,
      0x0b, 0x03, 0x0a, 0x02, 0x09, 0x01, 0x08, 0x00,
      0x0f, 0x07, 0x0e, 0x06, 0x0d, 0x05, 0x0c, 0x04,
      0x0b, 0x03, 0x0a, 0x02, 0x09, 0x01, 0x08, 0x00);

  for (j = 0; j < vectorizable_elements; j += sizeof(__m256i)) {
    /* Advance the offset into the type by the vector size (in bytes), unless this is
       the initial iteration and the type size is not a multiple of the vector size.
       In that case, only advance by the number of bytes necessary so that the number
       of remaining bytes in the type will be a multiple of the vector size. */
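    /* For example, with a (hypothetical) 20-byte type the offsets visited are
       0 and 4, so byte positions 4..15 are simply processed twice, writing the
       same data to the same destination planes. */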
    int32_t offset_into_type;
    for (offset_into_type = 0; offset_into_type < bytesoftype;
         offset_into_type += (offset_into_type == 0 && vecs_rem > 0 ? vecs_rem : (int32_t)sizeof(__m128i))) {

      /* Fetch elements in groups of 512 bytes */
      const uint8_t* const src_with_offset = src + offset_into_type;
      for (k = 0; k < 16; k++) {
        ymm0[k] = _mm256_loadu2_m128i(
            (__m128i*)(src_with_offset + (j + (2 * k) + 1) * bytesoftype),
            (__m128i*)(src_with_offset + (j + (2 * k)) * bytesoftype));
      }
      /* Transpose bytes */
      for (k = 0, l = 0; k < 8; k++, l += 2) {
        ymm1[k * 2] = _mm256_unpacklo_epi8(ymm0[l], ymm0[l + 1]);
        ymm1[k * 2 + 1] = _mm256_unpackhi_epi8(ymm0[l], ymm0[l + 1]);
      }
      /* Transpose words */
      for (k = 0, l = -2; k < 8; k++, l++) {
        if ((k % 2) == 0) l += 2;
        ymm0[k * 2] = _mm256_unpacklo_epi16(ymm1[l], ymm1[l + 2]);
        ymm0[k * 2 + 1] = _mm256_unpackhi_epi16(ymm1[l], ymm1[l + 2]);
      }
      /* Transpose double words */
      for (k = 0, l = -4; k < 8; k++, l++) {
        if ((k % 4) == 0) l += 4;
        ymm1[k * 2] = _mm256_unpacklo_epi32(ymm0[l], ymm0[l + 4]);
        ymm1[k * 2 + 1] = _mm256_unpackhi_epi32(ymm0[l], ymm0[l + 4]);
      }
      /* Transpose quad words */
      for (k = 0; k < 8; k++) {
        ymm0[k * 2] = _mm256_unpacklo_epi64(ymm1[k], ymm1[k + 8]);
        ymm0[k * 2 + 1] = _mm256_unpackhi_epi64(ymm1[k], ymm1[k + 8]);
      }
      for (k = 0; k < 16; k++) {
        ymm0[k] = _mm256_permute4x64_epi64(ymm0[k], 0xd8);
        ymm0[k] = _mm256_shuffle_epi8(ymm0[k], shmask);
      }
      /* Store the result vectors */
      uint8_t* const dest_for_jth_element = dest + j;
      for (k = 0; k < 16; k++) {
        _mm256_storeu_si256((__m256i*)(dest_for_jth_element + (total_elements * (offset_into_type + k))), ymm0[k]);
      }
    }
  }
}

/* Routine optimized for unshuffling a buffer for a type size of 2 bytes. */
static void
unshuffle2_avx2(uint8_t* const dest, const uint8_t* const src,
                const int32_t vectorizable_elements, const int32_t total_elements) {
  static const int32_t bytesoftype = 2;
  int32_t i;
  int j;
  __m256i ymm0[2], ymm1[2];

  for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) {
    /* Load 32 elements (64 bytes) into 2 YMM registers. */
    const uint8_t* const src_for_ith_element = src + i;
    for (j = 0; j < 2; j++) {
      ymm0[j] = _mm256_loadu_si256((__m256i*)(src_for_ith_element + (j * total_elements)));
    }
    /* Shuffle bytes */
    for (j = 0; j < 2; j++) {
      ymm0[j] = _mm256_permute4x64_epi64(ymm0[j], 0xd8);
    }
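    /* The unpack intrinsics below interleave within 128-bit lanes only, so the
       0xd8 permute above pre-arranges the quadwords so that the interleave
       yields the elements in their original order. */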
    /* Compute the low 32 bytes */
    ymm1[0] = _mm256_unpacklo_epi8(ymm0[0], ymm0[1]);
    /* Compute the hi 32 bytes */
    ymm1[1] = _mm256_unpackhi_epi8(ymm0[0], ymm0[1]);
    /* Store the result vectors in proper order */
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (0 * sizeof(__m256i))), ymm1[0]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (1 * sizeof(__m256i))), ymm1[1]);
  }
}

/* Routine optimized for unshuffling a buffer for a type size of 4 bytes. */
static void
unshuffle4_avx2(uint8_t* const dest, const uint8_t* const src,
                const int32_t vectorizable_elements, const int32_t total_elements) {
  static const int32_t bytesoftype = 4;
  int32_t i;
  int j;
  __m256i ymm0[4], ymm1[4];

  for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) {
    /* Load 32 elements (128 bytes) into 4 YMM registers. */
    const uint8_t* const src_for_ith_element = src + i;
    for (j = 0; j < 4; j++) {
      ymm0[j] = _mm256_loadu_si256((__m256i*)(src_for_ith_element + (j * total_elements)));
    }
    /* Shuffle bytes */
    for (j = 0; j < 2; j++) {
      /* Compute the low 32 bytes */
      ymm1[j] = _mm256_unpacklo_epi8(ymm0[j * 2], ymm0[j * 2 + 1]);
      /* Compute the hi 32 bytes */
      ymm1[2 + j] = _mm256_unpackhi_epi8(ymm0[j * 2], ymm0[j * 2 + 1]);
    }
    /* Shuffle 2-byte words */
    for (j = 0; j < 2; j++) {
      /* Compute the low 32 bytes */
      ymm0[j] = _mm256_unpacklo_epi16(ymm1[j * 2], ymm1[j * 2 + 1]);
      /* Compute the hi 32 bytes */
      ymm0[2 + j] = _mm256_unpackhi_epi16(ymm1[j * 2], ymm1[j * 2 + 1]);
    }
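    /* permute2x128: selector 0x20 concatenates the low 128-bit halves of its
       two inputs, 0x31 the high halves. */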
    ymm1[0] = _mm256_permute2x128_si256(ymm0[0], ymm0[2], 0x20);
    ymm1[1] = _mm256_permute2x128_si256(ymm0[1], ymm0[3], 0x20);
    ymm1[2] = _mm256_permute2x128_si256(ymm0[0], ymm0[2], 0x31);
    ymm1[3] = _mm256_permute2x128_si256(ymm0[1], ymm0[3], 0x31);

    /* Store the result vectors in proper order */
    for (j = 0; j < 4; j++) {
      _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (j * sizeof(__m256i))), ymm1[j]);
    }
  }
}

/* Routine optimized for unshuffling a buffer for a type size of 8 bytes. */
static void
unshuffle8_avx2(uint8_t* const dest, const uint8_t* const src,
                const int32_t vectorizable_elements, const int32_t total_elements) {
  static const int32_t bytesoftype = 8;
  int32_t i;
  int j;
  __m256i ymm0[8], ymm1[8];

  for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) {
    /* Fetch 32 elements (256 bytes) into 8 YMM registers. */
    const uint8_t* const src_for_ith_element = src + i;
    for (j = 0; j < 8; j++) {
      ymm0[j] = _mm256_loadu_si256((__m256i*)(src_for_ith_element + (j * total_elements)));
    }
    /* Shuffle bytes */
    for (j = 0; j < 4; j++) {
      /* Compute the low 32 bytes */
      ymm1[j] = _mm256_unpacklo_epi8(ymm0[j * 2], ymm0[j * 2 + 1]);
      /* Compute the hi 32 bytes */
      ymm1[4 + j] = _mm256_unpackhi_epi8(ymm0[j * 2], ymm0[j * 2 + 1]);
    }
    /* Shuffle words */
    for (j = 0; j < 4; j++) {
      /* Compute the low 32 bytes */
      ymm0[j] = _mm256_unpacklo_epi16(ymm1[j * 2], ymm1[j * 2 + 1]);
      /* Compute the hi 32 bytes */
      ymm0[4 + j] = _mm256_unpackhi_epi16(ymm1[j * 2], ymm1[j * 2 + 1]);
    }
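    /* Cross the 128-bit lanes (0xd8 selects qwords {0,2,1,3}) before the
       in-lane dword unpacks below. */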
    for (j = 0; j < 8; j++) {
      ymm0[j] = _mm256_permute4x64_epi64(ymm0[j], 0xd8);
    }

    /* Shuffle 4-byte dwords */
    for (j = 0; j < 4; j++) {
      /* Compute the low 32 bytes */
      ymm1[j] = _mm256_unpacklo_epi32(ymm0[j * 2], ymm0[j * 2 + 1]);
      /* Compute the hi 32 bytes */
      ymm1[4 + j] = _mm256_unpackhi_epi32(ymm0[j * 2], ymm0[j * 2 + 1]);
    }

    /* Store the result vectors in proper order */
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (0 * sizeof(__m256i))), ymm1[0]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (1 * sizeof(__m256i))), ymm1[2]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (2 * sizeof(__m256i))), ymm1[1]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (3 * sizeof(__m256i))), ymm1[3]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (4 * sizeof(__m256i))), ymm1[4]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (5 * sizeof(__m256i))), ymm1[6]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (6 * sizeof(__m256i))), ymm1[5]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (7 * sizeof(__m256i))), ymm1[7]);
  }
}

/* Routine optimized for unshuffling a buffer for a type size of 16 bytes. */
static void
unshuffle16_avx2(uint8_t* const dest, const uint8_t* const src,
                 const int32_t vectorizable_elements, const int32_t total_elements) {
  static const int32_t bytesoftype = 16;
  int32_t i;
  int j;
  __m256i ymm0[16], ymm1[16];

  for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) {
    /* Fetch 32 elements (512 bytes) into 16 YMM registers. */
    const uint8_t* const src_for_ith_element = src + i;
    for (j = 0; j < 16; j++) {
      ymm0[j] = _mm256_loadu_si256((__m256i*)(src_for_ith_element + (j * total_elements)));
    }

    /* Shuffle bytes */
    for (j = 0; j < 8; j++) {
      /* Compute the low 32 bytes */
      ymm1[j] = _mm256_unpacklo_epi8(ymm0[j * 2], ymm0[j * 2 + 1]);
      /* Compute the hi 32 bytes */
      ymm1[8 + j] = _mm256_unpackhi_epi8(ymm0[j * 2], ymm0[j * 2 + 1]);
    }
    /* Shuffle 2-byte words */
    for (j = 0; j < 8; j++) {
      /* Compute the low 32 bytes */
      ymm0[j] = _mm256_unpacklo_epi16(ymm1[j * 2], ymm1[j * 2 + 1]);
      /* Compute the hi 32 bytes */
      ymm0[8 + j] = _mm256_unpackhi_epi16(ymm1[j * 2], ymm1[j * 2 + 1]);
    }
    /* Shuffle 4-byte dwords */
    for (j = 0; j < 8; j++) {
      /* Compute the low 32 bytes */
      ymm1[j] = _mm256_unpacklo_epi32(ymm0[j * 2], ymm0[j * 2 + 1]);
      /* Compute the hi 32 bytes */
      ymm1[8 + j] = _mm256_unpackhi_epi32(ymm0[j * 2], ymm0[j * 2 + 1]);
    }

    /* Shuffle 8-byte qwords */
    for (j = 0; j < 8; j++) {
      /* Compute the low 32 bytes */
      ymm0[j] = _mm256_unpacklo_epi64(ymm1[j * 2], ymm1[j * 2 + 1]);
      /* Compute the hi 32 bytes */
      ymm0[8 + j] = _mm256_unpackhi_epi64(ymm1[j * 2], ymm1[j * 2 + 1]);
    }

    for (j = 0; j < 8; j++) {
      ymm1[j] = _mm256_permute2x128_si256(ymm0[j], ymm0[j + 8], 0x20);
      ymm1[j + 8] = _mm256_permute2x128_si256(ymm0[j], ymm0[j + 8], 0x31);
    }

    /* Store the result vectors in proper order */
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (0 * sizeof(__m256i))), ymm1[0]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (1 * sizeof(__m256i))), ymm1[4]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (2 * sizeof(__m256i))), ymm1[2]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (3 * sizeof(__m256i))), ymm1[6]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (4 * sizeof(__m256i))), ymm1[1]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (5 * sizeof(__m256i))), ymm1[5]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (6 * sizeof(__m256i))), ymm1[3]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (7 * sizeof(__m256i))), ymm1[7]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (8 * sizeof(__m256i))), ymm1[8]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (9 * sizeof(__m256i))), ymm1[12]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (10 * sizeof(__m256i))), ymm1[10]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (11 * sizeof(__m256i))), ymm1[14]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (12 * sizeof(__m256i))), ymm1[9]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (13 * sizeof(__m256i))), ymm1[13]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (14 * sizeof(__m256i))), ymm1[11]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (15 * sizeof(__m256i))), ymm1[15]);
  }
}

/* Routine optimized for unshuffling a buffer for a type size larger than 16 bytes. */
static void
unshuffle16_tiled_avx2(uint8_t* const dest, const uint8_t* const src,
                       const int32_t vectorizable_elements, const int32_t total_elements, const int32_t bytesoftype) {
  int32_t i;
  int j;
  __m256i ymm0[16], ymm1[16];

  const lldiv_t vecs_per_el = lldiv(bytesoftype, sizeof(__m128i));
  const int32_t vecs_rem = (int32_t)vecs_per_el.rem;

  /* The unshuffle loops are inverted (compared to shuffle16_tiled_avx2)
     to optimize cache utilization. */
  int32_t offset_into_type;
  for (offset_into_type = 0; offset_into_type < bytesoftype;
       offset_into_type += (offset_into_type == 0 && vecs_rem > 0 ? vecs_rem : (int32_t)sizeof(__m128i))) {
    for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) {
      /* Load the first 16 bytes of 32 adjacent elements (512 bytes) into 16 YMM registers */
      const uint8_t* const src_for_ith_element = src + i;
      for (j = 0; j < 16; j++) {
        ymm0[j] = _mm256_loadu_si256((__m256i*)(src_for_ith_element + (total_elements * (offset_into_type + j))));
      }

      /* Shuffle bytes */
      for (j = 0; j < 8; j++) {
        /* Compute the low 32 bytes */
        ymm1[j] = _mm256_unpacklo_epi8(ymm0[j * 2], ymm0[j * 2 + 1]);
        /* Compute the hi 32 bytes */
        ymm1[8 + j] = _mm256_unpackhi_epi8(ymm0[j * 2], ymm0[j * 2 + 1]);
      }
      /* Shuffle 2-byte words */
      for (j = 0; j < 8; j++) {
        /* Compute the low 32 bytes */
        ymm0[j] = _mm256_unpacklo_epi16(ymm1[j * 2], ymm1[j * 2 + 1]);
        /* Compute the hi 32 bytes */
        ymm0[8 + j] = _mm256_unpackhi_epi16(ymm1[j * 2], ymm1[j * 2 + 1]);
      }
      /* Shuffle 4-byte dwords */
      for (j = 0; j < 8; j++) {
        /* Compute the low 32 bytes */
        ymm1[j] = _mm256_unpacklo_epi32(ymm0[j * 2], ymm0[j * 2 + 1]);
        /* Compute the hi 32 bytes */
        ymm1[8 + j] = _mm256_unpackhi_epi32(ymm0[j * 2], ymm0[j * 2 + 1]);
      }

      /* Shuffle 8-byte qwords */
      for (j = 0; j < 8; j++) {
        /* Compute the low 32 bytes */
        ymm0[j] = _mm256_unpacklo_epi64(ymm1[j * 2], ymm1[j * 2 + 1]);
        /* Compute the hi 32 bytes */
        ymm0[8 + j] = _mm256_unpackhi_epi64(ymm1[j * 2], ymm1[j * 2 + 1]);
      }

      for (j = 0; j < 8; j++) {
        ymm1[j] = _mm256_permute2x128_si256(ymm0[j], ymm0[j + 8], 0x20);
        ymm1[j + 8] = _mm256_permute2x128_si256(ymm0[j], ymm0[j + 8], 0x31);
      }

      /* Store the result vectors in proper order */
      uint8_t* const dest_with_offset = dest + offset_into_type;
      _mm256_storeu2_m128i(
          (__m128i*)(dest_with_offset + (i + 0x01) * bytesoftype),
          (__m128i*)(dest_with_offset + (i + 0x00) * bytesoftype), ymm1[0]);
      _mm256_storeu2_m128i(
          (__m128i*)(dest_with_offset + (i + 0x03) * bytesoftype),
          (__m128i*)(dest_with_offset + (i + 0x02) * bytesoftype), ymm1[4]);
      _mm256_storeu2_m128i(
          (__m128i*)(dest_with_offset + (i + 0x05) * bytesoftype),
          (__m128i*)(dest_with_offset + (i + 0x04) * bytesoftype), ymm1[2]);
      _mm256_storeu2_m128i(
          (__m128i*)(dest_with_offset + (i + 0x07) * bytesoftype),
          (__m128i*)(dest_with_offset + (i + 0x06) * bytesoftype), ymm1[6]);
      _mm256_storeu2_m128i(
          (__m128i*)(dest_with_offset + (i + 0x09) * bytesoftype),
          (__m128i*)(dest_with_offset + (i + 0x08) * bytesoftype), ymm1[1]);
      _mm256_storeu2_m128i(
          (__m128i*)(dest_with_offset + (i + 0x0b) * bytesoftype),
          (__m128i*)(dest_with_offset + (i + 0x0a) * bytesoftype), ymm1[5]);
      _mm256_storeu2_m128i(
          (__m128i*)(dest_with_offset + (i + 0x0d) * bytesoftype),
          (__m128i*)(dest_with_offset + (i + 0x0c) * bytesoftype), ymm1[3]);
      _mm256_storeu2_m128i(
          (__m128i*)(dest_with_offset + (i + 0x0f) * bytesoftype),
          (__m128i*)(dest_with_offset + (i + 0x0e) * bytesoftype), ymm1[7]);
      _mm256_storeu2_m128i(
          (__m128i*)(dest_with_offset + (i + 0x11) * bytesoftype),
          (__m128i*)(dest_with_offset + (i + 0x10) * bytesoftype), ymm1[8]);
      _mm256_storeu2_m128i(
          (__m128i*)(dest_with_offset + (i + 0x13) * bytesoftype),
          (__m128i*)(dest_with_offset + (i + 0x12) * bytesoftype), ymm1[12]);
      _mm256_storeu2_m128i(
          (__m128i*)(dest_with_offset + (i + 0x15) * bytesoftype),
          (__m128i*)(dest_with_offset + (i + 0x14) * bytesoftype), ymm1[10]);
      _mm256_storeu2_m128i(
          (__m128i*)(dest_with_offset + (i + 0x17) * bytesoftype),
          (__m128i*)(dest_with_offset + (i + 0x16) * bytesoftype), ymm1[14]);
      _mm256_storeu2_m128i(
          (__m128i*)(dest_with_offset + (i + 0x19) * bytesoftype),
          (__m128i*)(dest_with_offset + (i + 0x18) * bytesoftype), ymm1[9]);
      _mm256_storeu2_m128i(
          (__m128i*)(dest_with_offset + (i + 0x1b) * bytesoftype),
          (__m128i*)(dest_with_offset + (i + 0x1a) * bytesoftype), ymm1[13]);
      _mm256_storeu2_m128i(
          (__m128i*)(dest_with_offset + (i + 0x1d) * bytesoftype),
          (__m128i*)(dest_with_offset + (i + 0x1c) * bytesoftype), ymm1[11]);
      _mm256_storeu2_m128i(
          (__m128i*)(dest_with_offset + (i + 0x1f) * bytesoftype),
          (__m128i*)(dest_with_offset + (i + 0x1e) * bytesoftype), ymm1[15]);
    }
  }
}

/* Shuffle a block.  This can never fail. */
void
shuffle_avx2(const int32_t bytesoftype, const int32_t blocksize,
             const uint8_t *_src, uint8_t *_dest) {
  const int32_t vectorized_chunk_size = bytesoftype * sizeof(__m256i);

  /* If the block size is too small to be vectorized,
     use the generic implementation. */
  if (blocksize < vectorized_chunk_size) {
    shuffle_generic(bytesoftype, blocksize, _src, _dest);
    return;
  }

  /* If the blocksize is not a multiple of both the typesize and
     the vector size, round the blocksize down to the next value
     which is a multiple of both. The vectorized shuffle can be
     used for that portion of the data, and the naive implementation
     can be used for the remaining portion. */
  const int32_t vectorizable_bytes = blocksize - (blocksize % vectorized_chunk_size);

  const int32_t vectorizable_elements = vectorizable_bytes / bytesoftype;
  const int32_t total_elements = blocksize / bytesoftype;
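  /* Example (illustrative values only): for bytesoftype = 4 and blocksize = 1000,
     vectorized_chunk_size = 128, so vectorizable_bytes = 896,
     vectorizable_elements = 224 and total_elements = 250; the trailing
     1000 - 896 = 104 bytes are handled by shuffle_generic_inline() below. */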

  /* Optimized shuffle implementations */
  switch (bytesoftype) {
    case 2:
      shuffle2_avx2(_dest, _src, vectorizable_elements, total_elements);
      break;
    case 4:
      shuffle4_avx2(_dest, _src, vectorizable_elements, total_elements);
      break;
    case 8:
      shuffle8_avx2(_dest, _src, vectorizable_elements, total_elements);
      break;
    case 16:
      shuffle16_avx2(_dest, _src, vectorizable_elements, total_elements);
      break;
    default:
      /* For types larger than 16 bytes, use the AVX2 tiled shuffle. */
      if (bytesoftype > (int32_t)sizeof(__m128i)) {
        shuffle16_tiled_avx2(_dest, _src, vectorizable_elements, total_elements, bytesoftype);
      }
      else {
        /* Non-optimized shuffle */
        shuffle_generic(bytesoftype, blocksize, _src, _dest);
        /* The non-optimized function covers the whole buffer,
           so we're done processing here. */
        return;
      }
  }

  /* If the buffer had any bytes at the end which couldn't be handled
     by the vectorized implementations, use the non-optimized version
     to finish them up. */
  if (vectorizable_bytes < blocksize) {
    shuffle_generic_inline(bytesoftype, vectorizable_bytes, blocksize, _src, _dest);
  }
}

/* Unshuffle a block.  This can never fail. */
void
unshuffle_avx2(const int32_t bytesoftype, const int32_t blocksize,
               const uint8_t *_src, uint8_t *_dest) {
  const int32_t vectorized_chunk_size = bytesoftype * sizeof(__m256i);

  /* If the block size is too small to be vectorized,
     use the generic implementation. */
  if (blocksize < vectorized_chunk_size) {
    unshuffle_generic(bytesoftype, blocksize, _src, _dest);
    return;
  }

  /* If the blocksize is not a multiple of both the typesize and
     the vector size, round the blocksize down to the next value
     which is a multiple of both. The vectorized unshuffle can be
     used for that portion of the data, and the naive implementation
     can be used for the remaining portion. */
  const int32_t vectorizable_bytes = blocksize - (blocksize % vectorized_chunk_size);

  const int32_t vectorizable_elements = vectorizable_bytes / bytesoftype;
  const int32_t total_elements = blocksize / bytesoftype;

  /* Optimized unshuffle implementations */
  switch (bytesoftype) {
    case 2:
      unshuffle2_avx2(_dest, _src, vectorizable_elements, total_elements);
      break;
    case 4:
      unshuffle4_avx2(_dest, _src, vectorizable_elements, total_elements);
      break;
    case 8:
      unshuffle8_avx2(_dest, _src, vectorizable_elements, total_elements);
      break;
    case 16:
      unshuffle16_avx2(_dest, _src, vectorizable_elements, total_elements);
      break;
    default:
      /* For types larger than 16 bytes, use the AVX2 tiled unshuffle. */
      if (bytesoftype > (int32_t)sizeof(__m128i)) {
        unshuffle16_tiled_avx2(_dest, _src, vectorizable_elements, total_elements, bytesoftype);
      }
      else {
        /* Non-optimized unshuffle */
        unshuffle_generic(bytesoftype, blocksize, _src, _dest);
        /* The non-optimized function covers the whole buffer,
           so we're done processing here. */
        return;
      }
  }

  /* If the buffer had any bytes at the end which couldn't be handled
     by the vectorized implementations, use the non-optimized version
     to finish them up. */
  if (vectorizable_bytes < blocksize) {
    unshuffle_generic_inline(bytesoftype, vectorizable_bytes, blocksize, _src, _dest);
  }
}