/*********************************************************************
  Blosc - Blocked Shuffling and Compression Library

  Copyright (C) 2021 The Blosc Developers <blosc@blosc.org>
  https://blosc.org
  License: BSD 3-Clause (see LICENSE.txt)

  See LICENSE.txt for details about copyright and rights to use.
**********************************************************************/

#include "shuffle-generic.h"
#include "shuffle-avx2.h"

/* Make sure AVX2 is available for the compilation target and compiler. */
#if !defined(__AVX2__)
  #error AVX2 is not supported by the target architecture/platform and/or this compiler.
#endif

#include <immintrin.h>


/* The next is useful for debugging purposes */
#if 0
#include <stdio.h>
#include <string.h>

static void printymm(__m256i ymm0)
{
  uint8_t buf[32];

  ((__m256i *)buf)[0] = ymm0;
  printf("%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,"
         "%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x\n",
         buf[0], buf[1], buf[2], buf[3],
         buf[4], buf[5], buf[6], buf[7],
         buf[8], buf[9], buf[10], buf[11],
         buf[12], buf[13], buf[14], buf[15],
         buf[16], buf[17], buf[18], buf[19],
         buf[20], buf[21], buf[22], buf[23],
         buf[24], buf[25], buf[26], buf[27],
         buf[28], buf[29], buf[30], buf[31]);
}
#endif

/* GCC doesn't include the split load/store intrinsics
   needed for the tiled shuffle, so define them here. */
#if defined(__GNUC__) && !defined(__clang__) && !defined(__ICC)
static inline __m256i
__attribute__((__always_inline__))
_mm256_loadu2_m128i(const __m128i* const hiaddr, const __m128i* const loaddr) {
  return _mm256_inserti128_si256(
      _mm256_castsi128_si256(_mm_loadu_si128(loaddr)), _mm_loadu_si128(hiaddr), 1);
}

static inline void
__attribute__((__always_inline__))
_mm256_storeu2_m128i(__m128i* const hiaddr, __m128i* const loaddr, const __m256i a) {
  _mm_storeu_si128(loaddr, _mm256_castsi256_si128(a));
  _mm_storeu_si128(hiaddr, _mm256_extracti128_si256(a, 1));
}
#endif /* defined(__GNUC__) */

/* Routine optimized for shuffling a buffer for a type size of 2 bytes. */
static void
shuffle2_avx2(uint8_t* const dest, const uint8_t* const src,
              const int32_t vectorizable_elements, const int32_t total_elements) {
  static const int32_t bytesoftype = 2;
  int32_t j;
  int k;
  __m256i ymm0[2], ymm1[2];

  /* Create the shuffle mask.
     NOTE: The XMM/YMM 'set' intrinsics require the arguments to be ordered from
     most to least significant (i.e., their order is reversed when compared to
     loading the mask from an array). */
  const __m256i shmask = _mm256_set_epi8(
      0x0f, 0x0d, 0x0b, 0x09, 0x07, 0x05, 0x03, 0x01,
      0x0e, 0x0c, 0x0a, 0x08, 0x06, 0x04, 0x02, 0x00,
      0x0f, 0x0d, 0x0b, 0x09, 0x07, 0x05, 0x03, 0x01,
      0x0e, 0x0c, 0x0a, 0x08, 0x06, 0x04, 0x02, 0x00);

  for (j = 0; j < vectorizable_elements; j += sizeof(__m256i)) {
    /* Fetch 32 elements (64 bytes) then transpose bytes, words and double words. */
    for (k = 0; k < 2; k++) {
      ymm0[k] = _mm256_loadu_si256((__m256i*)(src + (j * bytesoftype) + (k * sizeof(__m256i))));
      ymm1[k] = _mm256_shuffle_epi8(ymm0[k], shmask);
    }

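    /* Gather the first byte of every element into ymm1[0] and the second byte
       into ymm1[1]: the byte shuffle above splits each 128-bit lane into its
       even- and odd-indexed bytes, and the 64-bit permutes plus 32-bit blends
       below regroup those halves across lanes. */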
    ymm0[0] = _mm256_permute4x64_epi64(ymm1[0], 0xd8);
    ymm0[1] = _mm256_permute4x64_epi64(ymm1[1], 0x8d);

    ymm1[0] = _mm256_blend_epi32(ymm0[0], ymm0[1], 0xf0);
    ymm0[1] = _mm256_blend_epi32(ymm0[0], ymm0[1], 0x0f);
    ymm1[1] = _mm256_permute4x64_epi64(ymm0[1], 0x4e);

    /* Store the result vectors */
    uint8_t* const dest_for_jth_element = dest + j;
    for (k = 0; k < 2; k++) {
      _mm256_storeu_si256((__m256i*)(dest_for_jth_element + (k * total_elements)), ymm1[k]);
    }
  }
}

/* Routine optimized for shuffling a buffer for a type size of 4 bytes. */
static void
shuffle4_avx2(uint8_t* const dest, const uint8_t* const src,
              const int32_t vectorizable_elements, const int32_t total_elements) {
  static const int32_t bytesoftype = 4;
  int32_t i;
  int j;
  __m256i ymm0[4], ymm1[4];

  /* Create the shuffle mask.
     NOTE: The XMM/YMM 'set' intrinsics require the arguments to be ordered from
     most to least significant (i.e., their order is reversed when compared to
     loading the mask from an array). */
  const __m256i mask = _mm256_set_epi32(
      0x07, 0x03, 0x06, 0x02, 0x05, 0x01, 0x04, 0x00);

  for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) {
    /* Fetch 32 elements (128 bytes) then transpose bytes and words. */
    for (j = 0; j < 4; j++) {
      ymm0[j] = _mm256_loadu_si256((__m256i*)(src + (i * bytesoftype) + (j * sizeof(__m256i))));
      ymm1[j] = _mm256_shuffle_epi32(ymm0[j], 0xd8);
      ymm0[j] = _mm256_shuffle_epi32(ymm0[j], 0x8d);
      ymm0[j] = _mm256_unpacklo_epi8(ymm1[j], ymm0[j]);
      ymm1[j] = _mm256_shuffle_epi32(ymm0[j], 0x4e);
      ymm0[j] = _mm256_unpacklo_epi16(ymm0[j], ymm1[j]);
    }
    /* Transpose double words */
    for (j = 0; j < 2; j++) {
      ymm1[j * 2] = _mm256_unpacklo_epi32(ymm0[j * 2], ymm0[j * 2 + 1]);
      ymm1[j * 2 + 1] = _mm256_unpackhi_epi32(ymm0[j * 2], ymm0[j * 2 + 1]);
    }
    /* Transpose quad words */
    for (j = 0; j < 2; j++) {
      ymm0[j * 2] = _mm256_unpacklo_epi64(ymm1[j], ymm1[j + 2]);
      ymm0[j * 2 + 1] = _mm256_unpackhi_epi64(ymm1[j], ymm1[j + 2]);
    }
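    /* The unpack intrinsics above work within 128-bit lanes; this cross-lane
       permute (dword order 0,4,1,5,2,6,3,7) interleaves the two lanes so each
       vector ends up holding one contiguous byte plane. */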
    for (j = 0; j < 4; j++) {
      ymm0[j] = _mm256_permutevar8x32_epi32(ymm0[j], mask);
    }
    /* Store the result vectors */
    uint8_t* const dest_for_ith_element = dest + i;
    for (j = 0; j < 4; j++) {
      _mm256_storeu_si256((__m256i*)(dest_for_ith_element + (j * total_elements)), ymm0[j]);
    }
  }
}

/* Routine optimized for shuffling a buffer for a type size of 8 bytes. */
static void
shuffle8_avx2(uint8_t* const dest, const uint8_t* const src,
              const int32_t vectorizable_elements, const int32_t total_elements) {
  static const int32_t bytesoftype = 8;
  int32_t j;
  int k, l;
  __m256i ymm0[8], ymm1[8];

  for (j = 0; j < vectorizable_elements; j += sizeof(__m256i)) {
    /* Fetch 32 elements (256 bytes) then transpose bytes. */
    for (k = 0; k < 8; k++) {
      ymm0[k] = _mm256_loadu_si256((__m256i*)(src + (j * bytesoftype) + (k * sizeof(__m256i))));
      ymm1[k] = _mm256_shuffle_epi32(ymm0[k], 0x4e);
      ymm1[k] = _mm256_unpacklo_epi8(ymm0[k], ymm1[k]);
    }
    /* Transpose words */
    for (k = 0, l = 0; k < 4; k++, l += 2) {
      ymm0[k * 2] = _mm256_unpacklo_epi16(ymm1[l], ymm1[l + 1]);
      ymm0[k * 2 + 1] = _mm256_unpackhi_epi16(ymm1[l], ymm1[l + 1]);
    }
    /* Transpose double words */
    for (k = 0, l = 0; k < 4; k++, l++) {
      if (k == 2) l += 2;
      ymm1[k * 2] = _mm256_unpacklo_epi32(ymm0[l], ymm0[l + 2]);
      ymm1[k * 2 + 1] = _mm256_unpackhi_epi32(ymm0[l], ymm0[l + 2]);
    }
    /* Transpose quad words */
    for (k = 0; k < 4; k++) {
      ymm0[k * 2] = _mm256_unpacklo_epi64(ymm1[k], ymm1[k + 4]);
      ymm0[k * 2 + 1] = _mm256_unpackhi_epi64(ymm1[k], ymm1[k + 4]);
    }
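    /* The unpacks above only interleave within 128-bit lanes, so reorder the
       64-bit quadwords across lanes and do one more 16-bit interleave to
       produce contiguous byte planes. */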
    for (k = 0; k < 8; k++) {
      ymm1[k] = _mm256_permute4x64_epi64(ymm0[k], 0x72);
      ymm0[k] = _mm256_permute4x64_epi64(ymm0[k], 0xd8);
      ymm0[k] = _mm256_unpacklo_epi16(ymm0[k], ymm1[k]);
    }
    /* Store the result vectors */
    uint8_t* const dest_for_jth_element = dest + j;
    for (k = 0; k < 8; k++) {
      _mm256_storeu_si256((__m256i*)(dest_for_jth_element + (k * total_elements)), ymm0[k]);
    }
  }
}

/* Routine optimized for shuffling a buffer for a type size of 16 bytes. */
static void
shuffle16_avx2(uint8_t* const dest, const uint8_t* const src,
               const int32_t vectorizable_elements, const int32_t total_elements) {
  static const int32_t bytesoftype = 16;
  int32_t j;
  int k, l;
  __m256i ymm0[16], ymm1[16];

  /* Create the shuffle mask.
     NOTE: The XMM/YMM 'set' intrinsics require the arguments to be ordered from
     most to least significant (i.e., their order is reversed when compared to
     loading the mask from an array). */
  const __m256i shmask = _mm256_set_epi8(
      0x0f, 0x07, 0x0e, 0x06, 0x0d, 0x05, 0x0c, 0x04,
      0x0b, 0x03, 0x0a, 0x02, 0x09, 0x01, 0x08, 0x00,
      0x0f, 0x07, 0x0e, 0x06, 0x0d, 0x05, 0x0c, 0x04,
      0x0b, 0x03, 0x0a, 0x02, 0x09, 0x01, 0x08, 0x00);

  for (j = 0; j < vectorizable_elements; j += sizeof(__m256i)) {
    /* Fetch 32 elements (512 bytes) into 16 YMM registers. */
    for (k = 0; k < 16; k++) {
      ymm0[k] = _mm256_loadu_si256((__m256i*)(src + (j * bytesoftype) + (k * sizeof(__m256i))));
    }
    /* Transpose bytes */
    for (k = 0, l = 0; k < 8; k++, l += 2) {
      ymm1[k * 2] = _mm256_unpacklo_epi8(ymm0[l], ymm0[l + 1]);
      ymm1[k * 2 + 1] = _mm256_unpackhi_epi8(ymm0[l], ymm0[l + 1]);
    }
    /* Transpose words */
    for (k = 0, l = -2; k < 8; k++, l++) {
      if ((k % 2) == 0) l += 2;
      ymm0[k * 2] = _mm256_unpacklo_epi16(ymm1[l], ymm1[l + 2]);
      ymm0[k * 2 + 1] = _mm256_unpackhi_epi16(ymm1[l], ymm1[l + 2]);
    }
    /* Transpose double words */
    for (k = 0, l = -4; k < 8; k++, l++) {
      if ((k % 4) == 0) l += 4;
      ymm1[k * 2] = _mm256_unpacklo_epi32(ymm0[l], ymm0[l + 4]);
      ymm1[k * 2 + 1] = _mm256_unpackhi_epi32(ymm0[l], ymm0[l + 4]);
    }
    /* Transpose quad words */
    for (k = 0; k < 8; k++) {
      ymm0[k * 2] = _mm256_unpacklo_epi64(ymm1[k], ymm1[k + 8]);
      ymm0[k * 2 + 1] = _mm256_unpackhi_epi64(ymm1[k], ymm1[k + 8]);
    }
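    /* Undo the lane-local ordering: move the 64-bit quadwords into cross-lane
       order, then reorder the bytes within each lane with shmask. */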
    for (k = 0; k < 16; k++) {
      ymm0[k] = _mm256_permute4x64_epi64(ymm0[k], 0xd8);
      ymm0[k] = _mm256_shuffle_epi8(ymm0[k], shmask);
    }
    /* Store the result vectors */
    uint8_t* const dest_for_jth_element = dest + j;
    for (k = 0; k < 16; k++) {
      _mm256_storeu_si256((__m256i*)(dest_for_jth_element + (k * total_elements)), ymm0[k]);
    }
  }
}

/* Routine optimized for shuffling a buffer for a type size larger than 16 bytes. */
static void
shuffle16_tiled_avx2(uint8_t* const dest, const uint8_t* const src,
                     const int32_t vectorizable_elements, const int32_t total_elements, const int32_t bytesoftype) {
  int32_t j;
  int k, l;
  __m256i ymm0[16], ymm1[16];

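  /* Only the remainder of this division is used below: the number of bytes in
     each element left over after taking as many whole 16-byte vectors as fit. */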
  const lldiv_t vecs_per_el = lldiv(bytesoftype, sizeof(__m128i));
  const int32_t vecs_rem = (int32_t)vecs_per_el.rem;

  /* Create the shuffle mask.
     NOTE: The XMM/YMM 'set' intrinsics require the arguments to be ordered from
     most to least significant (i.e., their order is reversed when compared to
     loading the mask from an array). */
  const __m256i shmask = _mm256_set_epi8(
      0x0f, 0x07, 0x0e, 0x06, 0x0d, 0x05, 0x0c, 0x04,
      0x0b, 0x03, 0x0a, 0x02, 0x09, 0x01, 0x08, 0x00,
      0x0f, 0x07, 0x0e, 0x06, 0x0d, 0x05, 0x0c, 0x04,
      0x0b, 0x03, 0x0a, 0x02, 0x09, 0x01, 0x08, 0x00);

  for (j = 0; j < vectorizable_elements; j += sizeof(__m256i)) {
    /* Advance the offset into the type by the vector size (in bytes), unless this is
       the initial iteration and the type size is not a multiple of the vector size.
       In that case, only advance by the number of bytes necessary so that the number
       of remaining bytes in the type will be a multiple of the vector size. */
    int32_t offset_into_type;
    for (offset_into_type = 0; offset_into_type < bytesoftype;
         offset_into_type += (offset_into_type == 0 && vecs_rem > 0 ? vecs_rem : (int32_t)sizeof(__m128i))) {

      /* Fetch elements in groups of 512 bytes */
      const uint8_t* const src_with_offset = src + offset_into_type;
      for (k = 0; k < 16; k++) {
        ymm0[k] = _mm256_loadu2_m128i(
            (__m128i*)(src_with_offset + (j + (2 * k) + 1) * bytesoftype),
            (__m128i*)(src_with_offset + (j + (2 * k)) * bytesoftype));
      }
      /* Transpose bytes */
      for (k = 0, l = 0; k < 8; k++, l += 2) {
        ymm1[k * 2] = _mm256_unpacklo_epi8(ymm0[l], ymm0[l + 1]);
        ymm1[k * 2 + 1] = _mm256_unpackhi_epi8(ymm0[l], ymm0[l + 1]);
      }
      /* Transpose words */
      for (k = 0, l = -2; k < 8; k++, l++) {
        if ((k % 2) == 0) l += 2;
        ymm0[k * 2] = _mm256_unpacklo_epi16(ymm1[l], ymm1[l + 2]);
        ymm0[k * 2 + 1] = _mm256_unpackhi_epi16(ymm1[l], ymm1[l + 2]);
      }
      /* Transpose double words */
      for (k = 0, l = -4; k < 8; k++, l++) {
        if ((k % 4) == 0) l += 4;
        ymm1[k * 2] = _mm256_unpacklo_epi32(ymm0[l], ymm0[l + 4]);
        ymm1[k * 2 + 1] = _mm256_unpackhi_epi32(ymm0[l], ymm0[l + 4]);
      }
      /* Transpose quad words */
      for (k = 0; k < 8; k++) {
        ymm0[k * 2] = _mm256_unpacklo_epi64(ymm1[k], ymm1[k + 8]);
        ymm0[k * 2 + 1] = _mm256_unpackhi_epi64(ymm1[k], ymm1[k + 8]);
      }
      for (k = 0; k < 16; k++) {
        ymm0[k] = _mm256_permute4x64_epi64(ymm0[k], 0xd8);
        ymm0[k] = _mm256_shuffle_epi8(ymm0[k], shmask);
      }
      /* Store the result vectors */
      uint8_t* const dest_for_jth_element = dest + j;
      for (k = 0; k < 16; k++) {
        _mm256_storeu_si256((__m256i*)(dest_for_jth_element + (total_elements * (offset_into_type + k))), ymm0[k]);
      }
    }
  }
}

/* Routine optimized for unshuffling a buffer for a type size of 2 bytes. */
static void
unshuffle2_avx2(uint8_t* const dest, const uint8_t* const src,
                const int32_t vectorizable_elements, const int32_t total_elements) {
  static const int32_t bytesoftype = 2;
  int32_t i;
  int j;
  __m256i ymm0[2], ymm1[2];

  for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) {
    /* Load 32 elements (64 bytes) into 2 YMM registers. */
    const uint8_t* const src_for_ith_element = src + i;
    for (j = 0; j < 2; j++) {
      ymm0[j] = _mm256_loadu_si256((__m256i*)(src_for_ith_element + (j * total_elements)));
    }
    /* Shuffle bytes */
    for (j = 0; j < 2; j++) {
      ymm0[j] = _mm256_permute4x64_epi64(ymm0[j], 0xd8);
    }
    /* Compute the low 32 bytes */
    ymm1[0] = _mm256_unpacklo_epi8(ymm0[0], ymm0[1]);
    /* Compute the hi 32 bytes */
    ymm1[1] = _mm256_unpackhi_epi8(ymm0[0], ymm0[1]);
    /* Store the result vectors in proper order */
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (0 * sizeof(__m256i))), ymm1[0]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (1 * sizeof(__m256i))), ymm1[1]);
  }
}

/* Routine optimized for unshuffling a buffer for a type size of 4 bytes. */
static void
unshuffle4_avx2(uint8_t* const dest, const uint8_t* const src,
                const int32_t vectorizable_elements, const int32_t total_elements) {
  static const int32_t bytesoftype = 4;
  int32_t i;
  int j;
  __m256i ymm0[4], ymm1[4];

  for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) {
    /* Load 32 elements (128 bytes) into 4 YMM registers. */
    const uint8_t* const src_for_ith_element = src + i;
    for (j = 0; j < 4; j++) {
      ymm0[j] = _mm256_loadu_si256((__m256i*)(src_for_ith_element + (j * total_elements)));
    }
    /* Shuffle bytes */
    for (j = 0; j < 2; j++) {
      /* Compute the low 64 bytes */
      ymm1[j] = _mm256_unpacklo_epi8(ymm0[j * 2], ymm0[j * 2 + 1]);
      /* Compute the hi 64 bytes */
      ymm1[2 + j] = _mm256_unpackhi_epi8(ymm0[j * 2], ymm0[j * 2 + 1]);
    }
    /* Shuffle 2-byte words */
    for (j = 0; j < 2; j++) {
      /* Compute the low 64 bytes */
      ymm0[j] = _mm256_unpacklo_epi16(ymm1[j * 2], ymm1[j * 2 + 1]);
      /* Compute the hi 64 bytes */
      ymm0[2 + j] = _mm256_unpackhi_epi16(ymm1[j * 2], ymm1[j * 2 + 1]);
    }
    ymm1[0] = _mm256_permute2x128_si256(ymm0[0], ymm0[2], 0x20);
    ymm1[1] = _mm256_permute2x128_si256(ymm0[1], ymm0[3], 0x20);
    ymm1[2] = _mm256_permute2x128_si256(ymm0[0], ymm0[2], 0x31);
    ymm1[3] = _mm256_permute2x128_si256(ymm0[1], ymm0[3], 0x31);

    /* Store the result vectors in proper order */
    for (j = 0; j < 4; j++) {
      _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (j * sizeof(__m256i))), ymm1[j]);
    }
  }
}

/* Routine optimized for unshuffling a buffer for a type size of 8 bytes. */
static void
unshuffle8_avx2(uint8_t* const dest, const uint8_t* const src,
                const int32_t vectorizable_elements, const int32_t total_elements) {
  static const int32_t bytesoftype = 8;
  int32_t i;
  int j;
  __m256i ymm0[8], ymm1[8];

  for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) {
    /* Fetch 32 elements (256 bytes) into 8 YMM registers. */
    const uint8_t* const src_for_ith_element = src + i;
    for (j = 0; j < 8; j++) {
      ymm0[j] = _mm256_loadu_si256((__m256i*)(src_for_ith_element + (j * total_elements)));
    }
    /* Shuffle bytes */
    for (j = 0; j < 4; j++) {
      /* Compute the low 32 bytes */
      ymm1[j] = _mm256_unpacklo_epi8(ymm0[j * 2], ymm0[j * 2 + 1]);
      /* Compute the hi 32 bytes */
      ymm1[4 + j] = _mm256_unpackhi_epi8(ymm0[j * 2], ymm0[j * 2 + 1]);
    }
    /* Shuffle words */
    for (j = 0; j < 4; j++) {
      /* Compute the low 32 bytes */
      ymm0[j] = _mm256_unpacklo_epi16(ymm1[j * 2], ymm1[j * 2 + 1]);
      /* Compute the hi 32 bytes */
      ymm0[4 + j] = _mm256_unpackhi_epi16(ymm1[j * 2], ymm1[j * 2 + 1]);
    }
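    /* Put the 64-bit quadwords back into cross-lane order before the final
       32-bit interleave. */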
    for (j = 0; j < 8; j++) {
      ymm0[j] = _mm256_permute4x64_epi64(ymm0[j], 0xd8);
    }

    /* Shuffle 4-byte dwords */
    for (j = 0; j < 4; j++) {
      /* Compute the low 32 bytes */
      ymm1[j] = _mm256_unpacklo_epi32(ymm0[j * 2], ymm0[j * 2 + 1]);
      /* Compute the hi 32 bytes */
      ymm1[4 + j] = _mm256_unpackhi_epi32(ymm0[j * 2], ymm0[j * 2 + 1]);
    }

    /* Store the result vectors in proper order */
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (0 * sizeof(__m256i))), ymm1[0]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (1 * sizeof(__m256i))), ymm1[2]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (2 * sizeof(__m256i))), ymm1[1]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (3 * sizeof(__m256i))), ymm1[3]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (4 * sizeof(__m256i))), ymm1[4]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (5 * sizeof(__m256i))), ymm1[6]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (6 * sizeof(__m256i))), ymm1[5]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (7 * sizeof(__m256i))), ymm1[7]);
  }
}

/* Routine optimized for unshuffling a buffer for a type size of 16 bytes. */
static void
unshuffle16_avx2(uint8_t* const dest, const uint8_t* const src,
                 const int32_t vectorizable_elements, const int32_t total_elements) {
  static const int32_t bytesoftype = 16;
  int32_t i;
  int j;
  __m256i ymm0[16], ymm1[16];

  for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) {
    /* Fetch 32 elements (512 bytes) into 16 YMM registers. */
    const uint8_t* const src_for_ith_element = src + i;
    for (j = 0; j < 16; j++) {
      ymm0[j] = _mm256_loadu_si256((__m256i*)(src_for_ith_element + (j * total_elements)));
    }

    /* Shuffle bytes */
    for (j = 0; j < 8; j++) {
      /* Compute the low 32 bytes */
      ymm1[j] = _mm256_unpacklo_epi8(ymm0[j * 2], ymm0[j * 2 + 1]);
      /* Compute the hi 32 bytes */
      ymm1[8 + j] = _mm256_unpackhi_epi8(ymm0[j * 2], ymm0[j * 2 + 1]);
    }
    /* Shuffle 2-byte words */
    for (j = 0; j < 8; j++) {
      /* Compute the low 32 bytes */
      ymm0[j] = _mm256_unpacklo_epi16(ymm1[j * 2], ymm1[j * 2 + 1]);
      /* Compute the hi 32 bytes */
      ymm0[8 + j] = _mm256_unpackhi_epi16(ymm1[j * 2], ymm1[j * 2 + 1]);
    }
    /* Shuffle 4-byte dwords */
    for (j = 0; j < 8; j++) {
      /* Compute the low 32 bytes */
      ymm1[j] = _mm256_unpacklo_epi32(ymm0[j * 2], ymm0[j * 2 + 1]);
      /* Compute the hi 32 bytes */
      ymm1[8 + j] = _mm256_unpackhi_epi32(ymm0[j * 2], ymm0[j * 2 + 1]);
    }

    /* Shuffle 8-byte qwords */
    for (j = 0; j < 8; j++) {
      /* Compute the low 32 bytes */
      ymm0[j] = _mm256_unpacklo_epi64(ymm1[j * 2], ymm1[j * 2 + 1]);
      /* Compute the hi 32 bytes */
      ymm0[8 + j] = _mm256_unpackhi_epi64(ymm1[j * 2], ymm1[j * 2 + 1]);
    }

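    /* Recombine the 128-bit lanes: selector 0x20 takes the low lane of each
       input, 0x31 takes the high lanes. */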
    for (j = 0; j < 8; j++) {
      ymm1[j] = _mm256_permute2x128_si256(ymm0[j], ymm0[j + 8], 0x20);
      ymm1[j + 8] = _mm256_permute2x128_si256(ymm0[j], ymm0[j + 8], 0x31);
    }

    /* Store the result vectors in proper order */
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (0 * sizeof(__m256i))), ymm1[0]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (1 * sizeof(__m256i))), ymm1[4]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (2 * sizeof(__m256i))), ymm1[2]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (3 * sizeof(__m256i))), ymm1[6]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (4 * sizeof(__m256i))), ymm1[1]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (5 * sizeof(__m256i))), ymm1[5]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (6 * sizeof(__m256i))), ymm1[3]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (7 * sizeof(__m256i))), ymm1[7]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (8 * sizeof(__m256i))), ymm1[8]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (9 * sizeof(__m256i))), ymm1[12]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (10 * sizeof(__m256i))), ymm1[10]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (11 * sizeof(__m256i))), ymm1[14]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (12 * sizeof(__m256i))), ymm1[9]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (13 * sizeof(__m256i))), ymm1[13]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (14 * sizeof(__m256i))), ymm1[11]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (15 * sizeof(__m256i))), ymm1[15]);
  }
}

/* Routine optimized for unshuffling a buffer for a type size larger than 16 bytes. */
static void
unshuffle16_tiled_avx2(uint8_t* const dest, const uint8_t* const src,
                       const int32_t vectorizable_elements, const int32_t total_elements, const int32_t bytesoftype) {
  int32_t i;
  int j;
  __m256i ymm0[16], ymm1[16];

  const lldiv_t vecs_per_el = lldiv(bytesoftype, sizeof(__m128i));
  const int32_t vecs_rem = (int32_t)vecs_per_el.rem;

  /* The unshuffle loops are inverted (compared to shuffle16_tiled_avx2)
     to optimize cache utilization. */
  int32_t offset_into_type;
  for (offset_into_type = 0; offset_into_type < bytesoftype;
       offset_into_type += (offset_into_type == 0 && vecs_rem > 0 ? vecs_rem : (int32_t)sizeof(__m128i))) {
    for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) {
      /* Load the first 16 bytes of 32 adjacent elements (512 bytes) into 16 YMM registers */
      const uint8_t* const src_for_ith_element = src + i;
      for (j = 0; j < 16; j++) {
        ymm0[j] = _mm256_loadu_si256((__m256i*)(src_for_ith_element + (total_elements * (offset_into_type + j))));
      }

      /* Shuffle bytes */
      for (j = 0; j < 8; j++) {
        /* Compute the low 32 bytes */
        ymm1[j] = _mm256_unpacklo_epi8(ymm0[j * 2], ymm0[j * 2 + 1]);
        /* Compute the hi 32 bytes */
        ymm1[8 + j] = _mm256_unpackhi_epi8(ymm0[j * 2], ymm0[j * 2 + 1]);
      }
      /* Shuffle 2-byte words */
      for (j = 0; j < 8; j++) {
        /* Compute the low 32 bytes */
        ymm0[j] = _mm256_unpacklo_epi16(ymm1[j * 2], ymm1[j * 2 + 1]);
        /* Compute the hi 32 bytes */
        ymm0[8 + j] = _mm256_unpackhi_epi16(ymm1[j * 2], ymm1[j * 2 + 1]);
      }
      /* Shuffle 4-byte dwords */
      for (j = 0; j < 8; j++) {
        /* Compute the low 32 bytes */
        ymm1[j] = _mm256_unpacklo_epi32(ymm0[j * 2], ymm0[j * 2 + 1]);
        /* Compute the hi 32 bytes */
        ymm1[8 + j] = _mm256_unpackhi_epi32(ymm0[j * 2], ymm0[j * 2 + 1]);
      }

      /* Shuffle 8-byte qwords */
      for (j = 0; j < 8; j++) {
        /* Compute the low 32 bytes */
        ymm0[j] = _mm256_unpacklo_epi64(ymm1[j * 2], ymm1[j * 2 + 1]);
        /* Compute the hi 32 bytes */
        ymm0[8 + j] = _mm256_unpackhi_epi64(ymm1[j * 2], ymm1[j * 2 + 1]);
      }

      for (j = 0; j < 8; j++) {
        ymm1[j] = _mm256_permute2x128_si256(ymm0[j], ymm0[j + 8], 0x20);
        ymm1[j + 8] = _mm256_permute2x128_si256(ymm0[j], ymm0[j + 8], 0x31);
      }

      /* Store the result vectors in proper order */
      uint8_t* const dest_with_offset = dest + offset_into_type;
      _mm256_storeu2_m128i(
          (__m128i*)(dest_with_offset + (i + 0x01) * bytesoftype),
          (__m128i*)(dest_with_offset + (i + 0x00) * bytesoftype), ymm1[0]);
      _mm256_storeu2_m128i(
          (__m128i*)(dest_with_offset + (i + 0x03) * bytesoftype),
          (__m128i*)(dest_with_offset + (i + 0x02) * bytesoftype), ymm1[4]);
      _mm256_storeu2_m128i(
          (__m128i*)(dest_with_offset + (i + 0x05) * bytesoftype),
          (__m128i*)(dest_with_offset + (i + 0x04) * bytesoftype), ymm1[2]);
      _mm256_storeu2_m128i(
          (__m128i*)(dest_with_offset + (i + 0x07) * bytesoftype),
          (__m128i*)(dest_with_offset + (i + 0x06) * bytesoftype), ymm1[6]);
      _mm256_storeu2_m128i(
          (__m128i*)(dest_with_offset + (i + 0x09) * bytesoftype),
          (__m128i*)(dest_with_offset + (i + 0x08) * bytesoftype), ymm1[1]);
      _mm256_storeu2_m128i(
          (__m128i*)(dest_with_offset + (i + 0x0b) * bytesoftype),
          (__m128i*)(dest_with_offset + (i + 0x0a) * bytesoftype), ymm1[5]);
      _mm256_storeu2_m128i(
          (__m128i*)(dest_with_offset + (i + 0x0d) * bytesoftype),
          (__m128i*)(dest_with_offset + (i + 0x0c) * bytesoftype), ymm1[3]);
      _mm256_storeu2_m128i(
          (__m128i*)(dest_with_offset + (i + 0x0f) * bytesoftype),
          (__m128i*)(dest_with_offset + (i + 0x0e) * bytesoftype), ymm1[7]);
      _mm256_storeu2_m128i(
          (__m128i*)(dest_with_offset + (i + 0x11) * bytesoftype),
          (__m128i*)(dest_with_offset + (i + 0x10) * bytesoftype), ymm1[8]);
      _mm256_storeu2_m128i(
          (__m128i*)(dest_with_offset + (i + 0x13) * bytesoftype),
          (__m128i*)(dest_with_offset + (i + 0x12) * bytesoftype), ymm1[12]);
      _mm256_storeu2_m128i(
          (__m128i*)(dest_with_offset + (i + 0x15) * bytesoftype),
          (__m128i*)(dest_with_offset + (i + 0x14) * bytesoftype), ymm1[10]);
      _mm256_storeu2_m128i(
          (__m128i*)(dest_with_offset + (i + 0x17) * bytesoftype),
          (__m128i*)(dest_with_offset + (i + 0x16) * bytesoftype), ymm1[14]);
      _mm256_storeu2_m128i(
          (__m128i*)(dest_with_offset + (i + 0x19) * bytesoftype),
          (__m128i*)(dest_with_offset + (i + 0x18) * bytesoftype), ymm1[9]);
      _mm256_storeu2_m128i(
          (__m128i*)(dest_with_offset + (i + 0x1b) * bytesoftype),
          (__m128i*)(dest_with_offset + (i + 0x1a) * bytesoftype), ymm1[13]);
      _mm256_storeu2_m128i(
          (__m128i*)(dest_with_offset + (i + 0x1d) * bytesoftype),
          (__m128i*)(dest_with_offset + (i + 0x1c) * bytesoftype), ymm1[11]);
      _mm256_storeu2_m128i(
          (__m128i*)(dest_with_offset + (i + 0x1f) * bytesoftype),
          (__m128i*)(dest_with_offset + (i + 0x1e) * bytesoftype), ymm1[15]);
    }
  }
}

/* Shuffle a block. This can never fail. */
void
shuffle_avx2(const int32_t bytesoftype, const int32_t blocksize,
             const uint8_t *_src, uint8_t *_dest) {
  const int32_t vectorized_chunk_size = bytesoftype * sizeof(__m256i);

  /* If the block size is too small to be vectorized,
     use the generic implementation. */
  if (blocksize < vectorized_chunk_size) {
    shuffle_generic(bytesoftype, blocksize, _src, _dest);
    return;
  }

  /* If the blocksize is not a multiple of both the typesize and
     the vector size, round the blocksize down to the next value
     which is a multiple of both. The vectorized shuffle can be
     used for that portion of the data, and the naive implementation
     can be used for the remaining portion. */
  const int32_t vectorizable_bytes = blocksize - (blocksize % vectorized_chunk_size);

  const int32_t vectorizable_elements = vectorizable_bytes / bytesoftype;
  const int32_t total_elements = blocksize / bytesoftype;

  /* Optimized shuffle implementations */
  switch (bytesoftype) {
    case 2:
      shuffle2_avx2(_dest, _src, vectorizable_elements, total_elements);
      break;
    case 4:
      shuffle4_avx2(_dest, _src, vectorizable_elements, total_elements);
      break;
    case 8:
      shuffle8_avx2(_dest, _src, vectorizable_elements, total_elements);
      break;
    case 16:
      shuffle16_avx2(_dest, _src, vectorizable_elements, total_elements);
      break;
    default:
      /* For types larger than 16 bytes, use the AVX2 tiled shuffle. */
      if (bytesoftype > (int32_t)sizeof(__m128i)) {
        shuffle16_tiled_avx2(_dest, _src, vectorizable_elements, total_elements, bytesoftype);
      }
      else {
        /* Non-optimized shuffle */
        shuffle_generic(bytesoftype, blocksize, _src, _dest);
        /* The non-optimized function covers the whole buffer,
           so we're done processing here. */
        return;
      }
  }

  /* If the buffer had any bytes at the end which couldn't be handled
     by the vectorized implementations, use the non-optimized version
     to finish them up. */
  if (vectorizable_bytes < blocksize) {
    shuffle_generic_inline(bytesoftype, vectorizable_bytes, blocksize, _src, _dest);
  }
}

/* Unshuffle a block. This can never fail. */
void
unshuffle_avx2(const int32_t bytesoftype, const int32_t blocksize,
               const uint8_t *_src, uint8_t *_dest) {
  const int32_t vectorized_chunk_size = bytesoftype * sizeof(__m256i);

  /* If the block size is too small to be vectorized,
     use the generic implementation. */
  if (blocksize < vectorized_chunk_size) {
    unshuffle_generic(bytesoftype, blocksize, _src, _dest);
    return;
  }

  /* If the blocksize is not a multiple of both the typesize and
     the vector size, round the blocksize down to the next value
     which is a multiple of both. The vectorized unshuffle can be
     used for that portion of the data, and the naive implementation
     can be used for the remaining portion. */
  const int32_t vectorizable_bytes = blocksize - (blocksize % vectorized_chunk_size);

  const int32_t vectorizable_elements = vectorizable_bytes / bytesoftype;
  const int32_t total_elements = blocksize / bytesoftype;

  /* Optimized unshuffle implementations */
  switch (bytesoftype) {
    case 2:
      unshuffle2_avx2(_dest, _src, vectorizable_elements, total_elements);
      break;
    case 4:
      unshuffle4_avx2(_dest, _src, vectorizable_elements, total_elements);
      break;
    case 8:
      unshuffle8_avx2(_dest, _src, vectorizable_elements, total_elements);
      break;
    case 16:
      unshuffle16_avx2(_dest, _src, vectorizable_elements, total_elements);
      break;
    default:
      /* For types larger than 16 bytes, use the AVX2 tiled unshuffle. */
      if (bytesoftype > (int32_t)sizeof(__m128i)) {
        unshuffle16_tiled_avx2(_dest, _src, vectorizable_elements, total_elements, bytesoftype);
      }
      else {
        /* Non-optimized unshuffle */
        unshuffle_generic(bytesoftype, blocksize, _src, _dest);
        /* The non-optimized function covers the whole buffer,
           so we're done processing here. */
        return;
      }
  }

  /* If the buffer had any bytes at the end which couldn't be handled
     by the vectorized implementations, use the non-optimized version
     to finish them up. */
  if (vectorizable_bytes < blocksize) {
    unshuffle_generic_inline(bytesoftype, vectorizable_bytes, blocksize, _src, _dest);
  }
}