/*********************************************************************
  Blosc - Blocked Shuffling and Compression Library

  Author: Francesc Alted <francesc@blosc.org>

  See LICENSES/BLOSC.txt for details about copyright and rights to use.
**********************************************************************/

#include "shuffle-generic.h"
#include "shuffle-avx2.h"

/* Make sure AVX2 is available for the compilation target and compiler. */
#if !defined(__AVX2__)
  #error AVX2 is not supported by the target architecture/platform and/or this compiler.
#endif

#include <immintrin.h>
#include <stdlib.h>  /* lldiv()/lldiv_t, used by the tiled shuffle routines */

/* The next is useful for debugging purposes */
#if 0
#include <stdio.h>
#include <string.h>

static void printymm(__m256i ymm0)
{
  uint8_t buf[32];

  ((__m256i *)buf)[0] = ymm0;
  printf("%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,"
         "%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x\n",
         buf[0], buf[1], buf[2], buf[3],
         buf[4], buf[5], buf[6], buf[7],
         buf[8], buf[9], buf[10], buf[11],
         buf[12], buf[13], buf[14], buf[15],
         buf[16], buf[17], buf[18], buf[19],
         buf[20], buf[21], buf[22], buf[23],
         buf[24], buf[25], buf[26], buf[27],
         buf[28], buf[29], buf[30], buf[31]);
}
#endif

/* GCC doesn't include the split load/store intrinsics
   needed for the tiled shuffle, so define them here. */
#if defined(__GNUC__) && !defined(__clang__) && !defined(__ICC)
static inline __m256i
__attribute__((__always_inline__))
_mm256_loadu2_m128i(const __m128i* const hiaddr, const __m128i* const loaddr)
{
  return _mm256_inserti128_si256(
    _mm256_castsi128_si256(_mm_loadu_si128(loaddr)), _mm_loadu_si128(hiaddr), 1);
}

static inline void
__attribute__((__always_inline__))
_mm256_storeu2_m128i(__m128i* const hiaddr, __m128i* const loaddr, const __m256i a)
{
  _mm_storeu_si128(loaddr, _mm256_castsi256_si128(a));
  _mm_storeu_si128(hiaddr, _mm256_extracti128_si256(a, 1));
}
#endif  /* defined(__GNUC__) */
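
/* These match the split load/store intrinsics other compilers provide:
   the low 128 bits of the YMM value travel to/from loaddr and the high
   128 bits to/from hiaddr, with no alignment requirement on either
   address. */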

/* Routine optimized for shuffling a buffer for a type size of 2 bytes. */
static void
shuffle2_avx2(uint8_t* const dest, const uint8_t* const src,
              const size_t vectorizable_elements, const size_t total_elements)
{
  static const size_t bytesoftype = 2;
  size_t j;
  int k;
  __m256i ymm0[2], ymm1[2];

  /* Create the shuffle mask.
     NOTE: The XMM/YMM 'set' intrinsics require the arguments to be ordered from
     most to least significant (i.e., their order is reversed when compared to
     loading the mask from an array). */
  const __m256i shmask = _mm256_set_epi8(
    0x0f, 0x0d, 0x0b, 0x09, 0x07, 0x05, 0x03, 0x01,
    0x0e, 0x0c, 0x0a, 0x08, 0x06, 0x04, 0x02, 0x00,
    0x0f, 0x0d, 0x0b, 0x09, 0x07, 0x05, 0x03, 0x01,
    0x0e, 0x0c, 0x0a, 0x08, 0x06, 0x04, 0x02, 0x00);
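  /* Within each 128-bit lane, shmask moves the even-numbered bytes (the low
     bytes of eight 2-byte elements) into the lane's low half and the
     odd-numbered bytes (the high bytes) into its high half. */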

  for (j = 0; j < vectorizable_elements; j += sizeof(__m256i)) {
    /* Fetch 32 elements (64 bytes) then transpose bytes, words and double words. */
    for (k = 0; k < 2; k++) {
      ymm0[k] = _mm256_loadu_si256((__m256i*)(src + (j * bytesoftype) + (k * sizeof(__m256i))));
      ymm1[k] = _mm256_shuffle_epi8(ymm0[k], shmask);
    }

    ymm0[0] = _mm256_permute4x64_epi64(ymm1[0], 0xd8);
    ymm0[1] = _mm256_permute4x64_epi64(ymm1[1], 0x8d);

    ymm1[0] = _mm256_blend_epi32(ymm0[0], ymm0[1], 0xf0);
    ymm0[1] = _mm256_blend_epi32(ymm0[0], ymm0[1], 0x0f);
    ymm1[1] = _mm256_permute4x64_epi64(ymm0[1], 0x4e);
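    /* At this point ymm1[0] holds the 32 low bytes of the 32 elements and
       ymm1[1] holds the 32 high bytes; the final 0x4e permute swaps the two
       128-bit halves back into ascending element order. */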

    /* Store the result vectors */
    uint8_t* const dest_for_jth_element = dest + j;
    for (k = 0; k < 2; k++) {
      _mm256_storeu_si256((__m256i*)(dest_for_jth_element + (k * total_elements)), ymm1[k]);
    }
  }
}

/* Routine optimized for shuffling a buffer for a type size of 4 bytes. */
static void
shuffle4_avx2(uint8_t* const dest, const uint8_t* const src,
              const size_t vectorizable_elements, const size_t total_elements)
{
  static const size_t bytesoftype = 4;
  size_t i;
  int j;
  __m256i ymm0[4], ymm1[4];

  /* Create the shuffle mask.
     NOTE: The XMM/YMM 'set' intrinsics require the arguments to be ordered from
     most to least significant (i.e., their order is reversed when compared to
     loading the mask from an array). */
  const __m256i mask = _mm256_set_epi32(
    0x07, 0x03, 0x06, 0x02, 0x05, 0x01, 0x04, 0x00);
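  /* Read from least to most significant, the mask selects source dwords
     0, 4, 1, 5, 2, 6, 3, 7; _mm256_permutevar8x32_epi32 uses it below to
     interleave the two 128-bit lanes, which the in-lane unpacks cannot do. */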

  for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) {
    /* Fetch 32 elements (128 bytes) then transpose bytes and words. */
    for (j = 0; j < 4; j++) {
      ymm0[j] = _mm256_loadu_si256((__m256i*)(src + (i * bytesoftype) + (j * sizeof(__m256i))));
      ymm1[j] = _mm256_shuffle_epi32(ymm0[j], 0xd8);
      ymm0[j] = _mm256_shuffle_epi32(ymm0[j], 0x8d);
      ymm0[j] = _mm256_unpacklo_epi8(ymm1[j], ymm0[j]);
      ymm1[j] = _mm256_shuffle_epi32(ymm0[j], 0x4e);
      ymm0[j] = _mm256_unpacklo_epi16(ymm0[j], ymm1[j]);
    }
    /* Transpose double words */
    for (j = 0; j < 2; j++) {
      ymm1[j*2] = _mm256_unpacklo_epi32(ymm0[j*2], ymm0[j*2+1]);
      ymm1[j*2+1] = _mm256_unpackhi_epi32(ymm0[j*2], ymm0[j*2+1]);
    }
    /* Transpose quad words */
    for (j = 0; j < 2; j++) {
      ymm0[j*2] = _mm256_unpacklo_epi64(ymm1[j], ymm1[j+2]);
      ymm0[j*2+1] = _mm256_unpackhi_epi64(ymm1[j], ymm1[j+2]);
    }
    for (j = 0; j < 4; j++) {
      ymm0[j] = _mm256_permutevar8x32_epi32(ymm0[j], mask);
    }
    /* Store the result vectors */
    uint8_t* const dest_for_ith_element = dest + i;
    for (j = 0; j < 4; j++) {
      _mm256_storeu_si256((__m256i*)(dest_for_ith_element + (j * total_elements)), ymm0[j]);
    }
  }
}

/* Routine optimized for shuffling a buffer for a type size of 8 bytes. */
static void
shuffle8_avx2(uint8_t* const dest, const uint8_t* const src,
              const size_t vectorizable_elements, const size_t total_elements)
{
  static const size_t bytesoftype = 8;
  size_t j;
  int k, l;
  __m256i ymm0[8], ymm1[8];

  for (j = 0; j < vectorizable_elements; j += sizeof(__m256i)) {
    /* Fetch 32 elements (256 bytes) then transpose bytes. */
    for (k = 0; k < 8; k++) {
      ymm0[k] = _mm256_loadu_si256((__m256i*)(src + (j * bytesoftype) + (k * sizeof(__m256i))));
      ymm1[k] = _mm256_shuffle_epi32(ymm0[k], 0x4e);
      ymm1[k] = _mm256_unpacklo_epi8(ymm0[k], ymm1[k]);
    }
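    /* (The 0x4e dword-shuffle above swaps the two qwords within each
       128-bit lane, so the byte unpack pairs byte i of element 2n with
       byte i of element 2n+1.) */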
    /* Transpose words */
    for (k = 0, l = 0; k < 4; k++, l += 2) {
      ymm0[k*2] = _mm256_unpacklo_epi16(ymm1[l], ymm1[l+1]);
      ymm0[k*2+1] = _mm256_unpackhi_epi16(ymm1[l], ymm1[l+1]);
    }
    /* Transpose double words; the l bump at k == 2 yields the register
       pairs (0,2), (1,3), (4,6), (5,7). */
    for (k = 0, l = 0; k < 4; k++, l++) {
      if (k == 2) l += 2;
      ymm1[k*2] = _mm256_unpacklo_epi32(ymm0[l], ymm0[l+2]);
      ymm1[k*2+1] = _mm256_unpackhi_epi32(ymm0[l], ymm0[l+2]);
    }
    /* Transpose quad words */
    for (k = 0; k < 4; k++) {
      ymm0[k*2] = _mm256_unpacklo_epi64(ymm1[k], ymm1[k+4]);
      ymm0[k*2+1] = _mm256_unpackhi_epi64(ymm1[k], ymm1[k+4]);
    }
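    /* All of the unpacks above work within 128-bit lanes, so one final
       cross-lane pass (two qword permutes plus a 16-bit interleave) is
       needed to leave each register holding one byte plane in element
       order. */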
    for (k = 0; k < 8; k++) {
      ymm1[k] = _mm256_permute4x64_epi64(ymm0[k], 0x72);
      ymm0[k] = _mm256_permute4x64_epi64(ymm0[k], 0xd8);
      ymm0[k] = _mm256_unpacklo_epi16(ymm0[k], ymm1[k]);
    }
    /* Store the result vectors */
    uint8_t* const dest_for_jth_element = dest + j;
    for (k = 0; k < 8; k++) {
      _mm256_storeu_si256((__m256i*)(dest_for_jth_element + (k * total_elements)), ymm0[k]);
    }
  }
}

/* Routine optimized for shuffling a buffer for a type size of 16 bytes. */
static void
shuffle16_avx2(uint8_t* const dest, const uint8_t* const src,
               const size_t vectorizable_elements, const size_t total_elements)
{
  static const size_t bytesoftype = 16;
  size_t j;
  int k, l;
  __m256i ymm0[16], ymm1[16];

  /* Create the shuffle mask.
     NOTE: The XMM/YMM 'set' intrinsics require the arguments to be ordered from
     most to least significant (i.e., their order is reversed when compared to
     loading the mask from an array). */
  const __m256i shmask = _mm256_set_epi8(
    0x0f, 0x07, 0x0e, 0x06, 0x0d, 0x05, 0x0c, 0x04,
    0x0b, 0x03, 0x0a, 0x02, 0x09, 0x01, 0x08, 0x00,
    0x0f, 0x07, 0x0e, 0x06, 0x0d, 0x05, 0x0c, 0x04,
    0x0b, 0x03, 0x0a, 0x02, 0x09, 0x01, 0x08, 0x00);
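  /* Within each 128-bit lane, shmask interleaves the bytes of the lane's
     low qword with the bytes of its high qword (0, 8, 1, 9, ...). */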

  for (j = 0; j < vectorizable_elements; j += sizeof(__m256i)) {
    /* Fetch 32 elements (512 bytes) into 16 YMM registers. */
    for (k = 0; k < 16; k++) {
      ymm0[k] = _mm256_loadu_si256((__m256i*)(src + (j * bytesoftype) + (k * sizeof(__m256i))));
    }
    /* Transpose bytes */
    for (k = 0, l = 0; k < 8; k++, l += 2) {
      ymm1[k*2] = _mm256_unpacklo_epi8(ymm0[l], ymm0[l+1]);
      ymm1[k*2+1] = _mm256_unpackhi_epi8(ymm0[l], ymm0[l+1]);
    }
    /* Transpose words; the l bumps yield the register pairs
       (0,2), (1,3), (4,6), (5,7), (8,10), (9,11), (12,14), (13,15). */
    for (k = 0, l = -2; k < 8; k++, l++) {
      if ((k%2) == 0) l += 2;
      ymm0[k*2] = _mm256_unpacklo_epi16(ymm1[l], ymm1[l+2]);
      ymm0[k*2+1] = _mm256_unpackhi_epi16(ymm1[l], ymm1[l+2]);
    }
    /* Transpose double words; here the pairs are
       (0,4), (1,5), (2,6), (3,7), (8,12), (9,13), (10,14), (11,15). */
    for (k = 0, l = -4; k < 8; k++, l++) {
      if ((k%4) == 0) l += 4;
      ymm1[k*2] = _mm256_unpacklo_epi32(ymm0[l], ymm0[l+4]);
      ymm1[k*2+1] = _mm256_unpackhi_epi32(ymm0[l], ymm0[l+4]);
    }
    /* Transpose quad words */
    for (k = 0; k < 8; k++) {
      ymm0[k*2] = _mm256_unpacklo_epi64(ymm1[k], ymm1[k+8]);
      ymm0[k*2+1] = _mm256_unpackhi_epi64(ymm1[k], ymm1[k+8]);
    }
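    /* Final cross-lane fixup: the in-lane unpacks leave each byte plane
       split across lanes; the 0xd8 permute and shmask below put it back
       into sequential element order. */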
    for (k = 0; k < 16; k++) {
      ymm0[k] = _mm256_permute4x64_epi64(ymm0[k], 0xd8);
      ymm0[k] = _mm256_shuffle_epi8(ymm0[k], shmask);
    }
    /* Store the result vectors */
    uint8_t* const dest_for_jth_element = dest + j;
    for (k = 0; k < 16; k++) {
      _mm256_storeu_si256((__m256i*)(dest_for_jth_element + (k * total_elements)), ymm0[k]);
    }
  }
}

/* Routine optimized for shuffling a buffer for a type size larger than 16 bytes. */
static void
shuffle16_tiled_avx2(uint8_t* const dest, const uint8_t* const src,
                     const size_t vectorizable_elements, const size_t total_elements,
                     const size_t bytesoftype)
{
  size_t j;
  int k, l;
  __m256i ymm0[16], ymm1[16];

  const lldiv_t vecs_per_el = lldiv(bytesoftype, sizeof(__m128i));
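  /* vecs_per_el.quot is the number of whole 16-byte vectors per element;
     vecs_per_el.rem is the number of leftover bytes (nonzero whenever the
     type size is not a multiple of 16). */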

  /* Create the shuffle mask.
     NOTE: The XMM/YMM 'set' intrinsics require the arguments to be ordered from
     most to least significant (i.e., their order is reversed when compared to
     loading the mask from an array). */
  const __m256i shmask = _mm256_set_epi8(
    0x0f, 0x07, 0x0e, 0x06, 0x0d, 0x05, 0x0c, 0x04,
    0x0b, 0x03, 0x0a, 0x02, 0x09, 0x01, 0x08, 0x00,
    0x0f, 0x07, 0x0e, 0x06, 0x0d, 0x05, 0x0c, 0x04,
    0x0b, 0x03, 0x0a, 0x02, 0x09, 0x01, 0x08, 0x00);

  for (j = 0; j < vectorizable_elements; j += sizeof(__m256i)) {
    /* Advance the offset into the type by the vector size (in bytes), unless this is
       the initial iteration and the type size is not a multiple of the vector size.
       In that case, only advance by the number of bytes necessary so that the number
       of remaining bytes in the type will be a multiple of the vector size. */
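    /* Example: for bytesoftype == 24, vecs_per_el is {1, 8}, so the offsets
       visited are 0 and 8; the byte planes 8..15 are simply written twice
       with identical contents. */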
    size_t offset_into_type;
    for (offset_into_type = 0; offset_into_type < bytesoftype;
         offset_into_type += (offset_into_type == 0 && vecs_per_el.rem > 0 ? vecs_per_el.rem : sizeof(__m128i))) {

      /* Fetch elements in groups of 512 bytes */
      const uint8_t* const src_with_offset = src + offset_into_type;
      for (k = 0; k < 16; k++) {
        ymm0[k] = _mm256_loadu2_m128i(
          (__m128i*)(src_with_offset + (j + (2 * k) + 1) * bytesoftype),
          (__m128i*)(src_with_offset + (j + (2 * k)) * bytesoftype));
      }
      /* Transpose bytes */
      for (k = 0, l = 0; k < 8; k++, l += 2) {
        ymm1[k*2] = _mm256_unpacklo_epi8(ymm0[l], ymm0[l+1]);
        ymm1[k*2+1] = _mm256_unpackhi_epi8(ymm0[l], ymm0[l+1]);
      }
      /* Transpose words (same register pairing as in shuffle16_avx2) */
      for (k = 0, l = -2; k < 8; k++, l++) {
        if ((k%2) == 0) l += 2;
        ymm0[k*2] = _mm256_unpacklo_epi16(ymm1[l], ymm1[l+2]);
        ymm0[k*2+1] = _mm256_unpackhi_epi16(ymm1[l], ymm1[l+2]);
      }
      /* Transpose double words */
      for (k = 0, l = -4; k < 8; k++, l++) {
        if ((k%4) == 0) l += 4;
        ymm1[k*2] = _mm256_unpacklo_epi32(ymm0[l], ymm0[l+4]);
        ymm1[k*2+1] = _mm256_unpackhi_epi32(ymm0[l], ymm0[l+4]);
      }
      /* Transpose quad words */
      for (k = 0; k < 8; k++) {
        ymm0[k*2] = _mm256_unpacklo_epi64(ymm1[k], ymm1[k+8]);
        ymm0[k*2+1] = _mm256_unpackhi_epi64(ymm1[k], ymm1[k+8]);
      }
      for (k = 0; k < 16; k++) {
        ymm0[k] = _mm256_permute4x64_epi64(ymm0[k], 0xd8);
        ymm0[k] = _mm256_shuffle_epi8(ymm0[k], shmask);
      }
      /* Store the result vectors */
      uint8_t* const dest_for_jth_element = dest + j;
      for (k = 0; k < 16; k++) {
        _mm256_storeu_si256((__m256i*)(dest_for_jth_element + (total_elements * (offset_into_type + k))), ymm0[k]);
      }
    }
  }
}

/* Routine optimized for unshuffling a buffer for a type size of 2 bytes. */
static void
unshuffle2_avx2(uint8_t* const dest, const uint8_t* const src,
                const size_t vectorizable_elements, const size_t total_elements)
{
  static const size_t bytesoftype = 2;
  size_t i;
  int j;
  __m256i ymm0[2], ymm1[2];

  for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) {
    /* Load 32 elements (64 bytes) into 2 YMM registers. */
    const uint8_t* const src_for_ith_element = src + i;
    for (j = 0; j < 2; j++) {
      ymm0[j] = _mm256_loadu_si256((__m256i*)(src_for_ith_element + (j * total_elements)));
    }
    /* Shuffle bytes */
    for (j = 0; j < 2; j++) {
      ymm0[j] = _mm256_permute4x64_epi64(ymm0[j], 0xd8);
    }
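    /* The 0xd8 permute pre-gathers qwords (0, 2, 1, 3) so the in-lane
       unpacks below emit elements 0..15 and 16..31 contiguously instead of
       interleaved across lanes. */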
    /* Compute the low 32 bytes */
    ymm1[0] = _mm256_unpacklo_epi8(ymm0[0], ymm0[1]);
    /* Compute the hi 32 bytes */
    ymm1[1] = _mm256_unpackhi_epi8(ymm0[0], ymm0[1]);
    /* Store the result vectors in proper order */
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (0 * sizeof(__m256i))), ymm1[0]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (1 * sizeof(__m256i))), ymm1[1]);
  }
}

/* Routine optimized for unshuffling a buffer for a type size of 4 bytes. */
static void
unshuffle4_avx2(uint8_t* const dest, const uint8_t* const src,
                const size_t vectorizable_elements, const size_t total_elements)
{
  static const size_t bytesoftype = 4;
  size_t i;
  int j;
  __m256i ymm0[4], ymm1[4];

  for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) {
    /* Load 32 elements (128 bytes) into 4 YMM registers. */
    const uint8_t* const src_for_ith_element = src + i;
    for (j = 0; j < 4; j++) {
      ymm0[j] = _mm256_loadu_si256((__m256i*)(src_for_ith_element + (j * total_elements)));
    }
    /* Shuffle bytes */
    for (j = 0; j < 2; j++) {
      /* Compute the low 32 bytes */
      ymm1[j] = _mm256_unpacklo_epi8(ymm0[j*2], ymm0[j*2+1]);
      /* Compute the hi 32 bytes */
      ymm1[2+j] = _mm256_unpackhi_epi8(ymm0[j*2], ymm0[j*2+1]);
    }
    /* Shuffle 2-byte words */
    for (j = 0; j < 2; j++) {
      /* Compute the low 32 bytes */
      ymm0[j] = _mm256_unpacklo_epi16(ymm1[j*2], ymm1[j*2+1]);
      /* Compute the hi 32 bytes */
      ymm0[2+j] = _mm256_unpackhi_epi16(ymm1[j*2], ymm1[j*2+1]);
    }
    ymm1[0] = _mm256_permute2x128_si256(ymm0[0], ymm0[2], 0x20);
    ymm1[1] = _mm256_permute2x128_si256(ymm0[1], ymm0[3], 0x20);
    ymm1[2] = _mm256_permute2x128_si256(ymm0[0], ymm0[2], 0x31);
    ymm1[3] = _mm256_permute2x128_si256(ymm0[1], ymm0[3], 0x31);
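    /* _mm256_permute2x128_si256 with 0x20 concatenates the low 128-bit
       lanes of its two operands, and with 0x31 the high lanes; this is the
       cross-lane step the epi8/epi16 unpacks above cannot perform. */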

    /* Store the result vectors in proper order */
    for (j = 0; j < 4; j++) {
      _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (j * sizeof(__m256i))), ymm1[j]);
    }
  }
}

/* Routine optimized for unshuffling a buffer for a type size of 8 bytes. */
static void
unshuffle8_avx2(uint8_t* const dest, const uint8_t* const src,
                const size_t vectorizable_elements, const size_t total_elements)
{
  static const size_t bytesoftype = 8;
  size_t i;
  int j;
  __m256i ymm0[8], ymm1[8];

  for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) {
    /* Fetch 32 elements (256 bytes) into 8 YMM registers. */
    const uint8_t* const src_for_ith_element = src + i;
    for (j = 0; j < 8; j++) {
      ymm0[j] = _mm256_loadu_si256((__m256i*)(src_for_ith_element + (j * total_elements)));
    }
    /* Shuffle bytes */
    for (j = 0; j < 4; j++) {
      /* Compute the low 32 bytes */
      ymm1[j] = _mm256_unpacklo_epi8(ymm0[j*2], ymm0[j*2+1]);
      /* Compute the hi 32 bytes */
      ymm1[4+j] = _mm256_unpackhi_epi8(ymm0[j*2], ymm0[j*2+1]);
    }
    /* Shuffle words */
    for (j = 0; j < 4; j++) {
      /* Compute the low 32 bytes */
      ymm0[j] = _mm256_unpacklo_epi16(ymm1[j*2], ymm1[j*2+1]);
      /* Compute the hi 32 bytes */
      ymm0[4+j] = _mm256_unpackhi_epi16(ymm1[j*2], ymm1[j*2+1]);
    }
    for (j = 0; j < 8; j++) {
      ymm0[j] = _mm256_permute4x64_epi64(ymm0[j], 0xd8);
    }
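    /* Mid-stream cross-lane fixup: reordering the qwords as (0, 2, 1, 3)
       realigns the lanes so the 32-bit unpacks below rebuild whole
       elements in order. */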

    /* Shuffle 4-byte dwords */
    for (j = 0; j < 4; j++) {
      /* Compute the low 32 bytes */
      ymm1[j] = _mm256_unpacklo_epi32(ymm0[j*2], ymm0[j*2+1]);
      /* Compute the hi 32 bytes */
      ymm1[4+j] = _mm256_unpackhi_epi32(ymm0[j*2], ymm0[j*2+1]);
    }

    /* Store the result vectors in proper order */
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (0 * sizeof(__m256i))), ymm1[0]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (1 * sizeof(__m256i))), ymm1[2]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (2 * sizeof(__m256i))), ymm1[1]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (3 * sizeof(__m256i))), ymm1[3]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (4 * sizeof(__m256i))), ymm1[4]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (5 * sizeof(__m256i))), ymm1[6]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (6 * sizeof(__m256i))), ymm1[5]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (7 * sizeof(__m256i))), ymm1[7]);
  }
}

/* Routine optimized for unshuffling a buffer for a type size of 16 bytes. */
static void
unshuffle16_avx2(uint8_t* const dest, const uint8_t* const src,
                 const size_t vectorizable_elements, const size_t total_elements)
{
  static const size_t bytesoftype = 16;
  size_t i;
  int j;
  __m256i ymm0[16], ymm1[16];

  for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) {
    /* Fetch 32 elements (512 bytes) into 16 YMM registers. */
    const uint8_t* const src_for_ith_element = src + i;
    for (j = 0; j < 16; j++) {
      ymm0[j] = _mm256_loadu_si256((__m256i*)(src_for_ith_element + (j * total_elements)));
    }

    /* Shuffle bytes */
    for (j = 0; j < 8; j++) {
      /* Compute the low 32 bytes */
      ymm1[j] = _mm256_unpacklo_epi8(ymm0[j*2], ymm0[j*2+1]);
      /* Compute the hi 32 bytes */
      ymm1[8+j] = _mm256_unpackhi_epi8(ymm0[j*2], ymm0[j*2+1]);
    }
    /* Shuffle 2-byte words */
    for (j = 0; j < 8; j++) {
      /* Compute the low 32 bytes */
      ymm0[j] = _mm256_unpacklo_epi16(ymm1[j*2], ymm1[j*2+1]);
      /* Compute the hi 32 bytes */
      ymm0[8+j] = _mm256_unpackhi_epi16(ymm1[j*2], ymm1[j*2+1]);
    }
    /* Shuffle 4-byte dwords */
    for (j = 0; j < 8; j++) {
      /* Compute the low 32 bytes */
      ymm1[j] = _mm256_unpacklo_epi32(ymm0[j*2], ymm0[j*2+1]);
      /* Compute the hi 32 bytes */
      ymm1[8+j] = _mm256_unpackhi_epi32(ymm0[j*2], ymm0[j*2+1]);
    }

    /* Shuffle 8-byte qwords */
    for (j = 0; j < 8; j++) {
      /* Compute the low 32 bytes */
      ymm0[j] = _mm256_unpacklo_epi64(ymm1[j*2], ymm1[j*2+1]);
      /* Compute the hi 32 bytes */
      ymm0[8+j] = _mm256_unpackhi_epi64(ymm1[j*2], ymm1[j*2+1]);
    }

    for (j = 0; j < 8; j++) {
      ymm1[j] = _mm256_permute2x128_si256(ymm0[j], ymm0[j+8], 0x20);
      ymm1[j+8] = _mm256_permute2x128_si256(ymm0[j], ymm0[j+8], 0x31);
    }

    /* Store the result vectors in proper order */
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (0 * sizeof(__m256i))), ymm1[0]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (1 * sizeof(__m256i))), ymm1[4]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (2 * sizeof(__m256i))), ymm1[2]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (3 * sizeof(__m256i))), ymm1[6]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (4 * sizeof(__m256i))), ymm1[1]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (5 * sizeof(__m256i))), ymm1[5]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (6 * sizeof(__m256i))), ymm1[3]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (7 * sizeof(__m256i))), ymm1[7]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (8 * sizeof(__m256i))), ymm1[8]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (9 * sizeof(__m256i))), ymm1[12]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (10 * sizeof(__m256i))), ymm1[10]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (11 * sizeof(__m256i))), ymm1[14]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (12 * sizeof(__m256i))), ymm1[9]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (13 * sizeof(__m256i))), ymm1[13]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (14 * sizeof(__m256i))), ymm1[11]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (15 * sizeof(__m256i))), ymm1[15]);
  }
}

/* Routine optimized for unshuffling a buffer for a type size larger than 16 bytes. */
static void
unshuffle16_tiled_avx2(uint8_t* const dest, const uint8_t* const src,
                       const size_t vectorizable_elements, const size_t total_elements,
                       const size_t bytesoftype)
{
  size_t i;
  int j;
  __m256i ymm0[16], ymm1[16];

  const lldiv_t vecs_per_el = lldiv(bytesoftype, sizeof(__m128i));

  /* The unshuffle loops are inverted (compared to shuffle16_tiled_avx2)
     to optimize cache utilization. */
  size_t offset_into_type;
  for (offset_into_type = 0; offset_into_type < bytesoftype;
       offset_into_type += (offset_into_type == 0 && vecs_per_el.rem > 0 ? vecs_per_el.rem : sizeof(__m128i))) {
    for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) {
      /* Load the first 16 bytes of 32 adjacent elements (512 bytes) into 16 YMM registers */
      const uint8_t* const src_for_ith_element = src + i;
      for (j = 0; j < 16; j++) {
        ymm0[j] = _mm256_loadu_si256((__m256i*)(src_for_ith_element + (total_elements * (offset_into_type + j))));
      }

      /* Shuffle bytes */
      for (j = 0; j < 8; j++) {
        /* Compute the low 32 bytes */
        ymm1[j] = _mm256_unpacklo_epi8(ymm0[j*2], ymm0[j*2+1]);
        /* Compute the hi 32 bytes */
        ymm1[8+j] = _mm256_unpackhi_epi8(ymm0[j*2], ymm0[j*2+1]);
      }
      /* Shuffle 2-byte words */
      for (j = 0; j < 8; j++) {
        /* Compute the low 32 bytes */
        ymm0[j] = _mm256_unpacklo_epi16(ymm1[j*2], ymm1[j*2+1]);
        /* Compute the hi 32 bytes */
        ymm0[8+j] = _mm256_unpackhi_epi16(ymm1[j*2], ymm1[j*2+1]);
      }
      /* Shuffle 4-byte dwords */
      for (j = 0; j < 8; j++) {
        /* Compute the low 32 bytes */
        ymm1[j] = _mm256_unpacklo_epi32(ymm0[j*2], ymm0[j*2+1]);
        /* Compute the hi 32 bytes */
        ymm1[8+j] = _mm256_unpackhi_epi32(ymm0[j*2], ymm0[j*2+1]);
      }

      /* Shuffle 8-byte qwords */
      for (j = 0; j < 8; j++) {
        /* Compute the low 32 bytes */
        ymm0[j] = _mm256_unpacklo_epi64(ymm1[j*2], ymm1[j*2+1]);
        /* Compute the hi 32 bytes */
        ymm0[8+j] = _mm256_unpackhi_epi64(ymm1[j*2], ymm1[j*2+1]);
      }

      for (j = 0; j < 8; j++) {
        ymm1[j] = _mm256_permute2x128_si256(ymm0[j], ymm0[j+8], 0x20);
        ymm1[j+8] = _mm256_permute2x128_si256(ymm0[j], ymm0[j+8], 0x31);
      }

      /* Store the result vectors in proper order */
      uint8_t* const dest_with_offset = dest + offset_into_type;
      _mm256_storeu2_m128i(
        (__m128i*)(dest_with_offset + (i + 0x01) * bytesoftype),
        (__m128i*)(dest_with_offset + (i + 0x00) * bytesoftype), ymm1[0]);
      _mm256_storeu2_m128i(
        (__m128i*)(dest_with_offset + (i + 0x03) * bytesoftype),
        (__m128i*)(dest_with_offset + (i + 0x02) * bytesoftype), ymm1[4]);
      _mm256_storeu2_m128i(
        (__m128i*)(dest_with_offset + (i + 0x05) * bytesoftype),
        (__m128i*)(dest_with_offset + (i + 0x04) * bytesoftype), ymm1[2]);
      _mm256_storeu2_m128i(
        (__m128i*)(dest_with_offset + (i + 0x07) * bytesoftype),
        (__m128i*)(dest_with_offset + (i + 0x06) * bytesoftype), ymm1[6]);
      _mm256_storeu2_m128i(
        (__m128i*)(dest_with_offset + (i + 0x09) * bytesoftype),
        (__m128i*)(dest_with_offset + (i + 0x08) * bytesoftype), ymm1[1]);
      _mm256_storeu2_m128i(
        (__m128i*)(dest_with_offset + (i + 0x0b) * bytesoftype),
        (__m128i*)(dest_with_offset + (i + 0x0a) * bytesoftype), ymm1[5]);
      _mm256_storeu2_m128i(
        (__m128i*)(dest_with_offset + (i + 0x0d) * bytesoftype),
        (__m128i*)(dest_with_offset + (i + 0x0c) * bytesoftype), ymm1[3]);
      _mm256_storeu2_m128i(
        (__m128i*)(dest_with_offset + (i + 0x0f) * bytesoftype),
        (__m128i*)(dest_with_offset + (i + 0x0e) * bytesoftype), ymm1[7]);
      _mm256_storeu2_m128i(
        (__m128i*)(dest_with_offset + (i + 0x11) * bytesoftype),
        (__m128i*)(dest_with_offset + (i + 0x10) * bytesoftype), ymm1[8]);
      _mm256_storeu2_m128i(
        (__m128i*)(dest_with_offset + (i + 0x13) * bytesoftype),
        (__m128i*)(dest_with_offset + (i + 0x12) * bytesoftype), ymm1[12]);
      _mm256_storeu2_m128i(
        (__m128i*)(dest_with_offset + (i + 0x15) * bytesoftype),
        (__m128i*)(dest_with_offset + (i + 0x14) * bytesoftype), ymm1[10]);
      _mm256_storeu2_m128i(
        (__m128i*)(dest_with_offset + (i + 0x17) * bytesoftype),
        (__m128i*)(dest_with_offset + (i + 0x16) * bytesoftype), ymm1[14]);
      _mm256_storeu2_m128i(
        (__m128i*)(dest_with_offset + (i + 0x19) * bytesoftype),
        (__m128i*)(dest_with_offset + (i + 0x18) * bytesoftype), ymm1[9]);
      _mm256_storeu2_m128i(
        (__m128i*)(dest_with_offset + (i + 0x1b) * bytesoftype),
        (__m128i*)(dest_with_offset + (i + 0x1a) * bytesoftype), ymm1[13]);
      _mm256_storeu2_m128i(
        (__m128i*)(dest_with_offset + (i + 0x1d) * bytesoftype),
        (__m128i*)(dest_with_offset + (i + 0x1c) * bytesoftype), ymm1[11]);
      _mm256_storeu2_m128i(
        (__m128i*)(dest_with_offset + (i + 0x1f) * bytesoftype),
        (__m128i*)(dest_with_offset + (i + 0x1e) * bytesoftype), ymm1[15]);
    }
  }
}


/* Shuffle a block. This can never fail. */
void
blosc_internal_shuffle_avx2(const size_t bytesoftype, const size_t blocksize,
                            const uint8_t* const _src, uint8_t* const _dest) {
  const size_t vectorized_chunk_size = bytesoftype * sizeof(__m256i);
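  /* Each pass of the vectorized kernels consumes sizeof(__m256i) == 32
     elements, i.e. 32 * bytesoftype input bytes, which is the minimum
     granularity for the AVX2 path. */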

  /* If the block size is too small to be vectorized,
     use the generic implementation. */
  if (blocksize < vectorized_chunk_size) {
    blosc_internal_shuffle_generic(bytesoftype, blocksize, _src, _dest);
    return;
  }

  /* If the blocksize is not a multiple of both the typesize and
     the vector size, round it down to the largest value that is a
     multiple of both. The vectorized shuffle can be used for that
     portion of the data, and the naive implementation for the
     remaining portion. */
  const size_t vectorizable_bytes = blocksize - (blocksize % vectorized_chunk_size);

  const size_t vectorizable_elements = vectorizable_bytes / bytesoftype;
  const size_t total_elements = blocksize / bytesoftype;
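  /* Example: bytesoftype == 4 and blocksize == 1000 give a chunk size of
     128 bytes, so vectorizable_bytes == 896 (224 of the 250 elements);
     the remaining 104 bytes fall through to shuffle_generic_inline below. */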

  /* Optimized shuffle implementations */
  switch (bytesoftype)
  {
  case 2:
    shuffle2_avx2(_dest, _src, vectorizable_elements, total_elements);
    break;
  case 4:
    shuffle4_avx2(_dest, _src, vectorizable_elements, total_elements);
    break;
  case 8:
    shuffle8_avx2(_dest, _src, vectorizable_elements, total_elements);
    break;
  case 16:
    shuffle16_avx2(_dest, _src, vectorizable_elements, total_elements);
    break;
  default:
    /* For types larger than 16 bytes, use the AVX2 tiled shuffle. */
    if (bytesoftype > sizeof(__m128i)) {
      shuffle16_tiled_avx2(_dest, _src, vectorizable_elements, total_elements, bytesoftype);
    }
    else {
      /* Non-optimized shuffle */
      blosc_internal_shuffle_generic(bytesoftype, blocksize, _src, _dest);
      /* The non-optimized function covers the whole buffer,
         so we're done processing here. */
      return;
    }
  }

  /* If the buffer had any bytes at the end which couldn't be handled
     by the vectorized implementations, use the non-optimized version
     to finish them up. */
  if (vectorizable_bytes < blocksize) {
    shuffle_generic_inline(bytesoftype, vectorizable_bytes, blocksize, _src, _dest);
  }
}

/* Unshuffle a block. This can never fail. */
void
blosc_internal_unshuffle_avx2(const size_t bytesoftype, const size_t blocksize,
                              const uint8_t* const _src, uint8_t* const _dest) {
  const size_t vectorized_chunk_size = bytesoftype * sizeof(__m256i);

  /* If the block size is too small to be vectorized,
     use the generic implementation. */
  if (blocksize < vectorized_chunk_size) {
    blosc_internal_unshuffle_generic(bytesoftype, blocksize, _src, _dest);
    return;
  }

  /* If the blocksize is not a multiple of both the typesize and
     the vector size, round it down to the largest value that is a
     multiple of both. The vectorized unshuffle can be used for that
     portion of the data, and the naive implementation for the
     remaining portion. */
  const size_t vectorizable_bytes = blocksize - (blocksize % vectorized_chunk_size);

  const size_t vectorizable_elements = vectorizable_bytes / bytesoftype;
  const size_t total_elements = blocksize / bytesoftype;

  /* Optimized unshuffle implementations */
  switch (bytesoftype)
  {
  case 2:
    unshuffle2_avx2(_dest, _src, vectorizable_elements, total_elements);
    break;
  case 4:
    unshuffle4_avx2(_dest, _src, vectorizable_elements, total_elements);
    break;
  case 8:
    unshuffle8_avx2(_dest, _src, vectorizable_elements, total_elements);
    break;
  case 16:
    unshuffle16_avx2(_dest, _src, vectorizable_elements, total_elements);
    break;
  default:
    /* For types larger than 16 bytes, use the AVX2 tiled unshuffle. */
    if (bytesoftype > sizeof(__m128i)) {
      unshuffle16_tiled_avx2(_dest, _src, vectorizable_elements, total_elements, bytesoftype);
    }
    else {
      /* Non-optimized unshuffle */
      blosc_internal_unshuffle_generic(bytesoftype, blocksize, _src, _dest);
      /* The non-optimized function covers the whole buffer,
         so we're done processing here. */
      return;
    }
  }

  /* If the buffer had any bytes at the end which couldn't be handled
     by the vectorized implementations, use the non-optimized version
     to finish them up. */
  if (vectorizable_bytes < blocksize) {
    unshuffle_generic_inline(bytesoftype, vectorizable_bytes, blocksize, _src, _dest);
  }
}