1 /*********************************************************************
2 Blosc - Blocked Shuffling and Compression Library
3
4 Copyright (C) 2021 The Blosc developers <blosc@blosc.org> and Jerome Kieffer <jerome.kieffer@esrf.fr>
5 https://blosc.org
6 License: BSD 3-Clause (see LICENSE.txt)
7
8 See LICENSE.txt for details about copyright and rights to use.
9 **********************************************************************/
10
11 #include "shuffle-generic.h"
12 #include "shuffle-altivec.h"
13
14 /* Make sure ALTIVEC is available for the compilation target and compiler. */
15 #if !defined(__ALTIVEC__)
16 #error ALTIVEC is not supported by the target architecture/platform and/or this compiler.
17 #endif
18
19 #include <altivec.h>
20 #include "transpose-altivec.h"
21
/* Routine optimized for shuffling a buffer for a type size of 2 bytes.
   Splits `vectorizable_elements` 2-byte elements from `src` into two
   byte-planes in `dest`, each plane `total_elements` bytes apart. */
static void
shuffle2_altivec(uint8_t* const dest, const uint8_t* const src,
                 const int32_t vectorizable_elements, const int32_t total_elements){
  static const int32_t bytesoftype = 2;
  /* int32_t (not uint32_t) so the comparisons against the int32_t bounds
     below are same-signedness, matching the other shuffle*_altivec routines. */
  int32_t i, j;
  __vector uint8_t xmm0[2];

  for (j = 0; j < vectorizable_elements; j += 16){
    /* Fetch 16 elements (32 bytes) */
    for (i = 0; i < bytesoftype; i++)
      xmm0[i] = vec_xl(bytesoftype * j + 16 * i, src);

    /* Transpose vectors */
    transpose2x16(xmm0);

    /* Store the result vectors: plane i goes to dest[i * total_elements + j]. */
    for (i = 0; i < bytesoftype; i++)
      vec_xst(xmm0[i], j + i * total_elements, dest);
  }
}
43
/* Routine optimized for shuffling a buffer for a type size of 4 bytes.
   Scatters each of the 4 byte positions of the input elements into its own
   contiguous plane of `dest`, planes spaced `total_elements` bytes apart. */
static void
shuffle4_altivec(uint8_t* const dest, const uint8_t* const src,
                 const int32_t vectorizable_elements, const int32_t total_elements){
  static const int32_t bytesoftype = 4;
  __vector uint8_t rows[4];

  /* Each pass consumes 16 elements (64 bytes, i.e. 4 full vectors). */
  for (int32_t elem = 0; elem < vectorizable_elements; elem += 16) {
    /* Gather the 4 input vectors covering these 16 elements. */
    for (int32_t lane = 0; lane < bytesoftype; lane++)
      rows[lane] = vec_xl(bytesoftype * elem + 16 * lane, src);

    /* Transpose so each vector holds one byte-plane of the 16 elements. */
    transpose4x16(rows);

    /* Write every byte-plane to its stream in the destination. */
    for (int32_t lane = 0; lane < bytesoftype; lane++)
      vec_xst(rows[lane], elem + lane * total_elements, dest);
  }
}
68
69
/* Routine optimized for shuffling a buffer for a type size of 8 bytes.
   Splits 8-byte elements from `src` into eight byte-planes in `dest`,
   each plane `total_elements` bytes apart. */
static void
shuffle8_altivec(uint8_t* const dest, const uint8_t* const src,
                 const int32_t vectorizable_elements, const int32_t total_elements) {
  /* int32_t for consistency with the other shuffle*_altivec routines
     (was uint8_t, which relied on integer promotion in the offset math). */
  static const int32_t bytesoftype = 8;
  int32_t i, j;
  __vector uint8_t xmm0[8];

  for (j = 0; j < vectorizable_elements; j += 16)
  {
    /* Fetch 16 elements (128 bytes, 8 vectors) */
    for (i = 0; i < bytesoftype; i++)
      xmm0[i] = vec_xl(bytesoftype * j + 16 * i, src);

    /* Transpose vectors */
    transpose8x16(xmm0);

    /* Store the result vectors: plane i at dest[i * total_elements + j]. */
    for (i = 0; i < bytesoftype; i++)
      vec_xst(xmm0[i], j + i * total_elements, dest);
  }
}
92
/* Routine optimized for shuffling a buffer for a type size of 16 bytes.
   One element fills exactly one vector, so shuffling a group of 16
   elements is a single 16x16 byte transpose. */
static void
shuffle16_altivec(uint8_t* const dest, const uint8_t* const src,
                  const int32_t vectorizable_elements, const int32_t total_elements) {
  static const int32_t bytesoftype = 16;
  __vector uint8_t tile[16];

  for (int32_t elem = 0; elem < vectorizable_elements; elem += 16)
  {
    /* Load the 256-byte tile (16 elements, one per vector). */
    for (int32_t row = 0; row < bytesoftype; row++)
      tile[row] = vec_xl(bytesoftype * elem + 16 * row, src);

    /* Transpose the tile in place. */
    transpose16x16(tile);

    /* Each transposed row is a byte-plane; write it to its stream. */
    for (int32_t row = 0; row < bytesoftype; row++)
      vec_xst(tile[row], elem + row * total_elements, dest);
  }
}
115
116
/* Routine optimized for shuffling a buffer for a type size larger than 16 bytes.
   The type is processed 16 bytes at a time: each inner iteration shuffles a
   16-byte slice of every element via a 16x16 byte transpose. */
static void
shuffle16_tiled_altivec(uint8_t* const dest, const uint8_t* const src,
                        const int32_t vectorizable_elements, const int32_t total_elements,
                        const int32_t bytesoftype) {
  int32_t j, k;
  /* Remainder of the type size modulo the 16-byte vector width. */
  const int32_t vecs_per_el_rem = bytesoftype & 0xF;
  __vector uint8_t xmm[16];

  for (j = 0; j < vectorizable_elements; j += 16) {
    /* Advance the offset into the type by the vector size (in bytes), unless this is
    the initial iteration and the type size is not a multiple of the vector size.
    In that case, only advance by the number of bytes necessary so that the number
    of remaining bytes in the type will be a multiple of the vector size. */
    int32_t offset_into_type;
    for (offset_into_type = 0; offset_into_type < bytesoftype;
         offset_into_type += (offset_into_type == 0 &&
                              vecs_per_el_rem > 0 ? vecs_per_el_rem : 16)) {

      /* Fetch elements in groups of 256 bytes: one 16-byte slice from each of
         16 consecutive elements (stride bytesoftype between loads). */
      const uint8_t* const src_with_offset = src + offset_into_type;
      for (k = 0; k < 16; k++)
        xmm[k] = vec_xl((j + k) * bytesoftype, src_with_offset);
      /* Transpose the 16x16 byte tile so rows become byte-planes. */
      transpose16x16(xmm);
      /* Store the result vectors: plane (offset_into_type + k) of element j. */
      for (k = 0; k < 16; k++) {
        vec_xst(xmm[k], j + total_elements * (offset_into_type + k), dest);
      }
    }
  }
}
/* Routine optimized for unshuffling a buffer for a type size of 2 bytes.
   Re-interleaves two byte-planes from `src` (planes `total_elements`
   bytes apart) back into contiguous 2-byte elements in `dest`. */
static void
unshuffle2_altivec(uint8_t* const dest, const uint8_t* const src,
                   const int32_t vectorizable_elements, const int32_t total_elements) {
  static const int32_t bytesoftype = 2;
  /* int32_t (not uint32_t) so comparisons against the int32_t bounds are
     same-signedness, matching the shuffle*_altivec counterparts. */
  int32_t i, j;
  __vector uint8_t xmm0[2], xmm1[2];

  for (j = 0; j < vectorizable_elements; j += 16) {
    /* Load 16 elements (32 bytes) into 2 vectors registers. */
    for (i = 0; i < bytesoftype; i++)
      xmm0[i] = vec_xl(j + i * total_elements, src);

    /* Shuffle bytes */
    /* Note the shuffling is different from intel's SSE2 */
    xmm1[0] = vec_vmrghb(xmm0[0], xmm0[1]);
    xmm1[1] = vec_vmrglb(xmm0[0], xmm0[1]);

    /* Store the result vectors*/
    for (i = 0; i < bytesoftype; i++)
      vec_xst(xmm1[i], bytesoftype * j + 16 * i, dest);
  }
}
172
/* Routine optimized for unshuffling a buffer for a type size of 4 bytes.
   Re-interleaves four byte-planes from `src` (planes `total_elements`
   bytes apart) back into contiguous 4-byte elements in `dest`. */
static void
unshuffle4_altivec(uint8_t* const dest, const uint8_t* const src,
                   const int32_t vectorizable_elements, const int32_t total_elements) {
  static const int32_t bytesoftype = 4;
  /* int32_t (not uint32_t) to avoid signed/unsigned comparison against the
     int32_t loop bounds, consistent with the shuffle routines. */
  int32_t i, j;
  __vector uint8_t xmm0[4], xmm1[4];

  for (j = 0; j < vectorizable_elements; j += 16) {
    /* Load 16 elements (64 bytes) into 4 vectors registers. */
    for (i = 0; i < bytesoftype; i++)
      xmm0[i] = vec_xl(j + i * total_elements, src);

    /* Shuffle bytes */
    for (i = 0; i < 2; i++) {
      xmm1[i  ] = vec_vmrghb(xmm0[i * 2], xmm0[i * 2 + 1]);
      xmm1[i+2] = vec_vmrglb(xmm0[i * 2], xmm0[i * 2 + 1]);
    }
    /* Shuffle 2-byte words */
    for (i = 0; i < 2; i++) {
      /* Compute the low 32 bytes */
      xmm0[i] = (__vector uint8_t) vec_vmrghh((__vector uint16_t)xmm1[i * 2],
                                              (__vector uint16_t) xmm1[i * 2 + 1]);
      /* Compute the hi 32 bytes */
      xmm0[i+2] = (__vector uint8_t) vec_vmrglh((__vector uint16_t)xmm1[i * 2],
                                                (__vector uint16_t)xmm1[i * 2 + 1]);
    }
    /* Store the result vectors in proper order: the merge network above leaves
       the output vectors in 0,2,1,3 sequence, so re-order them here. */
    vec_xst(xmm0[0], bytesoftype * j, dest);
    vec_xst(xmm0[2], bytesoftype * j + 16, dest);
    vec_xst(xmm0[1], bytesoftype * j + 32, dest);
    vec_xst(xmm0[3], bytesoftype * j + 48, dest);
  }
}
207
/* Routine optimized for unshuffling a buffer for a type size of 8 bytes.
   Re-interleaves eight byte-planes from `src` (planes `total_elements`
   bytes apart) back into contiguous 8-byte elements in `dest`. */
static void
unshuffle8_altivec(uint8_t* const dest, const uint8_t* const src,
                   const int32_t vectorizable_elements, const int32_t total_elements) {
  /* int32_t for consistency with the sibling routines (was uint8_t). */
  static const int32_t bytesoftype = 8;
  int32_t i, j;
  __vector uint8_t xmm0[8], xmm1[8];

  for (j = 0; j < vectorizable_elements; j += 16) {
    /* Load 16 elements (128 bytes) into 8 vector registers.
       (Previous comment said "64 bytes into 4 vectors", which was wrong.) */
    for (i = 0; i < bytesoftype; i++)
      xmm0[i] = vec_xl(j + i * total_elements, src);
    /* Shuffle bytes */
    for (i = 0; i < 4; i++) {
      xmm1[i] = vec_vmrghb(xmm0[i * 2], xmm0[i * 2 + 1]);
      xmm1[4 + i] = vec_vmrglb(xmm0[i * 2], xmm0[i * 2 + 1]);
    }
    /* Shuffle 2-byte words */
    for (i = 0; i < 4; i++) {
      xmm0[i] = (__vector uint8_t)vec_vmrghh((__vector uint16_t)xmm1[i * 2],
                                             (__vector uint16_t)xmm1[i * 2 + 1]);
      xmm0[4 + i] = (__vector uint8_t)vec_vmrglh((__vector uint16_t)xmm1[i * 2],
                                                 (__vector uint16_t)xmm1[i * 2 + 1]);
    }
    /* Shuffle 4-byte dwords */
    for (i = 0; i < 4; i++) {
      xmm1[i] = (__vector uint8_t)vec_vmrghw((__vector uint32_t)xmm0[i * 2],
                                             (__vector uint32_t)xmm0[i * 2 + 1]);
      xmm1[4 + i] = (__vector uint8_t)vec_vmrglw((__vector uint32_t)xmm0[i * 2],
                                                 (__vector uint32_t)xmm0[i * 2 + 1]);
    }
    /* Store the result vectors in proper order: the merge network leaves the
       output vectors in 0,4,2,6,1,5,3,7 sequence. */
    vec_xst(xmm1[0], bytesoftype * j, dest);
    vec_xst(xmm1[4], bytesoftype * j + 16, dest);
    vec_xst(xmm1[2], bytesoftype * j + 32, dest);
    vec_xst(xmm1[6], bytesoftype * j + 48, dest);
    vec_xst(xmm1[1], bytesoftype * j + 64, dest);
    vec_xst(xmm1[5], bytesoftype * j + 80, dest);
    vec_xst(xmm1[3], bytesoftype * j + 96, dest);
    vec_xst(xmm1[7], bytesoftype * j + 112, dest);
  }
}
251
252
/* Routine optimized for unshuffling a buffer for a type size of 16 bytes.
   One element fills exactly one vector, so unshuffling a group of 16
   elements is a single 16x16 byte transpose. */
static void
unshuffle16_altivec(uint8_t* const dest, const uint8_t* const src,
                    const int32_t vectorizable_elements, const int32_t total_elements) {
  static const int32_t bytesoftype = 16;
  int32_t i, j;
  __vector uint8_t xmm0[16];

  for (j = 0; j < vectorizable_elements; j += 16) {
    /* Load 16 elements (256 bytes) into 16 vector registers, one byte-plane
       per vector. (Previous comment said "64 bytes into 4 vectors".) */
    for (i = 0; i < bytesoftype; i++)
      xmm0[i] = vec_xl(j + i * total_elements, src);

    /* Transpose the 16x16 byte tile. */
    transpose16x16(xmm0);

    /* Store the result vectors. Use unaligned vec_xst — not vec_st, which
       silently truncates the effective address to a 16-byte boundary and
       would corrupt output for an unaligned dest. Every other routine in
       this file stores with vec_xst for the same reason. */
    for (i = 0; i < 16; i++)
      vec_xst(xmm0[i], bytesoftype * (i + j), dest);
  }
}
274
275
/* Routine optimized for unshuffling a buffer for a type size larger than 16 bytes.
   The type is processed 16 bytes at a time: each outer iteration reassembles a
   16-byte slice of every element via a 16x16 byte transpose. */
static void
unshuffle16_tiled_altivec(uint8_t* const dest, const uint8_t* const orig,
                          const int32_t vectorizable_elements, const int32_t total_elements,
                          const int32_t bytesoftype) {
  int32_t i, j, offset_into_type;
  /* Remainder of the type size modulo the 16-byte vector width. */
  const int32_t vecs_per_el_rem = bytesoftype & 0xF;
  __vector uint8_t xmm[16];


  /* Advance the offset into the type by the vector size (in bytes), unless this is
     the initial iteration and the type size is not a multiple of the vector size.
     In that case, only advance by the number of bytes necessary so that the number
     of remaining bytes in the type will be a multiple of the vector size. */

  for (offset_into_type = 0; offset_into_type < bytesoftype;
       offset_into_type += (offset_into_type == 0 &&
                            vecs_per_el_rem > 0 ? vecs_per_el_rem : 16)) {
    for (i = 0; i < vectorizable_elements; i += 16) {
      /* Load the first 128 bytes in 16 XMM registers: one 16-byte vector from
         each of 16 consecutive byte-planes, stride total_elements apart. */
      for (j = 0; j < 16; j++)
        xmm[j] = vec_xl(total_elements * (offset_into_type + j) + i, orig);

      /* Transpose the 16x16 byte tile so rows become element slices. */
      transpose16x16(xmm);

      /* Store the result vectors in proper order: row j is the 16-byte slice
         at offset_into_type within element (i + j). */
      for (j = 0; j < 16; j++)
        vec_xst(xmm[j], (i + j) * bytesoftype + offset_into_type, dest);
    }
  }
}
308
/* Shuffle a block. This can never fail.
   Dispatches to the AltiVec kernel matching `bytesoftype`, falling back to
   the generic implementation for small blocks, odd type sizes, and any
   trailing bytes the vectorized kernels cannot cover. */
void
shuffle_altivec(const int32_t bytesoftype, const int32_t blocksize,
                const uint8_t *_src, uint8_t *_dest)
{
  /* One vectorized pass consumes 16 elements of `bytesoftype` bytes each. */
  const int32_t vectorized_chunk_size = bytesoftype * 16;

  /* If the blocksize is not a multiple of both the typesize and the vector
     size, round it down to the next value which is a multiple of both; the
     remaining tail is handled by the scalar implementation below. */
  const int32_t vectorizable_bytes = blocksize - (blocksize % vectorized_chunk_size);
  const int32_t vectorizable_elements = vectorizable_bytes / bytesoftype;
  const int32_t total_elements = blocksize / bytesoftype;

  /* Blocks too small for even one vectorized pass: generic code only. */
  if (blocksize < vectorized_chunk_size) {
    shuffle_generic(bytesoftype, blocksize, _src, _dest);
    return;
  }

  /* Optimized shuffle implementations, selected by type size. */
  if (bytesoftype == 2) {
    shuffle2_altivec(_dest, _src, vectorizable_elements, total_elements);
  } else if (bytesoftype == 4) {
    shuffle4_altivec(_dest, _src, vectorizable_elements, total_elements);
  } else if (bytesoftype == 8) {
    shuffle8_altivec(_dest, _src, vectorizable_elements, total_elements);
  } else if (bytesoftype == 16) {
    shuffle16_altivec(_dest, _src, vectorizable_elements, total_elements);
  } else if (bytesoftype > 16) {
    shuffle16_tiled_altivec(_dest, _src, vectorizable_elements, total_elements, bytesoftype);
  } else {
    /* Odd small type sizes: the generic shuffle covers the whole buffer,
       so there is nothing left to finish up. */
    shuffle_generic(bytesoftype, blocksize, _src, _dest);
    return;
  }

  /* Scalar pass for any tail bytes the vectorized kernels did not cover. */
  if (vectorizable_bytes < blocksize) {
    shuffle_generic_inline(bytesoftype, vectorizable_bytes, blocksize, _src, _dest);
  }
}
368
/* Unshuffle a block. This can never fail.
   Dispatches to the AltiVec kernel matching `bytesoftype`, falling back to
   the generic implementation for small blocks, odd type sizes, and any
   trailing bytes the vectorized kernels cannot cover. */
void
unshuffle_altivec(const int32_t bytesoftype, const int32_t blocksize,
                  const uint8_t *_src, uint8_t *_dest) {
  /* One vectorized pass produces 16 elements of `bytesoftype` bytes each. */
  const int32_t vectorized_chunk_size = bytesoftype * 16;

  /* Round the blocksize down to a multiple of both the typesize and the
     vector size; the remaining tail is handled by the scalar code below. */
  const int32_t vectorizable_bytes = blocksize - (blocksize % vectorized_chunk_size);
  const int32_t vectorizable_elements = vectorizable_bytes / bytesoftype;
  const int32_t total_elements = blocksize / bytesoftype;

  /* Blocks too small for even one vectorized pass: generic code only. */
  if (blocksize < vectorized_chunk_size) {
    unshuffle_generic(bytesoftype, blocksize, _src, _dest);
    return;
  }

  /* Optimized unshuffle implementations, selected by type size. */
  if (bytesoftype == 2) {
    unshuffle2_altivec(_dest, _src, vectorizable_elements, total_elements);
  } else if (bytesoftype == 4) {
    unshuffle4_altivec(_dest, _src, vectorizable_elements, total_elements);
  } else if (bytesoftype == 8) {
    unshuffle8_altivec(_dest, _src, vectorizable_elements, total_elements);
  } else if (bytesoftype == 16) {
    unshuffle16_altivec(_dest, _src, vectorizable_elements, total_elements);
  } else if (bytesoftype > 16) {
    unshuffle16_tiled_altivec(_dest, _src, vectorizable_elements, total_elements, bytesoftype);
  } else {
    /* Odd small type sizes: the generic unshuffle covers the whole buffer,
       so there is nothing left to finish up. */
    unshuffle_generic(bytesoftype, blocksize, _src, _dest);
    return;
  }

  /* Scalar pass for any tail bytes the vectorized kernels did not cover. */
  if (vectorizable_bytes < blocksize) {
    unshuffle_generic_inline(bytesoftype, vectorizable_bytes, blocksize, _src, _dest);
  }
}
424