/*********************************************************************
  Blosc - Blocked Shuffling and Compression Library

  Copyright (C) 2021  The Blosc developers <blosc@blosc.org> and Jerome Kieffer <jerome.kieffer@esrf.fr>
  https://blosc.org
  License: BSD 3-Clause (see LICENSE.txt)

  See LICENSE.txt for details about copyright and rights to use.
**********************************************************************/

#include "shuffle-generic.h"
#include "shuffle-altivec.h"

/* Make sure ALTIVEC is available for the compilation target and compiler. */
#if !defined(__ALTIVEC__)
  #error ALTIVEC is not supported by the target architecture/platform and/or this compiler.
#endif

#include <altivec.h>
#include "transpose-altivec.h"

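/* Overview: the shuffle splits each element into its constituent bytes, so
   byte k of element e moves from src[e * bytesoftype + k] to
   dest[k * total_elements + e]; the unshuffle is the inverse transform.  All
   loads and stores below use vec_xl()/vec_xst(), which (unlike
   vec_ld()/vec_st()) do not require 16-byte-aligned addresses. */
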
/* Routine optimized for shuffling a buffer for a type size of 2 bytes. */
static void
shuffle2_altivec(uint8_t* const dest, const uint8_t* const src,
                 const int32_t vectorizable_elements, const int32_t total_elements){
  static const int32_t bytesoftype = 2;
  uint32_t i, j;
  __vector uint8_t xmm0[2];

  for (j = 0; j < vectorizable_elements; j += 16){
    /* Fetch 16 elements (32 bytes) */
    for (i = 0; i < bytesoftype; i++)
      xmm0[i] = vec_xl(bytesoftype * j + 16 * i, src);

    /* Transpose vectors */
    transpose2x16(xmm0);
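    /* After the transpose, xmm0[0] is expected to hold byte 0 of all 16
       elements and xmm0[1] byte 1 (assuming transpose2x16() performs a plain
       2x16 byte transpose), so each vector below lands in its own byte plane
       of dest. */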

    /* Store the result vectors */
    for (i = 0; i < bytesoftype; i++)
      vec_xst(xmm0[i], j + i * total_elements, dest);
  }
}

/* Routine optimized for shuffling a buffer for a type size of 4 bytes. */
static void
shuffle4_altivec(uint8_t* const dest, const uint8_t* const src,
                 const int32_t vectorizable_elements, const int32_t total_elements){
  static const int32_t bytesoftype = 4;
  int32_t i, j;
  __vector uint8_t xmm0[4];

  for (j = 0; j < vectorizable_elements; j += 16)
  {
    /* Fetch 16 elements (64 bytes, 4 vectors) */
    for (i = 0; i < bytesoftype; i++)
      xmm0[i] = vec_xl(bytesoftype * j + 16 * i, src);

    /* Transpose vectors */
    transpose4x16(xmm0);

    /* Store the result vectors */
    for (i = 0; i < bytesoftype; i++)
      vec_xst(xmm0[i], j + i * total_elements, dest);
  }
}


/* Routine optimized for shuffling a buffer for a type size of 8 bytes. */
static void
shuffle8_altivec(uint8_t* const dest, const uint8_t* const src,
                 const int32_t vectorizable_elements, const int32_t total_elements) {
  static const uint8_t bytesoftype = 8;
  int32_t i, j;
  __vector uint8_t xmm0[8];

  for (j = 0; j < vectorizable_elements; j += 16)
  {
    /* Fetch 16 elements (128 bytes, 8 vectors) */
    for (i = 0; i < bytesoftype; i++)
      xmm0[i] = vec_xl(bytesoftype * j + 16 * i, src);

    /* Transpose vectors */
    transpose8x16(xmm0);

    /* Store the result vectors */
    for (i = 0; i < bytesoftype; i++)
      vec_xst(xmm0[i], j + i * total_elements, dest);
  }
}

/* Routine optimized for shuffling a buffer for a type size of 16 bytes. */
static void
shuffle16_altivec(uint8_t* const dest, const uint8_t* const src,
                  const int32_t vectorizable_elements, const int32_t total_elements) {
  static const int32_t bytesoftype = 16;
  int32_t i, j;
  __vector uint8_t xmm0[16];

  for (j = 0; j < vectorizable_elements; j += 16)
  {
    /* Fetch 16 elements (256 bytes, 16 vectors) */
    for (i = 0; i < bytesoftype; i++)
      xmm0[i] = vec_xl(bytesoftype * j + 16 * i, src);

    /* Transpose vectors */
    transpose16x16(xmm0);

    /* Store the result vectors */
    for (i = 0; i < bytesoftype; i++)
      vec_xst(xmm0[i], j + i * total_elements, dest);
  }
}


/* Routine optimized for shuffling a buffer for a type size larger than 16 bytes. */
static void
shuffle16_tiled_altivec(uint8_t* const dest, const uint8_t* const src,
                        const int32_t vectorizable_elements, const int32_t total_elements,
                        const int32_t bytesoftype) {
  int32_t j, k;
  const int32_t vecs_per_el_rem = bytesoftype & 0xF;
  __vector uint8_t xmm[16];

  for (j = 0; j < vectorizable_elements; j += 16) {
    /* Advance the offset into the type by the vector size (in bytes), unless this is
       the initial iteration and the type size is not a multiple of the vector size.
       In that case, only advance by the number of bytes necessary so that the number
       of remaining bytes in the type will be a multiple of the vector size. */
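    /* For example, with bytesoftype == 20 we get vecs_per_el_rem == 4, so the
       offsets visited are 0 and 4: the first pass covers type bytes 0..15 and
       the second bytes 4..19, with bytes 4..15 simply rewritten with the same
       values. */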
    int32_t offset_into_type;
    for (offset_into_type = 0; offset_into_type < bytesoftype;
         offset_into_type += (offset_into_type == 0 &&
                              vecs_per_el_rem > 0 ? vecs_per_el_rem : 16)) {

      /* Fetch elements in groups of 256 bytes */
      const uint8_t* const src_with_offset = src + offset_into_type;
      for (k = 0; k < 16; k++)
        xmm[k] = vec_xl((j + k) * bytesoftype, src_with_offset);

      /* Transpose vectors */
      transpose16x16(xmm);

      /* Store the result vectors */
      for (k = 0; k < 16; k++) {
        vec_xst(xmm[k], j + total_elements * (offset_into_type + k), dest);
      }
    }
  }
}

/* Routine optimized for unshuffling a buffer for a type size of 2 bytes. */
static void
unshuffle2_altivec(uint8_t* const dest, const uint8_t* const src,
                   const int32_t vectorizable_elements, const int32_t total_elements) {
  static const int32_t bytesoftype = 2;
  uint32_t i, j;
  __vector uint8_t xmm0[2], xmm1[2];

  for (j = 0; j < vectorizable_elements; j += 16) {
    /* Load 16 elements (32 bytes) into 2 vector registers. */
    for (i = 0; i < bytesoftype; i++)
      xmm0[i] = vec_xl(j + i * total_elements, src);

    /* Shuffle bytes */
    /* Note the shuffling is different from Intel's SSE2 */
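    /* vec_vmrghb()/vec_vmrglb() interleave the two byte planes, pairing byte 0
       and byte 1 of each element again (which half of the inputs "high" and
       "low" refer to depends on the element-order convention, hence the note
       above). */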
    xmm1[0] = vec_vmrghb(xmm0[0], xmm0[1]);
    xmm1[1] = vec_vmrglb(xmm0[0], xmm0[1]);

    /* Store the result vectors */
    for (i = 0; i < bytesoftype; i++)
      vec_xst(xmm1[i], bytesoftype * j + 16 * i, dest);
  }
}

/* Routine optimized for unshuffling a buffer for a type size of 4 bytes. */
static void
unshuffle4_altivec(uint8_t* const dest, const uint8_t* const src,
                   const int32_t vectorizable_elements, const int32_t total_elements) {
  static const int32_t bytesoftype = 4;
  uint32_t i, j;
  __vector uint8_t xmm0[4], xmm1[4];

  for (j = 0; j < vectorizable_elements; j += 16) {
    /* Load 16 elements (64 bytes) into 4 vector registers. */
    for (i = 0; i < bytesoftype; i++)
      xmm0[i] = vec_xl(j + i * total_elements, src);

    /* Shuffle bytes */
    for (i = 0; i < 2; i++) {
      xmm1[i  ] = vec_vmrghb(xmm0[i * 2], xmm0[i * 2 + 1]);
      xmm1[i+2] = vec_vmrglb(xmm0[i * 2], xmm0[i * 2 + 1]);
    }
    /* Shuffle 2-byte words */
    for (i = 0; i < 2; i++) {
      /* Compute the low 32 bytes */
      xmm0[i] = (__vector uint8_t) vec_vmrghh((__vector uint16_t) xmm1[i * 2],
                                              (__vector uint16_t) xmm1[i * 2 + 1]);
      /* Compute the high 32 bytes */
      xmm0[i+2] = (__vector uint8_t) vec_vmrglh((__vector uint16_t) xmm1[i * 2],
                                                (__vector uint16_t) xmm1[i * 2 + 1]);
    }
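    /* The two merge stages leave the four result vectors in bit-reversed index
       order (0, 2, 1, 3), hence the store sequence below. */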
    /* Store the result vectors in proper order */
    vec_xst(xmm0[0], bytesoftype * j, dest);
    vec_xst(xmm0[2], bytesoftype * j + 16, dest);
    vec_xst(xmm0[1], bytesoftype * j + 32, dest);
    vec_xst(xmm0[3], bytesoftype * j + 48, dest);
  }
}

/* Routine optimized for unshuffling a buffer for a type size of 8 bytes. */
static void
unshuffle8_altivec(uint8_t* const dest, const uint8_t* const src,
                   const int32_t vectorizable_elements, const int32_t total_elements) {
  static const uint8_t bytesoftype = 8;
  uint32_t i, j;
  __vector uint8_t xmm0[8], xmm1[8];

  for (j = 0; j < vectorizable_elements; j += 16) {
    /* Load 16 elements (128 bytes) into 8 vector registers. */
    for (i = 0; i < bytesoftype; i++)
      xmm0[i] = vec_xl(j + i * total_elements, src);
    /* Shuffle bytes */
    for (i = 0; i < 4; i++) {
      xmm1[i] = vec_vmrghb(xmm0[i * 2], xmm0[i * 2 + 1]);
      xmm1[4 + i] = vec_vmrglb(xmm0[i * 2], xmm0[i * 2 + 1]);
    }
    /* Shuffle 2-byte words */
    for (i = 0; i < 4; i++) {
      xmm0[i] = (__vector uint8_t) vec_vmrghh((__vector uint16_t) xmm1[i * 2],
                                              (__vector uint16_t) xmm1[i * 2 + 1]);
      xmm0[4 + i] = (__vector uint8_t) vec_vmrglh((__vector uint16_t) xmm1[i * 2],
                                                  (__vector uint16_t) xmm1[i * 2 + 1]);
    }
    /* Shuffle 4-byte dwords */
    for (i = 0; i < 4; i++) {
      xmm1[i] = (__vector uint8_t) vec_vmrghw((__vector uint32_t) xmm0[i * 2],
                                              (__vector uint32_t) xmm0[i * 2 + 1]);
      xmm1[4 + i] = (__vector uint8_t) vec_vmrglw((__vector uint32_t) xmm0[i * 2],
                                                  (__vector uint32_t) xmm0[i * 2 + 1]);
    }
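    /* The three merge stages leave the eight result vectors in bit-reversed
       index order (0, 4, 2, 6, 1, 5, 3, 7), hence the store sequence below. */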
    /* Store the result vectors in proper order */
    vec_xst(xmm1[0], bytesoftype * j, dest);
    vec_xst(xmm1[4], bytesoftype * j + 16, dest);
    vec_xst(xmm1[2], bytesoftype * j + 32, dest);
    vec_xst(xmm1[6], bytesoftype * j + 48, dest);
    vec_xst(xmm1[1], bytesoftype * j + 64, dest);
    vec_xst(xmm1[5], bytesoftype * j + 80, dest);
    vec_xst(xmm1[3], bytesoftype * j + 96, dest);
    vec_xst(xmm1[7], bytesoftype * j + 112, dest);
  }
}


/* Routine optimized for unshuffling a buffer for a type size of 16 bytes. */
static void
unshuffle16_altivec(uint8_t* const dest, const uint8_t* const src,
                    const int32_t vectorizable_elements, const int32_t total_elements) {
  static const int32_t bytesoftype = 16;
  uint32_t i, j;
  __vector uint8_t xmm0[16];

  for (j = 0; j < vectorizable_elements; j += 16) {
    /* Load 16 elements (256 bytes) into 16 vector registers. */
    for (i = 0; i < bytesoftype; i++)
      xmm0[i] = vec_xl(j + i * total_elements, src);

    /* Transpose vectors */
    transpose16x16(xmm0);

    /* Store the result vectors */
    for (i = 0; i < 16; i++)
      vec_xst(xmm0[i], bytesoftype * (i + j), dest);
  }
}


/* Routine optimized for unshuffling a buffer for a type size larger than 16 bytes. */
static void
unshuffle16_tiled_altivec(uint8_t* const dest, const uint8_t* const orig,
                          const int32_t vectorizable_elements, const int32_t total_elements,
                          const int32_t bytesoftype) {
  int32_t i, j, offset_into_type;
  const int32_t vecs_per_el_rem = bytesoftype & 0xF;
  __vector uint8_t xmm[16];

  /* Advance the offset into the type by the vector size (in bytes), unless this is
     the initial iteration and the type size is not a multiple of the vector size.
     In that case, only advance by the number of bytes necessary so that the number
     of remaining bytes in the type will be a multiple of the vector size. */
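  /* See the worked example in shuffle16_tiled_altivec() above: for a type size
     that is not a multiple of 16, the first step only advances by the
     remainder, and the rows covered twice are simply processed twice with
     identical results. */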

  for (offset_into_type = 0; offset_into_type < bytesoftype;
       offset_into_type += (offset_into_type == 0 &&
           vecs_per_el_rem > 0 ? vecs_per_el_rem : 16)) {
    for (i = 0; i < vectorizable_elements; i += 16) {
      /* Load 16 vectors (256 bytes) */
      for (j = 0; j < 16; j++)
        xmm[j] = vec_xl(total_elements * (offset_into_type + j) + i, orig);

      /* Transpose vectors */
      transpose16x16(xmm);

      /* Store the result vectors in proper order */
      for (j = 0; j < 16; j++)
        vec_xst(xmm[j], (i + j) * bytesoftype + offset_into_type, dest);
    }
  }
}

/* Shuffle a block.  This can never fail. */
void
shuffle_altivec(const int32_t bytesoftype, const int32_t blocksize,
                const uint8_t *_src, uint8_t *_dest)
{
  const int32_t vectorized_chunk_size = bytesoftype * 16;

  /* If the blocksize is not a multiple of both the typesize and
     the vector size, round the blocksize down to the next value
     which is a multiple of both. The vectorized shuffle can be
     used for that portion of the data, and the naive implementation
     can be used for the remaining portion. */
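  /* For example, with bytesoftype == 4 and blocksize == 100:
     vectorized_chunk_size == 64, so vectorizable_bytes == 64 (16 elements take
     the vectorized path) and the trailing 36 bytes (9 elements) are handled by
     shuffle_generic_inline() at the end. */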
  const int32_t vectorizable_bytes = blocksize - (blocksize % vectorized_chunk_size);
  const int32_t vectorizable_elements = vectorizable_bytes / bytesoftype;
  const int32_t total_elements = blocksize / bytesoftype;

  /* If the block size is too small to be vectorized,
     use the generic implementation. */
  if (blocksize < vectorized_chunk_size) {
    shuffle_generic(bytesoftype, blocksize, _src, _dest);
    return;
  }

  /* Optimized shuffle implementations */
  switch (bytesoftype) {
    case 2:
      shuffle2_altivec(_dest, _src, vectorizable_elements, total_elements);
      break;
    case 4:
      shuffle4_altivec(_dest, _src, vectorizable_elements, total_elements);
      break;
    case 8:
      shuffle8_altivec(_dest, _src, vectorizable_elements, total_elements);
      break;
    case 16:
      shuffle16_altivec(_dest, _src, vectorizable_elements, total_elements);
      break;
    default:
      if (bytesoftype > 16) {
        shuffle16_tiled_altivec(_dest, _src, vectorizable_elements, total_elements, bytesoftype);
      }
      else {
        /* Non-optimized shuffle */
        shuffle_generic(bytesoftype, blocksize, _src, _dest);
        /* The non-optimized function covers the whole buffer,
           so we're done processing here. */
        return;
      }
  }

  /* If the buffer had any bytes at the end which couldn't be handled
     by the vectorized implementations, use the non-optimized version
     to finish them up. */
  if (vectorizable_bytes < blocksize) {
    shuffle_generic_inline(bytesoftype, vectorizable_bytes, blocksize, _src, _dest);
  }
}

/* Unshuffle a block.  This can never fail. */
void
unshuffle_altivec(const int32_t bytesoftype, const int32_t blocksize,
                  const uint8_t *_src, uint8_t *_dest) {
  const int32_t vectorized_chunk_size = bytesoftype * 16;
  /* If the blocksize is not a multiple of both the typesize and
     the vector size, round the blocksize down to the next value
     which is a multiple of both. The vectorized unshuffle can be
     used for that portion of the data, and the naive implementation
     can be used for the remaining portion. */
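  /* Same split as in shuffle_altivec() above: the vectorized routines handle
     vectorizable_bytes and unshuffle_generic_inline() finishes the tail. */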
  const int32_t vectorizable_bytes = blocksize - (blocksize % vectorized_chunk_size);
  const int32_t vectorizable_elements = vectorizable_bytes / bytesoftype;
  const int32_t total_elements = blocksize / bytesoftype;

  /* If the block size is too small to be vectorized,
     use the generic implementation. */
  if (blocksize < vectorized_chunk_size) {
    unshuffle_generic(bytesoftype, blocksize, _src, _dest);
    return;
  }

  /* Optimized unshuffle implementations */
  switch (bytesoftype) {
    case 2:
      unshuffle2_altivec(_dest, _src, vectorizable_elements, total_elements);
      break;
    case 4:
      unshuffle4_altivec(_dest, _src, vectorizable_elements, total_elements);
      break;
    case 8:
      unshuffle8_altivec(_dest, _src, vectorizable_elements, total_elements);
      break;
    case 16:
      unshuffle16_altivec(_dest, _src, vectorizable_elements, total_elements);
      break;
    default:
      if (bytesoftype > 16) {
        unshuffle16_tiled_altivec(_dest, _src, vectorizable_elements, total_elements, bytesoftype);
      }
      else {
        /* Non-optimized unshuffle */
        unshuffle_generic(bytesoftype, blocksize, _src, _dest);
        /* The non-optimized function covers the whole buffer,
           so we're done processing here. */
        return;
      }
  }

  /* If the buffer had any bytes at the end which couldn't be handled
     by the vectorized implementations, use the non-optimized version
     to finish them up. */
  if (vectorizable_bytes < blocksize) {
    unshuffle_generic_inline(bytesoftype, vectorizable_bytes, blocksize, _src, _dest);
  }
}