/*********************************************************************
  Blosc - Blocked Shuffling and Compression Library

  Copyright (C) 2021  The Blosc Developers <blosc@blosc.org>
  https://blosc.org
  License: BSD 3-Clause (see LICENSE.txt)

  See LICENSE.txt for details about copyright and rights to use.
**********************************************************************/

/*********************************************************************
  Bitshuffle - Filter for improving compression of typed binary data.

  Author: Kiyoshi Masui <kiyo@physics.ubc.ca>
  Website: http://www.github.com/kiyo-masui/bitshuffle

  Note: Adapted for c-blosc by Francesc Alted
        Altivec/VSX version by Jerome Kieffer.

  See LICENSES/BITSHUFFLE.txt file for details about copyright and
  rights to use.
**********************************************************************/


#include "bitshuffle-generic.h"
#include "bitshuffle-altivec.h"
#include <string.h>  /* for memcpy() in bshuf_copy() */

/* Make sure ALTIVEC is available for the compilation target and compiler. */
#if !defined(__ALTIVEC__)
  #error ALTIVEC is not supported by the target architecture/platform and/or this compiler.
#endif
#include <altivec.h>
#include "transpose-altivec.h"

/* The following helper is useful for debugging purposes */
#if 0
#include <stdio.h>
#include <string.h>

static void helper_print(__vector uint8_t v, char* txt){
  printf("%s %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x\n", txt,
  v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7], v[8], v[9], v[10], v[11], v[12], v[13], v[14], v[15]);
}
#endif


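/* Build a byte mask whose first `offset` lanes are 0x00 and whose remaining
 * lanes are 0xFF.  Presumably intended for blending/masking partial 16-byte
 * stores (it is not used in this file). */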
static inline __vector uint8_t gen_save_mask(size_t offset){
  __vector uint8_t mask;
  size_t k;
  for (k = 0; k < 16; k++)
    mask[k] = (k < offset) ? 0 : 0xFF;
  return mask;
}


// Build and return a bit-permutation mask
static __vector uint8_t make_bitperm_mask(int type_size, int bit) {
  __vector uint8_t result;
  if (type_size == 1) {
    // data_type is 8 bits long
    for (int i = 0; i < 16; i++)
      result[i] = 8 * (15 - i) + (7 - bit);
  }
  else if (type_size == 2) {
    // data_type is 16 bits long
    for (int i = 0; i < 8; i++) {
      result[i] = 16 * i + 2 * bit;
      result[i+8] = 16 * i + 2 * bit + 1;
    }
  }
  return result;
}
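/* How these masks are consumed (my reading of vec_bperm()/vbpermq, not taken
 * from the original comments): each mask byte holds a bit index into the
 * 128-bit input, counted from the most significant bit, and vec_bperm()
 * gathers the 16 selected bits into a single 16-bit field.  On the
 * little-endian targets assumed below that field shows up as halfword
 * element 4 of the result vector, which is why the callers read tmp[4].
 * Note that only type_size 1 and 2 are handled above; other sizes leave the
 * returned mask unspecified. */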


/* Routine optimized for bit-unshuffling a buffer for a type size of 1 byte.
 *
 * Strategy: Read 8 vectors of 128 bits each, i.e. 128 elements.
 *           Transpose byte-wise; 2 neighboring elements (x8) remain in each vector: 24 operations.
 *           Transpose bit-wise within a vector: 8x8 bit-wise transposition: 64 operations.
 *           Saving is performed by shorts (2 bytes at a time).
 * Total cost: 8 vector reads, 88 transpositions, 64 writes,
 *             14 mask vectors, 16 work vectors.
 */
void
bitunshuffle1_altivec(void* _src, void* dest, const size_t size, const size_t elem_size) {
  size_t ii, jj, kk, vp;
  const uint8_t* in_b = (const uint8_t*)_src;
  uint16_t* out_s = (uint16_t*)dest;
  size_t nrows = 8 * elem_size;
  size_t nbyte_row = size / 8;

  // working vectors
  __vector uint8_t xmm0[8], xmm1[8], masks[8];
  // Vector masks
  static const __vector uint8_t lo01 = (const __vector uint8_t) {
    0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d,
    0x10, 0x11, 0x14, 0x15, 0x18, 0x19, 0x1c, 0x1d};
  static const __vector uint8_t hi01 = (const __vector uint8_t) {
    0x02, 0x03, 0x06, 0x07, 0x0a, 0x0b, 0x0e, 0x0f,
    0x12, 0x13, 0x16, 0x17, 0x1a, 0x1b, 0x1e, 0x1f};
  static const __vector uint8_t lo02 = (const __vector uint8_t) {
    0x00, 0x01, 0x08, 0x09, 0x10, 0x11, 0x18, 0x19,
    0x02, 0x03, 0x0a, 0x0b, 0x12, 0x13, 0x1a, 0x1b};
  static const __vector uint8_t hi02 = (const __vector uint8_t) {
    0x04, 0x05, 0x0c, 0x0d, 0x14, 0x15, 0x1c, 0x1d,
    0x06, 0x07, 0x0e, 0x0f, 0x16, 0x17, 0x1e, 0x1f};
  static const __vector uint8_t epi64_low = (const __vector uint8_t) {
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
    0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17};
  static const __vector uint8_t epi64_hi = (const __vector uint8_t) {
    0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};

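  /* Build the eight vec_bperm() masks.  My reading (an assumption, not from
   * the original comments): each mask entry is a bit index counted from the
   * most significant bit of the 128-bit vector, so 127 - n addresses the bit
   * sitting n positions above the least significant end of the little-endian
   * vector loaded by vec_xl() below. */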
  for (kk = 0; kk < 8; kk++){
    __vector uint8_t msk;
    for (ii = 0; ii < 8; ii++){
      msk[ii] = 127 - (16 * ii + 2 * kk);
      msk[ii+8] = 127 - (16 * ii + 2 * kk + 1);
    }
    //helper_print(msk, "Mask");
    masks[kk] = msk;
  }

  // read the data
  vp = 0;
  for (ii = 0; ii + 7 < nrows; ii += 8) {
    for (jj = 0; jj + 15 < nbyte_row; jj += 16) {

      for (kk = 0; kk < 8; kk++){
        xmm0[kk] = vec_xl((ii + kk) * nbyte_row + jj, in_b);
        //helper_print(xmm0[kk], "vector read");
      }

      // transpositions 0-1
      xmm1[0] = vec_perm(xmm0[0], xmm0[1], lo01);
      xmm1[1] = vec_perm(xmm0[0], xmm0[1], hi01);
      xmm1[2] = vec_perm(xmm0[2], xmm0[3], lo01);
      xmm1[3] = vec_perm(xmm0[2], xmm0[3], hi01);
      xmm1[4] = vec_perm(xmm0[4], xmm0[5], lo01);
      xmm1[5] = vec_perm(xmm0[4], xmm0[5], hi01);
      xmm1[6] = vec_perm(xmm0[6], xmm0[7], lo01);
      xmm1[7] = vec_perm(xmm0[6], xmm0[7], hi01);
      // transpositions 0-2
      xmm0[0] = vec_perm(xmm1[0], xmm1[2], lo02);
      xmm0[2] = vec_perm(xmm1[0], xmm1[2], hi02);
      xmm0[1] = vec_perm(xmm1[1], xmm1[3], lo02);
      xmm0[3] = vec_perm(xmm1[1], xmm1[3], hi02);
      xmm0[4] = vec_perm(xmm1[4], xmm1[6], lo02);
      xmm0[6] = vec_perm(xmm1[4], xmm1[6], hi02);
      xmm0[5] = vec_perm(xmm1[5], xmm1[7], lo02);
      xmm0[7] = vec_perm(xmm1[5], xmm1[7], hi02);
      // transpositions 0-4
      xmm1[0] = vec_perm(xmm0[0], xmm1[4], epi64_low);
      xmm1[2] = vec_perm(xmm0[0], xmm1[4], epi64_hi);
      xmm1[1] = vec_perm(xmm0[1], xmm1[5], epi64_low);
      xmm1[3] = vec_perm(xmm0[1], xmm1[5], epi64_hi);
      xmm1[4] = vec_perm(xmm0[2], xmm1[6], epi64_low);
      xmm1[6] = vec_perm(xmm0[2], xmm1[6], epi64_hi);
      xmm1[5] = vec_perm(xmm0[3], xmm1[7], epi64_low);
      xmm1[7] = vec_perm(xmm0[3], xmm1[7], epi64_hi);

      // At this stage each vector xmm1 contains the data from 16 adjacent bytes
      for (int ll = 0; ll < 8; ll++){
        __vector uint8_t xmm = xmm1[ll];
        //helper_print(xmm, "vector transposed");
        for (kk = 0; kk < 8; kk++) {
           __vector uint16_t tmp;
           tmp = (__vector uint16_t) vec_bperm(xmm, masks[kk]);
           //printf("%d %d\n", vp, tmp[4]);
           //helper_print((__vector uint8_t)tmp, "tmp");
           out_s[vp++] = tmp[4];
        }
      }
    }
  }
}


/* Transpose bytes within elements for 16 bit elements. */
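/* Pattern shared by the 16/32/64/128-bit variants below: load `bytesoftype`
 * vectors (16 elements worth of input), transpose them with the helpers from
 * transpose-altivec.h, and store vector j into the j-th byte plane of the
 * output (offset j * size).  The scalar remainder routine then handles the
 * last size % 16 elements. */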
int64_t bshuf_trans_byte_elem_16(void* in, void* out, const size_t size) {
  static const uint8_t bytesoftype = 2;
  __vector uint8_t xmm0[2];

  for (size_t i = 0; i + 15 < size; i += 16) {
    for (int j = 0; j < bytesoftype; j++)
      xmm0[j] = vec_xl(bytesoftype * i + 16 * j, (const uint8_t*)in);

    /* Transpose vectors */
    transpose2x16(xmm0);

    for (int j = 0; j < bytesoftype; j++)
      vec_xst(xmm0[j], i + j * size, (uint8_t*)out);
  }
  return bshuf_trans_byte_elem_remainder(in, out, size, bytesoftype,
                                         size - size % 16);
}


/* Transpose bytes within elements for 32 bit elements. */
int64_t bshuf_trans_byte_elem_32(void* in, void* out, const size_t size) {
  static const uint8_t bytesoftype = 4;
  __vector uint8_t xmm0[4];

  for (size_t i = 0; i + 15 < size; i += 16) {
    for (int j = 0; j < bytesoftype; j++)
      xmm0[j] = vec_xl(bytesoftype * i + 16 * j, (const uint8_t*)in);

    /* Transpose vectors */
    transpose4x16(xmm0);

    for (int j = 0; j < bytesoftype; j++)
      vec_xst(xmm0[j], i + j * size, (uint8_t*)out);
  }
  return bshuf_trans_byte_elem_remainder(in, out, size, bytesoftype,
                                         size - size % 16);
}


/* Transpose bytes within elements for 64 bit elements. */
int64_t bshuf_trans_byte_elem_64(void* in, void* out, const size_t size) {
  static const uint8_t bytesoftype = 8;
  __vector uint8_t xmm0[8];

  for (size_t i = 0; i + 15 < size; i += 16) {
    for (int j = 0; j < bytesoftype; j++)
      xmm0[j] = vec_xl(bytesoftype * i + 16 * j, (const uint8_t*)in);

    /* Transpose vectors */
    transpose8x16(xmm0);

    for (int j = 0; j < bytesoftype; j++)
      vec_xst(xmm0[j], i + j * size, (uint8_t*)out);
  }
  return bshuf_trans_byte_elem_remainder(in, out, size, bytesoftype,
                                         size - size % 16);
}


/* Transpose bytes within elements for 128 bit elements. */
int64_t bshuf_trans_byte_elem_128(void* in, void* out, const size_t size) {
  static const uint8_t bytesoftype = 16;
  __vector uint8_t xmm0[16];

  for (size_t i = 0; i + 15 < size; i += 16) {
    for (int j = 0; j < bytesoftype; j++)
      xmm0[j] = vec_xl(bytesoftype * i + 16 * j, (const uint8_t*)in);

    /* Transpose vectors */
    transpose16x16(xmm0);

    for (int j = 0; j < bytesoftype; j++)
      vec_xst(xmm0[j], i + j * size, (uint8_t*)out);
  }
  return bshuf_trans_byte_elem_remainder(in, out, size, bytesoftype,
                                         size - size % 16);
}


/* Memory copy with bshuf call signature. */
int64_t bshuf_copy(void* in, void* out, const size_t size,
                   const size_t elem_size) {

  char* in_b = (char*)in;
  char* out_b = (char*)out;

  memcpy(out_b, in_b, size * elem_size);
  return size * elem_size;
}


/* Transpose bytes within elements using the best AltiVec algorithm available. */
int64_t bshuf_trans_byte_elem_altivec(void* in, void* out, const size_t size,
                                      const size_t elem_size, void* tmp_buf) {

  int64_t count;

  /*  Trivial cases: power of 2 bytes. */
  switch (elem_size) {
    case 1:
      count = bshuf_copy(in, out, size, elem_size);
      return count;
    case 2:
      count = bshuf_trans_byte_elem_16(in, out, size);
      return count;
    case 4:
      count = bshuf_trans_byte_elem_32(in, out, size);
      return count;
    case 8:
      count = bshuf_trans_byte_elem_64(in, out, size);
      return count;
    case 16:
      count = bshuf_trans_byte_elem_128(in, out, size);
      return count;
  }

  /*  Worst case: odd number of bytes. Turns out that this is faster for */
  /*  (odd * 2) byte elements as well (hence % 4). */
  if (elem_size % 4) {
    count = bshuf_trans_byte_elem_scal(in, out, size, elem_size);
    return count;
  }

  /*  Multiple of power of 2: transpose hierarchically. */
  {
    size_t nchunk_elem;

    if ((elem_size % 16) == 0) {
      nchunk_elem = elem_size / 16;
      TRANS_ELEM_TYPE(in, out, size, nchunk_elem, __vector uint8_t);
      count = bshuf_trans_byte_elem_128(out, tmp_buf,
                                        size * nchunk_elem);
      bshuf_trans_elem(tmp_buf, out, 16, nchunk_elem, size);
    } else if ((elem_size % 8) == 0) {
      nchunk_elem = elem_size / 8;
      TRANS_ELEM_TYPE(in, out, size, nchunk_elem, int64_t);
      count = bshuf_trans_byte_elem_64(out, tmp_buf,
                                       size * nchunk_elem);
      bshuf_trans_elem(tmp_buf, out, 8, nchunk_elem, size);
    } else if ((elem_size % 4) == 0) {
      nchunk_elem = elem_size / 4;
      TRANS_ELEM_TYPE(in, out, size, nchunk_elem, int32_t);
      count = bshuf_trans_byte_elem_32(out, tmp_buf,
                                       size * nchunk_elem);
      bshuf_trans_elem(tmp_buf, out, 4, nchunk_elem, size);
    } else {
      /*  Not used since scalar algorithm is faster. */
      nchunk_elem = elem_size / 2;
      TRANS_ELEM_TYPE(in, out, size, nchunk_elem, int16_t);
      count = bshuf_trans_byte_elem_16(out, tmp_buf, size * nchunk_elem);
      bshuf_trans_elem(tmp_buf, out, 2, nchunk_elem, size);
    }

    return count;
  }
}


/* Transpose bits within bytes. */
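/* For every 16 input bytes, vec_bperm() with mask kk collects bit kk of each
 * byte into one 16-bit word, which is stored into bit-row kk of the output
 * (rows are nbyte / 8 bytes apart).  The scalar remainder routine covers the
 * last nbyte % 16 bytes. */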
int64_t bshuf_trans_bit_byte_altivec(void* in, void* out, const size_t size,
                                     const size_t elem_size) {

  const uint8_t* in_b = (const uint8_t*)in;
  uint8_t* out_b = (uint8_t*)out;
  int64_t count;
  size_t nbyte = elem_size * size;
  __vector uint8_t data, masks[8];
  size_t ii, kk;

  CHECK_MULT_EIGHT(nbyte);

  // Generate all 8 needed masks
  for (kk = 0; kk < 8; kk++){
    masks[kk] = make_bitperm_mask(1, kk);
  }

  for (ii = 0; ii + 15 < nbyte; ii += 16) {
    data = vec_xl(ii, in_b);
    for (kk = 0; kk < 8; kk++) {
      __vector uint16_t tmp;
      uint16_t* oui16;
      tmp = (__vector uint16_t) vec_bperm(data, masks[kk]);
      oui16 = (uint16_t*)&out_b[(ii + kk * nbyte) >> 3];
      *oui16 = tmp[4];
    }
  }
  count = bshuf_trans_bit_byte_remainder(in, out, size, elem_size,
                                         nbyte - nbyte % 16);
  return count;
}


/* Transpose bits within elements. */
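/* Three passes: transpose bytes within elements, then bits within bytes, and
 * finally reorder the resulting bit-rows in groups of eight
 * (bshuf_trans_bitrow_eight).  tmp_buf is used as scratch between passes. */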
int64_t bshuf_trans_bit_elem_altivec(void* in, void* out, const size_t size,
                                     const size_t elem_size, void* tmp_buf) {

  int64_t count;

  CHECK_MULT_EIGHT(size);

  count = bshuf_trans_byte_elem_altivec(in, out, size, elem_size, tmp_buf);
  CHECK_ERR(count);
  // bshuf_trans_bit_byte_altivec / bitshuffle1_altivec
  count = bshuf_trans_bit_byte_altivec(out, tmp_buf, size, elem_size);
  CHECK_ERR(count);
  count = bshuf_trans_bitrow_eight(tmp_buf, out, size, elem_size);
  return count;
}

/* For data organized into a row for each bit (8 * elem_size rows), transpose
 * the bytes. */
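/* Two vectorized paths: elem_size == 1 works on 8 rows at a time, even
 * elem_size on 16 rows; odd elem_size > 1 falls back to the scalar version.
 * Columns are processed 16 bytes at a time, with a scalar loop copying the
 * tail of each row group. */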
int64_t bshuf_trans_byte_bitrow_altivec(void* in, void* out, const size_t size,
                                        const size_t elem_size) {
  static const __vector uint8_t epi8_low = (const __vector uint8_t) {
    0x00, 0x10, 0x01, 0x11, 0x02, 0x12, 0x03, 0x13,
    0x04, 0x14, 0x05, 0x15, 0x06, 0x16, 0x07, 0x17};
  static const __vector uint8_t epi8_hi = (const __vector uint8_t) {
    0x08, 0x18, 0x09, 0x19, 0x0a, 0x1a, 0x0b, 0x1b,
    0x0c, 0x1c, 0x0d, 0x1d, 0x0e, 0x1e, 0x0f, 0x1f};
  static const __vector uint8_t epi16_low = (const __vector uint8_t) {
    0x00, 0x01, 0x10, 0x11, 0x02, 0x03, 0x12, 0x13,
    0x04, 0x05, 0x14, 0x15, 0x06, 0x07, 0x16, 0x17};
  static const __vector uint8_t epi16_hi = (const __vector uint8_t) {
    0x08, 0x09, 0x18, 0x19, 0x0a, 0x0b, 0x1a, 0x1b,
    0x0c, 0x0d, 0x1c, 0x1d, 0x0e, 0x0f, 0x1e, 0x1f};
  static const __vector uint8_t epi32_low = (const __vector uint8_t) {
    0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
    0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17};
  static const __vector uint8_t epi32_hi = (const __vector uint8_t) {
    0x08, 0x09, 0x0a, 0x0b, 0x18, 0x19, 0x1a, 0x1b,
    0x0c, 0x0d, 0x0e, 0x0f, 0x1c, 0x1d, 0x1e, 0x1f};
  static const __vector uint8_t epi64_low = (const __vector uint8_t) {
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
    0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17};
  static const __vector uint8_t epi64_hi = (const __vector uint8_t) {
    0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};

  const uint8_t* in_b = (const uint8_t*)in;
  uint8_t* out_b = (uint8_t*)out;
  size_t nrows = 8 * elem_size;
  size_t nbyte_row = size / 8;
  __vector uint8_t xmm0[16], xmm1[16];

  CHECK_MULT_EIGHT(size);

  // The vectorized code below only handles elem_size values that are even or equal to 1
  if ((elem_size > 1) && (elem_size % 2)) {
    return bshuf_trans_byte_bitrow_scal(in, out, size, elem_size);
  }

  int nvectors = (elem_size == 1) ? 8 : 16;
  for (size_t ii = 0; ii + (nvectors - 1) < nrows; ii += nvectors) {
    for (size_t jj = 0; jj + 15 < nbyte_row; jj += 16) {  // vectors of 16 elements

      if (elem_size == 1) {
        for (int k = 0; k < 8; k++) {
          xmm0[k] = vec_xl((ii + k) * nbyte_row + jj, in_b);
        }

        xmm1[0] = vec_perm(xmm0[0], xmm0[1], epi8_low);
        xmm1[1] = vec_perm(xmm0[2], xmm0[3], epi8_low);
        xmm1[2] = vec_perm(xmm0[4], xmm0[5], epi8_low);
        xmm1[3] = vec_perm(xmm0[6], xmm0[7], epi8_low);
        xmm1[4] = vec_perm(xmm0[0], xmm0[1], epi8_hi);
        xmm1[5] = vec_perm(xmm0[2], xmm0[3], epi8_hi);
        xmm1[6] = vec_perm(xmm0[4], xmm0[5], epi8_hi);
        xmm1[7] = vec_perm(xmm0[6], xmm0[7], epi8_hi);

        xmm0[0] = vec_perm(xmm1[0], xmm1[1], epi16_low);
        xmm0[1] = vec_perm(xmm1[2], xmm1[3], epi16_low);
        xmm0[2] = vec_perm(xmm1[0], xmm1[1], epi16_hi);
        xmm0[3] = vec_perm(xmm1[2], xmm1[3], epi16_hi);
        xmm0[4] = vec_perm(xmm1[4], xmm1[5], epi16_low);
        xmm0[5] = vec_perm(xmm1[6], xmm1[7], epi16_low);
        xmm0[6] = vec_perm(xmm1[4], xmm1[5], epi16_hi);
        xmm0[7] = vec_perm(xmm1[6], xmm1[7], epi16_hi);

        xmm1[0] = vec_perm(xmm0[0], xmm0[1], epi32_low);
        xmm1[1] = vec_perm(xmm0[0], xmm0[1], epi32_hi);
        xmm1[2] = vec_perm(xmm0[2], xmm0[3], epi32_low);
        xmm1[3] = vec_perm(xmm0[2], xmm0[3], epi32_hi);
        xmm1[4] = vec_perm(xmm0[4], xmm0[5], epi32_low);
        xmm1[5] = vec_perm(xmm0[4], xmm0[5], epi32_hi);
        xmm1[6] = vec_perm(xmm0[6], xmm0[7], epi32_low);
        xmm1[7] = vec_perm(xmm0[6], xmm0[7], epi32_hi);

        for (int k = 0; k < 8; k++) {
          vec_xst(xmm1[k], (jj + k * 2) * nrows + ii, out_b);
        }

        continue;
      }

      for (int k = 0; k < 16; k++) {
        xmm0[k] = vec_xl((ii + k) * nbyte_row + jj, in_b);
      }

      for (int k = 0; k < 16; k += 8) {
        xmm1[k + 0] = vec_perm(xmm0[k + 0], xmm0[k + 1], epi8_low);
        xmm1[k + 1] = vec_perm(xmm0[k + 2], xmm0[k + 3], epi8_low);
        xmm1[k + 2] = vec_perm(xmm0[k + 4], xmm0[k + 5], epi8_low);
        xmm1[k + 3] = vec_perm(xmm0[k + 6], xmm0[k + 7], epi8_low);
        xmm1[k + 4] = vec_perm(xmm0[k + 0], xmm0[k + 1], epi8_hi);
        xmm1[k + 5] = vec_perm(xmm0[k + 2], xmm0[k + 3], epi8_hi);
        xmm1[k + 6] = vec_perm(xmm0[k + 4], xmm0[k + 5], epi8_hi);
        xmm1[k + 7] = vec_perm(xmm0[k + 6], xmm0[k + 7], epi8_hi);
      }

      for (int k = 0; k < 16; k += 8) {
        xmm0[k + 0] = vec_perm(xmm1[k + 0], xmm1[k + 1], epi16_low);
        xmm0[k + 1] = vec_perm(xmm1[k + 2], xmm1[k + 3], epi16_low);
        xmm0[k + 2] = vec_perm(xmm1[k + 0], xmm1[k + 1], epi16_hi);
        xmm0[k + 3] = vec_perm(xmm1[k + 2], xmm1[k + 3], epi16_hi);
        xmm0[k + 4] = vec_perm(xmm1[k + 4], xmm1[k + 5], epi16_low);
        xmm0[k + 5] = vec_perm(xmm1[k + 6], xmm1[k + 7], epi16_low);
        xmm0[k + 6] = vec_perm(xmm1[k + 4], xmm1[k + 5], epi16_hi);
        xmm0[k + 7] = vec_perm(xmm1[k + 6], xmm1[k + 7], epi16_hi);
      }

      for (int k = 0; k < 16; k += 8) {
        xmm1[k + 0] = vec_perm(xmm0[k + 0], xmm0[k + 1], epi32_low);
        xmm1[k + 1] = vec_perm(xmm0[k + 0], xmm0[k + 1], epi32_hi);
        xmm1[k + 2] = vec_perm(xmm0[k + 2], xmm0[k + 3], epi32_low);
        xmm1[k + 3] = vec_perm(xmm0[k + 2], xmm0[k + 3], epi32_hi);
        xmm1[k + 4] = vec_perm(xmm0[k + 4], xmm0[k + 5], epi32_low);
        xmm1[k + 5] = vec_perm(xmm0[k + 4], xmm0[k + 5], epi32_hi);
        xmm1[k + 6] = vec_perm(xmm0[k + 6], xmm0[k + 7], epi32_low);
        xmm1[k + 7] = vec_perm(xmm0[k + 6], xmm0[k + 7], epi32_hi);
      }

      for (int k = 0; k < 8; k += 4) {
        xmm0[k * 2 + 0] = vec_perm(xmm1[k + 0], xmm1[k + 8], epi64_low);
        xmm0[k * 2 + 1] = vec_perm(xmm1[k + 0], xmm1[k + 8], epi64_hi);
        xmm0[k * 2 + 2] = vec_perm(xmm1[k + 1], xmm1[k + 9], epi64_low);
        xmm0[k * 2 + 3] = vec_perm(xmm1[k + 1], xmm1[k + 9], epi64_hi);
        xmm0[k * 2 + 4] = vec_perm(xmm1[k + 2], xmm1[k + 10], epi64_low);
        xmm0[k * 2 + 5] = vec_perm(xmm1[k + 2], xmm1[k + 10], epi64_hi);
        xmm0[k * 2 + 6] = vec_perm(xmm1[k + 3], xmm1[k + 11], epi64_low);
        xmm0[k * 2 + 7] = vec_perm(xmm1[k + 3], xmm1[k + 11], epi64_hi);
      }

      for (int k = 0; k < 16; k++) {
        vec_xst(xmm0[k], (jj + k) * nrows + ii, out_b);
      }

    }

    // Copy the remainder
    for (size_t jj = nbyte_row - nbyte_row % 16; jj < nbyte_row; jj++) {
      for (int k = 0; k < nvectors; k++) {
        out_b[jj * nrows + ii + k] = in_b[(ii + k) * nbyte_row + jj];
      }
    }

  }

  return size * elem_size;
}


/* Shuffle bits within the bytes of eight element blocks. */
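/* Within each block of 8 * elem_size bytes, vec_bperm() with mask kk gathers
 * bit kk of 16 consecutive bytes into one 16-bit word, written at offset
 * jj / 8 + kk * elem_size inside the block.  Odd elem_size is delegated to
 * the scalar routine. */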
int64_t bshuf_shuffle_bit_eightelem_altivec(void* in, void* out, const size_t size,
                                            const size_t elem_size) {
  /*  With a bit of care, this could be written such that it is */
  /*  safe to call with in == out. */
  const uint8_t* in_b = (const uint8_t*)in;
  uint8_t* out_b = (uint8_t*)out;
  size_t nbyte = elem_size * size;
  __vector uint8_t masks[8], data;

  CHECK_MULT_EIGHT(size);

  // Generate all 8 needed masks
  for (int kk = 0; kk < 8; kk++){
    masks[kk] = make_bitperm_mask(1, kk);
  }

  if (elem_size % 2) {
    bshuf_shuffle_bit_eightelem_scal(in, out, size, elem_size);
  } else {
    for (size_t ii = 0; ii + 8 * elem_size - 1 < nbyte;
         ii += 8 * elem_size) {
      for (size_t jj = 0; jj + 15 < 8 * elem_size; jj += 16) {
        data = vec_xl(ii + jj, in_b);
        for (size_t kk = 0; kk < 8; kk++) {
          __vector uint16_t tmp;
          uint16_t* oui16;
          tmp = (__vector uint16_t) vec_bperm(data, masks[kk]);
          oui16 = (uint16_t*)&out_b[ii + (jj >> 3) + kk * elem_size];
          *oui16 = tmp[4];
        }
      }
    }
  }
  return size * elem_size;
}


/* Untranspose bits within elements. */
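/* Inverse of bshuf_trans_bit_elem_altivec(): first regroup the bit-rows back
 * into byte order (bshuf_trans_byte_bitrow_altivec), then redistribute the
 * bits inside each block of eight elements. */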
int64_t bshuf_untrans_bit_elem_altivec(void* in, void* out, const size_t size,
                                       const size_t elem_size, void* tmp_buf) {

  int64_t count;

  CHECK_MULT_EIGHT(size);

  count = bshuf_trans_byte_bitrow_altivec(in, tmp_buf, size, elem_size);
  CHECK_ERR(count);
  count = bshuf_shuffle_bit_eightelem_altivec(tmp_buf, out, size, elem_size);

  return count;
}
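

#if 0
/* Illustrative round-trip, kept disabled like the debug helper above; it is
 * not part of the library.  The buffer sizing (size * elem_size bytes for the
 * scratch buffer) and the multiple-of-eight size are my assumptions, based on
 * the CHECK_MULT_EIGHT guards and the tmp_buf usage in this file. */
static void example_roundtrip(void) {
  enum { SIZE = 1024, ELEM_SIZE = 4 };
  static uint8_t src[SIZE * ELEM_SIZE];
  static uint8_t shuffled[SIZE * ELEM_SIZE];
  static uint8_t restored[SIZE * ELEM_SIZE];
  static uint8_t tmp[SIZE * ELEM_SIZE];

  /* Bitshuffle SIZE elements of ELEM_SIZE bytes each... */
  bshuf_trans_bit_elem_altivec(src, shuffled, SIZE, ELEM_SIZE, tmp);
  /* ...and undo it; restored should match src byte for byte. */
  bshuf_untrans_bit_elem_altivec(shuffled, restored, SIZE, ELEM_SIZE, tmp);
}
#endif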