/*********************************************************************
  Blosc - Blocked Shuffling and Compression Library

  Copyright (C) 2021  The Blosc Developers <blosc@blosc.org>
  https://blosc.org
  License: BSD 3-Clause (see LICENSE.txt)

  See LICENSE.txt for details about copyright and rights to use.
**********************************************************************/

/*********************************************************************
  Bitshuffle - Filter for improving compression of typed binary data.

  Author: Kiyoshi Masui <kiyo@physics.ubc.ca>
  Website: http://www.github.com/kiyo-masui/bitshuffle

  Note: Adapted for c-blosc by Francesc Alted.
        Altivec/VSX version by Jerome Kieffer.

  See LICENSES/BITSHUFFLE.txt file for details about copyright and
  rights to use.
**********************************************************************/


#include "bitshuffle-generic.h"
#include "bitshuffle-altivec.h"

/* Make sure ALTIVEC is available for the compilation target and compiler. */
#if !defined(__ALTIVEC__)
  #error ALTIVEC is not supported by the target architecture/platform and/or this compiler.
#endif
#include <altivec.h>
#include "transpose-altivec.h"

/* The following is useful for debugging purposes */
#if 0
#include <stdio.h>
#include <string.h>

static void helper_print(__vector uint8_t v, char* txt){
  printf("%s %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x\n", txt,
         v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7], v[8], v[9], v[10], v[11], v[12], v[13], v[14], v[15]);
}
#endif


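/* Build a byte mask whose first `offset` lanes are 0x00 and the rest 0xFF.
 * Presumably meant for merging a partial (tail) vector into previously
 * stored data; it appears to be unreferenced elsewhere in this file. */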
static inline __vector uint8_t gen_save_mask(size_t offset){
  __vector uint8_t mask;
  size_t k;
  for (k = 0; k < 16; k++)
    mask[k] = (k < offset) ? 0 : 0xFF;
  return mask;
}


// Build and return a bit-permutation mask for use with vec_bperm().
// Note: only type_size == 1 and type_size == 2 are handled; any other value
// leaves `result` unspecified (callers in this file only use type_size == 1).
static __vector uint8_t make_bitperm_mask(int type_size, int bit) {
  __vector uint8_t result;
  if (type_size == 1) {
    // data_type is 8 bits long
    for (int i = 0; i < 16; i++)
      result[i] = 8 * (15 - i) + (7 - bit);
  }
  else if (type_size == 2) {
    // data_type is 16 bits long
    for (int i = 0; i < 8; i++) {
      result[i] = 16 * i + 2 * bit;
      result[i + 8] = 16 * i + 2 * bit + 1;
    }
  }
  return result;
}
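
/* How these masks are consumed: each of the 16 bytes of a vec_bperm() mask
 * is a bit index into the 128-bit input, numbered 0..127 starting from the
 * most significant bit (PowerISA style).  The 16 selected bits are packed
 * into a single 16-bit lane of the result; the routines below read that
 * lane back as halfword element 4, which is where it lands with the
 * compiler/endianness combination this file targets.  Example: with
 * type_size == 1 and bit == 0 the mask is {127, 119, 111, ..., 7}, i.e. the
 * lowest bit of each of the 16 input bytes. */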


/* Routine optimized for bit-unshuffling a buffer for a type size of 1 byte.
 *
 * Strategy: read 8 vectors of 128 bits, hence 128 elements.
 * Transpose byte-wise so that 2 neighboring elements (x8) remain in each
 * vector: 24 operations.
 * Transpose bit-wise within a vector (8x8 bit-transposition): 64 operations.
 * Results are saved as shorts (2 bytes at a time).
 * Total cost: 8 vector reads, 88 transpositions, 64 writes,
 *             14 mask vectors, 16 work vectors.
 */
void
bitunshuffle1_altivec(void* _src, void* dest, const size_t size, const size_t elem_size) {
  size_t ii, jj, kk, vp;
  const uint8_t* in_b = (const uint8_t*)_src;
  uint16_t* out_s = (uint16_t*)dest;
  size_t nrows = 8 * elem_size;
  size_t nbyte_row = size / 8;

  // working vectors
  __vector uint8_t xmm0[8], xmm1[8], masks[8];
  // Vector masks
  static const __vector uint8_t lo01 = (const __vector uint8_t) {
    0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d,
    0x10, 0x11, 0x14, 0x15, 0x18, 0x19, 0x1c, 0x1d};
  static const __vector uint8_t hi01 = (const __vector uint8_t) {
    0x02, 0x03, 0x06, 0x07, 0x0a, 0x0b, 0x0e, 0x0f,
    0x12, 0x13, 0x16, 0x17, 0x1a, 0x1b, 0x1e, 0x1f};
  static const __vector uint8_t lo02 = (const __vector uint8_t) {
    0x00, 0x01, 0x08, 0x09, 0x10, 0x11, 0x18, 0x19,
    0x02, 0x03, 0x0a, 0x0b, 0x12, 0x13, 0x1a, 0x1b};
  static const __vector uint8_t hi02 = (const __vector uint8_t) {
    0x04, 0x05, 0x0c, 0x0d, 0x14, 0x15, 0x1c, 0x1d,
    0x06, 0x07, 0x0e, 0x0f, 0x16, 0x17, 0x1e, 0x1f};
  static const __vector uint8_t epi64_low = (const __vector uint8_t) {
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
    0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17};
  static const __vector uint8_t epi64_hi = (const __vector uint8_t) {
    0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};

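  /* Build the vec_bperm() masks: masks[kk] selects bit pair (2*kk, 2*kk + 1)
   * of each of the eight 16-bit groups, matching the pairs-of-elements layout
   * produced by the byte-wise transposition below.  The `127 - ...` converts
   * LSB-first bit positions into the MSB-first numbering vec_bperm() uses. */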
  for (kk = 0; kk < 8; kk++){
    __vector uint8_t msk;
    for (ii = 0; ii < 8; ii++){
      msk[ii] = 127 - (16 * ii + 2 * kk);
      msk[ii + 8] = 127 - (16 * ii + 2 * kk + 1);
    }
    //helper_print(msk, "Mask");
    masks[kk] = msk;
  }

  // read the data
  vp = 0;
  for (ii = 0; ii + 7 < nrows; ii += 8) {
    for (jj = 0; jj + 15 < nbyte_row; jj += 16) {

      for (kk = 0; kk < 8; kk++){
        xmm0[kk] = vec_xl((ii + kk) * nbyte_row + jj, in_b);
        //helper_print(xmm0[kk], "vector read");
      }

      // transpositions 0-1
      xmm1[0] = vec_perm(xmm0[0], xmm0[1], lo01);
      xmm1[1] = vec_perm(xmm0[0], xmm0[1], hi01);
      xmm1[2] = vec_perm(xmm0[2], xmm0[3], lo01);
      xmm1[3] = vec_perm(xmm0[2], xmm0[3], hi01);
      xmm1[4] = vec_perm(xmm0[4], xmm0[5], lo01);
      xmm1[5] = vec_perm(xmm0[4], xmm0[5], hi01);
      xmm1[6] = vec_perm(xmm0[6], xmm0[7], lo01);
      xmm1[7] = vec_perm(xmm0[6], xmm0[7], hi01);
      // transpositions 0-2
      xmm0[0] = vec_perm(xmm1[0], xmm1[2], lo02);
      xmm0[2] = vec_perm(xmm1[0], xmm1[2], hi02);
      xmm0[1] = vec_perm(xmm1[1], xmm1[3], lo02);
      xmm0[3] = vec_perm(xmm1[1], xmm1[3], hi02);
      xmm0[4] = vec_perm(xmm1[4], xmm1[6], lo02);
      xmm0[6] = vec_perm(xmm1[4], xmm1[6], hi02);
      xmm0[5] = vec_perm(xmm1[5], xmm1[7], lo02);
      xmm0[7] = vec_perm(xmm1[5], xmm1[7], hi02);
      // transpositions 0-4
      // Both operands must be the stage-2 results in xmm0: reading xmm1 here
      // would mix in stale stage-1 data (and xmm1[4] is overwritten below
      // before its last read).
      xmm1[0] = vec_perm(xmm0[0], xmm0[4], epi64_low);
      xmm1[2] = vec_perm(xmm0[0], xmm0[4], epi64_hi);
      xmm1[1] = vec_perm(xmm0[1], xmm0[5], epi64_low);
      xmm1[3] = vec_perm(xmm0[1], xmm0[5], epi64_hi);
      xmm1[4] = vec_perm(xmm0[2], xmm0[6], epi64_low);
      xmm1[6] = vec_perm(xmm0[2], xmm0[6], epi64_hi);
      xmm1[5] = vec_perm(xmm0[3], xmm0[7], epi64_low);
      xmm1[7] = vec_perm(xmm0[3], xmm0[7], epi64_hi);

      // At this stage each vector xmm1 contains the data from 16 adjacent bytes
      for (int ll = 0; ll < 8; ll++){
        __vector uint8_t xmm = xmm1[ll];
        //helper_print(xmm, "vector transposed");
        for (kk = 0; kk < 8; kk++) {
          __vector uint16_t tmp;
          tmp = (__vector uint16_t) vec_bperm(xmm, masks[kk]);
          //printf("%d %d\n", vp, tmp[4]);
          //helper_print((__vector uint8_t)tmp, "tmp");
          out_s[vp++] = tmp[4];
        }
      }
    }
  }
}


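/* The next four routines share one pattern: load enough 16-byte vectors to
 * cover 16 elements, transpose them in registers with the helpers from
 * transpose-altivec.h, and store each result vector into its byte plane
 * (plane j starts at out + j * size).  Tails shorter than 16 elements are
 * handled by the generic bshuf_trans_byte_elem_remainder(). */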
/* Transpose bytes within elements for 16 bit elements. */
int64_t bshuf_trans_byte_elem_16(void* in, void* out, const size_t size) {
  static const uint8_t bytesoftype = 2;
  __vector uint8_t xmm0[2];

  for (size_t i = 0; i + 15 < size; i += 16) {
    for (int j = 0; j < bytesoftype; j++)
      xmm0[j] = vec_xl(bytesoftype * i + 16 * j, (const uint8_t*)in);

    /* Transpose vectors */
    transpose2x16(xmm0);

    for (int j = 0; j < bytesoftype; j++)
      vec_xst(xmm0[j], i + j * size, (uint8_t*)out);
  }
  return bshuf_trans_byte_elem_remainder(in, out, size, bytesoftype,
                                         size - size % 16);
}


/* Transpose bytes within elements for 32 bit elements. */
int64_t bshuf_trans_byte_elem_32(void* in, void* out, const size_t size) {
  static const uint8_t bytesoftype = 4;
  __vector uint8_t xmm0[4];

  for (size_t i = 0; i + 15 < size; i += 16) {
    for (int j = 0; j < bytesoftype; j++)
      xmm0[j] = vec_xl(bytesoftype * i + 16 * j, (const uint8_t*)in);

    /* Transpose vectors */
    transpose4x16(xmm0);

    for (int j = 0; j < bytesoftype; j++)
      vec_xst(xmm0[j], i + j * size, (uint8_t*)out);
  }
  return bshuf_trans_byte_elem_remainder(in, out, size, bytesoftype,
                                         size - size % 16);
}


/* Transpose bytes within elements for 64 bit elements. */
int64_t bshuf_trans_byte_elem_64(void* in, void* out, const size_t size) {
  static const uint8_t bytesoftype = 8;
  __vector uint8_t xmm0[8];

  for (size_t i = 0; i + 15 < size; i += 16) {
    for (int j = 0; j < bytesoftype; j++)
      xmm0[j] = vec_xl(bytesoftype * i + 16 * j, (const uint8_t*)in);

    /* Transpose vectors */
    transpose8x16(xmm0);

    for (int j = 0; j < bytesoftype; j++)
      vec_xst(xmm0[j], i + j * size, (uint8_t*)out);
  }
  return bshuf_trans_byte_elem_remainder(in, out, size, bytesoftype,
                                         size - size % 16);
}


/* Transpose bytes within elements for 128 bit elements. */
int64_t bshuf_trans_byte_elem_128(void* in, void* out, const size_t size) {
  static const uint8_t bytesoftype = 16;
  __vector uint8_t xmm0[16];

  for (size_t i = 0; i + 15 < size; i += 16) {
    for (int j = 0; j < bytesoftype; j++)
      xmm0[j] = vec_xl(bytesoftype * i + 16 * j, (const uint8_t*)in);

    /* Transpose vectors */
    transpose16x16(xmm0);

    for (int j = 0; j < bytesoftype; j++)
      vec_xst(xmm0[j], i + j * size, (uint8_t*)out);
  }
  return bshuf_trans_byte_elem_remainder(in, out, size, bytesoftype,
                                         size - size % 16);
}


/* Memory copy with bshuf call signature. */
int64_t bshuf_copy(void* in, void* out, const size_t size,
                   const size_t elem_size) {

  char* in_b = (char*)in;
  char* out_b = (char*)out;

  memcpy(out_b, in_b, size * elem_size);
  return size * elem_size;
}


/* Transpose bytes within elements using the best AltiVec algorithm available. */
int64_t bshuf_trans_byte_elem_altivec(void* in, void* out, const size_t size,
                                      const size_t elem_size, void* tmp_buf) {

  int64_t count;

  /* Trivial cases: power of 2 bytes. */
  switch (elem_size) {
    case 1:
      count = bshuf_copy(in, out, size, elem_size);
      return count;
    case 2:
      count = bshuf_trans_byte_elem_16(in, out, size);
      return count;
    case 4:
      count = bshuf_trans_byte_elem_32(in, out, size);
      return count;
    case 8:
      count = bshuf_trans_byte_elem_64(in, out, size);
      return count;
    case 16:
      count = bshuf_trans_byte_elem_128(in, out, size);
      return count;
  }

  /* Worst case: odd number of bytes. Turns out that this is faster for */
  /* (odd * 2) byte elements as well (hence % 4). */
  if (elem_size % 4) {
    count = bshuf_trans_byte_elem_scal(in, out, size, elem_size);
    return count;
  }

  /* Multiple of power of 2: transpose hierarchically. */
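  /* Example, for elem_size == 12: treat each element as 3 int32_t chunks,
   * gather like-positioned chunks together (TRANS_ELEM_TYPE), run the 4-byte
   * kernel on the resulting stream of size * 3 "elements", then let
   * bshuf_trans_elem() restore the chunk ordering. */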
  {
    size_t nchunk_elem;

    if ((elem_size % 16) == 0) {
      nchunk_elem = elem_size / 16;
      TRANS_ELEM_TYPE(in, out, size, nchunk_elem, __vector uint8_t);
      count = bshuf_trans_byte_elem_128(out, tmp_buf,
                                        size * nchunk_elem);
      bshuf_trans_elem(tmp_buf, out, 16, nchunk_elem, size);
    } else if ((elem_size % 8) == 0) {
      nchunk_elem = elem_size / 8;
      TRANS_ELEM_TYPE(in, out, size, nchunk_elem, int64_t);
      count = bshuf_trans_byte_elem_64(out, tmp_buf,
                                       size * nchunk_elem);
      bshuf_trans_elem(tmp_buf, out, 8, nchunk_elem, size);
    } else if ((elem_size % 4) == 0) {
      nchunk_elem = elem_size / 4;
      TRANS_ELEM_TYPE(in, out, size, nchunk_elem, int32_t);
      count = bshuf_trans_byte_elem_32(out, tmp_buf,
                                       size * nchunk_elem);
      bshuf_trans_elem(tmp_buf, out, 4, nchunk_elem, size);
    } else {
      /* Not used since scalar algorithm is faster. */
      nchunk_elem = elem_size / 2;
      TRANS_ELEM_TYPE(in, out, size, nchunk_elem, int16_t);
      count = bshuf_trans_byte_elem_16(out, tmp_buf, size * nchunk_elem);
      bshuf_trans_elem(tmp_buf, out, 2, nchunk_elem, size);
    }

    return count;
  }
}

/* Transpose bits within bytes. */
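/* Output layout: 8 bit-plane rows of nbyte / 8 bytes each, where row kk
 * collects bit kk of every input byte; 16 input bytes are processed per
 * vec_bperm() call, yielding one 16-bit write per (ii, kk) pair. */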
int64_t bshuf_trans_bit_byte_altivec(void* in, void* out, const size_t size,
                                     const size_t elem_size) {

  const uint8_t* in_b = (const uint8_t*)in;
  uint8_t* out_b = (uint8_t*)out;
  int64_t count;
  size_t nbyte = elem_size * size;
  __vector uint8_t data, masks[8];
  size_t ii, kk;

  CHECK_MULT_EIGHT(nbyte);

  // Generate all 8 needed masks
  for (kk = 0; kk < 8; kk++){
    masks[kk] = make_bitperm_mask(1, kk);
  }

  for (ii = 0; ii + 15 < nbyte; ii += 16) {
    data = vec_xl(ii, in_b);
    for (kk = 0; kk < 8; kk++) {
      __vector uint16_t tmp;
      uint16_t* oui16;
      tmp = (__vector uint16_t) vec_bperm(data, masks[kk]);
      oui16 = (uint16_t*)&out_b[(ii + kk * nbyte) >> 3];
      *oui16 = tmp[4];
    }
  }
  count = bshuf_trans_bit_byte_remainder(in, out, size, elem_size,
                                         nbyte - nbyte % 16);
  return count;
}


/* Transpose bits within elements. */
int64_t bshuf_trans_bit_elem_altivec(void* in, void* out, const size_t size,
                                     const size_t elem_size, void* tmp_buf) {

  int64_t count;

  CHECK_MULT_EIGHT(size);

  count = bshuf_trans_byte_elem_altivec(in, out, size, elem_size, tmp_buf);
  CHECK_ERR(count);
  // bshuf_trans_bit_byte_altivec / bitshuffle1_altivec
  count = bshuf_trans_bit_byte_altivec(out, tmp_buf, size, elem_size);
  CHECK_ERR(count);
  count = bshuf_trans_bitrow_eight(tmp_buf, out, size, elem_size);
  return count;
}

/* For data organized into a row for each bit (8 * elem_size rows), transpose
 * the bytes. */
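/* Even elem_size values are handled with blocks of 16 vectors (8 vectors when
 * elem_size == 1); odd elem_size > 1 falls back to the scalar
 * bshuf_trans_byte_bitrow_scal(), since the in-register transposes below
 * need an even number of rows. */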
int64_t bshuf_trans_byte_bitrow_altivec(void* in, void* out, const size_t size,
                                        const size_t elem_size) {
  static const __vector uint8_t epi8_low = (const __vector uint8_t) {
    0x00, 0x10, 0x01, 0x11, 0x02, 0x12, 0x03, 0x13,
    0x04, 0x14, 0x05, 0x15, 0x06, 0x16, 0x07, 0x17};
  static const __vector uint8_t epi8_hi = (const __vector uint8_t) {
    0x08, 0x18, 0x09, 0x19, 0x0a, 0x1a, 0x0b, 0x1b,
    0x0c, 0x1c, 0x0d, 0x1d, 0x0e, 0x1e, 0x0f, 0x1f};
  static const __vector uint8_t epi16_low = (const __vector uint8_t) {
    0x00, 0x01, 0x10, 0x11, 0x02, 0x03, 0x12, 0x13,
    0x04, 0x05, 0x14, 0x15, 0x06, 0x07, 0x16, 0x17};
  static const __vector uint8_t epi16_hi = (const __vector uint8_t) {
    0x08, 0x09, 0x18, 0x19, 0x0a, 0x0b, 0x1a, 0x1b,
    0x0c, 0x0d, 0x1c, 0x1d, 0x0e, 0x0f, 0x1e, 0x1f};
  static const __vector uint8_t epi32_low = (const __vector uint8_t) {
    0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
    0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17};
  static const __vector uint8_t epi32_hi = (const __vector uint8_t) {
    0x08, 0x09, 0x0a, 0x0b, 0x18, 0x19, 0x1a, 0x1b,
    0x0c, 0x0d, 0x0e, 0x0f, 0x1c, 0x1d, 0x1e, 0x1f};
  static const __vector uint8_t epi64_low = (const __vector uint8_t) {
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
    0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17};
  static const __vector uint8_t epi64_hi = (const __vector uint8_t) {
    0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};

  const uint8_t* in_b = (const uint8_t*)in;
  uint8_t* out_b = (uint8_t*)out;
  size_t nrows = 8 * elem_size;
  size_t nbyte_row = size / 8;
  __vector uint8_t xmm0[16], xmm1[16];

  CHECK_MULT_EIGHT(size);

  // The optimized algorithm below only handles elem_size values that are
  // even (or exactly 1); odd elem_size > 1 takes the scalar fallback.
  if ((elem_size > 1) && (elem_size % 2)) {
    return bshuf_trans_byte_bitrow_scal(in, out, size, elem_size);
  }

  int nvectors = (elem_size == 1) ? 8 : 16;
  for (size_t ii = 0; ii + (nvectors - 1) < nrows; ii += nvectors) {
    for (size_t jj = 0; jj + 15 < nbyte_row; jj += 16) { // vectors of 16 elements

      if (elem_size == 1) {
        for (int k = 0; k < 8; k++) {
          xmm0[k] = vec_xl((ii + k) * nbyte_row + jj, in_b);
        }

        xmm1[0] = vec_perm(xmm0[0], xmm0[1], epi8_low);
        xmm1[1] = vec_perm(xmm0[2], xmm0[3], epi8_low);
        xmm1[2] = vec_perm(xmm0[4], xmm0[5], epi8_low);
        xmm1[3] = vec_perm(xmm0[6], xmm0[7], epi8_low);
        xmm1[4] = vec_perm(xmm0[0], xmm0[1], epi8_hi);
        xmm1[5] = vec_perm(xmm0[2], xmm0[3], epi8_hi);
        xmm1[6] = vec_perm(xmm0[4], xmm0[5], epi8_hi);
        xmm1[7] = vec_perm(xmm0[6], xmm0[7], epi8_hi);

        xmm0[0] = vec_perm(xmm1[0], xmm1[1], epi16_low);
        xmm0[1] = vec_perm(xmm1[2], xmm1[3], epi16_low);
        xmm0[2] = vec_perm(xmm1[0], xmm1[1], epi16_hi);
        xmm0[3] = vec_perm(xmm1[2], xmm1[3], epi16_hi);
        xmm0[4] = vec_perm(xmm1[4], xmm1[5], epi16_low);
        xmm0[5] = vec_perm(xmm1[6], xmm1[7], epi16_low);
        xmm0[6] = vec_perm(xmm1[4], xmm1[5], epi16_hi);
        xmm0[7] = vec_perm(xmm1[6], xmm1[7], epi16_hi);

        xmm1[0] = vec_perm(xmm0[0], xmm0[1], epi32_low);
        xmm1[1] = vec_perm(xmm0[0], xmm0[1], epi32_hi);
        xmm1[2] = vec_perm(xmm0[2], xmm0[3], epi32_low);
        xmm1[3] = vec_perm(xmm0[2], xmm0[3], epi32_hi);
        xmm1[4] = vec_perm(xmm0[4], xmm0[5], epi32_low);
        xmm1[5] = vec_perm(xmm0[4], xmm0[5], epi32_hi);
        xmm1[6] = vec_perm(xmm0[6], xmm0[7], epi32_low);
        xmm1[7] = vec_perm(xmm0[6], xmm0[7], epi32_hi);

        for (int k = 0; k < 8; k++) {
          vec_xst(xmm1[k], (jj + k * 2) * nrows + ii, out_b);
        }

        continue;
      }

      for (int k = 0; k < 16; k++) {
        xmm0[k] = vec_xl((ii + k) * nbyte_row + jj, in_b);
      }

      for (int k = 0; k < 16; k += 8) {
        xmm1[k + 0] = vec_perm(xmm0[k + 0], xmm0[k + 1], epi8_low);
        xmm1[k + 1] = vec_perm(xmm0[k + 2], xmm0[k + 3], epi8_low);
        xmm1[k + 2] = vec_perm(xmm0[k + 4], xmm0[k + 5], epi8_low);
        xmm1[k + 3] = vec_perm(xmm0[k + 6], xmm0[k + 7], epi8_low);
        xmm1[k + 4] = vec_perm(xmm0[k + 0], xmm0[k + 1], epi8_hi);
        xmm1[k + 5] = vec_perm(xmm0[k + 2], xmm0[k + 3], epi8_hi);
        xmm1[k + 6] = vec_perm(xmm0[k + 4], xmm0[k + 5], epi8_hi);
        xmm1[k + 7] = vec_perm(xmm0[k + 6], xmm0[k + 7], epi8_hi);
      }

      for (int k = 0; k < 16; k += 8) {
        xmm0[k + 0] = vec_perm(xmm1[k + 0], xmm1[k + 1], epi16_low);
        xmm0[k + 1] = vec_perm(xmm1[k + 2], xmm1[k + 3], epi16_low);
        xmm0[k + 2] = vec_perm(xmm1[k + 0], xmm1[k + 1], epi16_hi);
        xmm0[k + 3] = vec_perm(xmm1[k + 2], xmm1[k + 3], epi16_hi);
        xmm0[k + 4] = vec_perm(xmm1[k + 4], xmm1[k + 5], epi16_low);
        xmm0[k + 5] = vec_perm(xmm1[k + 6], xmm1[k + 7], epi16_low);
        xmm0[k + 6] = vec_perm(xmm1[k + 4], xmm1[k + 5], epi16_hi);
        xmm0[k + 7] = vec_perm(xmm1[k + 6], xmm1[k + 7], epi16_hi);
      }

      for (int k = 0; k < 16; k += 8) {
        xmm1[k + 0] = vec_perm(xmm0[k + 0], xmm0[k + 1], epi32_low);
        xmm1[k + 1] = vec_perm(xmm0[k + 0], xmm0[k + 1], epi32_hi);
        xmm1[k + 2] = vec_perm(xmm0[k + 2], xmm0[k + 3], epi32_low);
        xmm1[k + 3] = vec_perm(xmm0[k + 2], xmm0[k + 3], epi32_hi);
        xmm1[k + 4] = vec_perm(xmm0[k + 4], xmm0[k + 5], epi32_low);
        xmm1[k + 5] = vec_perm(xmm0[k + 4], xmm0[k + 5], epi32_hi);
        xmm1[k + 6] = vec_perm(xmm0[k + 6], xmm0[k + 7], epi32_low);
        xmm1[k + 7] = vec_perm(xmm0[k + 6], xmm0[k + 7], epi32_hi);
      }

      for (int k = 0; k < 8; k += 4) {
        xmm0[k * 2 + 0] = vec_perm(xmm1[k + 0], xmm1[k + 8], epi64_low);
        xmm0[k * 2 + 1] = vec_perm(xmm1[k + 0], xmm1[k + 8], epi64_hi);
        xmm0[k * 2 + 2] = vec_perm(xmm1[k + 1], xmm1[k + 9], epi64_low);
        xmm0[k * 2 + 3] = vec_perm(xmm1[k + 1], xmm1[k + 9], epi64_hi);
        xmm0[k * 2 + 4] = vec_perm(xmm1[k + 2], xmm1[k + 10], epi64_low);
        xmm0[k * 2 + 5] = vec_perm(xmm1[k + 2], xmm1[k + 10], epi64_hi);
        xmm0[k * 2 + 6] = vec_perm(xmm1[k + 3], xmm1[k + 11], epi64_low);
        xmm0[k * 2 + 7] = vec_perm(xmm1[k + 3], xmm1[k + 11], epi64_hi);
      }

      for (int k = 0; k < 16; k++) {
        vec_xst(xmm0[k], (jj + k) * nrows + ii, out_b);
      }

    }

    // Copy the remainder
    for (size_t jj = nbyte_row - nbyte_row % 16; jj < nbyte_row; jj++) {
      for (int k = 0; k < nvectors; k++) {
        out_b[jj * nrows + ii + k] = in_b[(ii + k) * nbyte_row + jj];
      }
    }

  }

  return size * elem_size;
}


/* Shuffle bits within the bytes of eight element blocks. */
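/* For each block of 8 elements (8 * elem_size bytes), every 16-byte chunk is
 * bit-transposed with vec_bperm() and the eight gathered halfwords are
 * stored at stride elem_size, so bit-plane kk lands kk * elem_size bytes
 * into the block.  Odd elem_size values take the scalar fallback. */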
int64_t bshuf_shuffle_bit_eightelem_altivec(void* in, void* out, const size_t size,
                                            const size_t elem_size) {
  /* With a bit of care, this could be written such that it is */
  /* in_buf = out_buf safe. */
  const uint8_t* in_b = (const uint8_t*)in;
  uint8_t* out_b = (uint8_t*)out;
  size_t nbyte = elem_size * size;
  __vector uint8_t masks[8], data;

  CHECK_MULT_EIGHT(size);

  // Generate all 8 needed masks
  for (int kk = 0; kk < 8; kk++){
    masks[kk] = make_bitperm_mask(1, kk);
  }

  if (elem_size % 2) {
    bshuf_shuffle_bit_eightelem_scal(in, out, size, elem_size);
  } else {
    for (size_t ii = 0; ii + 8 * elem_size - 1 < nbyte;
         ii += 8 * elem_size) {
      for (size_t jj = 0; jj + 15 < 8 * elem_size; jj += 16) {
        data = vec_xl(ii + jj, in_b);
        for (size_t kk = 0; kk < 8; kk++) {
          __vector uint16_t tmp;
          uint16_t* oui16;
          tmp = (__vector uint16_t) vec_bperm(data, masks[kk]);
          oui16 = (uint16_t*)&out_b[ii + (jj >> 3) + kk * elem_size];
          *oui16 = tmp[4];
        }
      }
    }
  }
  return size * elem_size;
}


/* Untranspose bits within elements. */
int64_t bshuf_untrans_bit_elem_altivec(void* in, void* out, const size_t size,
                                       const size_t elem_size, void* tmp_buf) {

  int64_t count;

  CHECK_MULT_EIGHT(size);

  count = bshuf_trans_byte_bitrow_altivec(in, tmp_buf, size, elem_size);
  CHECK_ERR(count);
  count = bshuf_shuffle_bit_eightelem_altivec(tmp_buf, out, size, elem_size);

  return count;
}

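
#if 0
/* Minimal round-trip sketch for the two entry points above (not built; the
 * buffer sizes follow the call signatures in this file, everything else is
 * an illustrative assumption).  `size` is the number of elements and must
 * be a multiple of 8; `tmp_buf` needs size * elem_size scratch bytes.
 * Error handling is omitted for brevity. */
#include <stdlib.h>
#include <string.h>

static int bitshuffle_roundtrip_demo(void) {
  const size_t size = 1024, elem_size = 4;
  uint8_t* src = malloc(size * elem_size);
  uint8_t* shuffled = malloc(size * elem_size);
  uint8_t* restored = malloc(size * elem_size);
  uint8_t* tmp = malloc(size * elem_size);
  int ok;

  /* Fill the source with a simple pseudo-random pattern. */
  for (size_t i = 0; i < size * elem_size; i++)
    src[i] = (uint8_t)((i * 2654435761u) >> 24);

  /* Shuffle, then unshuffle; the result should match the input. */
  bshuf_trans_bit_elem_altivec(src, shuffled, size, elem_size, tmp);
  bshuf_untrans_bit_elem_altivec(shuffled, restored, size, elem_size, tmp);

  ok = memcmp(src, restored, size * elem_size) == 0;
  free(src); free(shuffled); free(restored); free(tmp);
  return ok;
}
#endif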