1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements. See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership. The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License. You may obtain a copy of the License at
8 //
9 // http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied. See the License for the
15 // specific language governing permissions and limitations
16 // under the License.
17 //
18 // Automatically generated file; DO NOT EDIT.
19
20 #pragma once
21
22 #include <stdint.h>
23 #include <string.h>
24
25 #ifdef _MSC_VER
26 #include <intrin.h>
27 #else
28 #include <immintrin.h>
29 #endif
30
31 namespace arrow {
32 namespace internal {
33
// Zero-bit "unpack": every one of the 32 output values is 0 and no input word
// is consumed, so `in` is returned unchanged.
inline static const uint32_t* unpack0_32_avx512(const uint32_t* in, uint32_t* out) {
  memset(out, 0, sizeof(uint32_t) * 32);
  return in;
}
40
unpack1_32_avx512(const uint32_t * in,uint32_t * out)41 inline static const uint32_t* unpack1_32_avx512(const uint32_t* in, uint32_t* out) {
42 uint32_t mask = 0x1;
43 __m512i reg_shifts, reg_inls, reg_masks;
44 __m512i results;
45
46 reg_masks = _mm512_set1_epi32(mask);
47
48 // shift the first 16 outs
49 reg_shifts = _mm512_set_epi32(15, 14, 13, 12,
50 11, 10, 9, 8,
51 7, 6, 5, 4,
52 3, 2, 1, 0);
53 reg_inls = _mm512_set_epi32(in[0], in[0],
54 in[0], in[0],
55 in[0], in[0],
56 in[0], in[0],
57 in[0], in[0],
58 in[0], in[0],
59 in[0], in[0],
60 in[0], in[0]);
61 results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
62 _mm512_storeu_si512(out, results);
63 out += 16;
64
65 // shift the second 16 outs
66 reg_shifts = _mm512_set_epi32(31, 30, 29, 28,
67 27, 26, 25, 24,
68 23, 22, 21, 20,
69 19, 18, 17, 16);
70 reg_inls = _mm512_set_epi32(in[0], in[0],
71 in[0], in[0],
72 in[0], in[0],
73 in[0], in[0],
74 in[0], in[0],
75 in[0], in[0],
76 in[0], in[0],
77 in[0], in[0]);
78 results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
79 _mm512_storeu_si512(out, results);
80 out += 16;
81
82 in += 1;
83
84 return in;
85 }
86
unpack2_32_avx512(const uint32_t * in,uint32_t * out)87 inline static const uint32_t* unpack2_32_avx512(const uint32_t* in, uint32_t* out) {
88 uint32_t mask = 0x3;
89 __m512i reg_shifts, reg_inls, reg_masks;
90 __m512i results;
91
92 reg_masks = _mm512_set1_epi32(mask);
93
94 // shift the first 16 outs
95 reg_shifts = _mm512_set_epi32(30, 28, 26, 24,
96 22, 20, 18, 16,
97 14, 12, 10, 8,
98 6, 4, 2, 0);
99 reg_inls = _mm512_set_epi32(in[0], in[0],
100 in[0], in[0],
101 in[0], in[0],
102 in[0], in[0],
103 in[0], in[0],
104 in[0], in[0],
105 in[0], in[0],
106 in[0], in[0]);
107 results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
108 _mm512_storeu_si512(out, results);
109 out += 16;
110
111 // shift the second 16 outs
112 reg_shifts = _mm512_set_epi32(30, 28, 26, 24,
113 22, 20, 18, 16,
114 14, 12, 10, 8,
115 6, 4, 2, 0);
116 reg_inls = _mm512_set_epi32(in[1], in[1],
117 in[1], in[1],
118 in[1], in[1],
119 in[1], in[1],
120 in[1], in[1],
121 in[1], in[1],
122 in[1], in[1],
123 in[1], in[1]);
124 results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
125 _mm512_storeu_si512(out, results);
126 out += 16;
127
128 in += 2;
129
130 return in;
131 }
132
unpack3_32_avx512(const uint32_t * in,uint32_t * out)133 inline static const uint32_t* unpack3_32_avx512(const uint32_t* in, uint32_t* out) {
134 uint32_t mask = 0x7;
135 __m512i reg_shifts, reg_inls, reg_masks;
136 __m512i results;
137
138 reg_masks = _mm512_set1_epi32(mask);
139
140 // shift the first 16 outs
141 reg_shifts = _mm512_set_epi32(13, 10, 7, 4,
142 1, 0, 27, 24,
143 21, 18, 15, 12,
144 9, 6, 3, 0);
145 reg_inls = _mm512_set_epi32(in[1], in[1],
146 in[1], in[1],
147 in[1], in[0] >> 30 | in[1] << 2,
148 in[0], in[0],
149 in[0], in[0],
150 in[0], in[0],
151 in[0], in[0],
152 in[0], in[0]);
153 results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
154 _mm512_storeu_si512(out, results);
155 out += 16;
156
157 // shift the second 16 outs
158 reg_shifts = _mm512_set_epi32(29, 26, 23, 20,
159 17, 14, 11, 8,
160 5, 2, 0, 28,
161 25, 22, 19, 16);
162 reg_inls = _mm512_set_epi32(in[2], in[2],
163 in[2], in[2],
164 in[2], in[2],
165 in[2], in[2],
166 in[2], in[2],
167 in[1] >> 31 | in[2] << 1, in[1],
168 in[1], in[1],
169 in[1], in[1]);
170 results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
171 _mm512_storeu_si512(out, results);
172 out += 16;
173
174 in += 3;
175
176 return in;
177 }
178
unpack4_32_avx512(const uint32_t * in,uint32_t * out)179 inline static const uint32_t* unpack4_32_avx512(const uint32_t* in, uint32_t* out) {
180 uint32_t mask = 0xf;
181 __m512i reg_shifts, reg_inls, reg_masks;
182 __m512i results;
183
184 reg_masks = _mm512_set1_epi32(mask);
185
186 // shift the first 16 outs
187 reg_shifts = _mm512_set_epi32(28, 24, 20, 16,
188 12, 8, 4, 0,
189 28, 24, 20, 16,
190 12, 8, 4, 0);
191 reg_inls = _mm512_set_epi32(in[1], in[1],
192 in[1], in[1],
193 in[1], in[1],
194 in[1], in[1],
195 in[0], in[0],
196 in[0], in[0],
197 in[0], in[0],
198 in[0], in[0]);
199 results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
200 _mm512_storeu_si512(out, results);
201 out += 16;
202
203 // shift the second 16 outs
204 reg_shifts = _mm512_set_epi32(28, 24, 20, 16,
205 12, 8, 4, 0,
206 28, 24, 20, 16,
207 12, 8, 4, 0);
208 reg_inls = _mm512_set_epi32(in[3], in[3],
209 in[3], in[3],
210 in[3], in[3],
211 in[3], in[3],
212 in[2], in[2],
213 in[2], in[2],
214 in[2], in[2],
215 in[2], in[2]);
216 results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
217 _mm512_storeu_si512(out, results);
218 out += 16;
219
220 in += 4;
221
222 return in;
223 }
224
unpack5_32_avx512(const uint32_t * in,uint32_t * out)225 inline static const uint32_t* unpack5_32_avx512(const uint32_t* in, uint32_t* out) {
226 uint32_t mask = 0x1f;
227 __m512i reg_shifts, reg_inls, reg_masks;
228 __m512i results;
229
230 reg_masks = _mm512_set1_epi32(mask);
231
232 // shift the first 16 outs
233 reg_shifts = _mm512_set_epi32(11, 6, 1, 0,
234 23, 18, 13, 8,
235 3, 0, 25, 20,
236 15, 10, 5, 0);
237 reg_inls = _mm512_set_epi32(in[2], in[2],
238 in[2], in[1] >> 28 | in[2] << 4,
239 in[1], in[1],
240 in[1], in[1],
241 in[1], in[0] >> 30 | in[1] << 2,
242 in[0], in[0],
243 in[0], in[0],
244 in[0], in[0]);
245 results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
246 _mm512_storeu_si512(out, results);
247 out += 16;
248
249 // shift the second 16 outs
250 reg_shifts = _mm512_set_epi32(27, 22, 17, 12,
251 7, 2, 0, 24,
252 19, 14, 9, 4,
253 0, 26, 21, 16);
254 reg_inls = _mm512_set_epi32(in[4], in[4],
255 in[4], in[4],
256 in[4], in[4],
257 in[3] >> 29 | in[4] << 3, in[3],
258 in[3], in[3],
259 in[3], in[3],
260 in[2] >> 31 | in[3] << 1, in[2],
261 in[2], in[2]);
262 results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
263 _mm512_storeu_si512(out, results);
264 out += 16;
265
266 in += 5;
267
268 return in;
269 }
270
unpack6_32_avx512(const uint32_t * in,uint32_t * out)271 inline static const uint32_t* unpack6_32_avx512(const uint32_t* in, uint32_t* out) {
272 uint32_t mask = 0x3f;
273 __m512i reg_shifts, reg_inls, reg_masks;
274 __m512i results;
275
276 reg_masks = _mm512_set1_epi32(mask);
277
278 // shift the first 16 outs
279 reg_shifts = _mm512_set_epi32(26, 20, 14, 8,
280 2, 0, 22, 16,
281 10, 4, 0, 24,
282 18, 12, 6, 0);
283 reg_inls = _mm512_set_epi32(in[2], in[2],
284 in[2], in[2],
285 in[2], in[1] >> 28 | in[2] << 4,
286 in[1], in[1],
287 in[1], in[1],
288 in[0] >> 30 | in[1] << 2, in[0],
289 in[0], in[0],
290 in[0], in[0]);
291 results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
292 _mm512_storeu_si512(out, results);
293 out += 16;
294
295 // shift the second 16 outs
296 reg_shifts = _mm512_set_epi32(26, 20, 14, 8,
297 2, 0, 22, 16,
298 10, 4, 0, 24,
299 18, 12, 6, 0);
300 reg_inls = _mm512_set_epi32(in[5], in[5],
301 in[5], in[5],
302 in[5], in[4] >> 28 | in[5] << 4,
303 in[4], in[4],
304 in[4], in[4],
305 in[3] >> 30 | in[4] << 2, in[3],
306 in[3], in[3],
307 in[3], in[3]);
308 results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
309 _mm512_storeu_si512(out, results);
310 out += 16;
311
312 in += 6;
313
314 return in;
315 }
316
unpack7_32_avx512(const uint32_t * in,uint32_t * out)317 inline static const uint32_t* unpack7_32_avx512(const uint32_t* in, uint32_t* out) {
318 uint32_t mask = 0x7f;
319 __m512i reg_shifts, reg_inls, reg_masks;
320 __m512i results;
321
322 reg_masks = _mm512_set1_epi32(mask);
323
324 // shift the first 16 outs
325 reg_shifts = _mm512_set_epi32(9, 2, 0, 20,
326 13, 6, 0, 24,
327 17, 10, 3, 0,
328 21, 14, 7, 0);
329 reg_inls = _mm512_set_epi32(in[3], in[3],
330 in[2] >> 27 | in[3] << 5, in[2],
331 in[2], in[2],
332 in[1] >> 31 | in[2] << 1, in[1],
333 in[1], in[1],
334 in[1], in[0] >> 28 | in[1] << 4,
335 in[0], in[0],
336 in[0], in[0]);
337 results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
338 _mm512_storeu_si512(out, results);
339 out += 16;
340
341 // shift the second 16 outs
342 reg_shifts = _mm512_set_epi32(25, 18, 11, 4,
343 0, 22, 15, 8,
344 1, 0, 19, 12,
345 5, 0, 23, 16);
346 reg_inls = _mm512_set_epi32(in[6], in[6],
347 in[6], in[6],
348 in[5] >> 29 | in[6] << 3, in[5],
349 in[5], in[5],
350 in[5], in[4] >> 26 | in[5] << 6,
351 in[4], in[4],
352 in[4], in[3] >> 30 | in[4] << 2,
353 in[3], in[3]);
354 results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
355 _mm512_storeu_si512(out, results);
356 out += 16;
357
358 in += 7;
359
360 return in;
361 }
362
unpack8_32_avx512(const uint32_t * in,uint32_t * out)363 inline static const uint32_t* unpack8_32_avx512(const uint32_t* in, uint32_t* out) {
364 uint32_t mask = 0xff;
365 __m512i reg_shifts, reg_inls, reg_masks;
366 __m512i results;
367
368 reg_masks = _mm512_set1_epi32(mask);
369
370 // shift the first 16 outs
371 reg_shifts = _mm512_set_epi32(24, 16, 8, 0,
372 24, 16, 8, 0,
373 24, 16, 8, 0,
374 24, 16, 8, 0);
375 reg_inls = _mm512_set_epi32(in[3], in[3],
376 in[3], in[3],
377 in[2], in[2],
378 in[2], in[2],
379 in[1], in[1],
380 in[1], in[1],
381 in[0], in[0],
382 in[0], in[0]);
383 results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
384 _mm512_storeu_si512(out, results);
385 out += 16;
386
387 // shift the second 16 outs
388 reg_shifts = _mm512_set_epi32(24, 16, 8, 0,
389 24, 16, 8, 0,
390 24, 16, 8, 0,
391 24, 16, 8, 0);
392 reg_inls = _mm512_set_epi32(in[7], in[7],
393 in[7], in[7],
394 in[6], in[6],
395 in[6], in[6],
396 in[5], in[5],
397 in[5], in[5],
398 in[4], in[4],
399 in[4], in[4]);
400 results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
401 _mm512_storeu_si512(out, results);
402 out += 16;
403
404 in += 8;
405
406 return in;
407 }
408
unpack9_32_avx512(const uint32_t * in,uint32_t * out)409 inline static const uint32_t* unpack9_32_avx512(const uint32_t* in, uint32_t* out) {
410 uint32_t mask = 0x1ff;
411 __m512i reg_shifts, reg_inls, reg_masks;
412 __m512i results;
413
414 reg_masks = _mm512_set1_epi32(mask);
415
416 // shift the first 16 outs
417 reg_shifts = _mm512_set_epi32(7, 0, 21, 12,
418 3, 0, 17, 8,
419 0, 22, 13, 4,
420 0, 18, 9, 0);
421 reg_inls = _mm512_set_epi32(in[4], in[3] >> 30 | in[4] << 2,
422 in[3], in[3],
423 in[3], in[2] >> 26 | in[3] << 6,
424 in[2], in[2],
425 in[1] >> 31 | in[2] << 1, in[1],
426 in[1], in[1],
427 in[0] >> 27 | in[1] << 5, in[0],
428 in[0], in[0]);
429 results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
430 _mm512_storeu_si512(out, results);
431 out += 16;
432
433 // shift the second 16 outs
434 reg_shifts = _mm512_set_epi32(23, 14, 5, 0,
435 19, 10, 1, 0,
436 15, 6, 0, 20,
437 11, 2, 0, 16);
438 reg_inls = _mm512_set_epi32(in[8], in[8],
439 in[8], in[7] >> 28 | in[8] << 4,
440 in[7], in[7],
441 in[7], in[6] >> 24 | in[7] << 8,
442 in[6], in[6],
443 in[5] >> 29 | in[6] << 3, in[5],
444 in[5], in[5],
445 in[4] >> 25 | in[5] << 7, in[4]);
446 results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
447 _mm512_storeu_si512(out, results);
448 out += 16;
449
450 in += 9;
451
452 return in;
453 }
454
unpack10_32_avx512(const uint32_t * in,uint32_t * out)455 inline static const uint32_t* unpack10_32_avx512(const uint32_t* in, uint32_t* out) {
456 uint32_t mask = 0x3ff;
457 __m512i reg_shifts, reg_inls, reg_masks;
458 __m512i results;
459
460 reg_masks = _mm512_set1_epi32(mask);
461
462 // shift the first 16 outs
463 reg_shifts = _mm512_set_epi32(22, 12, 2, 0,
464 14, 4, 0, 16,
465 6, 0, 18, 8,
466 0, 20, 10, 0);
467 reg_inls = _mm512_set_epi32(in[4], in[4],
468 in[4], in[3] >> 24 | in[4] << 8,
469 in[3], in[3],
470 in[2] >> 26 | in[3] << 6, in[2],
471 in[2], in[1] >> 28 | in[2] << 4,
472 in[1], in[1],
473 in[0] >> 30 | in[1] << 2, in[0],
474 in[0], in[0]);
475 results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
476 _mm512_storeu_si512(out, results);
477 out += 16;
478
479 // shift the second 16 outs
480 reg_shifts = _mm512_set_epi32(22, 12, 2, 0,
481 14, 4, 0, 16,
482 6, 0, 18, 8,
483 0, 20, 10, 0);
484 reg_inls = _mm512_set_epi32(in[9], in[9],
485 in[9], in[8] >> 24 | in[9] << 8,
486 in[8], in[8],
487 in[7] >> 26 | in[8] << 6, in[7],
488 in[7], in[6] >> 28 | in[7] << 4,
489 in[6], in[6],
490 in[5] >> 30 | in[6] << 2, in[5],
491 in[5], in[5]);
492 results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
493 _mm512_storeu_si512(out, results);
494 out += 16;
495
496 in += 10;
497
498 return in;
499 }
500
unpack11_32_avx512(const uint32_t * in,uint32_t * out)501 inline static const uint32_t* unpack11_32_avx512(const uint32_t* in, uint32_t* out) {
502 uint32_t mask = 0x7ff;
503 __m512i reg_shifts, reg_inls, reg_masks;
504 __m512i results;
505
506 reg_masks = _mm512_set1_epi32(mask);
507
508 // shift the first 16 outs
509 reg_shifts = _mm512_set_epi32(5, 0, 15, 4,
510 0, 14, 3, 0,
511 13, 2, 0, 12,
512 1, 0, 11, 0);
513 reg_inls = _mm512_set_epi32(in[5], in[4] >> 26 | in[5] << 6,
514 in[4], in[4],
515 in[3] >> 25 | in[4] << 7, in[3],
516 in[3], in[2] >> 24 | in[3] << 8,
517 in[2], in[2],
518 in[1] >> 23 | in[2] << 9, in[1],
519 in[1], in[0] >> 22 | in[1] << 10,
520 in[0], in[0]);
521 results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
522 _mm512_storeu_si512(out, results);
523 out += 16;
524
525 // shift the second 16 outs
526 reg_shifts = _mm512_set_epi32(21, 10, 0, 20,
527 9, 0, 19, 8,
528 0, 18, 7, 0,
529 17, 6, 0, 16);
530 reg_inls = _mm512_set_epi32(in[10], in[10],
531 in[9] >> 31 | in[10] << 1, in[9],
532 in[9], in[8] >> 30 | in[9] << 2,
533 in[8], in[8],
534 in[7] >> 29 | in[8] << 3, in[7],
535 in[7], in[6] >> 28 | in[7] << 4,
536 in[6], in[6],
537 in[5] >> 27 | in[6] << 5, in[5]);
538 results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
539 _mm512_storeu_si512(out, results);
540 out += 16;
541
542 in += 11;
543
544 return in;
545 }
546
unpack12_32_avx512(const uint32_t * in,uint32_t * out)547 inline static const uint32_t* unpack12_32_avx512(const uint32_t* in, uint32_t* out) {
548 uint32_t mask = 0xfff;
549 __m512i reg_shifts, reg_inls, reg_masks;
550 __m512i results;
551
552 reg_masks = _mm512_set1_epi32(mask);
553
554 // shift the first 16 outs
555 reg_shifts = _mm512_set_epi32(20, 8, 0, 16,
556 4, 0, 12, 0,
557 20, 8, 0, 16,
558 4, 0, 12, 0);
559 reg_inls = _mm512_set_epi32(in[5], in[5],
560 in[4] >> 28 | in[5] << 4, in[4],
561 in[4], in[3] >> 24 | in[4] << 8,
562 in[3], in[3],
563 in[2], in[2],
564 in[1] >> 28 | in[2] << 4, in[1],
565 in[1], in[0] >> 24 | in[1] << 8,
566 in[0], in[0]);
567 results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
568 _mm512_storeu_si512(out, results);
569 out += 16;
570
571 // shift the second 16 outs
572 reg_shifts = _mm512_set_epi32(20, 8, 0, 16,
573 4, 0, 12, 0,
574 20, 8, 0, 16,
575 4, 0, 12, 0);
576 reg_inls = _mm512_set_epi32(in[11], in[11],
577 in[10] >> 28 | in[11] << 4, in[10],
578 in[10], in[9] >> 24 | in[10] << 8,
579 in[9], in[9],
580 in[8], in[8],
581 in[7] >> 28 | in[8] << 4, in[7],
582 in[7], in[6] >> 24 | in[7] << 8,
583 in[6], in[6]);
584 results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
585 _mm512_storeu_si512(out, results);
586 out += 16;
587
588 in += 12;
589
590 return in;
591 }
592
unpack13_32_avx512(const uint32_t * in,uint32_t * out)593 inline static const uint32_t* unpack13_32_avx512(const uint32_t* in, uint32_t* out) {
594 uint32_t mask = 0x1fff;
595 __m512i reg_shifts, reg_inls, reg_masks;
596 __m512i results;
597
598 reg_masks = _mm512_set1_epi32(mask);
599
600 // shift the first 16 outs
601 reg_shifts = _mm512_set_epi32(3, 0, 9, 0,
602 15, 2, 0, 8,
603 0, 14, 1, 0,
604 7, 0, 13, 0);
605 reg_inls = _mm512_set_epi32(in[6], in[5] >> 22 | in[6] << 10,
606 in[5], in[4] >> 28 | in[5] << 4,
607 in[4], in[4],
608 in[3] >> 21 | in[4] << 11, in[3],
609 in[2] >> 27 | in[3] << 5, in[2],
610 in[2], in[1] >> 20 | in[2] << 12,
611 in[1], in[0] >> 26 | in[1] << 6,
612 in[0], in[0]);
613 results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
614 _mm512_storeu_si512(out, results);
615 out += 16;
616
617 // shift the second 16 outs
618 reg_shifts = _mm512_set_epi32(19, 6, 0, 12,
619 0, 18, 5, 0,
620 11, 0, 17, 4,
621 0, 10, 0, 16);
622 reg_inls = _mm512_set_epi32(in[12], in[12],
623 in[11] >> 25 | in[12] << 7, in[11],
624 in[10] >> 31 | in[11] << 1, in[10],
625 in[10], in[9] >> 24 | in[10] << 8,
626 in[9], in[8] >> 30 | in[9] << 2,
627 in[8], in[8],
628 in[7] >> 23 | in[8] << 9, in[7],
629 in[6] >> 29 | in[7] << 3, in[6]);
630 results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
631 _mm512_storeu_si512(out, results);
632 out += 16;
633
634 in += 13;
635
636 return in;
637 }
638
unpack14_32_avx512(const uint32_t * in,uint32_t * out)639 inline static const uint32_t* unpack14_32_avx512(const uint32_t* in, uint32_t* out) {
640 uint32_t mask = 0x3fff;
641 __m512i reg_shifts, reg_inls, reg_masks;
642 __m512i results;
643
644 reg_masks = _mm512_set1_epi32(mask);
645
646 // shift the first 16 outs
647 reg_shifts = _mm512_set_epi32(18, 4, 0, 8,
648 0, 12, 0, 16,
649 2, 0, 6, 0,
650 10, 0, 14, 0);
651 reg_inls = _mm512_set_epi32(in[6], in[6],
652 in[5] >> 22 | in[6] << 10, in[5],
653 in[4] >> 26 | in[5] << 6, in[4],
654 in[3] >> 30 | in[4] << 2, in[3],
655 in[3], in[2] >> 20 | in[3] << 12,
656 in[2], in[1] >> 24 | in[2] << 8,
657 in[1], in[0] >> 28 | in[1] << 4,
658 in[0], in[0]);
659 results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
660 _mm512_storeu_si512(out, results);
661 out += 16;
662
663 // shift the second 16 outs
664 reg_shifts = _mm512_set_epi32(18, 4, 0, 8,
665 0, 12, 0, 16,
666 2, 0, 6, 0,
667 10, 0, 14, 0);
668 reg_inls = _mm512_set_epi32(in[13], in[13],
669 in[12] >> 22 | in[13] << 10, in[12],
670 in[11] >> 26 | in[12] << 6, in[11],
671 in[10] >> 30 | in[11] << 2, in[10],
672 in[10], in[9] >> 20 | in[10] << 12,
673 in[9], in[8] >> 24 | in[9] << 8,
674 in[8], in[7] >> 28 | in[8] << 4,
675 in[7], in[7]);
676 results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
677 _mm512_storeu_si512(out, results);
678 out += 16;
679
680 in += 14;
681
682 return in;
683 }
684
unpack15_32_avx512(const uint32_t * in,uint32_t * out)685 inline static const uint32_t* unpack15_32_avx512(const uint32_t* in, uint32_t* out) {
686 uint32_t mask = 0x7fff;
687 __m512i reg_shifts, reg_inls, reg_masks;
688 __m512i results;
689
690 reg_masks = _mm512_set1_epi32(mask);
691
692 // shift the first 16 outs
693 reg_shifts = _mm512_set_epi32(1, 0, 3, 0,
694 5, 0, 7, 0,
695 9, 0, 11, 0,
696 13, 0, 15, 0);
697 reg_inls = _mm512_set_epi32(in[7], in[6] >> 18 | in[7] << 14,
698 in[6], in[5] >> 20 | in[6] << 12,
699 in[5], in[4] >> 22 | in[5] << 10,
700 in[4], in[3] >> 24 | in[4] << 8,
701 in[3], in[2] >> 26 | in[3] << 6,
702 in[2], in[1] >> 28 | in[2] << 4,
703 in[1], in[0] >> 30 | in[1] << 2,
704 in[0], in[0]);
705 results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
706 _mm512_storeu_si512(out, results);
707 out += 16;
708
709 // shift the second 16 outs
710 reg_shifts = _mm512_set_epi32(17, 2, 0, 4,
711 0, 6, 0, 8,
712 0, 10, 0, 12,
713 0, 14, 0, 16);
714 reg_inls = _mm512_set_epi32(in[14], in[14],
715 in[13] >> 19 | in[14] << 13, in[13],
716 in[12] >> 21 | in[13] << 11, in[12],
717 in[11] >> 23 | in[12] << 9, in[11],
718 in[10] >> 25 | in[11] << 7, in[10],
719 in[9] >> 27 | in[10] << 5, in[9],
720 in[8] >> 29 | in[9] << 3, in[8],
721 in[7] >> 31 | in[8] << 1, in[7]);
722 results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
723 _mm512_storeu_si512(out, results);
724 out += 16;
725
726 in += 15;
727
728 return in;
729 }
730
unpack16_32_avx512(const uint32_t * in,uint32_t * out)731 inline static const uint32_t* unpack16_32_avx512(const uint32_t* in, uint32_t* out) {
732 uint32_t mask = 0xffff;
733 __m512i reg_shifts, reg_inls, reg_masks;
734 __m512i results;
735
736 reg_masks = _mm512_set1_epi32(mask);
737
738 // shift the first 16 outs
739 reg_shifts = _mm512_set_epi32(16, 0, 16, 0,
740 16, 0, 16, 0,
741 16, 0, 16, 0,
742 16, 0, 16, 0);
743 reg_inls = _mm512_set_epi32(in[7], in[7],
744 in[6], in[6],
745 in[5], in[5],
746 in[4], in[4],
747 in[3], in[3],
748 in[2], in[2],
749 in[1], in[1],
750 in[0], in[0]);
751 results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
752 _mm512_storeu_si512(out, results);
753 out += 16;
754
755 // shift the second 16 outs
756 reg_shifts = _mm512_set_epi32(16, 0, 16, 0,
757 16, 0, 16, 0,
758 16, 0, 16, 0,
759 16, 0, 16, 0);
760 reg_inls = _mm512_set_epi32(in[15], in[15],
761 in[14], in[14],
762 in[13], in[13],
763 in[12], in[12],
764 in[11], in[11],
765 in[10], in[10],
766 in[9], in[9],
767 in[8], in[8]);
768 results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
769 _mm512_storeu_si512(out, results);
770 out += 16;
771
772 in += 16;
773
774 return in;
775 }
776
unpack17_32_avx512(const uint32_t * in,uint32_t * out)777 inline static const uint32_t* unpack17_32_avx512(const uint32_t* in, uint32_t* out) {
778 uint32_t mask = 0x1ffff;
779 __m512i reg_shifts, reg_inls, reg_masks;
780 __m512i results;
781
782 reg_masks = _mm512_set1_epi32(mask);
783
784 // shift the first 16 outs
785 reg_shifts = _mm512_set_epi32(0, 14, 0, 12,
786 0, 10, 0, 8,
787 0, 6, 0, 4,
788 0, 2, 0, 0);
789 reg_inls = _mm512_set_epi32(in[7] >> 31 | in[8] << 1, in[7],
790 in[6] >> 29 | in[7] << 3, in[6],
791 in[5] >> 27 | in[6] << 5, in[5],
792 in[4] >> 25 | in[5] << 7, in[4],
793 in[3] >> 23 | in[4] << 9, in[3],
794 in[2] >> 21 | in[3] << 11, in[2],
795 in[1] >> 19 | in[2] << 13, in[1],
796 in[0] >> 17 | in[1] << 15, in[0]);
797 results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
798 _mm512_storeu_si512(out, results);
799 out += 16;
800
801 // shift the second 16 outs
802 reg_shifts = _mm512_set_epi32(15, 0, 13, 0,
803 11, 0, 9, 0,
804 7, 0, 5, 0,
805 3, 0, 1, 0);
806 reg_inls = _mm512_set_epi32(in[16], in[15] >> 30 | in[16] << 2,
807 in[15], in[14] >> 28 | in[15] << 4,
808 in[14], in[13] >> 26 | in[14] << 6,
809 in[13], in[12] >> 24 | in[13] << 8,
810 in[12], in[11] >> 22 | in[12] << 10,
811 in[11], in[10] >> 20 | in[11] << 12,
812 in[10], in[9] >> 18 | in[10] << 14,
813 in[9], in[8] >> 16 | in[9] << 16);
814 results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
815 _mm512_storeu_si512(out, results);
816 out += 16;
817
818 in += 17;
819
820 return in;
821 }
822
unpack18_32_avx512(const uint32_t * in,uint32_t * out)823 inline static const uint32_t* unpack18_32_avx512(const uint32_t* in, uint32_t* out) {
824 uint32_t mask = 0x3ffff;
825 __m512i reg_shifts, reg_inls, reg_masks;
826 __m512i results;
827
828 reg_masks = _mm512_set1_epi32(mask);
829
830 // shift the first 16 outs
831 reg_shifts = _mm512_set_epi32(14, 0, 10, 0,
832 6, 0, 2, 0,
833 0, 12, 0, 8,
834 0, 4, 0, 0);
835 reg_inls = _mm512_set_epi32(in[8], in[7] >> 28 | in[8] << 4,
836 in[7], in[6] >> 24 | in[7] << 8,
837 in[6], in[5] >> 20 | in[6] << 12,
838 in[5], in[4] >> 16 | in[5] << 16,
839 in[3] >> 30 | in[4] << 2, in[3],
840 in[2] >> 26 | in[3] << 6, in[2],
841 in[1] >> 22 | in[2] << 10, in[1],
842 in[0] >> 18 | in[1] << 14, in[0]);
843 results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
844 _mm512_storeu_si512(out, results);
845 out += 16;
846
847 // shift the second 16 outs
848 reg_shifts = _mm512_set_epi32(14, 0, 10, 0,
849 6, 0, 2, 0,
850 0, 12, 0, 8,
851 0, 4, 0, 0);
852 reg_inls = _mm512_set_epi32(in[17], in[16] >> 28 | in[17] << 4,
853 in[16], in[15] >> 24 | in[16] << 8,
854 in[15], in[14] >> 20 | in[15] << 12,
855 in[14], in[13] >> 16 | in[14] << 16,
856 in[12] >> 30 | in[13] << 2, in[12],
857 in[11] >> 26 | in[12] << 6, in[11],
858 in[10] >> 22 | in[11] << 10, in[10],
859 in[9] >> 18 | in[10] << 14, in[9]);
860 results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
861 _mm512_storeu_si512(out, results);
862 out += 16;
863
864 in += 18;
865
866 return in;
867 }
868
unpack19_32_avx512(const uint32_t * in,uint32_t * out)869 inline static const uint32_t* unpack19_32_avx512(const uint32_t* in, uint32_t* out) {
870 uint32_t mask = 0x7ffff;
871 __m512i reg_shifts, reg_inls, reg_masks;
872 __m512i results;
873
874 reg_masks = _mm512_set1_epi32(mask);
875
876 // shift the first 16 outs
877 reg_shifts = _mm512_set_epi32(0, 10, 0, 4,
878 0, 0, 11, 0,
879 5, 0, 0, 12,
880 0, 6, 0, 0);
881 reg_inls = _mm512_set_epi32(in[8] >> 29 | in[9] << 3, in[8],
882 in[7] >> 23 | in[8] << 9, in[7],
883 in[6] >> 17 | in[7] << 15, in[5] >> 30 | in[6] << 2,
884 in[5], in[4] >> 24 | in[5] << 8,
885 in[4], in[3] >> 18 | in[4] << 14,
886 in[2] >> 31 | in[3] << 1, in[2],
887 in[1] >> 25 | in[2] << 7, in[1],
888 in[0] >> 19 | in[1] << 13, in[0]);
889 results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
890 _mm512_storeu_si512(out, results);
891 out += 16;
892
893 // shift the second 16 outs
894 reg_shifts = _mm512_set_epi32(13, 0, 7, 0,
895 1, 0, 0, 8,
896 0, 2, 0, 0,
897 9, 0, 3, 0);
898 reg_inls = _mm512_set_epi32(in[18], in[17] >> 26 | in[18] << 6,
899 in[17], in[16] >> 20 | in[17] << 12,
900 in[16], in[15] >> 14 | in[16] << 18,
901 in[14] >> 27 | in[15] << 5, in[14],
902 in[13] >> 21 | in[14] << 11, in[13],
903 in[12] >> 15 | in[13] << 17, in[11] >> 28 | in[12] << 4,
904 in[11], in[10] >> 22 | in[11] << 10,
905 in[10], in[9] >> 16 | in[10] << 16);
906 results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
907 _mm512_storeu_si512(out, results);
908 out += 16;
909
910 in += 19;
911
912 return in;
913 }
914
unpack20_32_avx512(const uint32_t * in,uint32_t * out)915 inline static const uint32_t* unpack20_32_avx512(const uint32_t* in, uint32_t* out) {
916 uint32_t mask = 0xfffff;
917 __m512i reg_shifts, reg_inls, reg_masks;
918 __m512i results;
919
920 reg_masks = _mm512_set1_epi32(mask);
921
922 // shift the first 16 outs
923 reg_shifts = _mm512_set_epi32(12, 0, 4, 0,
924 0, 8, 0, 0,
925 12, 0, 4, 0,
926 0, 8, 0, 0);
927 reg_inls = _mm512_set_epi32(in[9], in[8] >> 24 | in[9] << 8,
928 in[8], in[7] >> 16 | in[8] << 16,
929 in[6] >> 28 | in[7] << 4, in[6],
930 in[5] >> 20 | in[6] << 12, in[5],
931 in[4], in[3] >> 24 | in[4] << 8,
932 in[3], in[2] >> 16 | in[3] << 16,
933 in[1] >> 28 | in[2] << 4, in[1],
934 in[0] >> 20 | in[1] << 12, in[0]);
935 results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
936 _mm512_storeu_si512(out, results);
937 out += 16;
938
939 // shift the second 16 outs
940 reg_shifts = _mm512_set_epi32(12, 0, 4, 0,
941 0, 8, 0, 0,
942 12, 0, 4, 0,
943 0, 8, 0, 0);
944 reg_inls = _mm512_set_epi32(in[19], in[18] >> 24 | in[19] << 8,
945 in[18], in[17] >> 16 | in[18] << 16,
946 in[16] >> 28 | in[17] << 4, in[16],
947 in[15] >> 20 | in[16] << 12, in[15],
948 in[14], in[13] >> 24 | in[14] << 8,
949 in[13], in[12] >> 16 | in[13] << 16,
950 in[11] >> 28 | in[12] << 4, in[11],
951 in[10] >> 20 | in[11] << 12, in[10]);
952 results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
953 _mm512_storeu_si512(out, results);
954 out += 16;
955
956 in += 20;
957
958 return in;
959 }
960
unpack21_32_avx512(const uint32_t * in,uint32_t * out)961 inline static const uint32_t* unpack21_32_avx512(const uint32_t* in, uint32_t* out) {
962 uint32_t mask = 0x1fffff;
963 __m512i reg_shifts, reg_inls, reg_masks;
964 __m512i results;
965
966 reg_masks = _mm512_set1_epi32(mask);
967
968 // shift the first 16 outs
969 reg_shifts = _mm512_set_epi32(0, 6, 0, 0,
970 7, 0, 0, 8,
971 0, 0, 9, 0,
972 0, 10, 0, 0);
973 reg_inls = _mm512_set_epi32(in[9] >> 27 | in[10] << 5, in[9],
974 in[8] >> 17 | in[9] << 15, in[7] >> 28 | in[8] << 4,
975 in[7], in[6] >> 18 | in[7] << 14,
976 in[5] >> 29 | in[6] << 3, in[5],
977 in[4] >> 19 | in[5] << 13, in[3] >> 30 | in[4] << 2,
978 in[3], in[2] >> 20 | in[3] << 12,
979 in[1] >> 31 | in[2] << 1, in[1],
980 in[0] >> 21 | in[1] << 11, in[0]);
981 results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
982 _mm512_storeu_si512(out, results);
983 out += 16;
984
985 // shift the second 16 outs
986 reg_shifts = _mm512_set_epi32(11, 0, 1, 0,
987 0, 2, 0, 0,
988 3, 0, 0, 4,
989 0, 0, 5, 0);
990 reg_inls = _mm512_set_epi32(in[20], in[19] >> 22 | in[20] << 10,
991 in[19], in[18] >> 12 | in[19] << 20,
992 in[17] >> 23 | in[18] << 9, in[17],
993 in[16] >> 13 | in[17] << 19, in[15] >> 24 | in[16] << 8,
994 in[15], in[14] >> 14 | in[15] << 18,
995 in[13] >> 25 | in[14] << 7, in[13],
996 in[12] >> 15 | in[13] << 17, in[11] >> 26 | in[12] << 6,
997 in[11], in[10] >> 16 | in[11] << 16);
998 results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
999 _mm512_storeu_si512(out, results);
1000 out += 16;
1001
1002 in += 21;
1003
1004 return in;
1005 }
1006
unpack22_32_avx512(const uint32_t * in,uint32_t * out)1007 inline static const uint32_t* unpack22_32_avx512(const uint32_t* in, uint32_t* out) {
1008 uint32_t mask = 0x3fffff;
1009 __m512i reg_shifts, reg_inls, reg_masks;
1010 __m512i results;
1011
1012 reg_masks = _mm512_set1_epi32(mask);
1013
1014 // shift the first 16 outs
1015 reg_shifts = _mm512_set_epi32(10, 0, 0, 8,
1016 0, 0, 6, 0,
1017 0, 4, 0, 0,
1018 2, 0, 0, 0);
1019 reg_inls = _mm512_set_epi32(in[10], in[9] >> 20 | in[10] << 12,
1020 in[8] >> 30 | in[9] << 2, in[8],
1021 in[7] >> 18 | in[8] << 14, in[6] >> 28 | in[7] << 4,
1022 in[6], in[5] >> 16 | in[6] << 16,
1023 in[4] >> 26 | in[5] << 6, in[4],
1024 in[3] >> 14 | in[4] << 18, in[2] >> 24 | in[3] << 8,
1025 in[2], in[1] >> 12 | in[2] << 20,
1026 in[0] >> 22 | in[1] << 10, in[0]);
1027 results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1028 _mm512_storeu_si512(out, results);
1029 out += 16;
1030
1031 // shift the second 16 outs
1032 reg_shifts = _mm512_set_epi32(10, 0, 0, 8,
1033 0, 0, 6, 0,
1034 0, 4, 0, 0,
1035 2, 0, 0, 0);
1036 reg_inls = _mm512_set_epi32(in[21], in[20] >> 20 | in[21] << 12,
1037 in[19] >> 30 | in[20] << 2, in[19],
1038 in[18] >> 18 | in[19] << 14, in[17] >> 28 | in[18] << 4,
1039 in[17], in[16] >> 16 | in[17] << 16,
1040 in[15] >> 26 | in[16] << 6, in[15],
1041 in[14] >> 14 | in[15] << 18, in[13] >> 24 | in[14] << 8,
1042 in[13], in[12] >> 12 | in[13] << 20,
1043 in[11] >> 22 | in[12] << 10, in[11]);
1044 results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1045 _mm512_storeu_si512(out, results);
1046 out += 16;
1047
1048 in += 22;
1049
1050 return in;
1051 }
1052
unpack23_32_avx512(const uint32_t * in,uint32_t * out)1053 inline static const uint32_t* unpack23_32_avx512(const uint32_t* in, uint32_t* out) {
1054 uint32_t mask = 0x7fffff;
1055 __m512i reg_shifts, reg_inls, reg_masks;
1056 __m512i results;
1057
1058 reg_masks = _mm512_set1_epi32(mask);
1059
1060 // shift the first 16 outs
1061 reg_shifts = _mm512_set_epi32(0, 2, 0, 0,
1062 0, 6, 0, 0,
1063 1, 0, 0, 0,
1064 5, 0, 0, 0);
1065 reg_inls = _mm512_set_epi32(in[10] >> 25 | in[11] << 7, in[10],
1066 in[9] >> 11 | in[10] << 21, in[8] >> 20 | in[9] << 12,
1067 in[7] >> 29 | in[8] << 3, in[7],
1068 in[6] >> 15 | in[7] << 17, in[5] >> 24 | in[6] << 8,
1069 in[5], in[4] >> 10 | in[5] << 22,
1070 in[3] >> 19 | in[4] << 13, in[2] >> 28 | in[3] << 4,
1071 in[2], in[1] >> 14 | in[2] << 18,
1072 in[0] >> 23 | in[1] << 9, in[0]);
1073 results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1074 _mm512_storeu_si512(out, results);
1075 out += 16;
1076
1077 // shift the second 16 outs
1078 reg_shifts = _mm512_set_epi32(9, 0, 0, 4,
1079 0, 0, 0, 8,
1080 0, 0, 3, 0,
1081 0, 0, 7, 0);
1082 reg_inls = _mm512_set_epi32(in[22], in[21] >> 18 | in[22] << 14,
1083 in[20] >> 27 | in[21] << 5, in[20],
1084 in[19] >> 13 | in[20] << 19, in[18] >> 22 | in[19] << 10,
1085 in[17] >> 31 | in[18] << 1, in[17],
1086 in[16] >> 17 | in[17] << 15, in[15] >> 26 | in[16] << 6,
1087 in[15], in[14] >> 12 | in[15] << 20,
1088 in[13] >> 21 | in[14] << 11, in[12] >> 30 | in[13] << 2,
1089 in[12], in[11] >> 16 | in[12] << 16);
1090 results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1091 _mm512_storeu_si512(out, results);
1092 out += 16;
1093
1094 in += 23;
1095
1096 return in;
1097 }
1098
unpack24_32_avx512(const uint32_t * in,uint32_t * out)1099 inline static const uint32_t* unpack24_32_avx512(const uint32_t* in, uint32_t* out) {
1100 uint32_t mask = 0xffffff;
1101 __m512i reg_shifts, reg_inls, reg_masks;
1102 __m512i results;
1103
1104 reg_masks = _mm512_set1_epi32(mask);
1105
1106 // shift the first 16 outs
1107 reg_shifts = _mm512_set_epi32(8, 0, 0, 0,
1108 8, 0, 0, 0,
1109 8, 0, 0, 0,
1110 8, 0, 0, 0);
1111 reg_inls = _mm512_set_epi32(in[11], in[10] >> 16 | in[11] << 16,
1112 in[9] >> 24 | in[10] << 8, in[9],
1113 in[8], in[7] >> 16 | in[8] << 16,
1114 in[6] >> 24 | in[7] << 8, in[6],
1115 in[5], in[4] >> 16 | in[5] << 16,
1116 in[3] >> 24 | in[4] << 8, in[3],
1117 in[2], in[1] >> 16 | in[2] << 16,
1118 in[0] >> 24 | in[1] << 8, in[0]);
1119 results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1120 _mm512_storeu_si512(out, results);
1121 out += 16;
1122
1123 // shift the second 16 outs
1124 reg_shifts = _mm512_set_epi32(8, 0, 0, 0,
1125 8, 0, 0, 0,
1126 8, 0, 0, 0,
1127 8, 0, 0, 0);
1128 reg_inls = _mm512_set_epi32(in[23], in[22] >> 16 | in[23] << 16,
1129 in[21] >> 24 | in[22] << 8, in[21],
1130 in[20], in[19] >> 16 | in[20] << 16,
1131 in[18] >> 24 | in[19] << 8, in[18],
1132 in[17], in[16] >> 16 | in[17] << 16,
1133 in[15] >> 24 | in[16] << 8, in[15],
1134 in[14], in[13] >> 16 | in[14] << 16,
1135 in[12] >> 24 | in[13] << 8, in[12]);
1136 results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1137 _mm512_storeu_si512(out, results);
1138 out += 16;
1139
1140 in += 24;
1141
1142 return in;
1143 }
1144
unpack25_32_avx512(const uint32_t * in,uint32_t * out)1145 inline static const uint32_t* unpack25_32_avx512(const uint32_t* in, uint32_t* out) {
1146 uint32_t mask = 0x1ffffff;
1147 __m512i reg_shifts, reg_inls, reg_masks;
1148 __m512i results;
1149
1150 reg_masks = _mm512_set1_epi32(mask);
1151
1152 // shift the first 16 outs
1153 reg_shifts = _mm512_set_epi32(0, 0, 5, 0,
1154 0, 0, 1, 0,
1155 0, 0, 0, 4,
1156 0, 0, 0, 0);
1157 reg_inls = _mm512_set_epi32(in[11] >> 23 | in[12] << 9, in[10] >> 30 | in[11] << 2,
1158 in[10], in[9] >> 12 | in[10] << 20,
1159 in[8] >> 19 | in[9] << 13, in[7] >> 26 | in[8] << 6,
1160 in[7], in[6] >> 8 | in[7] << 24,
1161 in[5] >> 15 | in[6] << 17, in[4] >> 22 | in[5] << 10,
1162 in[3] >> 29 | in[4] << 3, in[3],
1163 in[2] >> 11 | in[3] << 21, in[1] >> 18 | in[2] << 14,
1164 in[0] >> 25 | in[1] << 7, in[0]);
1165 results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1166 _mm512_storeu_si512(out, results);
1167 out += 16;
1168
1169 // shift the second 16 outs
1170 reg_shifts = _mm512_set_epi32(7, 0, 0, 0,
1171 3, 0, 0, 0,
1172 0, 6, 0, 0,
1173 0, 2, 0, 0);
1174 reg_inls = _mm512_set_epi32(in[24], in[23] >> 14 | in[24] << 18,
1175 in[22] >> 21 | in[23] << 11, in[21] >> 28 | in[22] << 4,
1176 in[21], in[20] >> 10 | in[21] << 22,
1177 in[19] >> 17 | in[20] << 15, in[18] >> 24 | in[19] << 8,
1178 in[17] >> 31 | in[18] << 1, in[17],
1179 in[16] >> 13 | in[17] << 19, in[15] >> 20 | in[16] << 12,
1180 in[14] >> 27 | in[15] << 5, in[14],
1181 in[13] >> 9 | in[14] << 23, in[12] >> 16 | in[13] << 16);
1182 results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1183 _mm512_storeu_si512(out, results);
1184 out += 16;
1185
1186 in += 25;
1187
1188 return in;
1189 }
1190
unpack26_32_avx512(const uint32_t * in,uint32_t * out)1191 inline static const uint32_t* unpack26_32_avx512(const uint32_t* in, uint32_t* out) {
1192 uint32_t mask = 0x3ffffff;
1193 __m512i reg_shifts, reg_inls, reg_masks;
1194 __m512i results;
1195
1196 reg_masks = _mm512_set1_epi32(mask);
1197
1198 // shift the first 16 outs
1199 reg_shifts = _mm512_set_epi32(6, 0, 0, 0,
1200 0, 4, 0, 0,
1201 0, 0, 2, 0,
1202 0, 0, 0, 0);
1203 reg_inls = _mm512_set_epi32(in[12], in[11] >> 12 | in[12] << 20,
1204 in[10] >> 18 | in[11] << 14, in[9] >> 24 | in[10] << 8,
1205 in[8] >> 30 | in[9] << 2, in[8],
1206 in[7] >> 10 | in[8] << 22, in[6] >> 16 | in[7] << 16,
1207 in[5] >> 22 | in[6] << 10, in[4] >> 28 | in[5] << 4,
1208 in[4], in[3] >> 8 | in[4] << 24,
1209 in[2] >> 14 | in[3] << 18, in[1] >> 20 | in[2] << 12,
1210 in[0] >> 26 | in[1] << 6, in[0]);
1211 results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1212 _mm512_storeu_si512(out, results);
1213 out += 16;
1214
1215 // shift the second 16 outs
1216 reg_shifts = _mm512_set_epi32(6, 0, 0, 0,
1217 0, 4, 0, 0,
1218 0, 0, 2, 0,
1219 0, 0, 0, 0);
1220 reg_inls = _mm512_set_epi32(in[25], in[24] >> 12 | in[25] << 20,
1221 in[23] >> 18 | in[24] << 14, in[22] >> 24 | in[23] << 8,
1222 in[21] >> 30 | in[22] << 2, in[21],
1223 in[20] >> 10 | in[21] << 22, in[19] >> 16 | in[20] << 16,
1224 in[18] >> 22 | in[19] << 10, in[17] >> 28 | in[18] << 4,
1225 in[17], in[16] >> 8 | in[17] << 24,
1226 in[15] >> 14 | in[16] << 18, in[14] >> 20 | in[15] << 12,
1227 in[13] >> 26 | in[14] << 6, in[13]);
1228 results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1229 _mm512_storeu_si512(out, results);
1230 out += 16;
1231
1232 in += 26;
1233
1234 return in;
1235 }
1236
unpack27_32_avx512(const uint32_t * in,uint32_t * out)1237 inline static const uint32_t* unpack27_32_avx512(const uint32_t* in, uint32_t* out) {
1238 uint32_t mask = 0x7ffffff;
1239 __m512i reg_shifts, reg_inls, reg_masks;
1240 __m512i results;
1241
1242 reg_masks = _mm512_set1_epi32(mask);
1243
1244 // shift the first 16 outs
1245 reg_shifts = _mm512_set_epi32(0, 0, 0, 4,
1246 0, 0, 0, 0,
1247 0, 2, 0, 0,
1248 0, 0, 0, 0);
1249 reg_inls = _mm512_set_epi32(in[12] >> 21 | in[13] << 11, in[11] >> 26 | in[12] << 6,
1250 in[10] >> 31 | in[11] << 1, in[10],
1251 in[9] >> 9 | in[10] << 23, in[8] >> 14 | in[9] << 18,
1252 in[7] >> 19 | in[8] << 13, in[6] >> 24 | in[7] << 8,
1253 in[5] >> 29 | in[6] << 3, in[5],
1254 in[4] >> 7 | in[5] << 25, in[3] >> 12 | in[4] << 20,
1255 in[2] >> 17 | in[3] << 15, in[1] >> 22 | in[2] << 10,
1256 in[0] >> 27 | in[1] << 5, in[0]);
1257 results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1258 _mm512_storeu_si512(out, results);
1259 out += 16;
1260
1261 // shift the second 16 outs
1262 reg_shifts = _mm512_set_epi32(5, 0, 0, 0,
1263 0, 0, 3, 0,
1264 0, 0, 0, 0,
1265 1, 0, 0, 0);
1266 reg_inls = _mm512_set_epi32(in[26], in[25] >> 10 | in[26] << 22,
1267 in[24] >> 15 | in[25] << 17, in[23] >> 20 | in[24] << 12,
1268 in[22] >> 25 | in[23] << 7, in[21] >> 30 | in[22] << 2,
1269 in[21], in[20] >> 8 | in[21] << 24,
1270 in[19] >> 13 | in[20] << 19, in[18] >> 18 | in[19] << 14,
1271 in[17] >> 23 | in[18] << 9, in[16] >> 28 | in[17] << 4,
1272 in[16], in[15] >> 6 | in[16] << 26,
1273 in[14] >> 11 | in[15] << 21, in[13] >> 16 | in[14] << 16);
1274 results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1275 _mm512_storeu_si512(out, results);
1276 out += 16;
1277
1278 in += 27;
1279
1280 return in;
1281 }
1282
unpack28_32_avx512(const uint32_t * in,uint32_t * out)1283 inline static const uint32_t* unpack28_32_avx512(const uint32_t* in, uint32_t* out) {
1284 uint32_t mask = 0xfffffff;
1285 __m512i reg_shifts, reg_inls, reg_masks;
1286 __m512i results;
1287
1288 reg_masks = _mm512_set1_epi32(mask);
1289
1290 // shift the first 16 outs
1291 reg_shifts = _mm512_set_epi32(4, 0, 0, 0,
1292 0, 0, 0, 0,
1293 4, 0, 0, 0,
1294 0, 0, 0, 0);
1295 reg_inls = _mm512_set_epi32(in[13], in[12] >> 8 | in[13] << 24,
1296 in[11] >> 12 | in[12] << 20, in[10] >> 16 | in[11] << 16,
1297 in[9] >> 20 | in[10] << 12, in[8] >> 24 | in[9] << 8,
1298 in[7] >> 28 | in[8] << 4, in[7],
1299 in[6], in[5] >> 8 | in[6] << 24,
1300 in[4] >> 12 | in[5] << 20, in[3] >> 16 | in[4] << 16,
1301 in[2] >> 20 | in[3] << 12, in[1] >> 24 | in[2] << 8,
1302 in[0] >> 28 | in[1] << 4, in[0]);
1303 results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1304 _mm512_storeu_si512(out, results);
1305 out += 16;
1306
1307 // shift the second 16 outs
1308 reg_shifts = _mm512_set_epi32(4, 0, 0, 0,
1309 0, 0, 0, 0,
1310 4, 0, 0, 0,
1311 0, 0, 0, 0);
1312 reg_inls = _mm512_set_epi32(in[27], in[26] >> 8 | in[27] << 24,
1313 in[25] >> 12 | in[26] << 20, in[24] >> 16 | in[25] << 16,
1314 in[23] >> 20 | in[24] << 12, in[22] >> 24 | in[23] << 8,
1315 in[21] >> 28 | in[22] << 4, in[21],
1316 in[20], in[19] >> 8 | in[20] << 24,
1317 in[18] >> 12 | in[19] << 20, in[17] >> 16 | in[18] << 16,
1318 in[16] >> 20 | in[17] << 12, in[15] >> 24 | in[16] << 8,
1319 in[14] >> 28 | in[15] << 4, in[14]);
1320 results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1321 _mm512_storeu_si512(out, results);
1322 out += 16;
1323
1324 in += 28;
1325
1326 return in;
1327 }
1328
unpack29_32_avx512(const uint32_t * in,uint32_t * out)1329 inline static const uint32_t* unpack29_32_avx512(const uint32_t* in, uint32_t* out) {
1330 uint32_t mask = 0x1fffffff;
1331 __m512i reg_shifts, reg_inls, reg_masks;
1332 __m512i results;
1333
1334 reg_masks = _mm512_set1_epi32(mask);
1335
1336 // shift the first 16 outs
1337 reg_shifts = _mm512_set_epi32(0, 0, 0, 0,
1338 0, 2, 0, 0,
1339 0, 0, 0, 0,
1340 0, 0, 0, 0);
1341 reg_inls = _mm512_set_epi32(in[13] >> 19 | in[14] << 13, in[12] >> 22 | in[13] << 10,
1342 in[11] >> 25 | in[12] << 7, in[10] >> 28 | in[11] << 4,
1343 in[9] >> 31 | in[10] << 1, in[9],
1344 in[8] >> 5 | in[9] << 27, in[7] >> 8 | in[8] << 24,
1345 in[6] >> 11 | in[7] << 21, in[5] >> 14 | in[6] << 18,
1346 in[4] >> 17 | in[5] << 15, in[3] >> 20 | in[4] << 12,
1347 in[2] >> 23 | in[3] << 9, in[1] >> 26 | in[2] << 6,
1348 in[0] >> 29 | in[1] << 3, in[0]);
1349 results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1350 _mm512_storeu_si512(out, results);
1351 out += 16;
1352
1353 // shift the second 16 outs
1354 reg_shifts = _mm512_set_epi32(3, 0, 0, 0,
1355 0, 0, 0, 0,
1356 0, 0, 1, 0,
1357 0, 0, 0, 0);
1358 reg_inls = _mm512_set_epi32(in[28], in[27] >> 6 | in[28] << 26,
1359 in[26] >> 9 | in[27] << 23, in[25] >> 12 | in[26] << 20,
1360 in[24] >> 15 | in[25] << 17, in[23] >> 18 | in[24] << 14,
1361 in[22] >> 21 | in[23] << 11, in[21] >> 24 | in[22] << 8,
1362 in[20] >> 27 | in[21] << 5, in[19] >> 30 | in[20] << 2,
1363 in[19], in[18] >> 4 | in[19] << 28,
1364 in[17] >> 7 | in[18] << 25, in[16] >> 10 | in[17] << 22,
1365 in[15] >> 13 | in[16] << 19, in[14] >> 16 | in[15] << 16);
1366 results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1367 _mm512_storeu_si512(out, results);
1368 out += 16;
1369
1370 in += 29;
1371
1372 return in;
1373 }
1374
unpack30_32_avx512(const uint32_t * in,uint32_t * out)1375 inline static const uint32_t* unpack30_32_avx512(const uint32_t* in, uint32_t* out) {
1376 uint32_t mask = 0x3fffffff;
1377 __m512i reg_shifts, reg_inls, reg_masks;
1378 __m512i results;
1379
1380 reg_masks = _mm512_set1_epi32(mask);
1381
1382 // shift the first 16 outs
1383 reg_shifts = _mm512_set_epi32(2, 0, 0, 0,
1384 0, 0, 0, 0,
1385 0, 0, 0, 0,
1386 0, 0, 0, 0);
1387 reg_inls = _mm512_set_epi32(in[14], in[13] >> 4 | in[14] << 28,
1388 in[12] >> 6 | in[13] << 26, in[11] >> 8 | in[12] << 24,
1389 in[10] >> 10 | in[11] << 22, in[9] >> 12 | in[10] << 20,
1390 in[8] >> 14 | in[9] << 18, in[7] >> 16 | in[8] << 16,
1391 in[6] >> 18 | in[7] << 14, in[5] >> 20 | in[6] << 12,
1392 in[4] >> 22 | in[5] << 10, in[3] >> 24 | in[4] << 8,
1393 in[2] >> 26 | in[3] << 6, in[1] >> 28 | in[2] << 4,
1394 in[0] >> 30 | in[1] << 2, in[0]);
1395 results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1396 _mm512_storeu_si512(out, results);
1397 out += 16;
1398
1399 // shift the second 16 outs
1400 reg_shifts = _mm512_set_epi32(2, 0, 0, 0,
1401 0, 0, 0, 0,
1402 0, 0, 0, 0,
1403 0, 0, 0, 0);
1404 reg_inls = _mm512_set_epi32(in[29], in[28] >> 4 | in[29] << 28,
1405 in[27] >> 6 | in[28] << 26, in[26] >> 8 | in[27] << 24,
1406 in[25] >> 10 | in[26] << 22, in[24] >> 12 | in[25] << 20,
1407 in[23] >> 14 | in[24] << 18, in[22] >> 16 | in[23] << 16,
1408 in[21] >> 18 | in[22] << 14, in[20] >> 20 | in[21] << 12,
1409 in[19] >> 22 | in[20] << 10, in[18] >> 24 | in[19] << 8,
1410 in[17] >> 26 | in[18] << 6, in[16] >> 28 | in[17] << 4,
1411 in[15] >> 30 | in[16] << 2, in[15]);
1412 results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1413 _mm512_storeu_si512(out, results);
1414 out += 16;
1415
1416 in += 30;
1417
1418 return in;
1419 }
1420
unpack31_32_avx512(const uint32_t * in,uint32_t * out)1421 inline static const uint32_t* unpack31_32_avx512(const uint32_t* in, uint32_t* out) {
1422 uint32_t mask = 0x7fffffff;
1423 __m512i reg_shifts, reg_inls, reg_masks;
1424 __m512i results;
1425
1426 reg_masks = _mm512_set1_epi32(mask);
1427
1428 // shift the first 16 outs
1429 reg_shifts = _mm512_set_epi32(0, 0, 0, 0,
1430 0, 0, 0, 0,
1431 0, 0, 0, 0,
1432 0, 0, 0, 0);
1433 reg_inls = _mm512_set_epi32(in[14] >> 17 | in[15] << 15, in[13] >> 18 | in[14] << 14,
1434 in[12] >> 19 | in[13] << 13, in[11] >> 20 | in[12] << 12,
1435 in[10] >> 21 | in[11] << 11, in[9] >> 22 | in[10] << 10,
1436 in[8] >> 23 | in[9] << 9, in[7] >> 24 | in[8] << 8,
1437 in[6] >> 25 | in[7] << 7, in[5] >> 26 | in[6] << 6,
1438 in[4] >> 27 | in[5] << 5, in[3] >> 28 | in[4] << 4,
1439 in[2] >> 29 | in[3] << 3, in[1] >> 30 | in[2] << 2,
1440 in[0] >> 31 | in[1] << 1, in[0]);
1441 results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1442 _mm512_storeu_si512(out, results);
1443 out += 16;
1444
1445 // shift the second 16 outs
1446 reg_shifts = _mm512_set_epi32(1, 0, 0, 0,
1447 0, 0, 0, 0,
1448 0, 0, 0, 0,
1449 0, 0, 0, 0);
1450 reg_inls = _mm512_set_epi32(in[30], in[29] >> 2 | in[30] << 30,
1451 in[28] >> 3 | in[29] << 29, in[27] >> 4 | in[28] << 28,
1452 in[26] >> 5 | in[27] << 27, in[25] >> 6 | in[26] << 26,
1453 in[24] >> 7 | in[25] << 25, in[23] >> 8 | in[24] << 24,
1454 in[22] >> 9 | in[23] << 23, in[21] >> 10 | in[22] << 22,
1455 in[20] >> 11 | in[21] << 21, in[19] >> 12 | in[20] << 20,
1456 in[18] >> 13 | in[19] << 19, in[17] >> 14 | in[18] << 18,
1457 in[16] >> 15 | in[17] << 17, in[15] >> 16 | in[16] << 16);
1458 results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1459 _mm512_storeu_si512(out, results);
1460 out += 16;
1461
1462 in += 31;
1463
1464 return in;
1465 }
1466
// Full-width case: the 32 values are already 32 bits wide, so in[0..31] is
// copied to out[0..31] unchanged.
//
// Returns the position just past the consumed input, i.e. in + 32.
inline static const uint32_t* unpack32_32_avx512(const uint32_t* in, uint32_t* out) {
  memcpy(out, in, 32 * sizeof(uint32_t));
  return in + 32;
}
1474
1475 } // namespace internal
1476 } // namespace arrow
1477