1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements.  See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership.  The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License.  You may obtain a copy of the License at
8 //
9 //   http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied.  See the License for the
15 // specific language governing permissions and limitations
16 // under the License.
17 //
18 // Automatically generated file; DO NOT EDIT.
19 
20 #pragma once
21 
22 #include <stdint.h>
23 #include <string.h>
24 
25 #ifdef _MSC_VER
26 #include <intrin.h>
27 #else
28 #include <immintrin.h>
29 #endif
30 
31 namespace arrow {
32 namespace internal {
33 
// Bit width 0: every one of the 32 output values is zero and no packed input
// is consumed.
//
// in:  packed input cursor; never dereferenced by this width.
// out: destination for 32 uint32_t values; all 32 are set to 0.
// Returns `in` unchanged, since zero-width values occupy no input words.
inline static const uint32_t* unpack0_32_avx512(const uint32_t* in, uint32_t* out) {
  // `out` is passed by value, so advancing it here would never be visible to
  // the caller; the block is simply cleared in place.
  memset(out, 0x0, 32 * sizeof(*out));

  return in;
}
40 
unpack1_32_avx512(const uint32_t * in,uint32_t * out)41 inline static const uint32_t* unpack1_32_avx512(const uint32_t* in, uint32_t* out) {
42   uint32_t mask = 0x1;
43   __m512i reg_shifts, reg_inls, reg_masks;
44   __m512i results;
45 
46   reg_masks = _mm512_set1_epi32(mask);
47 
48   // shift the first 16 outs
49   reg_shifts = _mm512_set_epi32(15, 14, 13, 12,
50                                 11, 10, 9, 8,
51                                 7, 6, 5, 4,
52                                 3, 2, 1, 0);
53   reg_inls = _mm512_set_epi32(in[0], in[0],
54                               in[0], in[0],
55                               in[0], in[0],
56                               in[0], in[0],
57                               in[0], in[0],
58                               in[0], in[0],
59                               in[0], in[0],
60                               in[0], in[0]);
61   results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
62   _mm512_storeu_si512(out, results);
63   out += 16;
64 
65   // shift the second 16 outs
66   reg_shifts = _mm512_set_epi32(31, 30, 29, 28,
67                                 27, 26, 25, 24,
68                                 23, 22, 21, 20,
69                                 19, 18, 17, 16);
70   reg_inls = _mm512_set_epi32(in[0], in[0],
71                               in[0], in[0],
72                               in[0], in[0],
73                               in[0], in[0],
74                               in[0], in[0],
75                               in[0], in[0],
76                               in[0], in[0],
77                               in[0], in[0]);
78   results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
79   _mm512_storeu_si512(out, results);
80   out += 16;
81 
82   in += 1;
83 
84   return in;
85 }
86 
unpack2_32_avx512(const uint32_t * in,uint32_t * out)87 inline static const uint32_t* unpack2_32_avx512(const uint32_t* in, uint32_t* out) {
88   uint32_t mask = 0x3;
89   __m512i reg_shifts, reg_inls, reg_masks;
90   __m512i results;
91 
92   reg_masks = _mm512_set1_epi32(mask);
93 
94   // shift the first 16 outs
95   reg_shifts = _mm512_set_epi32(30, 28, 26, 24,
96                                 22, 20, 18, 16,
97                                 14, 12, 10, 8,
98                                 6, 4, 2, 0);
99   reg_inls = _mm512_set_epi32(in[0], in[0],
100                               in[0], in[0],
101                               in[0], in[0],
102                               in[0], in[0],
103                               in[0], in[0],
104                               in[0], in[0],
105                               in[0], in[0],
106                               in[0], in[0]);
107   results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
108   _mm512_storeu_si512(out, results);
109   out += 16;
110 
111   // shift the second 16 outs
112   reg_shifts = _mm512_set_epi32(30, 28, 26, 24,
113                                 22, 20, 18, 16,
114                                 14, 12, 10, 8,
115                                 6, 4, 2, 0);
116   reg_inls = _mm512_set_epi32(in[1], in[1],
117                               in[1], in[1],
118                               in[1], in[1],
119                               in[1], in[1],
120                               in[1], in[1],
121                               in[1], in[1],
122                               in[1], in[1],
123                               in[1], in[1]);
124   results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
125   _mm512_storeu_si512(out, results);
126   out += 16;
127 
128   in += 2;
129 
130   return in;
131 }
132 
unpack3_32_avx512(const uint32_t * in,uint32_t * out)133 inline static const uint32_t* unpack3_32_avx512(const uint32_t* in, uint32_t* out) {
134   uint32_t mask = 0x7;
135   __m512i reg_shifts, reg_inls, reg_masks;
136   __m512i results;
137 
138   reg_masks = _mm512_set1_epi32(mask);
139 
140   // shift the first 16 outs
141   reg_shifts = _mm512_set_epi32(13, 10, 7, 4,
142                                 1, 0, 27, 24,
143                                 21, 18, 15, 12,
144                                 9, 6, 3, 0);
145   reg_inls = _mm512_set_epi32(in[1], in[1],
146                               in[1], in[1],
147                               in[1], in[0] >> 30 | in[1] << 2,
148                               in[0], in[0],
149                               in[0], in[0],
150                               in[0], in[0],
151                               in[0], in[0],
152                               in[0], in[0]);
153   results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
154   _mm512_storeu_si512(out, results);
155   out += 16;
156 
157   // shift the second 16 outs
158   reg_shifts = _mm512_set_epi32(29, 26, 23, 20,
159                                 17, 14, 11, 8,
160                                 5, 2, 0, 28,
161                                 25, 22, 19, 16);
162   reg_inls = _mm512_set_epi32(in[2], in[2],
163                               in[2], in[2],
164                               in[2], in[2],
165                               in[2], in[2],
166                               in[2], in[2],
167                               in[1] >> 31 | in[2] << 1, in[1],
168                               in[1], in[1],
169                               in[1], in[1]);
170   results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
171   _mm512_storeu_si512(out, results);
172   out += 16;
173 
174   in += 3;
175 
176   return in;
177 }
178 
unpack4_32_avx512(const uint32_t * in,uint32_t * out)179 inline static const uint32_t* unpack4_32_avx512(const uint32_t* in, uint32_t* out) {
180   uint32_t mask = 0xf;
181   __m512i reg_shifts, reg_inls, reg_masks;
182   __m512i results;
183 
184   reg_masks = _mm512_set1_epi32(mask);
185 
186   // shift the first 16 outs
187   reg_shifts = _mm512_set_epi32(28, 24, 20, 16,
188                                 12, 8, 4, 0,
189                                 28, 24, 20, 16,
190                                 12, 8, 4, 0);
191   reg_inls = _mm512_set_epi32(in[1], in[1],
192                               in[1], in[1],
193                               in[1], in[1],
194                               in[1], in[1],
195                               in[0], in[0],
196                               in[0], in[0],
197                               in[0], in[0],
198                               in[0], in[0]);
199   results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
200   _mm512_storeu_si512(out, results);
201   out += 16;
202 
203   // shift the second 16 outs
204   reg_shifts = _mm512_set_epi32(28, 24, 20, 16,
205                                 12, 8, 4, 0,
206                                 28, 24, 20, 16,
207                                 12, 8, 4, 0);
208   reg_inls = _mm512_set_epi32(in[3], in[3],
209                               in[3], in[3],
210                               in[3], in[3],
211                               in[3], in[3],
212                               in[2], in[2],
213                               in[2], in[2],
214                               in[2], in[2],
215                               in[2], in[2]);
216   results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
217   _mm512_storeu_si512(out, results);
218   out += 16;
219 
220   in += 4;
221 
222   return in;
223 }
224 
unpack5_32_avx512(const uint32_t * in,uint32_t * out)225 inline static const uint32_t* unpack5_32_avx512(const uint32_t* in, uint32_t* out) {
226   uint32_t mask = 0x1f;
227   __m512i reg_shifts, reg_inls, reg_masks;
228   __m512i results;
229 
230   reg_masks = _mm512_set1_epi32(mask);
231 
232   // shift the first 16 outs
233   reg_shifts = _mm512_set_epi32(11, 6, 1, 0,
234                                 23, 18, 13, 8,
235                                 3, 0, 25, 20,
236                                 15, 10, 5, 0);
237   reg_inls = _mm512_set_epi32(in[2], in[2],
238                               in[2], in[1] >> 28 | in[2] << 4,
239                               in[1], in[1],
240                               in[1], in[1],
241                               in[1], in[0] >> 30 | in[1] << 2,
242                               in[0], in[0],
243                               in[0], in[0],
244                               in[0], in[0]);
245   results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
246   _mm512_storeu_si512(out, results);
247   out += 16;
248 
249   // shift the second 16 outs
250   reg_shifts = _mm512_set_epi32(27, 22, 17, 12,
251                                 7, 2, 0, 24,
252                                 19, 14, 9, 4,
253                                 0, 26, 21, 16);
254   reg_inls = _mm512_set_epi32(in[4], in[4],
255                               in[4], in[4],
256                               in[4], in[4],
257                               in[3] >> 29 | in[4] << 3, in[3],
258                               in[3], in[3],
259                               in[3], in[3],
260                               in[2] >> 31 | in[3] << 1, in[2],
261                               in[2], in[2]);
262   results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
263   _mm512_storeu_si512(out, results);
264   out += 16;
265 
266   in += 5;
267 
268   return in;
269 }
270 
unpack6_32_avx512(const uint32_t * in,uint32_t * out)271 inline static const uint32_t* unpack6_32_avx512(const uint32_t* in, uint32_t* out) {
272   uint32_t mask = 0x3f;
273   __m512i reg_shifts, reg_inls, reg_masks;
274   __m512i results;
275 
276   reg_masks = _mm512_set1_epi32(mask);
277 
278   // shift the first 16 outs
279   reg_shifts = _mm512_set_epi32(26, 20, 14, 8,
280                                 2, 0, 22, 16,
281                                 10, 4, 0, 24,
282                                 18, 12, 6, 0);
283   reg_inls = _mm512_set_epi32(in[2], in[2],
284                               in[2], in[2],
285                               in[2], in[1] >> 28 | in[2] << 4,
286                               in[1], in[1],
287                               in[1], in[1],
288                               in[0] >> 30 | in[1] << 2, in[0],
289                               in[0], in[0],
290                               in[0], in[0]);
291   results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
292   _mm512_storeu_si512(out, results);
293   out += 16;
294 
295   // shift the second 16 outs
296   reg_shifts = _mm512_set_epi32(26, 20, 14, 8,
297                                 2, 0, 22, 16,
298                                 10, 4, 0, 24,
299                                 18, 12, 6, 0);
300   reg_inls = _mm512_set_epi32(in[5], in[5],
301                               in[5], in[5],
302                               in[5], in[4] >> 28 | in[5] << 4,
303                               in[4], in[4],
304                               in[4], in[4],
305                               in[3] >> 30 | in[4] << 2, in[3],
306                               in[3], in[3],
307                               in[3], in[3]);
308   results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
309   _mm512_storeu_si512(out, results);
310   out += 16;
311 
312   in += 6;
313 
314   return in;
315 }
316 
unpack7_32_avx512(const uint32_t * in,uint32_t * out)317 inline static const uint32_t* unpack7_32_avx512(const uint32_t* in, uint32_t* out) {
318   uint32_t mask = 0x7f;
319   __m512i reg_shifts, reg_inls, reg_masks;
320   __m512i results;
321 
322   reg_masks = _mm512_set1_epi32(mask);
323 
324   // shift the first 16 outs
325   reg_shifts = _mm512_set_epi32(9, 2, 0, 20,
326                                 13, 6, 0, 24,
327                                 17, 10, 3, 0,
328                                 21, 14, 7, 0);
329   reg_inls = _mm512_set_epi32(in[3], in[3],
330                               in[2] >> 27 | in[3] << 5, in[2],
331                               in[2], in[2],
332                               in[1] >> 31 | in[2] << 1, in[1],
333                               in[1], in[1],
334                               in[1], in[0] >> 28 | in[1] << 4,
335                               in[0], in[0],
336                               in[0], in[0]);
337   results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
338   _mm512_storeu_si512(out, results);
339   out += 16;
340 
341   // shift the second 16 outs
342   reg_shifts = _mm512_set_epi32(25, 18, 11, 4,
343                                 0, 22, 15, 8,
344                                 1, 0, 19, 12,
345                                 5, 0, 23, 16);
346   reg_inls = _mm512_set_epi32(in[6], in[6],
347                               in[6], in[6],
348                               in[5] >> 29 | in[6] << 3, in[5],
349                               in[5], in[5],
350                               in[5], in[4] >> 26 | in[5] << 6,
351                               in[4], in[4],
352                               in[4], in[3] >> 30 | in[4] << 2,
353                               in[3], in[3]);
354   results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
355   _mm512_storeu_si512(out, results);
356   out += 16;
357 
358   in += 7;
359 
360   return in;
361 }
362 
unpack8_32_avx512(const uint32_t * in,uint32_t * out)363 inline static const uint32_t* unpack8_32_avx512(const uint32_t* in, uint32_t* out) {
364   uint32_t mask = 0xff;
365   __m512i reg_shifts, reg_inls, reg_masks;
366   __m512i results;
367 
368   reg_masks = _mm512_set1_epi32(mask);
369 
370   // shift the first 16 outs
371   reg_shifts = _mm512_set_epi32(24, 16, 8, 0,
372                                 24, 16, 8, 0,
373                                 24, 16, 8, 0,
374                                 24, 16, 8, 0);
375   reg_inls = _mm512_set_epi32(in[3], in[3],
376                               in[3], in[3],
377                               in[2], in[2],
378                               in[2], in[2],
379                               in[1], in[1],
380                               in[1], in[1],
381                               in[0], in[0],
382                               in[0], in[0]);
383   results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
384   _mm512_storeu_si512(out, results);
385   out += 16;
386 
387   // shift the second 16 outs
388   reg_shifts = _mm512_set_epi32(24, 16, 8, 0,
389                                 24, 16, 8, 0,
390                                 24, 16, 8, 0,
391                                 24, 16, 8, 0);
392   reg_inls = _mm512_set_epi32(in[7], in[7],
393                               in[7], in[7],
394                               in[6], in[6],
395                               in[6], in[6],
396                               in[5], in[5],
397                               in[5], in[5],
398                               in[4], in[4],
399                               in[4], in[4]);
400   results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
401   _mm512_storeu_si512(out, results);
402   out += 16;
403 
404   in += 8;
405 
406   return in;
407 }
408 
unpack9_32_avx512(const uint32_t * in,uint32_t * out)409 inline static const uint32_t* unpack9_32_avx512(const uint32_t* in, uint32_t* out) {
410   uint32_t mask = 0x1ff;
411   __m512i reg_shifts, reg_inls, reg_masks;
412   __m512i results;
413 
414   reg_masks = _mm512_set1_epi32(mask);
415 
416   // shift the first 16 outs
417   reg_shifts = _mm512_set_epi32(7, 0, 21, 12,
418                                 3, 0, 17, 8,
419                                 0, 22, 13, 4,
420                                 0, 18, 9, 0);
421   reg_inls = _mm512_set_epi32(in[4], in[3] >> 30 | in[4] << 2,
422                               in[3], in[3],
423                               in[3], in[2] >> 26 | in[3] << 6,
424                               in[2], in[2],
425                               in[1] >> 31 | in[2] << 1, in[1],
426                               in[1], in[1],
427                               in[0] >> 27 | in[1] << 5, in[0],
428                               in[0], in[0]);
429   results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
430   _mm512_storeu_si512(out, results);
431   out += 16;
432 
433   // shift the second 16 outs
434   reg_shifts = _mm512_set_epi32(23, 14, 5, 0,
435                                 19, 10, 1, 0,
436                                 15, 6, 0, 20,
437                                 11, 2, 0, 16);
438   reg_inls = _mm512_set_epi32(in[8], in[8],
439                               in[8], in[7] >> 28 | in[8] << 4,
440                               in[7], in[7],
441                               in[7], in[6] >> 24 | in[7] << 8,
442                               in[6], in[6],
443                               in[5] >> 29 | in[6] << 3, in[5],
444                               in[5], in[5],
445                               in[4] >> 25 | in[5] << 7, in[4]);
446   results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
447   _mm512_storeu_si512(out, results);
448   out += 16;
449 
450   in += 9;
451 
452   return in;
453 }
454 
unpack10_32_avx512(const uint32_t * in,uint32_t * out)455 inline static const uint32_t* unpack10_32_avx512(const uint32_t* in, uint32_t* out) {
456   uint32_t mask = 0x3ff;
457   __m512i reg_shifts, reg_inls, reg_masks;
458   __m512i results;
459 
460   reg_masks = _mm512_set1_epi32(mask);
461 
462   // shift the first 16 outs
463   reg_shifts = _mm512_set_epi32(22, 12, 2, 0,
464                                 14, 4, 0, 16,
465                                 6, 0, 18, 8,
466                                 0, 20, 10, 0);
467   reg_inls = _mm512_set_epi32(in[4], in[4],
468                               in[4], in[3] >> 24 | in[4] << 8,
469                               in[3], in[3],
470                               in[2] >> 26 | in[3] << 6, in[2],
471                               in[2], in[1] >> 28 | in[2] << 4,
472                               in[1], in[1],
473                               in[0] >> 30 | in[1] << 2, in[0],
474                               in[0], in[0]);
475   results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
476   _mm512_storeu_si512(out, results);
477   out += 16;
478 
479   // shift the second 16 outs
480   reg_shifts = _mm512_set_epi32(22, 12, 2, 0,
481                                 14, 4, 0, 16,
482                                 6, 0, 18, 8,
483                                 0, 20, 10, 0);
484   reg_inls = _mm512_set_epi32(in[9], in[9],
485                               in[9], in[8] >> 24 | in[9] << 8,
486                               in[8], in[8],
487                               in[7] >> 26 | in[8] << 6, in[7],
488                               in[7], in[6] >> 28 | in[7] << 4,
489                               in[6], in[6],
490                               in[5] >> 30 | in[6] << 2, in[5],
491                               in[5], in[5]);
492   results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
493   _mm512_storeu_si512(out, results);
494   out += 16;
495 
496   in += 10;
497 
498   return in;
499 }
500 
unpack11_32_avx512(const uint32_t * in,uint32_t * out)501 inline static const uint32_t* unpack11_32_avx512(const uint32_t* in, uint32_t* out) {
502   uint32_t mask = 0x7ff;
503   __m512i reg_shifts, reg_inls, reg_masks;
504   __m512i results;
505 
506   reg_masks = _mm512_set1_epi32(mask);
507 
508   // shift the first 16 outs
509   reg_shifts = _mm512_set_epi32(5, 0, 15, 4,
510                                 0, 14, 3, 0,
511                                 13, 2, 0, 12,
512                                 1, 0, 11, 0);
513   reg_inls = _mm512_set_epi32(in[5], in[4] >> 26 | in[5] << 6,
514                               in[4], in[4],
515                               in[3] >> 25 | in[4] << 7, in[3],
516                               in[3], in[2] >> 24 | in[3] << 8,
517                               in[2], in[2],
518                               in[1] >> 23 | in[2] << 9, in[1],
519                               in[1], in[0] >> 22 | in[1] << 10,
520                               in[0], in[0]);
521   results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
522   _mm512_storeu_si512(out, results);
523   out += 16;
524 
525   // shift the second 16 outs
526   reg_shifts = _mm512_set_epi32(21, 10, 0, 20,
527                                 9, 0, 19, 8,
528                                 0, 18, 7, 0,
529                                 17, 6, 0, 16);
530   reg_inls = _mm512_set_epi32(in[10], in[10],
531                               in[9] >> 31 | in[10] << 1, in[9],
532                               in[9], in[8] >> 30 | in[9] << 2,
533                               in[8], in[8],
534                               in[7] >> 29 | in[8] << 3, in[7],
535                               in[7], in[6] >> 28 | in[7] << 4,
536                               in[6], in[6],
537                               in[5] >> 27 | in[6] << 5, in[5]);
538   results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
539   _mm512_storeu_si512(out, results);
540   out += 16;
541 
542   in += 11;
543 
544   return in;
545 }
546 
unpack12_32_avx512(const uint32_t * in,uint32_t * out)547 inline static const uint32_t* unpack12_32_avx512(const uint32_t* in, uint32_t* out) {
548   uint32_t mask = 0xfff;
549   __m512i reg_shifts, reg_inls, reg_masks;
550   __m512i results;
551 
552   reg_masks = _mm512_set1_epi32(mask);
553 
554   // shift the first 16 outs
555   reg_shifts = _mm512_set_epi32(20, 8, 0, 16,
556                                 4, 0, 12, 0,
557                                 20, 8, 0, 16,
558                                 4, 0, 12, 0);
559   reg_inls = _mm512_set_epi32(in[5], in[5],
560                               in[4] >> 28 | in[5] << 4, in[4],
561                               in[4], in[3] >> 24 | in[4] << 8,
562                               in[3], in[3],
563                               in[2], in[2],
564                               in[1] >> 28 | in[2] << 4, in[1],
565                               in[1], in[0] >> 24 | in[1] << 8,
566                               in[0], in[0]);
567   results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
568   _mm512_storeu_si512(out, results);
569   out += 16;
570 
571   // shift the second 16 outs
572   reg_shifts = _mm512_set_epi32(20, 8, 0, 16,
573                                 4, 0, 12, 0,
574                                 20, 8, 0, 16,
575                                 4, 0, 12, 0);
576   reg_inls = _mm512_set_epi32(in[11], in[11],
577                               in[10] >> 28 | in[11] << 4, in[10],
578                               in[10], in[9] >> 24 | in[10] << 8,
579                               in[9], in[9],
580                               in[8], in[8],
581                               in[7] >> 28 | in[8] << 4, in[7],
582                               in[7], in[6] >> 24 | in[7] << 8,
583                               in[6], in[6]);
584   results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
585   _mm512_storeu_si512(out, results);
586   out += 16;
587 
588   in += 12;
589 
590   return in;
591 }
592 
unpack13_32_avx512(const uint32_t * in,uint32_t * out)593 inline static const uint32_t* unpack13_32_avx512(const uint32_t* in, uint32_t* out) {
594   uint32_t mask = 0x1fff;
595   __m512i reg_shifts, reg_inls, reg_masks;
596   __m512i results;
597 
598   reg_masks = _mm512_set1_epi32(mask);
599 
600   // shift the first 16 outs
601   reg_shifts = _mm512_set_epi32(3, 0, 9, 0,
602                                 15, 2, 0, 8,
603                                 0, 14, 1, 0,
604                                 7, 0, 13, 0);
605   reg_inls = _mm512_set_epi32(in[6], in[5] >> 22 | in[6] << 10,
606                               in[5], in[4] >> 28 | in[5] << 4,
607                               in[4], in[4],
608                               in[3] >> 21 | in[4] << 11, in[3],
609                               in[2] >> 27 | in[3] << 5, in[2],
610                               in[2], in[1] >> 20 | in[2] << 12,
611                               in[1], in[0] >> 26 | in[1] << 6,
612                               in[0], in[0]);
613   results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
614   _mm512_storeu_si512(out, results);
615   out += 16;
616 
617   // shift the second 16 outs
618   reg_shifts = _mm512_set_epi32(19, 6, 0, 12,
619                                 0, 18, 5, 0,
620                                 11, 0, 17, 4,
621                                 0, 10, 0, 16);
622   reg_inls = _mm512_set_epi32(in[12], in[12],
623                               in[11] >> 25 | in[12] << 7, in[11],
624                               in[10] >> 31 | in[11] << 1, in[10],
625                               in[10], in[9] >> 24 | in[10] << 8,
626                               in[9], in[8] >> 30 | in[9] << 2,
627                               in[8], in[8],
628                               in[7] >> 23 | in[8] << 9, in[7],
629                               in[6] >> 29 | in[7] << 3, in[6]);
630   results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
631   _mm512_storeu_si512(out, results);
632   out += 16;
633 
634   in += 13;
635 
636   return in;
637 }
638 
unpack14_32_avx512(const uint32_t * in,uint32_t * out)639 inline static const uint32_t* unpack14_32_avx512(const uint32_t* in, uint32_t* out) {
640   uint32_t mask = 0x3fff;
641   __m512i reg_shifts, reg_inls, reg_masks;
642   __m512i results;
643 
644   reg_masks = _mm512_set1_epi32(mask);
645 
646   // shift the first 16 outs
647   reg_shifts = _mm512_set_epi32(18, 4, 0, 8,
648                                 0, 12, 0, 16,
649                                 2, 0, 6, 0,
650                                 10, 0, 14, 0);
651   reg_inls = _mm512_set_epi32(in[6], in[6],
652                               in[5] >> 22 | in[6] << 10, in[5],
653                               in[4] >> 26 | in[5] << 6, in[4],
654                               in[3] >> 30 | in[4] << 2, in[3],
655                               in[3], in[2] >> 20 | in[3] << 12,
656                               in[2], in[1] >> 24 | in[2] << 8,
657                               in[1], in[0] >> 28 | in[1] << 4,
658                               in[0], in[0]);
659   results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
660   _mm512_storeu_si512(out, results);
661   out += 16;
662 
663   // shift the second 16 outs
664   reg_shifts = _mm512_set_epi32(18, 4, 0, 8,
665                                 0, 12, 0, 16,
666                                 2, 0, 6, 0,
667                                 10, 0, 14, 0);
668   reg_inls = _mm512_set_epi32(in[13], in[13],
669                               in[12] >> 22 | in[13] << 10, in[12],
670                               in[11] >> 26 | in[12] << 6, in[11],
671                               in[10] >> 30 | in[11] << 2, in[10],
672                               in[10], in[9] >> 20 | in[10] << 12,
673                               in[9], in[8] >> 24 | in[9] << 8,
674                               in[8], in[7] >> 28 | in[8] << 4,
675                               in[7], in[7]);
676   results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
677   _mm512_storeu_si512(out, results);
678   out += 16;
679 
680   in += 14;
681 
682   return in;
683 }
684 
unpack15_32_avx512(const uint32_t * in,uint32_t * out)685 inline static const uint32_t* unpack15_32_avx512(const uint32_t* in, uint32_t* out) {
686   uint32_t mask = 0x7fff;
687   __m512i reg_shifts, reg_inls, reg_masks;
688   __m512i results;
689 
690   reg_masks = _mm512_set1_epi32(mask);
691 
692   // shift the first 16 outs
693   reg_shifts = _mm512_set_epi32(1, 0, 3, 0,
694                                 5, 0, 7, 0,
695                                 9, 0, 11, 0,
696                                 13, 0, 15, 0);
697   reg_inls = _mm512_set_epi32(in[7], in[6] >> 18 | in[7] << 14,
698                               in[6], in[5] >> 20 | in[6] << 12,
699                               in[5], in[4] >> 22 | in[5] << 10,
700                               in[4], in[3] >> 24 | in[4] << 8,
701                               in[3], in[2] >> 26 | in[3] << 6,
702                               in[2], in[1] >> 28 | in[2] << 4,
703                               in[1], in[0] >> 30 | in[1] << 2,
704                               in[0], in[0]);
705   results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
706   _mm512_storeu_si512(out, results);
707   out += 16;
708 
709   // shift the second 16 outs
710   reg_shifts = _mm512_set_epi32(17, 2, 0, 4,
711                                 0, 6, 0, 8,
712                                 0, 10, 0, 12,
713                                 0, 14, 0, 16);
714   reg_inls = _mm512_set_epi32(in[14], in[14],
715                               in[13] >> 19 | in[14] << 13, in[13],
716                               in[12] >> 21 | in[13] << 11, in[12],
717                               in[11] >> 23 | in[12] << 9, in[11],
718                               in[10] >> 25 | in[11] << 7, in[10],
719                               in[9] >> 27 | in[10] << 5, in[9],
720                               in[8] >> 29 | in[9] << 3, in[8],
721                               in[7] >> 31 | in[8] << 1, in[7]);
722   results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
723   _mm512_storeu_si512(out, results);
724   out += 16;
725 
726   in += 15;
727 
728   return in;
729 }
730 
unpack16_32_avx512(const uint32_t * in,uint32_t * out)731 inline static const uint32_t* unpack16_32_avx512(const uint32_t* in, uint32_t* out) {
732   uint32_t mask = 0xffff;
733   __m512i reg_shifts, reg_inls, reg_masks;
734   __m512i results;
735 
736   reg_masks = _mm512_set1_epi32(mask);
737 
738   // shift the first 16 outs
739   reg_shifts = _mm512_set_epi32(16, 0, 16, 0,
740                                 16, 0, 16, 0,
741                                 16, 0, 16, 0,
742                                 16, 0, 16, 0);
743   reg_inls = _mm512_set_epi32(in[7], in[7],
744                               in[6], in[6],
745                               in[5], in[5],
746                               in[4], in[4],
747                               in[3], in[3],
748                               in[2], in[2],
749                               in[1], in[1],
750                               in[0], in[0]);
751   results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
752   _mm512_storeu_si512(out, results);
753   out += 16;
754 
755   // shift the second 16 outs
756   reg_shifts = _mm512_set_epi32(16, 0, 16, 0,
757                                 16, 0, 16, 0,
758                                 16, 0, 16, 0,
759                                 16, 0, 16, 0);
760   reg_inls = _mm512_set_epi32(in[15], in[15],
761                               in[14], in[14],
762                               in[13], in[13],
763                               in[12], in[12],
764                               in[11], in[11],
765                               in[10], in[10],
766                               in[9], in[9],
767                               in[8], in[8]);
768   results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
769   _mm512_storeu_si512(out, results);
770   out += 16;
771 
772   in += 16;
773 
774   return in;
775 }
776 
unpack17_32_avx512(const uint32_t * in,uint32_t * out)777 inline static const uint32_t* unpack17_32_avx512(const uint32_t* in, uint32_t* out) {
778   uint32_t mask = 0x1ffff;
779   __m512i reg_shifts, reg_inls, reg_masks;
780   __m512i results;
781 
782   reg_masks = _mm512_set1_epi32(mask);
783 
784   // shift the first 16 outs
785   reg_shifts = _mm512_set_epi32(0, 14, 0, 12,
786                                 0, 10, 0, 8,
787                                 0, 6, 0, 4,
788                                 0, 2, 0, 0);
789   reg_inls = _mm512_set_epi32(in[7] >> 31 | in[8] << 1, in[7],
790                               in[6] >> 29 | in[7] << 3, in[6],
791                               in[5] >> 27 | in[6] << 5, in[5],
792                               in[4] >> 25 | in[5] << 7, in[4],
793                               in[3] >> 23 | in[4] << 9, in[3],
794                               in[2] >> 21 | in[3] << 11, in[2],
795                               in[1] >> 19 | in[2] << 13, in[1],
796                               in[0] >> 17 | in[1] << 15, in[0]);
797   results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
798   _mm512_storeu_si512(out, results);
799   out += 16;
800 
801   // shift the second 16 outs
802   reg_shifts = _mm512_set_epi32(15, 0, 13, 0,
803                                 11, 0, 9, 0,
804                                 7, 0, 5, 0,
805                                 3, 0, 1, 0);
806   reg_inls = _mm512_set_epi32(in[16], in[15] >> 30 | in[16] << 2,
807                               in[15], in[14] >> 28 | in[15] << 4,
808                               in[14], in[13] >> 26 | in[14] << 6,
809                               in[13], in[12] >> 24 | in[13] << 8,
810                               in[12], in[11] >> 22 | in[12] << 10,
811                               in[11], in[10] >> 20 | in[11] << 12,
812                               in[10], in[9] >> 18 | in[10] << 14,
813                               in[9], in[8] >> 16 | in[9] << 16);
814   results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
815   _mm512_storeu_si512(out, results);
816   out += 16;
817 
818   in += 17;
819 
820   return in;
821 }
822 
unpack18_32_avx512(const uint32_t * in,uint32_t * out)823 inline static const uint32_t* unpack18_32_avx512(const uint32_t* in, uint32_t* out) {
824   uint32_t mask = 0x3ffff;
825   __m512i reg_shifts, reg_inls, reg_masks;
826   __m512i results;
827 
828   reg_masks = _mm512_set1_epi32(mask);
829 
830   // shift the first 16 outs
831   reg_shifts = _mm512_set_epi32(14, 0, 10, 0,
832                                 6, 0, 2, 0,
833                                 0, 12, 0, 8,
834                                 0, 4, 0, 0);
835   reg_inls = _mm512_set_epi32(in[8], in[7] >> 28 | in[8] << 4,
836                               in[7], in[6] >> 24 | in[7] << 8,
837                               in[6], in[5] >> 20 | in[6] << 12,
838                               in[5], in[4] >> 16 | in[5] << 16,
839                               in[3] >> 30 | in[4] << 2, in[3],
840                               in[2] >> 26 | in[3] << 6, in[2],
841                               in[1] >> 22 | in[2] << 10, in[1],
842                               in[0] >> 18 | in[1] << 14, in[0]);
843   results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
844   _mm512_storeu_si512(out, results);
845   out += 16;
846 
847   // shift the second 16 outs
848   reg_shifts = _mm512_set_epi32(14, 0, 10, 0,
849                                 6, 0, 2, 0,
850                                 0, 12, 0, 8,
851                                 0, 4, 0, 0);
852   reg_inls = _mm512_set_epi32(in[17], in[16] >> 28 | in[17] << 4,
853                               in[16], in[15] >> 24 | in[16] << 8,
854                               in[15], in[14] >> 20 | in[15] << 12,
855                               in[14], in[13] >> 16 | in[14] << 16,
856                               in[12] >> 30 | in[13] << 2, in[12],
857                               in[11] >> 26 | in[12] << 6, in[11],
858                               in[10] >> 22 | in[11] << 10, in[10],
859                               in[9] >> 18 | in[10] << 14, in[9]);
860   results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
861   _mm512_storeu_si512(out, results);
862   out += 16;
863 
864   in += 18;
865 
866   return in;
867 }
868 
unpack19_32_avx512(const uint32_t * in,uint32_t * out)869 inline static const uint32_t* unpack19_32_avx512(const uint32_t* in, uint32_t* out) {
870   uint32_t mask = 0x7ffff;
871   __m512i reg_shifts, reg_inls, reg_masks;
872   __m512i results;
873 
874   reg_masks = _mm512_set1_epi32(mask);
875 
876   // shift the first 16 outs
877   reg_shifts = _mm512_set_epi32(0, 10, 0, 4,
878                                 0, 0, 11, 0,
879                                 5, 0, 0, 12,
880                                 0, 6, 0, 0);
881   reg_inls = _mm512_set_epi32(in[8] >> 29 | in[9] << 3, in[8],
882                               in[7] >> 23 | in[8] << 9, in[7],
883                               in[6] >> 17 | in[7] << 15, in[5] >> 30 | in[6] << 2,
884                               in[5], in[4] >> 24 | in[5] << 8,
885                               in[4], in[3] >> 18 | in[4] << 14,
886                               in[2] >> 31 | in[3] << 1, in[2],
887                               in[1] >> 25 | in[2] << 7, in[1],
888                               in[0] >> 19 | in[1] << 13, in[0]);
889   results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
890   _mm512_storeu_si512(out, results);
891   out += 16;
892 
893   // shift the second 16 outs
894   reg_shifts = _mm512_set_epi32(13, 0, 7, 0,
895                                 1, 0, 0, 8,
896                                 0, 2, 0, 0,
897                                 9, 0, 3, 0);
898   reg_inls = _mm512_set_epi32(in[18], in[17] >> 26 | in[18] << 6,
899                               in[17], in[16] >> 20 | in[17] << 12,
900                               in[16], in[15] >> 14 | in[16] << 18,
901                               in[14] >> 27 | in[15] << 5, in[14],
902                               in[13] >> 21 | in[14] << 11, in[13],
903                               in[12] >> 15 | in[13] << 17, in[11] >> 28 | in[12] << 4,
904                               in[11], in[10] >> 22 | in[11] << 10,
905                               in[10], in[9] >> 16 | in[10] << 16);
906   results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
907   _mm512_storeu_si512(out, results);
908   out += 16;
909 
910   in += 19;
911 
912   return in;
913 }
914 
unpack20_32_avx512(const uint32_t * in,uint32_t * out)915 inline static const uint32_t* unpack20_32_avx512(const uint32_t* in, uint32_t* out) {
916   uint32_t mask = 0xfffff;
917   __m512i reg_shifts, reg_inls, reg_masks;
918   __m512i results;
919 
920   reg_masks = _mm512_set1_epi32(mask);
921 
922   // shift the first 16 outs
923   reg_shifts = _mm512_set_epi32(12, 0, 4, 0,
924                                 0, 8, 0, 0,
925                                 12, 0, 4, 0,
926                                 0, 8, 0, 0);
927   reg_inls = _mm512_set_epi32(in[9], in[8] >> 24 | in[9] << 8,
928                               in[8], in[7] >> 16 | in[8] << 16,
929                               in[6] >> 28 | in[7] << 4, in[6],
930                               in[5] >> 20 | in[6] << 12, in[5],
931                               in[4], in[3] >> 24 | in[4] << 8,
932                               in[3], in[2] >> 16 | in[3] << 16,
933                               in[1] >> 28 | in[2] << 4, in[1],
934                               in[0] >> 20 | in[1] << 12, in[0]);
935   results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
936   _mm512_storeu_si512(out, results);
937   out += 16;
938 
939   // shift the second 16 outs
940   reg_shifts = _mm512_set_epi32(12, 0, 4, 0,
941                                 0, 8, 0, 0,
942                                 12, 0, 4, 0,
943                                 0, 8, 0, 0);
944   reg_inls = _mm512_set_epi32(in[19], in[18] >> 24 | in[19] << 8,
945                               in[18], in[17] >> 16 | in[18] << 16,
946                               in[16] >> 28 | in[17] << 4, in[16],
947                               in[15] >> 20 | in[16] << 12, in[15],
948                               in[14], in[13] >> 24 | in[14] << 8,
949                               in[13], in[12] >> 16 | in[13] << 16,
950                               in[11] >> 28 | in[12] << 4, in[11],
951                               in[10] >> 20 | in[11] << 12, in[10]);
952   results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
953   _mm512_storeu_si512(out, results);
954   out += 16;
955 
956   in += 20;
957 
958   return in;
959 }
960 
unpack21_32_avx512(const uint32_t * in,uint32_t * out)961 inline static const uint32_t* unpack21_32_avx512(const uint32_t* in, uint32_t* out) {
962   uint32_t mask = 0x1fffff;
963   __m512i reg_shifts, reg_inls, reg_masks;
964   __m512i results;
965 
966   reg_masks = _mm512_set1_epi32(mask);
967 
968   // shift the first 16 outs
969   reg_shifts = _mm512_set_epi32(0, 6, 0, 0,
970                                 7, 0, 0, 8,
971                                 0, 0, 9, 0,
972                                 0, 10, 0, 0);
973   reg_inls = _mm512_set_epi32(in[9] >> 27 | in[10] << 5, in[9],
974                               in[8] >> 17 | in[9] << 15, in[7] >> 28 | in[8] << 4,
975                               in[7], in[6] >> 18 | in[7] << 14,
976                               in[5] >> 29 | in[6] << 3, in[5],
977                               in[4] >> 19 | in[5] << 13, in[3] >> 30 | in[4] << 2,
978                               in[3], in[2] >> 20 | in[3] << 12,
979                               in[1] >> 31 | in[2] << 1, in[1],
980                               in[0] >> 21 | in[1] << 11, in[0]);
981   results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
982   _mm512_storeu_si512(out, results);
983   out += 16;
984 
985   // shift the second 16 outs
986   reg_shifts = _mm512_set_epi32(11, 0, 1, 0,
987                                 0, 2, 0, 0,
988                                 3, 0, 0, 4,
989                                 0, 0, 5, 0);
990   reg_inls = _mm512_set_epi32(in[20], in[19] >> 22 | in[20] << 10,
991                               in[19], in[18] >> 12 | in[19] << 20,
992                               in[17] >> 23 | in[18] << 9, in[17],
993                               in[16] >> 13 | in[17] << 19, in[15] >> 24 | in[16] << 8,
994                               in[15], in[14] >> 14 | in[15] << 18,
995                               in[13] >> 25 | in[14] << 7, in[13],
996                               in[12] >> 15 | in[13] << 17, in[11] >> 26 | in[12] << 6,
997                               in[11], in[10] >> 16 | in[11] << 16);
998   results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
999   _mm512_storeu_si512(out, results);
1000   out += 16;
1001 
1002   in += 21;
1003 
1004   return in;
1005 }
1006 
unpack22_32_avx512(const uint32_t * in,uint32_t * out)1007 inline static const uint32_t* unpack22_32_avx512(const uint32_t* in, uint32_t* out) {
1008   uint32_t mask = 0x3fffff;
1009   __m512i reg_shifts, reg_inls, reg_masks;
1010   __m512i results;
1011 
1012   reg_masks = _mm512_set1_epi32(mask);
1013 
1014   // shift the first 16 outs
1015   reg_shifts = _mm512_set_epi32(10, 0, 0, 8,
1016                                 0, 0, 6, 0,
1017                                 0, 4, 0, 0,
1018                                 2, 0, 0, 0);
1019   reg_inls = _mm512_set_epi32(in[10], in[9] >> 20 | in[10] << 12,
1020                               in[8] >> 30 | in[9] << 2, in[8],
1021                               in[7] >> 18 | in[8] << 14, in[6] >> 28 | in[7] << 4,
1022                               in[6], in[5] >> 16 | in[6] << 16,
1023                               in[4] >> 26 | in[5] << 6, in[4],
1024                               in[3] >> 14 | in[4] << 18, in[2] >> 24 | in[3] << 8,
1025                               in[2], in[1] >> 12 | in[2] << 20,
1026                               in[0] >> 22 | in[1] << 10, in[0]);
1027   results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1028   _mm512_storeu_si512(out, results);
1029   out += 16;
1030 
1031   // shift the second 16 outs
1032   reg_shifts = _mm512_set_epi32(10, 0, 0, 8,
1033                                 0, 0, 6, 0,
1034                                 0, 4, 0, 0,
1035                                 2, 0, 0, 0);
1036   reg_inls = _mm512_set_epi32(in[21], in[20] >> 20 | in[21] << 12,
1037                               in[19] >> 30 | in[20] << 2, in[19],
1038                               in[18] >> 18 | in[19] << 14, in[17] >> 28 | in[18] << 4,
1039                               in[17], in[16] >> 16 | in[17] << 16,
1040                               in[15] >> 26 | in[16] << 6, in[15],
1041                               in[14] >> 14 | in[15] << 18, in[13] >> 24 | in[14] << 8,
1042                               in[13], in[12] >> 12 | in[13] << 20,
1043                               in[11] >> 22 | in[12] << 10, in[11]);
1044   results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1045   _mm512_storeu_si512(out, results);
1046   out += 16;
1047 
1048   in += 22;
1049 
1050   return in;
1051 }
1052 
unpack23_32_avx512(const uint32_t * in,uint32_t * out)1053 inline static const uint32_t* unpack23_32_avx512(const uint32_t* in, uint32_t* out) {
1054   uint32_t mask = 0x7fffff;
1055   __m512i reg_shifts, reg_inls, reg_masks;
1056   __m512i results;
1057 
1058   reg_masks = _mm512_set1_epi32(mask);
1059 
1060   // shift the first 16 outs
1061   reg_shifts = _mm512_set_epi32(0, 2, 0, 0,
1062                                 0, 6, 0, 0,
1063                                 1, 0, 0, 0,
1064                                 5, 0, 0, 0);
1065   reg_inls = _mm512_set_epi32(in[10] >> 25 | in[11] << 7, in[10],
1066                               in[9] >> 11 | in[10] << 21, in[8] >> 20 | in[9] << 12,
1067                               in[7] >> 29 | in[8] << 3, in[7],
1068                               in[6] >> 15 | in[7] << 17, in[5] >> 24 | in[6] << 8,
1069                               in[5], in[4] >> 10 | in[5] << 22,
1070                               in[3] >> 19 | in[4] << 13, in[2] >> 28 | in[3] << 4,
1071                               in[2], in[1] >> 14 | in[2] << 18,
1072                               in[0] >> 23 | in[1] << 9, in[0]);
1073   results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1074   _mm512_storeu_si512(out, results);
1075   out += 16;
1076 
1077   // shift the second 16 outs
1078   reg_shifts = _mm512_set_epi32(9, 0, 0, 4,
1079                                 0, 0, 0, 8,
1080                                 0, 0, 3, 0,
1081                                 0, 0, 7, 0);
1082   reg_inls = _mm512_set_epi32(in[22], in[21] >> 18 | in[22] << 14,
1083                               in[20] >> 27 | in[21] << 5, in[20],
1084                               in[19] >> 13 | in[20] << 19, in[18] >> 22 | in[19] << 10,
1085                               in[17] >> 31 | in[18] << 1, in[17],
1086                               in[16] >> 17 | in[17] << 15, in[15] >> 26 | in[16] << 6,
1087                               in[15], in[14] >> 12 | in[15] << 20,
1088                               in[13] >> 21 | in[14] << 11, in[12] >> 30 | in[13] << 2,
1089                               in[12], in[11] >> 16 | in[12] << 16);
1090   results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1091   _mm512_storeu_si512(out, results);
1092   out += 16;
1093 
1094   in += 23;
1095 
1096   return in;
1097 }
1098 
unpack24_32_avx512(const uint32_t * in,uint32_t * out)1099 inline static const uint32_t* unpack24_32_avx512(const uint32_t* in, uint32_t* out) {
1100   uint32_t mask = 0xffffff;
1101   __m512i reg_shifts, reg_inls, reg_masks;
1102   __m512i results;
1103 
1104   reg_masks = _mm512_set1_epi32(mask);
1105 
1106   // shift the first 16 outs
1107   reg_shifts = _mm512_set_epi32(8, 0, 0, 0,
1108                                 8, 0, 0, 0,
1109                                 8, 0, 0, 0,
1110                                 8, 0, 0, 0);
1111   reg_inls = _mm512_set_epi32(in[11], in[10] >> 16 | in[11] << 16,
1112                               in[9] >> 24 | in[10] << 8, in[9],
1113                               in[8], in[7] >> 16 | in[8] << 16,
1114                               in[6] >> 24 | in[7] << 8, in[6],
1115                               in[5], in[4] >> 16 | in[5] << 16,
1116                               in[3] >> 24 | in[4] << 8, in[3],
1117                               in[2], in[1] >> 16 | in[2] << 16,
1118                               in[0] >> 24 | in[1] << 8, in[0]);
1119   results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1120   _mm512_storeu_si512(out, results);
1121   out += 16;
1122 
1123   // shift the second 16 outs
1124   reg_shifts = _mm512_set_epi32(8, 0, 0, 0,
1125                                 8, 0, 0, 0,
1126                                 8, 0, 0, 0,
1127                                 8, 0, 0, 0);
1128   reg_inls = _mm512_set_epi32(in[23], in[22] >> 16 | in[23] << 16,
1129                               in[21] >> 24 | in[22] << 8, in[21],
1130                               in[20], in[19] >> 16 | in[20] << 16,
1131                               in[18] >> 24 | in[19] << 8, in[18],
1132                               in[17], in[16] >> 16 | in[17] << 16,
1133                               in[15] >> 24 | in[16] << 8, in[15],
1134                               in[14], in[13] >> 16 | in[14] << 16,
1135                               in[12] >> 24 | in[13] << 8, in[12]);
1136   results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1137   _mm512_storeu_si512(out, results);
1138   out += 16;
1139 
1140   in += 24;
1141 
1142   return in;
1143 }
1144 
unpack25_32_avx512(const uint32_t * in,uint32_t * out)1145 inline static const uint32_t* unpack25_32_avx512(const uint32_t* in, uint32_t* out) {
1146   uint32_t mask = 0x1ffffff;
1147   __m512i reg_shifts, reg_inls, reg_masks;
1148   __m512i results;
1149 
1150   reg_masks = _mm512_set1_epi32(mask);
1151 
1152   // shift the first 16 outs
1153   reg_shifts = _mm512_set_epi32(0, 0, 5, 0,
1154                                 0, 0, 1, 0,
1155                                 0, 0, 0, 4,
1156                                 0, 0, 0, 0);
1157   reg_inls = _mm512_set_epi32(in[11] >> 23 | in[12] << 9, in[10] >> 30 | in[11] << 2,
1158                               in[10], in[9] >> 12 | in[10] << 20,
1159                               in[8] >> 19 | in[9] << 13, in[7] >> 26 | in[8] << 6,
1160                               in[7], in[6] >> 8 | in[7] << 24,
1161                               in[5] >> 15 | in[6] << 17, in[4] >> 22 | in[5] << 10,
1162                               in[3] >> 29 | in[4] << 3, in[3],
1163                               in[2] >> 11 | in[3] << 21, in[1] >> 18 | in[2] << 14,
1164                               in[0] >> 25 | in[1] << 7, in[0]);
1165   results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1166   _mm512_storeu_si512(out, results);
1167   out += 16;
1168 
1169   // shift the second 16 outs
1170   reg_shifts = _mm512_set_epi32(7, 0, 0, 0,
1171                                 3, 0, 0, 0,
1172                                 0, 6, 0, 0,
1173                                 0, 2, 0, 0);
1174   reg_inls = _mm512_set_epi32(in[24], in[23] >> 14 | in[24] << 18,
1175                               in[22] >> 21 | in[23] << 11, in[21] >> 28 | in[22] << 4,
1176                               in[21], in[20] >> 10 | in[21] << 22,
1177                               in[19] >> 17 | in[20] << 15, in[18] >> 24 | in[19] << 8,
1178                               in[17] >> 31 | in[18] << 1, in[17],
1179                               in[16] >> 13 | in[17] << 19, in[15] >> 20 | in[16] << 12,
1180                               in[14] >> 27 | in[15] << 5, in[14],
1181                               in[13] >> 9 | in[14] << 23, in[12] >> 16 | in[13] << 16);
1182   results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1183   _mm512_storeu_si512(out, results);
1184   out += 16;
1185 
1186   in += 25;
1187 
1188   return in;
1189 }
1190 
unpack26_32_avx512(const uint32_t * in,uint32_t * out)1191 inline static const uint32_t* unpack26_32_avx512(const uint32_t* in, uint32_t* out) {
1192   uint32_t mask = 0x3ffffff;
1193   __m512i reg_shifts, reg_inls, reg_masks;
1194   __m512i results;
1195 
1196   reg_masks = _mm512_set1_epi32(mask);
1197 
1198   // shift the first 16 outs
1199   reg_shifts = _mm512_set_epi32(6, 0, 0, 0,
1200                                 0, 4, 0, 0,
1201                                 0, 0, 2, 0,
1202                                 0, 0, 0, 0);
1203   reg_inls = _mm512_set_epi32(in[12], in[11] >> 12 | in[12] << 20,
1204                               in[10] >> 18 | in[11] << 14, in[9] >> 24 | in[10] << 8,
1205                               in[8] >> 30 | in[9] << 2, in[8],
1206                               in[7] >> 10 | in[8] << 22, in[6] >> 16 | in[7] << 16,
1207                               in[5] >> 22 | in[6] << 10, in[4] >> 28 | in[5] << 4,
1208                               in[4], in[3] >> 8 | in[4] << 24,
1209                               in[2] >> 14 | in[3] << 18, in[1] >> 20 | in[2] << 12,
1210                               in[0] >> 26 | in[1] << 6, in[0]);
1211   results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1212   _mm512_storeu_si512(out, results);
1213   out += 16;
1214 
1215   // shift the second 16 outs
1216   reg_shifts = _mm512_set_epi32(6, 0, 0, 0,
1217                                 0, 4, 0, 0,
1218                                 0, 0, 2, 0,
1219                                 0, 0, 0, 0);
1220   reg_inls = _mm512_set_epi32(in[25], in[24] >> 12 | in[25] << 20,
1221                               in[23] >> 18 | in[24] << 14, in[22] >> 24 | in[23] << 8,
1222                               in[21] >> 30 | in[22] << 2, in[21],
1223                               in[20] >> 10 | in[21] << 22, in[19] >> 16 | in[20] << 16,
1224                               in[18] >> 22 | in[19] << 10, in[17] >> 28 | in[18] << 4,
1225                               in[17], in[16] >> 8 | in[17] << 24,
1226                               in[15] >> 14 | in[16] << 18, in[14] >> 20 | in[15] << 12,
1227                               in[13] >> 26 | in[14] << 6, in[13]);
1228   results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1229   _mm512_storeu_si512(out, results);
1230   out += 16;
1231 
1232   in += 26;
1233 
1234   return in;
1235 }
1236 
unpack27_32_avx512(const uint32_t * in,uint32_t * out)1237 inline static const uint32_t* unpack27_32_avx512(const uint32_t* in, uint32_t* out) {
1238   uint32_t mask = 0x7ffffff;
1239   __m512i reg_shifts, reg_inls, reg_masks;
1240   __m512i results;
1241 
1242   reg_masks = _mm512_set1_epi32(mask);
1243 
1244   // shift the first 16 outs
1245   reg_shifts = _mm512_set_epi32(0, 0, 0, 4,
1246                                 0, 0, 0, 0,
1247                                 0, 2, 0, 0,
1248                                 0, 0, 0, 0);
1249   reg_inls = _mm512_set_epi32(in[12] >> 21 | in[13] << 11, in[11] >> 26 | in[12] << 6,
1250                               in[10] >> 31 | in[11] << 1, in[10],
1251                               in[9] >> 9 | in[10] << 23, in[8] >> 14 | in[9] << 18,
1252                               in[7] >> 19 | in[8] << 13, in[6] >> 24 | in[7] << 8,
1253                               in[5] >> 29 | in[6] << 3, in[5],
1254                               in[4] >> 7 | in[5] << 25, in[3] >> 12 | in[4] << 20,
1255                               in[2] >> 17 | in[3] << 15, in[1] >> 22 | in[2] << 10,
1256                               in[0] >> 27 | in[1] << 5, in[0]);
1257   results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1258   _mm512_storeu_si512(out, results);
1259   out += 16;
1260 
1261   // shift the second 16 outs
1262   reg_shifts = _mm512_set_epi32(5, 0, 0, 0,
1263                                 0, 0, 3, 0,
1264                                 0, 0, 0, 0,
1265                                 1, 0, 0, 0);
1266   reg_inls = _mm512_set_epi32(in[26], in[25] >> 10 | in[26] << 22,
1267                               in[24] >> 15 | in[25] << 17, in[23] >> 20 | in[24] << 12,
1268                               in[22] >> 25 | in[23] << 7, in[21] >> 30 | in[22] << 2,
1269                               in[21], in[20] >> 8 | in[21] << 24,
1270                               in[19] >> 13 | in[20] << 19, in[18] >> 18 | in[19] << 14,
1271                               in[17] >> 23 | in[18] << 9, in[16] >> 28 | in[17] << 4,
1272                               in[16], in[15] >> 6 | in[16] << 26,
1273                               in[14] >> 11 | in[15] << 21, in[13] >> 16 | in[14] << 16);
1274   results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1275   _mm512_storeu_si512(out, results);
1276   out += 16;
1277 
1278   in += 27;
1279 
1280   return in;
1281 }
1282 
unpack28_32_avx512(const uint32_t * in,uint32_t * out)1283 inline static const uint32_t* unpack28_32_avx512(const uint32_t* in, uint32_t* out) {
1284   uint32_t mask = 0xfffffff;
1285   __m512i reg_shifts, reg_inls, reg_masks;
1286   __m512i results;
1287 
1288   reg_masks = _mm512_set1_epi32(mask);
1289 
1290   // shift the first 16 outs
1291   reg_shifts = _mm512_set_epi32(4, 0, 0, 0,
1292                                 0, 0, 0, 0,
1293                                 4, 0, 0, 0,
1294                                 0, 0, 0, 0);
1295   reg_inls = _mm512_set_epi32(in[13], in[12] >> 8 | in[13] << 24,
1296                               in[11] >> 12 | in[12] << 20, in[10] >> 16 | in[11] << 16,
1297                               in[9] >> 20 | in[10] << 12, in[8] >> 24 | in[9] << 8,
1298                               in[7] >> 28 | in[8] << 4, in[7],
1299                               in[6], in[5] >> 8 | in[6] << 24,
1300                               in[4] >> 12 | in[5] << 20, in[3] >> 16 | in[4] << 16,
1301                               in[2] >> 20 | in[3] << 12, in[1] >> 24 | in[2] << 8,
1302                               in[0] >> 28 | in[1] << 4, in[0]);
1303   results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1304   _mm512_storeu_si512(out, results);
1305   out += 16;
1306 
1307   // shift the second 16 outs
1308   reg_shifts = _mm512_set_epi32(4, 0, 0, 0,
1309                                 0, 0, 0, 0,
1310                                 4, 0, 0, 0,
1311                                 0, 0, 0, 0);
1312   reg_inls = _mm512_set_epi32(in[27], in[26] >> 8 | in[27] << 24,
1313                               in[25] >> 12 | in[26] << 20, in[24] >> 16 | in[25] << 16,
1314                               in[23] >> 20 | in[24] << 12, in[22] >> 24 | in[23] << 8,
1315                               in[21] >> 28 | in[22] << 4, in[21],
1316                               in[20], in[19] >> 8 | in[20] << 24,
1317                               in[18] >> 12 | in[19] << 20, in[17] >> 16 | in[18] << 16,
1318                               in[16] >> 20 | in[17] << 12, in[15] >> 24 | in[16] << 8,
1319                               in[14] >> 28 | in[15] << 4, in[14]);
1320   results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1321   _mm512_storeu_si512(out, results);
1322   out += 16;
1323 
1324   in += 28;
1325 
1326   return in;
1327 }
1328 
unpack29_32_avx512(const uint32_t * in,uint32_t * out)1329 inline static const uint32_t* unpack29_32_avx512(const uint32_t* in, uint32_t* out) {
1330   uint32_t mask = 0x1fffffff;
1331   __m512i reg_shifts, reg_inls, reg_masks;
1332   __m512i results;
1333 
1334   reg_masks = _mm512_set1_epi32(mask);
1335 
1336   // shift the first 16 outs
1337   reg_shifts = _mm512_set_epi32(0, 0, 0, 0,
1338                                 0, 2, 0, 0,
1339                                 0, 0, 0, 0,
1340                                 0, 0, 0, 0);
1341   reg_inls = _mm512_set_epi32(in[13] >> 19 | in[14] << 13, in[12] >> 22 | in[13] << 10,
1342                               in[11] >> 25 | in[12] << 7, in[10] >> 28 | in[11] << 4,
1343                               in[9] >> 31 | in[10] << 1, in[9],
1344                               in[8] >> 5 | in[9] << 27, in[7] >> 8 | in[8] << 24,
1345                               in[6] >> 11 | in[7] << 21, in[5] >> 14 | in[6] << 18,
1346                               in[4] >> 17 | in[5] << 15, in[3] >> 20 | in[4] << 12,
1347                               in[2] >> 23 | in[3] << 9, in[1] >> 26 | in[2] << 6,
1348                               in[0] >> 29 | in[1] << 3, in[0]);
1349   results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1350   _mm512_storeu_si512(out, results);
1351   out += 16;
1352 
1353   // shift the second 16 outs
1354   reg_shifts = _mm512_set_epi32(3, 0, 0, 0,
1355                                 0, 0, 0, 0,
1356                                 0, 0, 1, 0,
1357                                 0, 0, 0, 0);
1358   reg_inls = _mm512_set_epi32(in[28], in[27] >> 6 | in[28] << 26,
1359                               in[26] >> 9 | in[27] << 23, in[25] >> 12 | in[26] << 20,
1360                               in[24] >> 15 | in[25] << 17, in[23] >> 18 | in[24] << 14,
1361                               in[22] >> 21 | in[23] << 11, in[21] >> 24 | in[22] << 8,
1362                               in[20] >> 27 | in[21] << 5, in[19] >> 30 | in[20] << 2,
1363                               in[19], in[18] >> 4 | in[19] << 28,
1364                               in[17] >> 7 | in[18] << 25, in[16] >> 10 | in[17] << 22,
1365                               in[15] >> 13 | in[16] << 19, in[14] >> 16 | in[15] << 16);
1366   results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1367   _mm512_storeu_si512(out, results);
1368   out += 16;
1369 
1370   in += 29;
1371 
1372   return in;
1373 }
1374 
unpack30_32_avx512(const uint32_t * in,uint32_t * out)1375 inline static const uint32_t* unpack30_32_avx512(const uint32_t* in, uint32_t* out) {
1376   uint32_t mask = 0x3fffffff;
1377   __m512i reg_shifts, reg_inls, reg_masks;
1378   __m512i results;
1379 
1380   reg_masks = _mm512_set1_epi32(mask);
1381 
1382   // shift the first 16 outs
1383   reg_shifts = _mm512_set_epi32(2, 0, 0, 0,
1384                                 0, 0, 0, 0,
1385                                 0, 0, 0, 0,
1386                                 0, 0, 0, 0);
1387   reg_inls = _mm512_set_epi32(in[14], in[13] >> 4 | in[14] << 28,
1388                               in[12] >> 6 | in[13] << 26, in[11] >> 8 | in[12] << 24,
1389                               in[10] >> 10 | in[11] << 22, in[9] >> 12 | in[10] << 20,
1390                               in[8] >> 14 | in[9] << 18, in[7] >> 16 | in[8] << 16,
1391                               in[6] >> 18 | in[7] << 14, in[5] >> 20 | in[6] << 12,
1392                               in[4] >> 22 | in[5] << 10, in[3] >> 24 | in[4] << 8,
1393                               in[2] >> 26 | in[3] << 6, in[1] >> 28 | in[2] << 4,
1394                               in[0] >> 30 | in[1] << 2, in[0]);
1395   results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1396   _mm512_storeu_si512(out, results);
1397   out += 16;
1398 
1399   // shift the second 16 outs
1400   reg_shifts = _mm512_set_epi32(2, 0, 0, 0,
1401                                 0, 0, 0, 0,
1402                                 0, 0, 0, 0,
1403                                 0, 0, 0, 0);
1404   reg_inls = _mm512_set_epi32(in[29], in[28] >> 4 | in[29] << 28,
1405                               in[27] >> 6 | in[28] << 26, in[26] >> 8 | in[27] << 24,
1406                               in[25] >> 10 | in[26] << 22, in[24] >> 12 | in[25] << 20,
1407                               in[23] >> 14 | in[24] << 18, in[22] >> 16 | in[23] << 16,
1408                               in[21] >> 18 | in[22] << 14, in[20] >> 20 | in[21] << 12,
1409                               in[19] >> 22 | in[20] << 10, in[18] >> 24 | in[19] << 8,
1410                               in[17] >> 26 | in[18] << 6, in[16] >> 28 | in[17] << 4,
1411                               in[15] >> 30 | in[16] << 2, in[15]);
1412   results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1413   _mm512_storeu_si512(out, results);
1414   out += 16;
1415 
1416   in += 30;
1417 
1418   return in;
1419 }
1420 
unpack31_32_avx512(const uint32_t * in,uint32_t * out)1421 inline static const uint32_t* unpack31_32_avx512(const uint32_t* in, uint32_t* out) {
1422   uint32_t mask = 0x7fffffff;
1423   __m512i reg_shifts, reg_inls, reg_masks;
1424   __m512i results;
1425 
1426   reg_masks = _mm512_set1_epi32(mask);
1427 
1428   // shift the first 16 outs
1429   reg_shifts = _mm512_set_epi32(0, 0, 0, 0,
1430                                 0, 0, 0, 0,
1431                                 0, 0, 0, 0,
1432                                 0, 0, 0, 0);
1433   reg_inls = _mm512_set_epi32(in[14] >> 17 | in[15] << 15, in[13] >> 18 | in[14] << 14,
1434                               in[12] >> 19 | in[13] << 13, in[11] >> 20 | in[12] << 12,
1435                               in[10] >> 21 | in[11] << 11, in[9] >> 22 | in[10] << 10,
1436                               in[8] >> 23 | in[9] << 9, in[7] >> 24 | in[8] << 8,
1437                               in[6] >> 25 | in[7] << 7, in[5] >> 26 | in[6] << 6,
1438                               in[4] >> 27 | in[5] << 5, in[3] >> 28 | in[4] << 4,
1439                               in[2] >> 29 | in[3] << 3, in[1] >> 30 | in[2] << 2,
1440                               in[0] >> 31 | in[1] << 1, in[0]);
1441   results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1442   _mm512_storeu_si512(out, results);
1443   out += 16;
1444 
1445   // shift the second 16 outs
1446   reg_shifts = _mm512_set_epi32(1, 0, 0, 0,
1447                                 0, 0, 0, 0,
1448                                 0, 0, 0, 0,
1449                                 0, 0, 0, 0);
1450   reg_inls = _mm512_set_epi32(in[30], in[29] >> 2 | in[30] << 30,
1451                               in[28] >> 3 | in[29] << 29, in[27] >> 4 | in[28] << 28,
1452                               in[26] >> 5 | in[27] << 27, in[25] >> 6 | in[26] << 26,
1453                               in[24] >> 7 | in[25] << 25, in[23] >> 8 | in[24] << 24,
1454                               in[22] >> 9 | in[23] << 23, in[21] >> 10 | in[22] << 22,
1455                               in[20] >> 11 | in[21] << 21, in[19] >> 12 | in[20] << 20,
1456                               in[18] >> 13 | in[19] << 19, in[17] >> 14 | in[18] << 18,
1457                               in[16] >> 15 | in[17] << 17, in[15] >> 16 | in[16] << 16);
1458   results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1459   _mm512_storeu_si512(out, results);
1460   out += 16;
1461 
1462   in += 31;
1463 
1464   return in;
1465 }
1466 
// Unpacks 32 values of 32 bits each. At full width the packed and unpacked
// layouts coincide, so this is a straight copy of in[0..31] to out[0..31].
// Returns in + 32.
inline static const uint32_t* unpack32_32_avx512(const uint32_t* in, uint32_t* out) {
  const size_t kBytes = 32 * sizeof(uint32_t);
  memcpy(out, in, kBytes);
  return in + 32;
}
1474 
1475 }  // namespace internal
1476 }  // namespace arrow
1477