1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements. See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership. The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License. You may obtain a copy of the License at
8 //
9 // http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied. See the License for the
15 // specific language governing permissions and limitations
16 // under the License.
17
18 // Automatically generated file; DO NOT EDIT.
19
20 #pragma once
21
22 #include <cstdint>
23 #include <cstring>
24
25 #include <xsimd/xsimd.hpp>
26
27 #include "arrow/util/dispatch.h"
28 #include "arrow/util/ubsan.h"
29
30 namespace arrow {
31 namespace internal {
32 namespace {
33
34 using ::arrow::util::SafeLoad;
35
36 template <DispatchLevel level>
37 struct UnpackBits512 {
38
39 using simd_arch = xsimd::avx512bw;
40 using simd_batch = xsimd::batch<uint32_t, simd_arch>;
41
unpack0_32UnpackBits51242 inline static const uint32_t* unpack0_32(const uint32_t* in, uint32_t* out) {
43 memset(out, 0x0, 32 * sizeof(*out));
44 out += 32;
45
46 return in;
47 }
48
unpack1_32UnpackBits51249 inline static const uint32_t* unpack1_32(const uint32_t* in, uint32_t* out) {
50 uint32_t mask = 0x1;
51
52 simd_batch masks(mask);
53 simd_batch words, shifts;
54 simd_batch results;
55
56 // extract 1-bit bundles 0 to 15
57 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) };
58 shifts = simd_batch{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
59 results = (words >> shifts) & masks;
60 results.store_unaligned(out);
61 out += 16;
62
63 // extract 1-bit bundles 16 to 31
64 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) };
65 shifts = simd_batch{ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 };
66 results = (words >> shifts) & masks;
67 results.store_unaligned(out);
68 out += 16;
69
70 in += 1;
71 return in;
72 }
73
unpack2_32UnpackBits51274 inline static const uint32_t* unpack2_32(const uint32_t* in, uint32_t* out) {
75 uint32_t mask = 0x3;
76
77 simd_batch masks(mask);
78 simd_batch words, shifts;
79 simd_batch results;
80
81 // extract 2-bit bundles 0 to 15
82 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) };
83 shifts = simd_batch{ 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 };
84 results = (words >> shifts) & masks;
85 results.store_unaligned(out);
86 out += 16;
87
88 // extract 2-bit bundles 16 to 31
89 words = simd_batch{ SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1) };
90 shifts = simd_batch{ 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 };
91 results = (words >> shifts) & masks;
92 results.store_unaligned(out);
93 out += 16;
94
95 in += 2;
96 return in;
97 }
98
unpack3_32UnpackBits51299 inline static const uint32_t* unpack3_32(const uint32_t* in, uint32_t* out) {
100 uint32_t mask = 0x7;
101
102 simd_batch masks(mask);
103 simd_batch words, shifts;
104 simd_batch results;
105
106 // extract 3-bit bundles 0 to 15
107 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) >> 30 | SafeLoad<uint32_t>(in + 1) << 2, SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1) };
108 shifts = simd_batch{ 0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 0, 1, 4, 7, 10, 13 };
109 results = (words >> shifts) & masks;
110 results.store_unaligned(out);
111 out += 16;
112
113 // extract 3-bit bundles 16 to 31
114 words = simd_batch{ SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1) >> 31 | SafeLoad<uint32_t>(in + 2) << 1, SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2) };
115 shifts = simd_batch{ 16, 19, 22, 25, 28, 0, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29 };
116 results = (words >> shifts) & masks;
117 results.store_unaligned(out);
118 out += 16;
119
120 in += 3;
121 return in;
122 }
123
unpack4_32UnpackBits512124 inline static const uint32_t* unpack4_32(const uint32_t* in, uint32_t* out) {
125 uint32_t mask = 0xf;
126
127 simd_batch masks(mask);
128 simd_batch words, shifts;
129 simd_batch results;
130
131 // extract 4-bit bundles 0 to 15
132 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1) };
133 shifts = simd_batch{ 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28 };
134 results = (words >> shifts) & masks;
135 results.store_unaligned(out);
136 out += 16;
137
138 // extract 4-bit bundles 16 to 31
139 words = simd_batch{ SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3) };
140 shifts = simd_batch{ 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28 };
141 results = (words >> shifts) & masks;
142 results.store_unaligned(out);
143 out += 16;
144
145 in += 4;
146 return in;
147 }
148
unpack5_32UnpackBits512149 inline static const uint32_t* unpack5_32(const uint32_t* in, uint32_t* out) {
150 uint32_t mask = 0x1f;
151
152 simd_batch masks(mask);
153 simd_batch words, shifts;
154 simd_batch results;
155
156 // extract 5-bit bundles 0 to 15
157 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) >> 30 | SafeLoad<uint32_t>(in + 1) << 2, SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1) >> 28 | SafeLoad<uint32_t>(in + 2) << 4, SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2) };
158 shifts = simd_batch{ 0, 5, 10, 15, 20, 25, 0, 3, 8, 13, 18, 23, 0, 1, 6, 11 };
159 results = (words >> shifts) & masks;
160 results.store_unaligned(out);
161 out += 16;
162
163 // extract 5-bit bundles 16 to 31
164 words = simd_batch{ SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2) >> 31 | SafeLoad<uint32_t>(in + 3) << 1, SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3) >> 29 | SafeLoad<uint32_t>(in + 4) << 3, SafeLoad<uint32_t>(in + 4), SafeLoad<uint32_t>(in + 4), SafeLoad<uint32_t>(in + 4), SafeLoad<uint32_t>(in + 4), SafeLoad<uint32_t>(in + 4), SafeLoad<uint32_t>(in + 4) };
165 shifts = simd_batch{ 16, 21, 26, 0, 4, 9, 14, 19, 24, 0, 2, 7, 12, 17, 22, 27 };
166 results = (words >> shifts) & masks;
167 results.store_unaligned(out);
168 out += 16;
169
170 in += 5;
171 return in;
172 }
173
unpack6_32UnpackBits512174 inline static const uint32_t* unpack6_32(const uint32_t* in, uint32_t* out) {
175 uint32_t mask = 0x3f;
176
177 simd_batch masks(mask);
178 simd_batch words, shifts;
179 simd_batch results;
180
181 // extract 6-bit bundles 0 to 15
182 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) >> 30 | SafeLoad<uint32_t>(in + 1) << 2, SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1) >> 28 | SafeLoad<uint32_t>(in + 2) << 4, SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2) };
183 shifts = simd_batch{ 0, 6, 12, 18, 24, 0, 4, 10, 16, 22, 0, 2, 8, 14, 20, 26 };
184 results = (words >> shifts) & masks;
185 results.store_unaligned(out);
186 out += 16;
187
188 // extract 6-bit bundles 16 to 31
189 words = simd_batch{ SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3) >> 30 | SafeLoad<uint32_t>(in + 4) << 2, SafeLoad<uint32_t>(in + 4), SafeLoad<uint32_t>(in + 4), SafeLoad<uint32_t>(in + 4), SafeLoad<uint32_t>(in + 4), SafeLoad<uint32_t>(in + 4) >> 28 | SafeLoad<uint32_t>(in + 5) << 4, SafeLoad<uint32_t>(in + 5), SafeLoad<uint32_t>(in + 5), SafeLoad<uint32_t>(in + 5), SafeLoad<uint32_t>(in + 5), SafeLoad<uint32_t>(in + 5) };
190 shifts = simd_batch{ 0, 6, 12, 18, 24, 0, 4, 10, 16, 22, 0, 2, 8, 14, 20, 26 };
191 results = (words >> shifts) & masks;
192 results.store_unaligned(out);
193 out += 16;
194
195 in += 6;
196 return in;
197 }
198
unpack7_32UnpackBits512199 inline static const uint32_t* unpack7_32(const uint32_t* in, uint32_t* out) {
200 uint32_t mask = 0x7f;
201
202 simd_batch masks(mask);
203 simd_batch words, shifts;
204 simd_batch results;
205
206 // extract 7-bit bundles 0 to 15
207 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) >> 28 | SafeLoad<uint32_t>(in + 1) << 4, SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1) >> 31 | SafeLoad<uint32_t>(in + 2) << 1, SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2) >> 27 | SafeLoad<uint32_t>(in + 3) << 5, SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3) };
208 shifts = simd_batch{ 0, 7, 14, 21, 0, 3, 10, 17, 24, 0, 6, 13, 20, 0, 2, 9 };
209 results = (words >> shifts) & masks;
210 results.store_unaligned(out);
211 out += 16;
212
213 // extract 7-bit bundles 16 to 31
214 words = simd_batch{ SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3) >> 30 | SafeLoad<uint32_t>(in + 4) << 2, SafeLoad<uint32_t>(in + 4), SafeLoad<uint32_t>(in + 4), SafeLoad<uint32_t>(in + 4), SafeLoad<uint32_t>(in + 4) >> 26 | SafeLoad<uint32_t>(in + 5) << 6, SafeLoad<uint32_t>(in + 5), SafeLoad<uint32_t>(in + 5), SafeLoad<uint32_t>(in + 5), SafeLoad<uint32_t>(in + 5), SafeLoad<uint32_t>(in + 5) >> 29 | SafeLoad<uint32_t>(in + 6) << 3, SafeLoad<uint32_t>(in + 6), SafeLoad<uint32_t>(in + 6), SafeLoad<uint32_t>(in + 6), SafeLoad<uint32_t>(in + 6) };
215 shifts = simd_batch{ 16, 23, 0, 5, 12, 19, 0, 1, 8, 15, 22, 0, 4, 11, 18, 25 };
216 results = (words >> shifts) & masks;
217 results.store_unaligned(out);
218 out += 16;
219
220 in += 7;
221 return in;
222 }
223
unpack8_32UnpackBits512224 inline static const uint32_t* unpack8_32(const uint32_t* in, uint32_t* out) {
225 uint32_t mask = 0xff;
226
227 simd_batch masks(mask);
228 simd_batch words, shifts;
229 simd_batch results;
230
231 // extract 8-bit bundles 0 to 15
232 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3) };
233 shifts = simd_batch{ 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24 };
234 results = (words >> shifts) & masks;
235 results.store_unaligned(out);
236 out += 16;
237
238 // extract 8-bit bundles 16 to 31
239 words = simd_batch{ SafeLoad<uint32_t>(in + 4), SafeLoad<uint32_t>(in + 4), SafeLoad<uint32_t>(in + 4), SafeLoad<uint32_t>(in + 4), SafeLoad<uint32_t>(in + 5), SafeLoad<uint32_t>(in + 5), SafeLoad<uint32_t>(in + 5), SafeLoad<uint32_t>(in + 5), SafeLoad<uint32_t>(in + 6), SafeLoad<uint32_t>(in + 6), SafeLoad<uint32_t>(in + 6), SafeLoad<uint32_t>(in + 6), SafeLoad<uint32_t>(in + 7), SafeLoad<uint32_t>(in + 7), SafeLoad<uint32_t>(in + 7), SafeLoad<uint32_t>(in + 7) };
240 shifts = simd_batch{ 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24 };
241 results = (words >> shifts) & masks;
242 results.store_unaligned(out);
243 out += 16;
244
245 in += 8;
246 return in;
247 }
248
unpack9_32UnpackBits512249 inline static const uint32_t* unpack9_32(const uint32_t* in, uint32_t* out) {
250 uint32_t mask = 0x1ff;
251
252 simd_batch masks(mask);
253 simd_batch words, shifts;
254 simd_batch results;
255
256 // extract 9-bit bundles 0 to 15
257 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) >> 27 | SafeLoad<uint32_t>(in + 1) << 5, SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1) >> 31 | SafeLoad<uint32_t>(in + 2) << 1, SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2) >> 26 | SafeLoad<uint32_t>(in + 3) << 6, SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3) >> 30 | SafeLoad<uint32_t>(in + 4) << 2, SafeLoad<uint32_t>(in + 4) };
258 shifts = simd_batch{ 0, 9, 18, 0, 4, 13, 22, 0, 8, 17, 0, 3, 12, 21, 0, 7 };
259 results = (words >> shifts) & masks;
260 results.store_unaligned(out);
261 out += 16;
262
263 // extract 9-bit bundles 16 to 31
264 words = simd_batch{ SafeLoad<uint32_t>(in + 4), SafeLoad<uint32_t>(in + 4) >> 25 | SafeLoad<uint32_t>(in + 5) << 7, SafeLoad<uint32_t>(in + 5), SafeLoad<uint32_t>(in + 5), SafeLoad<uint32_t>(in + 5), SafeLoad<uint32_t>(in + 5) >> 29 | SafeLoad<uint32_t>(in + 6) << 3, SafeLoad<uint32_t>(in + 6), SafeLoad<uint32_t>(in + 6), SafeLoad<uint32_t>(in + 6) >> 24 | SafeLoad<uint32_t>(in + 7) << 8, SafeLoad<uint32_t>(in + 7), SafeLoad<uint32_t>(in + 7), SafeLoad<uint32_t>(in + 7), SafeLoad<uint32_t>(in + 7) >> 28 | SafeLoad<uint32_t>(in + 8) << 4, SafeLoad<uint32_t>(in + 8), SafeLoad<uint32_t>(in + 8), SafeLoad<uint32_t>(in + 8) };
265 shifts = simd_batch{ 16, 0, 2, 11, 20, 0, 6, 15, 0, 1, 10, 19, 0, 5, 14, 23 };
266 results = (words >> shifts) & masks;
267 results.store_unaligned(out);
268 out += 16;
269
270 in += 9;
271 return in;
272 }
273
unpack10_32UnpackBits512274 inline static const uint32_t* unpack10_32(const uint32_t* in, uint32_t* out) {
275 uint32_t mask = 0x3ff;
276
277 simd_batch masks(mask);
278 simd_batch words, shifts;
279 simd_batch results;
280
281 // extract 10-bit bundles 0 to 15
282 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) >> 30 | SafeLoad<uint32_t>(in + 1) << 2, SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1) >> 28 | SafeLoad<uint32_t>(in + 2) << 4, SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2) >> 26 | SafeLoad<uint32_t>(in + 3) << 6, SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3) >> 24 | SafeLoad<uint32_t>(in + 4) << 8, SafeLoad<uint32_t>(in + 4), SafeLoad<uint32_t>(in + 4), SafeLoad<uint32_t>(in + 4) };
283 shifts = simd_batch{ 0, 10, 20, 0, 8, 18, 0, 6, 16, 0, 4, 14, 0, 2, 12, 22 };
284 results = (words >> shifts) & masks;
285 results.store_unaligned(out);
286 out += 16;
287
288 // extract 10-bit bundles 16 to 31
289 words = simd_batch{ SafeLoad<uint32_t>(in + 5), SafeLoad<uint32_t>(in + 5), SafeLoad<uint32_t>(in + 5), SafeLoad<uint32_t>(in + 5) >> 30 | SafeLoad<uint32_t>(in + 6) << 2, SafeLoad<uint32_t>(in + 6), SafeLoad<uint32_t>(in + 6), SafeLoad<uint32_t>(in + 6) >> 28 | SafeLoad<uint32_t>(in + 7) << 4, SafeLoad<uint32_t>(in + 7), SafeLoad<uint32_t>(in + 7), SafeLoad<uint32_t>(in + 7) >> 26 | SafeLoad<uint32_t>(in + 8) << 6, SafeLoad<uint32_t>(in + 8), SafeLoad<uint32_t>(in + 8), SafeLoad<uint32_t>(in + 8) >> 24 | SafeLoad<uint32_t>(in + 9) << 8, SafeLoad<uint32_t>(in + 9), SafeLoad<uint32_t>(in + 9), SafeLoad<uint32_t>(in + 9) };
290 shifts = simd_batch{ 0, 10, 20, 0, 8, 18, 0, 6, 16, 0, 4, 14, 0, 2, 12, 22 };
291 results = (words >> shifts) & masks;
292 results.store_unaligned(out);
293 out += 16;
294
295 in += 10;
296 return in;
297 }
298
unpack11_32UnpackBits512299 inline static const uint32_t* unpack11_32(const uint32_t* in, uint32_t* out) {
300 uint32_t mask = 0x7ff;
301
302 simd_batch masks(mask);
303 simd_batch words, shifts;
304 simd_batch results;
305
306 // extract 11-bit bundles 0 to 15
307 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) >> 22 | SafeLoad<uint32_t>(in + 1) << 10, SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1) >> 23 | SafeLoad<uint32_t>(in + 2) << 9, SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2) >> 24 | SafeLoad<uint32_t>(in + 3) << 8, SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3) >> 25 | SafeLoad<uint32_t>(in + 4) << 7, SafeLoad<uint32_t>(in + 4), SafeLoad<uint32_t>(in + 4), SafeLoad<uint32_t>(in + 4) >> 26 | SafeLoad<uint32_t>(in + 5) << 6, SafeLoad<uint32_t>(in + 5) };
308 shifts = simd_batch{ 0, 11, 0, 1, 12, 0, 2, 13, 0, 3, 14, 0, 4, 15, 0, 5 };
309 results = (words >> shifts) & masks;
310 results.store_unaligned(out);
311 out += 16;
312
313 // extract 11-bit bundles 16 to 31
314 words = simd_batch{ SafeLoad<uint32_t>(in + 5), SafeLoad<uint32_t>(in + 5) >> 27 | SafeLoad<uint32_t>(in + 6) << 5, SafeLoad<uint32_t>(in + 6), SafeLoad<uint32_t>(in + 6), SafeLoad<uint32_t>(in + 6) >> 28 | SafeLoad<uint32_t>(in + 7) << 4, SafeLoad<uint32_t>(in + 7), SafeLoad<uint32_t>(in + 7), SafeLoad<uint32_t>(in + 7) >> 29 | SafeLoad<uint32_t>(in + 8) << 3, SafeLoad<uint32_t>(in + 8), SafeLoad<uint32_t>(in + 8), SafeLoad<uint32_t>(in + 8) >> 30 | SafeLoad<uint32_t>(in + 9) << 2, SafeLoad<uint32_t>(in + 9), SafeLoad<uint32_t>(in + 9), SafeLoad<uint32_t>(in + 9) >> 31 | SafeLoad<uint32_t>(in + 10) << 1, SafeLoad<uint32_t>(in + 10), SafeLoad<uint32_t>(in + 10) };
315 shifts = simd_batch{ 16, 0, 6, 17, 0, 7, 18, 0, 8, 19, 0, 9, 20, 0, 10, 21 };
316 results = (words >> shifts) & masks;
317 results.store_unaligned(out);
318 out += 16;
319
320 in += 11;
321 return in;
322 }
323
unpack12_32UnpackBits512324 inline static const uint32_t* unpack12_32(const uint32_t* in, uint32_t* out) {
325 uint32_t mask = 0xfff;
326
327 simd_batch masks(mask);
328 simd_batch words, shifts;
329 simd_batch results;
330
331 // extract 12-bit bundles 0 to 15
332 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) >> 24 | SafeLoad<uint32_t>(in + 1) << 8, SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1) >> 28 | SafeLoad<uint32_t>(in + 2) << 4, SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3) >> 24 | SafeLoad<uint32_t>(in + 4) << 8, SafeLoad<uint32_t>(in + 4), SafeLoad<uint32_t>(in + 4), SafeLoad<uint32_t>(in + 4) >> 28 | SafeLoad<uint32_t>(in + 5) << 4, SafeLoad<uint32_t>(in + 5), SafeLoad<uint32_t>(in + 5) };
333 shifts = simd_batch{ 0, 12, 0, 4, 16, 0, 8, 20, 0, 12, 0, 4, 16, 0, 8, 20 };
334 results = (words >> shifts) & masks;
335 results.store_unaligned(out);
336 out += 16;
337
338 // extract 12-bit bundles 16 to 31
339 words = simd_batch{ SafeLoad<uint32_t>(in + 6), SafeLoad<uint32_t>(in + 6), SafeLoad<uint32_t>(in + 6) >> 24 | SafeLoad<uint32_t>(in + 7) << 8, SafeLoad<uint32_t>(in + 7), SafeLoad<uint32_t>(in + 7), SafeLoad<uint32_t>(in + 7) >> 28 | SafeLoad<uint32_t>(in + 8) << 4, SafeLoad<uint32_t>(in + 8), SafeLoad<uint32_t>(in + 8), SafeLoad<uint32_t>(in + 9), SafeLoad<uint32_t>(in + 9), SafeLoad<uint32_t>(in + 9) >> 24 | SafeLoad<uint32_t>(in + 10) << 8, SafeLoad<uint32_t>(in + 10), SafeLoad<uint32_t>(in + 10), SafeLoad<uint32_t>(in + 10) >> 28 | SafeLoad<uint32_t>(in + 11) << 4, SafeLoad<uint32_t>(in + 11), SafeLoad<uint32_t>(in + 11) };
340 shifts = simd_batch{ 0, 12, 0, 4, 16, 0, 8, 20, 0, 12, 0, 4, 16, 0, 8, 20 };
341 results = (words >> shifts) & masks;
342 results.store_unaligned(out);
343 out += 16;
344
345 in += 12;
346 return in;
347 }
348
unpack13_32UnpackBits512349 inline static const uint32_t* unpack13_32(const uint32_t* in, uint32_t* out) {
350 uint32_t mask = 0x1fff;
351
352 simd_batch masks(mask);
353 simd_batch words, shifts;
354 simd_batch results;
355
356 // extract 13-bit bundles 0 to 15
357 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) >> 26 | SafeLoad<uint32_t>(in + 1) << 6, SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1) >> 20 | SafeLoad<uint32_t>(in + 2) << 12, SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2) >> 27 | SafeLoad<uint32_t>(in + 3) << 5, SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3) >> 21 | SafeLoad<uint32_t>(in + 4) << 11, SafeLoad<uint32_t>(in + 4), SafeLoad<uint32_t>(in + 4), SafeLoad<uint32_t>(in + 4) >> 28 | SafeLoad<uint32_t>(in + 5) << 4, SafeLoad<uint32_t>(in + 5), SafeLoad<uint32_t>(in + 5) >> 22 | SafeLoad<uint32_t>(in + 6) << 10, SafeLoad<uint32_t>(in + 6) };
358 shifts = simd_batch{ 0, 13, 0, 7, 0, 1, 14, 0, 8, 0, 2, 15, 0, 9, 0, 3 };
359 results = (words >> shifts) & masks;
360 results.store_unaligned(out);
361 out += 16;
362
363 // extract 13-bit bundles 16 to 31
364 words = simd_batch{ SafeLoad<uint32_t>(in + 6), SafeLoad<uint32_t>(in + 6) >> 29 | SafeLoad<uint32_t>(in + 7) << 3, SafeLoad<uint32_t>(in + 7), SafeLoad<uint32_t>(in + 7) >> 23 | SafeLoad<uint32_t>(in + 8) << 9, SafeLoad<uint32_t>(in + 8), SafeLoad<uint32_t>(in + 8), SafeLoad<uint32_t>(in + 8) >> 30 | SafeLoad<uint32_t>(in + 9) << 2, SafeLoad<uint32_t>(in + 9), SafeLoad<uint32_t>(in + 9) >> 24 | SafeLoad<uint32_t>(in + 10) << 8, SafeLoad<uint32_t>(in + 10), SafeLoad<uint32_t>(in + 10), SafeLoad<uint32_t>(in + 10) >> 31 | SafeLoad<uint32_t>(in + 11) << 1, SafeLoad<uint32_t>(in + 11), SafeLoad<uint32_t>(in + 11) >> 25 | SafeLoad<uint32_t>(in + 12) << 7, SafeLoad<uint32_t>(in + 12), SafeLoad<uint32_t>(in + 12) };
365 shifts = simd_batch{ 16, 0, 10, 0, 4, 17, 0, 11, 0, 5, 18, 0, 12, 0, 6, 19 };
366 results = (words >> shifts) & masks;
367 results.store_unaligned(out);
368 out += 16;
369
370 in += 13;
371 return in;
372 }
373
unpack14_32UnpackBits512374 inline static const uint32_t* unpack14_32(const uint32_t* in, uint32_t* out) {
375 uint32_t mask = 0x3fff;
376
377 simd_batch masks(mask);
378 simd_batch words, shifts;
379 simd_batch results;
380
381 // extract 14-bit bundles 0 to 15
382 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) >> 28 | SafeLoad<uint32_t>(in + 1) << 4, SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1) >> 24 | SafeLoad<uint32_t>(in + 2) << 8, SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2) >> 20 | SafeLoad<uint32_t>(in + 3) << 12, SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3) >> 30 | SafeLoad<uint32_t>(in + 4) << 2, SafeLoad<uint32_t>(in + 4), SafeLoad<uint32_t>(in + 4) >> 26 | SafeLoad<uint32_t>(in + 5) << 6, SafeLoad<uint32_t>(in + 5), SafeLoad<uint32_t>(in + 5) >> 22 | SafeLoad<uint32_t>(in + 6) << 10, SafeLoad<uint32_t>(in + 6), SafeLoad<uint32_t>(in + 6) };
383 shifts = simd_batch{ 0, 14, 0, 10, 0, 6, 0, 2, 16, 0, 12, 0, 8, 0, 4, 18 };
384 results = (words >> shifts) & masks;
385 results.store_unaligned(out);
386 out += 16;
387
388 // extract 14-bit bundles 16 to 31
389 words = simd_batch{ SafeLoad<uint32_t>(in + 7), SafeLoad<uint32_t>(in + 7), SafeLoad<uint32_t>(in + 7) >> 28 | SafeLoad<uint32_t>(in + 8) << 4, SafeLoad<uint32_t>(in + 8), SafeLoad<uint32_t>(in + 8) >> 24 | SafeLoad<uint32_t>(in + 9) << 8, SafeLoad<uint32_t>(in + 9), SafeLoad<uint32_t>(in + 9) >> 20 | SafeLoad<uint32_t>(in + 10) << 12, SafeLoad<uint32_t>(in + 10), SafeLoad<uint32_t>(in + 10), SafeLoad<uint32_t>(in + 10) >> 30 | SafeLoad<uint32_t>(in + 11) << 2, SafeLoad<uint32_t>(in + 11), SafeLoad<uint32_t>(in + 11) >> 26 | SafeLoad<uint32_t>(in + 12) << 6, SafeLoad<uint32_t>(in + 12), SafeLoad<uint32_t>(in + 12) >> 22 | SafeLoad<uint32_t>(in + 13) << 10, SafeLoad<uint32_t>(in + 13), SafeLoad<uint32_t>(in + 13) };
390 shifts = simd_batch{ 0, 14, 0, 10, 0, 6, 0, 2, 16, 0, 12, 0, 8, 0, 4, 18 };
391 results = (words >> shifts) & masks;
392 results.store_unaligned(out);
393 out += 16;
394
395 in += 14;
396 return in;
397 }
398
unpack15_32UnpackBits512399 inline static const uint32_t* unpack15_32(const uint32_t* in, uint32_t* out) {
400 uint32_t mask = 0x7fff;
401
402 simd_batch masks(mask);
403 simd_batch words, shifts;
404 simd_batch results;
405
406 // extract 15-bit bundles 0 to 15
407 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) >> 30 | SafeLoad<uint32_t>(in + 1) << 2, SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1) >> 28 | SafeLoad<uint32_t>(in + 2) << 4, SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2) >> 26 | SafeLoad<uint32_t>(in + 3) << 6, SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3) >> 24 | SafeLoad<uint32_t>(in + 4) << 8, SafeLoad<uint32_t>(in + 4), SafeLoad<uint32_t>(in + 4) >> 22 | SafeLoad<uint32_t>(in + 5) << 10, SafeLoad<uint32_t>(in + 5), SafeLoad<uint32_t>(in + 5) >> 20 | SafeLoad<uint32_t>(in + 6) << 12, SafeLoad<uint32_t>(in + 6), SafeLoad<uint32_t>(in + 6) >> 18 | SafeLoad<uint32_t>(in + 7) << 14, SafeLoad<uint32_t>(in + 7) };
408 shifts = simd_batch{ 0, 15, 0, 13, 0, 11, 0, 9, 0, 7, 0, 5, 0, 3, 0, 1 };
409 results = (words >> shifts) & masks;
410 results.store_unaligned(out);
411 out += 16;
412
413 // extract 15-bit bundles 16 to 31
414 words = simd_batch{ SafeLoad<uint32_t>(in + 7), SafeLoad<uint32_t>(in + 7) >> 31 | SafeLoad<uint32_t>(in + 8) << 1, SafeLoad<uint32_t>(in + 8), SafeLoad<uint32_t>(in + 8) >> 29 | SafeLoad<uint32_t>(in + 9) << 3, SafeLoad<uint32_t>(in + 9), SafeLoad<uint32_t>(in + 9) >> 27 | SafeLoad<uint32_t>(in + 10) << 5, SafeLoad<uint32_t>(in + 10), SafeLoad<uint32_t>(in + 10) >> 25 | SafeLoad<uint32_t>(in + 11) << 7, SafeLoad<uint32_t>(in + 11), SafeLoad<uint32_t>(in + 11) >> 23 | SafeLoad<uint32_t>(in + 12) << 9, SafeLoad<uint32_t>(in + 12), SafeLoad<uint32_t>(in + 12) >> 21 | SafeLoad<uint32_t>(in + 13) << 11, SafeLoad<uint32_t>(in + 13), SafeLoad<uint32_t>(in + 13) >> 19 | SafeLoad<uint32_t>(in + 14) << 13, SafeLoad<uint32_t>(in + 14), SafeLoad<uint32_t>(in + 14) };
415 shifts = simd_batch{ 16, 0, 14, 0, 12, 0, 10, 0, 8, 0, 6, 0, 4, 0, 2, 17 };
416 results = (words >> shifts) & masks;
417 results.store_unaligned(out);
418 out += 16;
419
420 in += 15;
421 return in;
422 }
423
unpack16_32UnpackBits512424 inline static const uint32_t* unpack16_32(const uint32_t* in, uint32_t* out) {
425 uint32_t mask = 0xffff;
426
427 simd_batch masks(mask);
428 simd_batch words, shifts;
429 simd_batch results;
430
431 // extract 16-bit bundles 0 to 15
432 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 4), SafeLoad<uint32_t>(in + 4), SafeLoad<uint32_t>(in + 5), SafeLoad<uint32_t>(in + 5), SafeLoad<uint32_t>(in + 6), SafeLoad<uint32_t>(in + 6), SafeLoad<uint32_t>(in + 7), SafeLoad<uint32_t>(in + 7) };
433 shifts = simd_batch{ 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16 };
434 results = (words >> shifts) & masks;
435 results.store_unaligned(out);
436 out += 16;
437
438 // extract 16-bit bundles 16 to 31
439 words = simd_batch{ SafeLoad<uint32_t>(in + 8), SafeLoad<uint32_t>(in + 8), SafeLoad<uint32_t>(in + 9), SafeLoad<uint32_t>(in + 9), SafeLoad<uint32_t>(in + 10), SafeLoad<uint32_t>(in + 10), SafeLoad<uint32_t>(in + 11), SafeLoad<uint32_t>(in + 11), SafeLoad<uint32_t>(in + 12), SafeLoad<uint32_t>(in + 12), SafeLoad<uint32_t>(in + 13), SafeLoad<uint32_t>(in + 13), SafeLoad<uint32_t>(in + 14), SafeLoad<uint32_t>(in + 14), SafeLoad<uint32_t>(in + 15), SafeLoad<uint32_t>(in + 15) };
440 shifts = simd_batch{ 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16 };
441 results = (words >> shifts) & masks;
442 results.store_unaligned(out);
443 out += 16;
444
445 in += 16;
446 return in;
447 }
448
unpack17_32UnpackBits512449 inline static const uint32_t* unpack17_32(const uint32_t* in, uint32_t* out) {
450 uint32_t mask = 0x1ffff;
451
452 simd_batch masks(mask);
453 simd_batch words, shifts;
454 simd_batch results;
455
456 // extract 17-bit bundles 0 to 15
457 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) >> 17 | SafeLoad<uint32_t>(in + 1) << 15, SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1) >> 19 | SafeLoad<uint32_t>(in + 2) << 13, SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2) >> 21 | SafeLoad<uint32_t>(in + 3) << 11, SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3) >> 23 | SafeLoad<uint32_t>(in + 4) << 9, SafeLoad<uint32_t>(in + 4), SafeLoad<uint32_t>(in + 4) >> 25 | SafeLoad<uint32_t>(in + 5) << 7, SafeLoad<uint32_t>(in + 5), SafeLoad<uint32_t>(in + 5) >> 27 | SafeLoad<uint32_t>(in + 6) << 5, SafeLoad<uint32_t>(in + 6), SafeLoad<uint32_t>(in + 6) >> 29 | SafeLoad<uint32_t>(in + 7) << 3, SafeLoad<uint32_t>(in + 7), SafeLoad<uint32_t>(in + 7) >> 31 | SafeLoad<uint32_t>(in + 8) << 1 };
458 shifts = simd_batch{ 0, 0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0 };
459 results = (words >> shifts) & masks;
460 results.store_unaligned(out);
461 out += 16;
462
463 // extract 17-bit bundles 16 to 31
464 words = simd_batch{ SafeLoad<uint32_t>(in + 8) >> 16 | SafeLoad<uint32_t>(in + 9) << 16, SafeLoad<uint32_t>(in + 9), SafeLoad<uint32_t>(in + 9) >> 18 | SafeLoad<uint32_t>(in + 10) << 14, SafeLoad<uint32_t>(in + 10), SafeLoad<uint32_t>(in + 10) >> 20 | SafeLoad<uint32_t>(in + 11) << 12, SafeLoad<uint32_t>(in + 11), SafeLoad<uint32_t>(in + 11) >> 22 | SafeLoad<uint32_t>(in + 12) << 10, SafeLoad<uint32_t>(in + 12), SafeLoad<uint32_t>(in + 12) >> 24 | SafeLoad<uint32_t>(in + 13) << 8, SafeLoad<uint32_t>(in + 13), SafeLoad<uint32_t>(in + 13) >> 26 | SafeLoad<uint32_t>(in + 14) << 6, SafeLoad<uint32_t>(in + 14), SafeLoad<uint32_t>(in + 14) >> 28 | SafeLoad<uint32_t>(in + 15) << 4, SafeLoad<uint32_t>(in + 15), SafeLoad<uint32_t>(in + 15) >> 30 | SafeLoad<uint32_t>(in + 16) << 2, SafeLoad<uint32_t>(in + 16) };
465 shifts = simd_batch{ 0, 1, 0, 3, 0, 5, 0, 7, 0, 9, 0, 11, 0, 13, 0, 15 };
466 results = (words >> shifts) & masks;
467 results.store_unaligned(out);
468 out += 16;
469
470 in += 17;
471 return in;
472 }
473
unpack18_32UnpackBits512474 inline static const uint32_t* unpack18_32(const uint32_t* in, uint32_t* out) {
475 uint32_t mask = 0x3ffff;
476
477 simd_batch masks(mask);
478 simd_batch words, shifts;
479 simd_batch results;
480
481 // extract 18-bit bundles 0 to 15
482 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) >> 18 | SafeLoad<uint32_t>(in + 1) << 14, SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1) >> 22 | SafeLoad<uint32_t>(in + 2) << 10, SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2) >> 26 | SafeLoad<uint32_t>(in + 3) << 6, SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3) >> 30 | SafeLoad<uint32_t>(in + 4) << 2, SafeLoad<uint32_t>(in + 4) >> 16 | SafeLoad<uint32_t>(in + 5) << 16, SafeLoad<uint32_t>(in + 5), SafeLoad<uint32_t>(in + 5) >> 20 | SafeLoad<uint32_t>(in + 6) << 12, SafeLoad<uint32_t>(in + 6), SafeLoad<uint32_t>(in + 6) >> 24 | SafeLoad<uint32_t>(in + 7) << 8, SafeLoad<uint32_t>(in + 7), SafeLoad<uint32_t>(in + 7) >> 28 | SafeLoad<uint32_t>(in + 8) << 4, SafeLoad<uint32_t>(in + 8) };
483 shifts = simd_batch{ 0, 0, 4, 0, 8, 0, 12, 0, 0, 2, 0, 6, 0, 10, 0, 14 };
484 results = (words >> shifts) & masks;
485 results.store_unaligned(out);
486 out += 16;
487
488 // extract 18-bit bundles 16 to 31
489 words = simd_batch{ SafeLoad<uint32_t>(in + 9), SafeLoad<uint32_t>(in + 9) >> 18 | SafeLoad<uint32_t>(in + 10) << 14, SafeLoad<uint32_t>(in + 10), SafeLoad<uint32_t>(in + 10) >> 22 | SafeLoad<uint32_t>(in + 11) << 10, SafeLoad<uint32_t>(in + 11), SafeLoad<uint32_t>(in + 11) >> 26 | SafeLoad<uint32_t>(in + 12) << 6, SafeLoad<uint32_t>(in + 12), SafeLoad<uint32_t>(in + 12) >> 30 | SafeLoad<uint32_t>(in + 13) << 2, SafeLoad<uint32_t>(in + 13) >> 16 | SafeLoad<uint32_t>(in + 14) << 16, SafeLoad<uint32_t>(in + 14), SafeLoad<uint32_t>(in + 14) >> 20 | SafeLoad<uint32_t>(in + 15) << 12, SafeLoad<uint32_t>(in + 15), SafeLoad<uint32_t>(in + 15) >> 24 | SafeLoad<uint32_t>(in + 16) << 8, SafeLoad<uint32_t>(in + 16), SafeLoad<uint32_t>(in + 16) >> 28 | SafeLoad<uint32_t>(in + 17) << 4, SafeLoad<uint32_t>(in + 17) };
490 shifts = simd_batch{ 0, 0, 4, 0, 8, 0, 12, 0, 0, 2, 0, 6, 0, 10, 0, 14 };
491 results = (words >> shifts) & masks;
492 results.store_unaligned(out);
493 out += 16;
494
495 in += 18;
496 return in;
497 }
498
unpack19_32UnpackBits512499 inline static const uint32_t* unpack19_32(const uint32_t* in, uint32_t* out) {
500 uint32_t mask = 0x7ffff;
501
502 simd_batch masks(mask);
503 simd_batch words, shifts;
504 simd_batch results;
505
506 // extract 19-bit bundles 0 to 15
507 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) >> 19 | SafeLoad<uint32_t>(in + 1) << 13, SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1) >> 25 | SafeLoad<uint32_t>(in + 2) << 7, SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2) >> 31 | SafeLoad<uint32_t>(in + 3) << 1, SafeLoad<uint32_t>(in + 3) >> 18 | SafeLoad<uint32_t>(in + 4) << 14, SafeLoad<uint32_t>(in + 4), SafeLoad<uint32_t>(in + 4) >> 24 | SafeLoad<uint32_t>(in + 5) << 8, SafeLoad<uint32_t>(in + 5), SafeLoad<uint32_t>(in + 5) >> 30 | SafeLoad<uint32_t>(in + 6) << 2, SafeLoad<uint32_t>(in + 6) >> 17 | SafeLoad<uint32_t>(in + 7) << 15, SafeLoad<uint32_t>(in + 7), SafeLoad<uint32_t>(in + 7) >> 23 | SafeLoad<uint32_t>(in + 8) << 9, SafeLoad<uint32_t>(in + 8), SafeLoad<uint32_t>(in + 8) >> 29 | SafeLoad<uint32_t>(in + 9) << 3 };
508 shifts = simd_batch{ 0, 0, 6, 0, 12, 0, 0, 5, 0, 11, 0, 0, 4, 0, 10, 0 };
509 results = (words >> shifts) & masks;
510 results.store_unaligned(out);
511 out += 16;
512
513 // extract 19-bit bundles 16 to 31
514 words = simd_batch{ SafeLoad<uint32_t>(in + 9) >> 16 | SafeLoad<uint32_t>(in + 10) << 16, SafeLoad<uint32_t>(in + 10), SafeLoad<uint32_t>(in + 10) >> 22 | SafeLoad<uint32_t>(in + 11) << 10, SafeLoad<uint32_t>(in + 11), SafeLoad<uint32_t>(in + 11) >> 28 | SafeLoad<uint32_t>(in + 12) << 4, SafeLoad<uint32_t>(in + 12) >> 15 | SafeLoad<uint32_t>(in + 13) << 17, SafeLoad<uint32_t>(in + 13), SafeLoad<uint32_t>(in + 13) >> 21 | SafeLoad<uint32_t>(in + 14) << 11, SafeLoad<uint32_t>(in + 14), SafeLoad<uint32_t>(in + 14) >> 27 | SafeLoad<uint32_t>(in + 15) << 5, SafeLoad<uint32_t>(in + 15) >> 14 | SafeLoad<uint32_t>(in + 16) << 18, SafeLoad<uint32_t>(in + 16), SafeLoad<uint32_t>(in + 16) >> 20 | SafeLoad<uint32_t>(in + 17) << 12, SafeLoad<uint32_t>(in + 17), SafeLoad<uint32_t>(in + 17) >> 26 | SafeLoad<uint32_t>(in + 18) << 6, SafeLoad<uint32_t>(in + 18) };
515 shifts = simd_batch{ 0, 3, 0, 9, 0, 0, 2, 0, 8, 0, 0, 1, 0, 7, 0, 13 };
516 results = (words >> shifts) & masks;
517 results.store_unaligned(out);
518 out += 16;
519
520 in += 19;
521 return in;
522 }
523
unpack20_32UnpackBits512524 inline static const uint32_t* unpack20_32(const uint32_t* in, uint32_t* out) {
525 uint32_t mask = 0xfffff;
526
527 simd_batch masks(mask);
528 simd_batch words, shifts;
529 simd_batch results;
530
531 // extract 20-bit bundles 0 to 15
532 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) >> 20 | SafeLoad<uint32_t>(in + 1) << 12, SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1) >> 28 | SafeLoad<uint32_t>(in + 2) << 4, SafeLoad<uint32_t>(in + 2) >> 16 | SafeLoad<uint32_t>(in + 3) << 16, SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3) >> 24 | SafeLoad<uint32_t>(in + 4) << 8, SafeLoad<uint32_t>(in + 4), SafeLoad<uint32_t>(in + 5), SafeLoad<uint32_t>(in + 5) >> 20 | SafeLoad<uint32_t>(in + 6) << 12, SafeLoad<uint32_t>(in + 6), SafeLoad<uint32_t>(in + 6) >> 28 | SafeLoad<uint32_t>(in + 7) << 4, SafeLoad<uint32_t>(in + 7) >> 16 | SafeLoad<uint32_t>(in + 8) << 16, SafeLoad<uint32_t>(in + 8), SafeLoad<uint32_t>(in + 8) >> 24 | SafeLoad<uint32_t>(in + 9) << 8, SafeLoad<uint32_t>(in + 9) };
533 shifts = simd_batch{ 0, 0, 8, 0, 0, 4, 0, 12, 0, 0, 8, 0, 0, 4, 0, 12 };
534 results = (words >> shifts) & masks;
535 results.store_unaligned(out);
536 out += 16;
537
538 // extract 20-bit bundles 16 to 31
539 words = simd_batch{ SafeLoad<uint32_t>(in + 10), SafeLoad<uint32_t>(in + 10) >> 20 | SafeLoad<uint32_t>(in + 11) << 12, SafeLoad<uint32_t>(in + 11), SafeLoad<uint32_t>(in + 11) >> 28 | SafeLoad<uint32_t>(in + 12) << 4, SafeLoad<uint32_t>(in + 12) >> 16 | SafeLoad<uint32_t>(in + 13) << 16, SafeLoad<uint32_t>(in + 13), SafeLoad<uint32_t>(in + 13) >> 24 | SafeLoad<uint32_t>(in + 14) << 8, SafeLoad<uint32_t>(in + 14), SafeLoad<uint32_t>(in + 15), SafeLoad<uint32_t>(in + 15) >> 20 | SafeLoad<uint32_t>(in + 16) << 12, SafeLoad<uint32_t>(in + 16), SafeLoad<uint32_t>(in + 16) >> 28 | SafeLoad<uint32_t>(in + 17) << 4, SafeLoad<uint32_t>(in + 17) >> 16 | SafeLoad<uint32_t>(in + 18) << 16, SafeLoad<uint32_t>(in + 18), SafeLoad<uint32_t>(in + 18) >> 24 | SafeLoad<uint32_t>(in + 19) << 8, SafeLoad<uint32_t>(in + 19) };
540 shifts = simd_batch{ 0, 0, 8, 0, 0, 4, 0, 12, 0, 0, 8, 0, 0, 4, 0, 12 };
541 results = (words >> shifts) & masks;
542 results.store_unaligned(out);
543 out += 16;
544
545 in += 20;
546 return in;
547 }
548
unpack21_32UnpackBits512549 inline static const uint32_t* unpack21_32(const uint32_t* in, uint32_t* out) {
550 uint32_t mask = 0x1fffff;
551
552 simd_batch masks(mask);
553 simd_batch words, shifts;
554 simd_batch results;
555
556 // extract 21-bit bundles 0 to 15
557 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) >> 21 | SafeLoad<uint32_t>(in + 1) << 11, SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1) >> 31 | SafeLoad<uint32_t>(in + 2) << 1, SafeLoad<uint32_t>(in + 2) >> 20 | SafeLoad<uint32_t>(in + 3) << 12, SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3) >> 30 | SafeLoad<uint32_t>(in + 4) << 2, SafeLoad<uint32_t>(in + 4) >> 19 | SafeLoad<uint32_t>(in + 5) << 13, SafeLoad<uint32_t>(in + 5), SafeLoad<uint32_t>(in + 5) >> 29 | SafeLoad<uint32_t>(in + 6) << 3, SafeLoad<uint32_t>(in + 6) >> 18 | SafeLoad<uint32_t>(in + 7) << 14, SafeLoad<uint32_t>(in + 7), SafeLoad<uint32_t>(in + 7) >> 28 | SafeLoad<uint32_t>(in + 8) << 4, SafeLoad<uint32_t>(in + 8) >> 17 | SafeLoad<uint32_t>(in + 9) << 15, SafeLoad<uint32_t>(in + 9), SafeLoad<uint32_t>(in + 9) >> 27 | SafeLoad<uint32_t>(in + 10) << 5 };
558 shifts = simd_batch{ 0, 0, 10, 0, 0, 9, 0, 0, 8, 0, 0, 7, 0, 0, 6, 0 };
559 results = (words >> shifts) & masks;
560 results.store_unaligned(out);
561 out += 16;
562
563 // extract 21-bit bundles 16 to 31
564 words = simd_batch{ SafeLoad<uint32_t>(in + 10) >> 16 | SafeLoad<uint32_t>(in + 11) << 16, SafeLoad<uint32_t>(in + 11), SafeLoad<uint32_t>(in + 11) >> 26 | SafeLoad<uint32_t>(in + 12) << 6, SafeLoad<uint32_t>(in + 12) >> 15 | SafeLoad<uint32_t>(in + 13) << 17, SafeLoad<uint32_t>(in + 13), SafeLoad<uint32_t>(in + 13) >> 25 | SafeLoad<uint32_t>(in + 14) << 7, SafeLoad<uint32_t>(in + 14) >> 14 | SafeLoad<uint32_t>(in + 15) << 18, SafeLoad<uint32_t>(in + 15), SafeLoad<uint32_t>(in + 15) >> 24 | SafeLoad<uint32_t>(in + 16) << 8, SafeLoad<uint32_t>(in + 16) >> 13 | SafeLoad<uint32_t>(in + 17) << 19, SafeLoad<uint32_t>(in + 17), SafeLoad<uint32_t>(in + 17) >> 23 | SafeLoad<uint32_t>(in + 18) << 9, SafeLoad<uint32_t>(in + 18) >> 12 | SafeLoad<uint32_t>(in + 19) << 20, SafeLoad<uint32_t>(in + 19), SafeLoad<uint32_t>(in + 19) >> 22 | SafeLoad<uint32_t>(in + 20) << 10, SafeLoad<uint32_t>(in + 20) };
565 shifts = simd_batch{ 0, 5, 0, 0, 4, 0, 0, 3, 0, 0, 2, 0, 0, 1, 0, 11 };
566 results = (words >> shifts) & masks;
567 results.store_unaligned(out);
568 out += 16;
569
570 in += 21;
571 return in;
572 }
573
unpack22_32UnpackBits512574 inline static const uint32_t* unpack22_32(const uint32_t* in, uint32_t* out) {
575 uint32_t mask = 0x3fffff;
576
577 simd_batch masks(mask);
578 simd_batch words, shifts;
579 simd_batch results;
580
581 // extract 22-bit bundles 0 to 15
582 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) >> 22 | SafeLoad<uint32_t>(in + 1) << 10, SafeLoad<uint32_t>(in + 1) >> 12 | SafeLoad<uint32_t>(in + 2) << 20, SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2) >> 24 | SafeLoad<uint32_t>(in + 3) << 8, SafeLoad<uint32_t>(in + 3) >> 14 | SafeLoad<uint32_t>(in + 4) << 18, SafeLoad<uint32_t>(in + 4), SafeLoad<uint32_t>(in + 4) >> 26 | SafeLoad<uint32_t>(in + 5) << 6, SafeLoad<uint32_t>(in + 5) >> 16 | SafeLoad<uint32_t>(in + 6) << 16, SafeLoad<uint32_t>(in + 6), SafeLoad<uint32_t>(in + 6) >> 28 | SafeLoad<uint32_t>(in + 7) << 4, SafeLoad<uint32_t>(in + 7) >> 18 | SafeLoad<uint32_t>(in + 8) << 14, SafeLoad<uint32_t>(in + 8), SafeLoad<uint32_t>(in + 8) >> 30 | SafeLoad<uint32_t>(in + 9) << 2, SafeLoad<uint32_t>(in + 9) >> 20 | SafeLoad<uint32_t>(in + 10) << 12, SafeLoad<uint32_t>(in + 10) };
583 shifts = simd_batch{ 0, 0, 0, 2, 0, 0, 4, 0, 0, 6, 0, 0, 8, 0, 0, 10 };
584 results = (words >> shifts) & masks;
585 results.store_unaligned(out);
586 out += 16;
587
588 // extract 22-bit bundles 16 to 31
589 words = simd_batch{ SafeLoad<uint32_t>(in + 11), SafeLoad<uint32_t>(in + 11) >> 22 | SafeLoad<uint32_t>(in + 12) << 10, SafeLoad<uint32_t>(in + 12) >> 12 | SafeLoad<uint32_t>(in + 13) << 20, SafeLoad<uint32_t>(in + 13), SafeLoad<uint32_t>(in + 13) >> 24 | SafeLoad<uint32_t>(in + 14) << 8, SafeLoad<uint32_t>(in + 14) >> 14 | SafeLoad<uint32_t>(in + 15) << 18, SafeLoad<uint32_t>(in + 15), SafeLoad<uint32_t>(in + 15) >> 26 | SafeLoad<uint32_t>(in + 16) << 6, SafeLoad<uint32_t>(in + 16) >> 16 | SafeLoad<uint32_t>(in + 17) << 16, SafeLoad<uint32_t>(in + 17), SafeLoad<uint32_t>(in + 17) >> 28 | SafeLoad<uint32_t>(in + 18) << 4, SafeLoad<uint32_t>(in + 18) >> 18 | SafeLoad<uint32_t>(in + 19) << 14, SafeLoad<uint32_t>(in + 19), SafeLoad<uint32_t>(in + 19) >> 30 | SafeLoad<uint32_t>(in + 20) << 2, SafeLoad<uint32_t>(in + 20) >> 20 | SafeLoad<uint32_t>(in + 21) << 12, SafeLoad<uint32_t>(in + 21) };
590 shifts = simd_batch{ 0, 0, 0, 2, 0, 0, 4, 0, 0, 6, 0, 0, 8, 0, 0, 10 };
591 results = (words >> shifts) & masks;
592 results.store_unaligned(out);
593 out += 16;
594
595 in += 22;
596 return in;
597 }
598
unpack23_32UnpackBits512599 inline static const uint32_t* unpack23_32(const uint32_t* in, uint32_t* out) {
600 uint32_t mask = 0x7fffff;
601
602 simd_batch masks(mask);
603 simd_batch words, shifts;
604 simd_batch results;
605
606 // extract 23-bit bundles 0 to 15
607 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) >> 23 | SafeLoad<uint32_t>(in + 1) << 9, SafeLoad<uint32_t>(in + 1) >> 14 | SafeLoad<uint32_t>(in + 2) << 18, SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2) >> 28 | SafeLoad<uint32_t>(in + 3) << 4, SafeLoad<uint32_t>(in + 3) >> 19 | SafeLoad<uint32_t>(in + 4) << 13, SafeLoad<uint32_t>(in + 4) >> 10 | SafeLoad<uint32_t>(in + 5) << 22, SafeLoad<uint32_t>(in + 5), SafeLoad<uint32_t>(in + 5) >> 24 | SafeLoad<uint32_t>(in + 6) << 8, SafeLoad<uint32_t>(in + 6) >> 15 | SafeLoad<uint32_t>(in + 7) << 17, SafeLoad<uint32_t>(in + 7), SafeLoad<uint32_t>(in + 7) >> 29 | SafeLoad<uint32_t>(in + 8) << 3, SafeLoad<uint32_t>(in + 8) >> 20 | SafeLoad<uint32_t>(in + 9) << 12, SafeLoad<uint32_t>(in + 9) >> 11 | SafeLoad<uint32_t>(in + 10) << 21, SafeLoad<uint32_t>(in + 10), SafeLoad<uint32_t>(in + 10) >> 25 | SafeLoad<uint32_t>(in + 11) << 7 };
608 shifts = simd_batch{ 0, 0, 0, 5, 0, 0, 0, 1, 0, 0, 6, 0, 0, 0, 2, 0 };
609 results = (words >> shifts) & masks;
610 results.store_unaligned(out);
611 out += 16;
612
613 // extract 23-bit bundles 16 to 31
614 words = simd_batch{ SafeLoad<uint32_t>(in + 11) >> 16 | SafeLoad<uint32_t>(in + 12) << 16, SafeLoad<uint32_t>(in + 12), SafeLoad<uint32_t>(in + 12) >> 30 | SafeLoad<uint32_t>(in + 13) << 2, SafeLoad<uint32_t>(in + 13) >> 21 | SafeLoad<uint32_t>(in + 14) << 11, SafeLoad<uint32_t>(in + 14) >> 12 | SafeLoad<uint32_t>(in + 15) << 20, SafeLoad<uint32_t>(in + 15), SafeLoad<uint32_t>(in + 15) >> 26 | SafeLoad<uint32_t>(in + 16) << 6, SafeLoad<uint32_t>(in + 16) >> 17 | SafeLoad<uint32_t>(in + 17) << 15, SafeLoad<uint32_t>(in + 17), SafeLoad<uint32_t>(in + 17) >> 31 | SafeLoad<uint32_t>(in + 18) << 1, SafeLoad<uint32_t>(in + 18) >> 22 | SafeLoad<uint32_t>(in + 19) << 10, SafeLoad<uint32_t>(in + 19) >> 13 | SafeLoad<uint32_t>(in + 20) << 19, SafeLoad<uint32_t>(in + 20), SafeLoad<uint32_t>(in + 20) >> 27 | SafeLoad<uint32_t>(in + 21) << 5, SafeLoad<uint32_t>(in + 21) >> 18 | SafeLoad<uint32_t>(in + 22) << 14, SafeLoad<uint32_t>(in + 22) };
615 shifts = simd_batch{ 0, 7, 0, 0, 0, 3, 0, 0, 8, 0, 0, 0, 4, 0, 0, 9 };
616 results = (words >> shifts) & masks;
617 results.store_unaligned(out);
618 out += 16;
619
620 in += 23;
621 return in;
622 }
623
unpack24_32UnpackBits512624 inline static const uint32_t* unpack24_32(const uint32_t* in, uint32_t* out) {
625 uint32_t mask = 0xffffff;
626
627 simd_batch masks(mask);
628 simd_batch words, shifts;
629 simd_batch results;
630
631 // extract 24-bit bundles 0 to 15
632 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) >> 24 | SafeLoad<uint32_t>(in + 1) << 8, SafeLoad<uint32_t>(in + 1) >> 16 | SafeLoad<uint32_t>(in + 2) << 16, SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3) >> 24 | SafeLoad<uint32_t>(in + 4) << 8, SafeLoad<uint32_t>(in + 4) >> 16 | SafeLoad<uint32_t>(in + 5) << 16, SafeLoad<uint32_t>(in + 5), SafeLoad<uint32_t>(in + 6), SafeLoad<uint32_t>(in + 6) >> 24 | SafeLoad<uint32_t>(in + 7) << 8, SafeLoad<uint32_t>(in + 7) >> 16 | SafeLoad<uint32_t>(in + 8) << 16, SafeLoad<uint32_t>(in + 8), SafeLoad<uint32_t>(in + 9), SafeLoad<uint32_t>(in + 9) >> 24 | SafeLoad<uint32_t>(in + 10) << 8, SafeLoad<uint32_t>(in + 10) >> 16 | SafeLoad<uint32_t>(in + 11) << 16, SafeLoad<uint32_t>(in + 11) };
633 shifts = simd_batch{ 0, 0, 0, 8, 0, 0, 0, 8, 0, 0, 0, 8, 0, 0, 0, 8 };
634 results = (words >> shifts) & masks;
635 results.store_unaligned(out);
636 out += 16;
637
638 // extract 24-bit bundles 16 to 31
639 words = simd_batch{ SafeLoad<uint32_t>(in + 12), SafeLoad<uint32_t>(in + 12) >> 24 | SafeLoad<uint32_t>(in + 13) << 8, SafeLoad<uint32_t>(in + 13) >> 16 | SafeLoad<uint32_t>(in + 14) << 16, SafeLoad<uint32_t>(in + 14), SafeLoad<uint32_t>(in + 15), SafeLoad<uint32_t>(in + 15) >> 24 | SafeLoad<uint32_t>(in + 16) << 8, SafeLoad<uint32_t>(in + 16) >> 16 | SafeLoad<uint32_t>(in + 17) << 16, SafeLoad<uint32_t>(in + 17), SafeLoad<uint32_t>(in + 18), SafeLoad<uint32_t>(in + 18) >> 24 | SafeLoad<uint32_t>(in + 19) << 8, SafeLoad<uint32_t>(in + 19) >> 16 | SafeLoad<uint32_t>(in + 20) << 16, SafeLoad<uint32_t>(in + 20), SafeLoad<uint32_t>(in + 21), SafeLoad<uint32_t>(in + 21) >> 24 | SafeLoad<uint32_t>(in + 22) << 8, SafeLoad<uint32_t>(in + 22) >> 16 | SafeLoad<uint32_t>(in + 23) << 16, SafeLoad<uint32_t>(in + 23) };
640 shifts = simd_batch{ 0, 0, 0, 8, 0, 0, 0, 8, 0, 0, 0, 8, 0, 0, 0, 8 };
641 results = (words >> shifts) & masks;
642 results.store_unaligned(out);
643 out += 16;
644
645 in += 24;
646 return in;
647 }
648
unpack25_32UnpackBits512649 inline static const uint32_t* unpack25_32(const uint32_t* in, uint32_t* out) {
650 uint32_t mask = 0x1ffffff;
651
652 simd_batch masks(mask);
653 simd_batch words, shifts;
654 simd_batch results;
655
656 // extract 25-bit bundles 0 to 15
657 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) >> 25 | SafeLoad<uint32_t>(in + 1) << 7, SafeLoad<uint32_t>(in + 1) >> 18 | SafeLoad<uint32_t>(in + 2) << 14, SafeLoad<uint32_t>(in + 2) >> 11 | SafeLoad<uint32_t>(in + 3) << 21, SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3) >> 29 | SafeLoad<uint32_t>(in + 4) << 3, SafeLoad<uint32_t>(in + 4) >> 22 | SafeLoad<uint32_t>(in + 5) << 10, SafeLoad<uint32_t>(in + 5) >> 15 | SafeLoad<uint32_t>(in + 6) << 17, SafeLoad<uint32_t>(in + 6) >> 8 | SafeLoad<uint32_t>(in + 7) << 24, SafeLoad<uint32_t>(in + 7), SafeLoad<uint32_t>(in + 7) >> 26 | SafeLoad<uint32_t>(in + 8) << 6, SafeLoad<uint32_t>(in + 8) >> 19 | SafeLoad<uint32_t>(in + 9) << 13, SafeLoad<uint32_t>(in + 9) >> 12 | SafeLoad<uint32_t>(in + 10) << 20, SafeLoad<uint32_t>(in + 10), SafeLoad<uint32_t>(in + 10) >> 30 | SafeLoad<uint32_t>(in + 11) << 2, SafeLoad<uint32_t>(in + 11) >> 23 | SafeLoad<uint32_t>(in + 12) << 9 };
658 shifts = simd_batch{ 0, 0, 0, 0, 4, 0, 0, 0, 0, 1, 0, 0, 0, 5, 0, 0 };
659 results = (words >> shifts) & masks;
660 results.store_unaligned(out);
661 out += 16;
662
663 // extract 25-bit bundles 16 to 31
664 words = simd_batch{ SafeLoad<uint32_t>(in + 12) >> 16 | SafeLoad<uint32_t>(in + 13) << 16, SafeLoad<uint32_t>(in + 13) >> 9 | SafeLoad<uint32_t>(in + 14) << 23, SafeLoad<uint32_t>(in + 14), SafeLoad<uint32_t>(in + 14) >> 27 | SafeLoad<uint32_t>(in + 15) << 5, SafeLoad<uint32_t>(in + 15) >> 20 | SafeLoad<uint32_t>(in + 16) << 12, SafeLoad<uint32_t>(in + 16) >> 13 | SafeLoad<uint32_t>(in + 17) << 19, SafeLoad<uint32_t>(in + 17), SafeLoad<uint32_t>(in + 17) >> 31 | SafeLoad<uint32_t>(in + 18) << 1, SafeLoad<uint32_t>(in + 18) >> 24 | SafeLoad<uint32_t>(in + 19) << 8, SafeLoad<uint32_t>(in + 19) >> 17 | SafeLoad<uint32_t>(in + 20) << 15, SafeLoad<uint32_t>(in + 20) >> 10 | SafeLoad<uint32_t>(in + 21) << 22, SafeLoad<uint32_t>(in + 21), SafeLoad<uint32_t>(in + 21) >> 28 | SafeLoad<uint32_t>(in + 22) << 4, SafeLoad<uint32_t>(in + 22) >> 21 | SafeLoad<uint32_t>(in + 23) << 11, SafeLoad<uint32_t>(in + 23) >> 14 | SafeLoad<uint32_t>(in + 24) << 18, SafeLoad<uint32_t>(in + 24) };
665 shifts = simd_batch{ 0, 0, 2, 0, 0, 0, 6, 0, 0, 0, 0, 3, 0, 0, 0, 7 };
666 results = (words >> shifts) & masks;
667 results.store_unaligned(out);
668 out += 16;
669
670 in += 25;
671 return in;
672 }
673
unpack26_32UnpackBits512674 inline static const uint32_t* unpack26_32(const uint32_t* in, uint32_t* out) {
675 uint32_t mask = 0x3ffffff;
676
677 simd_batch masks(mask);
678 simd_batch words, shifts;
679 simd_batch results;
680
681 // extract 26-bit bundles 0 to 15
682 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) >> 26 | SafeLoad<uint32_t>(in + 1) << 6, SafeLoad<uint32_t>(in + 1) >> 20 | SafeLoad<uint32_t>(in + 2) << 12, SafeLoad<uint32_t>(in + 2) >> 14 | SafeLoad<uint32_t>(in + 3) << 18, SafeLoad<uint32_t>(in + 3) >> 8 | SafeLoad<uint32_t>(in + 4) << 24, SafeLoad<uint32_t>(in + 4), SafeLoad<uint32_t>(in + 4) >> 28 | SafeLoad<uint32_t>(in + 5) << 4, SafeLoad<uint32_t>(in + 5) >> 22 | SafeLoad<uint32_t>(in + 6) << 10, SafeLoad<uint32_t>(in + 6) >> 16 | SafeLoad<uint32_t>(in + 7) << 16, SafeLoad<uint32_t>(in + 7) >> 10 | SafeLoad<uint32_t>(in + 8) << 22, SafeLoad<uint32_t>(in + 8), SafeLoad<uint32_t>(in + 8) >> 30 | SafeLoad<uint32_t>(in + 9) << 2, SafeLoad<uint32_t>(in + 9) >> 24 | SafeLoad<uint32_t>(in + 10) << 8, SafeLoad<uint32_t>(in + 10) >> 18 | SafeLoad<uint32_t>(in + 11) << 14, SafeLoad<uint32_t>(in + 11) >> 12 | SafeLoad<uint32_t>(in + 12) << 20, SafeLoad<uint32_t>(in + 12) };
683 shifts = simd_batch{ 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 4, 0, 0, 0, 0, 6 };
684 results = (words >> shifts) & masks;
685 results.store_unaligned(out);
686 out += 16;
687
688 // extract 26-bit bundles 16 to 31
689 words = simd_batch{ SafeLoad<uint32_t>(in + 13), SafeLoad<uint32_t>(in + 13) >> 26 | SafeLoad<uint32_t>(in + 14) << 6, SafeLoad<uint32_t>(in + 14) >> 20 | SafeLoad<uint32_t>(in + 15) << 12, SafeLoad<uint32_t>(in + 15) >> 14 | SafeLoad<uint32_t>(in + 16) << 18, SafeLoad<uint32_t>(in + 16) >> 8 | SafeLoad<uint32_t>(in + 17) << 24, SafeLoad<uint32_t>(in + 17), SafeLoad<uint32_t>(in + 17) >> 28 | SafeLoad<uint32_t>(in + 18) << 4, SafeLoad<uint32_t>(in + 18) >> 22 | SafeLoad<uint32_t>(in + 19) << 10, SafeLoad<uint32_t>(in + 19) >> 16 | SafeLoad<uint32_t>(in + 20) << 16, SafeLoad<uint32_t>(in + 20) >> 10 | SafeLoad<uint32_t>(in + 21) << 22, SafeLoad<uint32_t>(in + 21), SafeLoad<uint32_t>(in + 21) >> 30 | SafeLoad<uint32_t>(in + 22) << 2, SafeLoad<uint32_t>(in + 22) >> 24 | SafeLoad<uint32_t>(in + 23) << 8, SafeLoad<uint32_t>(in + 23) >> 18 | SafeLoad<uint32_t>(in + 24) << 14, SafeLoad<uint32_t>(in + 24) >> 12 | SafeLoad<uint32_t>(in + 25) << 20, SafeLoad<uint32_t>(in + 25) };
690 shifts = simd_batch{ 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 4, 0, 0, 0, 0, 6 };
691 results = (words >> shifts) & masks;
692 results.store_unaligned(out);
693 out += 16;
694
695 in += 26;
696 return in;
697 }
698
unpack27_32UnpackBits512699 inline static const uint32_t* unpack27_32(const uint32_t* in, uint32_t* out) {
700 uint32_t mask = 0x7ffffff;
701
702 simd_batch masks(mask);
703 simd_batch words, shifts;
704 simd_batch results;
705
706 // extract 27-bit bundles 0 to 15
707 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) >> 27 | SafeLoad<uint32_t>(in + 1) << 5, SafeLoad<uint32_t>(in + 1) >> 22 | SafeLoad<uint32_t>(in + 2) << 10, SafeLoad<uint32_t>(in + 2) >> 17 | SafeLoad<uint32_t>(in + 3) << 15, SafeLoad<uint32_t>(in + 3) >> 12 | SafeLoad<uint32_t>(in + 4) << 20, SafeLoad<uint32_t>(in + 4) >> 7 | SafeLoad<uint32_t>(in + 5) << 25, SafeLoad<uint32_t>(in + 5), SafeLoad<uint32_t>(in + 5) >> 29 | SafeLoad<uint32_t>(in + 6) << 3, SafeLoad<uint32_t>(in + 6) >> 24 | SafeLoad<uint32_t>(in + 7) << 8, SafeLoad<uint32_t>(in + 7) >> 19 | SafeLoad<uint32_t>(in + 8) << 13, SafeLoad<uint32_t>(in + 8) >> 14 | SafeLoad<uint32_t>(in + 9) << 18, SafeLoad<uint32_t>(in + 9) >> 9 | SafeLoad<uint32_t>(in + 10) << 23, SafeLoad<uint32_t>(in + 10), SafeLoad<uint32_t>(in + 10) >> 31 | SafeLoad<uint32_t>(in + 11) << 1, SafeLoad<uint32_t>(in + 11) >> 26 | SafeLoad<uint32_t>(in + 12) << 6, SafeLoad<uint32_t>(in + 12) >> 21 | SafeLoad<uint32_t>(in + 13) << 11 };
708 shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 4, 0, 0, 0 };
709 results = (words >> shifts) & masks;
710 results.store_unaligned(out);
711 out += 16;
712
713 // extract 27-bit bundles 16 to 31
714 words = simd_batch{ SafeLoad<uint32_t>(in + 13) >> 16 | SafeLoad<uint32_t>(in + 14) << 16, SafeLoad<uint32_t>(in + 14) >> 11 | SafeLoad<uint32_t>(in + 15) << 21, SafeLoad<uint32_t>(in + 15) >> 6 | SafeLoad<uint32_t>(in + 16) << 26, SafeLoad<uint32_t>(in + 16), SafeLoad<uint32_t>(in + 16) >> 28 | SafeLoad<uint32_t>(in + 17) << 4, SafeLoad<uint32_t>(in + 17) >> 23 | SafeLoad<uint32_t>(in + 18) << 9, SafeLoad<uint32_t>(in + 18) >> 18 | SafeLoad<uint32_t>(in + 19) << 14, SafeLoad<uint32_t>(in + 19) >> 13 | SafeLoad<uint32_t>(in + 20) << 19, SafeLoad<uint32_t>(in + 20) >> 8 | SafeLoad<uint32_t>(in + 21) << 24, SafeLoad<uint32_t>(in + 21), SafeLoad<uint32_t>(in + 21) >> 30 | SafeLoad<uint32_t>(in + 22) << 2, SafeLoad<uint32_t>(in + 22) >> 25 | SafeLoad<uint32_t>(in + 23) << 7, SafeLoad<uint32_t>(in + 23) >> 20 | SafeLoad<uint32_t>(in + 24) << 12, SafeLoad<uint32_t>(in + 24) >> 15 | SafeLoad<uint32_t>(in + 25) << 17, SafeLoad<uint32_t>(in + 25) >> 10 | SafeLoad<uint32_t>(in + 26) << 22, SafeLoad<uint32_t>(in + 26) };
715 shifts = simd_batch{ 0, 0, 0, 1, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 5 };
716 results = (words >> shifts) & masks;
717 results.store_unaligned(out);
718 out += 16;
719
720 in += 27;
721 return in;
722 }
723
unpack28_32UnpackBits512724 inline static const uint32_t* unpack28_32(const uint32_t* in, uint32_t* out) {
725 uint32_t mask = 0xfffffff;
726
727 simd_batch masks(mask);
728 simd_batch words, shifts;
729 simd_batch results;
730
731 // extract 28-bit bundles 0 to 15
732 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) >> 28 | SafeLoad<uint32_t>(in + 1) << 4, SafeLoad<uint32_t>(in + 1) >> 24 | SafeLoad<uint32_t>(in + 2) << 8, SafeLoad<uint32_t>(in + 2) >> 20 | SafeLoad<uint32_t>(in + 3) << 12, SafeLoad<uint32_t>(in + 3) >> 16 | SafeLoad<uint32_t>(in + 4) << 16, SafeLoad<uint32_t>(in + 4) >> 12 | SafeLoad<uint32_t>(in + 5) << 20, SafeLoad<uint32_t>(in + 5) >> 8 | SafeLoad<uint32_t>(in + 6) << 24, SafeLoad<uint32_t>(in + 6), SafeLoad<uint32_t>(in + 7), SafeLoad<uint32_t>(in + 7) >> 28 | SafeLoad<uint32_t>(in + 8) << 4, SafeLoad<uint32_t>(in + 8) >> 24 | SafeLoad<uint32_t>(in + 9) << 8, SafeLoad<uint32_t>(in + 9) >> 20 | SafeLoad<uint32_t>(in + 10) << 12, SafeLoad<uint32_t>(in + 10) >> 16 | SafeLoad<uint32_t>(in + 11) << 16, SafeLoad<uint32_t>(in + 11) >> 12 | SafeLoad<uint32_t>(in + 12) << 20, SafeLoad<uint32_t>(in + 12) >> 8 | SafeLoad<uint32_t>(in + 13) << 24, SafeLoad<uint32_t>(in + 13) };
733 shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 4 };
734 results = (words >> shifts) & masks;
735 results.store_unaligned(out);
736 out += 16;
737
738 // extract 28-bit bundles 16 to 31
739 words = simd_batch{ SafeLoad<uint32_t>(in + 14), SafeLoad<uint32_t>(in + 14) >> 28 | SafeLoad<uint32_t>(in + 15) << 4, SafeLoad<uint32_t>(in + 15) >> 24 | SafeLoad<uint32_t>(in + 16) << 8, SafeLoad<uint32_t>(in + 16) >> 20 | SafeLoad<uint32_t>(in + 17) << 12, SafeLoad<uint32_t>(in + 17) >> 16 | SafeLoad<uint32_t>(in + 18) << 16, SafeLoad<uint32_t>(in + 18) >> 12 | SafeLoad<uint32_t>(in + 19) << 20, SafeLoad<uint32_t>(in + 19) >> 8 | SafeLoad<uint32_t>(in + 20) << 24, SafeLoad<uint32_t>(in + 20), SafeLoad<uint32_t>(in + 21), SafeLoad<uint32_t>(in + 21) >> 28 | SafeLoad<uint32_t>(in + 22) << 4, SafeLoad<uint32_t>(in + 22) >> 24 | SafeLoad<uint32_t>(in + 23) << 8, SafeLoad<uint32_t>(in + 23) >> 20 | SafeLoad<uint32_t>(in + 24) << 12, SafeLoad<uint32_t>(in + 24) >> 16 | SafeLoad<uint32_t>(in + 25) << 16, SafeLoad<uint32_t>(in + 25) >> 12 | SafeLoad<uint32_t>(in + 26) << 20, SafeLoad<uint32_t>(in + 26) >> 8 | SafeLoad<uint32_t>(in + 27) << 24, SafeLoad<uint32_t>(in + 27) };
740 shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 4 };
741 results = (words >> shifts) & masks;
742 results.store_unaligned(out);
743 out += 16;
744
745 in += 28;
746 return in;
747 }
748
unpack29_32UnpackBits512749 inline static const uint32_t* unpack29_32(const uint32_t* in, uint32_t* out) {
750 uint32_t mask = 0x1fffffff;
751
752 simd_batch masks(mask);
753 simd_batch words, shifts;
754 simd_batch results;
755
756 // extract 29-bit bundles 0 to 15
757 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) >> 29 | SafeLoad<uint32_t>(in + 1) << 3, SafeLoad<uint32_t>(in + 1) >> 26 | SafeLoad<uint32_t>(in + 2) << 6, SafeLoad<uint32_t>(in + 2) >> 23 | SafeLoad<uint32_t>(in + 3) << 9, SafeLoad<uint32_t>(in + 3) >> 20 | SafeLoad<uint32_t>(in + 4) << 12, SafeLoad<uint32_t>(in + 4) >> 17 | SafeLoad<uint32_t>(in + 5) << 15, SafeLoad<uint32_t>(in + 5) >> 14 | SafeLoad<uint32_t>(in + 6) << 18, SafeLoad<uint32_t>(in + 6) >> 11 | SafeLoad<uint32_t>(in + 7) << 21, SafeLoad<uint32_t>(in + 7) >> 8 | SafeLoad<uint32_t>(in + 8) << 24, SafeLoad<uint32_t>(in + 8) >> 5 | SafeLoad<uint32_t>(in + 9) << 27, SafeLoad<uint32_t>(in + 9), SafeLoad<uint32_t>(in + 9) >> 31 | SafeLoad<uint32_t>(in + 10) << 1, SafeLoad<uint32_t>(in + 10) >> 28 | SafeLoad<uint32_t>(in + 11) << 4, SafeLoad<uint32_t>(in + 11) >> 25 | SafeLoad<uint32_t>(in + 12) << 7, SafeLoad<uint32_t>(in + 12) >> 22 | SafeLoad<uint32_t>(in + 13) << 10, SafeLoad<uint32_t>(in + 13) >> 19 | SafeLoad<uint32_t>(in + 14) << 13 };
758 shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0 };
759 results = (words >> shifts) & masks;
760 results.store_unaligned(out);
761 out += 16;
762
763 // extract 29-bit bundles 16 to 31
764 words = simd_batch{ SafeLoad<uint32_t>(in + 14) >> 16 | SafeLoad<uint32_t>(in + 15) << 16, SafeLoad<uint32_t>(in + 15) >> 13 | SafeLoad<uint32_t>(in + 16) << 19, SafeLoad<uint32_t>(in + 16) >> 10 | SafeLoad<uint32_t>(in + 17) << 22, SafeLoad<uint32_t>(in + 17) >> 7 | SafeLoad<uint32_t>(in + 18) << 25, SafeLoad<uint32_t>(in + 18) >> 4 | SafeLoad<uint32_t>(in + 19) << 28, SafeLoad<uint32_t>(in + 19), SafeLoad<uint32_t>(in + 19) >> 30 | SafeLoad<uint32_t>(in + 20) << 2, SafeLoad<uint32_t>(in + 20) >> 27 | SafeLoad<uint32_t>(in + 21) << 5, SafeLoad<uint32_t>(in + 21) >> 24 | SafeLoad<uint32_t>(in + 22) << 8, SafeLoad<uint32_t>(in + 22) >> 21 | SafeLoad<uint32_t>(in + 23) << 11, SafeLoad<uint32_t>(in + 23) >> 18 | SafeLoad<uint32_t>(in + 24) << 14, SafeLoad<uint32_t>(in + 24) >> 15 | SafeLoad<uint32_t>(in + 25) << 17, SafeLoad<uint32_t>(in + 25) >> 12 | SafeLoad<uint32_t>(in + 26) << 20, SafeLoad<uint32_t>(in + 26) >> 9 | SafeLoad<uint32_t>(in + 27) << 23, SafeLoad<uint32_t>(in + 27) >> 6 | SafeLoad<uint32_t>(in + 28) << 26, SafeLoad<uint32_t>(in + 28) };
765 shifts = simd_batch{ 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3 };
766 results = (words >> shifts) & masks;
767 results.store_unaligned(out);
768 out += 16;
769
770 in += 29;
771 return in;
772 }
773
unpack30_32UnpackBits512774 inline static const uint32_t* unpack30_32(const uint32_t* in, uint32_t* out) {
775 uint32_t mask = 0x3fffffff;
776
777 simd_batch masks(mask);
778 simd_batch words, shifts;
779 simd_batch results;
780
781 // extract 30-bit bundles 0 to 15
782 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) >> 30 | SafeLoad<uint32_t>(in + 1) << 2, SafeLoad<uint32_t>(in + 1) >> 28 | SafeLoad<uint32_t>(in + 2) << 4, SafeLoad<uint32_t>(in + 2) >> 26 | SafeLoad<uint32_t>(in + 3) << 6, SafeLoad<uint32_t>(in + 3) >> 24 | SafeLoad<uint32_t>(in + 4) << 8, SafeLoad<uint32_t>(in + 4) >> 22 | SafeLoad<uint32_t>(in + 5) << 10, SafeLoad<uint32_t>(in + 5) >> 20 | SafeLoad<uint32_t>(in + 6) << 12, SafeLoad<uint32_t>(in + 6) >> 18 | SafeLoad<uint32_t>(in + 7) << 14, SafeLoad<uint32_t>(in + 7) >> 16 | SafeLoad<uint32_t>(in + 8) << 16, SafeLoad<uint32_t>(in + 8) >> 14 | SafeLoad<uint32_t>(in + 9) << 18, SafeLoad<uint32_t>(in + 9) >> 12 | SafeLoad<uint32_t>(in + 10) << 20, SafeLoad<uint32_t>(in + 10) >> 10 | SafeLoad<uint32_t>(in + 11) << 22, SafeLoad<uint32_t>(in + 11) >> 8 | SafeLoad<uint32_t>(in + 12) << 24, SafeLoad<uint32_t>(in + 12) >> 6 | SafeLoad<uint32_t>(in + 13) << 26, SafeLoad<uint32_t>(in + 13) >> 4 | SafeLoad<uint32_t>(in + 14) << 28, SafeLoad<uint32_t>(in + 14) };
783 shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2 };
784 results = (words >> shifts) & masks;
785 results.store_unaligned(out);
786 out += 16;
787
788 // extract 30-bit bundles 16 to 31
789 words = simd_batch{ SafeLoad<uint32_t>(in + 15), SafeLoad<uint32_t>(in + 15) >> 30 | SafeLoad<uint32_t>(in + 16) << 2, SafeLoad<uint32_t>(in + 16) >> 28 | SafeLoad<uint32_t>(in + 17) << 4, SafeLoad<uint32_t>(in + 17) >> 26 | SafeLoad<uint32_t>(in + 18) << 6, SafeLoad<uint32_t>(in + 18) >> 24 | SafeLoad<uint32_t>(in + 19) << 8, SafeLoad<uint32_t>(in + 19) >> 22 | SafeLoad<uint32_t>(in + 20) << 10, SafeLoad<uint32_t>(in + 20) >> 20 | SafeLoad<uint32_t>(in + 21) << 12, SafeLoad<uint32_t>(in + 21) >> 18 | SafeLoad<uint32_t>(in + 22) << 14, SafeLoad<uint32_t>(in + 22) >> 16 | SafeLoad<uint32_t>(in + 23) << 16, SafeLoad<uint32_t>(in + 23) >> 14 | SafeLoad<uint32_t>(in + 24) << 18, SafeLoad<uint32_t>(in + 24) >> 12 | SafeLoad<uint32_t>(in + 25) << 20, SafeLoad<uint32_t>(in + 25) >> 10 | SafeLoad<uint32_t>(in + 26) << 22, SafeLoad<uint32_t>(in + 26) >> 8 | SafeLoad<uint32_t>(in + 27) << 24, SafeLoad<uint32_t>(in + 27) >> 6 | SafeLoad<uint32_t>(in + 28) << 26, SafeLoad<uint32_t>(in + 28) >> 4 | SafeLoad<uint32_t>(in + 29) << 28, SafeLoad<uint32_t>(in + 29) };
790 shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2 };
791 results = (words >> shifts) & masks;
792 results.store_unaligned(out);
793 out += 16;
794
795 in += 30;
796 return in;
797 }
798
unpack31_32UnpackBits512799 inline static const uint32_t* unpack31_32(const uint32_t* in, uint32_t* out) {
800 uint32_t mask = 0x7fffffff;
801
802 simd_batch masks(mask);
803 simd_batch words, shifts;
804 simd_batch results;
805
806 // extract 31-bit bundles 0 to 15
807 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) >> 31 | SafeLoad<uint32_t>(in + 1) << 1, SafeLoad<uint32_t>(in + 1) >> 30 | SafeLoad<uint32_t>(in + 2) << 2, SafeLoad<uint32_t>(in + 2) >> 29 | SafeLoad<uint32_t>(in + 3) << 3, SafeLoad<uint32_t>(in + 3) >> 28 | SafeLoad<uint32_t>(in + 4) << 4, SafeLoad<uint32_t>(in + 4) >> 27 | SafeLoad<uint32_t>(in + 5) << 5, SafeLoad<uint32_t>(in + 5) >> 26 | SafeLoad<uint32_t>(in + 6) << 6, SafeLoad<uint32_t>(in + 6) >> 25 | SafeLoad<uint32_t>(in + 7) << 7, SafeLoad<uint32_t>(in + 7) >> 24 | SafeLoad<uint32_t>(in + 8) << 8, SafeLoad<uint32_t>(in + 8) >> 23 | SafeLoad<uint32_t>(in + 9) << 9, SafeLoad<uint32_t>(in + 9) >> 22 | SafeLoad<uint32_t>(in + 10) << 10, SafeLoad<uint32_t>(in + 10) >> 21 | SafeLoad<uint32_t>(in + 11) << 11, SafeLoad<uint32_t>(in + 11) >> 20 | SafeLoad<uint32_t>(in + 12) << 12, SafeLoad<uint32_t>(in + 12) >> 19 | SafeLoad<uint32_t>(in + 13) << 13, SafeLoad<uint32_t>(in + 13) >> 18 | SafeLoad<uint32_t>(in + 14) << 14, SafeLoad<uint32_t>(in + 14) >> 17 | SafeLoad<uint32_t>(in + 15) << 15 };
808 shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
809 results = (words >> shifts) & masks;
810 results.store_unaligned(out);
811 out += 16;
812
813 // extract 31-bit bundles 16 to 31
814 words = simd_batch{ SafeLoad<uint32_t>(in + 15) >> 16 | SafeLoad<uint32_t>(in + 16) << 16, SafeLoad<uint32_t>(in + 16) >> 15 | SafeLoad<uint32_t>(in + 17) << 17, SafeLoad<uint32_t>(in + 17) >> 14 | SafeLoad<uint32_t>(in + 18) << 18, SafeLoad<uint32_t>(in + 18) >> 13 | SafeLoad<uint32_t>(in + 19) << 19, SafeLoad<uint32_t>(in + 19) >> 12 | SafeLoad<uint32_t>(in + 20) << 20, SafeLoad<uint32_t>(in + 20) >> 11 | SafeLoad<uint32_t>(in + 21) << 21, SafeLoad<uint32_t>(in + 21) >> 10 | SafeLoad<uint32_t>(in + 22) << 22, SafeLoad<uint32_t>(in + 22) >> 9 | SafeLoad<uint32_t>(in + 23) << 23, SafeLoad<uint32_t>(in + 23) >> 8 | SafeLoad<uint32_t>(in + 24) << 24, SafeLoad<uint32_t>(in + 24) >> 7 | SafeLoad<uint32_t>(in + 25) << 25, SafeLoad<uint32_t>(in + 25) >> 6 | SafeLoad<uint32_t>(in + 26) << 26, SafeLoad<uint32_t>(in + 26) >> 5 | SafeLoad<uint32_t>(in + 27) << 27, SafeLoad<uint32_t>(in + 27) >> 4 | SafeLoad<uint32_t>(in + 28) << 28, SafeLoad<uint32_t>(in + 28) >> 3 | SafeLoad<uint32_t>(in + 29) << 29, SafeLoad<uint32_t>(in + 29) >> 2 | SafeLoad<uint32_t>(in + 30) << 30, SafeLoad<uint32_t>(in + 30) };
815 shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 };
816 results = (words >> shifts) & masks;
817 results.store_unaligned(out);
818 out += 16;
819
820 in += 31;
821 return in;
822 }
823
unpack32_32UnpackBits512824 inline static const uint32_t* unpack32_32(const uint32_t* in, uint32_t* out) {
825 memcpy(out, in, 32 * sizeof(*out));
826 in += 32;
827 out += 32;
828
829 return in;
830 }
831
832 }; // struct UnpackBits512
833
834 } // namespace
835 } // namespace internal
836 } // namespace arrow
837
838