1 /* AUTOMATICALLY GENERATED FILE, DO NOT MODIFY */
2 /* e8b7a2ec175ceb3725ce0827ef9a6725b6309cc9 */
3 /* :: Begin ../simde/simde/x86/avx2.h :: */
4 /* SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person
7  * obtaining a copy of this software and associated documentation
8  * files (the "Software"), to deal in the Software without
9  * restriction, including without limitation the rights to use, copy,
10  * modify, merge, publish, distribute, sublicense, and/or sell copies
11  * of the Software, and to permit persons to whom the Software is
12  * furnished to do so, subject to the following conditions:
13  *
14  * The above copyright notice and this permission notice shall be
15  * included in all copies or substantial portions of the Software.
16  *
17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
18  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
19  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
20  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
21  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
22  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
23  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
24  * SOFTWARE.
25  *
26  * Copyright:
27  *   2018-2020 Evan Nemerson <evan@nemerson.com>
28  *   2019-2020 Michael R. Crusoe <crusoe@debian.org>
29  *   2020      Himanshi Mathur <himanshi18037@iiitd.ac.in>
30  *   2020      Hidayat Khan <huk2209@gmail.com>
31  */
32 
33 #if !defined(SIMDE_X86_AVX2_H)
34 #define SIMDE_X86_AVX2_H
35 
36 /* AUTOMATICALLY GENERATED FILE, DO NOT MODIFY */
37 /* e8b7a2ec175ceb3725ce0827ef9a6725b6309cc9 */
38 /* :: Begin ../simde/simde/x86/avx.h :: */
39 /* SPDX-License-Identifier: MIT
40  *
41  * Permission is hereby granted, free of charge, to any person
42  * obtaining a copy of this software and associated documentation
43  * files (the "Software"), to deal in the Software without
44  * restriction, including without limitation the rights to use, copy,
45  * modify, merge, publish, distribute, sublicense, and/or sell copies
46  * of the Software, and to permit persons to whom the Software is
47  * furnished to do so, subject to the following conditions:
48  *
49  * The above copyright notice and this permission notice shall be
50  * included in all copies or substantial portions of the Software.
51  *
52  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
53  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
54  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
55  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
56  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
57  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
58  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
59  * SOFTWARE.
60  *
61  * Copyright:
62  *   2018-2020 Evan Nemerson <evan@nemerson.com>
63  *        2020 Michael R. Crusoe <crusoe@debian.org>
64  */
65 
66 /* AUTOMATICALLY GENERATED FILE, DO NOT MODIFY */
67 /* e8b7a2ec175ceb3725ce0827ef9a6725b6309cc9 */
68 /* :: Begin ../simde/simde/x86/sse.h :: */
69 /* SPDX-License-Identifier: MIT
70  *
71  * Permission is hereby granted, free of charge, to any person
72  * obtaining a copy of this software and associated documentation
73  * files (the "Software"), to deal in the Software without
74  * restriction, including without limitation the rights to use, copy,
75  * modify, merge, publish, distribute, sublicense, and/or sell copies
76  * of the Software, and to permit persons to whom the Software is
77  * furnished to do so, subject to the following conditions:
78  *
79  * The above copyright notice and this permission notice shall be
80  * included in all copies or substantial portions of the Software.
81  *
82  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
83  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
84  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
85  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
86  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
87  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
88  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
89  * SOFTWARE.
90  *
91  * Copyright:
92  *   2017-2020 Evan Nemerson <evan@nemerson.com>
93  *   2015-2017 John W. Ratcliff <jratcliffscarab@gmail.com>
94  *   2015      Brandon Rowlett <browlett@nvidia.com>
95  *   2015      Ken Fast <kfast@gdeb.com>
96  */
97 
98 #if !defined(SIMDE_X86_SSE_H)
99 #define SIMDE_X86_SSE_H
100 
101 /* AUTOMATICALLY GENERATED FILE, DO NOT MODIFY */
102 /* e8b7a2ec175ceb3725ce0827ef9a6725b6309cc9 */
103 /* :: Begin ../simde/simde/x86/mmx.h :: */
104 /* SPDX-License-Identifier: MIT
105  *
106  * Permission is hereby granted, free of charge, to any person
107  * obtaining a copy of this software and associated documentation
108  * files (the "Software"), to deal in the Software without
109  * restriction, including without limitation the rights to use, copy,
110  * modify, merge, publish, distribute, sublicense, and/or sell copies
111  * of the Software, and to permit persons to whom the Software is
112  * furnished to do so, subject to the following conditions:
113  *
114  * The above copyright notice and this permission notice shall be
115  * included in all copies or substantial portions of the Software.
116  *
117  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
118  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
119  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
120  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
121  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
122  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
123  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
124  * SOFTWARE.
125  *
126  * Copyright:
127  *   2017-2020 Evan Nemerson <evan@nemerson.com>
128  */
129 
130 #if !defined(SIMDE_X86_MMX_H)
131 #define SIMDE_X86_MMX_H
132 
133 /* AUTOMATICALLY GENERATED FILE, DO NOT MODIFY */
134 /* e8b7a2ec175ceb3725ce0827ef9a6725b6309cc9 */
135 /* :: Begin ../simde/simde/simde-common.h :: */
136 /* SPDX-License-Identifier: MIT
137  *
138  * Permission is hereby granted, free of charge, to any person
139  * obtaining a copy of this software and associated documentation
140  * files (the "Software"), to deal in the Software without
141  * restriction, including without limitation the rights to use, copy,
142  * modify, merge, publish, distribute, sublicense, and/or sell copies
143  * of the Software, and to permit persons to whom the Software is
144  * furnished to do so, subject to the following conditions:
145  *
146  * The above copyright notice and this permission notice shall be
147  * included in all copies or substantial portions of the Software.
148  *
149  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
150  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
151  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
152  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
153  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
154  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
155  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
156  * SOFTWARE.
157  *
158  * Copyright:
159  *   2017-2020 Evan Nemerson <evan@nemerson.com>
160  */
161 
162 #if !defined(SIMDE_COMMON_H)
163 #define SIMDE_COMMON_H
164 
165 /* AUTOMATICALLY GENERATED FILE, DO NOT MODIFY */
166 /* e8b7a2ec175ceb3725ce0827ef9a6725b6309cc9 */
167 /* :: Begin ../simde/simde/hedley.h :: */
168 /* Hedley - https://nemequ.github.io/hedley
169  * Created by Evan Nemerson <evan@nemerson.com>
170  *
171  * To the extent possible under law, the author(s) have dedicated all
172  * copyright and related and neighboring rights to this software to
173  * the public domain worldwide. This software is distributed without
174  * any warranty.
175  *
176  * For details, see <http://creativecommons.org/publicdomain/zero/1.0/>.
177  * SPDX-License-Identifier: CC0-1.0
178  */
179 
180 #if !defined(HEDLEY_VERSION) || (HEDLEY_VERSION < 15)
181 #if defined(HEDLEY_VERSION)
182 #  undef HEDLEY_VERSION
183 #endif
184 #define HEDLEY_VERSION 15
185 
186 #if defined(HEDLEY_STRINGIFY_EX)
187 #  undef HEDLEY_STRINGIFY_EX
188 #endif
189 #define HEDLEY_STRINGIFY_EX(x) #x
190 
191 #if defined(HEDLEY_STRINGIFY)
192 #  undef HEDLEY_STRINGIFY
193 #endif
194 #define HEDLEY_STRINGIFY(x) HEDLEY_STRINGIFY_EX(x)
195 
196 #if defined(HEDLEY_CONCAT_EX)
197 #  undef HEDLEY_CONCAT_EX
198 #endif
199 #define HEDLEY_CONCAT_EX(a,b) a##b
200 
201 #if defined(HEDLEY_CONCAT)
202 #  undef HEDLEY_CONCAT
203 #endif
204 #define HEDLEY_CONCAT(a,b) HEDLEY_CONCAT_EX(a,b)
205 
206 #if defined(HEDLEY_CONCAT3_EX)
207 #  undef HEDLEY_CONCAT3_EX
208 #endif
209 #define HEDLEY_CONCAT3_EX(a,b,c) a##b##c
210 
211 #if defined(HEDLEY_CONCAT3)
212 #  undef HEDLEY_CONCAT3
213 #endif
214 #define HEDLEY_CONCAT3(a,b,c) HEDLEY_CONCAT3_EX(a,b,c)
215 
216 #if defined(HEDLEY_VERSION_ENCODE)
217 #  undef HEDLEY_VERSION_ENCODE
218 #endif
219 #define HEDLEY_VERSION_ENCODE(major,minor,revision) (((major) * 1000000) + ((minor) * 1000) + (revision))
220 
221 #if defined(HEDLEY_VERSION_DECODE_MAJOR)
222 #  undef HEDLEY_VERSION_DECODE_MAJOR
223 #endif
224 #define HEDLEY_VERSION_DECODE_MAJOR(version) ((version) / 1000000)
225 
226 #if defined(HEDLEY_VERSION_DECODE_MINOR)
227 #  undef HEDLEY_VERSION_DECODE_MINOR
228 #endif
229 #define HEDLEY_VERSION_DECODE_MINOR(version) (((version) % 1000000) / 1000)
230 
231 #if defined(HEDLEY_VERSION_DECODE_REVISION)
232 #  undef HEDLEY_VERSION_DECODE_REVISION
233 #endif
234 #define HEDLEY_VERSION_DECODE_REVISION(version) ((version) % 1000)
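
/* For illustration only: the encoding packs major.minor.revision into one
 * integer as major*1000000 + minor*1000 + revision, so the decode macros
 * simply invert that arithmetic.  E.g. (4.6.0 is just an example triple):
 *
 *   HEDLEY_VERSION_ENCODE(4,6,0)            -> 4006000
 *   HEDLEY_VERSION_DECODE_MAJOR(4006000)    -> 4
 *   HEDLEY_VERSION_DECODE_MINOR(4006000)    -> 6
 *   HEDLEY_VERSION_DECODE_REVISION(4006000) -> 0
 */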
235 
236 #if defined(HEDLEY_GNUC_VERSION)
237 #  undef HEDLEY_GNUC_VERSION
238 #endif
239 #if defined(__GNUC__) && defined(__GNUC_PATCHLEVEL__)
240 #  define HEDLEY_GNUC_VERSION HEDLEY_VERSION_ENCODE(__GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__)
241 #elif defined(__GNUC__)
242 #  define HEDLEY_GNUC_VERSION HEDLEY_VERSION_ENCODE(__GNUC__, __GNUC_MINOR__, 0)
243 #endif
244 
245 #if defined(HEDLEY_GNUC_VERSION_CHECK)
246 #  undef HEDLEY_GNUC_VERSION_CHECK
247 #endif
248 #if defined(HEDLEY_GNUC_VERSION)
249 #  define HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) (HEDLEY_GNUC_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch))
250 #else
251 #  define HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) (0)
252 #endif
253 
254 #if defined(HEDLEY_MSVC_VERSION)
255 #  undef HEDLEY_MSVC_VERSION
256 #endif
257 #if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 140000000) && !defined(__ICL)
258 #  define HEDLEY_MSVC_VERSION HEDLEY_VERSION_ENCODE(_MSC_FULL_VER / 10000000, (_MSC_FULL_VER % 10000000) / 100000, (_MSC_FULL_VER % 100000) / 100)
259 #elif defined(_MSC_FULL_VER) && !defined(__ICL)
260 #  define HEDLEY_MSVC_VERSION HEDLEY_VERSION_ENCODE(_MSC_FULL_VER / 1000000, (_MSC_FULL_VER % 1000000) / 10000, (_MSC_FULL_VER % 10000) / 10)
261 #elif defined(_MSC_VER) && !defined(__ICL)
262 #  define HEDLEY_MSVC_VERSION HEDLEY_VERSION_ENCODE(_MSC_VER / 100, _MSC_VER % 100, 0)
263 #endif
264 
265 #if defined(HEDLEY_MSVC_VERSION_CHECK)
266 #  undef HEDLEY_MSVC_VERSION_CHECK
267 #endif
268 #if !defined(HEDLEY_MSVC_VERSION)
269 #  define HEDLEY_MSVC_VERSION_CHECK(major,minor,patch) (0)
270 #elif defined(_MSC_VER) && (_MSC_VER >= 1400)
271 #  define HEDLEY_MSVC_VERSION_CHECK(major,minor,patch) (_MSC_FULL_VER >= ((major * 10000000) + (minor * 100000) + (patch)))
272 #elif defined(_MSC_VER) && (_MSC_VER >= 1200)
273 #  define HEDLEY_MSVC_VERSION_CHECK(major,minor,patch) (_MSC_FULL_VER >= ((major * 1000000) + (minor * 10000) + (patch)))
274 #else
275 #  define HEDLEY_MSVC_VERSION_CHECK(major,minor,patch) (_MSC_VER >= ((major * 100) + (minor)))
276 #endif
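
/* Worked example (the version numbers are illustrative only): when
 * _MSC_VER >= 1400, HEDLEY_MSVC_VERSION_CHECK(19,20,0) expands to
 * (_MSC_FULL_VER >= 192000000), i.e. it compares against the same
 * nine-digit _MSC_FULL_VER layout decoded above. */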
277 
278 #if defined(HEDLEY_INTEL_VERSION)
279 #  undef HEDLEY_INTEL_VERSION
280 #endif
281 #if defined(__INTEL_COMPILER) && defined(__INTEL_COMPILER_UPDATE) && !defined(__ICL)
282 #  define HEDLEY_INTEL_VERSION HEDLEY_VERSION_ENCODE(__INTEL_COMPILER / 100, __INTEL_COMPILER % 100, __INTEL_COMPILER_UPDATE)
283 #elif defined(__INTEL_COMPILER) && !defined(__ICL)
284 #  define HEDLEY_INTEL_VERSION HEDLEY_VERSION_ENCODE(__INTEL_COMPILER / 100, __INTEL_COMPILER % 100, 0)
285 #endif
286 
287 #if defined(HEDLEY_INTEL_VERSION_CHECK)
288 #  undef HEDLEY_INTEL_VERSION_CHECK
289 #endif
290 #if defined(HEDLEY_INTEL_VERSION)
291 #  define HEDLEY_INTEL_VERSION_CHECK(major,minor,patch) (HEDLEY_INTEL_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch))
292 #else
293 #  define HEDLEY_INTEL_VERSION_CHECK(major,minor,patch) (0)
294 #endif
295 
296 #if defined(HEDLEY_INTEL_CL_VERSION)
297 #  undef HEDLEY_INTEL_CL_VERSION
298 #endif
299 #if defined(__INTEL_COMPILER) && defined(__INTEL_COMPILER_UPDATE) && defined(__ICL)
300 #  define HEDLEY_INTEL_CL_VERSION HEDLEY_VERSION_ENCODE(__INTEL_COMPILER, __INTEL_COMPILER_UPDATE, 0)
301 #endif
302 
303 #if defined(HEDLEY_INTEL_CL_VERSION_CHECK)
304 #  undef HEDLEY_INTEL_CL_VERSION_CHECK
305 #endif
306 #if defined(HEDLEY_INTEL_CL_VERSION)
307 #  define HEDLEY_INTEL_CL_VERSION_CHECK(major,minor,patch) (HEDLEY_INTEL_CL_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch))
308 #else
309 #  define HEDLEY_INTEL_CL_VERSION_CHECK(major,minor,patch) (0)
310 #endif
311 
312 #if defined(HEDLEY_PGI_VERSION)
313 #  undef HEDLEY_PGI_VERSION
314 #endif
315 #if defined(__PGI) && defined(__PGIC__) && defined(__PGIC_MINOR__) && defined(__PGIC_PATCHLEVEL__)
316 #  define HEDLEY_PGI_VERSION HEDLEY_VERSION_ENCODE(__PGIC__, __PGIC_MINOR__, __PGIC_PATCHLEVEL__)
317 #endif
318 
319 #if defined(HEDLEY_PGI_VERSION_CHECK)
320 #  undef HEDLEY_PGI_VERSION_CHECK
321 #endif
322 #if defined(HEDLEY_PGI_VERSION)
323 #  define HEDLEY_PGI_VERSION_CHECK(major,minor,patch) (HEDLEY_PGI_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch))
324 #else
325 #  define HEDLEY_PGI_VERSION_CHECK(major,minor,patch) (0)
326 #endif
327 
328 #if defined(HEDLEY_SUNPRO_VERSION)
329 #  undef HEDLEY_SUNPRO_VERSION
330 #endif
331 #if defined(__SUNPRO_C) && (__SUNPRO_C > 0x1000)
332 #  define HEDLEY_SUNPRO_VERSION HEDLEY_VERSION_ENCODE((((__SUNPRO_C >> 16) & 0xf) * 10) + ((__SUNPRO_C >> 12) & 0xf), (((__SUNPRO_C >> 8) & 0xf) * 10) + ((__SUNPRO_C >> 4) & 0xf), (__SUNPRO_C & 0xf) * 10)
333 #elif defined(__SUNPRO_C)
334 #  define HEDLEY_SUNPRO_VERSION HEDLEY_VERSION_ENCODE((__SUNPRO_C >> 8) & 0xf, (__SUNPRO_C >> 4) & 0xf, (__SUNPRO_C) & 0xf)
335 #elif defined(__SUNPRO_CC) && (__SUNPRO_CC > 0x1000)
336 #  define HEDLEY_SUNPRO_VERSION HEDLEY_VERSION_ENCODE((((__SUNPRO_CC >> 16) & 0xf) * 10) + ((__SUNPRO_CC >> 12) & 0xf), (((__SUNPRO_CC >> 8) & 0xf) * 10) + ((__SUNPRO_CC >> 4) & 0xf), (__SUNPRO_CC & 0xf) * 10)
337 #elif defined(__SUNPRO_CC)
338 #  define HEDLEY_SUNPRO_VERSION HEDLEY_VERSION_ENCODE((__SUNPRO_CC >> 8) & 0xf, (__SUNPRO_CC >> 4) & 0xf, (__SUNPRO_CC) & 0xf)
339 #endif
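
/* The SunPro scheme stores one decimal digit per nibble.  As an
 * illustrative value, __SUNPRO_C == 0x5150 takes the "> 0x1000" branch and
 * yields major = 0*10 + 5, minor = 1*10 + 5, patch = 0*10, i.e.
 * HEDLEY_VERSION_ENCODE(5,15,0). */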
340 
341 #if defined(HEDLEY_SUNPRO_VERSION_CHECK)
342 #  undef HEDLEY_SUNPRO_VERSION_CHECK
343 #endif
344 #if defined(HEDLEY_SUNPRO_VERSION)
345 #  define HEDLEY_SUNPRO_VERSION_CHECK(major,minor,patch) (HEDLEY_SUNPRO_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch))
346 #else
347 #  define HEDLEY_SUNPRO_VERSION_CHECK(major,minor,patch) (0)
348 #endif
349 
350 #if defined(HEDLEY_EMSCRIPTEN_VERSION)
351 #  undef HEDLEY_EMSCRIPTEN_VERSION
352 #endif
353 #if defined(__EMSCRIPTEN__)
354 #  define HEDLEY_EMSCRIPTEN_VERSION HEDLEY_VERSION_ENCODE(__EMSCRIPTEN_major__, __EMSCRIPTEN_minor__, __EMSCRIPTEN_tiny__)
355 #endif
356 
357 #if defined(HEDLEY_EMSCRIPTEN_VERSION_CHECK)
358 #  undef HEDLEY_EMSCRIPTEN_VERSION_CHECK
359 #endif
360 #if defined(HEDLEY_EMSCRIPTEN_VERSION)
361 #  define HEDLEY_EMSCRIPTEN_VERSION_CHECK(major,minor,patch) (HEDLEY_EMSCRIPTEN_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch))
362 #else
363 #  define HEDLEY_EMSCRIPTEN_VERSION_CHECK(major,minor,patch) (0)
364 #endif
365 
366 #if defined(HEDLEY_ARM_VERSION)
367 #  undef HEDLEY_ARM_VERSION
368 #endif
369 #if defined(__CC_ARM) && defined(__ARMCOMPILER_VERSION)
370 #  define HEDLEY_ARM_VERSION HEDLEY_VERSION_ENCODE(__ARMCOMPILER_VERSION / 1000000, (__ARMCOMPILER_VERSION % 1000000) / 10000, (__ARMCOMPILER_VERSION % 10000) / 100)
371 #elif defined(__CC_ARM) && defined(__ARMCC_VERSION)
372 #  define HEDLEY_ARM_VERSION HEDLEY_VERSION_ENCODE(__ARMCC_VERSION / 1000000, (__ARMCC_VERSION % 1000000) / 10000, (__ARMCC_VERSION % 10000) / 100)
373 #endif
374 
375 #if defined(HEDLEY_ARM_VERSION_CHECK)
376 #  undef HEDLEY_ARM_VERSION_CHECK
377 #endif
378 #if defined(HEDLEY_ARM_VERSION)
379 #  define HEDLEY_ARM_VERSION_CHECK(major,minor,patch) (HEDLEY_ARM_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch))
380 #else
381 #  define HEDLEY_ARM_VERSION_CHECK(major,minor,patch) (0)
382 #endif
383 
384 #if defined(HEDLEY_IBM_VERSION)
385 #  undef HEDLEY_IBM_VERSION
386 #endif
387 #if defined(__ibmxl__)
388 #  define HEDLEY_IBM_VERSION HEDLEY_VERSION_ENCODE(__ibmxl_version__, __ibmxl_release__, __ibmxl_modification__)
389 #elif defined(__xlC__) && defined(__xlC_ver__)
390 #  define HEDLEY_IBM_VERSION HEDLEY_VERSION_ENCODE(__xlC__ >> 8, __xlC__ & 0xff, (__xlC_ver__ >> 8) & 0xff)
391 #elif defined(__xlC__)
392 #  define HEDLEY_IBM_VERSION HEDLEY_VERSION_ENCODE(__xlC__ >> 8, __xlC__ & 0xff, 0)
393 #endif
394 
395 #if defined(HEDLEY_IBM_VERSION_CHECK)
396 #  undef HEDLEY_IBM_VERSION_CHECK
397 #endif
398 #if defined(HEDLEY_IBM_VERSION)
399 #  define HEDLEY_IBM_VERSION_CHECK(major,minor,patch) (HEDLEY_IBM_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch))
400 #else
401 #  define HEDLEY_IBM_VERSION_CHECK(major,minor,patch) (0)
402 #endif
403 
404 #if defined(HEDLEY_TI_VERSION)
405 #  undef HEDLEY_TI_VERSION
406 #endif
407 #if \
408     defined(__TI_COMPILER_VERSION__) && \
409     ( \
410       defined(__TMS470__) || defined(__TI_ARM__) || \
411       defined(__MSP430__) || \
412       defined(__TMS320C2000__) \
413     )
414 #  if (__TI_COMPILER_VERSION__ >= 16000000)
415 #    define HEDLEY_TI_VERSION HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000))
416 #  endif
417 #endif
418 
419 #if defined(HEDLEY_TI_VERSION_CHECK)
420 #  undef HEDLEY_TI_VERSION_CHECK
421 #endif
422 #if defined(HEDLEY_TI_VERSION)
423 #  define HEDLEY_TI_VERSION_CHECK(major,minor,patch) (HEDLEY_TI_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch))
424 #else
425 #  define HEDLEY_TI_VERSION_CHECK(major,minor,patch) (0)
426 #endif
427 
428 #if defined(HEDLEY_TI_CL2000_VERSION)
429 #  undef HEDLEY_TI_CL2000_VERSION
430 #endif
431 #if defined(__TI_COMPILER_VERSION__) && defined(__TMS320C2000__)
432 #  define HEDLEY_TI_CL2000_VERSION HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000))
433 #endif
434 
435 #if defined(HEDLEY_TI_CL2000_VERSION_CHECK)
436 #  undef HEDLEY_TI_CL2000_VERSION_CHECK
437 #endif
438 #if defined(HEDLEY_TI_CL2000_VERSION)
439 #  define HEDLEY_TI_CL2000_VERSION_CHECK(major,minor,patch) (HEDLEY_TI_CL2000_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch))
440 #else
441 #  define HEDLEY_TI_CL2000_VERSION_CHECK(major,minor,patch) (0)
442 #endif
443 
444 #if defined(HEDLEY_TI_CL430_VERSION)
445 #  undef HEDLEY_TI_CL430_VERSION
446 #endif
447 #if defined(__TI_COMPILER_VERSION__) && defined(__MSP430__)
448 #  define HEDLEY_TI_CL430_VERSION HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000))
449 #endif
450 
451 #if defined(HEDLEY_TI_CL430_VERSION_CHECK)
452 #  undef HEDLEY_TI_CL430_VERSION_CHECK
453 #endif
454 #if defined(HEDLEY_TI_CL430_VERSION)
455 #  define HEDLEY_TI_CL430_VERSION_CHECK(major,minor,patch) (HEDLEY_TI_CL430_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch))
456 #else
457 #  define HEDLEY_TI_CL430_VERSION_CHECK(major,minor,patch) (0)
458 #endif
459 
460 #if defined(HEDLEY_TI_ARMCL_VERSION)
461 #  undef HEDLEY_TI_ARMCL_VERSION
462 #endif
463 #if defined(__TI_COMPILER_VERSION__) && (defined(__TMS470__) || defined(__TI_ARM__))
464 #  define HEDLEY_TI_ARMCL_VERSION HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000))
465 #endif
466 
467 #if defined(HEDLEY_TI_ARMCL_VERSION_CHECK)
468 #  undef HEDLEY_TI_ARMCL_VERSION_CHECK
469 #endif
470 #if defined(HEDLEY_TI_ARMCL_VERSION)
471 #  define HEDLEY_TI_ARMCL_VERSION_CHECK(major,minor,patch) (HEDLEY_TI_ARMCL_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch))
472 #else
473 #  define HEDLEY_TI_ARMCL_VERSION_CHECK(major,minor,patch) (0)
474 #endif
475 
476 #if defined(HEDLEY_TI_CL6X_VERSION)
477 #  undef HEDLEY_TI_CL6X_VERSION
478 #endif
479 #if defined(__TI_COMPILER_VERSION__) && defined(__TMS320C6X__)
480 #  define HEDLEY_TI_CL6X_VERSION HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000))
481 #endif
482 
483 #if defined(HEDLEY_TI_CL6X_VERSION_CHECK)
484 #  undef HEDLEY_TI_CL6X_VERSION_CHECK
485 #endif
486 #if defined(HEDLEY_TI_CL6X_VERSION)
487 #  define HEDLEY_TI_CL6X_VERSION_CHECK(major,minor,patch) (HEDLEY_TI_CL6X_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch))
488 #else
489 #  define HEDLEY_TI_CL6X_VERSION_CHECK(major,minor,patch) (0)
490 #endif
491 
492 #if defined(HEDLEY_TI_CL7X_VERSION)
493 #  undef HEDLEY_TI_CL7X_VERSION
494 #endif
495 #if defined(__TI_COMPILER_VERSION__) && defined(__C7000__)
496 #  define HEDLEY_TI_CL7X_VERSION HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000))
497 #endif
498 
499 #if defined(HEDLEY_TI_CL7X_VERSION_CHECK)
500 #  undef HEDLEY_TI_CL7X_VERSION_CHECK
501 #endif
502 #if defined(HEDLEY_TI_CL7X_VERSION)
503 #  define HEDLEY_TI_CL7X_VERSION_CHECK(major,minor,patch) (HEDLEY_TI_CL7X_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch))
504 #else
505 #  define HEDLEY_TI_CL7X_VERSION_CHECK(major,minor,patch) (0)
506 #endif
507 
508 #if defined(HEDLEY_TI_CLPRU_VERSION)
509 #  undef HEDLEY_TI_CLPRU_VERSION
510 #endif
511 #if defined(__TI_COMPILER_VERSION__) && defined(__PRU__)
512 #  define HEDLEY_TI_CLPRU_VERSION HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000))
513 #endif
514 
515 #if defined(HEDLEY_TI_CLPRU_VERSION_CHECK)
516 #  undef HEDLEY_TI_CLPRU_VERSION_CHECK
517 #endif
518 #if defined(HEDLEY_TI_CLPRU_VERSION)
519 #  define HEDLEY_TI_CLPRU_VERSION_CHECK(major,minor,patch) (HEDLEY_TI_CLPRU_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch))
520 #else
521 #  define HEDLEY_TI_CLPRU_VERSION_CHECK(major,minor,patch) (0)
522 #endif
523 
524 #if defined(HEDLEY_CRAY_VERSION)
525 #  undef HEDLEY_CRAY_VERSION
526 #endif
527 #if defined(_CRAYC)
528 #  if defined(_RELEASE_PATCHLEVEL)
529 #    define HEDLEY_CRAY_VERSION HEDLEY_VERSION_ENCODE(_RELEASE_MAJOR, _RELEASE_MINOR, _RELEASE_PATCHLEVEL)
530 #  else
531 #    define HEDLEY_CRAY_VERSION HEDLEY_VERSION_ENCODE(_RELEASE_MAJOR, _RELEASE_MINOR, 0)
532 #  endif
533 #endif
534 
535 #if defined(HEDLEY_CRAY_VERSION_CHECK)
536 #  undef HEDLEY_CRAY_VERSION_CHECK
537 #endif
538 #if defined(HEDLEY_CRAY_VERSION)
539 #  define HEDLEY_CRAY_VERSION_CHECK(major,minor,patch) (HEDLEY_CRAY_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch))
540 #else
541 #  define HEDLEY_CRAY_VERSION_CHECK(major,minor,patch) (0)
542 #endif
543 
544 #if defined(HEDLEY_IAR_VERSION)
545 #  undef HEDLEY_IAR_VERSION
546 #endif
547 #if defined(__IAR_SYSTEMS_ICC__)
548 #  if __VER__ > 1000
549 #    define HEDLEY_IAR_VERSION HEDLEY_VERSION_ENCODE((__VER__ / 1000000), ((__VER__ / 1000) % 1000), (__VER__ % 1000))
550 #  else
551 #    define HEDLEY_IAR_VERSION HEDLEY_VERSION_ENCODE(__VER__ / 100, __VER__ % 100, 0)
552 #  endif
553 #endif
554 
555 #if defined(HEDLEY_IAR_VERSION_CHECK)
556 #  undef HEDLEY_IAR_VERSION_CHECK
557 #endif
558 #if defined(HEDLEY_IAR_VERSION)
559 #  define HEDLEY_IAR_VERSION_CHECK(major,minor,patch) (HEDLEY_IAR_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch))
560 #else
561 #  define HEDLEY_IAR_VERSION_CHECK(major,minor,patch) (0)
562 #endif
563 
564 #if defined(HEDLEY_TINYC_VERSION)
565 #  undef HEDLEY_TINYC_VERSION
566 #endif
567 #if defined(__TINYC__)
568 #  define HEDLEY_TINYC_VERSION HEDLEY_VERSION_ENCODE(__TINYC__ / 1000, (__TINYC__ / 100) % 10, __TINYC__ % 100)
569 #endif
570 
571 #if defined(HEDLEY_TINYC_VERSION_CHECK)
572 #  undef HEDLEY_TINYC_VERSION_CHECK
573 #endif
574 #if defined(HEDLEY_TINYC_VERSION)
575 #  define HEDLEY_TINYC_VERSION_CHECK(major,minor,patch) (HEDLEY_TINYC_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch))
576 #else
577 #  define HEDLEY_TINYC_VERSION_CHECK(major,minor,patch) (0)
578 #endif
579 
580 #if defined(HEDLEY_DMC_VERSION)
581 #  undef HEDLEY_DMC_VERSION
582 #endif
583 #if defined(__DMC__)
584 #  define HEDLEY_DMC_VERSION HEDLEY_VERSION_ENCODE(__DMC__ >> 8, (__DMC__ >> 4) & 0xf, __DMC__ & 0xf)
585 #endif
586 
587 #if defined(HEDLEY_DMC_VERSION_CHECK)
588 #  undef HEDLEY_DMC_VERSION_CHECK
589 #endif
590 #if defined(HEDLEY_DMC_VERSION)
591 #  define HEDLEY_DMC_VERSION_CHECK(major,minor,patch) (HEDLEY_DMC_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch))
592 #else
593 #  define HEDLEY_DMC_VERSION_CHECK(major,minor,patch) (0)
594 #endif
595 
596 #if defined(HEDLEY_COMPCERT_VERSION)
597 #  undef HEDLEY_COMPCERT_VERSION
598 #endif
599 #if defined(__COMPCERT_VERSION__)
600 #  define HEDLEY_COMPCERT_VERSION HEDLEY_VERSION_ENCODE(__COMPCERT_VERSION__ / 10000, (__COMPCERT_VERSION__ / 100) % 100, __COMPCERT_VERSION__ % 100)
601 #endif
602 
603 #if defined(HEDLEY_COMPCERT_VERSION_CHECK)
604 #  undef HEDLEY_COMPCERT_VERSION_CHECK
605 #endif
606 #if defined(HEDLEY_COMPCERT_VERSION)
607 #  define HEDLEY_COMPCERT_VERSION_CHECK(major,minor,patch) (HEDLEY_COMPCERT_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch))
608 #else
609 #  define HEDLEY_COMPCERT_VERSION_CHECK(major,minor,patch) (0)
610 #endif
611 
612 #if defined(HEDLEY_PELLES_VERSION)
613 #  undef HEDLEY_PELLES_VERSION
614 #endif
615 #if defined(__POCC__)
616 #  define HEDLEY_PELLES_VERSION HEDLEY_VERSION_ENCODE(__POCC__ / 100, __POCC__ % 100, 0)
617 #endif
618 
619 #if defined(HEDLEY_PELLES_VERSION_CHECK)
620 #  undef HEDLEY_PELLES_VERSION_CHECK
621 #endif
622 #if defined(HEDLEY_PELLES_VERSION)
623 #  define HEDLEY_PELLES_VERSION_CHECK(major,minor,patch) (HEDLEY_PELLES_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch))
624 #else
625 #  define HEDLEY_PELLES_VERSION_CHECK(major,minor,patch) (0)
626 #endif
627 
628 #if defined(HEDLEY_MCST_LCC_VERSION)
629 #  undef HEDLEY_MCST_LCC_VERSION
630 #endif
631 #if defined(__LCC__) && defined(__LCC_MINOR__)
632 #  define HEDLEY_MCST_LCC_VERSION HEDLEY_VERSION_ENCODE(__LCC__ / 100, __LCC__ % 100, __LCC_MINOR__)
633 #endif
634 
635 #if defined(HEDLEY_MCST_LCC_VERSION_CHECK)
636 #  undef HEDLEY_MCST_LCC_VERSION_CHECK
637 #endif
638 #if defined(HEDLEY_MCST_LCC_VERSION)
639 #  define HEDLEY_MCST_LCC_VERSION_CHECK(major,minor,patch) (HEDLEY_MCST_LCC_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch))
640 #else
641 #  define HEDLEY_MCST_LCC_VERSION_CHECK(major,minor,patch) (0)
642 #endif
643 
644 #if defined(HEDLEY_GCC_VERSION)
645 #  undef HEDLEY_GCC_VERSION
646 #endif
647 #if \
648   defined(HEDLEY_GNUC_VERSION) && \
649   !defined(__clang__) && \
650   !defined(HEDLEY_INTEL_VERSION) && \
651   !defined(HEDLEY_PGI_VERSION) && \
652   !defined(HEDLEY_ARM_VERSION) && \
653   !defined(HEDLEY_CRAY_VERSION) && \
654   !defined(HEDLEY_TI_VERSION) && \
655   !defined(HEDLEY_TI_ARMCL_VERSION) && \
656   !defined(HEDLEY_TI_CL430_VERSION) && \
657   !defined(HEDLEY_TI_CL2000_VERSION) && \
658   !defined(HEDLEY_TI_CL6X_VERSION) && \
659   !defined(HEDLEY_TI_CL7X_VERSION) && \
660   !defined(HEDLEY_TI_CLPRU_VERSION) && \
661   !defined(__COMPCERT__) && \
662   !defined(HEDLEY_MCST_LCC_VERSION)
663 #  define HEDLEY_GCC_VERSION HEDLEY_GNUC_VERSION
664 #endif
665 
666 #if defined(HEDLEY_GCC_VERSION_CHECK)
667 #  undef HEDLEY_GCC_VERSION_CHECK
668 #endif
669 #if defined(HEDLEY_GCC_VERSION)
670 #  define HEDLEY_GCC_VERSION_CHECK(major,minor,patch) (HEDLEY_GCC_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch))
671 #else
672 #  define HEDLEY_GCC_VERSION_CHECK(major,minor,patch) (0)
673 #endif
674 
675 #if defined(HEDLEY_HAS_ATTRIBUTE)
676 #  undef HEDLEY_HAS_ATTRIBUTE
677 #endif
678 #if \
679   defined(__has_attribute) && \
680   ( \
681     (!defined(HEDLEY_IAR_VERSION) || HEDLEY_IAR_VERSION_CHECK(8,5,9)) \
682   )
683 #  define HEDLEY_HAS_ATTRIBUTE(attribute) __has_attribute(attribute)
684 #else
685 #  define HEDLEY_HAS_ATTRIBUTE(attribute) (0)
686 #endif
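
/* A typical use of these feature-test wrappers is a guard like the
 * following sketch; the attribute name and GCC version are illustrative:
 *
 *   #if HEDLEY_HAS_ATTRIBUTE(always_inline) || HEDLEY_GCC_VERSION_CHECK(4,0,0)
 *     ...it is safe to use __attribute__((__always_inline__)) here...
 *   #endif
 */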
687 
688 #if defined(HEDLEY_GNUC_HAS_ATTRIBUTE)
689 #  undef HEDLEY_GNUC_HAS_ATTRIBUTE
690 #endif
691 #if defined(__has_attribute)
692 #  define HEDLEY_GNUC_HAS_ATTRIBUTE(attribute,major,minor,patch) HEDLEY_HAS_ATTRIBUTE(attribute)
693 #else
694 #  define HEDLEY_GNUC_HAS_ATTRIBUTE(attribute,major,minor,patch) HEDLEY_GNUC_VERSION_CHECK(major,minor,patch)
695 #endif
696 
697 #if defined(HEDLEY_GCC_HAS_ATTRIBUTE)
698 #  undef HEDLEY_GCC_HAS_ATTRIBUTE
699 #endif
700 #if defined(__has_attribute)
701 #  define HEDLEY_GCC_HAS_ATTRIBUTE(attribute,major,minor,patch) HEDLEY_HAS_ATTRIBUTE(attribute)
702 #else
703 #  define HEDLEY_GCC_HAS_ATTRIBUTE(attribute,major,minor,patch) HEDLEY_GCC_VERSION_CHECK(major,minor,patch)
704 #endif
705 
706 #if defined(HEDLEY_HAS_CPP_ATTRIBUTE)
707 #  undef HEDLEY_HAS_CPP_ATTRIBUTE
708 #endif
709 #if \
710   defined(__has_cpp_attribute) && \
711   defined(__cplusplus) && \
712   (!defined(HEDLEY_SUNPRO_VERSION) || HEDLEY_SUNPRO_VERSION_CHECK(5,15,0))
713 #  define HEDLEY_HAS_CPP_ATTRIBUTE(attribute) __has_cpp_attribute(attribute)
714 #else
715 #  define HEDLEY_HAS_CPP_ATTRIBUTE(attribute) (0)
716 #endif
717 
718 #if defined(HEDLEY_HAS_CPP_ATTRIBUTE_NS)
719 #  undef HEDLEY_HAS_CPP_ATTRIBUTE_NS
720 #endif
721 #if !defined(__cplusplus) || !defined(__has_cpp_attribute)
722 #  define HEDLEY_HAS_CPP_ATTRIBUTE_NS(ns,attribute) (0)
723 #elif \
724   !defined(HEDLEY_PGI_VERSION) && \
725   !defined(HEDLEY_IAR_VERSION) && \
726   (!defined(HEDLEY_SUNPRO_VERSION) || HEDLEY_SUNPRO_VERSION_CHECK(5,15,0)) && \
727   (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0))
728 #  define HEDLEY_HAS_CPP_ATTRIBUTE_NS(ns,attribute) HEDLEY_HAS_CPP_ATTRIBUTE(ns::attribute)
729 #else
730 #  define HEDLEY_HAS_CPP_ATTRIBUTE_NS(ns,attribute) (0)
731 #endif
732 
733 #if defined(HEDLEY_GNUC_HAS_CPP_ATTRIBUTE)
734 #  undef HEDLEY_GNUC_HAS_CPP_ATTRIBUTE
735 #endif
736 #if defined(__has_cpp_attribute) && defined(__cplusplus)
737 #  define HEDLEY_GNUC_HAS_CPP_ATTRIBUTE(attribute,major,minor,patch) __has_cpp_attribute(attribute)
738 #else
739 #  define HEDLEY_GNUC_HAS_CPP_ATTRIBUTE(attribute,major,minor,patch) HEDLEY_GNUC_VERSION_CHECK(major,minor,patch)
740 #endif
741 
742 #if defined(HEDLEY_GCC_HAS_CPP_ATTRIBUTE)
743 #  undef HEDLEY_GCC_HAS_CPP_ATTRIBUTE
744 #endif
745 #if defined(__has_cpp_attribute) && defined(__cplusplus)
746 #  define HEDLEY_GCC_HAS_CPP_ATTRIBUTE(attribute,major,minor,patch) __has_cpp_attribute(attribute)
747 #else
748 #  define HEDLEY_GCC_HAS_CPP_ATTRIBUTE(attribute,major,minor,patch) HEDLEY_GCC_VERSION_CHECK(major,minor,patch)
749 #endif
750 
751 #if defined(HEDLEY_HAS_BUILTIN)
752 #  undef HEDLEY_HAS_BUILTIN
753 #endif
754 #if defined(__has_builtin)
755 #  define HEDLEY_HAS_BUILTIN(builtin) __has_builtin(builtin)
756 #else
757 #  define HEDLEY_HAS_BUILTIN(builtin) (0)
758 #endif
759 
760 #if defined(HEDLEY_GNUC_HAS_BUILTIN)
761 #  undef HEDLEY_GNUC_HAS_BUILTIN
762 #endif
763 #if defined(__has_builtin)
764 #  define HEDLEY_GNUC_HAS_BUILTIN(builtin,major,minor,patch) __has_builtin(builtin)
765 #else
766 #  define HEDLEY_GNUC_HAS_BUILTIN(builtin,major,minor,patch) HEDLEY_GNUC_VERSION_CHECK(major,minor,patch)
767 #endif
768 
769 #if defined(HEDLEY_GCC_HAS_BUILTIN)
770 #  undef HEDLEY_GCC_HAS_BUILTIN
771 #endif
772 #if defined(__has_builtin)
773 #  define HEDLEY_GCC_HAS_BUILTIN(builtin,major,minor,patch) __has_builtin(builtin)
774 #else
775 #  define HEDLEY_GCC_HAS_BUILTIN(builtin,major,minor,patch) HEDLEY_GCC_VERSION_CHECK(major,minor,patch)
776 #endif
777 
778 #if defined(HEDLEY_HAS_FEATURE)
779 #  undef HEDLEY_HAS_FEATURE
780 #endif
781 #if defined(__has_feature)
782 #  define HEDLEY_HAS_FEATURE(feature) __has_feature(feature)
783 #else
784 #  define HEDLEY_HAS_FEATURE(feature) (0)
785 #endif
786 
787 #if defined(HEDLEY_GNUC_HAS_FEATURE)
788 #  undef HEDLEY_GNUC_HAS_FEATURE
789 #endif
790 #if defined(__has_feature)
791 #  define HEDLEY_GNUC_HAS_FEATURE(feature,major,minor,patch) __has_feature(feature)
792 #else
793 #  define HEDLEY_GNUC_HAS_FEATURE(feature,major,minor,patch) HEDLEY_GNUC_VERSION_CHECK(major,minor,patch)
794 #endif
795 
796 #if defined(HEDLEY_GCC_HAS_FEATURE)
797 #  undef HEDLEY_GCC_HAS_FEATURE
798 #endif
799 #if defined(__has_feature)
800 #  define HEDLEY_GCC_HAS_FEATURE(feature,major,minor,patch) __has_feature(feature)
801 #else
802 #  define HEDLEY_GCC_HAS_FEATURE(feature,major,minor,patch) HEDLEY_GCC_VERSION_CHECK(major,minor,patch)
803 #endif
804 
805 #if defined(HEDLEY_HAS_EXTENSION)
806 #  undef HEDLEY_HAS_EXTENSION
807 #endif
808 #if defined(__has_extension)
809 #  define HEDLEY_HAS_EXTENSION(extension) __has_extension(extension)
810 #else
811 #  define HEDLEY_HAS_EXTENSION(extension) (0)
812 #endif
813 
814 #if defined(HEDLEY_GNUC_HAS_EXTENSION)
815 #  undef HEDLEY_GNUC_HAS_EXTENSION
816 #endif
817 #if defined(__has_extension)
818 #  define HEDLEY_GNUC_HAS_EXTENSION(extension,major,minor,patch) __has_extension(extension)
819 #else
820 #  define HEDLEY_GNUC_HAS_EXTENSION(extension,major,minor,patch) HEDLEY_GNUC_VERSION_CHECK(major,minor,patch)
821 #endif
822 
823 #if defined(HEDLEY_GCC_HAS_EXTENSION)
824 #  undef HEDLEY_GCC_HAS_EXTENSION
825 #endif
826 #if defined(__has_extension)
827 #  define HEDLEY_GCC_HAS_EXTENSION(extension,major,minor,patch) __has_extension(extension)
828 #else
829 #  define HEDLEY_GCC_HAS_EXTENSION(extension,major,minor,patch) HEDLEY_GCC_VERSION_CHECK(major,minor,patch)
830 #endif
831 
832 #if defined(HEDLEY_HAS_DECLSPEC_ATTRIBUTE)
833 #  undef HEDLEY_HAS_DECLSPEC_ATTRIBUTE
834 #endif
835 #if defined(__has_declspec_attribute)
836 #  define HEDLEY_HAS_DECLSPEC_ATTRIBUTE(attribute) __has_declspec_attribute(attribute)
837 #else
838 #  define HEDLEY_HAS_DECLSPEC_ATTRIBUTE(attribute) (0)
839 #endif
840 
841 #if defined(HEDLEY_GNUC_HAS_DECLSPEC_ATTRIBUTE)
842 #  undef HEDLEY_GNUC_HAS_DECLSPEC_ATTRIBUTE
843 #endif
844 #if defined(__has_declspec_attribute)
845 #  define HEDLEY_GNUC_HAS_DECLSPEC_ATTRIBUTE(attribute,major,minor,patch) __has_declspec_attribute(attribute)
846 #else
847 #  define HEDLEY_GNUC_HAS_DECLSPEC_ATTRIBUTE(attribute,major,minor,patch) HEDLEY_GNUC_VERSION_CHECK(major,minor,patch)
848 #endif
849 
850 #if defined(HEDLEY_GCC_HAS_DECLSPEC_ATTRIBUTE)
851 #  undef HEDLEY_GCC_HAS_DECLSPEC_ATTRIBUTE
852 #endif
853 #if defined(__has_declspec_attribute)
854 #  define HEDLEY_GCC_HAS_DECLSPEC_ATTRIBUTE(attribute,major,minor,patch) __has_declspec_attribute(attribute)
855 #else
856 #  define HEDLEY_GCC_HAS_DECLSPEC_ATTRIBUTE(attribute,major,minor,patch) HEDLEY_GCC_VERSION_CHECK(major,minor,patch)
857 #endif
858 
859 #if defined(HEDLEY_HAS_WARNING)
860 #  undef HEDLEY_HAS_WARNING
861 #endif
862 #if defined(__has_warning)
863 #  define HEDLEY_HAS_WARNING(warning) __has_warning(warning)
864 #else
865 #  define HEDLEY_HAS_WARNING(warning) (0)
866 #endif
867 
868 #if defined(HEDLEY_GNUC_HAS_WARNING)
869 #  undef HEDLEY_GNUC_HAS_WARNING
870 #endif
871 #if defined(__has_warning)
872 #  define HEDLEY_GNUC_HAS_WARNING(warning,major,minor,patch) __has_warning(warning)
873 #else
874 #  define HEDLEY_GNUC_HAS_WARNING(warning,major,minor,patch) HEDLEY_GNUC_VERSION_CHECK(major,minor,patch)
875 #endif
876 
877 #if defined(HEDLEY_GCC_HAS_WARNING)
878 #  undef HEDLEY_GCC_HAS_WARNING
879 #endif
880 #if defined(__has_warning)
881 #  define HEDLEY_GCC_HAS_WARNING(warning,major,minor,patch) __has_warning(warning)
882 #else
883 #  define HEDLEY_GCC_HAS_WARNING(warning,major,minor,patch) HEDLEY_GCC_VERSION_CHECK(major,minor,patch)
884 #endif
885 
886 #if \
887   (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)) || \
888   defined(__clang__) || \
889   HEDLEY_GCC_VERSION_CHECK(3,0,0) || \
890   HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
891   HEDLEY_IAR_VERSION_CHECK(8,0,0) || \
892   HEDLEY_PGI_VERSION_CHECK(18,4,0) || \
893   HEDLEY_ARM_VERSION_CHECK(4,1,0) || \
894   HEDLEY_TI_VERSION_CHECK(15,12,0) || \
895   HEDLEY_TI_ARMCL_VERSION_CHECK(4,7,0) || \
896   HEDLEY_TI_CL430_VERSION_CHECK(2,0,1) || \
897   HEDLEY_TI_CL2000_VERSION_CHECK(6,1,0) || \
898   HEDLEY_TI_CL6X_VERSION_CHECK(7,0,0) || \
899   HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
900   HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \
901   HEDLEY_CRAY_VERSION_CHECK(5,0,0) || \
902   HEDLEY_TINYC_VERSION_CHECK(0,9,17) || \
903   HEDLEY_SUNPRO_VERSION_CHECK(8,0,0) || \
904   (HEDLEY_IBM_VERSION_CHECK(10,1,0) && defined(__C99_PRAGMA_OPERATOR))
905 #  define HEDLEY_PRAGMA(value) _Pragma(#value)
906 #elif HEDLEY_MSVC_VERSION_CHECK(15,0,0)
907 #  define HEDLEY_PRAGMA(value) __pragma(value)
908 #else
909 #  define HEDLEY_PRAGMA(value)
910 #endif
911 
912 #if defined(HEDLEY_DIAGNOSTIC_PUSH)
913 #  undef HEDLEY_DIAGNOSTIC_PUSH
914 #endif
915 #if defined(HEDLEY_DIAGNOSTIC_POP)
916 #  undef HEDLEY_DIAGNOSTIC_POP
917 #endif
918 #if defined(__clang__)
919 #  define HEDLEY_DIAGNOSTIC_PUSH _Pragma("clang diagnostic push")
920 #  define HEDLEY_DIAGNOSTIC_POP _Pragma("clang diagnostic pop")
921 #elif HEDLEY_INTEL_VERSION_CHECK(13,0,0)
922 #  define HEDLEY_DIAGNOSTIC_PUSH _Pragma("warning(push)")
923 #  define HEDLEY_DIAGNOSTIC_POP _Pragma("warning(pop)")
924 #elif HEDLEY_GCC_VERSION_CHECK(4,6,0)
925 #  define HEDLEY_DIAGNOSTIC_PUSH _Pragma("GCC diagnostic push")
926 #  define HEDLEY_DIAGNOSTIC_POP _Pragma("GCC diagnostic pop")
927 #elif \
928   HEDLEY_MSVC_VERSION_CHECK(15,0,0) || \
929   HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0)
930 #  define HEDLEY_DIAGNOSTIC_PUSH __pragma(warning(push))
931 #  define HEDLEY_DIAGNOSTIC_POP __pragma(warning(pop))
932 #elif HEDLEY_ARM_VERSION_CHECK(5,6,0)
933 #  define HEDLEY_DIAGNOSTIC_PUSH _Pragma("push")
934 #  define HEDLEY_DIAGNOSTIC_POP _Pragma("pop")
935 #elif \
936     HEDLEY_TI_VERSION_CHECK(15,12,0) || \
937     HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \
938     HEDLEY_TI_CL430_VERSION_CHECK(4,4,0) || \
939     HEDLEY_TI_CL6X_VERSION_CHECK(8,1,0) || \
940     HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
941     HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0)
942 #  define HEDLEY_DIAGNOSTIC_PUSH _Pragma("diag_push")
943 #  define HEDLEY_DIAGNOSTIC_POP _Pragma("diag_pop")
944 #elif HEDLEY_PELLES_VERSION_CHECK(2,90,0)
945 #  define HEDLEY_DIAGNOSTIC_PUSH _Pragma("warning(push)")
946 #  define HEDLEY_DIAGNOSTIC_POP _Pragma("warning(pop)")
947 #else
948 #  define HEDLEY_DIAGNOSTIC_PUSH
949 #  define HEDLEY_DIAGNOSTIC_POP
950 #endif
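
/* PUSH/POP are meant to be used as a matched pair around a region that
 * needs a specific warning silenced, e.g. (sketch only;
 * call_some_deprecated_function is a placeholder, and the DISABLE_* macros
 * are defined further down in this header):
 *
 *   HEDLEY_DIAGNOSTIC_PUSH
 *   HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED
 *   call_some_deprecated_function();
 *   HEDLEY_DIAGNOSTIC_POP
 */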
951 
952 /* HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_ is for
953    HEDLEY INTERNAL USE ONLY.  API subject to change without notice. */
954 #if defined(HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_)
955 #  undef HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_
956 #endif
957 #if defined(__cplusplus)
958 #  if HEDLEY_HAS_WARNING("-Wc++98-compat")
959 #    if HEDLEY_HAS_WARNING("-Wc++17-extensions")
960 #      if HEDLEY_HAS_WARNING("-Wc++1z-extensions")
961 #        define HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(xpr) \
962            HEDLEY_DIAGNOSTIC_PUSH \
963            _Pragma("clang diagnostic ignored \"-Wc++98-compat\"") \
964            _Pragma("clang diagnostic ignored \"-Wc++17-extensions\"") \
965            _Pragma("clang diagnostic ignored \"-Wc++1z-extensions\"") \
966            xpr \
967            HEDLEY_DIAGNOSTIC_POP
968 #      else
969 #        define HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(xpr) \
970            HEDLEY_DIAGNOSTIC_PUSH \
971            _Pragma("clang diagnostic ignored \"-Wc++98-compat\"") \
972            _Pragma("clang diagnostic ignored \"-Wc++17-extensions\"") \
973            xpr \
974            HEDLEY_DIAGNOSTIC_POP
975 #      endif
976 #    else
977 #      define HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(xpr) \
978          HEDLEY_DIAGNOSTIC_PUSH \
979          _Pragma("clang diagnostic ignored \"-Wc++98-compat\"") \
980          xpr \
981          HEDLEY_DIAGNOSTIC_POP
982 #    endif
983 #  endif
984 #endif
985 #if !defined(HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_)
986 #  define HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(x) x
987 #endif
988 
989 #if defined(HEDLEY_CONST_CAST)
990 #  undef HEDLEY_CONST_CAST
991 #endif
992 #if defined(__cplusplus)
993 #  define HEDLEY_CONST_CAST(T, expr) (const_cast<T>(expr))
994 #elif \
995   HEDLEY_HAS_WARNING("-Wcast-qual") || \
996   HEDLEY_GCC_VERSION_CHECK(4,6,0) || \
997   HEDLEY_INTEL_VERSION_CHECK(13,0,0)
998 #  define HEDLEY_CONST_CAST(T, expr) (__extension__ ({ \
999       HEDLEY_DIAGNOSTIC_PUSH \
1000       HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL \
1001       ((T) (expr)); \
1002       HEDLEY_DIAGNOSTIC_POP \
1003     }))
1004 #else
1005 #  define HEDLEY_CONST_CAST(T, expr) ((T) (expr))
1006 #endif
1007 
1008 #if defined(HEDLEY_REINTERPRET_CAST)
1009 #  undef HEDLEY_REINTERPRET_CAST
1010 #endif
1011 #if defined(__cplusplus)
1012 #  define HEDLEY_REINTERPRET_CAST(T, expr) (reinterpret_cast<T>(expr))
1013 #else
1014 #  define HEDLEY_REINTERPRET_CAST(T, expr) ((T) (expr))
1015 #endif
1016 
1017 #if defined(HEDLEY_STATIC_CAST)
1018 #  undef HEDLEY_STATIC_CAST
1019 #endif
1020 #if defined(__cplusplus)
1021 #  define HEDLEY_STATIC_CAST(T, expr) (static_cast<T>(expr))
1022 #else
1023 #  define HEDLEY_STATIC_CAST(T, expr) ((T) (expr))
1024 #endif
1025 
1026 #if defined(HEDLEY_CPP_CAST)
1027 #  undef HEDLEY_CPP_CAST
1028 #endif
1029 #if defined(__cplusplus)
1030 #  if HEDLEY_HAS_WARNING("-Wold-style-cast")
1031 #    define HEDLEY_CPP_CAST(T, expr) \
1032        HEDLEY_DIAGNOSTIC_PUSH \
1033        _Pragma("clang diagnostic ignored \"-Wold-style-cast\"") \
1034        ((T) (expr)) \
1035        HEDLEY_DIAGNOSTIC_POP
1036 #  elif HEDLEY_IAR_VERSION_CHECK(8,3,0)
1037 #    define HEDLEY_CPP_CAST(T, expr) \
1038        HEDLEY_DIAGNOSTIC_PUSH \
1039        _Pragma("diag_suppress=Pe137") \
1040        HEDLEY_DIAGNOSTIC_POP
1041 #  else
1042 #    define HEDLEY_CPP_CAST(T, expr) ((T) (expr))
1043 #  endif
1044 #else
1045 #  define HEDLEY_CPP_CAST(T, expr) (expr)
1046 #endif
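
/* These cast wrappers select the C++ cast when compiling as C++ and fall
 * back to a plain C-style cast otherwise.  For example,
 * HEDLEY_STATIC_CAST(int, x) becomes (static_cast<int>(x)) under a C++
 * compiler and ((int) (x)) under a C compiler; `x` is just a placeholder
 * expression. */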
1047 
1048 #if defined(HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED)
1049 #  undef HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED
1050 #endif
1051 #if HEDLEY_HAS_WARNING("-Wdeprecated-declarations")
1052 #  define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("clang diagnostic ignored \"-Wdeprecated-declarations\"")
1053 #elif HEDLEY_INTEL_VERSION_CHECK(13,0,0)
1054 #  define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("warning(disable:1478 1786)")
1055 #elif HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0)
1056 #  define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED __pragma(warning(disable:1478 1786))
1057 #elif HEDLEY_PGI_VERSION_CHECK(20,7,0)
1058 #  define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("diag_suppress 1215,1216,1444,1445")
1059 #elif HEDLEY_PGI_VERSION_CHECK(17,10,0)
1060 #  define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("diag_suppress 1215,1444")
1061 #elif HEDLEY_GCC_VERSION_CHECK(4,3,0)
1062 #  define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"")
1063 #elif HEDLEY_MSVC_VERSION_CHECK(15,0,0)
1064 #  define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED __pragma(warning(disable:4996))
1065 #elif HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
1066 #  define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("diag_suppress 1215,1444")
1067 #elif \
1068     HEDLEY_TI_VERSION_CHECK(15,12,0) || \
1069     (HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
1070     HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \
1071     (HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
1072     HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \
1073     (HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
1074     HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \
1075     (HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
1076     HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \
1077     HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
1078     HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0)
1079 #  define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("diag_suppress 1291,1718")
1080 #elif HEDLEY_SUNPRO_VERSION_CHECK(5,13,0) && !defined(__cplusplus)
1081 #  define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("error_messages(off,E_DEPRECATED_ATT,E_DEPRECATED_ATT_MESS)")
1082 #elif HEDLEY_SUNPRO_VERSION_CHECK(5,13,0) && defined(__cplusplus)
1083 #  define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("error_messages(off,symdeprecated,symdeprecated2)")
1084 #elif HEDLEY_IAR_VERSION_CHECK(8,0,0)
1085 #  define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("diag_suppress=Pe1444,Pe1215")
1086 #elif HEDLEY_PELLES_VERSION_CHECK(2,90,0)
1087 #  define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("warn(disable:2241)")
1088 #else
1089 #  define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED
1090 #endif
1091 
1092 #if defined(HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS)
1093 #  undef HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS
1094 #endif
1095 #if HEDLEY_HAS_WARNING("-Wunknown-pragmas")
1096 #  define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("clang diagnostic ignored \"-Wunknown-pragmas\"")
1097 #elif HEDLEY_INTEL_VERSION_CHECK(13,0,0)
1098 #  define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("warning(disable:161)")
1099 #elif HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0)
1100 #  define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS __pragma(warning(disable:161))
1101 #elif HEDLEY_PGI_VERSION_CHECK(17,10,0)
1102 #  define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("diag_suppress 1675")
1103 #elif HEDLEY_GCC_VERSION_CHECK(4,3,0)
1104 #  define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("GCC diagnostic ignored \"-Wunknown-pragmas\"")
1105 #elif HEDLEY_MSVC_VERSION_CHECK(15,0,0)
1106 #  define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS __pragma(warning(disable:4068))
1107 #elif \
1108     HEDLEY_TI_VERSION_CHECK(16,9,0) || \
1109     HEDLEY_TI_CL6X_VERSION_CHECK(8,0,0) || \
1110     HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
1111     HEDLEY_TI_CLPRU_VERSION_CHECK(2,3,0)
1112 #  define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("diag_suppress 163")
1113 #elif HEDLEY_TI_CL6X_VERSION_CHECK(8,0,0)
1114 #  define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("diag_suppress 163")
1115 #elif HEDLEY_IAR_VERSION_CHECK(8,0,0)
1116 #  define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("diag_suppress=Pe161")
1117 #elif HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
1118 #  define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("diag_suppress 161")
1119 #else
1120 #  define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS
1121 #endif
1122 
1123 #if defined(HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES)
1124 #  undef HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES
1125 #endif
1126 #if HEDLEY_HAS_WARNING("-Wunknown-attributes")
1127 #  define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("clang diagnostic ignored \"-Wunknown-attributes\"")
1128 #elif HEDLEY_GCC_VERSION_CHECK(4,6,0)
1129 #  define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"")
1130 #elif HEDLEY_INTEL_VERSION_CHECK(17,0,0)
1131 #  define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("warning(disable:1292)")
1132 #elif HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0)
1133 #  define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES __pragma(warning(disable:1292))
1134 #elif HEDLEY_MSVC_VERSION_CHECK(19,0,0)
1135 #  define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES __pragma(warning(disable:5030))
1136 #elif HEDLEY_PGI_VERSION_CHECK(20,7,0)
1137 #  define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("diag_suppress 1097,1098")
1138 #elif HEDLEY_PGI_VERSION_CHECK(17,10,0)
1139 #  define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("diag_suppress 1097")
1140 #elif HEDLEY_SUNPRO_VERSION_CHECK(5,14,0) && defined(__cplusplus)
1141 #  define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("error_messages(off,attrskipunsup)")
1142 #elif \
1143     HEDLEY_TI_VERSION_CHECK(18,1,0) || \
1144     HEDLEY_TI_CL6X_VERSION_CHECK(8,3,0) || \
1145     HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0)
1146 #  define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("diag_suppress 1173")
1147 #elif HEDLEY_IAR_VERSION_CHECK(8,0,0)
1148 #  define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("diag_suppress=Pe1097")
1149 #elif HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
1150 #  define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("diag_suppress 1097")
1151 #else
1152 #  define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES
1153 #endif
1154 
1155 #if defined(HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL)
1156 #  undef HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL
1157 #endif
1158 #if HEDLEY_HAS_WARNING("-Wcast-qual")
1159 #  define HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL _Pragma("clang diagnostic ignored \"-Wcast-qual\"")
1160 #elif HEDLEY_INTEL_VERSION_CHECK(13,0,0)
1161 #  define HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL _Pragma("warning(disable:2203 2331)")
1162 #elif HEDLEY_GCC_VERSION_CHECK(3,0,0)
1163 #  define HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL _Pragma("GCC diagnostic ignored \"-Wcast-qual\"")
1164 #else
1165 #  define HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL
1166 #endif
1167 
1168 #if defined(HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION)
1169 #  undef HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION
1170 #endif
1171 #if HEDLEY_HAS_WARNING("-Wunused-function")
1172 #  define HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION _Pragma("clang diagnostic ignored \"-Wunused-function\"")
1173 #elif HEDLEY_GCC_VERSION_CHECK(3,4,0)
1174 #  define HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION _Pragma("GCC diagnostic ignored \"-Wunused-function\"")
1175 #elif HEDLEY_MSVC_VERSION_CHECK(1,0,0)
1176 #  define HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION __pragma(warning(disable:4505))
1177 #elif HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
1178 #  define HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION _Pragma("diag_suppress 3142")
1179 #else
1180 #  define HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION
1181 #endif
1182 
1183 #if defined(HEDLEY_DEPRECATED)
1184 #  undef HEDLEY_DEPRECATED
1185 #endif
1186 #if defined(HEDLEY_DEPRECATED_FOR)
1187 #  undef HEDLEY_DEPRECATED_FOR
1188 #endif
1189 #if \
1190   HEDLEY_MSVC_VERSION_CHECK(14,0,0) || \
1191   HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0)
1192 #  define HEDLEY_DEPRECATED(since) __declspec(deprecated("Since " # since))
1193 #  define HEDLEY_DEPRECATED_FOR(since, replacement) __declspec(deprecated("Since " #since "; use " #replacement))
1194 #elif \
1195   (HEDLEY_HAS_EXTENSION(attribute_deprecated_with_message) && !defined(HEDLEY_IAR_VERSION)) || \
1196   HEDLEY_GCC_VERSION_CHECK(4,5,0) || \
1197   HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
1198   HEDLEY_ARM_VERSION_CHECK(5,6,0) || \
1199   HEDLEY_SUNPRO_VERSION_CHECK(5,13,0) || \
1200   HEDLEY_PGI_VERSION_CHECK(17,10,0) || \
1201   HEDLEY_TI_VERSION_CHECK(18,1,0) || \
1202   HEDLEY_TI_ARMCL_VERSION_CHECK(18,1,0) || \
1203   HEDLEY_TI_CL6X_VERSION_CHECK(8,3,0) || \
1204   HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
1205   HEDLEY_TI_CLPRU_VERSION_CHECK(2,3,0) || \
1206   HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
1207 #  define HEDLEY_DEPRECATED(since) __attribute__((__deprecated__("Since " #since)))
1208 #  define HEDLEY_DEPRECATED_FOR(since, replacement) __attribute__((__deprecated__("Since " #since "; use " #replacement)))
1209 #elif defined(__cplusplus) && (__cplusplus >= 201402L)
1210 #  define HEDLEY_DEPRECATED(since) HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[deprecated("Since " #since)]])
1211 #  define HEDLEY_DEPRECATED_FOR(since, replacement) HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[deprecated("Since " #since "; use " #replacement)]])
1212 #elif \
1213   HEDLEY_HAS_ATTRIBUTE(deprecated) || \
1214   HEDLEY_GCC_VERSION_CHECK(3,1,0) || \
1215   HEDLEY_ARM_VERSION_CHECK(4,1,0) || \
1216   HEDLEY_TI_VERSION_CHECK(15,12,0) || \
1217   (HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
1218   HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \
1219   (HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
1220   HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \
1221   (HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
1222   HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \
1223   (HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
1224   HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \
1225   HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
1226   HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \
1227   HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) || \
1228   HEDLEY_IAR_VERSION_CHECK(8,10,0)
1229 #  define HEDLEY_DEPRECATED(since) __attribute__((__deprecated__))
1230 #  define HEDLEY_DEPRECATED_FOR(since, replacement) __attribute__((__deprecated__))
1231 #elif \
1232   HEDLEY_MSVC_VERSION_CHECK(13,10,0) || \
1233   HEDLEY_PELLES_VERSION_CHECK(6,50,0) || \
1234   HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0)
1235 #  define HEDLEY_DEPRECATED(since) __declspec(deprecated)
1236 #  define HEDLEY_DEPRECATED_FOR(since, replacement) __declspec(deprecated)
1237 #elif HEDLEY_IAR_VERSION_CHECK(8,0,0)
1238 #  define HEDLEY_DEPRECATED(since) _Pragma("deprecated")
1239 #  define HEDLEY_DEPRECATED_FOR(since, replacement) _Pragma("deprecated")
1240 #else
1241 #  define HEDLEY_DEPRECATED(since)
1242 #  define HEDLEY_DEPRECATED_FOR(since, replacement)
1243 #endif
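
/* Usage sketch (the version number and function names are hypothetical):
 *
 *   HEDLEY_DEPRECATED_FOR(1.2, new_fn)
 *   void old_fn(void);
 *
 * Depending on which branch above was selected, this attaches
 * __declspec(deprecated(...)), __attribute__((__deprecated__(...))),
 * [[deprecated(...)]], or nothing at all. */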
1244 
1245 #if defined(HEDLEY_UNAVAILABLE)
1246 #  undef HEDLEY_UNAVAILABLE
1247 #endif
1248 #if \
1249   HEDLEY_HAS_ATTRIBUTE(warning) || \
1250   HEDLEY_GCC_VERSION_CHECK(4,3,0) || \
1251   HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
1252   HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
1253 #  define HEDLEY_UNAVAILABLE(available_since) __attribute__((__warning__("Not available until " #available_since)))
1254 #else
1255 #  define HEDLEY_UNAVAILABLE(available_since)
1256 #endif
1257 
1258 #if defined(HEDLEY_WARN_UNUSED_RESULT)
1259 #  undef HEDLEY_WARN_UNUSED_RESULT
1260 #endif
1261 #if defined(HEDLEY_WARN_UNUSED_RESULT_MSG)
1262 #  undef HEDLEY_WARN_UNUSED_RESULT_MSG
1263 #endif
1264 #if \
1265   HEDLEY_HAS_ATTRIBUTE(warn_unused_result) || \
1266   HEDLEY_GCC_VERSION_CHECK(3,4,0) || \
1267   HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
1268   HEDLEY_TI_VERSION_CHECK(15,12,0) || \
1269   (HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
1270   HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \
1271   (HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
1272   HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \
1273   (HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
1274   HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \
1275   (HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
1276   HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \
1277   HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
1278   HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \
1279   (HEDLEY_SUNPRO_VERSION_CHECK(5,15,0) && defined(__cplusplus)) || \
1280   HEDLEY_PGI_VERSION_CHECK(17,10,0) || \
1281   HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
1282 #  define HEDLEY_WARN_UNUSED_RESULT __attribute__((__warn_unused_result__))
1283 #  define HEDLEY_WARN_UNUSED_RESULT_MSG(msg) __attribute__((__warn_unused_result__))
1284 #elif (HEDLEY_HAS_CPP_ATTRIBUTE(nodiscard) >= 201907L)
1285 #  define HEDLEY_WARN_UNUSED_RESULT HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[nodiscard]])
1286 #  define HEDLEY_WARN_UNUSED_RESULT_MSG(msg) HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[nodiscard(msg)]])
1287 #elif HEDLEY_HAS_CPP_ATTRIBUTE(nodiscard)
1288 #  define HEDLEY_WARN_UNUSED_RESULT HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[nodiscard]])
1289 #  define HEDLEY_WARN_UNUSED_RESULT_MSG(msg) HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[nodiscard]])
1290 #elif defined(_Check_return_) /* SAL */
1291 #  define HEDLEY_WARN_UNUSED_RESULT _Check_return_
1292 #  define HEDLEY_WARN_UNUSED_RESULT_MSG(msg) _Check_return_
1293 #else
1294 #  define HEDLEY_WARN_UNUSED_RESULT
1295 #  define HEDLEY_WARN_UNUSED_RESULT_MSG(msg)
1296 #endif
1297 
1298 #if defined(HEDLEY_SENTINEL)
1299 #  undef HEDLEY_SENTINEL
1300 #endif
1301 #if \
1302   HEDLEY_HAS_ATTRIBUTE(sentinel) || \
1303   HEDLEY_GCC_VERSION_CHECK(4,0,0) || \
1304   HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
1305   HEDLEY_ARM_VERSION_CHECK(5,4,0) || \
1306   HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
1307 #  define HEDLEY_SENTINEL(position) __attribute__((__sentinel__(position)))
1308 #else
1309 #  define HEDLEY_SENTINEL(position)
1310 #endif
1311 
1312 #if defined(HEDLEY_NO_RETURN)
1313 #  undef HEDLEY_NO_RETURN
1314 #endif
1315 #if HEDLEY_IAR_VERSION_CHECK(8,0,0)
1316 #  define HEDLEY_NO_RETURN __noreturn
1317 #elif \
1318   HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
1319   HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
1320 #  define HEDLEY_NO_RETURN __attribute__((__noreturn__))
1321 #elif defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L
1322 #  define HEDLEY_NO_RETURN _Noreturn
1323 #elif defined(__cplusplus) && (__cplusplus >= 201103L)
1324 #  define HEDLEY_NO_RETURN HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[noreturn]])
1325 #elif \
1326   HEDLEY_HAS_ATTRIBUTE(noreturn) || \
1327   HEDLEY_GCC_VERSION_CHECK(3,2,0) || \
1328   HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \
1329   HEDLEY_ARM_VERSION_CHECK(4,1,0) || \
1330   HEDLEY_IBM_VERSION_CHECK(10,1,0) || \
1331   HEDLEY_TI_VERSION_CHECK(15,12,0) || \
1332   (HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
1333   HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \
1334   (HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
1335   HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \
1336   (HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
1337   HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \
1338   (HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
1339   HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \
1340   HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
1341   HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \
1342   HEDLEY_IAR_VERSION_CHECK(8,10,0)
1343 #  define HEDLEY_NO_RETURN __attribute__((__noreturn__))
1344 #elif HEDLEY_SUNPRO_VERSION_CHECK(5,10,0)
1345 #  define HEDLEY_NO_RETURN _Pragma("does_not_return")
1346 #elif \
1347   HEDLEY_MSVC_VERSION_CHECK(13,10,0) || \
1348   HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0)
1349 #  define HEDLEY_NO_RETURN __declspec(noreturn)
1350 #elif HEDLEY_TI_CL6X_VERSION_CHECK(6,0,0) && defined(__cplusplus)
1351 #  define HEDLEY_NO_RETURN _Pragma("FUNC_NEVER_RETURNS;")
1352 #elif HEDLEY_COMPCERT_VERSION_CHECK(3,2,0)
1353 #  define HEDLEY_NO_RETURN __attribute((noreturn))
1354 #elif HEDLEY_PELLES_VERSION_CHECK(9,0,0)
1355 #  define HEDLEY_NO_RETURN __declspec(noreturn)
1356 #else
1357 #  define HEDLEY_NO_RETURN
1358 #endif
1359 
1360 #if defined(HEDLEY_NO_ESCAPE)
1361 #  undef HEDLEY_NO_ESCAPE
1362 #endif
1363 #if HEDLEY_HAS_ATTRIBUTE(noescape)
1364 #  define HEDLEY_NO_ESCAPE __attribute__((__noescape__))
1365 #else
1366 #  define HEDLEY_NO_ESCAPE
1367 #endif
1368 
1369 #if defined(HEDLEY_UNREACHABLE)
1370 #  undef HEDLEY_UNREACHABLE
1371 #endif
1372 #if defined(HEDLEY_UNREACHABLE_RETURN)
1373 #  undef HEDLEY_UNREACHABLE_RETURN
1374 #endif
1375 #if defined(HEDLEY_ASSUME)
1376 #  undef HEDLEY_ASSUME
1377 #endif
1378 #if \
1379   HEDLEY_MSVC_VERSION_CHECK(13,10,0) || \
1380   HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
1381   HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0)
1382 #  define HEDLEY_ASSUME(expr) __assume(expr)
1383 #elif HEDLEY_HAS_BUILTIN(__builtin_assume)
1384 #  define HEDLEY_ASSUME(expr) __builtin_assume(expr)
1385 #elif \
1386     HEDLEY_TI_CL2000_VERSION_CHECK(6,2,0) || \
1387     HEDLEY_TI_CL6X_VERSION_CHECK(4,0,0)
1388 #  if defined(__cplusplus)
1389 #    define HEDLEY_ASSUME(expr) std::_nassert(expr)
1390 #  else
1391 #    define HEDLEY_ASSUME(expr) _nassert(expr)
1392 #  endif
1393 #endif
1394 #if \
1395   (HEDLEY_HAS_BUILTIN(__builtin_unreachable) && (!defined(HEDLEY_ARM_VERSION))) || \
1396   HEDLEY_GCC_VERSION_CHECK(4,5,0) || \
1397   HEDLEY_PGI_VERSION_CHECK(18,10,0) || \
1398   HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
1399   HEDLEY_IBM_VERSION_CHECK(13,1,5) || \
1400   HEDLEY_CRAY_VERSION_CHECK(10,0,0) || \
1401   HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
1402 #  define HEDLEY_UNREACHABLE() __builtin_unreachable()
1403 #elif defined(HEDLEY_ASSUME)
1404 #  define HEDLEY_UNREACHABLE() HEDLEY_ASSUME(0)
1405 #endif
1406 #if !defined(HEDLEY_ASSUME)
1407 #  if defined(HEDLEY_UNREACHABLE)
1408 #    define HEDLEY_ASSUME(expr) HEDLEY_STATIC_CAST(void, ((expr) ? 1 : (HEDLEY_UNREACHABLE(), 1)))
1409 #  else
1410 #    define HEDLEY_ASSUME(expr) HEDLEY_STATIC_CAST(void, expr)
1411 #  endif
1412 #endif
1413 #if defined(HEDLEY_UNREACHABLE)
1414 #  if  \
1415       HEDLEY_TI_CL2000_VERSION_CHECK(6,2,0) || \
1416       HEDLEY_TI_CL6X_VERSION_CHECK(4,0,0)
1417 #    define HEDLEY_UNREACHABLE_RETURN(value) return (HEDLEY_STATIC_CAST(void, HEDLEY_ASSUME(0)), (value))
1418 #  else
1419 #    define HEDLEY_UNREACHABLE_RETURN(value) HEDLEY_UNREACHABLE()
1420 #  endif
1421 #else
1422 #  define HEDLEY_UNREACHABLE_RETURN(value) return (value)
1423 #endif
1424 #if !defined(HEDLEY_UNREACHABLE)
1425 #  define HEDLEY_UNREACHABLE() HEDLEY_ASSUME(0)
1426 #endif
1427 
1428 HEDLEY_DIAGNOSTIC_PUSH
1429 #if HEDLEY_HAS_WARNING("-Wpedantic")
1430 #  pragma clang diagnostic ignored "-Wpedantic"
1431 #endif
1432 #if HEDLEY_HAS_WARNING("-Wc++98-compat-pedantic") && defined(__cplusplus)
1433 #  pragma clang diagnostic ignored "-Wc++98-compat-pedantic"
1434 #endif
1435 #if HEDLEY_GCC_HAS_WARNING("-Wvariadic-macros",4,0,0)
1436 #  if defined(__clang__)
1437 #    pragma clang diagnostic ignored "-Wvariadic-macros"
1438 #  elif defined(HEDLEY_GCC_VERSION)
1439 #    pragma GCC diagnostic ignored "-Wvariadic-macros"
1440 #  endif
1441 #endif
1442 #if defined(HEDLEY_NON_NULL)
1443 #  undef HEDLEY_NON_NULL
1444 #endif
1445 #if \
1446   HEDLEY_HAS_ATTRIBUTE(nonnull) || \
1447   HEDLEY_GCC_VERSION_CHECK(3,3,0) || \
1448   HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
1449   HEDLEY_ARM_VERSION_CHECK(4,1,0)
1450 #  define HEDLEY_NON_NULL(...) __attribute__((__nonnull__(__VA_ARGS__)))
1451 #else
1452 #  define HEDLEY_NON_NULL(...)
1453 #endif
1454 HEDLEY_DIAGNOSTIC_POP
1455 
1456 #if defined(HEDLEY_PRINTF_FORMAT)
1457 #  undef HEDLEY_PRINTF_FORMAT
1458 #endif
1459 #if defined(__MINGW32__) && HEDLEY_GCC_HAS_ATTRIBUTE(format,4,4,0) && !defined(__USE_MINGW_ANSI_STDIO)
1460 #  define HEDLEY_PRINTF_FORMAT(string_idx,first_to_check) __attribute__((__format__(ms_printf, string_idx, first_to_check)))
1461 #elif defined(__MINGW32__) && HEDLEY_GCC_HAS_ATTRIBUTE(format,4,4,0) && defined(__USE_MINGW_ANSI_STDIO)
1462 #  define HEDLEY_PRINTF_FORMAT(string_idx,first_to_check) __attribute__((__format__(gnu_printf, string_idx, first_to_check)))
1463 #elif \
1464   HEDLEY_HAS_ATTRIBUTE(format) || \
1465   HEDLEY_GCC_VERSION_CHECK(3,1,0) || \
1466   HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
1467   HEDLEY_ARM_VERSION_CHECK(5,6,0) || \
1468   HEDLEY_IBM_VERSION_CHECK(10,1,0) || \
1469   HEDLEY_TI_VERSION_CHECK(15,12,0) || \
1470   (HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
1471   HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \
1472   (HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
1473   HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \
1474   (HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
1475   HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \
1476   (HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
1477   HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \
1478   HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
1479   HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \
1480   HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
1481 #  define HEDLEY_PRINTF_FORMAT(string_idx,first_to_check) __attribute__((__format__(__printf__, string_idx, first_to_check)))
1482 #elif HEDLEY_PELLES_VERSION_CHECK(6,0,0)
1483 #  define HEDLEY_PRINTF_FORMAT(string_idx,first_to_check) __declspec(vaformat(printf,string_idx,first_to_check))
1484 #else
1485 #  define HEDLEY_PRINTF_FORMAT(string_idx,first_to_check)
1486 #endif
1487 
1488 #if defined(HEDLEY_CONSTEXPR)
1489 #  undef HEDLEY_CONSTEXPR
1490 #endif
1491 #if defined(__cplusplus)
1492 #  if __cplusplus >= 201103L
1493 #    define HEDLEY_CONSTEXPR HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(constexpr)
1494 #  endif
1495 #endif
1496 #if !defined(HEDLEY_CONSTEXPR)
1497 #  define HEDLEY_CONSTEXPR
1498 #endif
1499 
1500 #if defined(HEDLEY_PREDICT)
1501 #  undef HEDLEY_PREDICT
1502 #endif
1503 #if defined(HEDLEY_LIKELY)
1504 #  undef HEDLEY_LIKELY
1505 #endif
1506 #if defined(HEDLEY_UNLIKELY)
1507 #  undef HEDLEY_UNLIKELY
1508 #endif
1509 #if defined(HEDLEY_UNPREDICTABLE)
1510 #  undef HEDLEY_UNPREDICTABLE
1511 #endif
1512 #if HEDLEY_HAS_BUILTIN(__builtin_unpredictable)
1513 #  define HEDLEY_UNPREDICTABLE(expr) __builtin_unpredictable((expr))
1514 #endif
1515 #if \
1516   (HEDLEY_HAS_BUILTIN(__builtin_expect_with_probability) && !defined(HEDLEY_PGI_VERSION)) || \
1517   HEDLEY_GCC_VERSION_CHECK(9,0,0) || \
1518   HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
1519 #  define HEDLEY_PREDICT(expr, value, probability) __builtin_expect_with_probability(  (expr), (value), (probability))
1520 #  define HEDLEY_PREDICT_TRUE(expr, probability)   __builtin_expect_with_probability(!!(expr),    1   , (probability))
1521 #  define HEDLEY_PREDICT_FALSE(expr, probability)  __builtin_expect_with_probability(!!(expr),    0   , (probability))
1522 #  define HEDLEY_LIKELY(expr)                      __builtin_expect                 (!!(expr),    1                  )
1523 #  define HEDLEY_UNLIKELY(expr)                    __builtin_expect                 (!!(expr),    0                  )
1524 #elif \
1525   (HEDLEY_HAS_BUILTIN(__builtin_expect) && !defined(HEDLEY_INTEL_CL_VERSION)) || \
1526   HEDLEY_GCC_VERSION_CHECK(3,0,0) || \
1527   HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
1528   (HEDLEY_SUNPRO_VERSION_CHECK(5,15,0) && defined(__cplusplus)) || \
1529   HEDLEY_ARM_VERSION_CHECK(4,1,0) || \
1530   HEDLEY_IBM_VERSION_CHECK(10,1,0) || \
1531   HEDLEY_TI_VERSION_CHECK(15,12,0) || \
1532   HEDLEY_TI_ARMCL_VERSION_CHECK(4,7,0) || \
1533   HEDLEY_TI_CL430_VERSION_CHECK(3,1,0) || \
1534   HEDLEY_TI_CL2000_VERSION_CHECK(6,1,0) || \
1535   HEDLEY_TI_CL6X_VERSION_CHECK(6,1,0) || \
1536   HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
1537   HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \
1538   HEDLEY_TINYC_VERSION_CHECK(0,9,27) || \
1539   HEDLEY_CRAY_VERSION_CHECK(8,1,0) || \
1540   HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
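  /* Note on this fallback: plain __builtin_expect takes no probability
   * argument, so HEDLEY_PREDICT only passes a hint when the probability is
   * at least 0.9, and HEDLEY_PREDICT_TRUE/HEDLEY_PREDICT_FALSE invert the
   * hint when the probability is at most 0.1; anything in between leaves
   * the expression unhinted. */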
1541 #  define HEDLEY_PREDICT(expr, expected, probability) \
1542      (((probability) >= 0.9) ? __builtin_expect((expr), (expected)) : (HEDLEY_STATIC_CAST(void, expected), (expr)))
1543 #  define HEDLEY_PREDICT_TRUE(expr, probability) \
1544      (__extension__ ({ \
1545        double hedley_probability_ = (probability); \
1546        ((hedley_probability_ >= 0.9) ? __builtin_expect(!!(expr), 1) : ((hedley_probability_ <= 0.1) ? __builtin_expect(!!(expr), 0) : !!(expr))); \
1547      }))
1548 #  define HEDLEY_PREDICT_FALSE(expr, probability) \
1549      (__extension__ ({ \
1550        double hedley_probability_ = (probability); \
1551        ((hedley_probability_ >= 0.9) ? __builtin_expect(!!(expr), 0) : ((hedley_probability_ <= 0.1) ? __builtin_expect(!!(expr), 1) : !!(expr))); \
1552      }))
1553 #  define HEDLEY_LIKELY(expr)   __builtin_expect(!!(expr), 1)
1554 #  define HEDLEY_UNLIKELY(expr) __builtin_expect(!!(expr), 0)
1555 #else
1556 #  define HEDLEY_PREDICT(expr, expected, probability) (HEDLEY_STATIC_CAST(void, expected), (expr))
1557 #  define HEDLEY_PREDICT_TRUE(expr, probability) (!!(expr))
1558 #  define HEDLEY_PREDICT_FALSE(expr, probability) (!!(expr))
1559 #  define HEDLEY_LIKELY(expr) (!!(expr))
1560 #  define HEDLEY_UNLIKELY(expr) (!!(expr))
1561 #endif
1562 #if !defined(HEDLEY_UNPREDICTABLE)
1563 #  define HEDLEY_UNPREDICTABLE(expr) HEDLEY_PREDICT(expr, 1, 0.5)
1564 #endif
1565 
1566 #if defined(HEDLEY_MALLOC)
1567 #  undef HEDLEY_MALLOC
1568 #endif
1569 #if \
1570   HEDLEY_HAS_ATTRIBUTE(malloc) || \
1571   HEDLEY_GCC_VERSION_CHECK(3,1,0) || \
1572   HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
1573   HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \
1574   HEDLEY_ARM_VERSION_CHECK(4,1,0) || \
1575   HEDLEY_IBM_VERSION_CHECK(12,1,0) || \
1576   HEDLEY_TI_VERSION_CHECK(15,12,0) || \
1577   (HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
1578   HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \
1579   (HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
1580   HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \
1581   (HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
1582   HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \
1583   (HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
1584   HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \
1585   HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
1586   HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \
1587   HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
1588 #  define HEDLEY_MALLOC __attribute__((__malloc__))
1589 #elif HEDLEY_SUNPRO_VERSION_CHECK(5,10,0)
1590 #  define HEDLEY_MALLOC _Pragma("returns_new_memory")
1591 #elif \
1592   HEDLEY_MSVC_VERSION_CHECK(14,0,0) || \
1593   HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0)
1594 #  define HEDLEY_MALLOC __declspec(restrict)
1595 #else
1596 #  define HEDLEY_MALLOC
1597 #endif
1598 
1599 #if defined(HEDLEY_PURE)
1600 #  undef HEDLEY_PURE
1601 #endif
1602 #if \
1603   HEDLEY_HAS_ATTRIBUTE(pure) || \
1604   HEDLEY_GCC_VERSION_CHECK(2,96,0) || \
1605   HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
1606   HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \
1607   HEDLEY_ARM_VERSION_CHECK(4,1,0) || \
1608   HEDLEY_IBM_VERSION_CHECK(10,1,0) || \
1609   HEDLEY_TI_VERSION_CHECK(15,12,0) || \
1610   (HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
1611   HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \
1612   (HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
1613   HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \
1614   (HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
1615   HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \
1616   (HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
1617   HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \
1618   HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
1619   HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \
1620   HEDLEY_PGI_VERSION_CHECK(17,10,0) || \
1621   HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
1622 #  define HEDLEY_PURE __attribute__((__pure__))
1623 #elif HEDLEY_SUNPRO_VERSION_CHECK(5,10,0)
1624 #  define HEDLEY_PURE _Pragma("does_not_write_global_data")
1625 #elif defined(__cplusplus) && \
1626     ( \
1627       HEDLEY_TI_CL430_VERSION_CHECK(2,0,1) || \
1628       HEDLEY_TI_CL6X_VERSION_CHECK(4,0,0) || \
1629       HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) \
1630     )
1631 #  define HEDLEY_PURE _Pragma("FUNC_IS_PURE;")
1632 #else
1633 #  define HEDLEY_PURE
1634 #endif
1635 
1636 #if defined(HEDLEY_CONST)
1637 #  undef HEDLEY_CONST
1638 #endif
1639 #if \
1640   HEDLEY_HAS_ATTRIBUTE(const) || \
1641   HEDLEY_GCC_VERSION_CHECK(2,5,0) || \
1642   HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
1643   HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \
1644   HEDLEY_ARM_VERSION_CHECK(4,1,0) || \
1645   HEDLEY_IBM_VERSION_CHECK(10,1,0) || \
1646   HEDLEY_TI_VERSION_CHECK(15,12,0) || \
1647   (HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
1648   HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \
1649   (HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
1650   HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \
1651   (HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
1652   HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \
1653   (HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
1654   HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \
1655   HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
1656   HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \
1657   HEDLEY_PGI_VERSION_CHECK(17,10,0) || \
1658   HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
1659 #  define HEDLEY_CONST __attribute__((__const__))
1660 #elif \
1661   HEDLEY_SUNPRO_VERSION_CHECK(5,10,0)
1662 #  define HEDLEY_CONST _Pragma("no_side_effect")
1663 #else
1664 #  define HEDLEY_CONST HEDLEY_PURE
1665 #endif
1666 
1667 #if defined(HEDLEY_RESTRICT)
1668 #  undef HEDLEY_RESTRICT
1669 #endif
1670 #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) && !defined(__cplusplus)
1671 #  define HEDLEY_RESTRICT restrict
1672 #elif \
1673   HEDLEY_GCC_VERSION_CHECK(3,1,0) || \
1674   HEDLEY_MSVC_VERSION_CHECK(14,0,0) || \
1675   HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
1676   HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) || \
1677   HEDLEY_ARM_VERSION_CHECK(4,1,0) || \
1678   HEDLEY_IBM_VERSION_CHECK(10,1,0) || \
1679   HEDLEY_PGI_VERSION_CHECK(17,10,0) || \
1680   HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \
1681   HEDLEY_TI_CL2000_VERSION_CHECK(6,2,4) || \
1682   HEDLEY_TI_CL6X_VERSION_CHECK(8,1,0) || \
1683   HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
1684   (HEDLEY_SUNPRO_VERSION_CHECK(5,14,0) && defined(__cplusplus)) || \
1685   HEDLEY_IAR_VERSION_CHECK(8,0,0) || \
1686   defined(__clang__) || \
1687   HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
1688 #  define HEDLEY_RESTRICT __restrict
1689 #elif HEDLEY_SUNPRO_VERSION_CHECK(5,3,0) && !defined(__cplusplus)
1690 #  define HEDLEY_RESTRICT _Restrict
1691 #else
1692 #  define HEDLEY_RESTRICT
1693 #endif
1694 
1695 #if defined(HEDLEY_INLINE)
1696 #  undef HEDLEY_INLINE
1697 #endif
1698 #if \
1699   (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)) || \
1700   (defined(__cplusplus) && (__cplusplus >= 199711L))
1701 #  define HEDLEY_INLINE inline
1702 #elif \
1703   defined(HEDLEY_GCC_VERSION) || \
1704   HEDLEY_ARM_VERSION_CHECK(6,2,0)
1705 #  define HEDLEY_INLINE __inline__
1706 #elif \
1707   HEDLEY_MSVC_VERSION_CHECK(12,0,0) || \
1708   HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) || \
1709   HEDLEY_ARM_VERSION_CHECK(4,1,0) || \
1710   HEDLEY_TI_ARMCL_VERSION_CHECK(5,1,0) || \
1711   HEDLEY_TI_CL430_VERSION_CHECK(3,1,0) || \
1712   HEDLEY_TI_CL2000_VERSION_CHECK(6,2,0) || \
1713   HEDLEY_TI_CL6X_VERSION_CHECK(8,0,0) || \
1714   HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
1715   HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \
1716   HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
1717 #  define HEDLEY_INLINE __inline
1718 #else
1719 #  define HEDLEY_INLINE
1720 #endif
1721 
1722 #if defined(HEDLEY_ALWAYS_INLINE)
1723 #  undef HEDLEY_ALWAYS_INLINE
1724 #endif
1725 #if \
1726   HEDLEY_HAS_ATTRIBUTE(always_inline) || \
1727   HEDLEY_GCC_VERSION_CHECK(4,0,0) || \
1728   HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
1729   HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \
1730   HEDLEY_ARM_VERSION_CHECK(4,1,0) || \
1731   HEDLEY_IBM_VERSION_CHECK(10,1,0) || \
1732   HEDLEY_TI_VERSION_CHECK(15,12,0) || \
1733   (HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
1734   HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \
1735   (HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
1736   HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \
1737   (HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
1738   HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \
1739   (HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
1740   HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \
1741   HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
1742   HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \
1743   HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) || \
1744   HEDLEY_IAR_VERSION_CHECK(8,10,0)
1745 #  define HEDLEY_ALWAYS_INLINE __attribute__((__always_inline__)) HEDLEY_INLINE
1746 #elif \
1747   HEDLEY_MSVC_VERSION_CHECK(12,0,0) || \
1748   HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0)
1749 #  define HEDLEY_ALWAYS_INLINE __forceinline
1750 #elif defined(__cplusplus) && \
1751     ( \
1752       HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \
1753       HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \
1754       HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \
1755       HEDLEY_TI_CL6X_VERSION_CHECK(6,1,0) || \
1756       HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
1757       HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) \
1758     )
1759 #  define HEDLEY_ALWAYS_INLINE _Pragma("FUNC_ALWAYS_INLINE;")
1760 #elif HEDLEY_IAR_VERSION_CHECK(8,0,0)
1761 #  define HEDLEY_ALWAYS_INLINE _Pragma("inline=forced")
1762 #else
1763 #  define HEDLEY_ALWAYS_INLINE HEDLEY_INLINE
1764 #endif
1765 
1766 #if defined(HEDLEY_NEVER_INLINE)
1767 #  undef HEDLEY_NEVER_INLINE
1768 #endif
1769 #if \
1770   HEDLEY_HAS_ATTRIBUTE(noinline) || \
1771   HEDLEY_GCC_VERSION_CHECK(4,0,0) || \
1772   HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
1773   HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \
1774   HEDLEY_ARM_VERSION_CHECK(4,1,0) || \
1775   HEDLEY_IBM_VERSION_CHECK(10,1,0) || \
1776   HEDLEY_TI_VERSION_CHECK(15,12,0) || \
1777   (HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
1778   HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \
1779   (HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
1780   HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \
1781   (HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
1782   HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \
1783   (HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
1784   HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \
1785   HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
1786   HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \
1787   HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) || \
1788   HEDLEY_IAR_VERSION_CHECK(8,10,0)
1789 #  define HEDLEY_NEVER_INLINE __attribute__((__noinline__))
1790 #elif \
1791   HEDLEY_MSVC_VERSION_CHECK(13,10,0) || \
1792   HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0)
1793 #  define HEDLEY_NEVER_INLINE __declspec(noinline)
1794 #elif HEDLEY_PGI_VERSION_CHECK(10,2,0)
1795 #  define HEDLEY_NEVER_INLINE _Pragma("noinline")
1796 #elif HEDLEY_TI_CL6X_VERSION_CHECK(6,0,0) && defined(__cplusplus)
1797 #  define HEDLEY_NEVER_INLINE _Pragma("FUNC_CANNOT_INLINE;")
1798 #elif HEDLEY_IAR_VERSION_CHECK(8,0,0)
1799 #  define HEDLEY_NEVER_INLINE _Pragma("inline=never")
1800 #elif HEDLEY_COMPCERT_VERSION_CHECK(3,2,0)
1801 #  define HEDLEY_NEVER_INLINE __attribute((noinline))
1802 #elif HEDLEY_PELLES_VERSION_CHECK(9,0,0)
1803 #  define HEDLEY_NEVER_INLINE __declspec(noinline)
1804 #else
1805 #  define HEDLEY_NEVER_INLINE
1806 #endif
1807 
1808 #if defined(HEDLEY_PRIVATE)
1809 #  undef HEDLEY_PRIVATE
1810 #endif
1811 #if defined(HEDLEY_PUBLIC)
1812 #  undef HEDLEY_PUBLIC
1813 #endif
1814 #if defined(HEDLEY_IMPORT)
1815 #  undef HEDLEY_IMPORT
1816 #endif
1817 #if defined(_WIN32) || defined(__CYGWIN__)
1818 #  define HEDLEY_PRIVATE
1819 #  define HEDLEY_PUBLIC   __declspec(dllexport)
1820 #  define HEDLEY_IMPORT   __declspec(dllimport)
1821 #else
1822 #  if \
1823     HEDLEY_HAS_ATTRIBUTE(visibility) || \
1824     HEDLEY_GCC_VERSION_CHECK(3,3,0) || \
1825     HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \
1826     HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
1827     HEDLEY_ARM_VERSION_CHECK(4,1,0) || \
1828     HEDLEY_IBM_VERSION_CHECK(13,1,0) || \
1829     ( \
1830       defined(__TI_EABI__) && \
1831       ( \
1832         (HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
1833         HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) \
1834       ) \
1835     ) || \
1836     HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
1837 #    define HEDLEY_PRIVATE __attribute__((__visibility__("hidden")))
1838 #    define HEDLEY_PUBLIC  __attribute__((__visibility__("default")))
1839 #  else
1840 #    define HEDLEY_PRIVATE
1841 #    define HEDLEY_PUBLIC
1842 #  endif
1843 #  define HEDLEY_IMPORT    extern
1844 #endif
1845 
1846 #if defined(HEDLEY_NO_THROW)
1847 #  undef HEDLEY_NO_THROW
1848 #endif
1849 #if \
1850   HEDLEY_HAS_ATTRIBUTE(nothrow) || \
1851   HEDLEY_GCC_VERSION_CHECK(3,3,0) || \
1852   HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
1853   HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
1854 #  define HEDLEY_NO_THROW __attribute__((__nothrow__))
1855 #elif \
1856   HEDLEY_MSVC_VERSION_CHECK(13,1,0) || \
1857   HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) || \
1858   HEDLEY_ARM_VERSION_CHECK(4,1,0)
1859 #  define HEDLEY_NO_THROW __declspec(nothrow)
1860 #else
1861 #  define HEDLEY_NO_THROW
1862 #endif
1863 
1864 #if defined(HEDLEY_FALL_THROUGH)
1865 # undef HEDLEY_FALL_THROUGH
1866 #endif
1867 #if \
1868   HEDLEY_HAS_ATTRIBUTE(fallthrough) || \
1869   HEDLEY_GCC_VERSION_CHECK(7,0,0) || \
1870   HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
1871 #  define HEDLEY_FALL_THROUGH __attribute__((__fallthrough__))
1872 #elif HEDLEY_HAS_CPP_ATTRIBUTE_NS(clang,fallthrough)
1873 #  define HEDLEY_FALL_THROUGH HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[clang::fallthrough]])
1874 #elif HEDLEY_HAS_CPP_ATTRIBUTE(fallthrough)
1875 #  define HEDLEY_FALL_THROUGH HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[fallthrough]])
1876 #elif defined(__fallthrough) /* SAL */
1877 #  define HEDLEY_FALL_THROUGH __fallthrough
1878 #else
1879 #  define HEDLEY_FALL_THROUGH
1880 #endif
1881 
1882 #if defined(HEDLEY_RETURNS_NON_NULL)
1883 #  undef HEDLEY_RETURNS_NON_NULL
1884 #endif
1885 #if \
1886   HEDLEY_HAS_ATTRIBUTE(returns_nonnull) || \
1887   HEDLEY_GCC_VERSION_CHECK(4,9,0) || \
1888   HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
1889 #  define HEDLEY_RETURNS_NON_NULL __attribute__((__returns_nonnull__))
1890 #elif defined(_Ret_notnull_) /* SAL */
1891 #  define HEDLEY_RETURNS_NON_NULL _Ret_notnull_
1892 #else
1893 #  define HEDLEY_RETURNS_NON_NULL
1894 #endif
1895 
1896 #if defined(HEDLEY_ARRAY_PARAM)
1897 #  undef HEDLEY_ARRAY_PARAM
1898 #endif
1899 #if \
1900   defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) && \
1901   !defined(__STDC_NO_VLA__) && \
1902   !defined(__cplusplus) && \
1903   !defined(HEDLEY_PGI_VERSION) && \
1904   !defined(HEDLEY_TINYC_VERSION)
1905 #  define HEDLEY_ARRAY_PARAM(name) (name)
1906 #else
1907 #  define HEDLEY_ARRAY_PARAM(name)
1908 #endif
1909 
1910 #if defined(HEDLEY_IS_CONSTANT)
1911 #  undef HEDLEY_IS_CONSTANT
1912 #endif
1913 #if defined(HEDLEY_REQUIRE_CONSTEXPR)
1914 #  undef HEDLEY_REQUIRE_CONSTEXPR
1915 #endif
1916 /* HEDLEY_IS_CONSTEXPR_ is for
1917    HEDLEY INTERNAL USE ONLY.  API subject to change without notice. */
1918 #if defined(HEDLEY_IS_CONSTEXPR_)
1919 #  undef HEDLEY_IS_CONSTEXPR_
1920 #endif
1921 #if \
1922   HEDLEY_HAS_BUILTIN(__builtin_constant_p) || \
1923   HEDLEY_GCC_VERSION_CHECK(3,4,0) || \
1924   HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
1925   HEDLEY_TINYC_VERSION_CHECK(0,9,19) || \
1926   HEDLEY_ARM_VERSION_CHECK(4,1,0) || \
1927   HEDLEY_IBM_VERSION_CHECK(13,1,0) || \
1928   HEDLEY_TI_CL6X_VERSION_CHECK(6,1,0) || \
1929   (HEDLEY_SUNPRO_VERSION_CHECK(5,10,0) && !defined(__cplusplus)) || \
1930   HEDLEY_CRAY_VERSION_CHECK(8,1,0) || \
1931   HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
1932 #  define HEDLEY_IS_CONSTANT(expr) __builtin_constant_p(expr)
1933 #endif
1934 #if !defined(__cplusplus)
1935 #  if \
1936        HEDLEY_HAS_BUILTIN(__builtin_types_compatible_p) || \
1937        HEDLEY_GCC_VERSION_CHECK(3,4,0) || \
1938        HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
1939        HEDLEY_IBM_VERSION_CHECK(13,1,0) || \
1940        HEDLEY_CRAY_VERSION_CHECK(8,1,0) || \
1941        HEDLEY_ARM_VERSION_CHECK(5,4,0) || \
1942        HEDLEY_TINYC_VERSION_CHECK(0,9,24)
1943 #    if defined(__INTPTR_TYPE__)
1944 #      define HEDLEY_IS_CONSTEXPR_(expr) __builtin_types_compatible_p(__typeof__((1 ? (void*) ((__INTPTR_TYPE__) ((expr) * 0)) : (int*) 0)), int*)
1945 #    else
1946 #      include <stdint.h>
1947 #      define HEDLEY_IS_CONSTEXPR_(expr) __builtin_types_compatible_p(__typeof__((1 ? (void*) ((intptr_t) ((expr) * 0)) : (int*) 0)), int*)
1948 #    endif
1949 #  elif \
1950        ( \
1951           defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) && \
1952           !defined(HEDLEY_SUNPRO_VERSION) && \
1953           !defined(HEDLEY_PGI_VERSION) && \
1954           !defined(HEDLEY_IAR_VERSION)) || \
1955        (HEDLEY_HAS_EXTENSION(c_generic_selections) && !defined(HEDLEY_IAR_VERSION)) || \
1956        HEDLEY_GCC_VERSION_CHECK(4,9,0) || \
1957        HEDLEY_INTEL_VERSION_CHECK(17,0,0) || \
1958        HEDLEY_IBM_VERSION_CHECK(12,1,0) || \
1959        HEDLEY_ARM_VERSION_CHECK(5,3,0)
1960 #    if defined(__INTPTR_TYPE__)
1961 #      define HEDLEY_IS_CONSTEXPR_(expr) _Generic((1 ? (void*) ((__INTPTR_TYPE__) ((expr) * 0)) : (int*) 0), int*: 1, void*: 0)
1962 #    else
1963 #      include <stdint.h>
1964 #      define HEDLEY_IS_CONSTEXPR_(expr) _Generic((1 ? (void*) ((intptr_t) ((expr) * 0)) : (int*) 0), int*: 1, void*: 0)
1965 #    endif
1966 #  elif \
1967        defined(HEDLEY_GCC_VERSION) || \
1968        defined(HEDLEY_INTEL_VERSION) || \
1969        defined(HEDLEY_TINYC_VERSION) || \
1970        defined(HEDLEY_TI_ARMCL_VERSION) || \
1971        HEDLEY_TI_CL430_VERSION_CHECK(18,12,0) || \
1972        defined(HEDLEY_TI_CL2000_VERSION) || \
1973        defined(HEDLEY_TI_CL6X_VERSION) || \
1974        defined(HEDLEY_TI_CL7X_VERSION) || \
1975        defined(HEDLEY_TI_CLPRU_VERSION) || \
1976        defined(__clang__)
1977 #    define HEDLEY_IS_CONSTEXPR_(expr) ( \
1978          sizeof(void) != \
1979          sizeof(*( \
1980            1 ? \
1981              ((void*) ((expr) * 0L) ) : \
1982              ((struct { char v[sizeof(void) * 2]; } *) 1) \
1983            ) \
1984          ) \
1985        )
1986 #  endif
1987 #endif
1988 #if defined(HEDLEY_IS_CONSTEXPR_)
1989 #  if !defined(HEDLEY_IS_CONSTANT)
1990 #    define HEDLEY_IS_CONSTANT(expr) HEDLEY_IS_CONSTEXPR_(expr)
1991 #  endif
1992 #  define HEDLEY_REQUIRE_CONSTEXPR(expr) (HEDLEY_IS_CONSTEXPR_(expr) ? (expr) : (-1))
1993 #else
1994 #  if !defined(HEDLEY_IS_CONSTANT)
1995 #    define HEDLEY_IS_CONSTANT(expr) (0)
1996 #  endif
1997 #  define HEDLEY_REQUIRE_CONSTEXPR(expr) (expr)
1998 #endif
1999 
2000 #if defined(HEDLEY_BEGIN_C_DECLS)
2001 #  undef HEDLEY_BEGIN_C_DECLS
2002 #endif
2003 #if defined(HEDLEY_END_C_DECLS)
2004 #  undef HEDLEY_END_C_DECLS
2005 #endif
2006 #if defined(HEDLEY_C_DECL)
2007 #  undef HEDLEY_C_DECL
2008 #endif
2009 #if defined(__cplusplus)
2010 #  define HEDLEY_BEGIN_C_DECLS extern "C" {
2011 #  define HEDLEY_END_C_DECLS }
2012 #  define HEDLEY_C_DECL extern "C"
2013 #else
2014 #  define HEDLEY_BEGIN_C_DECLS
2015 #  define HEDLEY_END_C_DECLS
2016 #  define HEDLEY_C_DECL
2017 #endif
2018 
2019 #if defined(HEDLEY_STATIC_ASSERT)
2020 #  undef HEDLEY_STATIC_ASSERT
2021 #endif
2022 #if \
2023   !defined(__cplusplus) && ( \
2024       (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)) || \
2025       (HEDLEY_HAS_FEATURE(c_static_assert) && !defined(HEDLEY_INTEL_CL_VERSION)) || \
2026       HEDLEY_GCC_VERSION_CHECK(6,0,0) || \
2027       HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
2028       defined(_Static_assert) \
2029     )
2030 #  define HEDLEY_STATIC_ASSERT(expr, message) _Static_assert(expr, message)
2031 #elif \
2032   (defined(__cplusplus) && (__cplusplus >= 201103L)) || \
2033   HEDLEY_MSVC_VERSION_CHECK(16,0,0) || \
2034   HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0)
2035 #  define HEDLEY_STATIC_ASSERT(expr, message) HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(static_assert(expr, message))
2036 #else
2037 #  define HEDLEY_STATIC_ASSERT(expr, message)
2038 #endif
2039 
2040 #if defined(HEDLEY_NULL)
2041 #  undef HEDLEY_NULL
2042 #endif
2043 #if defined(__cplusplus)
2044 #  if __cplusplus >= 201103L
2045 #    define HEDLEY_NULL HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(nullptr)
2046 #  elif defined(NULL)
2047 #    define HEDLEY_NULL NULL
2048 #  else
2049 #    define HEDLEY_NULL HEDLEY_STATIC_CAST(void*, 0)
2050 #  endif
2051 #elif defined(NULL)
2052 #  define HEDLEY_NULL NULL
2053 #else
2054 #  define HEDLEY_NULL ((void*) 0)
2055 #endif
2056 
2057 #if defined(HEDLEY_MESSAGE)
2058 #  undef HEDLEY_MESSAGE
2059 #endif
2060 #if HEDLEY_HAS_WARNING("-Wunknown-pragmas")
2061 #  define HEDLEY_MESSAGE(msg) \
2062   HEDLEY_DIAGNOSTIC_PUSH \
2063   HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS \
2064   HEDLEY_PRAGMA(message msg) \
2065   HEDLEY_DIAGNOSTIC_POP
2066 #elif \
2067   HEDLEY_GCC_VERSION_CHECK(4,4,0) || \
2068   HEDLEY_INTEL_VERSION_CHECK(13,0,0)
2069 #  define HEDLEY_MESSAGE(msg) HEDLEY_PRAGMA(message msg)
2070 #elif HEDLEY_CRAY_VERSION_CHECK(5,0,0)
2071 #  define HEDLEY_MESSAGE(msg) HEDLEY_PRAGMA(_CRI message msg)
2072 #elif HEDLEY_IAR_VERSION_CHECK(8,0,0)
2073 #  define HEDLEY_MESSAGE(msg) HEDLEY_PRAGMA(message(msg))
2074 #elif HEDLEY_PELLES_VERSION_CHECK(2,0,0)
2075 #  define HEDLEY_MESSAGE(msg) HEDLEY_PRAGMA(message(msg))
2076 #else
2077 #  define HEDLEY_MESSAGE(msg)
2078 #endif
2079 
2080 #if defined(HEDLEY_WARNING)
2081 #  undef HEDLEY_WARNING
2082 #endif
2083 #if HEDLEY_HAS_WARNING("-Wunknown-pragmas")
2084 #  define HEDLEY_WARNING(msg) \
2085   HEDLEY_DIAGNOSTIC_PUSH \
2086   HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS \
2087   HEDLEY_PRAGMA(clang warning msg) \
2088   HEDLEY_DIAGNOSTIC_POP
2089 #elif \
2090   HEDLEY_GCC_VERSION_CHECK(4,8,0) || \
2091   HEDLEY_PGI_VERSION_CHECK(18,4,0) || \
2092   HEDLEY_INTEL_VERSION_CHECK(13,0,0)
2093 #  define HEDLEY_WARNING(msg) HEDLEY_PRAGMA(GCC warning msg)
2094 #elif \
2095   HEDLEY_MSVC_VERSION_CHECK(15,0,0) || \
2096   HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0)
2097 #  define HEDLEY_WARNING(msg) HEDLEY_PRAGMA(message(msg))
2098 #else
2099 #  define HEDLEY_WARNING(msg) HEDLEY_MESSAGE(msg)
2100 #endif
2101 
2102 #if defined(HEDLEY_REQUIRE)
2103 #  undef HEDLEY_REQUIRE
2104 #endif
2105 #if defined(HEDLEY_REQUIRE_MSG)
2106 #  undef HEDLEY_REQUIRE_MSG
2107 #endif
2108 #if HEDLEY_HAS_ATTRIBUTE(diagnose_if)
2109 #  if HEDLEY_HAS_WARNING("-Wgcc-compat")
2110 #    define HEDLEY_REQUIRE(expr) \
2111        HEDLEY_DIAGNOSTIC_PUSH \
2112        _Pragma("clang diagnostic ignored \"-Wgcc-compat\"") \
2113        __attribute__((diagnose_if(!(expr), #expr, "error"))) \
2114        HEDLEY_DIAGNOSTIC_POP
2115 #    define HEDLEY_REQUIRE_MSG(expr,msg) \
2116        HEDLEY_DIAGNOSTIC_PUSH \
2117        _Pragma("clang diagnostic ignored \"-Wgcc-compat\"") \
2118        __attribute__((diagnose_if(!(expr), msg, "error"))) \
2119        HEDLEY_DIAGNOSTIC_POP
2120 #  else
2121 #    define HEDLEY_REQUIRE(expr) __attribute__((diagnose_if(!(expr), #expr, "error")))
2122 #    define HEDLEY_REQUIRE_MSG(expr,msg) __attribute__((diagnose_if(!(expr), msg, "error")))
2123 #  endif
2124 #else
2125 #  define HEDLEY_REQUIRE(expr)
2126 #  define HEDLEY_REQUIRE_MSG(expr,msg)
2127 #endif
2128 
2129 #if defined(HEDLEY_FLAGS)
2130 #  undef HEDLEY_FLAGS
2131 #endif
2132 #if HEDLEY_HAS_ATTRIBUTE(flag_enum) && (!defined(__cplusplus) || HEDLEY_HAS_WARNING("-Wbitfield-enum-conversion"))
2133 #  define HEDLEY_FLAGS __attribute__((__flag_enum__))
2134 #else
2135 #  define HEDLEY_FLAGS
2136 #endif
2137 
2138 #if defined(HEDLEY_FLAGS_CAST)
2139 #  undef HEDLEY_FLAGS_CAST
2140 #endif
2141 #if HEDLEY_INTEL_VERSION_CHECK(19,0,0)
2142 #  define HEDLEY_FLAGS_CAST(T, expr) (__extension__ ({ \
2143   HEDLEY_DIAGNOSTIC_PUSH \
2144       _Pragma("warning(disable:188)") \
2145       ((T) (expr)); \
2146       HEDLEY_DIAGNOSTIC_POP \
2147     }))
2148 #else
2149 #  define HEDLEY_FLAGS_CAST(T, expr) HEDLEY_STATIC_CAST(T, expr)
2150 #endif
2151 
2152 #if defined(HEDLEY_EMPTY_BASES)
2153 #  undef HEDLEY_EMPTY_BASES
2154 #endif
2155 #if \
2156   (HEDLEY_MSVC_VERSION_CHECK(19,0,23918) && !HEDLEY_MSVC_VERSION_CHECK(20,0,0)) || \
2157   HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0)
2158 #  define HEDLEY_EMPTY_BASES __declspec(empty_bases)
2159 #else
2160 #  define HEDLEY_EMPTY_BASES
2161 #endif
2162 
2163 /* Remaining macros are deprecated. */
2164 
2165 #if defined(HEDLEY_GCC_NOT_CLANG_VERSION_CHECK)
2166 #  undef HEDLEY_GCC_NOT_CLANG_VERSION_CHECK
2167 #endif
2168 #if defined(__clang__)
2169 #  define HEDLEY_GCC_NOT_CLANG_VERSION_CHECK(major,minor,patch) (0)
2170 #else
2171 #  define HEDLEY_GCC_NOT_CLANG_VERSION_CHECK(major,minor,patch) HEDLEY_GCC_VERSION_CHECK(major,minor,patch)
2172 #endif
2173 
2174 #if defined(HEDLEY_CLANG_HAS_ATTRIBUTE)
2175 #  undef HEDLEY_CLANG_HAS_ATTRIBUTE
2176 #endif
2177 #define HEDLEY_CLANG_HAS_ATTRIBUTE(attribute) HEDLEY_HAS_ATTRIBUTE(attribute)
2178 
2179 #if defined(HEDLEY_CLANG_HAS_CPP_ATTRIBUTE)
2180 #  undef HEDLEY_CLANG_HAS_CPP_ATTRIBUTE
2181 #endif
2182 #define HEDLEY_CLANG_HAS_CPP_ATTRIBUTE(attribute) HEDLEY_HAS_CPP_ATTRIBUTE(attribute)
2183 
2184 #if defined(HEDLEY_CLANG_HAS_BUILTIN)
2185 #  undef HEDLEY_CLANG_HAS_BUILTIN
2186 #endif
2187 #define HEDLEY_CLANG_HAS_BUILTIN(builtin) HEDLEY_HAS_BUILTIN(builtin)
2188 
2189 #if defined(HEDLEY_CLANG_HAS_FEATURE)
2190 #  undef HEDLEY_CLANG_HAS_FEATURE
2191 #endif
2192 #define HEDLEY_CLANG_HAS_FEATURE(feature) HEDLEY_HAS_FEATURE(feature)
2193 
2194 #if defined(HEDLEY_CLANG_HAS_EXTENSION)
2195 #  undef HEDLEY_CLANG_HAS_EXTENSION
2196 #endif
2197 #define HEDLEY_CLANG_HAS_EXTENSION(extension) HEDLEY_HAS_EXTENSION(extension)
2198 
2199 #if defined(HEDLEY_CLANG_HAS_DECLSPEC_ATTRIBUTE)
2200 #  undef HEDLEY_CLANG_HAS_DECLSPEC_ATTRIBUTE
2201 #endif
2202 #define HEDLEY_CLANG_HAS_DECLSPEC_ATTRIBUTE(attribute) HEDLEY_HAS_DECLSPEC_ATTRIBUTE(attribute)
2203 
2204 #if defined(HEDLEY_CLANG_HAS_WARNING)
2205 #  undef HEDLEY_CLANG_HAS_WARNING
2206 #endif
2207 #define HEDLEY_CLANG_HAS_WARNING(warning) HEDLEY_HAS_WARNING(warning)
2208 
2209 #endif /* !defined(HEDLEY_VERSION) || (HEDLEY_VERSION < X) */
2210 /* :: End ../simde/simde/hedley.h :: */
2211 
2212 #define SIMDE_VERSION_MAJOR 0
2213 #define SIMDE_VERSION_MINOR 7
2214 #define SIMDE_VERSION_MICRO 3
2215 #define SIMDE_VERSION HEDLEY_VERSION_ENCODE(SIMDE_VERSION_MAJOR, SIMDE_VERSION_MINOR, SIMDE_VERSION_MICRO)
2216 // Also update meson.build in the root directory of the repository
2217 
2218 #include <stddef.h>
2219 #include <stdint.h>
2220 
2221 /* AUTOMATICALLY GENERATED FILE, DO NOT MODIFY */
2222 /* e8b7a2ec175ceb3725ce0827ef9a6725b6309cc9 */
2223 /* :: Begin ../simde/simde/simde-detect-clang.h :: */
2224 /* Detect Clang Version
2225  * Created by Evan Nemerson <evan@nemerson.com>
2226  *
2227  * To the extent possible under law, the author(s) have dedicated all
2228  * copyright and related and neighboring rights to this software to
2229  * the public domain worldwide. This software is distributed without
2230  * any warranty.
2231  *
2232  * For details, see <http://creativecommons.org/publicdomain/zero/1.0/>.
2233  * SPDX-License-Identifier: CC0-1.0
2234  */
2235 
2236 /* This file was originally part of SIMDe
2237  * (<https://github.com/simd-everywhere/simde>).  You're free to do with it as
2238  * you please, but I do have a few small requests:
2239  *
2240  *  * If you make improvements, please submit them back to SIMDe
2241  *    (at <https://github.com/simd-everywhere/simde/issues>) so others can
2242  *    benefit from them.
2243  *  * Please keep a link to SIMDe intact so people know where to submit
2244  *    improvements.
2245  *  * If you expose it publicly, please change the SIMDE_ prefix to
2246  *    something specific to your project.
2247  *
2248  * The version numbers clang exposes (in the __clang_major__,
2249  * __clang_minor__, and __clang_patchlevel__ macros) are unreliable.
2250  * Vendors such as Apple will define these values to their version
2251  * numbers; for example, "Apple Clang 4.0" is really clang 3.1, but
2252  * __clang_major__ and __clang_minor__ are defined to 4 and 0
2253  * respectively, instead of 3 and 1.
2254  *
2255  * The solution is *usually* to use clang's feature detection macros
2256  * (<https://clang.llvm.org/docs/LanguageExtensions.html#feature-checking-macros>)
2257  * to determine if the feature you're interested in is available.  This
2258  * generally works well, and it should probably be the first thing you
2259  * try.  Unfortunately, it's not possible to check for everything.  In
2260  * particular, compiler bugs.
2261  *
2262  * This file just uses the feature checking macros to detect features
2263  * added in specific versions of clang to identify which version of
2264  * clang the compiler is based on.
2265  *
2266  * Right now it only goes back to 3.6, but I'm happy to accept patches
2267  * to go back further.  And, of course, newer versions are welcome if
2268  * they're not already present, and if you find a way to detect a point
2269  * release that would be great, too!
2270  */
2271 
2272 #if !defined(SIMDE_DETECT_CLANG_H)
2273 #define SIMDE_DETECT_CLANG_H 1
2274 
2275 /* Attempt to detect the upstream clang version number.  I usually only
2276  * worry about major version numbers (at least for 4.0+), but if you
2277  * need more resolution I'm happy to accept patches that are able to
2278  * detect minor versions as well.  That said, you'll probably have a
2279  * hard time with detection since AFAIK most minor releases don't add
2280  * anything we can detect. */
2281 
2282 #if defined(__clang__) && !defined(SIMDE_DETECT_CLANG_VERSION)
2283 #  if __has_warning("-Wformat-insufficient-args")
2284 #    define SIMDE_DETECT_CLANG_VERSION 120000
2285 #  elif __has_warning("-Wimplicit-const-int-float-conversion")
2286 #    define SIMDE_DETECT_CLANG_VERSION 110000
2287 #  elif __has_warning("-Wmisleading-indentation")
2288 #    define SIMDE_DETECT_CLANG_VERSION 100000
2289 #  elif defined(__FILE_NAME__)
2290 #    define SIMDE_DETECT_CLANG_VERSION 90000
2291 #  elif __has_warning("-Wextra-semi-stmt") || __has_builtin(__builtin_rotateleft32)
2292 #    define SIMDE_DETECT_CLANG_VERSION 80000
2293 #  elif __has_warning("-Wc++98-compat-extra-semi")
2294 #    define SIMDE_DETECT_CLANG_VERSION 70000
2295 #  elif __has_warning("-Wpragma-pack")
2296 #    define SIMDE_DETECT_CLANG_VERSION 60000
2297 #  elif __has_warning("-Wbitfield-enum-conversion")
2298 #    define SIMDE_DETECT_CLANG_VERSION 50000
2299 #  elif __has_attribute(diagnose_if)
2300 #    define SIMDE_DETECT_CLANG_VERSION 40000
2301 #  elif __has_warning("-Wcomma")
2302 #    define SIMDE_DETECT_CLANG_VERSION 39000
2303 #  elif __has_warning("-Wdouble-promotion")
2304 #    define SIMDE_DETECT_CLANG_VERSION 38000
2305 #  elif __has_warning("-Wshift-negative-value")
2306 #    define SIMDE_DETECT_CLANG_VERSION 37000
2307 #  elif __has_warning("-Wambiguous-ellipsis")
2308 #    define SIMDE_DETECT_CLANG_VERSION 36000
2309 #  else
2310 #    define SIMDE_DETECT_CLANG_VERSION 1
2311 #  endif
2312 #endif /* defined(__clang__) && !defined(SIMDE_DETECT_CLANG_VERSION) */
2313 
2314 /* The SIMDE_DETECT_CLANG_VERSION_CHECK macro is pretty
2315  * straightforward; it returns true if the compiler is a derivative
2316  * of clang >= the specified version.
2317  *
2318  * Since this file is often (primarily?) useful for working around bugs
2319  * it is also helpful to have a macro which returns true only if the
2320  * compiler is a version of clang *older* than the specified version to
2321  * make it a bit easier to ifdef regions to add code for older versions,
2322  * such as pragmas to disable a specific warning. */
2323 
2324 #if defined(SIMDE_DETECT_CLANG_VERSION)
2325 #  define SIMDE_DETECT_CLANG_VERSION_CHECK(major, minor, revision) (SIMDE_DETECT_CLANG_VERSION >= ((major * 10000) + (minor * 1000) + (revision)))
2326 #  define SIMDE_DETECT_CLANG_VERSION_NOT(major, minor, revision) (SIMDE_DETECT_CLANG_VERSION < ((major * 10000) + (minor * 1000) + (revision)))
2327 #else
2328 #  define SIMDE_DETECT_CLANG_VERSION_CHECK(major, minor, revision) (0)
2329 #  define SIMDE_DETECT_CLANG_VERSION_NOT(major, minor, revision) (0)
2330 #endif
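
/* For example, as an illustrative sketch only: a workaround needed on
 * clang-based compilers older than (hypothetically) clang 9 could be
 * guarded with the "older than" form described above:
 *
 *   #if SIMDE_DETECT_CLANG_VERSION_NOT(9,0,0)
 *     ... slower but safe fallback ...
 *   #endif
 *
 * while SIMDE_DETECT_CLANG_VERSION_CHECK(9,0,0) is the inverse test,
 * true when the detected upstream clang version is at least 9.0.0. */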
2331 
2332 #endif /* !defined(SIMDE_DETECT_CLANG_H) */
2333 /* :: End ../simde/simde/simde-detect-clang.h :: */
2334 /* AUTOMATICALLY GENERATED FILE, DO NOT MODIFY */
2335 /* e8b7a2ec175ceb3725ce0827ef9a6725b6309cc9 */
2336 /* :: Begin ../simde/simde/simde-arch.h :: */
2337 /* Architecture detection
2338  * Created by Evan Nemerson <evan@nemerson.com>
2339  *
2340  *   To the extent possible under law, the authors have waived all
2341  *   copyright and related or neighboring rights to this code.  For
2342  *   details, see the Creative Commons Zero 1.0 Universal license at
2343  *   <https://creativecommons.org/publicdomain/zero/1.0/>
2344  *
2345  * SPDX-License-Identifier: CC0-1.0
2346  *
2347  * Different compilers define different preprocessor macros for the
2348  * same architecture.  This is an attempt to provide a single
2349  * interface which is usable on any compiler.
2350  *
2351  * In general, a macro named SIMDE_ARCH_* is defined for each
2352  * architecture the CPU supports.  When there are multiple possible
2353  * versions, we try to define the macro to the target version.  For
2354  * example, if you want to check for i586+, you could do something
2355  * like:
2356  *
2357  *   #if defined(SIMDE_ARCH_X86) && (SIMDE_ARCH_X86 >= 5)
2358  *   ...
2359  *   #endif
2360  *
2361  * You could also just check that SIMDE_ARCH_X86 >= 5 without checking
2362  * if it's defined first, but some compilers may emit a warning about
2363  * an undefined macro being used (e.g., GCC with -Wundef).
2364  *
2365  * This was originally created for SIMDe
2366  * <https://github.com/simd-everywhere/simde> (hence the prefix), but this
2367  * header has no dependencies and may be used anywhere.  It is
2368  * originally based on information from
2369  * <https://sourceforge.net/p/predef/wiki/Architectures/>, though it
2370  * has been enhanced with additional information.
2371  *
2372  * If you improve this file, or find a bug, please file the issue at
2373  * <https://github.com/simd-everywhere/simde/issues>.  If you copy this into
2374  * your project, even if you change the prefix, please keep the links
2375  * to SIMDe intact so others know where to report issues, submit
2376  * enhancements, and find the latest version. */
2377 
2378 #if !defined(SIMDE_ARCH_H)
2379 #define SIMDE_ARCH_H
2380 
2381 /* Alpha
2382    <https://en.wikipedia.org/wiki/DEC_Alpha> */
2383 #if defined(__alpha__) || defined(__alpha) || defined(_M_ALPHA)
2384 #  if defined(__alpha_ev6__)
2385 #    define SIMDE_ARCH_ALPHA 6
2386 #  elif defined(__alpha_ev5__)
2387 #    define SIMDE_ARCH_ALPHA 5
2388 #  elif defined(__alpha_ev4__)
2389 #    define SIMDE_ARCH_ALPHA 4
2390 #  else
2391 #    define SIMDE_ARCH_ALPHA 1
2392 #  endif
2393 #endif
2394 #if defined(SIMDE_ARCH_ALPHA)
2395 #  define SIMDE_ARCH_ALPHA_CHECK(version) ((version) <= SIMDE_ARCH_ALPHA)
2396 #else
2397 #  define SIMDE_ARCH_ALPHA_CHECK(version) (0)
2398 #endif
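
/* Illustrative sketch of the *_CHECK convenience macros defined above:
 * because SIMDE_ARCH_ALPHA_CHECK(version) expands to (0) when
 * SIMDE_ARCH_ALPHA is undefined, a minimum-version test does not trip
 * -Wundef on non-Alpha targets:
 *
 *   #if SIMDE_ARCH_ALPHA_CHECK(5)
 *     ... EV5-or-later code path ...
 *   #endif
 */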
2399 
2400 /* Atmel AVR
2401    <https://en.wikipedia.org/wiki/Atmel_AVR> */
2402 #if defined(__AVR_ARCH__)
2403 #  define SIMDE_ARCH_AVR __AVR_ARCH__
2404 #endif
2405 
2406 /* AMD64 / x86_64
2407    <https://en.wikipedia.org/wiki/X86-64> */
2408 #if defined(__amd64__) || defined(__amd64) || defined(__x86_64__) || defined(__x86_64) || defined(_M_X64) || defined(_M_AMD64)
2409 #  define SIMDE_ARCH_AMD64 1000
2410 #endif
2411 
2412 /* ARM
2413    <https://en.wikipedia.org/wiki/ARM_architecture> */
2414 #if defined(__ARM_ARCH_8A__)
2415 #  define SIMDE_ARCH_ARM 82
2416 #elif defined(__ARM_ARCH_8R__)
2417 #  define SIMDE_ARCH_ARM 81
2418 #elif defined(__ARM_ARCH_8__)
2419 #  define SIMDE_ARCH_ARM 80
2420 #elif defined(__ARM_ARCH_7S__)
2421 #  define SIMDE_ARCH_ARM 74
2422 #elif defined(__ARM_ARCH_7M__)
2423 #  define SIMDE_ARCH_ARM 73
2424 #elif defined(__ARM_ARCH_7R__)
2425 #  define SIMDE_ARCH_ARM 72
2426 #elif defined(__ARM_ARCH_7A__)
2427 #  define SIMDE_ARCH_ARM 71
2428 #elif defined(__ARM_ARCH_7__)
2429 #  define SIMDE_ARCH_ARM 70
2430 #elif defined(__ARM_ARCH)
2431 #  define SIMDE_ARCH_ARM (__ARM_ARCH * 10)
2432 #elif defined(_M_ARM)
2433 #  define SIMDE_ARCH_ARM (_M_ARM * 10)
2434 #elif defined(__arm__) || defined(__thumb__) || defined(__TARGET_ARCH_ARM) || defined(_ARM) || defined(_M_ARM)
2435 #  define SIMDE_ARCH_ARM 1
2436 #endif
2437 #if defined(SIMDE_ARCH_ARM)
2438 #  define SIMDE_ARCH_ARM_CHECK(version) ((version) <= SIMDE_ARCH_ARM)
2439 #else
2440 #  define SIMDE_ARCH_ARM_CHECK(version) (0)
2441 #endif
2442 
2443 /* AArch64
2444    <https://en.wikipedia.org/wiki/ARM_architecture> */
2445 #if defined(__aarch64__) || defined(_M_ARM64)
2446 #  define SIMDE_ARCH_AARCH64 1000
2447 #endif
2448 #if defined(SIMDE_ARCH_AARCH64)
2449 #  define SIMDE_ARCH_AARCH64_CHECK(version) ((version) <= SIMDE_ARCH_AARCH64)
2450 #else
2451 #  define SIMDE_ARCH_AARCH64_CHECK(version) (0)
2452 #endif
2453 
2454 /* ARM SIMD ISA extensions */
2455 #if defined(__ARM_NEON)
2456 #  if defined(SIMDE_ARCH_AARCH64)
2457 #    define SIMDE_ARCH_ARM_NEON SIMDE_ARCH_AARCH64
2458 #  elif defined(SIMDE_ARCH_ARM)
2459 #    define SIMDE_ARCH_ARM_NEON SIMDE_ARCH_ARM
2460 #  endif
2461 #endif
2462 #if defined(__ARM_FEATURE_SVE)
2463 #  define SIMDE_ARCH_ARM_SVE
2464 #endif
2465 
2466 /* Blackfin
2467    <https://en.wikipedia.org/wiki/Blackfin> */
2468 #if defined(__bfin) || defined(__BFIN__) || defined(__bfin__)
2469 #  define SIMDE_ARCH_BLACKFIN 1
2470 #endif
2471 
2472 /* CRIS
2473    <https://en.wikipedia.org/wiki/ETRAX_CRIS> */
2474 #if defined(__CRIS_arch_version)
2475 #  define SIMDE_ARCH_CRIS __CRIS_arch_version
2476 #elif defined(__cris__) || defined(__cris) || defined(__CRIS) || defined(__CRIS__)
2477 #  define SIMDE_ARCH_CRIS 1
2478 #endif
2479 
2480 /* Convex
2481    <https://en.wikipedia.org/wiki/Convex_Computer> */
2482 #if defined(__convex_c38__)
2483 #  define SIMDE_ARCH_CONVEX 38
2484 #elif defined(__convex_c34__)
2485 #  define SIMDE_ARCH_CONVEX 34
2486 #elif defined(__convex_c32__)
2487 #  define SIMDE_ARCH_CONVEX 32
2488 #elif defined(__convex_c2__)
2489 #  define SIMDE_ARCH_CONVEX 2
2490 #elif defined(__convex__)
2491 #  define SIMDE_ARCH_CONVEX 1
2492 #endif
2493 #if defined(SIMDE_ARCH_CONVEX)
2494 #  define SIMDE_ARCH_CONVEX_CHECK(version) ((version) <= SIMDE_ARCH_CONVEX)
2495 #else
2496 #  define SIMDE_ARCH_CONVEX_CHECK(version) (0)
2497 #endif
2498 
2499 /* Adapteva Epiphany
2500    <https://en.wikipedia.org/wiki/Adapteva_Epiphany> */
2501 #if defined(__epiphany__)
2502 #  define SIMDE_ARCH_EPIPHANY 1
2503 #endif
2504 
2505 /* Fujitsu FR-V
2506    <https://en.wikipedia.org/wiki/FR-V_(microprocessor)> */
2507 #if defined(__frv__)
2508 #  define SIMDE_ARCH_FRV 1
2509 #endif
2510 
2511 /* H8/300
2512    <https://en.wikipedia.org/wiki/H8_Family> */
2513 #if defined(__H8300__)
2514 #  define SIMDE_ARCH_H8300
2515 #endif
2516 
2517 /* Elbrus (8S, 8SV and successors)
2518    <https://en.wikipedia.org/wiki/Elbrus-8S> */
2519 #if defined(__e2k__)
2520 #  define SIMDE_ARCH_E2K
2521 #endif
2522 
2523 /* HP/PA / PA-RISC
2524    <https://en.wikipedia.org/wiki/PA-RISC> */
2525 #if defined(__PA8000__) || defined(__HPPA20__) || defined(__RISC2_0__) || defined(_PA_RISC2_0)
2526 #  define SIMDE_ARCH_HPPA 20
2527 #elif defined(__PA7100__) || defined(__HPPA11__) || defined(_PA_RISC1_1)
2528 #  define SIMDE_ARCH_HPPA 11
2529 #elif defined(_PA_RISC1_0)
2530 #  define SIMDE_ARCH_HPPA 10
2531 #elif defined(__hppa__) || defined(__HPPA__) || defined(__hppa)
2532 #  define SIMDE_ARCH_HPPA 1
2533 #endif
2534 #if defined(SIMDE_ARCH_HPPA)
2535 #  define SIMDE_ARCH_HPPA_CHECK(version) ((version) <= SIMDE_ARCH_HPPA)
2536 #else
2537 #  define SIMDE_ARCH_HPPA_CHECK(version) (0)
2538 #endif
2539 
2540 /* x86
2541    <https://en.wikipedia.org/wiki/X86> */
2542 #if defined(_M_IX86)
2543 #  define SIMDE_ARCH_X86 (_M_IX86 / 100)
2544 #elif defined(__I86__)
2545 #  define SIMDE_ARCH_X86 __I86__
2546 #elif defined(i686) || defined(__i686) || defined(__i686__)
2547 #  define SIMDE_ARCH_X86 6
2548 #elif defined(i586) || defined(__i586) || defined(__i586__)
2549 #  define SIMDE_ARCH_X86 5
2550 #elif defined(i486) || defined(__i486) || defined(__i486__)
2551 #  define SIMDE_ARCH_X86 4
2552 #elif defined(i386) || defined(__i386) || defined(__i386__)
2553 #  define SIMDE_ARCH_X86 3
2554 #elif defined(_X86_) || defined(__X86__) || defined(__THW_INTEL__)
2555 #  define SIMDE_ARCH_X86 3
2556 #endif
2557 #if defined(SIMDE_ARCH_X86)
2558 #  define SIMDE_ARCH_X86_CHECK(version) ((version) <= SIMDE_ARCH_X86)
2559 #else
2560 #  define SIMDE_ARCH_X86_CHECK(version) (0)
2561 #endif
2562 
2563 /* SIMD ISA extensions for x86/x86_64 and Elbrus */
2564 #if defined(SIMDE_ARCH_X86) || defined(SIMDE_ARCH_AMD64) || defined(SIMDE_ARCH_E2K)
2565 #  if defined(_M_IX86_FP)
2566 #    define SIMDE_ARCH_X86_MMX
2567 #    if (_M_IX86_FP >= 1)
2568 #      define SIMDE_ARCH_X86_SSE 1
2569 #    endif
2570 #    if (_M_IX86_FP >= 2)
2571 #      define SIMDE_ARCH_X86_SSE2 1
2572 #    endif
2573 #  elif defined(_M_X64)
2574 #    define SIMDE_ARCH_X86_SSE 1
2575 #    define SIMDE_ARCH_X86_SSE2 1
2576 #  else
2577 #    if defined(__MMX__)
2578 #      define SIMDE_ARCH_X86_MMX 1
2579 #    endif
2580 #    if defined(__SSE__)
2581 #      define SIMDE_ARCH_X86_SSE 1
2582 #    endif
2583 #    if defined(__SSE2__)
2584 #      define SIMDE_ARCH_X86_SSE2 1
2585 #    endif
2586 #  endif
2587 #  if defined(__SSE3__)
2588 #    define SIMDE_ARCH_X86_SSE3 1
2589 #  endif
2590 #  if defined(__SSSE3__)
2591 #    define SIMDE_ARCH_X86_SSSE3 1
2592 #  endif
2593 #  if defined(__SSE4_1__)
2594 #    define SIMDE_ARCH_X86_SSE4_1 1
2595 #  endif
2596 #  if defined(__SSE4_2__)
2597 #    define SIMDE_ARCH_X86_SSE4_2 1
2598 #  endif
2599 #  if defined(__XOP__)
2600 #    define SIMDE_ARCH_X86_XOP 1
2601 #  endif
2602 #  if defined(__AVX__)
2603 #    define SIMDE_ARCH_X86_AVX 1
2604 #    if !defined(SIMDE_ARCH_X86_SSE3)
2605 #      define SIMDE_ARCH_X86_SSE3 1
2606 #    endif
2607 #    if !defined(SIMDE_ARCH_X86_SSE4_1)
2608 #      define SIMDE_ARCH_X86_SSE4_1 1
2609 #    endif
2610 #    if !defined(SIMDE_ARCH_X86_SSE4_2)
2611 #      define SIMDE_ARCH_X86_SSE4_2 1
2612 #    endif
2613 #  endif
2614 #  if defined(__AVX2__)
2615 #    define SIMDE_ARCH_X86_AVX2 1
2616 #  endif
2617 #  if defined(__FMA__)
2618 #    define SIMDE_ARCH_X86_FMA 1
2619 #    if !defined(SIMDE_ARCH_X86_AVX)
2620 #      define SIMDE_ARCH_X86_AVX 1
2621 #    endif
2622 #  endif
2623 #  if defined(__AVX512VP2INTERSECT__)
2624 #    define SIMDE_ARCH_X86_AVX512VP2INTERSECT 1
2625 #  endif
2626 #  if defined(__AVX512VBMI__)
2627 #    define SIMDE_ARCH_X86_AVX512VBMI 1
2628 #  endif
2629 #  if defined(__AVX512BW__)
2630 #    define SIMDE_ARCH_X86_AVX512BW 1
2631 #  endif
2632 #  if defined(__AVX512CD__)
2633 #    define SIMDE_ARCH_X86_AVX512CD 1
2634 #  endif
2635 #  if defined(__AVX512DQ__)
2636 #    define SIMDE_ARCH_X86_AVX512DQ 1
2637 #  endif
2638 #  if defined(__AVX512F__)
2639 #    define SIMDE_ARCH_X86_AVX512F 1
2640 #  endif
2641 #  if defined(__AVX512VL__)
2642 #    define SIMDE_ARCH_X86_AVX512VL 1
2643 #  endif
2644 #  if defined(__GFNI__)
2645 #    define SIMDE_ARCH_X86_GFNI 1
2646 #  endif
2647 #  if defined(__PCLMUL__)
2648 #    define SIMDE_ARCH_X86_PCLMUL 1
2649 #  endif
2650 #  if defined(__VPCLMULQDQ__)
2651 #    define SIMDE_ARCH_X86_VPCLMULQDQ 1
2652 #  endif
2653 #  if defined(__F16C__)
2654 #    define SIMDE_ARCH_X86_F16C 1
2655 #  endif
2656 #endif
2657 
2658 /* Itanium
2659    <https://en.wikipedia.org/wiki/Itanium> */
2660 #if defined(__ia64__) || defined(_IA64) || defined(__IA64__) || defined(__ia64) || defined(_M_IA64) || defined(__itanium__)
2661 #  define SIMDE_ARCH_IA64 1
2662 #endif
2663 
2664 /* Renesas M32R
2665    <https://en.wikipedia.org/wiki/M32R> */
2666 #if defined(__m32r__) || defined(__M32R__)
2667 #  define SIMDE_ARCH_M32R
2668 #endif
2669 
2670 /* Motorola 68000
2671    <https://en.wikipedia.org/wiki/Motorola_68000> */
2672 #if defined(__mc68060__) || defined(__MC68060__)
2673 #  define SIMDE_ARCH_M68K 68060
2674 #elif defined(__mc68040__) || defined(__MC68040__)
2675 #  define SIMDE_ARCH_M68K 68040
2676 #elif defined(__mc68030__) || defined(__MC68030__)
2677 #  define SIMDE_ARCH_M68K 68030
2678 #elif defined(__mc68020__) || defined(__MC68020__)
2679 #  define SIMDE_ARCH_M68K 68020
2680 #elif defined(__mc68010__) || defined(__MC68010__)
2681 #  define SIMDE_ARCH_M68K 68010
2682 #elif defined(__mc68000__) || defined(__MC68000__)
2683 #  define SIMDE_ARCH_M68K 68000
2684 #endif
2685 #if defined(SIMDE_ARCH_M68K)
2686 #  define SIMDE_ARCH_M68K_CHECK(version) ((version) <= SIMDE_ARCH_M68K)
2687 #else
2688 #  define SIMDE_ARCH_M68K_CHECK(version) (0)
2689 #endif
2690 
2691 /* Xilinx MicroBlaze
2692    <https://en.wikipedia.org/wiki/MicroBlaze> */
2693 #if defined(__MICROBLAZE__) || defined(__microblaze__)
2694 #  define SIMDE_ARCH_MICROBLAZE
2695 #endif
2696 
2697 /* MIPS
2698    <https://en.wikipedia.org/wiki/MIPS_architecture> */
2699 #if defined(_MIPS_ISA_MIPS64R2)
2700 #  define SIMDE_ARCH_MIPS 642
2701 #elif defined(_MIPS_ISA_MIPS64)
2702 #  define SIMDE_ARCH_MIPS 640
2703 #elif defined(_MIPS_ISA_MIPS32R2)
2704 #  define SIMDE_ARCH_MIPS 322
2705 #elif defined(_MIPS_ISA_MIPS32)
2706 #  define SIMDE_ARCH_MIPS 320
2707 #elif defined(_MIPS_ISA_MIPS4)
2708 #  define SIMDE_ARCH_MIPS 4
2709 #elif defined(_MIPS_ISA_MIPS3)
2710 #  define SIMDE_ARCH_MIPS 3
2711 #elif defined(_MIPS_ISA_MIPS2)
2712 #  define SIMDE_ARCH_MIPS 2
2713 #elif defined(_MIPS_ISA_MIPS1)
2714 #  define SIMDE_ARCH_MIPS 1
2715 #elif defined(_MIPS_ISA_MIPS) || defined(__mips) || defined(__MIPS__)
2716 #  define SIMDE_ARCH_MIPS 1
2717 #endif
2718 #if defined(SIMDE_ARCH_MIPS)
2719 #  define SIMDE_ARCH_MIPS_CHECK(version) ((version) <= SIMDE_ARCH_MIPS)
2720 #else
2721 #  define SIMDE_ARCH_MIPS_CHECK(version) (0)
2722 #endif
2723 
2724 #if defined(__mips_loongson_mmi)
2725 #  define SIMDE_ARCH_MIPS_LOONGSON_MMI 1
2726 #endif
2727 
2728 /* Matsushita MN10300
2729    <https://en.wikipedia.org/wiki/MN103> */
2730 #if defined(__MN10300__) || defined(__mn10300__)
2731 #  define SIMDE_ARCH_MN10300 1
2732 #endif
2733 
2734 /* POWER
2735    <https://en.wikipedia.org/wiki/IBM_POWER_Instruction_Set_Architecture> */
2736 #if defined(_M_PPC)
2737 #  define SIMDE_ARCH_POWER _M_PPC
2738 #elif defined(_ARCH_PWR9)
2739 #  define SIMDE_ARCH_POWER 900
2740 #elif defined(_ARCH_PWR8)
2741 #  define SIMDE_ARCH_POWER 800
2742 #elif defined(_ARCH_PWR7)
2743 #  define SIMDE_ARCH_POWER 700
2744 #elif defined(_ARCH_PWR6)
2745 #  define SIMDE_ARCH_POWER 600
2746 #elif defined(_ARCH_PWR5)
2747 #  define SIMDE_ARCH_POWER 500
2748 #elif defined(_ARCH_PWR4)
2749 #  define SIMDE_ARCH_POWER 400
2750 #elif defined(_ARCH_440) || defined(__ppc440__)
2751 #  define SIMDE_ARCH_POWER 440
2752 #elif defined(_ARCH_450) || defined(__ppc450__)
2753 #  define SIMDE_ARCH_POWER 450
2754 #elif defined(_ARCH_601) || defined(__ppc601__)
2755 #  define SIMDE_ARCH_POWER 601
2756 #elif defined(_ARCH_603) || defined(__ppc603__)
2757 #  define SIMDE_ARCH_POWER 603
2758 #elif defined(_ARCH_604) || defined(__ppc604__)
2759 #  define SIMDE_ARCH_POWER 604
2760 #elif defined(_ARCH_605) || defined(__ppc605__)
2761 #  define SIMDE_ARCH_POWER 605
2762 #elif defined(_ARCH_620) || defined(__ppc620__)
2763 #  define SIMDE_ARCH_POWER 620
2764 #elif defined(__powerpc) || defined(__powerpc__) || defined(__POWERPC__) || defined(__ppc__) || defined(__PPC__) || defined(_ARCH_PPC) || defined(__ppc)
2765 #  define SIMDE_ARCH_POWER 1
2766 #endif
2767 #if defined(SIMDE_ARCH_POWER)
2768   #define SIMDE_ARCH_POWER_CHECK(version) ((version) <= SIMDE_ARCH_POWER)
2769 #else
2770   #define SIMDE_ARCH_POWER_CHECK(version) (0)
2771 #endif
2772 
2773 #if defined(__ALTIVEC__)
2774 #  define SIMDE_ARCH_POWER_ALTIVEC SIMDE_ARCH_POWER
2775 #endif
2776 #if defined(SIMDE_ARCH_POWER)
2777   #define SIMDE_ARCH_POWER_ALTIVEC_CHECK(version) ((version) <= SIMDE_ARCH_POWER)
2778 #else
2779   #define SIMDE_ARCH_POWER_ALTIVEC_CHECK(version) (0)
2780 #endif
2781 
2782 /* SPARC
2783    <https://en.wikipedia.org/wiki/SPARC> */
2784 #if defined(__sparc_v9__) || defined(__sparcv9)
2785 #  define SIMDE_ARCH_SPARC 9
2786 #elif defined(__sparc_v8__) || defined(__sparcv8)
2787 #  define SIMDE_ARCH_SPARC 8
2788 #elif defined(__sparc_v7__) || defined(__sparcv7)
2789 #  define SIMDE_ARCH_SPARC 7
2790 #elif defined(__sparc_v6__) || defined(__sparcv6)
2791 #  define SIMDE_ARCH_SPARC 6
2792 #elif defined(__sparc_v5__) || defined(__sparcv5)
2793 #  define SIMDE_ARCH_SPARC 5
2794 #elif defined(__sparc_v4__) || defined(__sparcv4)
2795 #  define SIMDE_ARCH_SPARC 4
2796 #elif defined(__sparc_v3__) || defined(__sparcv3)
2797 #  define SIMDE_ARCH_SPARC 3
2798 #elif defined(__sparc_v2__) || defined(__sparcv2)
2799 #  define SIMDE_ARCH_SPARC 2
2800 #elif defined(__sparc_v1__) || defined(__sparcv1)
2801 #  define SIMDE_ARCH_SPARC 1
2802 #elif defined(__sparc__) || defined(__sparc)
2803 #  define SIMDE_ARCH_SPARC 1
2804 #endif
2805 #if defined(SIMDE_ARCH_SPARC)
2806   #define SIMDE_ARCH_SPARC_CHECK(version) ((version) <= SIMDE_ARCH_SPARC)
2807 #else
2808   #define SIMDE_ARCH_SPARC_CHECK(version) (0)
2809 #endif
2810 
2811 /* SuperH
2812    <https://en.wikipedia.org/wiki/SuperH> */
2813 #if defined(__sh5__) || defined(__SH5__)
2814 #  define SIMDE_ARCH_SUPERH 5
2815 #elif defined(__sh4__) || defined(__SH4__)
2816 #  define SIMDE_ARCH_SUPERH 4
2817 #elif defined(__sh3__) || defined(__SH3__)
2818 #  define SIMDE_ARCH_SUPERH 3
2819 #elif defined(__sh2__) || defined(__SH2__)
2820 #  define SIMDE_ARCH_SUPERH 2
2821 #elif defined(__sh1__) || defined(__SH1__)
2822 #  define SIMDE_ARCH_SUPERH 1
2823 #elif defined(__sh__) || defined(__SH__)
2824 #  define SIMDE_ARCH_SUPERH 1
2825 #endif
2826 
2827 /* IBM System z
2828    <https://en.wikipedia.org/wiki/IBM_System_z> */
2829 #if defined(__370__) || defined(__THW_370__) || defined(__s390__) || defined(__s390x__) || defined(__zarch__) || defined(__SYSC_ZARCH__)
2830 #  define SIMDE_ARCH_ZARCH __ARCH__
2831 #endif
2832 #if defined(SIMDE_ARCH_ZARCH)
2833   #define SIMDE_ARCH_ZARCH_CHECK(version) ((version) <= SIMDE_ARCH_ZARCH)
2834 #else
2835   #define SIMDE_ARCH_ZARCH_CHECK(version) (0)
2836 #endif
2837 
2838 #if defined(SIMDE_ARCH_ZARCH) && defined(__VEC__)
2839   #define SIMDE_ARCH_ZARCH_ZVECTOR SIMDE_ARCH_ZARCH
2840 #endif
2841 
2842 /* TMS320 DSP
2843    <https://en.wikipedia.org/wiki/Texas_Instruments_TMS320> */
2844 #if defined(_TMS320C6740) || defined(__TMS320C6740__)
2845 #  define SIMDE_ARCH_TMS320 6740
2846 #elif defined(_TMS320C6700_PLUS) || defined(__TMS320C6700_PLUS__)
2847 #  define SIMDE_ARCH_TMS320 6701
2848 #elif defined(_TMS320C6700) || defined(__TMS320C6700__)
2849 #  define SIMDE_ARCH_TMS320 6700
2850 #elif defined(_TMS320C6600) || defined(__TMS320C6600__)
2851 #  define SIMDE_ARCH_TMS320 6600
2852 #elif defined(_TMS320C6400_PLUS) || defined(__TMS320C6400_PLUS__)
2853 #  define SIMDE_ARCH_TMS320 6401
2854 #elif defined(_TMS320C6400) || defined(__TMS320C6400__)
2855 #  define SIMDE_ARCH_TMS320 6400
2856 #elif defined(_TMS320C6200) || defined(__TMS320C6200__)
2857 #  define SIMDE_ARCH_TMS320 6200
2858 #elif defined(_TMS320C55X) || defined(__TMS320C55X__)
2859 #  define SIMDE_ARCH_TMS320 550
2860 #elif defined(_TMS320C54X) || defined(__TMS320C54X__)
2861 #  define SIMDE_ARCH_TMS320 540
2862 #elif defined(_TMS320C28X) || defined(__TMS320C28X__)
2863 #  define SIMDE_ARCH_TMS320 280
2864 #endif
2865 #if defined(SIMDE_ARCH_TMS320)
2866   #define SIMDE_ARCH_TMS320_CHECK(version) ((version) <= SIMDE_ARCH_TMS320)
2867 #else
2868   #define SIMDE_ARCH_TMS320_CHECK(version) (0)
2869 #endif
2870 
2871 /* WebAssembly */
2872 #if defined(__wasm__)
2873 #  define SIMDE_ARCH_WASM 1
2874 #endif
2875 
2876 #if defined(SIMDE_ARCH_WASM) && defined(__wasm_simd128__)
2877 #  define SIMDE_ARCH_WASM_SIMD128
2878 #endif
2879 
2880 /* Xtensa
2881    <https://en.wikipedia.org/wiki/> */
2882 #if defined(__xtensa__) || defined(__XTENSA__)
2883 #  define SIMDE_ARCH_XTENSA 1
2884 #endif
2885 
2886 #endif /* !defined(SIMDE_ARCH_H) */
2887 /* :: End ../simde/simde/simde-arch.h :: */
2888 /* AUTOMATICALLY GENERATED FILE, DO NOT MODIFY */
2889 /* e8b7a2ec175ceb3725ce0827ef9a6725b6309cc9 */
2890 /* :: Begin ../simde/simde/simde-features.h :: */
2891 /* SPDX-License-Identifier: MIT
2892  *
2893  * Permission is hereby granted, free of charge, to any person
2894  * obtaining a copy of this software and associated documentation
2895  * files (the "Software"), to deal in the Software without
2896  * restriction, including without limitation the rights to use, copy,
2897  * modify, merge, publish, distribute, sublicense, and/or sell copies
2898  * of the Software, and to permit persons to whom the Software is
2899  * furnished to do so, subject to the following conditions:
2900  *
2901  * The above copyright notice and this permission notice shall be
2902  * included in all copies or substantial portions of the Software.
2903  *
2904  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
2905  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
2906  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
2907  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
2908  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
2909  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
2910  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
2911  * SOFTWARE.
2912  *
2913  * Copyright:
2914  *   2020      Evan Nemerson <evan@nemerson.com>
2915  */
2916 
2917 /* simde-arch.h is used to determine which features are available according
2918    to the compiler.  However, we want to make it possible to forcibly enable
2919    or disable APIs */
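
/* An illustration only, based on the SIMDE_*_NATIVE / SIMDE_*_NO_NATIVE
 * checks later in this header; define these before including SIMDe:
 *
 *   #define SIMDE_X86_AVX2_NO_NATIVE   (ignore the compiler's AVX2 support)
 *   #define SIMDE_NO_NATIVE            (disable every native implementation)
 *   #define SIMDE_X86_AVX2_NATIVE      (force the native AVX2 code paths)
 */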
2920 
2921 #if !defined(SIMDE_FEATURES_H)
2922 #define SIMDE_FEATURES_H
2923 
2924 /* AUTOMATICALLY GENERATED FILE, DO NOT MODIFY */
2925 /* e8b7a2ec175ceb3725ce0827ef9a6725b6309cc9 */
2926 /* AUTOMATICALLY GENERATED FILE, DO NOT MODIFY */
2927 /* e8b7a2ec175ceb3725ce0827ef9a6725b6309cc9 */
2928 /* :: Begin ../simde/simde/simde-diagnostic.h :: */
2929 /* SPDX-License-Identifier: MIT
2930  *
2931  * Permission is hereby granted, free of charge, to any person
2932  * obtaining a copy of this software and associated documentation
2933  * files (the "Software"), to deal in the Software without
2934  * restriction, including without limitation the rights to use, copy,
2935  * modify, merge, publish, distribute, sublicense, and/or sell copies
2936  * of the Software, and to permit persons to whom the Software is
2937  * furnished to do so, subject to the following conditions:
2938  *
2939  * The above copyright notice and this permission notice shall be
2940  * included in all copies or substantial portions of the Software.
2941  *
2942  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
2943  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
2944  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
2945  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
2946  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
2947  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
2948  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
2949  * SOFTWARE.
2950  *
2951  * Copyright:
2952  *   2017-2020 Evan Nemerson <evan@nemerson.com>
2953  */
2954 
2955 /* SIMDe targets a very wide range of standards and compilers, and our
2956  * goal is to compile cleanly even with extremely aggressive warnings
2957  * (i.e., -Weverything in clang, -Wextra in GCC, /W4 for MSVC, etc.)
2958  * treated as errors.
2959  *
2960  * While our preference is to resolve the underlying issue a given
2961  * diagnostic is warning us about, sometimes that's not possible.
2962  * Fixing a warning in one compiler may cause problems in another.
2963  * Sometimes a warning doesn't really apply to us (false positives),
2964  * and sometimes adhering to a warning would mean dropping a feature
2965  * we *know* the compiler supports since we have tested specifically
2966  * for the compiler or feature.
2967  *
2968  * When practical, warnings are only disabled for specific code.  For
2969  * a list of warnings which are enabled by default in all SIMDe code,
2970  * see SIMDE_DISABLE_UNWANTED_DIAGNOSTICS.  Note that we restore the
2971  * warning stack when SIMDe is done parsing, so code which includes
2972  * SIMDe is not deprived of these warnings.
2973  */
2974 
2975 #if !defined(SIMDE_DIAGNOSTIC_H)
2976 #define SIMDE_DIAGNOSTIC_H
2977 
2978 /* AUTOMATICALLY GENERATED FILE, DO NOT MODIFY */
2979 /* e8b7a2ec175ceb3725ce0827ef9a6725b6309cc9 */
2980 /* AUTOMATICALLY GENERATED FILE, DO NOT MODIFY */
2981 /* e8b7a2ec175ceb3725ce0827ef9a6725b6309cc9 */
2982 
2983 /* This is only to help us implement functions like _mm_undefined_ps. */
2984 #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
2985   #undef SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_
2986 #endif
2987 #if HEDLEY_HAS_WARNING("-Wuninitialized")
2988   #define SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ _Pragma("clang diagnostic ignored \"-Wuninitialized\"")
2989 #elif HEDLEY_GCC_VERSION_CHECK(4,2,0)
2990   #define SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ _Pragma("GCC diagnostic ignored \"-Wuninitialized\"")
2991 #elif HEDLEY_PGI_VERSION_CHECK(19,10,0)
2992   #define SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ _Pragma("diag_suppress 549")
2993 #elif HEDLEY_SUNPRO_VERSION_CHECK(5,14,0) && defined(__cplusplus)
2994   #define SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ _Pragma("error_messages(off,SEC_UNINITIALIZED_MEM_READ,SEC_UNDEFINED_RETURN_VALUE,unassigned)")
2995 #elif HEDLEY_SUNPRO_VERSION_CHECK(5,14,0)
2996   #define SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ _Pragma("error_messages(off,SEC_UNINITIALIZED_MEM_READ,SEC_UNDEFINED_RETURN_VALUE)")
2997 #elif HEDLEY_SUNPRO_VERSION_CHECK(5,12,0) && defined(__cplusplus)
2998   #define SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ _Pragma("error_messages(off,unassigned)")
2999 #elif \
3000      HEDLEY_TI_VERSION_CHECK(16,9,9) || \
3001      HEDLEY_TI_CL6X_VERSION_CHECK(8,0,0) || \
3002      HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
3003      HEDLEY_TI_CLPRU_VERSION_CHECK(2,3,2)
3004   #define SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ _Pragma("diag_suppress 551")
3005 #elif HEDLEY_INTEL_VERSION_CHECK(13,0,0)
3006   #define SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ _Pragma("warning(disable:592)")
3007 #elif HEDLEY_MSVC_VERSION_CHECK(19,0,0) && !defined(__MSVC_RUNTIME_CHECKS)
3008   #define SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ __pragma(warning(disable:4700))
3009 #endif
3010 
3011 /* GCC emits a lot of "notes" about the ABI being different for things
3012  * in newer versions of GCC.  We don't really care because all our
3013  * functions are inlined and don't generate ABI. */
3014 #if HEDLEY_GCC_VERSION_CHECK(7,0,0)
3015   #define SIMDE_DIAGNOSTIC_DISABLE_PSABI_ _Pragma("GCC diagnostic ignored \"-Wpsabi\"")
3016 #else
3017   #define SIMDE_DIAGNOSTIC_DISABLE_PSABI_
3018 #endif
3019 
3020 /* Since MMX uses x87 FP registers, you're supposed to call _mm_empty()
3021  * after each MMX function before any floating point instructions.
3022  * Some compilers warn about functions which use MMX functions but
3023  * don't call _mm_empty().  However, since SIMDe is implementing the
3024  * MMX API we shouldn't be calling _mm_empty(); we leave it to the
3025  * caller to invoke simde_mm_empty(). */
3026 #if HEDLEY_INTEL_VERSION_CHECK(19,0,0)
3027   #define SIMDE_DIAGNOSTIC_DISABLE_NO_EMMS_INSTRUCTION_ _Pragma("warning(disable:13200 13203)")
3028 #elif defined(HEDLEY_MSVC_VERSION)
3029   #define SIMDE_DIAGNOSTIC_DISABLE_NO_EMMS_INSTRUCTION_ __pragma(warning(disable:4799))
3030 #else
3031   #define SIMDE_DIAGNOSTIC_DISABLE_NO_EMMS_INSTRUCTION_
3032 #endif
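
/* An illustrative sketch of what a caller is expected to do; the MMX-style
 * names here are assumed to come from SIMDe's MMX implementation:
 *
 *   simde__m64 sum = simde_mm_add_pi32(a, b);
 *   ...
 *   simde_mm_empty();   (clear the x87/MMX state before any float math)
 */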
3033 
3034 /* Intel is pushing people to use OpenMP SIMD instead of Cilk+, so they
3035  * emit a diagnostic if you use #pragma simd instead of
3036  * #pragma omp simd.  SIMDe supports OpenMP SIMD; you just need to
3037  * compile with -qopenmp or -qopenmp-simd and define
3038  * SIMDE_ENABLE_OPENMP.  Cilk+ is just a fallback. */
3039 #if HEDLEY_INTEL_VERSION_CHECK(18,0,0)
3040   #define SIMDE_DIAGNOSTIC_DISABLE_SIMD_PRAGMA_DEPRECATED_ _Pragma("warning(disable:3948)")
3041 #else
3042   #define SIMDE_DIAGNOSTIC_DISABLE_SIMD_PRAGMA_DEPRECATED_
3043 #endif
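
/* An illustrative build invocation only; the flags are the ones mentioned
 * above and may vary between compiler versions:
 *
 *   icc -qopenmp-simd -DSIMDE_ENABLE_OPENMP -c foo.c
 */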
3044 
3045 /* MSVC emits a diagnostic when we call a function (like
3046  * simde_mm_set_epi32) while initializing a struct.  We currently do
3047  * this a *lot* in the tests. */
3048 #if \
3049   defined(HEDLEY_MSVC_VERSION)
3050   #define SIMDE_DIAGNOSTIC_DISABLE_NON_CONSTANT_AGGREGATE_INITIALIZER_ __pragma(warning(disable:4204))
3051 #else
3052   #define SIMDE_DIAGNOSTIC_DISABLE_NON_CONSTANT_AGGREGATE_INITIALIZER_
3053 #endif
3054 
3055 /* This warning needs a lot of work.  It is triggered if all you do is
3056  * pass the value to memcpy/__builtin_memcpy, or if you initialize a
3057  * member of the union, even if that member takes up the entire union.
3058  * Last tested with clang-10, hopefully things will improve in the
3059  * future; if clang fixes this I'd love to enable it. */
3060 #if \
3061   HEDLEY_HAS_WARNING("-Wconditional-uninitialized")
3062   #define SIMDE_DIAGNOSTIC_DISABLE_CONDITIONAL_UNINITIALIZED_ _Pragma("clang diagnostic ignored \"-Wconditional-uninitialized\"")
3063 #else
3064   #define SIMDE_DIAGNOSTIC_DISABLE_CONDITIONAL_UNINITIALIZED_
3065 #endif
3066 
3067 /* This warning is meant to catch things like `0.3 + 0.4 == 0.7`, which
3068  * is false.  However, SIMDe uses these operations exclusively
3069  * for things like _mm_cmpeq_ps, for which we really do want to check
3070  * for equality (or inequality).
3071  *
3072  * If someone wants to put together a SIMDE_FLOAT_EQUAL(a, op, b) macro
3073  * which just wraps a check in some code to disable this diagnostic I'd
3074  * be happy to accept it. */
3075 #if \
3076   HEDLEY_HAS_WARNING("-Wfloat-equal") || \
3077   HEDLEY_GCC_VERSION_CHECK(3,0,0)
3078   #define SIMDE_DIAGNOSTIC_DISABLE_FLOAT_EQUAL_ _Pragma("GCC diagnostic ignored \"-Wfloat-equal\"")
3079 #else
3080   #define SIMDE_DIAGNOSTIC_DISABLE_FLOAT_EQUAL_
3081 #endif
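
/* A rough, hypothetical sketch of the macro suggested above (not part of
 * SIMDe; relies on GCC/clang statement expressions):
 *
 *   #define SIMDE_FLOAT_EQUAL(a, op, b) (__extension__ ({ \
 *       HEDLEY_DIAGNOSTIC_PUSH \
 *       SIMDE_DIAGNOSTIC_DISABLE_FLOAT_EQUAL_ \
 *       const int simde_float_equal_r_ = ((a) op (b)); \
 *       HEDLEY_DIAGNOSTIC_POP \
 *       simde_float_equal_r_; \
 *     }))
 */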
3082 
3083 /* This is because we use HEDLEY_STATIC_ASSERT for static assertions.
3084  * If Hedley can't find an implementation it will preprocess to
3085  * nothing, which means there will be a trailing semi-colon. */
3086 #if HEDLEY_HAS_WARNING("-Wextra-semi")
3087   #define SIMDE_DIAGNOSTIC_DISABLE_EXTRA_SEMI_ _Pragma("clang diagnostic ignored \"-Wextra-semi\"")
3088 #elif HEDLEY_GCC_VERSION_CHECK(8,1,0) && defined(__cplusplus)
3089   #define SIMDE_DIAGNOSTIC_DISABLE_EXTRA_SEMI_ _Pragma("GCC diagnostic ignored \"-Wextra-semi\"")
3090 #else
3091   #define SIMDE_DIAGNOSTIC_DISABLE_EXTRA_SEMI_
3092 #endif
3093 
3094 /* We do use a few variadic macros, which technically aren't available
3095  * until C99 and C++11, but every compiler I'm aware of has supported
3096  * them for much longer.  That said, usage is isolated to the test
3097  * suite and compilers known to support them. */
3098 #if HEDLEY_HAS_WARNING("-Wvariadic-macros") || HEDLEY_GCC_VERSION_CHECK(4,0,0)
3099   #if HEDLEY_HAS_WARNING("-Wc++98-compat-pedantic")
3100     #define SIMDE_DIAGNOSTIC_DISABLE_VARIADIC_MACROS_ \
3101       _Pragma("clang diagnostic ignored \"-Wvariadic-macros\"") \
3102       _Pragma("clang diagnostic ignored \"-Wc++98-compat-pedantic\"")
3103   #else
3104     #define SIMDE_DIAGNOSTIC_DISABLE_VARIADIC_MACROS_ _Pragma("GCC diagnostic ignored \"-Wvariadic-macros\"")
3105   #endif
3106 #else
3107   #define SIMDE_DIAGNOSTIC_DISABLE_VARIADIC_MACROS_
3108 #endif
3109 
3110 /* emscripten requires us to use a __wasm_unimplemented_simd128__ macro
3111  * before we can access certain SIMD intrinsics, but this diagnostic
3112  * warns about it being a reserved name.  It is a reserved name, but
3113  * it's reserved for the compiler and we are using it to convey
3114  * information to the compiler.
3115  *
3116  * This is also used when enabling native aliases since we don't get to
3117  * choose the macro names. */
3118 #if HEDLEY_HAS_WARNING("-Wreserved-id-macro")
3119   #define SIMDE_DIAGNOSTIC_DISABLE_RESERVED_ID_MACRO_ _Pragma("clang diagnostic ignored \"-Wreserved-id-macro\"")
3120 #else
3121   #define SIMDE_DIAGNOSTIC_DISABLE_RESERVED_ID_MACRO_
3122 #endif
3123 
3124 /* clang 3.8 warns about the packed attribute being unnecessary when
3125  * used in the _mm_loadu_* functions.  That *may* be true for version
3126  * 3.8, but for later versions it is crucial in order to make unaligned
3127  * access safe. */
3128 #if HEDLEY_HAS_WARNING("-Wpacked")
3129   #define SIMDE_DIAGNOSTIC_DISABLE_PACKED_ _Pragma("clang diagnostic ignored \"-Wpacked\"")
3130 #else
3131   #define SIMDE_DIAGNOSTIC_DISABLE_PACKED_
3132 #endif
3133 
3134 /* Triggered when assigning a float to a double implicitly.  We use
3135  * explicit casts in SIMDe; this is only used in the test suite. */
3136 #if HEDLEY_HAS_WARNING("-Wdouble-promotion")
3137   #define SIMDE_DIAGNOSTIC_DISABLE_DOUBLE_PROMOTION_ _Pragma("clang diagnostic ignored \"-Wdouble-promotion\"")
3138 #else
3139   #define SIMDE_DIAGNOSTIC_DISABLE_DOUBLE_PROMOTION_
3140 #endif
3141 
3142 /* Several compilers treat conformant array parameters as VLAs.  We
3143  * test to make sure we're in C mode (C++ doesn't support CAPs), and
3144  * that the version of the standard supports CAPs.  We also reject
3145  * some buggy compilers like MSVC (the logic is in Hedley if you want
3146  * to take a look), but with certain warnings enabled some compilers
3147  * still like to emit a diagnostic. */
3148 #if HEDLEY_HAS_WARNING("-Wvla")
3149   #define SIMDE_DIAGNOSTIC_DISABLE_VLA_ _Pragma("clang diagnostic ignored \"-Wvla\"")
3150 #elif HEDLEY_GCC_VERSION_CHECK(4,3,0)
3151   #define SIMDE_DIAGNOSTIC_DISABLE_VLA_ _Pragma("GCC diagnostic ignored \"-Wvla\"")
3152 #else
3153   #define SIMDE_DIAGNOSTIC_DISABLE_VLA_
3154 #endif
3155 
3156 #if HEDLEY_HAS_WARNING("-Wused-but-marked-unused")
3157   #define SIMDE_DIAGNOSTIC_DISABLE_USED_BUT_MARKED_UNUSED_ _Pragma("clang diagnostic ignored \"-Wused-but-marked-unused\"")
3158 #else
3159   #define SIMDE_DIAGNOSTIC_DISABLE_USED_BUT_MARKED_UNUSED_
3160 #endif
3161 
3162 #if HEDLEY_HAS_WARNING("-Wpass-failed")
3163   #define SIMDE_DIAGNOSTIC_DISABLE_PASS_FAILED_ _Pragma("clang diagnostic ignored \"-Wpass-failed\"")
3164 #else
3165   #define SIMDE_DIAGNOSTIC_DISABLE_PASS_FAILED_
3166 #endif
3167 
3168 #if HEDLEY_HAS_WARNING("-Wpadded")
3169   #define SIMDE_DIAGNOSTIC_DISABLE_PADDED_ _Pragma("clang diagnostic ignored \"-Wpadded\"")
3170 #elif HEDLEY_MSVC_VERSION_CHECK(19,0,0) /* Likely goes back further */
3171   #define SIMDE_DIAGNOSTIC_DISABLE_PADDED_ __pragma(warning(disable:4324))
3172 #else
3173   #define SIMDE_DIAGNOSTIC_DISABLE_PADDED_
3174 #endif
3175 
3176 #if HEDLEY_HAS_WARNING("-Wzero-as-null-pointer-constant")
3177   #define SIMDE_DIAGNOSTIC_DISABLE_ZERO_AS_NULL_POINTER_CONSTANT_ _Pragma("clang diagnostic ignored \"-Wzero-as-null-pointer-constant\"")
3178 #else
3179   #define SIMDE_DIAGNOSTIC_DISABLE_ZERO_AS_NULL_POINTER_CONSTANT_
3180 #endif
3181 
3182 #if HEDLEY_HAS_WARNING("-Wold-style-cast")
3183   #define SIMDE_DIAGNOSTIC_DISABLE_OLD_STYLE_CAST_ _Pragma("clang diagnostic ignored \"-Wold-style-cast\"")
3184 #else
3185   #define SIMDE_DIAGNOSTIC_DISABLE_OLD_STYLE_CAST_
3186 #endif
3187 
3188 #if HEDLEY_HAS_WARNING("-Wcast-function-type") || HEDLEY_GCC_VERSION_CHECK(8,0,0)
3189   #define SIMDE_DIAGNOSTIC_DISABLE_CAST_FUNCTION_TYPE_ _Pragma("GCC diagnostic ignored \"-Wcast-function-type\"")
3190 #else
3191   #define SIMDE_DIAGNOSTIC_DISABLE_CAST_FUNCTION_TYPE_
3192 #endif
3193 
3194 /* clang will emit this warning when we use C99 extensions when not in
3195  * C99 mode, even though it does support this.  In such cases we check
3196  * the compiler and version first, so we know it's not a problem. */
3197 #if HEDLEY_HAS_WARNING("-Wc99-extensions")
3198   #define SIMDE_DIAGNOSTIC_DISABLE_C99_EXTENSIONS_ _Pragma("clang diagnostic ignored \"-Wc99-extensions\"")
3199 #else
3200   #define SIMDE_DIAGNOSTIC_DISABLE_C99_EXTENSIONS_
3201 #endif
3202 
3203 /* https://github.com/simd-everywhere/simde/issues/277 */
3204 #if defined(HEDLEY_GCC_VERSION) && HEDLEY_GCC_VERSION_CHECK(4,6,0) && !HEDLEY_GCC_VERSION_CHECK(6,4,0) && defined(__cplusplus)
3205   #define SIMDE_DIAGNOSTIC_DISABLE_BUGGY_UNUSED_BUT_SET_VARIBALE_ _Pragma("GCC diagnostic ignored \"-Wunused-but-set-variable\"")
3206 #else
3207   #define SIMDE_DIAGNOSTIC_DISABLE_BUGGY_UNUSED_BUT_SET_VARIBALE_
3208 #endif
3209 
3210 /* This is the warning that you normally define _CRT_SECURE_NO_WARNINGS
3211  * to silence, but you have to do that before including anything and
3212  * that would require reordering includes. */
3213 #if defined(_MSC_VER)
3214   #define SIMDE_DIAGNOSTIC_DISABLE_ANNEX_K_ __pragma(warning(disable:4996))
3215 #else
3216   #define SIMDE_DIAGNOSTIC_DISABLE_ANNEX_K_
3217 #endif
3218 
3219 /* Some compilers, such as clang, may use `long long` for 64-bit
3220  * integers, but `long long` triggers a diagnostic with
3221  * -Wc++98-compat-pedantic which says 'long long' is incompatible with
3222  * C++98. */
3223 #if HEDLEY_HAS_WARNING("-Wc++98-compat-pedantic")
3224   #if HEDLEY_HAS_WARNING("-Wc++11-long-long")
3225     #define SIMDE_DIAGNOSTIC_DISABLE_CPP98_COMPAT_PEDANTIC_ \
3226       _Pragma("clang diagnostic ignored \"-Wc++98-compat-pedantic\"") \
3227       _Pragma("clang diagnostic ignored \"-Wc++11-long-long\"")
3228   #else
3229     #define SIMDE_DIAGNOSTIC_DISABLE_CPP98_COMPAT_PEDANTIC_ _Pragma("clang diagnostic ignored \"-Wc++98-compat-pedantic\"")
3230   #endif
3231 #else
3232   #define SIMDE_DIAGNOSTIC_DISABLE_CPP98_COMPAT_PEDANTIC_
3233 #endif
3234 
3235 /* Same problem as above */
3236 #if HEDLEY_HAS_WARNING("-Wc++11-long-long")
3237   #define SIMDE_DIAGNOSTIC_DISABLE_CPP11_LONG_LONG_ _Pragma("clang diagnostic ignored \"-Wc++11-long-long\"")
3238 #else
3239   #define SIMDE_DIAGNOSTIC_DISABLE_CPP11_LONG_LONG_
3240 #endif
3241 
3242 /* emscripten emits this whenever stdin/stdout/stderr is used in a
3243  * macro. */
3244 #if HEDLEY_HAS_WARNING("-Wdisabled-macro-expansion")
3245   #define SIMDE_DIAGNOSTIC_DISABLE_DISABLED_MACRO_EXPANSION_ _Pragma("clang diagnostic ignored \"-Wdisabled-macro-expansion\"")
3246 #else
3247   #define SIMDE_DIAGNOSTIC_DISABLE_DISABLED_MACRO_EXPANSION_
3248 #endif
3249 
3250 /* Clang uses C11 generic selections to implement some AltiVec
3251  * functions, which triggers this diagnostic when not compiling
3252  * in C11 mode */
3253 #if HEDLEY_HAS_WARNING("-Wc11-extensions")
3254   #define SIMDE_DIAGNOSTIC_DISABLE_C11_EXTENSIONS_ _Pragma("clang diagnostic ignored \"-Wc11-extensions\"")
3255 #else
3256   #define SIMDE_DIAGNOSTIC_DISABLE_C11_EXTENSIONS_
3257 #endif
3258 
3259 /* Clang sometimes triggers this warning in macros in the AltiVec and
3260  * NEON headers, or due to missing functions. */
3261 #if HEDLEY_HAS_WARNING("-Wvector-conversion")
3262   #define SIMDE_DIAGNOSTIC_DISABLE_VECTOR_CONVERSION_ _Pragma("clang diagnostic ignored \"-Wvector-conversion\"")
3263   /* For NEON, the situation with -Wvector-conversion in clang < 10 is
3264    * bad enough that we just disable the warning altogether.  On x86,
3265    * clang has similar issues on several sse4.2+ intrinsics before 3.8. */
3266   #if \
3267       (defined(SIMDE_ARCH_ARM) && SIMDE_DETECT_CLANG_VERSION_NOT(10,0,0)) || \
3268       SIMDE_DETECT_CLANG_VERSION_NOT(3,8,0)
3269     #define SIMDE_DIAGNOSTIC_DISABLE_BUGGY_VECTOR_CONVERSION_ SIMDE_DIAGNOSTIC_DISABLE_VECTOR_CONVERSION_
3270   #endif
3271 #else
3272   #define SIMDE_DIAGNOSTIC_DISABLE_VECTOR_CONVERSION_
3273 #endif
3274 #if !defined(SIMDE_DIAGNOSTIC_DISABLE_BUGGY_VECTOR_CONVERSION_)
3275   #define SIMDE_DIAGNOSTIC_DISABLE_BUGGY_VECTOR_CONVERSION_
3276 #endif
3277 
3278 /* Prior to 5.0, clang didn't support disabling diagnostics in
3279  * statement exprs.  As a result, some macros we use don't
3280  * properly silence warnings. */
3281 #if SIMDE_DETECT_CLANG_VERSION_NOT(5,0,0) && HEDLEY_HAS_WARNING("-Wcast-qual") && HEDLEY_HAS_WARNING("-Wcast-align")
3282   #define SIMDE_DIAGNOSTIC_DISABLE_BUGGY_CASTS_ _Pragma("clang diagnostic ignored \"-Wcast-qual\"") _Pragma("clang diagnostic ignored \"-Wcast-align\"")
3283 #elif SIMDE_DETECT_CLANG_VERSION_NOT(5,0,0) && HEDLEY_HAS_WARNING("-Wcast-qual")
3284   #define SIMDE_DIAGNOSTIC_DISABLE_BUGGY_CASTS_ _Pragma("clang diagnostic ignored \"-Wcast-qual\"")
3285 #elif SIMDE_DETECT_CLANG_VERSION_NOT(5,0,0) && HEDLEY_HAS_WARNING("-Wcast-align")
3286   #define SIMDE_DIAGNOSTIC_DISABLE_BUGGY_CASTS_ _Pragma("clang diagnostic ignored \"-Wcast-align\"")
3287 #else
3288   #define SIMDE_DIAGNOSTIC_DISABLE_BUGGY_CASTS_
3289 #endif
3290 
3291 /* SLEEF triggers this a *lot* in their headers */
3292 #if HEDLEY_HAS_WARNING("-Wignored-qualifiers")
3293   #define SIMDE_DIAGNOSTIC_DISABLE_IGNORED_QUALIFIERS_ _Pragma("clang diagnostic ignored \"-Wignored-qualifiers\"")
3294 #elif HEDLEY_GCC_VERSION_CHECK(4,3,0)
3295   #define SIMDE_DIAGNOSTIC_DISABLE_IGNORED_QUALIFIERS_ _Pragma("GCC diagnostic ignored \"-Wignored-qualifiers\"")
3296 #else
3297   #define SIMDE_DIAGNOSTIC_DISABLE_IGNORED_QUALIFIERS_
3298 #endif
3299 
3300 /* GCC emits this under some circumstances when using __int128 */
3301 #if HEDLEY_GCC_VERSION_CHECK(4,8,0)
3302   #define SIMDE_DIAGNOSTIC_DISABLE_PEDANTIC_ _Pragma("GCC diagnostic ignored \"-Wpedantic\"")
3303 #else
3304   #define SIMDE_DIAGNOSTIC_DISABLE_PEDANTIC_
3305 #endif
3306 
3307 /* MSVC doesn't like (__assume(0), code) and will warn about code being
3308  * unreachable, but we want it there because not all compilers
3309  * understand the unreachable macro and will complain if it is missing.
3310  * I'm planning on adding a new macro to Hedley to handle this a bit
3311  * more elegantly, but until then... */
3312 #if defined(HEDLEY_MSVC_VERSION)
3313   #define SIMDE_DIAGNOSTIC_DISABLE_UNREACHABLE_ __pragma(warning(disable:4702))
3314 #else
3315   #define SIMDE_DIAGNOSTIC_DISABLE_UNREACHABLE_
3316 #endif
3317 
3318 /* This is a false positive from GCC in a few places. */
3319 #if HEDLEY_GCC_VERSION_CHECK(4,7,0)
3320   #define SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIAZILED_ _Pragma("GCC diagnostic ignored \"-Wmaybe-uninitialized\"")
3321 #else
3322   #define SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIAZILED_
3323 #endif
3324 
3325 #if defined(SIMDE_ENABLE_NATIVE_ALIASES)
3326   #define SIMDE_DISABLE_UNWANTED_DIAGNOSTICS_NATIVE_ALIASES_ \
3327     SIMDE_DIAGNOSTIC_DISABLE_RESERVED_ID_MACRO_
3328 #else
3329   #define SIMDE_DISABLE_UNWANTED_DIAGNOSTICS_NATIVE_ALIASES_
3330 #endif
3331 
3332 #define SIMDE_DISABLE_UNWANTED_DIAGNOSTICS \
3333   HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION \
3334   SIMDE_DISABLE_UNWANTED_DIAGNOSTICS_NATIVE_ALIASES_ \
3335   SIMDE_DIAGNOSTIC_DISABLE_PSABI_ \
3336   SIMDE_DIAGNOSTIC_DISABLE_NO_EMMS_INSTRUCTION_ \
3337   SIMDE_DIAGNOSTIC_DISABLE_SIMD_PRAGMA_DEPRECATED_ \
3338   SIMDE_DIAGNOSTIC_DISABLE_CONDITIONAL_UNINITIALIZED_ \
3339   SIMDE_DIAGNOSTIC_DISABLE_FLOAT_EQUAL_ \
3340   SIMDE_DIAGNOSTIC_DISABLE_NON_CONSTANT_AGGREGATE_INITIALIZER_ \
3341   SIMDE_DIAGNOSTIC_DISABLE_EXTRA_SEMI_ \
3342   SIMDE_DIAGNOSTIC_DISABLE_VLA_ \
3343   SIMDE_DIAGNOSTIC_DISABLE_USED_BUT_MARKED_UNUSED_ \
3344   SIMDE_DIAGNOSTIC_DISABLE_PASS_FAILED_ \
3345   SIMDE_DIAGNOSTIC_DISABLE_CPP98_COMPAT_PEDANTIC_ \
3346   SIMDE_DIAGNOSTIC_DISABLE_CPP11_LONG_LONG_ \
3347   SIMDE_DIAGNOSTIC_DISABLE_BUGGY_UNUSED_BUT_SET_VARIBALE_ \
3348   SIMDE_DIAGNOSTIC_DISABLE_BUGGY_CASTS_ \
3349   SIMDE_DIAGNOSTIC_DISABLE_BUGGY_VECTOR_CONVERSION_
3350 
3351 #endif /* !defined(SIMDE_DIAGNOSTIC_H) */
3352 /* :: End ../simde/simde/simde-diagnostic.h :: */
3353 
3354 #if !defined(SIMDE_X86_SVML_NATIVE) && !defined(SIMDE_X86_SVML_NO_NATIVE) && !defined(SIMDE_NO_NATIVE)
3355   #if defined(SIMDE_ARCH_X86_SVML)
3356     #define SIMDE_X86_SVML_NATIVE
3357   #endif
3358 #endif
3359 #if defined(SIMDE_X86_SVML_NATIVE) && !defined(SIMDE_X86_AVX512F_NATIVE)
3360   #define SIMDE_X86_AVX512F_NATIVE
3361 #endif
3362 
3363 #if !defined(SIMDE_X86_AVX512VP2INTERSECT_NATIVE) && !defined(SIMDE_X86_AVX512VP2INTERSECT_NO_NATIVE) && !defined(SIMDE_NO_NATIVE)
3364   #if defined(SIMDE_ARCH_X86_AVX512VP2INTERSECT)
3365     #define SIMDE_X86_AVX512VP2INTERSECT_NATIVE
3366   #endif
3367 #endif
3368 #if defined(SIMDE_X86_AVX512VP2INTERSECT_NATIVE) && !defined(SIMDE_X86_AVX512F_NATIVE)
3369   #define SIMDE_X86_AVX512F_NATIVE
3370 #endif
3371 
3372 #if !defined(SIMDE_X86_AVX512VBMI_NATIVE) && !defined(SIMDE_X86_AVX512VBMI_NO_NATIVE) && !defined(SIMDE_NO_NATIVE)
3373   #if defined(SIMDE_ARCH_X86_AVX512VBMI)
3374     #define SIMDE_X86_AVX512VBMI_NATIVE
3375   #endif
3376 #endif
3377 #if defined(SIMDE_X86_AVX512VBMI_NATIVE) && !defined(SIMDE_X86_AVX512F_NATIVE)
3378   #define SIMDE_X86_AVX512F_NATIVE
3379 #endif
3380 
3381 #if !defined(SIMDE_X86_AVX512CD_NATIVE) && !defined(SIMDE_X86_AVX512CD_NO_NATIVE) && !defined(SIMDE_NO_NATIVE)
3382   #if defined(SIMDE_ARCH_X86_AVX512CD)
3383     #define SIMDE_X86_AVX512CD_NATIVE
3384   #endif
3385 #endif
3386 #if defined(SIMDE_X86_AVX512CD_NATIVE) && !defined(SIMDE_X86_AVX512F_NATIVE)
3387   #define SIMDE_X86_AVX512F_NATIVE
3388 #endif
3389 
3390 #if !defined(SIMDE_X86_AVX512DQ_NATIVE) && !defined(SIMDE_X86_AVX512DQ_NO_NATIVE) && !defined(SIMDE_NO_NATIVE)
3391   #if defined(SIMDE_ARCH_X86_AVX512DQ)
3392     #define SIMDE_X86_AVX512DQ_NATIVE
3393   #endif
3394 #endif
3395 #if defined(SIMDE_X86_AVX512DQ_NATIVE) && !defined(SIMDE_X86_AVX512F_NATIVE)
3396   #define SIMDE_X86_AVX512F_NATIVE
3397 #endif
3398 
3399 #if !defined(SIMDE_X86_AVX512VL_NATIVE) && !defined(SIMDE_X86_AVX512VL_NO_NATIVE) && !defined(SIMDE_NO_NATIVE)
3400   #if defined(SIMDE_ARCH_X86_AVX512VL)
3401     #define SIMDE_X86_AVX512VL_NATIVE
3402   #endif
3403 #endif
3404 #if defined(SIMDE_X86_AVX512VL_NATIVE) && !defined(SIMDE_X86_AVX512F_NATIVE)
3405   #define SIMDE_X86_AVX512F_NATIVE
3406 #endif
3407 
3408 #if !defined(SIMDE_X86_AVX512BW_NATIVE) && !defined(SIMDE_X86_AVX512BW_NO_NATIVE) && !defined(SIMDE_NO_NATIVE)
3409   #if defined(SIMDE_ARCH_X86_AVX512BW)
3410     #define SIMDE_X86_AVX512BW_NATIVE
3411   #endif
3412 #endif
3413 #if defined(SIMDE_X86_AVX512BW_NATIVE) && !defined(SIMDE_X86_AVX512F_NATIVE)
3414   #define SIMDE_X86_AVX512F_NATIVE
3415 #endif
3416 
3417 #if !defined(SIMDE_X86_AVX512F_NATIVE) && !defined(SIMDE_X86_AVX512F_NO_NATIVE) && !defined(SIMDE_NO_NATIVE)
3418   #if defined(SIMDE_ARCH_X86_AVX512F)
3419     #define SIMDE_X86_AVX512F_NATIVE
3420   #endif
3421 #endif
3422 #if defined(SIMDE_X86_AVX512F_NATIVE) && !defined(SIMDE_X86_AVX2_NATIVE)
3423   #define SIMDE_X86_AVX2_NATIVE
3424 #endif
3425 
3426 #if !defined(SIMDE_X86_FMA_NATIVE) && !defined(SIMDE_X86_FMA_NO_NATIVE) && !defined(SIMDE_NO_NATIVE)
3427   #if defined(SIMDE_ARCH_X86_FMA)
3428     #define SIMDE_X86_FMA_NATIVE
3429   #endif
3430 #endif
3431 #if defined(SIMDE_X86_FMA_NATIVE) && !defined(SIMDE_X86_AVX_NATIVE)
3432   #define SIMDE_X86_AVX_NATIVE
3433 #endif
3434 
3435 #if !defined(SIMDE_X86_AVX2_NATIVE) && !defined(SIMDE_X86_AVX2_NO_NATIVE) && !defined(SIMDE_NO_NATIVE)
3436   #if defined(SIMDE_ARCH_X86_AVX2)
3437     #define SIMDE_X86_AVX2_NATIVE
3438   #endif
3439 #endif
3440 #if defined(SIMDE_X86_AVX2_NATIVE) && !defined(SIMDE_X86_AVX_NATIVE)
3441   #define SIMDE_X86_AVX_NATIVE
3442 #endif
3443 
3444 #if !defined(SIMDE_X86_AVX_NATIVE) && !defined(SIMDE_X86_AVX_NO_NATIVE) && !defined(SIMDE_NO_NATIVE)
3445   #if defined(SIMDE_ARCH_X86_AVX)
3446     #define SIMDE_X86_AVX_NATIVE
3447   #endif
3448 #endif
3449 #if defined(SIMDE_X86_AVX_NATIVE) && !defined(SIMDE_X86_SSE4_2_NATIVE)
3450   #define SIMDE_X86_SSE4_2_NATIVE
3451 #endif
3452 
3453 #if !defined(SIMDE_X86_XOP_NATIVE) && !defined(SIMDE_X86_XOP_NO_NATIVE) && !defined(SIMDE_NO_NATIVE)
3454   #if defined(SIMDE_ARCH_X86_XOP)
3455     #define SIMDE_X86_XOP_NATIVE
3456   #endif
3457 #endif
3458 #if defined(SIMDE_X86_XOP_NATIVE) && !defined(SIMDE_X86_SSE4_2_NATIVE)
3459   #define SIMDE_X86_SSE4_2_NATIVE
3460 #endif
3461 
3462 #if !defined(SIMDE_X86_SSE4_2_NATIVE) && !defined(SIMDE_X86_SSE4_2_NO_NATIVE) && !defined(SIMDE_NO_NATIVE)
3463   #if defined(SIMDE_ARCH_X86_SSE4_2)
3464     #define SIMDE_X86_SSE4_2_NATIVE
3465   #endif
3466 #endif
3467 #if defined(SIMDE_X86_SSE4_2_NATIVE) && !defined(SIMDE_X86_SSE4_1_NATIVE)
3468   #define SIMDE_X86_SSE4_1_NATIVE
3469 #endif
3470 
3471 #if !defined(SIMDE_X86_SSE4_1_NATIVE) && !defined(SIMDE_X86_SSE4_1_NO_NATIVE) && !defined(SIMDE_NO_NATIVE)
3472   #if defined(SIMDE_ARCH_X86_SSE4_1)
3473     #define SIMDE_X86_SSE4_1_NATIVE
3474   #endif
3475 #endif
3476 #if defined(SIMDE_X86_SSE4_1_NATIVE) && !defined(SIMDE_X86_SSSE3_NATIVE)
3477   #define SIMDE_X86_SSSE3_NATIVE
3478 #endif
3479 
3480 #if !defined(SIMDE_X86_SSSE3_NATIVE) && !defined(SIMDE_X86_SSSE3_NO_NATIVE) && !defined(SIMDE_NO_NATIVE)
3481   #if defined(SIMDE_ARCH_X86_SSSE3)
3482     #define SIMDE_X86_SSSE3_NATIVE
3483   #endif
3484 #endif
3485 #if defined(SIMDE_X86_SSSE3_NATIVE) && !defined(SIMDE_X86_SSE3_NATIVE)
3486   #define SIMDE_X86_SSE3_NATIVE
3487 #endif
3488 
3489 #if !defined(SIMDE_X86_SSE3_NATIVE) && !defined(SIMDE_X86_SSE3_NO_NATIVE) && !defined(SIMDE_NO_NATIVE)
3490   #if defined(SIMDE_ARCH_X86_SSE3)
3491     #define SIMDE_X86_SSE3_NATIVE
3492   #endif
3493 #endif
3494 #if defined(SIMDE_X86_SSE3_NATIVE) && !defined(SIMDE_X86_SSE2_NATIVE)
3495   #define SIMDE_X86_SSE2_NATIVE
3496 #endif
3497 
3498 #if !defined(SIMDE_X86_SSE2_NATIVE) && !defined(SIMDE_X86_SSE2_NO_NATIVE) && !defined(SIMDE_NO_NATIVE)
3499   #if defined(SIMDE_ARCH_X86_SSE2)
3500     #define SIMDE_X86_SSE2_NATIVE
3501   #endif
3502 #endif
3503 #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(SIMDE_X86_SSE_NATIVE)
3504   #define SIMDE_X86_SSE_NATIVE
3505 #endif
3506 
3507 #if !defined(SIMDE_X86_SSE_NATIVE) && !defined(SIMDE_X86_SSE_NO_NATIVE) && !defined(SIMDE_NO_NATIVE)
3508   #if defined(SIMDE_ARCH_X86_SSE)
3509     #define SIMDE_X86_SSE_NATIVE
3510   #endif
3511 #endif
3512 
3513 #if !defined(SIMDE_X86_MMX_NATIVE) && !defined(SIMDE_X86_MMX_NO_NATIVE) && !defined(SIMDE_NO_NATIVE)
3514   #if defined(SIMDE_ARCH_X86_MMX)
3515     #define SIMDE_X86_MMX_NATIVE
3516   #endif
3517 #endif
3518 
3519 #if !defined(SIMDE_X86_GFNI_NATIVE) && !defined(SIMDE_X86_GFNI_NO_NATIVE) && !defined(SIMDE_NO_NATIVE)
3520   #if defined(SIMDE_ARCH_X86_GFNI)
3521     #define SIMDE_X86_GFNI_NATIVE
3522   #endif
3523 #endif
3524 
3525 #if !defined(SIMDE_X86_PCLMUL_NATIVE) && !defined(SIMDE_X86_PCLMUL_NO_NATIVE) && !defined(SIMDE_NO_NATIVE)
3526   #if defined(SIMDE_ARCH_X86_PCLMUL)
3527     #define SIMDE_X86_PCLMUL_NATIVE
3528   #endif
3529 #endif
3530 
3531 #if !defined(SIMDE_X86_VPCLMULQDQ_NATIVE) && !defined(SIMDE_X86_VPCLMULQDQ_NO_NATIVE) && !defined(SIMDE_NO_NATIVE)
3532   #if defined(SIMDE_ARCH_X86_VPCLMULQDQ)
3533     #define SIMDE_X86_VPCLMULQDQ_NATIVE
3534   #endif
3535 #endif
3536 
3537 #if !defined(SIMDE_X86_F16C_NATIVE) && !defined(SIMDE_X86_F16C_NO_NATIVE) && !defined(SIMDE_NO_NATIVE)
3538   #if defined(SIMDE_ARCH_X86_F16C)
3539     #define SIMDE_X86_F16C_NATIVE
3540   #endif
3541 #endif
3542 
3543 #if !defined(SIMDE_X86_SVML_NATIVE) && !defined(SIMDE_X86_SVML_NO_NATIVE) && !defined(SIMDE_NO_NATIVE)
3544   #if defined(__INTEL_COMPILER)
3545     #define SIMDE_X86_SVML_NATIVE
3546   #endif
3547 #endif
3548 
3549 #if defined(HEDLEY_MSVC_VERSION)
3550   #pragma warning(push)
3551   #pragma warning(disable:4799)
3552 #endif
3553 
3554 #if \
3555     defined(SIMDE_X86_AVX_NATIVE) || defined(SIMDE_X86_GFNI_NATIVE)
3556   #include <immintrin.h>
3557 #elif defined(SIMDE_X86_SSE4_2_NATIVE)
3558   #include <nmmintrin.h>
3559 #elif defined(SIMDE_X86_SSE4_1_NATIVE)
3560   #include <smmintrin.h>
3561 #elif defined(SIMDE_X86_SSSE3_NATIVE)
3562   #include <tmmintrin.h>
3563 #elif defined(SIMDE_X86_SSE3_NATIVE)
3564   #include <pmmintrin.h>
3565 #elif defined(SIMDE_X86_SSE2_NATIVE)
3566   #include <emmintrin.h>
3567 #elif defined(SIMDE_X86_SSE_NATIVE)
3568   #include <xmmintrin.h>
3569 #elif defined(SIMDE_X86_MMX_NATIVE)
3570   #include <mmintrin.h>
3571 #endif
3572 
3573 #if defined(SIMDE_X86_XOP_NATIVE)
3574   #if defined(_MSC_VER)
3575     #include <intrin.h>
3576   #else
3577     #include <x86intrin.h>
3578   #endif
3579 #endif
3580 
3581 #if defined(HEDLEY_MSVC_VERSION)
3582   #pragma warning(pop)
3583 #endif
3584 
3585 #if !defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_ARM_NEON_A64V8_NO_NATIVE) && !defined(SIMDE_NO_NATIVE)
3586   #if defined(SIMDE_ARCH_ARM_NEON) && defined(SIMDE_ARCH_AARCH64) && SIMDE_ARCH_ARM_CHECK(80)
3587     #define SIMDE_ARM_NEON_A64V8_NATIVE
3588   #endif
3589 #endif
3590 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_ARM_NEON_A32V8_NATIVE)
3591   #define SIMDE_ARM_NEON_A32V8_NATIVE
3592 #endif
3593 
3594 #if !defined(SIMDE_ARM_NEON_A32V8_NATIVE) && !defined(SIMDE_ARM_NEON_A32V8_NO_NATIVE) && !defined(SIMDE_NO_NATIVE)
3595   #if defined(SIMDE_ARCH_ARM_NEON) && SIMDE_ARCH_ARM_CHECK(80) && (__ARM_NEON_FP & 0x02)
3596     #define SIMDE_ARM_NEON_A32V8_NATIVE
3597   #endif
3598 #endif
3599 #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3600   #define SIMDE_ARM_NEON_A32V7_NATIVE
3601 #endif
3602 
3603 #if !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_ARM_NEON_A32V7_NO_NATIVE) && !defined(SIMDE_NO_NATIVE)
3604   #if defined(SIMDE_ARCH_ARM_NEON) && SIMDE_ARCH_ARM_CHECK(70)
3605     #define SIMDE_ARM_NEON_A32V7_NATIVE
3606   #endif
3607 #endif
3608 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3609   #include <arm_neon.h>
3610 #endif
3611 
3612 #if !defined(SIMDE_ARM_SVE_NATIVE) && !defined(SIMDE_ARM_SVE_NO_NATIVE) && !defined(SIMDE_NO_NATIVE)
3613   #if defined(SIMDE_ARCH_ARM_SVE)
3614     #define SIMDE_ARM_SVE_NATIVE
3615     #include <arm_sve.h>
3616   #endif
3617 #endif
3618 
3619 #if !defined(SIMDE_WASM_SIMD128_NATIVE) && !defined(SIMDE_WASM_SIMD128_NO_NATIVE) && !defined(SIMDE_NO_NATIVE)
3620   #if defined(SIMDE_ARCH_WASM_SIMD128)
3621     #define SIMDE_WASM_SIMD128_NATIVE
3622   #endif
3623 #endif
3624 #if defined(SIMDE_WASM_SIMD128_NATIVE)
3625   #include <wasm_simd128.h>
3626 #endif
3627 
3628 #if !defined(SIMDE_POWER_ALTIVEC_P9_NATIVE) && !defined(SIMDE_POWER_ALTIVEC_P9_NO_NATIVE) && !defined(SIMDE_NO_NATIVE)
3629   #if SIMDE_ARCH_POWER_ALTIVEC_CHECK(900)
3630     #define SIMDE_POWER_ALTIVEC_P9_NATIVE
3631   #endif
3632 #endif
3633 #if defined(SIMDE_POWER_ALTIVEC_P9_NATIVE) && !defined(SIMDE_POWER_ALTIVEC_P8)
3634   #define SIMDE_POWER_ALTIVEC_P8_NATIVE
3635 #endif
3636 
3637 #if !defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) && !defined(SIMDE_POWER_ALTIVEC_P8_NO_NATIVE) && !defined(SIMDE_NO_NATIVE)
3638   #if SIMDE_ARCH_POWER_ALTIVEC_CHECK(800)
3639     #define SIMDE_POWER_ALTIVEC_P8_NATIVE
3640   #endif
3641 #endif
3642 #if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) && !defined(SIMDE_POWER_ALTIVEC_P7)
3643   #define SIMDE_POWER_ALTIVEC_P7_NATIVE
3644 #endif
3645 
3646 #if !defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) && !defined(SIMDE_POWER_ALTIVEC_P7_NO_NATIVE) && !defined(SIMDE_NO_NATIVE)
3647   #if SIMDE_ARCH_POWER_ALTIVEC_CHECK(700)
3648     #define SIMDE_POWER_ALTIVEC_P7_NATIVE
3649   #endif
3650 #endif
3651 #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) && !defined(SIMDE_POWER_ALTIVEC_P6)
3652   #define SIMDE_POWER_ALTIVEC_P6_NATIVE
3653 #endif
3654 
3655 #if !defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) && !defined(SIMDE_POWER_ALTIVEC_P6_NO_NATIVE) && !defined(SIMDE_NO_NATIVE)
3656   #if SIMDE_ARCH_POWER_ALTIVEC_CHECK(600)
3657     #define SIMDE_POWER_ALTIVEC_P6_NATIVE
3658   #endif
3659 #endif
3660 #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) && !defined(SIMDE_POWER_ALTIVEC_P5)
3661   #define SIMDE_POWER_ALTIVEC_P5_NATIVE
3662 #endif
3663 
3664 #if !defined(SIMDE_POWER_ALTIVEC_P5_NATIVE) && !defined(SIMDE_POWER_ALTIVEC_P5_NO_NATIVE) && !defined(SIMDE_NO_NATIVE)
3665   #if SIMDE_ARCH_POWER_ALTIVEC_CHECK(500)
3666     #define SIMDE_POWER_ALTIVEC_P5_NATIVE
3667   #endif
3668 #endif
3669 
3670 #if !defined(SIMDE_ZARCH_ZVECTOR_15_NATIVE) && !defined(SIMDE_ZARCH_ZVECTOR_15_NO_NATIVE) && !defined(SIMDE_NO_NATIVE)
3671   #if SIMDE_ARCH_ZARCH_CHECK(13) && defined(SIMDE_ARCH_ZARCH_ZVECTOR)
3672     #define SIMDE_ZARCH_ZVECTOR_15_NATIVE
3673   #endif
3674 #endif
3675 
3676 #if !defined(SIMDE_ZARCH_ZVECTOR_14_NATIVE) && !defined(SIMDE_ZARCH_ZVECTOR_14_NO_NATIVE) && !defined(SIMDE_NO_NATIVE)
3677   #if SIMDE_ARCH_ZARCH_CHECK(12) && defined(SIMDE_ARCH_ZARCH_ZVECTOR)
3678     #define SIMDE_ZARCH_ZVECTOR_14_NATIVE
3679   #endif
3680 #endif
3681 
3682 #if !defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) && !defined(SIMDE_ZARCH_ZVECTOR_13_NO_NATIVE) && !defined(SIMDE_NO_NATIVE)
3683   #if SIMDE_ARCH_ZARCH_CHECK(11) && defined(SIMDE_ARCH_ZARCH_ZVECTOR)
3684     #define SIMDE_ZARCH_ZVECTOR_13_NATIVE
3685   #endif
3686 #endif
3687 
3688 #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
3689   /* AltiVec conflicts with lots of stuff.  The bool keyword conflicts
3690    * with the bool keyword in C++ and the bool macro in C99+ (defined
3691    * in stdbool.h).  The vector keyword conflicts with std::vector in
3692  * C++ if you are `using namespace std;`.
3693    *
3694    * Luckily AltiVec allows you to use `__vector`/`__bool`/`__pixel`
3695    * instead, but altivec.h will unconditionally define
3696    * `vector`/`bool`/`pixel` so we need to work around that.
3697    *
3698    * Unfortunately this means that if your code uses AltiVec directly
3699    * it may break.  If this is the case you'll want to define
3700    * `SIMDE_POWER_ALTIVEC_NO_UNDEF` before including SIMDe.  Or, even
3701    * better, port your code to use the double-underscore versions. */
3702   #if defined(bool)
3703     #undef bool
3704   #endif
3705 
3706   #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
3707     #include <altivec.h>
3708 
3709     #if !defined(SIMDE_POWER_ALTIVEC_NO_UNDEF)
3710       #if defined(vector)
3711         #undef vector
3712       #endif
3713       #if defined(pixel)
3714         #undef pixel
3715       #endif
3716       #if defined(bool)
3717         #undef bool
3718       #endif
3719     #endif /* !defined(SIMDE_POWER_ALTIVEC_NO_UNDEF) */
3720   #elif defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
3721     #include <vecintrin.h>
3722   #endif
3723 
3724   /* Use these instead of vector/pixel/bool in SIMDe. */
3725   #define SIMDE_POWER_ALTIVEC_VECTOR(T) __vector T
3726   #define SIMDE_POWER_ALTIVEC_PIXEL __pixel
3727   #define SIMDE_POWER_ALTIVEC_BOOL __bool
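
  /* For example, where plain AltiVec code would declare
   *   vector signed int v;
   * the equivalent in SIMDe is written as (illustrative):
   *   SIMDE_POWER_ALTIVEC_VECTOR(signed int) v;
   */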
3728 
3729   /* Re-define bool if we're using stdbool.h */
3730   #if !defined(__cplusplus) && defined(__bool_true_false_are_defined) && !defined(SIMDE_POWER_ALTIVEC_NO_UNDEF)
3731     #define bool _Bool
3732   #endif
3733 #endif
3734 
3735 #if !defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) && !defined(SIMDE_MIPS_LOONGSON_MMI_NO_NATIVE) && !defined(SIMDE_NO_NATIVE)
3736   #if defined(SIMDE_ARCH_MIPS_LOONGSON_MMI)
3737     #define SIMDE_MIPS_LOONGSON_MMI_NATIVE  1
3738   #endif
3739 #endif
3740 #if defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
3741   #include <loongson-mmiintrin.h>
3742 #endif
3743 
3744 /* This is used to determine whether or not to fall back on a vector
3745  * function from an earlier ISA extension, as well as whether we
3746  * expect any attempt at vectorization to be fruitful or whether we
3747  * expect to always be running serial code. */
3748 
3749 #if !defined(SIMDE_NATURAL_VECTOR_SIZE)
3750   #if defined(SIMDE_X86_AVX512F_NATIVE)
3751     #define SIMDE_NATURAL_VECTOR_SIZE (512)
3752   #elif defined(SIMDE_X86_AVX_NATIVE)
3753     #define SIMDE_NATURAL_VECTOR_SIZE (256)
3754   #elif \
3755       defined(SIMDE_X86_SSE_NATIVE) || \
3756       defined(SIMDE_ARM_NEON_A32V7_NATIVE) || \
3757       defined(SIMDE_WASM_SIMD128_NATIVE) || \
3758       defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
3759     #define SIMDE_NATURAL_VECTOR_SIZE (128)
3760   #endif
3761 
3762   #if !defined(SIMDE_NATURAL_VECTOR_SIZE)
3763     #define SIMDE_NATURAL_VECTOR_SIZE (0)
3764   #endif
3765 #endif
3766 
3767 #define SIMDE_NATURAL_VECTOR_SIZE_LE(x) ((SIMDE_NATURAL_VECTOR_SIZE > 0) && (SIMDE_NATURAL_VECTOR_SIZE <= (x)))
3768 #define SIMDE_NATURAL_VECTOR_SIZE_GE(x) ((SIMDE_NATURAL_VECTOR_SIZE > 0) && (SIMDE_NATURAL_VECTOR_SIZE >= (x)))
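
/* Illustrative usage (hypothetical code, not from this header):
 *
 *   #if SIMDE_NATURAL_VECTOR_SIZE_GE(256)
 *     (use a 256-bit implementation)
 *   #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128)
 *     (fall back to a 128-bit implementation)
 *   #else
 *     (plain serial code)
 *   #endif
 */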
3769 
3770 /* Native aliases */
3771 #if defined(SIMDE_ENABLE_NATIVE_ALIASES)
3772   #if !defined(SIMDE_X86_MMX_NATIVE)
3773     #define SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES
3774   #endif
3775   #if !defined(SIMDE_X86_SSE_NATIVE)
3776     #define SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES
3777   #endif
3778   #if !defined(SIMDE_X86_SSE2_NATIVE)
3779     #define SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES
3780   #endif
3781   #if !defined(SIMDE_X86_SSE3_NATIVE)
3782     #define SIMDE_X86_SSE3_ENABLE_NATIVE_ALIASES
3783   #endif
3784   #if !defined(SIMDE_X86_SSSE3_NATIVE)
3785     #define SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES
3786   #endif
3787   #if !defined(SIMDE_X86_SSE4_1_NATIVE)
3788     #define SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES
3789   #endif
3790   #if !defined(SIMDE_X86_SSE4_2_NATIVE)
3791     #define SIMDE_X86_SSE4_2_ENABLE_NATIVE_ALIASES
3792   #endif
3793   #if !defined(SIMDE_X86_AVX_NATIVE)
3794     #define SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES
3795   #endif
3796   #if !defined(SIMDE_X86_AVX2_NATIVE)
3797     #define SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES
3798   #endif
3799   #if !defined(SIMDE_X86_FMA_NATIVE)
3800     #define SIMDE_X86_FMA_ENABLE_NATIVE_ALIASES
3801   #endif
3802   #if !defined(SIMDE_X86_AVX512F_NATIVE)
3803     #define SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES
3804   #endif
3805   #if !defined(SIMDE_X86_AVX512VL_NATIVE)
3806     #define SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES
3807   #endif
3808   #if !defined(SIMDE_X86_AVX512BW_NATIVE)
3809     #define SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES
3810   #endif
3811   #if !defined(SIMDE_X86_AVX512DQ_NATIVE)
3812     #define SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES
3813   #endif
3814   #if !defined(SIMDE_X86_AVX512CD_NATIVE)
3815     #define SIMDE_X86_AVX512CD_ENABLE_NATIVE_ALIASES
3816   #endif
3817   #if !defined(SIMDE_X86_GFNI_NATIVE)
3818     #define SIMDE_X86_GFNI_ENABLE_NATIVE_ALIASES
3819   #endif
3820   #if !defined(SIMDE_X86_PCLMUL_NATIVE)
3821     #define SIMDE_X86_PCLMUL_ENABLE_NATIVE_ALIASES
3822   #endif
3823   #if !defined(SIMDE_X86_VPCLMULQDQ_NATIVE)
3824     #define SIMDE_X86_VPCLMULQDQ_ENABLE_NATIVE_ALIASES
3825   #endif
3826   #if !defined(SIMDE_X86_F16C_NATIVE)
3827     #define SIMDE_X86_F16C_ENABLE_NATIVE_ALIASES
3828   #endif
3829 
3830   #if !defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3831     #define SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES
3832   #endif
3833   #if !defined(SIMDE_ARM_NEON_A32V8_NATIVE)
3834     #define SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES
3835   #endif
3836   #if !defined(SIMDE_ARM_NEON_A64V8_NATIVE)
3837     #define SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES
3838   #endif
3839 
3840   #if !defined(SIMDE_WASM_SIMD128_NATIVE)
3841     #define SIMDE_WASM_SIMD128_ENABLE_NATIVE_ALIASES
3842   #endif
3843 #endif
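
/* Illustrative only; the header path is an example and this assumes the SSE2
 * aliases end up enabled for the target:
 *
 *   #define SIMDE_ENABLE_NATIVE_ALIASES
 *   #include "simde/x86/sse2.h"
 *   __m128i v = _mm_set1_epi32(42);   (resolves to simde_mm_set1_epi32)
 */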
3844 
3845 /* Are floating point values stored using IEEE 754?  Knowing
3846  * this during preprocessing is a bit tricky, mostly because what
3847  * we're curious about is how values are stored and not whether the
3848  * implementation is fully conformant in terms of rounding, NaN
3849  * handling, etc.
3850  *
3851  * For example, if you use -ffast-math or -Ofast on
3852  * GCC or clang, IEEE 754 isn't strictly followed, therefore IEEE 754
3853  * support is not advertised (by defining __STDC_IEC_559__).
3854  *
3855  * However, what we care about is whether it is safe to assume that
3856  * floating point values are stored in IEEE 754 format, in which case
3857  * we can provide faster implementations of some functions.
3858  *
3859  * Luckily every vaguely modern architecture I'm aware of uses IEEE 754,
3860  * so we just assume IEEE 754 for now.  There is a test which verifies
3861  * this; if that test fails somewhere please let us know and we'll add
3862  * an exception for that platform.  Meanwhile, you can define
3863  * SIMDE_NO_IEEE754_STORAGE. */
3864 #if !defined(SIMDE_IEEE754_STORAGE) && !defined(SIMDE_NO_IEE754_STORAGE)
3865   #define SIMDE_IEEE754_STORAGE
3866 #endif
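
/* A minimal sketch of the kind of shortcut this permits, assuming
 * SIMDE_IEEE754_STORAGE is defined (hypothetical code, memcpy from
 * <string.h>):
 *
 *   uint32_t bits;
 *   memcpy(&bits, &f, sizeof(bits));
 *   int is_negative = (int) (bits >> 31);   (sign bit of an IEEE 754 float)
 */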
3867 
3868 #endif /* !defined(SIMDE_FEATURES_H) */
3869 /* :: End ../simde/simde/simde-features.h :: */
3870 /* AUTOMATICALLY GENERATED FILE, DO NOT MODIFY */
3871 /* e8b7a2ec175ceb3725ce0827ef9a6725b6309cc9 */
3872 /* AUTOMATICALLY GENERATED FILE, DO NOT MODIFY */
3873 /* e8b7a2ec175ceb3725ce0827ef9a6725b6309cc9 */
3874 /* :: Begin ../simde/simde/simde-math.h :: */
3875 /* SPDX-License-Identifier: MIT
3876  *
3877  * Permission is hereby granted, free of charge, to any person
3878  * obtaining a copy of this software and associated documentation
3879  * files (the "Software"), to deal in the Software without
3880  * restriction, including without limitation the rights to use, copy,
3881  * modify, merge, publish, distribute, sublicense, and/or sell copies
3882  * of the Software, and to permit persons to whom the Software is
3883  * furnished to do so, subject to the following conditions:
3884  *
3885  * The above copyright notice and this permission notice shall be
3886  * included in all copies or substantial portions of the Software.
3887  *
3888  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
3889  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
3890  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
3891  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
3892  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
3893  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
3894  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
3895  * SOFTWARE.
3896  *
3897  * Copyright:
3898  *   2017-2020 Evan Nemerson <evan@nemerson.com>
3899  */
3900 
3901 /* Attempt to find math functions.  Functions may be in <cmath>,
3902  * <math.h>, compiler built-ins/intrinsics, or platform/architecture
3903  * specific headers.  In some cases, especially those not built in to
3904  * libm, we may need to define our own implementations. */
3905 
3906 #if !defined(SIMDE_MATH_H)
3907 #define SIMDE_MATH_H 1
3908 
3909 /* AUTOMATICALLY GENERATED FILE, DO NOT MODIFY */
3910 /* e8b7a2ec175ceb3725ce0827ef9a6725b6309cc9 */
3911 /* AUTOMATICALLY GENERATED FILE, DO NOT MODIFY */
3912 /* e8b7a2ec175ceb3725ce0827ef9a6725b6309cc9 */
3913 
3914 #include <stdint.h>
3915 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
3916   #include <arm_neon.h>
3917 #endif
3918 
3919 HEDLEY_DIAGNOSTIC_PUSH
3920 SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
3921 
3922 /* SLEEF support
3923  * https://sleef.org/
3924  *
3925  * If you include <sleef.h> prior to including SIMDe, SIMDe will use
3926  * SLEEF.  You can also define SIMDE_MATH_SLEEF_ENABLE prior to
3927  * including SIMDe to force the issue.
3928  *
3929  * Note that SLEEF does require linking to libsleef.
3930  *
3931  * By default, SIMDe will use the 1 ULP functions, but if you use
3932  * SIMDE_ACCURACY_PREFERENCE of 0 we will use up to 4 ULP.  This is
3933  * only the case for the simde_math_* functions; for code in other
3934  * SIMDe headers which calls SLEEF directly we may use functions with
3935  * greater error if the API we're implementing is less precise (for
3936  * example, SVML guarantees 4 ULP, so we will generally use the 3.5
3937  * ULP functions from SLEEF). */
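
/* Illustrative usage only (the include path is an example; linking against
 * libsleef is still required):
 *
 *   #include <sleef.h>
 *   #include "simde/x86/avx2.h"
 *
 * or, equivalently, define SIMDE_MATH_SLEEF_ENABLE before including SIMDe. */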
3938 #if !defined(SIMDE_MATH_SLEEF_DISABLE)
3939   #if defined(__SLEEF_H__)
3940     #define SIMDE_MATH_SLEEF_ENABLE
3941   #endif
3942 #endif
3943 
3944 #if defined(SIMDE_MATH_SLEEF_ENABLE) && !defined(__SLEEF_H__)
3945   HEDLEY_DIAGNOSTIC_PUSH
3946   SIMDE_DIAGNOSTIC_DISABLE_IGNORED_QUALIFIERS_
3947   #include <sleef.h>
3948   HEDLEY_DIAGNOSTIC_POP
3949 #endif
3950 
3951 #if defined(SIMDE_MATH_SLEEF_ENABLE) && defined(__SLEEF_H__)
3952   #if defined(SLEEF_VERSION_MAJOR)
3953     #define SIMDE_MATH_SLEEF_VERSION_CHECK(major, minor, patch) (HEDLEY_VERSION_ENCODE(SLEEF_VERSION_MAJOR, SLEEF_VERSION_MINOR, SLEEF_VERSION_PATCHLEVEL) >= HEDLEY_VERSION_ENCODE(major, minor, patch))
3954   #else
3955     #define SIMDE_MATH_SLEEF_VERSION_CHECK(major, minor, patch) (HEDLEY_VERSION_ENCODE(3,0,0) >= HEDLEY_VERSION_ENCODE(major, minor, patch))
3956   #endif
3957 #else
3958   #define SIMDE_MATH_SLEEF_VERSION_CHECK(major, minor, patch) (0)
3959 #endif
3960 
3961 #if defined(__has_builtin)
3962   #define SIMDE_MATH_BUILTIN_LIBM(func) __has_builtin(__builtin_##func)
3963 #elif \
3964     HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
3965     HEDLEY_ARM_VERSION_CHECK(4,1,0) || \
3966     HEDLEY_GCC_VERSION_CHECK(4,4,0)
3967   #define SIMDE_MATH_BUILTIN_LIBM(func) (1)
3968 #else
3969   #define SIMDE_MATH_BUILTIN_LIBM(func) (0)
3970 #endif
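
/* Illustrative, hypothetical usage:
 *
 *   #if SIMDE_MATH_BUILTIN_LIBM(sqrtf)
 *     #define simde_math_sqrtf(v) __builtin_sqrtf(v)
 *   #elif defined(SIMDE_MATH_HAVE_MATH_H)
 *     #define simde_math_sqrtf(v) sqrtf(v)
 *   #endif
 */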
3971 
3972 #if defined(HUGE_VAL)
3973   /* Looks like <math.h> or <cmath> has already been included. */
3974 
3975   /* The math.h from libc++ (yes, the C header from the C++ standard
3976    * library) will define an isnan function, but not an isnan macro
3977    * like the C standard requires.  So we detect the header guards
3978    * macro libc++ uses. */
3979   #if defined(isnan) || (defined(_LIBCPP_MATH_H) && !defined(_LIBCPP_CMATH))
3980     #define SIMDE_MATH_HAVE_MATH_H
3981   #elif defined(__cplusplus)
3982     #define SIMDE_MATH_HAVE_CMATH
3983   #endif
3984 #elif defined(__has_include)
3985   #if defined(__cplusplus) && (__cplusplus >= 201103L) && __has_include(<cmath>)
3986     #define SIMDE_MATH_HAVE_CMATH
3987     #include <cmath>
3988   #elif __has_include(<math.h>)
3989     #define SIMDE_MATH_HAVE_MATH_H
3990     #include <math.h>
3991   #elif !defined(SIMDE_MATH_NO_LIBM)
3992     #define SIMDE_MATH_NO_LIBM
3993   #endif
3994 #elif !defined(SIMDE_MATH_NO_LIBM)
3995   #if defined(__cplusplus) && (__cplusplus >= 201103L)
3996     #define SIMDE_MATH_HAVE_CMATH
3997     HEDLEY_DIAGNOSTIC_PUSH
3998     #if defined(HEDLEY_MSVC_VERSION)
3999       /* VS 14 emits this diagnostic about noexcept being used on a
4000        * <cmath> function, which we can't do anything about. */
4001       #pragma warning(disable:4996)
4002     #endif
4003     #include <cmath>
4004     HEDLEY_DIAGNOSTIC_POP
4005   #else
4006     #define SIMDE_MATH_HAVE_MATH_H
4007     #include <math.h>
4008   #endif
4009 #endif
4010 
4011 #if !defined(SIMDE_MATH_INFINITY)
4012   #if \
4013       HEDLEY_HAS_BUILTIN(__builtin_inf) || \
4014       HEDLEY_GCC_VERSION_CHECK(3,3,0) || \
4015       HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
4016       HEDLEY_ARM_VERSION_CHECK(4,1,0) || \
4017       HEDLEY_CRAY_VERSION_CHECK(8,1,0)
4018     #define SIMDE_MATH_INFINITY (__builtin_inf())
4019   #elif defined(INFINITY)
4020     #define SIMDE_MATH_INFINITY INFINITY
4021   #endif
4022 #endif
4023 
4024 #if !defined(SIMDE_MATH_INFINITYF)
4025   #if \
4026       HEDLEY_HAS_BUILTIN(__builtin_inff) || \
4027       HEDLEY_GCC_VERSION_CHECK(3,3,0) || \
4028       HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
4029       HEDLEY_CRAY_VERSION_CHECK(8,1,0) || \
4030       HEDLEY_IBM_VERSION_CHECK(13,1,0)
4031     #define SIMDE_MATH_INFINITYF (__builtin_inff())
4032   #elif defined(INFINITYF)
4033     #define SIMDE_MATH_INFINITYF INFINITYF
4034   #elif defined(SIMDE_MATH_INFINITY)
4035     #define SIMDE_MATH_INFINITYF HEDLEY_STATIC_CAST(float, SIMDE_MATH_INFINITY)
4036   #endif
4037 #endif
4038 
4039 #if !defined(SIMDE_MATH_NAN)
4040   #if \
4041       HEDLEY_HAS_BUILTIN(__builtin_nan) || \
4042       HEDLEY_GCC_VERSION_CHECK(3,3,0) || \
4043       HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
4044       HEDLEY_ARM_VERSION_CHECK(4,1,0) || \
4045       HEDLEY_CRAY_VERSION_CHECK(8,1,0) || \
4046       HEDLEY_IBM_VERSION_CHECK(13,1,0)
4047     #define SIMDE_MATH_NAN (__builtin_nan(""))
4048   #elif defined(NAN)
4049     #define SIMDE_MATH_NAN NAN
4050   #endif
4051 #endif
4052 
4053 #if !defined(SIMDE_MATH_NANF)
4054   #if \
4055       HEDLEY_HAS_BUILTIN(__builtin_nanf) || \
4056       HEDLEY_GCC_VERSION_CHECK(3,3,0) || \
4057       HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
4058       HEDLEY_ARM_VERSION_CHECK(4,1,0) || \
4059       HEDLEY_CRAY_VERSION_CHECK(8,1,0)
4060     #define SIMDE_MATH_NANF (__builtin_nanf(""))
4061   #elif defined(NANF)
4062     #define SIMDE_MATH_NANF NANF
4063   #elif defined(SIMDE_MATH_NAN)
4064     #define SIMDE_MATH_NANF HEDLEY_STATIC_CAST(float, SIMDE_MATH_NAN)
4065   #endif
4066 #endif
4067 
4068 #if !defined(SIMDE_MATH_PI)
4069   #if defined(M_PI)
4070     #define SIMDE_MATH_PI M_PI
4071   #else
4072     #define SIMDE_MATH_PI 3.14159265358979323846
4073   #endif
4074 #endif
4075 
4076 #if !defined(SIMDE_MATH_PIF)
4077   #if defined(M_PI)
4078     #define SIMDE_MATH_PIF HEDLEY_STATIC_CAST(float, M_PI)
4079   #else
4080     #define SIMDE_MATH_PIF 3.14159265358979323846f
4081   #endif
4082 #endif
4083 
4084 #if !defined(SIMDE_MATH_PI_OVER_180)
4085   #define SIMDE_MATH_PI_OVER_180 0.0174532925199432957692369076848861271344287188854172545609719144
4086 #endif
4087 
4088 #if !defined(SIMDE_MATH_PI_OVER_180F)
4089   #define SIMDE_MATH_PI_OVER_180F 0.0174532925199432957692369076848861271344287188854172545609719144f
4090 #endif
4091 
4092 #if !defined(SIMDE_MATH_180_OVER_PI)
4093   #define SIMDE_MATH_180_OVER_PI 57.295779513082320876798154814105170332405472466564321549160243861
4094 #endif
4095 
4096 #if !defined(SIMDE_MATH_180_OVER_PIF)
4097   #define SIMDE_MATH_180_OVER_PIF 57.295779513082320876798154814105170332405472466564321549160243861f
4098 #endif
4099 
4100 #if !defined(SIMDE_MATH_FLT_MIN)
4101   #if defined(FLT_MIN)
4102     #define SIMDE_MATH_FLT_MIN FLT_MIN
4103   #elif defined(__FLT_MIN__)
4104     #define SIMDE_MATH_FLT_MIN __FLT_MIN__
4105   #elif defined(__cplusplus)
4106     #include <cfloat>
4107     #define SIMDE_MATH_FLT_MIN FLT_MIN
4108   #else
4109     #include <float.h>
4110     #define SIMDE_MATH_FLT_MIN FLT_MIN
4111   #endif
4112 #endif
4113 
4114 #if !defined(SIMDE_MATH_DBL_MIN)
4115   #if defined(DBL_MIN)
4116     #define SIMDE_MATH_DBL_MIN DBL_MIN
4117   #elif defined(__DBL_MIN__)
4118     #define SIMDE_MATH_DBL_MIN __DBL_MIN__
4119   #elif defined(__cplusplus)
4120     #include <cfloat>
4121     #define SIMDE_MATH_DBL_MIN DBL_MIN
4122   #else
4123     #include <float.h>
4124     #define SIMDE_MATH_DBL_MIN DBL_MIN
4125   #endif
4126 #endif
4127 
4128 /*** Classification macros from C99 ***/
4129 
4130 #if !defined(simde_math_isinf)
4131   #if SIMDE_MATH_BUILTIN_LIBM(isinf)
4132     #define simde_math_isinf(v) __builtin_isinf(v)
4133   #elif defined(isinf) || defined(SIMDE_MATH_HAVE_MATH_H)
4134     #define simde_math_isinf(v) isinf(v)
4135   #elif defined(SIMDE_MATH_HAVE_CMATH)
4136     #define simde_math_isinf(v) std::isinf(v)
4137   #endif
4138 #endif
4139 
4140 #if !defined(simde_math_isinff)
4141   #if HEDLEY_HAS_BUILTIN(__builtin_isinff) || \
4142       HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
4143       HEDLEY_ARM_VERSION_CHECK(4,1,0)
4144     #define simde_math_isinff(v) __builtin_isinff(v)
4145   #elif defined(SIMDE_MATH_HAVE_CMATH)
4146     #define simde_math_isinff(v) std::isinf(v)
4147   #elif defined(simde_math_isinf)
4148     #define simde_math_isinff(v) simde_math_isinf(HEDLEY_STATIC_CAST(double, v))
4149   #endif
4150 #endif
4151 
4152 #if !defined(simde_math_isnan)
4153   #if SIMDE_MATH_BUILTIN_LIBM(isnan)
4154     #define simde_math_isnan(v) __builtin_isnan(v)
4155   #elif defined(isnan) || defined(SIMDE_MATH_HAVE_MATH_H)
4156     #define simde_math_isnan(v) isnan(v)
4157   #elif defined(SIMDE_MATH_HAVE_CMATH)
4158     #define simde_math_isnan(v) std::isnan(v)
4159   #endif
4160 #endif
4161 
4162 #if !defined(simde_math_isnanf)
4163   #if HEDLEY_HAS_BUILTIN(__builtin_isnanf) || \
4164       HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
4165       HEDLEY_ARM_VERSION_CHECK(4,1,0)
4166     /* XL C/C++ has __builtin_isnan but not __builtin_isnanf */
4167     #define simde_math_isnanf(v) __builtin_isnanf(v)
4168   #elif defined(SIMDE_MATH_HAVE_CMATH)
4169     #define simde_math_isnanf(v) std::isnan(v)
4170   #elif defined(simde_math_isnan)
4171     #define simde_math_isnanf(v) simde_math_isnan(HEDLEY_STATIC_CAST(double, v))
4172   #endif
4173 #endif
4174 
4175 #if !defined(simde_math_isnormal)
4176   #if SIMDE_MATH_BUILTIN_LIBM(isnormal)
4177     #define simde_math_isnormal(v) __builtin_isnormal(v)
4178   #elif defined(SIMDE_MATH_HAVE_MATH_H)
4179     #define simde_math_isnormal(v) isnormal(v)
4180   #elif defined(SIMDE_MATH_HAVE_CMATH)
4181     #define simde_math_isnormal(v) std::isnormal(v)
4182   #endif
4183 #endif
4184 
4185 #if !defined(simde_math_isnormalf)
4186   #if HEDLEY_HAS_BUILTIN(__builtin_isnormalf)
4187     #define simde_math_isnormalf(v) __builtin_isnormalf(v)
4188   #elif SIMDE_MATH_BUILTIN_LIBM(isnormal)
4189     #define simde_math_isnormalf(v) __builtin_isnormal(v)
4190   #elif defined(isnormalf)
4191     #define simde_math_isnormalf(v) isnormalf(v)
4192   #elif defined(isnormal) || defined(SIMDE_MATH_HAVE_MATH_H)
4193     #define simde_math_isnormalf(v) isnormal(v)
4194   #elif defined(SIMDE_MATH_HAVE_CMATH)
4195     #define simde_math_isnormalf(v) std::isnormal(v)
4196   #elif defined(simde_math_isnormal)
4197     #define simde_math_isnormalf(v) simde_math_isnormal(v)
4198   #endif
4199 #endif
4200 
4201 /*** Manipulation functions ***/
4202 
4203 #if !defined(simde_math_nextafter)
4204   #if \
4205       (HEDLEY_HAS_BUILTIN(__builtin_nextafter) && !defined(HEDLEY_IBM_VERSION)) || \
4206       HEDLEY_ARM_VERSION_CHECK(4,1,0) || \
4207       HEDLEY_GCC_VERSION_CHECK(3,4,0) || \
4208       HEDLEY_INTEL_VERSION_CHECK(13,0,0)
4209     #define simde_math_nextafter(x, y) __builtin_nextafter(x, y)
4210   #elif defined(SIMDE_MATH_HAVE_CMATH)
4211     #define simde_math_nextafter(x, y) std::nextafter(x, y)
4212   #elif defined(SIMDE_MATH_HAVE_MATH_H)
4213     #define simde_math_nextafter(x, y) nextafter(x, y)
4214   #endif
4215 #endif
4216 
4217 #if !defined(simde_math_nextafterf)
4218   #if \
4219       (HEDLEY_HAS_BUILTIN(__builtin_nextafterf) && !defined(HEDLEY_IBM_VERSION)) || \
4220       HEDLEY_ARM_VERSION_CHECK(4,1,0) || \
4221       HEDLEY_GCC_VERSION_CHECK(3,4,0) || \
4222       HEDLEY_INTEL_VERSION_CHECK(13,0,0)
4223     #define simde_math_nextafterf(x, y) __builtin_nextafterf(x, y)
4224   #elif defined(SIMDE_MATH_HAVE_CMATH)
4225     #define simde_math_nextafterf(x, y) std::nextafter(x, y)
4226   #elif defined(SIMDE_MATH_HAVE_MATH_H)
4227     #define simde_math_nextafterf(x, y) nextafterf(x, y)
4228   #endif
4229 #endif
4230 
4231 /*** Functions from C99 ***/
4232 
4233 #if !defined(simde_math_abs)
4234   #if SIMDE_MATH_BUILTIN_LIBM(abs)
4235     #define simde_math_abs(v) __builtin_abs(v)
4236   #elif defined(SIMDE_MATH_HAVE_CMATH)
4237     #define simde_math_abs(v) std::abs(v)
4238   #elif defined(SIMDE_MATH_HAVE_MATH_H)
4239     #define simde_math_abs(v) abs(v)
4240   #endif
4241 #endif
4242 
4243 #if !defined(simde_math_fabsf)
4244   #if SIMDE_MATH_BUILTIN_LIBM(fabsf)
4245     #define simde_math_fabsf(v) __builtin_fabsf(v)
4246   #elif defined(SIMDE_MATH_HAVE_CMATH)
4247     #define simde_math_fabsf(v) std::abs(v)
4248   #elif defined(SIMDE_MATH_HAVE_MATH_H)
4249     #define simde_math_fabsf(v) fabsf(v)
4250   #endif
4251 #endif
4252 
4253 #if !defined(simde_math_acos)
4254   #if SIMDE_MATH_BUILTIN_LIBM(acos)
4255     #define simde_math_acos(v) __builtin_acos(v)
4256   #elif defined(SIMDE_MATH_HAVE_CMATH)
4257     #define simde_math_acos(v) std::acos(v)
4258   #elif defined(SIMDE_MATH_HAVE_MATH_H)
4259     #define simde_math_acos(v) acos(v)
4260   #endif
4261 #endif
4262 
4263 #if !defined(simde_math_acosf)
4264   #if SIMDE_MATH_BUILTIN_LIBM(acosf)
4265     #define simde_math_acosf(v) __builtin_acosf(v)
4266   #elif defined(SIMDE_MATH_HAVE_CMATH)
4267     #define simde_math_acosf(v) std::acos(v)
4268   #elif defined(SIMDE_MATH_HAVE_MATH_H)
4269     #define simde_math_acosf(v) acosf(v)
4270   #endif
4271 #endif
4272 
4273 #if !defined(simde_math_acosh)
4274   #if SIMDE_MATH_BUILTIN_LIBM(acosh)
4275     #define simde_math_acosh(v) __builtin_acosh(v)
4276   #elif defined(SIMDE_MATH_HAVE_CMATH)
4277     #define simde_math_acosh(v) std::acosh(v)
4278   #elif defined(SIMDE_MATH_HAVE_MATH_H)
4279     #define simde_math_acosh(v) acosh(v)
4280   #endif
4281 #endif
4282 
4283 #if !defined(simde_math_acoshf)
4284   #if SIMDE_MATH_BUILTIN_LIBM(acoshf)
4285     #define simde_math_acoshf(v) __builtin_acoshf(v)
4286   #elif defined(SIMDE_MATH_HAVE_CMATH)
4287     #define simde_math_acoshf(v) std::acosh(v)
4288   #elif defined(SIMDE_MATH_HAVE_MATH_H)
4289     #define simde_math_acoshf(v) acoshf(v)
4290   #endif
4291 #endif
4292 
4293 #if !defined(simde_math_asin)
4294   #if SIMDE_MATH_BUILTIN_LIBM(asin)
4295     #define simde_math_asin(v) __builtin_asin(v)
4296   #elif defined(SIMDE_MATH_HAVE_CMATH)
4297     #define simde_math_asin(v) std::asin(v)
4298   #elif defined(SIMDE_MATH_HAVE_MATH_H)
4299     #define simde_math_asin(v) asin(v)
4300   #endif
4301 #endif
4302 
4303 #if !defined(simde_math_asinf)
4304   #if SIMDE_MATH_BUILTIN_LIBM(asinf)
4305     #define simde_math_asinf(v) __builtin_asinf(v)
4306   #elif defined(SIMDE_MATH_HAVE_CMATH)
4307     #define simde_math_asinf(v) std::asin(v)
4308   #elif defined(SIMDE_MATH_HAVE_MATH_H)
4309     #define simde_math_asinf(v) asinf(v)
4310   #endif
4311 #endif
4312 
4313 #if !defined(simde_math_asinh)
4314   #if SIMDE_MATH_BUILTIN_LIBM(asinh)
4315     #define simde_math_asinh(v) __builtin_asinh(v)
4316   #elif defined(SIMDE_MATH_HAVE_CMATH)
4317     #define simde_math_asinh(v) std::asinh(v)
4318   #elif defined(SIMDE_MATH_HAVE_MATH_H)
4319     #define simde_math_asinh(v) asinh(v)
4320   #endif
4321 #endif
4322 
4323 #if !defined(simde_math_asinhf)
4324   #if SIMDE_MATH_BUILTIN_LIBM(asinhf)
4325     #define simde_math_asinhf(v) __builtin_asinhf(v)
4326   #elif defined(SIMDE_MATH_HAVE_CMATH)
4327     #define simde_math_asinhf(v) std::asinh(v)
4328   #elif defined(SIMDE_MATH_HAVE_MATH_H)
4329     #define simde_math_asinhf(v) asinhf(v)
4330   #endif
4331 #endif
4332 
4333 #if !defined(simde_math_atan)
4334   #if SIMDE_MATH_BUILTIN_LIBM(atan)
4335     #define simde_math_atan(v) __builtin_atan(v)
4336   #elif defined(SIMDE_MATH_HAVE_CMATH)
4337     #define simde_math_atan(v) std::atan(v)
4338   #elif defined(SIMDE_MATH_HAVE_MATH_H)
4339     #define simde_math_atan(v) atan(v)
4340   #endif
4341 #endif
4342 
4343 #if !defined(simde_math_atan2)
4344   #if SIMDE_MATH_BUILTIN_LIBM(atan2)
4345     #define simde_math_atan2(y, x) __builtin_atan2(y, x)
4346   #elif defined(SIMDE_MATH_HAVE_CMATH)
4347     #define simde_math_atan2(y, x) std::atan2(y, x)
4348   #elif defined(SIMDE_MATH_HAVE_MATH_H)
4349     #define simde_math_atan2(y, x) atan2(y, x)
4350   #endif
4351 #endif
4352 
4353 #if !defined(simde_math_atan2f)
4354   #if SIMDE_MATH_BUILTIN_LIBM(atan2f)
4355     #define simde_math_atan2f(y, x) __builtin_atan2f(y, x)
4356   #elif defined(SIMDE_MATH_HAVE_CMATH)
4357     #define simde_math_atan2f(y, x) std::atan2(y, x)
4358   #elif defined(SIMDE_MATH_HAVE_MATH_H)
4359     #define simde_math_atan2f(y, x) atan2f(y, x)
4360   #endif
4361 #endif
4362 
4363 #if !defined(simde_math_atanf)
4364   #if SIMDE_MATH_BUILTIN_LIBM(atanf)
4365     #define simde_math_atanf(v) __builtin_atanf(v)
4366   #elif defined(SIMDE_MATH_HAVE_CMATH)
4367     #define simde_math_atanf(v) std::atan(v)
4368   #elif defined(SIMDE_MATH_HAVE_MATH_H)
4369     #define simde_math_atanf(v) atanf(v)
4370   #endif
4371 #endif
4372 
4373 #if !defined(simde_math_atanh)
4374   #if SIMDE_MATH_BUILTIN_LIBM(atanh)
4375     #define simde_math_atanh(v) __builtin_atanh(v)
4376   #elif defined(SIMDE_MATH_HAVE_CMATH)
4377     #define simde_math_atanh(v) std::atanh(v)
4378   #elif defined(SIMDE_MATH_HAVE_MATH_H)
4379     #define simde_math_atanh(v) atanh(v)
4380   #endif
4381 #endif
4382 
4383 #if !defined(simde_math_atanhf)
4384   #if SIMDE_MATH_BUILTIN_LIBM(atanhf)
4385     #define simde_math_atanhf(v) __builtin_atanhf(v)
4386   #elif defined(SIMDE_MATH_HAVE_CMATH)
4387     #define simde_math_atanhf(v) std::atanh(v)
4388   #elif defined(SIMDE_MATH_HAVE_MATH_H)
4389     #define simde_math_atanhf(v) atanhf(v)
4390   #endif
4391 #endif
4392 
4393 #if !defined(simde_math_cbrt)
4394   #if SIMDE_MATH_BUILTIN_LIBM(cbrt)
4395     #define simde_math_cbrt(v) __builtin_cbrt(v)
4396   #elif defined(SIMDE_MATH_HAVE_CMATH)
4397     #define simde_math_cbrt(v) std::cbrt(v)
4398   #elif defined(SIMDE_MATH_HAVE_MATH_H)
4399     #define simde_math_cbrt(v) cbrt(v)
4400   #endif
4401 #endif
4402 
4403 #if !defined(simde_math_cbrtf)
4404   #if SIMDE_MATH_BUILTIN_LIBM(cbrtf)
4405     #define simde_math_cbrtf(v) __builtin_cbrtf(v)
4406   #elif defined(SIMDE_MATH_HAVE_CMATH)
4407     #define simde_math_cbrtf(v) std::cbrt(v)
4408   #elif defined(SIMDE_MATH_HAVE_MATH_H)
4409     #define simde_math_cbrtf(v) cbrtf(v)
4410   #endif
4411 #endif
4412 
4413 #if !defined(simde_math_ceil)
4414   #if SIMDE_MATH_BUILTIN_LIBM(ceil)
4415     #define simde_math_ceil(v) __builtin_ceil(v)
4416   #elif defined(SIMDE_MATH_HAVE_CMATH)
4417     #define simde_math_ceil(v) std::ceil(v)
4418   #elif defined(SIMDE_MATH_HAVE_MATH_H)
4419     #define simde_math_ceil(v) ceil(v)
4420   #endif
4421 #endif
4422 
4423 #if !defined(simde_math_ceilf)
4424   #if SIMDE_MATH_BUILTIN_LIBM(ceilf)
4425     #define simde_math_ceilf(v) __builtin_ceilf(v)
4426   #elif defined(SIMDE_MATH_HAVE_CMATH)
4427     #define simde_math_ceilf(v) std::ceil(v)
4428   #elif defined(SIMDE_MATH_HAVE_MATH_H)
4429     #define simde_math_ceilf(v) ceilf(v)
4430   #endif
4431 #endif
4432 
4433 #if !defined(simde_math_copysign)
4434   #if SIMDE_MATH_BUILTIN_LIBM(copysign)
4435     #define simde_math_copysign(x, y) __builtin_copysign(x, y)
4436   #elif defined(SIMDE_MATH_HAVE_CMATH)
4437     #define simde_math_copysign(x, y) std::copysign(x, y)
4438   #elif defined(SIMDE_MATH_HAVE_MATH_H)
4439     #define simde_math_copysign(x, y) copysign(x, y)
4440   #endif
4441 #endif
4442 
4443 #if !defined(simde_math_copysignf)
4444   #if SIMDE_MATH_BUILTIN_LIBM(copysignf)
4445     #define simde_math_copysignf(x, y) __builtin_copysignf(x, y)
4446   #elif defined(SIMDE_MATH_HAVE_CMATH)
4447     #define simde_math_copysignf(x, y) std::copysign(x, y)
4448   #elif defined(SIMDE_MATH_HAVE_MATH_H)
4449     #define simde_math_copysignf(x, y) copysignf(x, y)
4450   #endif
4451 #endif
4452 
4453 #if !defined(simde_math_cos)
4454   #if SIMDE_MATH_BUILTIN_LIBM(cos)
4455     #define simde_math_cos(v) __builtin_cos(v)
4456   #elif defined(SIMDE_MATH_HAVE_CMATH)
4457     #define simde_math_cos(v) std::cos(v)
4458   #elif defined(SIMDE_MATH_HAVE_MATH_H)
4459     #define simde_math_cos(v) cos(v)
4460   #endif
4461 #endif
4462 
4463 #if !defined(simde_math_cosf)
4464   #if defined(SIMDE_MATH_SLEEF_ENABLE)
4465     #if SIMDE_ACCURACY_PREFERENCE < 1
4466       #define simde_math_cosf(v) Sleef_cosf_u35(v)
4467     #else
4468       #define simde_math_cosf(v) Sleef_cosf_u10(v)
4469     #endif
4470   #elif SIMDE_MATH_BUILTIN_LIBM(cosf)
4471     #define simde_math_cosf(v) __builtin_cosf(v)
4472   #elif defined(SIMDE_MATH_HAVE_CMATH)
4473     #define simde_math_cosf(v) std::cos(v)
4474   #elif defined(SIMDE_MATH_HAVE_MATH_H)
4475     #define simde_math_cosf(v) cosf(v)
4476   #endif
4477 #endif
4478 
4479 #if !defined(simde_math_cosh)
4480   #if SIMDE_MATH_BUILTIN_LIBM(cosh)
4481     #define simde_math_cosh(v) __builtin_cosh(v)
4482   #elif defined(SIMDE_MATH_HAVE_CMATH)
4483     #define simde_math_cosh(v) std::cosh(v)
4484   #elif defined(SIMDE_MATH_HAVE_MATH_H)
4485     #define simde_math_cosh(v) cosh(v)
4486   #endif
4487 #endif
4488 
4489 #if !defined(simde_math_coshf)
4490   #if SIMDE_MATH_BUILTIN_LIBM(coshf)
4491     #define simde_math_coshf(v) __builtin_coshf(v)
4492   #elif defined(SIMDE_MATH_HAVE_CMATH)
4493     #define simde_math_coshf(v) std::cosh(v)
4494   #elif defined(SIMDE_MATH_HAVE_MATH_H)
4495     #define simde_math_coshf(v) coshf(v)
4496   #endif
4497 #endif
4498 
4499 #if !defined(simde_math_erf)
4500   #if SIMDE_MATH_BUILTIN_LIBM(erf)
4501     #define simde_math_erf(v) __builtin_erf(v)
4502   #elif defined(SIMDE_MATH_HAVE_CMATH)
4503     #define simde_math_erf(v) std::erf(v)
4504   #elif defined(SIMDE_MATH_HAVE_MATH_H)
4505     #define simde_math_erf(v) erf(v)
4506   #endif
4507 #endif
4508 
4509 #if !defined(simde_math_erff)
4510   #if SIMDE_MATH_BUILTIN_LIBM(erff)
4511     #define simde_math_erff(v) __builtin_erff(v)
4512   #elif defined(SIMDE_MATH_HAVE_CMATH)
4513     #define simde_math_erff(v) std::erf(v)
4514   #elif defined(SIMDE_MATH_HAVE_MATH_H)
4515     #define simde_math_erff(v) erff(v)
4516   #endif
4517 #endif
4518 
4519 #if !defined(simde_math_erfc)
4520   #if SIMDE_MATH_BUILTIN_LIBM(erfc)
4521     #define simde_math_erfc(v) __builtin_erfc(v)
4522   #elif defined(SIMDE_MATH_HAVE_CMATH)
4523     #define simde_math_erfc(v) std::erfc(v)
4524   #elif defined(SIMDE_MATH_HAVE_MATH_H)
4525     #define simde_math_erfc(v) erfc(v)
4526   #endif
4527 #endif
4528 
4529 #if !defined(simde_math_erfcf)
4530   #if SIMDE_MATH_BUILTIN_LIBM(erfcf)
4531     #define simde_math_erfcf(v) __builtin_erfcf(v)
4532   #elif defined(SIMDE_MATH_HAVE_CMATH)
4533     #define simde_math_erfcf(v) std::erfc(v)
4534   #elif defined(SIMDE_MATH_HAVE_MATH_H)
4535     #define simde_math_erfcf(v) erfcf(v)
4536   #endif
4537 #endif
4538 
4539 #if !defined(simde_math_exp)
4540   #if SIMDE_MATH_BUILTIN_LIBM(exp)
4541     #define simde_math_exp(v) __builtin_exp(v)
4542   #elif defined(SIMDE_MATH_HAVE_CMATH)
4543     #define simde_math_exp(v) std::exp(v)
4544   #elif defined(SIMDE_MATH_HAVE_MATH_H)
4545     #define simde_math_exp(v) exp(v)
4546   #endif
4547 #endif
4548 
4549 #if !defined(simde_math_expf)
4550   #if SIMDE_MATH_BUILTIN_LIBM(expf)
4551     #define simde_math_expf(v) __builtin_expf(v)
4552   #elif defined(SIMDE_MATH_HAVE_CMATH)
4553     #define simde_math_expf(v) std::exp(v)
4554   #elif defined(SIMDE_MATH_HAVE_MATH_H)
4555     #define simde_math_expf(v) expf(v)
4556   #endif
4557 #endif
4558 
4559 #if !defined(simde_math_expm1)
4560   #if SIMDE_MATH_BUILTIN_LIBM(expm1)
4561     #define simde_math_expm1(v) __builtin_expm1(v)
4562   #elif defined(SIMDE_MATH_HAVE_CMATH)
4563     #define simde_math_expm1(v) std::expm1(v)
4564   #elif defined(SIMDE_MATH_HAVE_MATH_H)
4565     #define simde_math_expm1(v) expm1(v)
4566   #endif
4567 #endif
4568 
4569 #if !defined(simde_math_expm1f)
4570   #if SIMDE_MATH_BUILTIN_LIBM(expm1f)
4571     #define simde_math_expm1f(v) __builtin_expm1f(v)
4572   #elif defined(SIMDE_MATH_HAVE_CMATH)
4573     #define simde_math_expm1f(v) std::expm1(v)
4574   #elif defined(SIMDE_MATH_HAVE_MATH_H)
4575     #define simde_math_expm1f(v) expm1f(v)
4576   #endif
4577 #endif
4578 
4579 #if !defined(simde_math_exp2)
4580   #if SIMDE_MATH_BUILTIN_LIBM(exp2)
4581     #define simde_math_exp2(v) __builtin_exp2(v)
4582   #elif defined(SIMDE_MATH_HAVE_CMATH)
4583     #define simde_math_exp2(v) std::exp2(v)
4584   #elif defined(SIMDE_MATH_HAVE_MATH_H)
4585     #define simde_math_exp2(v) exp2(v)
4586   #endif
4587 #endif
4588 
4589 #if !defined(simde_math_exp2f)
4590   #if SIMDE_MATH_BUILTIN_LIBM(exp2f)
4591     #define simde_math_exp2f(v) __builtin_exp2f(v)
4592   #elif defined(SIMDE_MATH_HAVE_CMATH)
4593     #define simde_math_exp2f(v) std::exp2(v)
4594   #elif defined(SIMDE_MATH_HAVE_MATH_H)
4595     #define simde_math_exp2f(v) exp2f(v)
4596   #endif
4597 #endif
4598 
4599 #if HEDLEY_HAS_BUILTIN(__builtin_exp10) || HEDLEY_GCC_VERSION_CHECK(3,4,0)
4600   #define simde_math_exp10(v) __builtin_exp10(v)
4601 #else
4602   #define simde_math_exp10(v) pow(10.0, (v))
4603 #endif
4604 
4605 #if HEDLEY_HAS_BUILTIN(__builtin_exp10f) || HEDLEY_GCC_VERSION_CHECK(3,4,0)
4606   #define simde_math_exp10f(v) __builtin_exp10f(v)
4607 #else
4608   #define simde_math_exp10f(v) powf(10.0f, (v))
4609 #endif
4610 
4611 #if !defined(simde_math_fabs)
4612   #if SIMDE_MATH_BUILTIN_LIBM(fabs)
4613     #define simde_math_fabs(v) __builtin_fabs(v)
4614   #elif defined(SIMDE_MATH_HAVE_CMATH)
4615     #define simde_math_fabs(v) std::fabs(v)
4616   #elif defined(SIMDE_MATH_HAVE_MATH_H)
4617     #define simde_math_fabs(v) fabs(v)
4618   #endif
4619 #endif
4620 
4621 #if !defined(simde_math_fabsf)
4622   #if SIMDE_MATH_BUILTIN_LIBM(fabsf)
4623     #define simde_math_fabsf(v) __builtin_fabsf(v)
4624   #elif defined(SIMDE_MATH_HAVE_CMATH)
4625     #define simde_math_fabsf(v) std::fabs(v)
4626   #elif defined(SIMDE_MATH_HAVE_MATH_H)
4627     #define simde_math_fabsf(v) fabsf(v)
4628   #endif
4629 #endif
4630 
4631 #if !defined(simde_math_floor)
4632   #if SIMDE_MATH_BUILTIN_LIBM(floor)
4633     #define simde_math_floor(v) __builtin_floor(v)
4634   #elif defined(SIMDE_MATH_HAVE_CMATH)
4635     #define simde_math_floor(v) std::floor(v)
4636   #elif defined(SIMDE_MATH_HAVE_MATH_H)
4637     #define simde_math_floor(v) floor(v)
4638   #endif
4639 #endif
4640 
4641 #if !defined(simde_math_floorf)
4642   #if SIMDE_MATH_BUILTIN_LIBM(floorf)
4643     #define simde_math_floorf(v) __builtin_floorf(v)
4644   #elif defined(SIMDE_MATH_HAVE_CMATH)
4645     #define simde_math_floorf(v) std::floor(v)
4646   #elif defined(SIMDE_MATH_HAVE_MATH_H)
4647     #define simde_math_floorf(v) floorf(v)
4648   #endif
4649 #endif
4650 
4651 #if !defined(simde_math_fma)
4652   #if SIMDE_MATH_BUILTIN_LIBM(fma)
4653     #define simde_math_fma(x, y, z) __builtin_fma(x, y, z)
4654   #elif defined(SIMDE_MATH_HAVE_CMATH)
4655     #define simde_math_fma(x, y, z) std::fma(x, y, z)
4656   #elif defined(SIMDE_MATH_HAVE_MATH_H)
4657     #define simde_math_fma(x, y, z) fma(x, y, z)
4658   #endif
4659 #endif
4660 
4661 #if !defined(simde_math_fmaf)
4662   #if SIMDE_MATH_BUILTIN_LIBM(fmaf)
4663     #define simde_math_fmaf(x, y, z) __builtin_fmaf(x, y, z)
4664   #elif defined(SIMDE_MATH_HAVE_CMATH)
4665     #define simde_math_fmaf(x, y, z) std::fma(x, y, z)
4666   #elif defined(SIMDE_MATH_HAVE_MATH_H)
4667     #define simde_math_fmaf(x, y, z) fmaf(x, y, z)
4668   #endif
4669 #endif
4670 
4671 #if !defined(simde_math_fmax)
4672   #if SIMDE_MATH_BUILTIN_LIBM(fmax)
4673     #define simde_math_fmax(x, y) __builtin_fmax(x, y)
4674   #elif defined(SIMDE_MATH_HAVE_CMATH)
4675     #define simde_math_fmax(x, y) std::fmax(x, y)
4676   #elif defined(SIMDE_MATH_HAVE_MATH_H)
4677     #define simde_math_fmax(x, y) fmax(x, y)
4678   #endif
4679 #endif
4680 
4681 #if !defined(simde_math_fmaxf)
4682   #if SIMDE_MATH_BUILTIN_LIBM(fmaxf)
4683     #define simde_math_fmaxf(x, y) __builtin_fmaxf(x, y)
4684   #elif defined(SIMDE_MATH_HAVE_CMATH)
4685     #define simde_math_fmaxf(x, y) std::fmax(x, y)
4686   #elif defined(SIMDE_MATH_HAVE_MATH_H)
4687     #define simde_math_fmaxf(x, y) fmaxf(x, y)
4688   #endif
4689 #endif
4690 
4691 #if !defined(simde_math_hypot)
4692   #if SIMDE_MATH_BUILTIN_LIBM(hypot)
4693     #define simde_math_hypot(y, x) __builtin_hypot(y, x)
4694   #elif defined(SIMDE_MATH_HAVE_CMATH)
4695     #define simde_math_hypot(y, x) std::hypot(y, x)
4696   #elif defined(SIMDE_MATH_HAVE_MATH_H)
4697     #define simde_math_hypot(y, x) hypot(y, x)
4698   #endif
4699 #endif
4700 
4701 #if !defined(simde_math_hypotf)
4702   #if SIMDE_MATH_BUILTIN_LIBM(hypotf)
4703     #define simde_math_hypotf(y, x) __builtin_hypotf(y, x)
4704   #elif defined(SIMDE_MATH_HAVE_CMATH)
4705     #define simde_math_hypotf(y, x) std::hypot(y, x)
4706   #elif defined(SIMDE_MATH_HAVE_MATH_H)
4707     #define simde_math_hypotf(y, x) hypotf(y, x)
4708   #endif
4709 #endif
4710 
4711 #if !defined(simde_math_log)
4712   #if SIMDE_MATH_BUILTIN_LIBM(log)
4713     #define simde_math_log(v) __builtin_log(v)
4714   #elif defined(SIMDE_MATH_HAVE_CMATH)
4715     #define simde_math_log(v) std::log(v)
4716   #elif defined(SIMDE_MATH_HAVE_MATH_H)
4717     #define simde_math_log(v) log(v)
4718   #endif
4719 #endif
4720 
4721 #if !defined(simde_math_logf)
4722   #if SIMDE_MATH_BUILTIN_LIBM(logf)
4723     #define simde_math_logf(v) __builtin_logf(v)
4724   #elif defined(SIMDE_MATH_HAVE_CMATH)
4725     #define simde_math_logf(v) std::log(v)
4726   #elif defined(SIMDE_MATH_HAVE_MATH_H)
4727     #define simde_math_logf(v) logf(v)
4728   #endif
4729 #endif
4730 
4731 #if !defined(simde_math_logb)
4732   #if SIMDE_MATH_BUILTIN_LIBM(logb)
4733     #define simde_math_logb(v) __builtin_logb(v)
4734   #elif defined(SIMDE_MATH_HAVE_CMATH)
4735     #define simde_math_logb(v) std::logb(v)
4736   #elif defined(SIMDE_MATH_HAVE_MATH_H)
4737     #define simde_math_logb(v) logb(v)
4738   #endif
4739 #endif
4740 
4741 #if !defined(simde_math_logbf)
4742   #if SIMDE_MATH_BUILTIN_LIBM(logbf)
4743     #define simde_math_logbf(v) __builtin_logbf(v)
4744   #elif defined(SIMDE_MATH_HAVE_CMATH)
4745     #define simde_math_logbf(v) std::logb(v)
4746   #elif defined(SIMDE_MATH_HAVE_MATH_H)
4747     #define simde_math_logbf(v) logbf(v)
4748   #endif
4749 #endif
4750 
4751 #if !defined(simde_math_log1p)
4752   #if SIMDE_MATH_BUILTIN_LIBM(log1p)
4753     #define simde_math_log1p(v) __builtin_log1p(v)
4754   #elif defined(SIMDE_MATH_HAVE_CMATH)
4755     #define simde_math_log1p(v) std::log1p(v)
4756   #elif defined(SIMDE_MATH_HAVE_MATH_H)
4757     #define simde_math_log1p(v) log1p(v)
4758   #endif
4759 #endif
4760 
4761 #if !defined(simde_math_log1pf)
4762   #if SIMDE_MATH_BUILTIN_LIBM(log1pf)
4763     #define simde_math_log1pf(v) __builtin_log1pf(v)
4764   #elif defined(SIMDE_MATH_HAVE_CMATH)
4765     #define simde_math_log1pf(v) std::log1p(v)
4766   #elif defined(SIMDE_MATH_HAVE_MATH_H)
4767     #define simde_math_log1pf(v) log1pf(v)
4768   #endif
4769 #endif
4770 
4771 #if !defined(simde_math_log2)
4772   #if SIMDE_MATH_BUILTIN_LIBM(log2)
4773     #define simde_math_log2(v) __builtin_log2(v)
4774   #elif defined(SIMDE_MATH_HAVE_CMATH)
4775     #define simde_math_log2(v) std::log2(v)
4776   #elif defined(SIMDE_MATH_HAVE_MATH_H)
4777     #define simde_math_log2(v) log2(v)
4778   #endif
4779 #endif
4780 
4781 #if !defined(simde_math_log2f)
4782   #if SIMDE_MATH_BUILTIN_LIBM(log2f)
4783     #define simde_math_log2f(v) __builtin_log2f(v)
4784   #elif defined(SIMDE_MATH_HAVE_CMATH)
4785     #define simde_math_log2f(v) std::log2(v)
4786   #elif defined(SIMDE_MATH_HAVE_MATH_H)
4787     #define simde_math_log2f(v) log2f(v)
4788   #endif
4789 #endif
4790 
4791 #if !defined(simde_math_log10)
4792   #if SIMDE_MATH_BUILTIN_LIBM(log10)
4793     #define simde_math_log10(v) __builtin_log10(v)
4794   #elif defined(SIMDE_MATH_HAVE_CMATH)
4795     #define simde_math_log10(v) std::log10(v)
4796   #elif defined(SIMDE_MATH_HAVE_MATH_H)
4797     #define simde_math_log10(v) log10(v)
4798   #endif
4799 #endif
4800 
4801 #if !defined(simde_math_log10f)
4802   #if SIMDE_MATH_BUILTIN_LIBM(log10f)
4803     #define simde_math_log10f(v) __builtin_log10f(v)
4804   #elif defined(SIMDE_MATH_HAVE_CMATH)
4805     #define simde_math_log10f(v) std::log10(v)
4806   #elif defined(SIMDE_MATH_HAVE_MATH_H)
4807     #define simde_math_log10f(v) log10f(v)
4808   #endif
4809 #endif
4810 
4811 #if !defined(simde_math_modf)
4812   #if SIMDE_MATH_BUILTIN_LIBM(modf)
4813     #define simde_math_modf(x, iptr) __builtin_modf(x, iptr)
4814   #elif defined(SIMDE_MATH_HAVE_CMATH)
4815     #define simde_math_modf(x, iptr) std::modf(x, iptr)
4816   #elif defined(SIMDE_MATH_HAVE_MATH_H)
4817     #define simde_math_modf(x, iptr) modf(x, iptr)
4818   #endif
4819 #endif
4820 
4821 #if !defined(simde_math_modff)
4822   #if SIMDE_MATH_BUILTIN_LIBM(modff)
4823     #define simde_math_modff(x, iptr) __builtin_modff(x, iptr)
4824   #elif defined(SIMDE_MATH_HAVE_CMATH)
4825     #define simde_math_modff(x, iptr) std::modf(x, iptr)
4826   #elif defined(SIMDE_MATH_HAVE_MATH_H)
4827     #define simde_math_modff(x, iptr) modff(x, iptr)
4828   #endif
4829 #endif
4830 
4831 #if !defined(simde_math_nearbyint)
4832   #if SIMDE_MATH_BUILTIN_LIBM(nearbyint)
4833     #define simde_math_nearbyint(v) __builtin_nearbyint(v)
4834   #elif defined(SIMDE_MATH_HAVE_CMATH)
4835     #define simde_math_nearbyint(v) std::nearbyint(v)
4836   #elif defined(SIMDE_MATH_HAVE_MATH_H)
4837     #define simde_math_nearbyint(v) nearbyint(v)
4838   #endif
4839 #endif
4840 
4841 #if !defined(simde_math_nearbyintf)
4842   #if SIMDE_MATH_BUILTIN_LIBM(nearbyintf)
4843     #define simde_math_nearbyintf(v) __builtin_nearbyintf(v)
4844   #elif defined(SIMDE_MATH_HAVE_CMATH)
4845     #define simde_math_nearbyintf(v) std::nearbyint(v)
4846   #elif defined(SIMDE_MATH_HAVE_MATH_H)
4847     #define simde_math_nearbyintf(v) nearbyintf(v)
4848   #endif
4849 #endif
4850 
4851 #if !defined(simde_math_pow)
4852   #if SIMDE_MATH_BUILTIN_LIBM(pow)
4853     #define simde_math_pow(y, x) __builtin_pow(y, x)
4854   #elif defined(SIMDE_MATH_HAVE_CMATH)
4855     #define simde_math_pow(y, x) std::pow(y, x)
4856   #elif defined(SIMDE_MATH_HAVE_MATH_H)
4857     #define simde_math_pow(y, x) pow(y, x)
4858   #endif
4859 #endif
4860 
4861 #if !defined(simde_math_powf)
4862   #if SIMDE_MATH_BUILTIN_LIBM(powf)
4863     #define simde_math_powf(y, x) __builtin_powf(y, x)
4864   #elif defined(SIMDE_MATH_HAVE_CMATH)
4865     #define simde_math_powf(y, x) std::pow(y, x)
4866   #elif defined(SIMDE_MATH_HAVE_MATH_H)
4867     #define simde_math_powf(y, x) powf(y, x)
4868   #endif
4869 #endif
4870 
4871 #if !defined(simde_math_rint)
4872   #if SIMDE_MATH_BUILTIN_LIBM(rint)
4873     #define simde_math_rint(v) __builtin_rint(v)
4874   #elif defined(SIMDE_MATH_HAVE_CMATH)
4875     #define simde_math_rint(v) std::rint(v)
4876   #elif defined(SIMDE_MATH_HAVE_MATH_H)
4877     #define simde_math_rint(v) rint(v)
4878   #endif
4879 #endif
4880 
4881 #if !defined(simde_math_rintf)
4882   #if SIMDE_MATH_BUILTIN_LIBM(rintf)
4883     #define simde_math_rintf(v) __builtin_rintf(v)
4884   #elif defined(SIMDE_MATH_HAVE_CMATH)
4885     #define simde_math_rintf(v) std::rint(v)
4886   #elif defined(SIMDE_MATH_HAVE_MATH_H)
4887     #define simde_math_rintf(v) rintf(v)
4888   #endif
4889 #endif
4890 
4891 #if !defined(simde_math_round)
4892   #if SIMDE_MATH_BUILTIN_LIBM(round)
4893     #define simde_math_round(v) __builtin_round(v)
4894   #elif defined(SIMDE_MATH_HAVE_CMATH)
4895     #define simde_math_round(v) std::round(v)
4896   #elif defined(SIMDE_MATH_HAVE_MATH_H)
4897     #define simde_math_round(v) round(v)
4898   #endif
4899 #endif
4900 
4901 #if !defined(simde_math_roundf)
4902   #if SIMDE_MATH_BUILTIN_LIBM(roundf)
4903     #define simde_math_roundf(v) __builtin_roundf(v)
4904   #elif defined(SIMDE_MATH_HAVE_CMATH)
4905     #define simde_math_roundf(v) std::round(v)
4906   #elif defined(SIMDE_MATH_HAVE_MATH_H)
4907     #define simde_math_roundf(v) roundf(v)
4908   #endif
4909 #endif
4910 
4911 #if !defined(simde_math_roundeven)
4912   #if \
4913       HEDLEY_HAS_BUILTIN(__builtin_roundeven) || \
4914       HEDLEY_GCC_VERSION_CHECK(10,0,0)
4915     #define simde_math_roundeven(v) __builtin_roundeven(v)
4916   #elif defined(simde_math_round) && defined(simde_math_fabs)
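    /* Portable fallback: round half away from zero first, then pull exact .5
     * ties that landed on an odd integer back to the even neighbour
     * (e.g. 2.5 -> 2.0, 3.5 -> 4.0, -2.5 -> -2.0). */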
4917     static HEDLEY_INLINE
4918     double
4919     simde_math_roundeven(double v) {
4920       double rounded = simde_math_round(v);
4921       double diff = rounded - v;
4922       if (HEDLEY_UNLIKELY(simde_math_fabs(diff) == 0.5) && (HEDLEY_STATIC_CAST(int64_t, rounded) & 1)) {
4923         rounded = v - diff;
4924       }
4925       return rounded;
4926     }
4927     #define simde_math_roundeven simde_math_roundeven
4928   #endif
4929 #endif
4930 
4931 #if !defined(simde_math_roundevenf)
4932   #if \
4933       HEDLEY_HAS_BUILTIN(__builtin_roundevenf) || \
4934       HEDLEY_GCC_VERSION_CHECK(10,0,0)
4935     #define simde_math_roundevenf(v) __builtin_roundevenf(v)
4936   #elif defined(simde_math_roundf) && defined(simde_math_fabsf)
4937     static HEDLEY_INLINE
4938     float
4939     simde_math_roundevenf(float v) {
4940       float rounded = simde_math_roundf(v);
4941       float diff = rounded - v;
4942       if (HEDLEY_UNLIKELY(simde_math_fabsf(diff) == 0.5f) && (HEDLEY_STATIC_CAST(int32_t, rounded) & 1)) {
4943         rounded = v - diff;
4944       }
4945       return rounded;
4946     }
4947     #define simde_math_roundevenf simde_math_roundevenf
4948   #endif
4949 #endif
4950 
4951 #if !defined(simde_math_sin)
4952   #if SIMDE_MATH_BUILTIN_LIBM(sin)
4953     #define simde_math_sin(v) __builtin_sin(v)
4954   #elif defined(SIMDE_MATH_HAVE_CMATH)
4955     #define simde_math_sin(v) std::sin(v)
4956   #elif defined(SIMDE_MATH_HAVE_MATH_H)
4957     #define simde_math_sin(v) sin(v)
4958   #endif
4959 #endif
4960 
4961 #if !defined(simde_math_sinf)
4962   #if SIMDE_MATH_BUILTIN_LIBM(sinf)
4963     #define simde_math_sinf(v) __builtin_sinf(v)
4964   #elif defined(SIMDE_MATH_HAVE_CMATH)
4965     #define simde_math_sinf(v) std::sin(v)
4966   #elif defined(SIMDE_MATH_HAVE_MATH_H)
4967     #define simde_math_sinf(v) sinf(v)
4968   #endif
4969 #endif
4970 
4971 #if !defined(simde_math_sinh)
4972   #if SIMDE_MATH_BUILTIN_LIBM(sinh)
4973     #define simde_math_sinh(v) __builtin_sinh(v)
4974   #elif defined(SIMDE_MATH_HAVE_CMATH)
4975     #define simde_math_sinh(v) std::sinh(v)
4976   #elif defined(SIMDE_MATH_HAVE_MATH_H)
4977     #define simde_math_sinh(v) sinh(v)
4978   #endif
4979 #endif
4980 
4981 #if !defined(simde_math_sinhf)
4982   #if SIMDE_MATH_BUILTIN_LIBM(sinhf)
4983     #define simde_math_sinhf(v) __builtin_sinhf(v)
4984   #elif defined(SIMDE_MATH_HAVE_CMATH)
4985     #define simde_math_sinhf(v) std::sinh(v)
4986   #elif defined(SIMDE_MATH_HAVE_MATH_H)
4987     #define simde_math_sinhf(v) sinhf(v)
4988   #endif
4989 #endif
4990 
4991 #if !defined(simde_math_sqrt)
4992   #if SIMDE_MATH_BUILTIN_LIBM(sqrt)
4993     #define simde_math_sqrt(v) __builtin_sqrt(v)
4994   #elif defined(SIMDE_MATH_HAVE_CMATH)
4995     #define simde_math_sqrt(v) std::sqrt(v)
4996   #elif defined(SIMDE_MATH_HAVE_MATH_H)
4997     #define simde_math_sqrt(v) sqrt(v)
4998   #endif
4999 #endif
5000 
5001 #if !defined(simde_math_sqrtf)
5002   #if SIMDE_MATH_BUILTIN_LIBM(sqrtf)
5003     #define simde_math_sqrtf(v) __builtin_sqrtf(v)
5004   #elif defined(SIMDE_MATH_HAVE_CMATH)
5005     #define simde_math_sqrtf(v) std::sqrt(v)
5006   #elif defined(SIMDE_MATH_HAVE_MATH_H)
5007     #define simde_math_sqrtf(v) sqrtf(v)
5008   #endif
5009 #endif
5010 
5011 #if !defined(simde_math_tan)
5012   #if SIMDE_MATH_BUILTIN_LIBM(tan)
5013     #define simde_math_tan(v) __builtin_tan(v)
5014   #elif defined(SIMDE_MATH_HAVE_CMATH)
5015     #define simde_math_tan(v) std::tan(v)
5016   #elif defined(SIMDE_MATH_HAVE_MATH_H)
5017     #define simde_math_tan(v) tan(v)
5018   #endif
5019 #endif
5020 
5021 #if !defined(simde_math_tanf)
5022   #if SIMDE_MATH_BUILTIN_LIBM(tanf)
5023     #define simde_math_tanf(v) __builtin_tanf(v)
5024   #elif defined(SIMDE_MATH_HAVE_CMATH)
5025     #define simde_math_tanf(v) std::tan(v)
5026   #elif defined(SIMDE_MATH_HAVE_MATH_H)
5027     #define simde_math_tanf(v) tanf(v)
5028   #endif
5029 #endif
5030 
5031 #if !defined(simde_math_tanh)
5032   #if SIMDE_MATH_BUILTIN_LIBM(tanh)
5033     #define simde_math_tanh(v) __builtin_tanh(v)
5034   #elif defined(SIMDE_MATH_HAVE_CMATH)
5035     #define simde_math_tanh(v) std::tanh(v)
5036   #elif defined(SIMDE_MATH_HAVE_MATH_H)
5037     #define simde_math_tanh(v) tanh(v)
5038   #endif
5039 #endif
5040 
5041 #if !defined(simde_math_tanhf)
5042   #if SIMDE_MATH_BUILTIN_LIBM(tanhf)
5043     #define simde_math_tanhf(v) __builtin_tanhf(v)
5044   #elif defined(SIMDE_MATH_HAVE_CMATH)
5045     #define simde_math_tanhf(v) std::tanh(v)
5046   #elif defined(SIMDE_MATH_HAVE_MATH_H)
5047     #define simde_math_tanhf(v) tanhf(v)
5048   #endif
5049 #endif
5050 
5051 #if !defined(simde_math_trunc)
5052   #if SIMDE_MATH_BUILTIN_LIBM(trunc)
5053     #define simde_math_trunc(v) __builtin_trunc(v)
5054   #elif defined(SIMDE_MATH_HAVE_CMATH)
5055     #define simde_math_trunc(v) std::trunc(v)
5056   #elif defined(SIMDE_MATH_HAVE_MATH_H)
5057     #define simde_math_trunc(v) trunc(v)
5058   #endif
5059 #endif
5060 
5061 #if !defined(simde_math_truncf)
5062   #if SIMDE_MATH_BUILTIN_LIBM(truncf)
5063     #define simde_math_truncf(v) __builtin_truncf(v)
5064   #elif defined(SIMDE_MATH_HAVE_CMATH)
5065     #define simde_math_truncf(v) std::trunc(v)
5066   #elif defined(SIMDE_MATH_HAVE_MATH_H)
5067     #define simde_math_truncf(v) truncf(v)
5068   #endif
5069 #endif
5070 
5071 /*** Comparison macros (which don't raise invalid errors) ***/
5072 
5073 #if defined(isunordered)
5074   #define simde_math_isunordered(x, y) isunordered(x, y)
5075 #elif HEDLEY_HAS_BUILTIN(__builtin_isunordered)
5076   #define simde_math_isunordered(x, y) __builtin_isunordered(x, y)
5077 #else
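  /* Fallback: NaN is the only value that compares unequal to itself, so the
   * operands are unordered exactly when at least one of them is NaN. */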
5078   static HEDLEY_INLINE
5079   int simde_math_isunordered(double x, double y) {
5080     return (x != y) && (x != x || y != y);
5081   }
5082   #define simde_math_isunordered simde_math_isunordered
5083 
5084   static HEDLEY_INLINE
5085   int simde_math_isunorderedf(float x, float y) {
5086     return (x != y) && (x != x || y != y);
5087   }
5088   #define simde_math_isunorderedf simde_math_isunorderedf
5089 #endif
5090 #if !defined(simde_math_isunorderedf)
5091   #define simde_math_isunorderedf simde_math_isunordered
5092 #endif
5093 
5094 /*** Additional functions not in libm ***/
5095 
5096 #if defined(simde_math_fabs) && defined(simde_math_sqrt) && defined(simde_math_exp)
5097   static HEDLEY_INLINE
5098   double
5099   simde_math_cdfnorm(double x) {
5100     /* https://www.johndcook.com/blog/cpp_phi/
5101     * Public Domain */
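    /* Computes the standard normal CDF via Phi(x) = 0.5 * (1 + erf(x / sqrt(2))),
     * with erf approximated by the Abramowitz & Stegun 7.1.26 polynomial
     * (maximum absolute error on the order of 1.5e-7). */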
5102     static const double a1 =  0.254829592;
5103     static const double a2 = -0.284496736;
5104     static const double a3 =  1.421413741;
5105     static const double a4 = -1.453152027;
5106     static const double a5 =  1.061405429;
5107     static const double p  =  0.3275911;
5108 
5109     const int sign = x < 0;
5110     x = simde_math_fabs(x) / simde_math_sqrt(2.0);
5111 
5112     /* A&S formula 7.1.26 */
5113     double t = 1.0 / (1.0 + p * x);
5114     double y = 1.0 - (((((a5 * t + a4) * t) + a3) * t + a2) * t + a1) * t * simde_math_exp(-x * x);
5115 
5116     return 0.5 * (1.0 + (sign ? -y : y));
5117   }
5118   #define simde_math_cdfnorm simde_math_cdfnorm
5119 #endif
5120 
5121 #if defined(simde_math_fabsf) && defined(simde_math_sqrtf) && defined(simde_math_expf)
5122   static HEDLEY_INLINE
5123   float
5124   simde_math_cdfnormf(float x) {
5125     /* https://www.johndcook.com/blog/cpp_phi/
5126     * Public Domain */
5127     static const float a1 =  0.254829592f;
5128     static const float a2 = -0.284496736f;
5129     static const float a3 =  1.421413741f;
5130     static const float a4 = -1.453152027f;
5131     static const float a5 =  1.061405429f;
5132     static const float p  =  0.3275911f;
5133 
5134     const int sign = x < 0;
5135     x = simde_math_fabsf(x) / simde_math_sqrtf(2.0f);
5136 
5137     /* A&S formula 7.1.26 */
5138     float t = 1.0f / (1.0f + p * x);
5139     float y = 1.0f - (((((a5 * t + a4) * t) + a3) * t + a2) * t + a1) * t * simde_math_expf(-x * x);
5140 
5141     return 0.5f * (1.0f + (sign ? -y : y));
5142   }
5143   #define simde_math_cdfnormf simde_math_cdfnormf
5144 #endif
5145 
5146 HEDLEY_DIAGNOSTIC_PUSH
5147 SIMDE_DIAGNOSTIC_DISABLE_FLOAT_EQUAL_
5148 
5149 #if !defined(simde_math_cdfnorminv) && defined(simde_math_log) && defined(simde_math_sqrt)
5150   /*https://web.archive.org/web/20150910081113/http://home.online.no/~pjacklam/notes/invnorm/impl/sprouse/ltqnorm.c*/
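  /* Peter Acklam's rational approximation to the inverse of the standard
   * normal CDF: one rational polynomial in r = q * q for the central region
   * (low < p < high) and another in q = sqrt(-2 log p) (or sqrt(-2 log(1 - p)))
   * for the tails; the relative error is on the order of 1e-9. */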
5151   static HEDLEY_INLINE
5152   double
5153   simde_math_cdfnorminv(double p) {
5154     static const double a[] = {
5155       -3.969683028665376e+01,
5156        2.209460984245205e+02,
5157       -2.759285104469687e+02,
5158        1.383577518672690e+02,
5159       -3.066479806614716e+01,
5160        2.506628277459239e+00
5161     };
5162 
5163     static const double b[] = {
5164       -5.447609879822406e+01,
5165        1.615858368580409e+02,
5166       -1.556989798598866e+02,
5167        6.680131188771972e+01,
5168       -1.328068155288572e+01
5169     };
5170 
5171     static const double c[] = {
5172       -7.784894002430293e-03,
5173       -3.223964580411365e-01,
5174       -2.400758277161838e+00,
5175       -2.549732539343734e+00,
5176        4.374664141464968e+00,
5177        2.938163982698783e+00
5178     };
5179 
5180     static const double d[] = {
5181       7.784695709041462e-03,
5182       3.224671290700398e-01,
5183       2.445134137142996e+00,
5184       3.754408661907416e+00
5185     };
5186 
5187     static const double low  = 0.02425;
5188     static const double high = 0.97575;
5189     double q, r;
5190 
5191     if (p < 0 || p > 1) {
5192       return 0.0;
5193     } else if (p == 0) {
5194       return -SIMDE_MATH_INFINITY;
5195     } else if (p == 1) {
5196       return SIMDE_MATH_INFINITY;
5197     } else if (p < low) {
5198       q = simde_math_sqrt(-2.0 * simde_math_log(p));
5199       return
5200         (((((c[0] * q + c[1]) * q + c[2]) * q + c[3]) * q + c[4]) * q + c[5]) /
5201         (((((d[0] * q + d[1]) * q + d[2]) * q + d[3]) * q + 1));
5202     } else if (p > high) {
5203       q = simde_math_sqrt(-2.0 * simde_math_log(1.0 - p));
5204       return
5205         -(((((c[0] * q + c[1]) * q + c[2]) * q + c[3]) * q + c[4]) * q + c[5]) /
5206          (((((d[0] * q + d[1]) * q + d[2]) * q + d[3]) * q + 1));
5207     } else {
5208       q = p - 0.5;
5209       r = q * q;
5210       return (((((a[0] * r + a[1]) * r + a[2]) * r + a[3]) * r + a[4]) * r + a[5]) *
5211         q / (((((b[0] * r + b[1]) * r + b[2]) * r + b[3]) * r + b[4]) * r + 1);
5212     }
5213   }
5214   #define simde_math_cdfnorminv simde_math_cdfnorminv
5215 #endif
5216 
5217 #if !defined(simde_math_cdfnorminvf) && defined(simde_math_logf) && defined(simde_math_sqrtf)
5218   static HEDLEY_INLINE
5219   float
5220   simde_math_cdfnorminvf(float p) {
5221     static const float a[] = {
5222       -3.969683028665376e+01f,
5223        2.209460984245205e+02f,
5224       -2.759285104469687e+02f,
5225        1.383577518672690e+02f,
5226       -3.066479806614716e+01f,
5227        2.506628277459239e+00f
5228     };
5229     static const float b[] = {
5230       -5.447609879822406e+01f,
5231        1.615858368580409e+02f,
5232       -1.556989798598866e+02f,
5233        6.680131188771972e+01f,
5234       -1.328068155288572e+01f
5235     };
5236     static const float c[] = {
5237       -7.784894002430293e-03f,
5238       -3.223964580411365e-01f,
5239       -2.400758277161838e+00f,
5240       -2.549732539343734e+00f,
5241        4.374664141464968e+00f,
5242        2.938163982698783e+00f
5243     };
5244     static const float d[] = {
5245       7.784695709041462e-03f,
5246       3.224671290700398e-01f,
5247       2.445134137142996e+00f,
5248       3.754408661907416e+00f
5249     };
5250     static const float low  = 0.02425f;
5251     static const float high = 0.97575f;
5252     float q, r;
5253 
5254     if (p < 0 || p > 1) {
5255       return 0.0f;
5256     } else if (p == 0) {
5257       return -SIMDE_MATH_INFINITYF;
5258     } else if (p == 1) {
5259       return SIMDE_MATH_INFINITYF;
5260     } else if (p < low) {
5261       q = simde_math_sqrtf(-2.0f * simde_math_logf(p));
5262       return
5263         (((((c[0] * q + c[1]) * q + c[2]) * q + c[3]) * q + c[4]) * q + c[5]) /
5264         (((((d[0] * q + d[1]) * q + d[2]) * q + d[3]) * q + 1));
5265     } else if (p > high) {
5266       q = simde_math_sqrtf(-2.0f * simde_math_logf(1.0f - p));
5267       return
5268         -(((((c[0] * q + c[1]) * q + c[2]) * q + c[3]) * q + c[4]) * q + c[5]) /
5269          (((((d[0] * q + d[1]) * q + d[2]) * q + d[3]) * q + 1));
5270     } else {
5271       q = p - 0.5f;
5272       r = q * q;
5273       return (((((a[0] * r + a[1]) * r + a[2]) * r + a[3]) * r + a[4]) * r + a[5]) *
5274          q / (((((b[0] * r + b[1]) * r + b[2]) * r + b[3]) * r + b[4]) * r + 1);
5275     }
5276   }
5277   #define simde_math_cdfnorminvf simde_math_cdfnorminvf
5278 #endif
5279 
5280 #if !defined(simde_math_erfinv) && defined(simde_math_log) && defined(simde_math_copysign) && defined(simde_math_sqrt)
5281   static HEDLEY_INLINE
5282   double
5283   simde_math_erfinv(double x) {
5284     /* https://stackoverflow.com/questions/27229371/inverse-error-function-in-c
5285      *
5286      * The original answer on SO uses a constant of 0.147, but in my
5287      * testing 0.14829094707965850830078125 gives a lower average absolute error
5288      * (0.0001410958211636170744895935 vs. 0.0001465479290345683693885803).
5289      * That said, if your goal is to minimize the *maximum* absolute
5290      * error, 0.15449436008930206298828125 provides significantly better
5291      * results; 0.0009250640869140625000000000 vs ~ 0.005. */
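    /* With a = 0.14829094707965850830078125 and u = log(1 - x*x) this is the
     * closed-form (Winitzki-style) approximation
     *   erfinv(x) ~= sgn(x) * sqrt(sqrt((2/(pi*a) + u/2)^2 - u/a) - (2/(pi*a) + u/2)),
     * computed below with tt1 = 2/(pi*a) + u/2 and tt2 = u/a. */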
5292     double tt1, tt2, lnx;
5293     double sgn = simde_math_copysign(1.0, x);
5294 
5295     x = (1.0 - x) * (1.0 + x);
5296     lnx = simde_math_log(x);
5297 
5298     tt1 = 2.0 / (SIMDE_MATH_PI * 0.14829094707965850830078125) + 0.5 * lnx;
5299     tt2 = (1.0 / 0.14829094707965850830078125) * lnx;
5300 
5301     return sgn * simde_math_sqrt(-tt1 + simde_math_sqrt(tt1 * tt1 - tt2));
5302   }
5303   #define simde_math_erfinv simde_math_erfinv
5304 #endif
5305 
5306 #if !defined(simde_math_erfinvf) && defined(simde_math_logf) && defined(simde_math_copysignf) && defined(simde_math_sqrtf)
5307   static HEDLEY_INLINE
5308   float
5309   simde_math_erfinvf(float x) {
5310     float tt1, tt2, lnx;
5311     float sgn = simde_math_copysignf(1.0f, x);
5312 
5313     x = (1.0f - x) * (1.0f + x);
5314     lnx = simde_math_logf(x);
5315 
5316     tt1 = 2.0f / (SIMDE_MATH_PIF * 0.14829094707965850830078125f) + 0.5f * lnx;
5317     tt2 = (1.0f / 0.14829094707965850830078125f) * lnx;
5318 
5319     return sgn * simde_math_sqrtf(-tt1 + simde_math_sqrtf(tt1 * tt1 - tt2));
5320   }
5321   #define simde_math_erfinvf simde_math_erfinvf
5322 #endif
5323 
5324 #if !defined(simde_math_erfcinv) && defined(simde_math_erfinv) && defined(simde_math_log) && defined(simde_math_sqrt)
5325   static HEDLEY_INLINE
5326   double
5327   simde_math_erfcinv(double x) {
5328     if(x >= 0.0625 && x < 2.0) {
5329       return simde_math_erfinv(1.0 - x);
5330     } else if (x < 0.0625 && x >= 1.0e-100) {
5331       double p[6] = {
5332         0.1550470003116,
5333         1.382719649631,
5334         0.690969348887,
5335         -1.128081391617,
5336         0.680544246825,
5337         -0.16444156791
5338       };
5339       double q[3] = {
5340         0.155024849822,
5341         1.385228141995,
5342         1.000000000000
5343       };
5344 
5345       const double t = 1.0 / simde_math_sqrt(-simde_math_log(x));
5346       return (p[0] / t + p[1] + t * (p[2] + t * (p[3] + t * (p[4] + t * p[5])))) /
5347             (q[0] + t * (q[1] + t * (q[2])));
5348     } else if (x < 1.0e-100 && x >= SIMDE_MATH_DBL_MIN) {
5349       double p[4] = {
5350         0.00980456202915,
5351         0.363667889171,
5352         0.97302949837,
5353         -0.5374947401
5354       };
5355       double q[3] = {
5356         0.00980451277802,
5357         0.363699971544,
5358         1.000000000000
5359       };
5360 
5361       const double t = 1.0 / simde_math_sqrt(-simde_math_log(x));
5362       return (p[0] / t + p[1] + t * (p[2] + t * p[3])) /
5363              (q[0] + t * (q[1] + t * (q[2])));
5364     } else if (!simde_math_isnormal(x)) {
5365       return SIMDE_MATH_INFINITY;
5366     } else {
5367       return -SIMDE_MATH_INFINITY;
5368     }
5369   }
5370 
5371   #define simde_math_erfcinv simde_math_erfcinv
5372 #endif
5373 
5374 #if !defined(simde_math_erfcinvf) && defined(simde_math_erfinvf) && defined(simde_math_logf) && defined(simde_math_sqrtf)
5375   static HEDLEY_INLINE
5376   float
5377   simde_math_erfcinvf(float x) {
5378     if(x >= 0.0625f && x < 2.0f) {
5379       return simde_math_erfinvf(1.0f - x);
5380     } else if (x < 0.0625f && x >= SIMDE_MATH_FLT_MIN) {
5381       static const float p[6] = {
5382          0.1550470003116f,
5383          1.382719649631f,
5384          0.690969348887f,
5385         -1.128081391617f,
5386          0.680544246825f,
5387         -0.164441567910f
5388       };
5389       static const float q[3] = {
5390         0.155024849822f,
5391         1.385228141995f,
5392         1.000000000000f
5393       };
5394 
5395       const float t = 1.0f / simde_math_sqrtf(-simde_math_logf(x));
5396       return (p[0] / t + p[1] + t * (p[2] + t * (p[3] + t * (p[4] + t * p[5])))) /
5397              (q[0] + t * (q[1] + t * (q[2])));
5398     } else if (x < SIMDE_MATH_FLT_MIN && simde_math_isnormalf(x)) {
5399       static const float p[4] = {
5400         0.00980456202915f,
5401         0.36366788917100f,
5402         0.97302949837000f,
5403         -0.5374947401000f
5404       };
5405       static const float q[3] = {
5406         0.00980451277802f,
5407         0.36369997154400f,
5408         1.00000000000000f
5409       };
5410 
5411       const float t = 1.0f / simde_math_sqrtf(-simde_math_logf(x));
5412       return (p[0] / t + p[1] + t * (p[2] + t * p[3])) /
5413              (q[0] + t * (q[1] + t * (q[2])));
5414     } else {
5415       return simde_math_isnormalf(x) ? -SIMDE_MATH_INFINITYF : SIMDE_MATH_INFINITYF;
5416     }
5417   }
5418 
5419   #define simde_math_erfcinvf simde_math_erfcinvf
5420 #endif
5421 
5422 HEDLEY_DIAGNOSTIC_POP
5423 
5424 static HEDLEY_INLINE
5425 double
5426 simde_math_rad2deg(double radians) {
5427   return radians * SIMDE_MATH_180_OVER_PI;
5428 }
5429 
5430 static HEDLEY_INLINE
5431 float
5432 simde_math_rad2degf(float radians) {
5433   return radians * SIMDE_MATH_180_OVER_PIF;
5434 }
5435 
5436 static HEDLEY_INLINE
5437 double
5438 simde_math_deg2rad(double degrees) {
5439   return degrees * SIMDE_MATH_PI_OVER_180;
5440 }
5441 
5442 static HEDLEY_INLINE
5443 float
5444 simde_math_deg2radf(float degrees) {
5445   return degrees * SIMDE_MATH_PI_OVER_180F;
5446 }
5447 
5448 /***  Saturated arithmetic ***/
5449 
5450 static HEDLEY_INLINE
5451 int8_t
5452 simde_math_adds_i8(int8_t a, int8_t b) {
5453   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
5454     return vqaddb_s8(a, b);
5455   #else
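    /* Portable branch: compute the wrapping sum in unsigned arithmetic and
     * build the saturation value from the sign of 'a' (INT8_MAX when a >= 0,
     * the bit pattern of INT8_MIN when a < 0); the sign test below detects
     * overflow (a and b share a sign that the sum does not) and substitutes
     * the saturation value. */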
5456     uint8_t a_ = HEDLEY_STATIC_CAST(uint8_t, a);
5457     uint8_t b_ = HEDLEY_STATIC_CAST(uint8_t, b);
5458     uint8_t r_ = a_ + b_;
5459 
5460     a_ = (a_ >> ((8 * sizeof(r_)) - 1)) + INT8_MAX;
5461     if (HEDLEY_STATIC_CAST(int8_t, ((a_ ^ b_) | ~(b_ ^ r_))) >= 0) {
5462       r_ = a_;
5463     }
5464 
5465     return HEDLEY_STATIC_CAST(int8_t, r_);
5466   #endif
5467 }
5468 
5469 static HEDLEY_INLINE
5470 int16_t
5471 simde_math_adds_i16(int16_t a, int16_t b) {
5472   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
5473     return vqaddh_s16(a, b);
5474   #else
5475     uint16_t a_ = HEDLEY_STATIC_CAST(uint16_t, a);
5476     uint16_t b_ = HEDLEY_STATIC_CAST(uint16_t, b);
5477     uint16_t r_ = a_ + b_;
5478 
5479     a_ = (a_ >> ((8 * sizeof(r_)) - 1)) + INT16_MAX;
5480     if (HEDLEY_STATIC_CAST(int16_t, ((a_ ^ b_) | ~(b_ ^ r_))) >= 0) {
5481       r_ = a_;
5482     }
5483 
5484     return HEDLEY_STATIC_CAST(int16_t, r_);
5485   #endif
5486 }
5487 
5488 static HEDLEY_INLINE
5489 int32_t
5490 simde_math_adds_i32(int32_t a, int32_t b) {
5491   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
5492     return vqadds_s32(a, b);
5493   #else
5494     uint32_t a_ = HEDLEY_STATIC_CAST(uint32_t, a);
5495     uint32_t b_ = HEDLEY_STATIC_CAST(uint32_t, b);
5496     uint32_t r_ = a_ + b_;
5497 
5498     a_ = (a_ >> ((8 * sizeof(r_)) - 1)) + INT32_MAX;
5499     if (HEDLEY_STATIC_CAST(int32_t, ((a_ ^ b_) | ~(b_ ^ r_))) >= 0) {
5500       r_ = a_;
5501     }
5502 
5503     return HEDLEY_STATIC_CAST(int32_t, r_);
5504   #endif
5505 }
5506 
5507 static HEDLEY_INLINE
5508 int64_t
5509 simde_math_adds_i64(int64_t a, int64_t b) {
5510   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
5511     return vqaddd_s64(a, b);
5512   #else
5513     uint64_t a_ = HEDLEY_STATIC_CAST(uint64_t, a);
5514     uint64_t b_ = HEDLEY_STATIC_CAST(uint64_t, b);
5515     uint64_t r_ = a_ + b_;
5516 
5517     a_ = (a_ >> ((8 * sizeof(r_)) - 1)) + INT64_MAX;
5518     if (HEDLEY_STATIC_CAST(int64_t, ((a_ ^ b_) | ~(b_ ^ r_))) >= 0) {
5519       r_ = a_;
5520     }
5521 
5522     return HEDLEY_STATIC_CAST(int64_t, r_);
5523   #endif
5524 }
5525 
5526 static HEDLEY_INLINE
5527 uint8_t
5528 simde_math_adds_u8(uint8_t a, uint8_t b) {
5529   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
5530     return vqaddb_u8(a, b);
5531   #else
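    /* Portable branch: if the sum wrapped around then r < a, so -(r < a) is
     * all ones and OR-ing it in saturates the result to UINT8_MAX. */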
5532     uint8_t r = a + b;
5533     r |= -(r < a);
5534     return r;
5535   #endif
5536 }
5537 
5538 static HEDLEY_INLINE
5539 uint16_t
5540 simde_math_adds_u16(uint16_t a, uint16_t b) {
5541   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
5542     return vqaddh_u16(a, b);
5543   #else
5544     uint16_t r = a + b;
5545     r |= -(r < a);
5546     return r;
5547   #endif
5548 }
5549 
5550 static HEDLEY_INLINE
5551 uint32_t
5552 simde_math_adds_u32(uint32_t a, uint32_t b) {
5553   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
5554     return vqadds_u32(a, b);
5555   #else
5556     uint32_t r = a + b;
5557     r |= -(r < a);
5558     return r;
5559   #endif
5560 }
5561 
5562 static HEDLEY_INLINE
5563 uint64_t
5564 simde_math_adds_u64(uint64_t a, uint64_t b) {
5565   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
5566     return vqaddd_u64(a, b);
5567   #else
5568     uint64_t r = a + b;
5569     r |= -(r < a);
5570     return r;
5571   #endif
5572 }
5573 
5574 static HEDLEY_INLINE
5575 int8_t
5576 simde_math_subs_i8(int8_t a, int8_t b) {
5577   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
5578     return vqsubb_s8(a, b);
5579   #else
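    /* Portable branch: compute the wrapping difference in unsigned arithmetic
     * and build the saturation value from the sign of 'a'; the sign test below
     * detects overflow (a and b differ in sign and the result's sign differs
     * from a) and substitutes the saturation value. */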
5580     uint8_t a_ = HEDLEY_STATIC_CAST(uint8_t, a);
5581     uint8_t b_ = HEDLEY_STATIC_CAST(uint8_t, b);
5582     uint8_t r_ = a_ - b_;
5583 
5584     a_ = (a_ >> 7) + INT8_MAX;
5585 
5586     if (HEDLEY_STATIC_CAST(int8_t, (a_ ^ b_) & (a_ ^ r_)) < 0) {
5587       r_ = a_;
5588     }
5589 
5590     return HEDLEY_STATIC_CAST(int8_t, r_);
5591   #endif
5592 }
5593 
5594 static HEDLEY_INLINE
5595 int16_t
5596 simde_math_subs_i16(int16_t a, int16_t b) {
5597   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
5598     return vqsubh_s16(a, b);
5599   #else
5600     uint16_t a_ = HEDLEY_STATIC_CAST(uint16_t, a);
5601     uint16_t b_ = HEDLEY_STATIC_CAST(uint16_t, b);
5602     uint16_t r_ = a_ - b_;
5603 
5604     a_ = (a_ >> 15) + INT16_MAX;
5605 
5606     if (HEDLEY_STATIC_CAST(int16_t, (a_ ^ b_) & (a_ ^ r_)) < 0) {
5607       r_ = a_;
5608     }
5609 
5610     return HEDLEY_STATIC_CAST(int16_t, r_);
5611   #endif
5612 }
5613 
5614 static HEDLEY_INLINE
5615 int32_t
5616 simde_math_subs_i32(int32_t a, int32_t b) {
5617   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
5618     return vqsubs_s32(a, b);
5619   #else
5620     uint32_t a_ = HEDLEY_STATIC_CAST(uint32_t, a);
5621     uint32_t b_ = HEDLEY_STATIC_CAST(uint32_t, b);
5622     uint32_t r_ = a_ - b_;
5623 
5624     a_ = (a_ >> 31) + INT32_MAX;
5625 
5626     if (HEDLEY_STATIC_CAST(int32_t, (a_ ^ b_) & (a_ ^ r_)) < 0) {
5627       r_ = a_;
5628     }
5629 
5630     return HEDLEY_STATIC_CAST(int32_t, r_);
5631   #endif
5632 }
5633 
5634 static HEDLEY_INLINE
5635 int64_t
5636 simde_math_subs_i64(int64_t a, int64_t b) {
5637   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
5638     return vqsubd_s64(a, b);
5639   #else
5640     uint64_t a_ = HEDLEY_STATIC_CAST(uint64_t, a);
5641     uint64_t b_ = HEDLEY_STATIC_CAST(uint64_t, b);
5642     uint64_t r_ = a_ - b_;
5643 
5644     a_ = (a_ >> 63) + INT64_MAX;
5645 
5646     if (HEDLEY_STATIC_CAST(int64_t, (a_ ^ b_) & (a_ ^ r_)) < 0) {
5647       r_ = a_;
5648     }
5649 
5650     return HEDLEY_STATIC_CAST(int64_t, r_);
5651   #endif
5652 }
5653 
5654 static HEDLEY_INLINE
5655 uint8_t
5656 simde_math_subs_u8(uint8_t a, uint8_t b) {
5657   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
5658     return vqsubb_u8(a, b);
5659   #else
5660     uint8_t res = a - b;
5661     res &= -(res <= a);
5662     return res;
5663   #endif
5664 }
5665 
5666 static HEDLEY_INLINE
5667 uint16_t
5668 simde_math_subs_u16(uint16_t a, uint16_t b) {
5669   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
5670     return vqsubh_u16(a, b);
5671   #else
5672     uint16_t res = a - b;
5673     res &= -(res <= a);
5674     return res;
5675   #endif
5676 }
5677 
5678 static HEDLEY_INLINE
5679 uint32_t
5680 simde_math_subs_u32(uint32_t a, uint32_t b) {
5681   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
5682     return vqsubs_u32(a, b);
5683   #else
5684     uint32_t res = a - b;
5685     res &= -(res <= a);
5686     return res;
5687   #endif
5688 }
5689 
5690 static HEDLEY_INLINE
5691 uint64_t
5692 simde_math_subs_u64(uint64_t a, uint64_t b) {
5693   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
5694     return vqsubd_u64(a, b);
5695   #else
5696     uint64_t res = a - b;
5697     res &= -(res <= a);
5698     return res;
5699   #endif
5700 }
5701 
5702 HEDLEY_DIAGNOSTIC_POP
5703 
5704 #endif /* !defined(SIMDE_MATH_H) */
5705 /* :: End ../simde/simde/simde-math.h :: */
5706 /* AUTOMATICALLY GENERATED FILE, DO NOT MODIFY */
5707 /* e8b7a2ec175ceb3725ce0827ef9a6725b6309cc9 */
5708 /* :: Begin ../simde/simde/simde-constify.h :: */
5709 /* SPDX-License-Identifier: MIT
5710  *
5711  * Permission is hereby granted, free of charge, to any person
5712  * obtaining a copy of this software and associated documentation
5713  * files (the "Software"), to deal in the Software without
5714  * restriction, including without limitation the rights to use, copy,
5715  * modify, merge, publish, distribute, sublicense, and/or sell copies
5716  * of the Software, and to permit persons to whom the Software is
5717  * furnished to do so, subject to the following conditions:
5718  *
5719  * The above copyright notice and this permission notice shall be
5720  * included in all copies or substantial portions of the Software.
5721  *
5722  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
5723  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
5724  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
5725  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
5726  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
5727  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
5728  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
5729  * SOFTWARE.
5730  *
5731  * Copyright:
5732  *   2020      Evan Nemerson <evan@nemerson.com>
5733  */
5734 
5735 /* Constify macros.  For internal use only.
5736  *
5737  * These are used to make it possible to call a function which takes
5738  * an Integer Constant Expression (ICE) using a compile time constant.
5739  * Technically it would also be possible to use a value not trivially
5740  * known by the compiler, but there would be a significant performance
5741  * hit (a switch statement is used).
5742  *
5743  * The basic idea is pretty simple; we just emit a do while loop which
5744  * contains a switch with a case for every possible value of the
5745  * constant.
5746  *
5747  * As long as the value you pass to the function is constant, pretty
5748  * much any compiler shouldn't have a problem generating exactly the
5749  * same code as if you had used an ICE.
5750  *
5751  * This is intended to be used in the SIMDe implementations of
5752  * functions whose arguments compilers require to be an ICE, but the other benefit
5753  * is that if we also disable the warnings from
5754  * SIMDE_REQUIRE_CONSTANT_RANGE we can actually just allow the tests
5755  * to use non-ICE parameters.
5756  */
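/* A minimal usage sketch (example_rot_n is a hypothetical function
 * whose final argument must be an integer constant expression; it is
 * not part of SIMDe):
 *
 *   int32_t example_rot (int32_t value, int amount) {
 *     int32_t r;
 *     SIMDE_CONSTIFY_4_(example_rot_n, r, value, amount, value);
 *     return r;
 *   }
 *
 * This expands to a switch on amount with r = example_rot_n(value, 0),
 * r = example_rot_n(value, 1), ... as the cases and r = value as the
 * default, so each call site passes a true ICE. */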
5757 
5758 #if !defined(SIMDE_CONSTIFY_H)
5759 #define SIMDE_CONSTIFY_H
5760 
5761 /* AUTOMATICALLY GENERATED FILE, DO NOT MODIFY */
5762 /* e8b7a2ec175ceb3725ce0827ef9a6725b6309cc9 */
5763 
5764 HEDLEY_DIAGNOSTIC_PUSH
5765 SIMDE_DIAGNOSTIC_DISABLE_VARIADIC_MACROS_
5766 SIMDE_DIAGNOSTIC_DISABLE_CPP98_COMPAT_PEDANTIC_
5767 
5768 #define SIMDE_CONSTIFY_2_(func_name, result, default_case, imm, ...) \
5769   do { \
5770     switch(imm) { \
5771       case 0: result = func_name(__VA_ARGS__, 0); break; \
5772       case 1: result = func_name(__VA_ARGS__, 1); break; \
5773       default: result = default_case; break; \
5774     } \
5775   } while (0)
5776 
5777 #define SIMDE_CONSTIFY_4_(func_name, result, default_case, imm, ...) \
5778   do { \
5779     switch(imm) { \
5780       case 0: result = func_name(__VA_ARGS__, 0); break; \
5781       case 1: result = func_name(__VA_ARGS__, 1); break; \
5782       case 2: result = func_name(__VA_ARGS__, 2); break; \
5783       case 3: result = func_name(__VA_ARGS__, 3); break; \
5784       default: result = default_case; break; \
5785     } \
5786   } while (0)
5787 
5788 #define SIMDE_CONSTIFY_8_(func_name, result, default_case, imm, ...) \
5789   do { \
5790     switch(imm) { \
5791       case 0: result = func_name(__VA_ARGS__, 0); break; \
5792       case 1: result = func_name(__VA_ARGS__, 1); break; \
5793       case 2: result = func_name(__VA_ARGS__, 2); break; \
5794       case 3: result = func_name(__VA_ARGS__, 3); break; \
5795       case 4: result = func_name(__VA_ARGS__, 4); break; \
5796       case 5: result = func_name(__VA_ARGS__, 5); break; \
5797       case 6: result = func_name(__VA_ARGS__, 6); break; \
5798       case 7: result = func_name(__VA_ARGS__, 7); break; \
5799       default: result = default_case; break; \
5800     } \
5801   } while (0)
5802 
5803 #define SIMDE_CONSTIFY_16_(func_name, result, default_case, imm, ...) \
5804   do { \
5805     switch(imm) { \
5806       case  0: result = func_name(__VA_ARGS__,  0); break; \
5807       case  1: result = func_name(__VA_ARGS__,  1); break; \
5808       case  2: result = func_name(__VA_ARGS__,  2); break; \
5809       case  3: result = func_name(__VA_ARGS__,  3); break; \
5810       case  4: result = func_name(__VA_ARGS__,  4); break; \
5811       case  5: result = func_name(__VA_ARGS__,  5); break; \
5812       case  6: result = func_name(__VA_ARGS__,  6); break; \
5813       case  7: result = func_name(__VA_ARGS__,  7); break; \
5814       case  8: result = func_name(__VA_ARGS__,  8); break; \
5815       case  9: result = func_name(__VA_ARGS__,  9); break; \
5816       case 10: result = func_name(__VA_ARGS__, 10); break; \
5817       case 11: result = func_name(__VA_ARGS__, 11); break; \
5818       case 12: result = func_name(__VA_ARGS__, 12); break; \
5819       case 13: result = func_name(__VA_ARGS__, 13); break; \
5820       case 14: result = func_name(__VA_ARGS__, 14); break; \
5821       case 15: result = func_name(__VA_ARGS__, 15); break; \
5822       default: result = default_case; break; \
5823     } \
5824   } while (0)
5825 
5826 #define SIMDE_CONSTIFY_32_(func_name, result, default_case, imm, ...) \
5827   do { \
5828     switch(imm) { \
5829       case  0: result = func_name(__VA_ARGS__,  0); break; \
5830       case  1: result = func_name(__VA_ARGS__,  1); break; \
5831       case  2: result = func_name(__VA_ARGS__,  2); break; \
5832       case  3: result = func_name(__VA_ARGS__,  3); break; \
5833       case  4: result = func_name(__VA_ARGS__,  4); break; \
5834       case  5: result = func_name(__VA_ARGS__,  5); break; \
5835       case  6: result = func_name(__VA_ARGS__,  6); break; \
5836       case  7: result = func_name(__VA_ARGS__,  7); break; \
5837       case  8: result = func_name(__VA_ARGS__,  8); break; \
5838       case  9: result = func_name(__VA_ARGS__,  9); break; \
5839       case 10: result = func_name(__VA_ARGS__, 10); break; \
5840       case 11: result = func_name(__VA_ARGS__, 11); break; \
5841       case 12: result = func_name(__VA_ARGS__, 12); break; \
5842       case 13: result = func_name(__VA_ARGS__, 13); break; \
5843       case 14: result = func_name(__VA_ARGS__, 14); break; \
5844       case 15: result = func_name(__VA_ARGS__, 15); break; \
5845       case 16: result = func_name(__VA_ARGS__, 16); break; \
5846       case 17: result = func_name(__VA_ARGS__, 17); break; \
5847       case 18: result = func_name(__VA_ARGS__, 18); break; \
5848       case 19: result = func_name(__VA_ARGS__, 19); break; \
5849       case 20: result = func_name(__VA_ARGS__, 20); break; \
5850       case 21: result = func_name(__VA_ARGS__, 21); break; \
5851       case 22: result = func_name(__VA_ARGS__, 22); break; \
5852       case 23: result = func_name(__VA_ARGS__, 23); break; \
5853       case 24: result = func_name(__VA_ARGS__, 24); break; \
5854       case 25: result = func_name(__VA_ARGS__, 25); break; \
5855       case 26: result = func_name(__VA_ARGS__, 26); break; \
5856       case 27: result = func_name(__VA_ARGS__, 27); break; \
5857       case 28: result = func_name(__VA_ARGS__, 28); break; \
5858       case 29: result = func_name(__VA_ARGS__, 29); break; \
5859       case 30: result = func_name(__VA_ARGS__, 30); break; \
5860       case 31: result = func_name(__VA_ARGS__, 31); break; \
5861       default: result = default_case; break; \
5862     } \
5863   } while (0)
5864 
5865 #define SIMDE_CONSTIFY_64_(func_name, result, default_case, imm, ...) \
5866   do { \
5867     switch(imm) { \
5868       case  0: result = func_name(__VA_ARGS__,  0); break; \
5869       case  1: result = func_name(__VA_ARGS__,  1); break; \
5870       case  2: result = func_name(__VA_ARGS__,  2); break; \
5871       case  3: result = func_name(__VA_ARGS__,  3); break; \
5872       case  4: result = func_name(__VA_ARGS__,  4); break; \
5873       case  5: result = func_name(__VA_ARGS__,  5); break; \
5874       case  6: result = func_name(__VA_ARGS__,  6); break; \
5875       case  7: result = func_name(__VA_ARGS__,  7); break; \
5876       case  8: result = func_name(__VA_ARGS__,  8); break; \
5877       case  9: result = func_name(__VA_ARGS__,  9); break; \
5878       case 10: result = func_name(__VA_ARGS__, 10); break; \
5879       case 11: result = func_name(__VA_ARGS__, 11); break; \
5880       case 12: result = func_name(__VA_ARGS__, 12); break; \
5881       case 13: result = func_name(__VA_ARGS__, 13); break; \
5882       case 14: result = func_name(__VA_ARGS__, 14); break; \
5883       case 15: result = func_name(__VA_ARGS__, 15); break; \
5884       case 16: result = func_name(__VA_ARGS__, 16); break; \
5885       case 17: result = func_name(__VA_ARGS__, 17); break; \
5886       case 18: result = func_name(__VA_ARGS__, 18); break; \
5887       case 19: result = func_name(__VA_ARGS__, 19); break; \
5888       case 20: result = func_name(__VA_ARGS__, 20); break; \
5889       case 21: result = func_name(__VA_ARGS__, 21); break; \
5890       case 22: result = func_name(__VA_ARGS__, 22); break; \
5891       case 23: result = func_name(__VA_ARGS__, 23); break; \
5892       case 24: result = func_name(__VA_ARGS__, 24); break; \
5893       case 25: result = func_name(__VA_ARGS__, 25); break; \
5894       case 26: result = func_name(__VA_ARGS__, 26); break; \
5895       case 27: result = func_name(__VA_ARGS__, 27); break; \
5896       case 28: result = func_name(__VA_ARGS__, 28); break; \
5897       case 29: result = func_name(__VA_ARGS__, 29); break; \
5898       case 30: result = func_name(__VA_ARGS__, 30); break; \
5899       case 31: result = func_name(__VA_ARGS__, 31); break; \
5900       case 32: result = func_name(__VA_ARGS__, 32); break; \
5901       case 33: result = func_name(__VA_ARGS__, 33); break; \
5902       case 34: result = func_name(__VA_ARGS__, 34); break; \
5903       case 35: result = func_name(__VA_ARGS__, 35); break; \
5904       case 36: result = func_name(__VA_ARGS__, 36); break; \
5905       case 37: result = func_name(__VA_ARGS__, 37); break; \
5906       case 38: result = func_name(__VA_ARGS__, 38); break; \
5907       case 39: result = func_name(__VA_ARGS__, 39); break; \
5908       case 40: result = func_name(__VA_ARGS__, 40); break; \
5909       case 41: result = func_name(__VA_ARGS__, 41); break; \
5910       case 42: result = func_name(__VA_ARGS__, 42); break; \
5911       case 43: result = func_name(__VA_ARGS__, 43); break; \
5912       case 44: result = func_name(__VA_ARGS__, 44); break; \
5913       case 45: result = func_name(__VA_ARGS__, 45); break; \
5914       case 46: result = func_name(__VA_ARGS__, 46); break; \
5915       case 47: result = func_name(__VA_ARGS__, 47); break; \
5916       case 48: result = func_name(__VA_ARGS__, 48); break; \
5917       case 49: result = func_name(__VA_ARGS__, 49); break; \
5918       case 50: result = func_name(__VA_ARGS__, 50); break; \
5919       case 51: result = func_name(__VA_ARGS__, 51); break; \
5920       case 52: result = func_name(__VA_ARGS__, 52); break; \
5921       case 53: result = func_name(__VA_ARGS__, 53); break; \
5922       case 54: result = func_name(__VA_ARGS__, 54); break; \
5923       case 55: result = func_name(__VA_ARGS__, 55); break; \
5924       case 56: result = func_name(__VA_ARGS__, 56); break; \
5925       case 57: result = func_name(__VA_ARGS__, 57); break; \
5926       case 58: result = func_name(__VA_ARGS__, 58); break; \
5927       case 59: result = func_name(__VA_ARGS__, 59); break; \
5928       case 60: result = func_name(__VA_ARGS__, 60); break; \
5929       case 61: result = func_name(__VA_ARGS__, 61); break; \
5930       case 62: result = func_name(__VA_ARGS__, 62); break; \
5931       case 63: result = func_name(__VA_ARGS__, 63); break; \
5932       default: result = default_case; break; \
5933     } \
5934   } while (0)
5935 
5936 #define SIMDE_CONSTIFY_2_NO_RESULT_(func_name, default_case, imm, ...) \
5937   do { \
5938     switch(imm) { \
5939       case 0: func_name(__VA_ARGS__, 0); break; \
5940       case 1: func_name(__VA_ARGS__, 1); break; \
5941       default: default_case; break; \
5942     } \
5943   } while (0)
5944 
5945 #define SIMDE_CONSTIFY_4_NO_RESULT_(func_name, default_case, imm, ...) \
5946   do { \
5947     switch(imm) { \
5948       case 0: func_name(__VA_ARGS__, 0); break; \
5949       case 1: func_name(__VA_ARGS__, 1); break; \
5950       case 2: func_name(__VA_ARGS__, 2); break; \
5951       case 3: func_name(__VA_ARGS__, 3); break; \
5952       default: default_case; break; \
5953     } \
5954   } while (0)
5955 
5956 #define SIMDE_CONSTIFY_8_NO_RESULT_(func_name, default_case, imm, ...) \
5957   do { \
5958     switch(imm) { \
5959       case 0: func_name(__VA_ARGS__, 0); break; \
5960       case 1: func_name(__VA_ARGS__, 1); break; \
5961       case 2: func_name(__VA_ARGS__, 2); break; \
5962       case 3: func_name(__VA_ARGS__, 3); break; \
5963       case 4: func_name(__VA_ARGS__, 4); break; \
5964       case 5: func_name(__VA_ARGS__, 5); break; \
5965       case 6: func_name(__VA_ARGS__, 6); break; \
5966       case 7: func_name(__VA_ARGS__, 7); break; \
5967       default: default_case; break; \
5968     } \
5969   } while (0)
5970 
5971 #define SIMDE_CONSTIFY_16_NO_RESULT_(func_name, default_case, imm, ...) \
5972   do { \
5973     switch(imm) { \
5974       case  0: func_name(__VA_ARGS__,  0); break; \
5975       case  1: func_name(__VA_ARGS__,  1); break; \
5976       case  2: func_name(__VA_ARGS__,  2); break; \
5977       case  3: func_name(__VA_ARGS__,  3); break; \
5978       case  4: func_name(__VA_ARGS__,  4); break; \
5979       case  5: func_name(__VA_ARGS__,  5); break; \
5980       case  6: func_name(__VA_ARGS__,  6); break; \
5981       case  7: func_name(__VA_ARGS__,  7); break; \
5982       case  8: func_name(__VA_ARGS__,  8); break; \
5983       case  9: func_name(__VA_ARGS__,  9); break; \
5984       case 10: func_name(__VA_ARGS__, 10); break; \
5985       case 11: func_name(__VA_ARGS__, 11); break; \
5986       case 12: func_name(__VA_ARGS__, 12); break; \
5987       case 13: func_name(__VA_ARGS__, 13); break; \
5988       case 14: func_name(__VA_ARGS__, 14); break; \
5989       case 15: func_name(__VA_ARGS__, 15); break; \
5990       default: default_case; break; \
5991     } \
5992   } while (0)
5993 
5994 #define SIMDE_CONSTIFY_32_NO_RESULT_(func_name, default_case, imm, ...) \
5995   do { \
5996     switch(imm) { \
5997       case  0: func_name(__VA_ARGS__,  0); break; \
5998       case  1: func_name(__VA_ARGS__,  1); break; \
5999       case  2: func_name(__VA_ARGS__,  2); break; \
6000       case  3: func_name(__VA_ARGS__,  3); break; \
6001       case  4: func_name(__VA_ARGS__,  4); break; \
6002       case  5: func_name(__VA_ARGS__,  5); break; \
6003       case  6: func_name(__VA_ARGS__,  6); break; \
6004       case  7: func_name(__VA_ARGS__,  7); break; \
6005       case  8: func_name(__VA_ARGS__,  8); break; \
6006       case  9: func_name(__VA_ARGS__,  9); break; \
6007       case 10: func_name(__VA_ARGS__, 10); break; \
6008       case 11: func_name(__VA_ARGS__, 11); break; \
6009       case 12: func_name(__VA_ARGS__, 12); break; \
6010       case 13: func_name(__VA_ARGS__, 13); break; \
6011       case 14: func_name(__VA_ARGS__, 14); break; \
6012       case 15: func_name(__VA_ARGS__, 15); break; \
6013       case 16: func_name(__VA_ARGS__, 16); break; \
6014       case 17: func_name(__VA_ARGS__, 17); break; \
6015       case 18: func_name(__VA_ARGS__, 18); break; \
6016       case 19: func_name(__VA_ARGS__, 19); break; \
6017       case 20: func_name(__VA_ARGS__, 20); break; \
6018       case 21: func_name(__VA_ARGS__, 21); break; \
6019       case 22: func_name(__VA_ARGS__, 22); break; \
6020       case 23: func_name(__VA_ARGS__, 23); break; \
6021       case 24: func_name(__VA_ARGS__, 24); break; \
6022       case 25: func_name(__VA_ARGS__, 25); break; \
6023       case 26: func_name(__VA_ARGS__, 26); break; \
6024       case 27: func_name(__VA_ARGS__, 27); break; \
6025       case 28: func_name(__VA_ARGS__, 28); break; \
6026       case 29: func_name(__VA_ARGS__, 29); break; \
6027       case 30: func_name(__VA_ARGS__, 30); break; \
6028       case 31: func_name(__VA_ARGS__, 31); break; \
6029       default: default_case; break; \
6030     } \
6031   } while (0)
6032 
6033 #define SIMDE_CONSTIFY_64_NO_RESULT_(func_name, default_case, imm, ...) \
6034   do { \
6035     switch(imm) { \
6036       case  0: func_name(__VA_ARGS__,  0); break; \
6037       case  1: func_name(__VA_ARGS__,  1); break; \
6038       case  2: func_name(__VA_ARGS__,  2); break; \
6039       case  3: func_name(__VA_ARGS__,  3); break; \
6040       case  4: func_name(__VA_ARGS__,  4); break; \
6041       case  5: func_name(__VA_ARGS__,  5); break; \
6042       case  6: func_name(__VA_ARGS__,  6); break; \
6043       case  7: func_name(__VA_ARGS__,  7); break; \
6044       case  8: func_name(__VA_ARGS__,  8); break; \
6045       case  9: func_name(__VA_ARGS__,  9); break; \
6046       case 10: func_name(__VA_ARGS__, 10); break; \
6047       case 11: func_name(__VA_ARGS__, 11); break; \
6048       case 12: func_name(__VA_ARGS__, 12); break; \
6049       case 13: func_name(__VA_ARGS__, 13); break; \
6050       case 14: func_name(__VA_ARGS__, 14); break; \
6051       case 15: func_name(__VA_ARGS__, 15); break; \
6052       case 16: func_name(__VA_ARGS__, 16); break; \
6053       case 17: func_name(__VA_ARGS__, 17); break; \
6054       case 18: func_name(__VA_ARGS__, 18); break; \
6055       case 19: func_name(__VA_ARGS__, 19); break; \
6056       case 20: func_name(__VA_ARGS__, 20); break; \
6057       case 21: func_name(__VA_ARGS__, 21); break; \
6058       case 22: func_name(__VA_ARGS__, 22); break; \
6059       case 23: func_name(__VA_ARGS__, 23); break; \
6060       case 24: func_name(__VA_ARGS__, 24); break; \
6061       case 25: func_name(__VA_ARGS__, 25); break; \
6062       case 26: func_name(__VA_ARGS__, 26); break; \
6063       case 27: func_name(__VA_ARGS__, 27); break; \
6064       case 28: func_name(__VA_ARGS__, 28); break; \
6065       case 29: func_name(__VA_ARGS__, 29); break; \
6066       case 30: func_name(__VA_ARGS__, 30); break; \
6067       case 31: func_name(__VA_ARGS__, 31); break; \
6068       case 32: func_name(__VA_ARGS__, 32); break; \
6069       case 33: func_name(__VA_ARGS__, 33); break; \
6070       case 34: func_name(__VA_ARGS__, 34); break; \
6071       case 35: func_name(__VA_ARGS__, 35); break; \
6072       case 36: func_name(__VA_ARGS__, 36); break; \
6073       case 37: func_name(__VA_ARGS__, 37); break; \
6074       case 38: func_name(__VA_ARGS__, 38); break; \
6075       case 39: func_name(__VA_ARGS__, 39); break; \
6076       case 40: func_name(__VA_ARGS__, 40); break; \
6077       case 41: func_name(__VA_ARGS__, 41); break; \
6078       case 42: func_name(__VA_ARGS__, 42); break; \
6079       case 43: func_name(__VA_ARGS__, 43); break; \
6080       case 44: func_name(__VA_ARGS__, 44); break; \
6081       case 45: func_name(__VA_ARGS__, 45); break; \
6082       case 46: func_name(__VA_ARGS__, 46); break; \
6083       case 47: func_name(__VA_ARGS__, 47); break; \
6084       case 48: func_name(__VA_ARGS__, 48); break; \
6085       case 49: func_name(__VA_ARGS__, 49); break; \
6086       case 50: func_name(__VA_ARGS__, 50); break; \
6087       case 51: func_name(__VA_ARGS__, 51); break; \
6088       case 52: func_name(__VA_ARGS__, 52); break; \
6089       case 53: func_name(__VA_ARGS__, 53); break; \
6090       case 54: func_name(__VA_ARGS__, 54); break; \
6091       case 55: func_name(__VA_ARGS__, 55); break; \
6092       case 56: func_name(__VA_ARGS__, 56); break; \
6093       case 57: func_name(__VA_ARGS__, 57); break; \
6094       case 58: func_name(__VA_ARGS__, 58); break; \
6095       case 59: func_name(__VA_ARGS__, 59); break; \
6096       case 60: func_name(__VA_ARGS__, 60); break; \
6097       case 61: func_name(__VA_ARGS__, 61); break; \
6098       case 62: func_name(__VA_ARGS__, 62); break; \
6099       case 63: func_name(__VA_ARGS__, 63); break; \
6100       default: default_case; break; \
6101     } \
6102   } while (0)
6103 
6104 HEDLEY_DIAGNOSTIC_POP
6105 
6106 #endif
6107 /* :: End ../simde/simde/simde-constify.h :: */
6108 /* AUTOMATICALLY GENERATED FILE, DO NOT MODIFY */
6109 /* e8b7a2ec175ceb3725ce0827ef9a6725b6309cc9 */
6110 /* :: Begin ../simde/simde/simde-align.h :: */
6111 /* Alignment
6112  * Created by Evan Nemerson <evan@nemerson.com>
6113  *
6114  *   To the extent possible under law, the authors have waived all
6115  *   copyright and related or neighboring rights to this code.  For
6116  *   details, see the Creative Commons Zero 1.0 Universal license at
6117  *   <https://creativecommons.org/publicdomain/zero/1.0/>
6118  *
6119  * SPDX-License-Identifier: CC0-1.0
6120  *
6121  **********************************************************************
6122  *
6123  * This is a portability layer which should help iron out some
6124  * differences across various compilers, as well as various versions of
6125  * C and C++.
6126  *
6127  * It was originally developed for SIMD Everywhere
6128  * (<https://github.com/simd-everywhere/simde>), but since its only
6129  * dependency is Hedley (<https://nemequ.github.io/hedley>, also CC0)
6130  * it can easily be used in other projects, so please feel free to do
6131  * so.
6132  *
6133  * If you do use this in your project, please keep a link to SIMDe in
6134  * your code to remind you where to report any bugs and/or check for
6135  * updated versions.
6136  *
6137  * # API Overview
6138  *
6139  * The API has several parts, and most macros have a few variations.
6140  * There are APIs for declaring aligned fields/variables, optimization
6141  * hints, and run-time alignment checks.
6142  *
6143  * Briefly, macros ending with "_TO" take numeric values and are great
6144  * when you know the value you would like to use.  Macros ending with
6145  * "_LIKE", on the other hand, accept a type and are used when you want
6146  * to use the alignment of a type instead of hardcoding a value.
6147  *
6148  * Documentation for each section of the API is inline.
6149  *
6150  * True to form, MSVC is the main problem and imposes several
6151  * limitations on the effectiveness of the APIs.  Detailed descriptions
6152  * of the limitations of each macro are inline, but in general:
6153  *
6154  *  * On C11+ or C++11+, code written using this API will work.  The
6155  *    ASSUME macros may or may not generate a hint to the compiler, but
6156  *    that is only an optimization issue and will not actually cause
6157  *    failures.
6158  *  * If you're using pretty much any compiler other than MSVC,
6159  *    everything should basically work as well as in C11/C++11.
6160  */
6161 
6162 #if !defined(SIMDE_ALIGN_H)
6163 #define SIMDE_ALIGN_H
6164 
6165 /* AUTOMATICALLY GENERATED FILE, DO NOT MODIFY */
6166 /* e8b7a2ec175ceb3725ce0827ef9a6725b6309cc9 */
6167 
6168 /* I know this seems a little silly, but some non-hosted compilers
6169  * don't have stddef.h, so we try to accommodate them. */
6170 #if !defined(SIMDE_ALIGN_SIZE_T_)
6171   #if defined(__SIZE_TYPE__)
6172     #define SIMDE_ALIGN_SIZE_T_ __SIZE_TYPE__
6173   #elif defined(__SIZE_T_TYPE__)
6174     #define SIMDE_ALIGN_SIZE_T_ __SIZE_T_TYPE__
6175   #elif defined(__cplusplus)
6176     #include <cstddef>
6177     #define SIMDE_ALIGN_SIZE_T_ size_t
6178   #else
6179     #include <stddef.h>
6180     #define SIMDE_ALIGN_SIZE_T_ size_t
6181   #endif
6182 #endif
6183 
6184 #if !defined(SIMDE_ALIGN_INTPTR_T_)
6185   #if defined(__INTPTR_TYPE__)
6186     #define SIMDE_ALIGN_INTPTR_T_ __INTPTR_TYPE__
6187   #elif defined(__PTRDIFF_TYPE__)
6188     #define SIMDE_ALIGN_INTPTR_T_ __PTRDIFF_TYPE__
6189   #elif defined(__PTRDIFF_T_TYPE__)
6190     #define SIMDE_ALIGN_INTPTR_T_ __PTRDIFF_T_TYPE__
6191   #elif defined(__cplusplus)
6192     #include <cstddef>
6193     #define SIMDE_ALIGN_INTPTR_T_ ptrdiff_t
6194   #else
6195     #include <stddef.h>
6196     #define SIMDE_ALIGN_INTPTR_T_ ptrdiff_t
6197   #endif
6198 #endif
6199 
6200 #if defined(SIMDE_ALIGN_DEBUG)
6201   #if defined(__cplusplus)
6202     #include <cstdio>
6203   #else
6204     #include <stdio.h>
6205   #endif
6206 #endif
6207 
6208 /* SIMDE_ALIGN_OF(Type)
6209  *
6210  * The SIMDE_ALIGN_OF macro works like alignof, or _Alignof, or
6211  * __alignof, or __alignof__, or __ALIGNOF__, depending on the compiler.
6212  * It isn't defined everywhere (only when the compiler has some alignof-
6213  * like feature we can use to implement it), but it should work in most
6214  * modern compilers, as well as C11 and C++11.
6215  *
6216  * If we can't find an implementation for SIMDE_ALIGN_OF then the macro
6217  * will not be defined, so if you can handle that situation sensibly
6218  * you may need to sprinkle some ifdefs into your code.
6219  */
6220 #if \
6221     (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)) || \
6222     (0 && HEDLEY_HAS_FEATURE(c_alignof))
6223   #define SIMDE_ALIGN_OF(Type) _Alignof(Type)
6224 #elif \
6225     (defined(__cplusplus) && (__cplusplus >= 201103L)) || \
6226     (0 && HEDLEY_HAS_FEATURE(cxx_alignof))
6227   #define SIMDE_ALIGN_OF(Type) alignof(Type)
6228 #elif \
6229     HEDLEY_GCC_VERSION_CHECK(2,95,0) || \
6230     HEDLEY_ARM_VERSION_CHECK(4,1,0) || \
6231     HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
6232     HEDLEY_SUNPRO_VERSION_CHECK(5,13,0) || \
6233     HEDLEY_TINYC_VERSION_CHECK(0,9,24) || \
6234     HEDLEY_PGI_VERSION_CHECK(19,10,0) || \
6235     HEDLEY_CRAY_VERSION_CHECK(10,0,0) || \
6236     HEDLEY_TI_ARMCL_VERSION_CHECK(16,9,0) || \
6237     HEDLEY_TI_CL2000_VERSION_CHECK(16,9,0) || \
6238     HEDLEY_TI_CL6X_VERSION_CHECK(8,0,0) || \
6239     HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
6240     HEDLEY_TI_CL430_VERSION_CHECK(16,9,0) || \
6241     HEDLEY_TI_CLPRU_VERSION_CHECK(2,3,2) || \
6242     HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) || \
6243     defined(__IBM__ALIGNOF__) || \
6244     defined(__clang__)
6245   #define SIMDE_ALIGN_OF(Type) __alignof__(Type)
6246 #elif \
6247   HEDLEY_IAR_VERSION_CHECK(8,40,0)
6248   #define SIMDE_ALIGN_OF(Type) __ALIGNOF__(Type)
6249 #elif \
6250   HEDLEY_MSVC_VERSION_CHECK(19,0,0)
6251   /* Probably goes back much further, but MS takes down their old docs.
6252    * If you can verify that this works in earlier versions please let
6253    * me know! */
6254   #define SIMDE_ALIGN_OF(Type) __alignof(Type)
6255 #endif
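/* Example of guarding against the case where SIMDE_ALIGN_OF could not
 * be implemented, as suggested above (the fallback value of 8 is just
 * an illustrative guess):
 *
 *   #if defined(SIMDE_ALIGN_OF)
 *     size_t a = SIMDE_ALIGN_OF(int64_t);
 *   #else
 *     size_t a = 8;
 *   #endif
 */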
6256 
6257 /* SIMDE_ALIGN_MAXIMUM:
6258  *
6259  * This is the maximum alignment that the compiler supports.  You can
6260  * define the value prior to including SIMDe if necessary, but in that
6261  * case *please* submit an issue so we can add the platform to the
6262  * detection code.
6263  *
6264  * Most compilers are okay with types which are aligned beyond what
6265  * they think is the maximum, as long as the alignment is a power
6266  * of two.  Older versions of MSVC are the exception, so we need to cap
6267  * the alignment requests at values that the implementation supports.
6268  *
6269  * XL C/C++ will accept values larger than 16 (which is the alignment
6270  * of an AltiVec vector), but will not reliably align to the larger
6271  * value, so we cap the value at 16 there.
6272  *
6273  * If the compiler accepts any power-of-two value within reason then
6274  * this macro should be left undefined, and the SIMDE_ALIGN_CAP
6275  * macro will just return the value passed to it. */
6276 #if !defined(SIMDE_ALIGN_MAXIMUM)
6277   #if defined(HEDLEY_MSVC_VERSION)
6278     #if HEDLEY_MSVC_VERSION_CHECK(19, 16, 0)
6279       // Visual Studio 2017 and newer do not need a maximum
6280     #else
6281       #if defined(_M_IX86) || defined(_M_AMD64)
6282         #if HEDLEY_MSVC_VERSION_CHECK(19,14,0)
6283           #define SIMDE_ALIGN_PLATFORM_MAXIMUM 64
6284         #elif HEDLEY_MSVC_VERSION_CHECK(16,0,0)
6285           /* VS 2010 is really a guess based on Wikipedia; if anyone can
6286            * test with old VS versions I'd really appreciate it. */
6287           #define SIMDE_ALIGN_PLATFORM_MAXIMUM 32
6288         #else
6289           #define SIMDE_ALIGN_PLATFORM_MAXIMUM 16
6290         #endif
6291       #elif defined(_M_ARM) || defined(_M_ARM64)
6292         #define SIMDE_ALIGN_PLATFORM_MAXIMUM 8
6293       #endif
6294     #endif
6295   #elif defined(HEDLEY_IBM_VERSION)
6296     #define SIMDE_ALIGN_PLATFORM_MAXIMUM 16
6297   #endif
6298 #endif
6299 
6300 /* You can mostly ignore these; they're intended for internal use.
6301  * If you do need to use them please let me know; if they fulfill
6302  * a common use case I'll probably drop the trailing underscore
6303  * and make them part of the public API. */
6304 #if defined(SIMDE_ALIGN_PLATFORM_MAXIMUM)
6305   #if SIMDE_ALIGN_PLATFORM_MAXIMUM >= 64
6306     #define SIMDE_ALIGN_64_ 64
6307     #define SIMDE_ALIGN_32_ 32
6308     #define SIMDE_ALIGN_16_ 16
6309     #define SIMDE_ALIGN_8_ 8
6310   #elif SIMDE_ALIGN_PLATFORM_MAXIMUM >= 32
6311     #define SIMDE_ALIGN_64_ 32
6312     #define SIMDE_ALIGN_32_ 32
6313     #define SIMDE_ALIGN_16_ 16
6314     #define SIMDE_ALIGN_8_ 8
6315   #elif SIMDE_ALIGN_PLATFORM_MAXIMUM >= 16
6316     #define SIMDE_ALIGN_64_ 16
6317     #define SIMDE_ALIGN_32_ 16
6318     #define SIMDE_ALIGN_16_ 16
6319     #define SIMDE_ALIGN_8_ 8
6320   #elif SIMDE_ALIGN_PLATFORM_MAXIMUM >= 8
6321     #define SIMDE_ALIGN_64_ 8
6322     #define SIMDE_ALIGN_32_ 8
6323     #define SIMDE_ALIGN_16_ 8
6324     #define SIMDE_ALIGN_8_ 8
6325   #else
6326     #error Max alignment expected to be >= 8
6327   #endif
6328 #else
6329   #define SIMDE_ALIGN_64_ 64
6330   #define SIMDE_ALIGN_32_ 32
6331   #define SIMDE_ALIGN_16_ 16
6332   #define SIMDE_ALIGN_8_ 8
6333 #endif
6334 
6335 /**
6336  * SIMDE_ALIGN_CAP(Alignment)
6337  *
6338  * Returns the minimum of Alignment or SIMDE_ALIGN_MAXIMUM.
6339  */
6340 #if defined(SIMDE_ALIGN_MAXIMUM)
6341   #define SIMDE_ALIGN_CAP(Alignment) (((Alignment) < (SIMDE_ALIGN_PLATFORM_MAXIMUM)) ? (Alignment) : (SIMDE_ALIGN_PLATFORM_MAXIMUM))
6342 #else
6343   #define SIMDE_ALIGN_CAP(Alignment) (Alignment)
6344 #endif
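/* For example, on a platform capped at 16 bytes (such as XL C/C++
 * above) SIMDE_ALIGN_CAP(64) evaluates to 16, while on a platform with
 * no maximum it is simply 64. */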
6345 
6346 /* SIMDE_ALIGN_TO(Alignment)
6347  *
6348  * SIMDE_ALIGN_TO is used to declare types or variables.  It basically
6349  * maps to the align attribute in most compilers, the align declspec
6350  * in MSVC, or _Alignas/alignas in C11/C++11.
6351  *
6352  * Example:
6353  *
6354  *   struct i32x4 {
6355  *     SIMDE_ALIGN_TO(16) int32_t values[4];
6356  *   }
6357  *
6358  * Limitations:
6359  *
6360  * MSVC requires that the Alignment parameter be numeric; you can't do
6361  * something like `SIMDE_ALIGN_TO(SIMDE_ALIGN_OF(int))`.  This is
6362  * unfortunate because that's really how the LIKE macros are
6363  * implemented, and I am not aware of a way to get anything like this
6364  * to work without using the C11/C++11 keywords.
6365  *
6366  * It also means that we can't use SIMDE_ALIGN_CAP to limit the
6367  * alignment to the value specified, which MSVC also requires, so on
6368  * MSVC you should use the `SIMDE_ALIGN_TO_8/16/32/64` macros instead.
6369  * They work like `SIMDE_ALIGN_TO(SIMDE_ALIGN_CAP(Alignment))` would,
6370  * but should be safe to use on MSVC.
6371  *
6372  * All this is to say that, if you want your code to work on MSVC, you
6373  * should use the SIMDE_ALIGN_TO_8/16/32/64 macros below instead of
6374  * SIMDE_ALIGN_TO(8/16/32/64).
6375  */
6376 #if \
6377     HEDLEY_HAS_ATTRIBUTE(aligned) || \
6378     HEDLEY_GCC_VERSION_CHECK(2,95,0) || \
6379     HEDLEY_CRAY_VERSION_CHECK(8,4,0) || \
6380     HEDLEY_IBM_VERSION_CHECK(11,1,0) || \
6381     HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
6382     HEDLEY_PGI_VERSION_CHECK(19,4,0) || \
6383     HEDLEY_ARM_VERSION_CHECK(4,1,0) || \
6384     HEDLEY_TINYC_VERSION_CHECK(0,9,24) || \
6385     HEDLEY_TI_ARMCL_VERSION_CHECK(16,9,0) || \
6386     HEDLEY_TI_CL2000_VERSION_CHECK(16,9,0) || \
6387     HEDLEY_TI_CL6X_VERSION_CHECK(8,0,0) || \
6388     HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
6389     HEDLEY_TI_CL430_VERSION_CHECK(16,9,0) || \
6390     HEDLEY_TI_CLPRU_VERSION_CHECK(2,3,2)
6391   #define SIMDE_ALIGN_TO(Alignment) __attribute__((__aligned__(SIMDE_ALIGN_CAP(Alignment))))
6392 #elif \
6393     (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L))
6394   #define SIMDE_ALIGN_TO(Alignment) _Alignas(SIMDE_ALIGN_CAP(Alignment))
6395 #elif \
6396     (defined(__cplusplus) && (__cplusplus >= 201103L))
6397   #define SIMDE_ALIGN_TO(Alignment) alignas(SIMDE_ALIGN_CAP(Alignment))
6398 #elif \
6399     defined(HEDLEY_MSVC_VERSION)
6400   #define SIMDE_ALIGN_TO(Alignment) __declspec(align(Alignment))
6401   /* Unfortunately MSVC can't handle __declspec(align(__alignof(Type)));
6402    * the alignment passed to the declspec has to be an integer. */
6403   #define SIMDE_ALIGN_OF_UNUSABLE_FOR_LIKE
6404 #endif
6405 #define SIMDE_ALIGN_TO_64 SIMDE_ALIGN_TO(SIMDE_ALIGN_64_)
6406 #define SIMDE_ALIGN_TO_32 SIMDE_ALIGN_TO(SIMDE_ALIGN_32_)
6407 #define SIMDE_ALIGN_TO_16 SIMDE_ALIGN_TO(SIMDE_ALIGN_16_)
6408 #define SIMDE_ALIGN_TO_8 SIMDE_ALIGN_TO(SIMDE_ALIGN_8_)
6409 
6410 /* SIMDE_ALIGN_ASSUME_TO(Pointer, Alignment)
6411  *
6412  * SIMDE_ALIGN_ASSUME_TO is semantically similar to C++20's
6413  * std::assume_aligned, or __builtin_assume_aligned.  It tells the
6414  * compiler to assume that the provided pointer is aligned to an
6415  * `Alignment`-byte boundary.
6416  *
6417  * If you define SIMDE_ALIGN_DEBUG prior to including this header then
6418  * SIMDE_ALIGN_ASSUME_TO will turn into a runtime check.   We don't
6419  * integrate with NDEBUG in this header, but it may be a good idea to
6420  * put something like this in your code:
6421  *
6422  *   #if !defined(NDEBUG)
6423  *     #define SIMDE_ALIGN_DEBUG
6424  *   #endif
6425  *   #include <.../simde-align.h>
6426  */
6427 #if \
6428     HEDLEY_HAS_BUILTIN(__builtin_assume_aligned) || \
6429     HEDLEY_GCC_VERSION_CHECK(4,7,0)
6430   #define SIMDE_ALIGN_ASSUME_TO_UNCHECKED(Pointer, Alignment) \
6431     HEDLEY_REINTERPRET_CAST(__typeof__(Pointer), __builtin_assume_aligned(HEDLEY_CONST_CAST(void*, HEDLEY_REINTERPRET_CAST(const void*, Pointer)), Alignment))
6432 #elif HEDLEY_INTEL_VERSION_CHECK(13,0,0)
6433   #define SIMDE_ALIGN_ASSUME_TO_UNCHECKED(Pointer, Alignment) (__extension__ ({ \
6434       __typeof__(Pointer) simde_assume_aligned_t_ = (Pointer); \
6435       __assume_aligned(simde_assume_aligned_t_, Alignment); \
6436       simde_assume_aligned_t_; \
6437     }))
6438 #elif defined(__cplusplus) && (__cplusplus > 201703L)
6439   #include <memory>
6440   #define SIMDE_ALIGN_ASSUME_TO_UNCHECKED(Pointer, Alignment) std::assume_aligned<Alignment>(Pointer)
6441 #else
6442   #if defined(__cplusplus)
6443     template<typename T> HEDLEY_ALWAYS_INLINE static T* simde_align_assume_to_unchecked(T* ptr, const size_t alignment)
6444   #else
6445     HEDLEY_ALWAYS_INLINE static void* simde_align_assume_to_unchecked(void* ptr, const size_t alignment)
6446   #endif
6447   {
6448     HEDLEY_ASSUME((HEDLEY_REINTERPRET_CAST(size_t, (ptr)) % SIMDE_ALIGN_CAP(alignment)) == 0);
6449     return ptr;
6450   }
6451   #if defined(__cplusplus)
6452     #define SIMDE_ALIGN_ASSUME_TO_UNCHECKED(Pointer, Alignment) simde_align_assume_to_unchecked((Pointer), (Alignment))
6453   #else
6454     #define SIMDE_ALIGN_ASSUME_TO_UNCHECKED(Pointer, Alignment) simde_align_assume_to_unchecked(HEDLEY_CONST_CAST(void*, HEDLEY_REINTERPRET_CAST(const void*, Pointer)), (Alignment))
6455   #endif
6456 #endif
6457 
6458 #if !defined(SIMDE_ALIGN_DEBUG)
6459   #define SIMDE_ALIGN_ASSUME_TO(Pointer, Alignment) SIMDE_ALIGN_ASSUME_TO_UNCHECKED(Pointer, Alignment)
6460 #else
6461   #include <stdio.h>
6462   #if defined(__cplusplus)
6463     template<typename T>
6464     static HEDLEY_ALWAYS_INLINE
6465     T*
6466     simde_align_assume_to_checked_uncapped(T* ptr, const size_t alignment, const char* file, int line, const char* ptrname)
6467   #else
6468     static HEDLEY_ALWAYS_INLINE
6469     void*
6470     simde_align_assume_to_checked_uncapped(void* ptr, const size_t alignment, const char* file, int line, const char* ptrname)
6471   #endif
6472   {
6473     if (HEDLEY_UNLIKELY((HEDLEY_REINTERPRET_CAST(SIMDE_ALIGN_INTPTR_T_, (ptr)) % HEDLEY_STATIC_CAST(SIMDE_ALIGN_INTPTR_T_, SIMDE_ALIGN_CAP(alignment))) != 0)) {
6474       fprintf(stderr, "%s:%d: alignment check failed for `%s' (%p %% %u == %u)\n",
6475         file, line, ptrname, HEDLEY_REINTERPRET_CAST(const void*, ptr),
6476         HEDLEY_STATIC_CAST(unsigned int, SIMDE_ALIGN_CAP(alignment)),
6477         HEDLEY_STATIC_CAST(unsigned int, HEDLEY_REINTERPRET_CAST(SIMDE_ALIGN_INTPTR_T_, (ptr)) % HEDLEY_STATIC_CAST(SIMDE_ALIGN_INTPTR_T_, SIMDE_ALIGN_CAP(alignment))));
6478     }
6479 
6480     return ptr;
6481   }
6482 
6483   #if defined(__cplusplus)
6484     #define SIMDE_ALIGN_ASSUME_TO(Pointer, Alignment) simde_align_assume_to_checked_uncapped((Pointer), (Alignment), __FILE__, __LINE__, #Pointer)
6485   #else
6486     #define SIMDE_ALIGN_ASSUME_TO(Pointer, Alignment) simde_align_assume_to_checked_uncapped(HEDLEY_CONST_CAST(void*, HEDLEY_REINTERPRET_CAST(const void*, Pointer)), (Alignment), __FILE__, __LINE__, #Pointer)
6487   #endif
6488 #endif
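/* Example of how a kernel might use the assumption (the function and
 * parameter names are illustrative only, not part of SIMDe):
 *
 *   void example_scale (float* data, size_t n, float s) {
 *     float* d = SIMDE_ALIGN_ASSUME_TO(data, 16);
 *     for (size_t i = 0 ; i < n ; i++)
 *       d[i] *= s;
 *   }
 *
 * With SIMDE_ALIGN_DEBUG defined this becomes a runtime check instead
 * of an optimization hint. */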
6489 
6490 /* SIMDE_ALIGN_LIKE(Type)
6491  * SIMDE_ALIGN_LIKE_#(Type)
6492  *
6493  * The SIMDE_ALIGN_LIKE macros are similar to the SIMDE_ALIGN_TO macros
6494  * except instead of an integer they take a type; basically, it's just
6495  * a more convenient way to do something like:
6496  *
6497  *   SIMDE_ALIGN_TO(SIMDE_ALIGN_OF(Type))
6498  *
6499  * The versions with a numeric suffix will fall back on using a numeric
6500  * value in the event we can't use SIMDE_ALIGN_OF(Type).  This is
6501  * mainly for MSVC, where __declspec(align()) can't handle anything
6502  * other than hard-coded numeric values.
6503  */
6504 #if defined(SIMDE_ALIGN_OF) && defined(SIMDE_ALIGN_TO) && !defined(SIMDE_ALIGN_OF_UNUSABLE_FOR_LIKE)
6505   #define SIMDE_ALIGN_LIKE(Type) SIMDE_ALIGN_TO(SIMDE_ALIGN_OF(Type))
6506   #define SIMDE_ALIGN_LIKE_64(Type) SIMDE_ALIGN_LIKE(Type)
6507   #define SIMDE_ALIGN_LIKE_32(Type) SIMDE_ALIGN_LIKE(Type)
6508   #define SIMDE_ALIGN_LIKE_16(Type) SIMDE_ALIGN_LIKE(Type)
6509   #define SIMDE_ALIGN_LIKE_8(Type) SIMDE_ALIGN_LIKE(Type)
6510 #else
6511   #define SIMDE_ALIGN_LIKE_64(Type) SIMDE_ALIGN_TO_64
6512   #define SIMDE_ALIGN_LIKE_32(Type) SIMDE_ALIGN_TO_32
6513   #define SIMDE_ALIGN_LIKE_16(Type) SIMDE_ALIGN_TO_16
6514   #define SIMDE_ALIGN_LIKE_8(Type) SIMDE_ALIGN_TO_8
6515 #endif
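/* Example (mirroring the SIMDE_ALIGN_TO example above, but deriving
 * the alignment from a type; the struct name is illustrative only):
 *
 *   struct i32x4 {
 *     SIMDE_ALIGN_LIKE_16(simde__m128i) int32_t values[4];
 *   };
 */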
6516 
6517 /* SIMDE_ALIGN_ASSUME_LIKE(Pointer, Type)
6518  *
6519  * This is similar to SIMDE_ALIGN_ASSUME_TO, except that it takes a
6520  * type instead of a numeric value. */
6521 #if defined(SIMDE_ALIGN_OF) && defined(SIMDE_ALIGN_ASSUME_TO)
6522   #define SIMDE_ALIGN_ASSUME_LIKE(Pointer, Type) SIMDE_ALIGN_ASSUME_TO(Pointer, SIMDE_ALIGN_OF(Type))
6523 #endif
6524 
6525 /* SIMDE_ALIGN_CAST(Type, Pointer)
6526  *
6527  * SIMDE_ALIGN_CAST is like C++'s reinterpret_cast, but it will try
6528  * to silence warnings that some compilers may produce if you try
6529  * to assign to a type with increased alignment requirements.
6530  *
6531  * Note that it does *not* actually attempt to tell the compiler that
6532  * the pointer is aligned like the destination should be; that's the
6533  * job of the next macro.  This macro is necessary for stupid APIs
6534  * like _mm_loadu_si128 where the input is a __m128i* but the function
6535  * is specifically for data which isn't necessarily aligned to
6536  * _Alignof(__m128i).
6537  */
6538 #if HEDLEY_HAS_WARNING("-Wcast-align") || defined(__clang__) || HEDLEY_GCC_VERSION_CHECK(3,4,0)
6539   #define SIMDE_ALIGN_CAST(Type, Pointer) (__extension__({ \
6540       HEDLEY_DIAGNOSTIC_PUSH \
6541       _Pragma("GCC diagnostic ignored \"-Wcast-align\"") \
6542       Type simde_r_ = HEDLEY_REINTERPRET_CAST(Type, Pointer); \
6543       HEDLEY_DIAGNOSTIC_POP \
6544       simde_r_; \
6545     }))
6546 #else
6547   #define SIMDE_ALIGN_CAST(Type, Pointer) HEDLEY_REINTERPRET_CAST(Type, Pointer)
6548 #endif
6549 
6550 /* SIMDE_ALIGN_ASSUME_CAST(Type, Pointer)
6551  *
6552  * This is sort of like a combination of a reinterpret_cast and a
6553  * SIMDE_ALIGN_ASSUME_LIKE.  It uses SIMDE_ALIGN_ASSUME_LIKE to tell
6554  * the compiler that the pointer is aligned like the specified type
6555  * and casts the pointer to the specified type while suppressing any
6556  * warnings from the compiler about casting to a type with greater
6557  * alignment requirements.
6558  */
6559 #define SIMDE_ALIGN_ASSUME_CAST(Type, Pointer) SIMDE_ALIGN_ASSUME_LIKE(SIMDE_ALIGN_CAST(Type, Pointer), Type)
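/* A minimal sketch of the intended use (example_load and mem_addr are
 * illustrative names only):
 *
 *   simde__m128i
 *   example_load (const void* mem_addr) {
 *     return *SIMDE_ALIGN_ASSUME_CAST(const simde__m128i*, mem_addr);
 *   }
 */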
6560 
6561 #endif /* !defined(SIMDE_ALIGN_H) */
6562 /* :: End ../simde/simde/simde-align.h :: */
6563 
6564 /* In some situations, SIMDe has to make large performance sacrifices
6565  * for small increases in how faithfully it reproduces an API, but
6566  * only a relatively small number of users will actually need the API
6567  * to be completely accurate.  The SIMDE_FAST_* options can be used to
6568  * disable these trade-offs.
6569  *
6570  * They can be enabled by passing -DSIMDE_FAST_MATH to the compiler, or
6571  * the individual defines (e.g., -DSIMDE_FAST_NANS) if you only want to
6572  * enable some optimizations.  Using -ffast-math and/or
6573  * -ffinite-math-only will also enable the relevant options.  If you
6574  * don't want that you can pass -DSIMDE_NO_FAST_* to disable them. */
6575 
6576 /* Most programs avoid NaNs by never passing values which can result in
6577  * a NaN; for example, if you only pass non-negative values to the sqrt
6578  * functions, it won't generate a NaN.  On some platforms, similar
6579  * functions handle NaNs differently; for example, the _mm_min_ps SSE
6580  * function will return 0.0 if you pass it (0.0, NaN), but the NEON
6581  * vminq_f32 function will return NaN.  Making them behave like one
6582  * another is expensive; it requires generating a mask of all lanes
6583  * with NaNs, then performing the operation (e.g., vminq_f32), then
6584  * blending together the result with another vector using the mask.
6585  *
6586  * If you don't want SIMDe to worry about the differences between how
6587  * NaNs are handled on the two platforms, define this (or pass
6588  * -ffinite-math-only) */
6589 #if !defined(SIMDE_FAST_MATH) && !defined(SIMDE_NO_FAST_MATH) && defined(__FAST_MATH__)
6590   #define SIMDE_FAST_MATH
6591 #endif
6592 
6593 #if !defined(SIMDE_FAST_NANS) && !defined(SIMDE_NO_FAST_NANS)
6594   #if defined(SIMDE_FAST_MATH)
6595     #define SIMDE_FAST_NANS
6596   #elif defined(__FINITE_MATH_ONLY__)
6597     #if __FINITE_MATH_ONLY__
6598       #define SIMDE_FAST_NANS
6599     #endif
6600   #endif
6601 #endif
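/* As a scalar sketch of the difference described above (illustrative
 * only; SIMDe's real implementations work on whole vectors):
 *
 *   #include <math.h>
 *   static inline float example_min (float a, float b) {
 *   #if defined(SIMDE_FAST_NANS)
 *     return (a < b) ? a : b;    // whatever the platform's min gives us
 *   #else
 *     if (isnan(a) || isnan(b))
 *       return NAN;              // force one specific NaN behaviour
 *     return (a < b) ? a : b;
 *   #endif
 *   }
 */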
6602 
6603 /* Many functions are defined as using the current rounding mode
6604  * (i.e., the SIMD version of fegetround()) when converting to
6605  * an integer.  For example, _mm_cvtpd_epi32.  Unfortunately,
6606  * on some platforms (such as ARMv8+ where round-to-nearest is
6607  * always used, regardless of the FPSCR register) this means we
6608  * have to first query the current rounding mode, then choose
6609  * the proper function (round, ceil, floor, etc.). */
6611 #if !defined(SIMDE_FAST_ROUND_MODE) && !defined(SIMDE_NO_FAST_ROUND_MODE) && defined(SIMDE_FAST_MATH)
6612   #define SIMDE_FAST_ROUND_MODE
6613 #endif
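/* Roughly, the slow path described above looks like this in scalar
 * form (illustrative only):
 *
 *   #include <fenv.h>
 *   static inline double example_rint (double v) {
 *     switch (fegetround()) {
 *       case FE_TONEAREST:  return simde_math_roundeven(v);
 *       case FE_TOWARDZERO: return simde_math_trunc(v);
 *       case FE_UPWARD:     return simde_math_ceil(v);
 *       case FE_DOWNWARD:   return simde_math_floor(v);
 *       default:            return simde_math_nearbyint(v);
 *     }
 *   }
 *
 * Defining SIMDE_FAST_ROUND_MODE lets SIMDe skip the fegetround()
 * query and use whatever the platform's conversion does. */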
6614 
6615 /* This controls how ties are rounded.  For example, does 10.5 round to
6616  * 10 or 11?  IEEE 754 specifies round-towards-even, but ARMv7 (for
6617  * example) doesn't support it and it must be emulated (which is rather
6618  * slow).  If you're okay with just using the default for whatever arch
6619  * you're on, you should definitely define this.
6620  *
6621  * Note that we don't use this macro to avoid correct implementations
6622  * in functions which are explicitly about rounding (such as vrnd* on
6623  * NEON, _mm_round_* on x86, etc.); it is only used for code where
6624  * rounding is a component in another function, and even then it isn't
6625  * usually a problem since such functions will use the current rounding
6626  * mode. */
6627 #if !defined(SIMDE_FAST_ROUND_TIES) && !defined(SIMDE_NO_FAST_ROUND_TIES) && defined(SIMDE_FAST_MATH)
6628   #define SIMDE_FAST_ROUND_TIES
6629 #endif
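/* Concretely: with ties-to-even, 10.5 rounds to 10 and 11.5 rounds to
 * 12; with ties-away-from-zero they round to 11 and 12.  Defining
 * SIMDE_FAST_ROUND_TIES accepts whichever of these the target
 * architecture provides when rounding is only an internal step of some
 * other operation. */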
6630 
6631 /* For functions which convert from one type to another (mostly from
6632  * floating point to integer types), sometimes we need to do a range
6633  * check and potentially return a different result if the value
6634  * falls outside that range.  Skipping this check can provide a
6635  * performance boost, at the expense of faithfulness to the API we're
6636  * emulating. */
6637 #if !defined(SIMDE_FAST_CONVERSION_RANGE) && !defined(SIMDE_NO_FAST_CONVERSION_RANGE) && defined(SIMDE_FAST_MATH)
6638   #define SIMDE_FAST_CONVERSION_RANGE
6639 #endif
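/* A scalar sketch of the check being skipped (illustrative only; NaN
 * handling is omitted and the exact out-of-range result differs
 * between the APIs being emulated):
 *
 *   static inline int32_t example_cvt (double v) {
 *   #if defined(SIMDE_FAST_CONVERSION_RANGE)
 *     return (int32_t) v;   // out-of-range input is undefined behaviour
 *   #else
 *     if (v >  2147483647.0) return INT32_MAX;
 *     if (v < -2147483648.0) return INT32_MIN;
 *     return (int32_t) v;
 *   #endif
 *   }
 */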
6640 
6641 #if \
6642     HEDLEY_HAS_BUILTIN(__builtin_constant_p) || \
6643     HEDLEY_GCC_VERSION_CHECK(3,4,0) || \
6644     HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
6645     HEDLEY_TINYC_VERSION_CHECK(0,9,19) || \
6646     HEDLEY_ARM_VERSION_CHECK(4,1,0) || \
6647     HEDLEY_IBM_VERSION_CHECK(13,1,0) || \
6648     HEDLEY_TI_CL6X_VERSION_CHECK(6,1,0) || \
6649     (HEDLEY_SUNPRO_VERSION_CHECK(5,10,0) && !defined(__cplusplus)) || \
6650     HEDLEY_CRAY_VERSION_CHECK(8,1,0) || \
6651     HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
6652   #define SIMDE_CHECK_CONSTANT_(expr) (__builtin_constant_p(expr))
6653 #elif defined(__cplusplus) && (__cplusplus > 201703L)
6654   #include <type_traits>
6655   #define SIMDE_CHECK_CONSTANT_(expr) (std::is_constant_evaluated())
6656 #endif
6657 
6658 #if !defined(SIMDE_NO_CHECK_IMMEDIATE_CONSTANT)
6659   #if defined(SIMDE_CHECK_CONSTANT_) && \
6660       SIMDE_DETECT_CLANG_VERSION_CHECK(9,0,0) && \
6661       (!defined(__apple_build_version__) || ((__apple_build_version__ < 11000000) || (__apple_build_version__ >= 12000000)))
6662     #define SIMDE_REQUIRE_CONSTANT(arg) HEDLEY_REQUIRE_MSG(SIMDE_CHECK_CONSTANT_(arg), "`" #arg "' must be constant")
6663   #else
6664     #define SIMDE_REQUIRE_CONSTANT(arg)
6665   #endif
6666 #else
6667   #define SIMDE_REQUIRE_CONSTANT(arg)
6668 #endif
6669 
6670 #define SIMDE_REQUIRE_RANGE(arg, min, max) \
6671   HEDLEY_REQUIRE_MSG((((arg) >= (min)) && ((arg) <= (max))), "'" #arg "' must be in [" #min ", " #max "]")
6672 
6673 #define SIMDE_REQUIRE_CONSTANT_RANGE(arg, min, max) \
6674   SIMDE_REQUIRE_CONSTANT(arg) \
6675   SIMDE_REQUIRE_RANGE(arg, min, max)
6676 
6677 /* A copy of HEDLEY_STATIC_ASSERT, except we don't define an empty
6678  * fallback if we can't find an implementation; instead we have to
6679  * check if SIMDE_STATIC_ASSERT is defined before using it. */
6680 #if \
6681   !defined(__cplusplus) && ( \
6682       (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)) || \
6683       HEDLEY_HAS_FEATURE(c_static_assert) || \
6684       HEDLEY_GCC_VERSION_CHECK(6,0,0) || \
6685       HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
6686       defined(_Static_assert) \
6687     )
6688 #  define SIMDE_STATIC_ASSERT(expr, message) _Static_assert(expr, message)
6689 #elif \
6690   (defined(__cplusplus) && (__cplusplus >= 201103L)) || \
6691   HEDLEY_MSVC_VERSION_CHECK(16,0,0)
6692 #  define SIMDE_STATIC_ASSERT(expr, message) HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(static_assert(expr, message))
6693 #endif
6694 
6695 /* Statement exprs */
6696 #if \
6697     HEDLEY_GNUC_VERSION_CHECK(2,95,0) || \
6698     HEDLEY_TINYC_VERSION_CHECK(0,9,26) || \
6699     HEDLEY_INTEL_VERSION_CHECK(9,0,0) || \
6700     HEDLEY_PGI_VERSION_CHECK(18,10,0) || \
6701     HEDLEY_SUNPRO_VERSION_CHECK(5,12,0) || \
6702     HEDLEY_IBM_VERSION_CHECK(11,1,0) || \
6703     HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
6704   #define SIMDE_STATEMENT_EXPR_(expr) (__extension__ expr)
6705 #endif
6706 
6707 #if defined(SIMDE_CHECK_CONSTANT_) && defined(SIMDE_STATIC_ASSERT)
6708   #define SIMDE_ASSERT_CONSTANT_(v) SIMDE_STATIC_ASSERT(SIMDE_CHECK_CONSTANT_(v), #v " must be constant.")
6709 #endif
6710 
6711 #if \
6712   (HEDLEY_HAS_ATTRIBUTE(may_alias) && !defined(HEDLEY_SUNPRO_VERSION)) || \
6713   HEDLEY_GCC_VERSION_CHECK(3,3,0) || \
6714   HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
6715   HEDLEY_IBM_VERSION_CHECK(13,1,0)
6716 #  define SIMDE_MAY_ALIAS __attribute__((__may_alias__))
6717 #else
6718 #  define SIMDE_MAY_ALIAS
6719 #endif
6720 
6721 /*  Lots of compilers support GCC-style vector extensions, but many
6722     don't support all the features.  Define different macros depending
6723     on support for
6724 
6725     * SIMDE_VECTOR - Declaring a vector.
6726     * SIMDE_VECTOR_OPS - basic operations (binary and unary).
6727     * SIMDE_VECTOR_NEGATE - negating a vector
6728     * SIMDE_VECTOR_SCALAR - For binary operators, the second argument
6729         can be a scalar, in which case the result is as if that scalar
6730         had been broadcast to all lanes of a vector.
6731     * SIMDE_VECTOR_SUBSCRIPT - Supports array subscript notation for
6732         extracting/inserting a single element.
6733 
6734     SIMDE_VECTOR can be assumed if any of the others are defined; the
6735     others are independent.  (A usage sketch follows the detection below.) */
6736 #if !defined(SIMDE_NO_VECTOR)
6737 #  if \
6738     HEDLEY_GCC_VERSION_CHECK(4,8,0)
6739 #    define SIMDE_VECTOR(size) __attribute__((__vector_size__(size)))
6740 #    define SIMDE_VECTOR_OPS
6741 #    define SIMDE_VECTOR_NEGATE
6742 #    define SIMDE_VECTOR_SCALAR
6743 #    define SIMDE_VECTOR_SUBSCRIPT
6744 #  elif HEDLEY_INTEL_VERSION_CHECK(16,0,0)
6745 #    define SIMDE_VECTOR(size) __attribute__((__vector_size__(size)))
6746 #    define SIMDE_VECTOR_OPS
6747 #    define SIMDE_VECTOR_NEGATE
6748 /* ICC only supports SIMDE_VECTOR_SCALAR for constants */
6749 #    define SIMDE_VECTOR_SUBSCRIPT
6750 #  elif \
6751     HEDLEY_GCC_VERSION_CHECK(4,1,0) || \
6752     HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
6753     HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
6754 #    define SIMDE_VECTOR(size) __attribute__((__vector_size__(size)))
6755 #    define SIMDE_VECTOR_OPS
6756 #  elif HEDLEY_SUNPRO_VERSION_CHECK(5,12,0)
6757 #    define SIMDE_VECTOR(size) __attribute__((__vector_size__(size)))
6758 #  elif HEDLEY_HAS_ATTRIBUTE(vector_size)
6759 #    define SIMDE_VECTOR(size) __attribute__((__vector_size__(size)))
6760 #    define SIMDE_VECTOR_OPS
6761 #    define SIMDE_VECTOR_NEGATE
6762 #    define SIMDE_VECTOR_SUBSCRIPT
6763 #    if SIMDE_DETECT_CLANG_VERSION_CHECK(5,0,0)
6764 #      define SIMDE_VECTOR_SCALAR
6765 #    endif
6766 #  endif
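/* Example of what the detection above enables (a sketch; example_i32x4
 * is not a SIMDe type):
 *
 *   typedef int32_t example_i32x4 SIMDE_VECTOR(16);
 *
 *   example_i32x4 a = { 1, 2, 3, 4 }, b = { 10, 20, 30, 40 };
 *   example_i32x4 sum  = a + b;   // needs SIMDE_VECTOR_OPS
 *   example_i32x4 neg  = -a;      // needs SIMDE_VECTOR_NEGATE
 *   example_i32x4 plus = a + 1;   // needs SIMDE_VECTOR_SCALAR
 *   int32_t first = sum[0];       // needs SIMDE_VECTOR_SUBSCRIPT
 */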
6767 
6768 /* GCC and clang have built-in functions to handle shuffling and
6769    converting of vectors, but the implementations are slightly
6770    different.  This macro is just an abstraction over them.  Note that
6771    elem_size is in bits but vec_size is in bytes. */
6772 #  if !defined(SIMDE_NO_SHUFFLE_VECTOR) && defined(SIMDE_VECTOR_SUBSCRIPT)
6773      HEDLEY_DIAGNOSTIC_PUSH
6774      /* We don't care about -Wvariadic-macros; all compilers that support
6775       * shufflevector/shuffle support them. */
6776 #    if HEDLEY_HAS_WARNING("-Wc++98-compat-pedantic")
6777 #      pragma clang diagnostic ignored "-Wc++98-compat-pedantic"
6778 #    endif
6779 #    if HEDLEY_HAS_WARNING("-Wvariadic-macros") || HEDLEY_GCC_VERSION_CHECK(4,0,0)
6780 #      pragma GCC diagnostic ignored "-Wvariadic-macros"
6781 #    endif
6782 
6783 #    if HEDLEY_HAS_BUILTIN(__builtin_shufflevector)
6784 #      define SIMDE_SHUFFLE_VECTOR_(elem_size, vec_size, a, b, ...) __builtin_shufflevector(a, b, __VA_ARGS__)
6785 #    elif HEDLEY_GCC_HAS_BUILTIN(__builtin_shuffle,4,7,0) && !defined(__INTEL_COMPILER)
6786 #      define SIMDE_SHUFFLE_VECTOR_(elem_size, vec_size, a, b, ...) (__extension__ ({ \
6787          int##elem_size##_t SIMDE_VECTOR(vec_size) simde_shuffle_ = { __VA_ARGS__ }; \
6788            __builtin_shuffle(a, b, simde_shuffle_); \
6789          }))
6790 #    endif
6791      HEDLEY_DIAGNOSTIC_POP
6792 #  endif
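/* Example (a sketch which reverses a 4 x 32-bit vector, so elem_size
 * is 32 bits and vec_size is 16 bytes, matching the note above):
 *
 *   r = SIMDE_SHUFFLE_VECTOR_(32, 16, a, a, 3, 2, 1, 0);
 */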
6793 
6794 /* TODO: this actually works on XL C/C++ without SIMDE_VECTOR_SUBSCRIPT
6795    but the code needs to be refactored a bit to take advantage. */
6796 #  if !defined(SIMDE_NO_CONVERT_VECTOR) && defined(SIMDE_VECTOR_SUBSCRIPT)
6797 #    if HEDLEY_HAS_BUILTIN(__builtin_convertvector) || HEDLEY_GCC_VERSION_CHECK(9,0,0)
6798 #      if HEDLEY_GCC_VERSION_CHECK(9,0,0) && !HEDLEY_GCC_VERSION_CHECK(9,3,0)
6799          /* https://gcc.gnu.org/bugzilla/show_bug.cgi?id=93557 */
6800 #        define SIMDE_CONVERT_VECTOR_(to, from) ((to) = (__extension__({ \
6801              __typeof__(from) from_ = (from); \
6802              ((void) from_); \
6803              __builtin_convertvector(from_, __typeof__(to)); \
6804            })))
6805 #      else
6806 #        define SIMDE_CONVERT_VECTOR_(to, from) ((to) = __builtin_convertvector((from), __typeof__(to)))
6807 #      endif
6808 #    endif
6809 #  endif
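/* Example (a sketch; converts each lane of a float vector to the
 * corresponding lane of an int32_t vector with the same element
 * count):
 *
 *   SIMDE_CONVERT_VECTOR_(r_i32x4, a_f32x4);
 */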
6810 #endif
6811 
6812 /* Since we currently require SUBSCRIPT before using a vector in a
6813    union, we define these as dependencies of SUBSCRIPT.  They are
6814    likely to disappear in the future, once SIMDe learns how to make
6815    use of vectors without using the union members.  Do not use them
6816    in your code unless you're okay with it breaking when SIMDe
6817    changes. */
6818 #if defined(SIMDE_VECTOR_SUBSCRIPT)
6819 #  if defined(SIMDE_VECTOR_OPS)
6820 #    define SIMDE_VECTOR_SUBSCRIPT_OPS
6821 #  endif
6822 #  if defined(SIMDE_VECTOR_SCALAR)
6823 #    define SIMDE_VECTOR_SUBSCRIPT_SCALAR
6824 #  endif
6825 #endif
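
/* Illustrative sketch (editor's example; the union and helper below are
 * hypothetical names, not part of SIMDe): with SIMDE_VECTOR_SUBSCRIPT_OPS a
 * GCC-style vector member can be added with the normal operators and its
 * lanes read back through subscripting, which mirrors how the private union
 * types later in this file are used. */
#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
typedef union {
  int32_t vec SIMDE_VECTOR(16);
  int32_t lanes[4];
} simde_example_i32x4_;

static HEDLEY_INLINE
int32_t
simde_example_sum_lane0_(simde_example_i32x4_ a, simde_example_i32x4_ b) {
  a.vec = a.vec + b.vec; /* vector operators (SIMDE_VECTOR_OPS) */
  return a.vec[0];       /* per-lane access (SIMDE_VECTOR_SUBSCRIPT) */
}
#endif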
6826 
6827 #if !defined(SIMDE_DISABLE_OPENMP)
6828   #if !defined(SIMDE_ENABLE_OPENMP) && ((defined(_OPENMP) && (_OPENMP >= 201307L)) || (defined(_OPENMP_SIMD) && (_OPENMP_SIMD >= 201307L))) || defined(HEDLEY_MCST_LCC_VERSION)
6829     #define SIMDE_ENABLE_OPENMP
6830   #endif
6831 #endif
6832 
6833 #if !defined(SIMDE_ENABLE_CILKPLUS) && (defined(__cilk) || defined(HEDLEY_INTEL_VERSION))
6834 #  define SIMDE_ENABLE_CILKPLUS
6835 #endif
6836 
6837 #if defined(SIMDE_ENABLE_OPENMP)
6838 #  define SIMDE_VECTORIZE HEDLEY_PRAGMA(omp simd)
6839 #  define SIMDE_VECTORIZE_SAFELEN(l) HEDLEY_PRAGMA(omp simd safelen(l))
6840 #  if defined(__clang__)
6841 #    define SIMDE_VECTORIZE_REDUCTION(r) \
6842         HEDLEY_DIAGNOSTIC_PUSH \
6843         _Pragma("clang diagnostic ignored \"-Wsign-conversion\"") \
6844         HEDLEY_PRAGMA(omp simd reduction(r)) \
6845         HEDLEY_DIAGNOSTIC_POP
6846 #  else
6847 #    define SIMDE_VECTORIZE_REDUCTION(r) HEDLEY_PRAGMA(omp simd reduction(r))
6848 #  endif
6849 #  define SIMDE_VECTORIZE_ALIGNED(a) HEDLEY_PRAGMA(omp simd aligned(a))
6850 #elif defined(SIMDE_ENABLE_CILKPLUS)
6851 #  define SIMDE_VECTORIZE HEDLEY_PRAGMA(simd)
6852 #  define SIMDE_VECTORIZE_SAFELEN(l) HEDLEY_PRAGMA(simd vectorlength(l))
6853 #  define SIMDE_VECTORIZE_REDUCTION(r) HEDLEY_PRAGMA(simd reduction(r))
6854 #  define SIMDE_VECTORIZE_ALIGNED(a) HEDLEY_PRAGMA(simd aligned(a))
6855 #elif defined(__clang__) && !defined(HEDLEY_IBM_VERSION)
6856 #  define SIMDE_VECTORIZE HEDLEY_PRAGMA(clang loop vectorize(enable))
6857 #  define SIMDE_VECTORIZE_SAFELEN(l) HEDLEY_PRAGMA(clang loop vectorize_width(l))
6858 #  define SIMDE_VECTORIZE_REDUCTION(r) SIMDE_VECTORIZE
6859 #  define SIMDE_VECTORIZE_ALIGNED(a)
6860 #elif HEDLEY_GCC_VERSION_CHECK(4,9,0)
6861 #  define SIMDE_VECTORIZE HEDLEY_PRAGMA(GCC ivdep)
6862 #  define SIMDE_VECTORIZE_SAFELEN(l) SIMDE_VECTORIZE
6863 #  define SIMDE_VECTORIZE_REDUCTION(r) SIMDE_VECTORIZE
6864 #  define SIMDE_VECTORIZE_ALIGNED(a)
6865 #elif HEDLEY_CRAY_VERSION_CHECK(5,0,0)
6866 #  define SIMDE_VECTORIZE HEDLEY_PRAGMA(_CRI ivdep)
6867 #  define SIMDE_VECTORIZE_SAFELEN(l) SIMDE_VECTORIZE
6868 #  define SIMDE_VECTORIZE_REDUCTION(r) SIMDE_VECTORIZE
6869 #  define SIMDE_VECTORIZE_ALIGNED(a)
6870 #else
6871 #  define SIMDE_VECTORIZE
6872 #  define SIMDE_VECTORIZE_SAFELEN(l)
6873 #  define SIMDE_VECTORIZE_REDUCTION(r)
6874 #  define SIMDE_VECTORIZE_ALIGNED(a)
6875 #endif
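
/* Illustrative sketch (editor's example; the helper below is a hypothetical
 * name, not part of SIMDe): SIMDE_VECTORIZE simply decorates the following
 * loop with whichever vectorization hint the compiler understands (OpenMP
 * SIMD, Cilk Plus, clang loop pragmas, GCC/Cray ivdep), or with nothing. */
static HEDLEY_INLINE
void
simde_example_saxpy_(float a, const float* x, float* y, size_t n) {
  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < n ; i++) {
    y[i] = a * x[i] + y[i];
  }
}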
6876 
6877 #define SIMDE_MASK_NZ_(v, mask) (((v) & (mask)) | !((v) & (mask)))
6878 
6879 /* Intended for checking coverage, you should never use this in
6880    production. */
6881 #if defined(SIMDE_NO_INLINE)
6882 #  define SIMDE_FUNCTION_ATTRIBUTES HEDLEY_NEVER_INLINE static
6883 #else
6884 #  define SIMDE_FUNCTION_ATTRIBUTES HEDLEY_ALWAYS_INLINE static
6885 #endif
6886 
6887 #if \
6888     HEDLEY_HAS_ATTRIBUTE(unused) || \
6889     HEDLEY_GCC_VERSION_CHECK(2,95,0)
6890 #  define SIMDE_FUNCTION_POSSIBLY_UNUSED_ __attribute__((__unused__))
6891 #else
6892 #  define SIMDE_FUNCTION_POSSIBLY_UNUSED_
6893 #endif
6894 
6895 #if HEDLEY_HAS_WARNING("-Wused-but-marked-unused")
6896 #  define SIMDE_DIAGNOSTIC_DISABLE_USED_BUT_MARKED_UNUSED _Pragma("clang diagnostic ignored \"-Wused-but-marked-unused\"")
6897 #else
6898 #  define SIMDE_DIAGNOSTIC_DISABLE_USED_BUT_MARKED_UNUSED
6899 #endif
6900 
6901 #if defined(_MSC_VER)
6902 #  define SIMDE_BEGIN_DECLS_ HEDLEY_DIAGNOSTIC_PUSH __pragma(warning(disable:4996 4204)) HEDLEY_BEGIN_C_DECLS
6903 #  define SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP HEDLEY_END_C_DECLS
6904 #else
6905 #  define SIMDE_BEGIN_DECLS_ \
6906      HEDLEY_DIAGNOSTIC_PUSH \
6907      SIMDE_DIAGNOSTIC_DISABLE_USED_BUT_MARKED_UNUSED \
6908      HEDLEY_BEGIN_C_DECLS
6909 #  define SIMDE_END_DECLS_ \
6910      HEDLEY_END_C_DECLS \
6911      HEDLEY_DIAGNOSTIC_POP
6912 #endif
6913 
6914 #if defined(__SIZEOF_INT128__)
6915 #  define SIMDE_HAVE_INT128_
6916 HEDLEY_DIAGNOSTIC_PUSH
6917 SIMDE_DIAGNOSTIC_DISABLE_PEDANTIC_
6918 typedef __int128 simde_int128;
6919 typedef unsigned __int128 simde_uint128;
6920 HEDLEY_DIAGNOSTIC_POP
6921 #endif
6922 
6923 #if !defined(SIMDE_ENDIAN_LITTLE)
6924 #  define SIMDE_ENDIAN_LITTLE 1234
6925 #endif
6926 #if !defined(SIMDE_ENDIAN_BIG)
6927 #  define SIMDE_ENDIAN_BIG 4321
6928 #endif
6929 
6930 #if !defined(SIMDE_ENDIAN_ORDER)
6931 /* GCC (and compilers masquerading as GCC) define  __BYTE_ORDER__. */
6932 #  if defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
6933 #    define SIMDE_ENDIAN_ORDER SIMDE_ENDIAN_LITTLE
6934 #  elif defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
6935 #    define SIMDE_ENDIAN_ORDER SIMDE_ENDIAN_BIG
6936 /* TI defines _BIG_ENDIAN or _LITTLE_ENDIAN */
6937 #  elif defined(_BIG_ENDIAN)
6938 #    define SIMDE_ENDIAN_ORDER SIMDE_ENDIAN_BIG
6939 #  elif defined(_LITTLE_ENDIAN)
6940 #    define SIMDE_ENDIAN_ORDER SIMDE_ENDIAN_LITTLE
6941 /* We know the endianness of some common architectures.  Common
6942  * architectures not listed (ARM, POWER, MIPS, etc.) here are
6943  * bi-endian. */
6944 #  elif defined(__amd64) || defined(_M_X64) || defined(__i386) || defined(_M_IX86)
6945 #    define SIMDE_ENDIAN_ORDER SIMDE_ENDIAN_LITTLE
6946 #  elif defined(__s390x__) || defined(__zarch__)
6947 #    define SIMDE_ENDIAN_ORDER SIMDE_ENDIAN_BIG
6948 /* Looks like we'll have to rely on the platform.  If we're missing a
6949  * platform, please let us know. */
6950 #  elif defined(_WIN32)
6951 #    define SIMDE_ENDIAN_ORDER SIMDE_ENDIAN_LITTLE
6952 #  elif defined(sun) || defined(__sun) /* Solaris */
6953 #    include <sys/byteorder.h>
6954 #    if defined(_LITTLE_ENDIAN)
6955 #      define SIMDE_ENDIAN_ORDER SIMDE_ENDIAN_LITTLE
6956 #    elif defined(_BIG_ENDIAN)
6957 #      define SIMDE_ENDIAN_ORDER SIMDE_ENDIAN_BIG
6958 #    endif
6959 #  elif defined(__APPLE__)
6960 #    include <libkern/OSByteOrder.h>
6961 #    if defined(__LITTLE_ENDIAN__)
6962 #      define SIMDE_ENDIAN_ORDER SIMDE_ENDIAN_LITTLE
6963 #    elif defined(__BIG_ENDIAN__)
6964 #      define SIMDE_ENDIAN_ORDER SIMDE_ENDIAN_BIG
6965 #    endif
6966 #  elif defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || defined(__bsdi__) || defined(__DragonFly__) || defined(BSD)
6967 #    include <machine/endian.h>
6968 #    if defined(__BYTE_ORDER) && (__BYTE_ORDER == __LITTLE_ENDIAN)
6969 #      define SIMDE_ENDIAN_ORDER SIMDE_ENDIAN_LITTLE
6970 #    elif defined(__BYTE_ORDER) && (__BYTE_ORDER == __BIG_ENDIAN)
6971 #      define SIMDE_ENDIAN_ORDER SIMDE_ENDIAN_BIG
6972 #    endif
6973 #  elif defined(__linux__) || defined(__linux) || defined(__gnu_linux__)
6974 #    include <endian.h>
6975 #    if defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && (__BYTE_ORDER == __LITTLE_ENDIAN)
6976 #      define SIMDE_ENDIAN_ORDER SIMDE_ENDIAN_LITTLE
6977 #    elif defined(__BYTE_ORDER) && defined(__BIG_ENDIAN) && (__BYTE_ORDER == __BIG_ENDIAN)
6978 #      define SIMDE_ENDIAN_ORDER SIMDE_ENDIAN_BIG
6979 #    endif
6980 #  endif
6981 #endif
6982 
6983 #if \
6984     HEDLEY_HAS_BUILTIN(__builtin_bswap64) || \
6985     HEDLEY_GCC_VERSION_CHECK(4,3,0) || \
6986     HEDLEY_IBM_VERSION_CHECK(13,1,0) || \
6987     HEDLEY_INTEL_VERSION_CHECK(13,0,0)
6988   #define simde_bswap64(v) __builtin_bswap64(v)
6989 #elif HEDLEY_MSVC_VERSION_CHECK(13,10,0)
6990   #define simde_bswap64(v) _byteswap_uint64(v)
6991 #else
6992   SIMDE_FUNCTION_ATTRIBUTES
6993   uint64_t
6994   simde_bswap64(uint64_t v) {
6995     return
6996       ((v & (((uint64_t) 0xff) << 56)) >> 56) |
6997       ((v & (((uint64_t) 0xff) << 48)) >> 40) |
6998       ((v & (((uint64_t) 0xff) << 40)) >> 24) |
6999       ((v & (((uint64_t) 0xff) << 32)) >>  8) |
7000       ((v & (((uint64_t) 0xff) << 24)) <<  8) |
7001       ((v & (((uint64_t) 0xff) << 16)) << 24) |
7002       ((v & (((uint64_t) 0xff) <<  8)) << 40) |
7003       ((v & (((uint64_t) 0xff)      )) << 56);
7004   }
7005 #endif
7006 
7007 #if !defined(SIMDE_ENDIAN_ORDER)
7008 #  error Unknown byte order; please file a bug
7009 #else
7010 #  if SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE
7011 #    define simde_endian_bswap64_be(value) simde_bswap64(value)
7012 #    define simde_endian_bswap64_le(value) (value)
7013 #  elif SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_BIG
7014 #    define simde_endian_bswap64_be(value) (value)
7015 #    define simde_endian_bswap64_le(value) simde_bswap64(value)
7016 #  endif
7017 #endif
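
/* Illustrative sketch (editor's example; the helper below is a hypothetical
 * name, not part of SIMDe): converting a big-endian 64-bit value to host
 * order is a byte swap on little-endian targets and a no-op on big-endian
 * ones, so the macro picks the right form at compile time. */
SIMDE_FUNCTION_ATTRIBUTES
uint64_t
simde_example_be64_to_host_(uint64_t v) {
  return simde_endian_bswap64_be(v);
}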
7018 
7019 /* TODO: we should at least make an attempt to detect the correct
7020    types for simde_float32/float64 instead of just assuming float and
7021    double. */
7022 
7023 #if !defined(SIMDE_FLOAT32_TYPE)
7024 #  define SIMDE_FLOAT32_TYPE float
7025 #  define SIMDE_FLOAT32_C(value) value##f
7026 #else
7027 #  define SIMDE_FLOAT32_C(value) ((SIMDE_FLOAT32_TYPE) value)
7028 #endif
7029 typedef SIMDE_FLOAT32_TYPE simde_float32;
7030 
7031 #if !defined(SIMDE_FLOAT64_TYPE)
7032 #  define SIMDE_FLOAT64_TYPE double
7033 #  define SIMDE_FLOAT64_C(value) value
7034 #else
7035 #  define SIMDE_FLOAT64_C(value) ((SIMDE_FLOAT64_TYPE) value)
7036 #endif
7037 typedef SIMDE_FLOAT64_TYPE simde_float64;
7038 
7039 #if defined(__cplusplus)
7040   typedef bool simde_bool;
7041 #elif defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
7042   typedef _Bool simde_bool;
7043 #elif defined(bool)
7044   typedef bool simde_bool;
7045 #else
7046   #include <stdbool.h>
7047   typedef bool simde_bool;
7048 #endif
7049 
7050 #if HEDLEY_HAS_WARNING("-Wbad-function-cast")
7051 #  define SIMDE_CONVERT_FTOI(T,v) \
7052     HEDLEY_DIAGNOSTIC_PUSH \
7053     _Pragma("clang diagnostic ignored \"-Wbad-function-cast\"") \
7054     HEDLEY_STATIC_CAST(T, (v)) \
7055     HEDLEY_DIAGNOSTIC_POP
7056 #else
7057 #  define SIMDE_CONVERT_FTOI(T,v) ((T) (v))
7058 #endif
7059 
7060 /* TODO: detect compilers which support this outside of C11 mode */
7061 #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)
7062   #define SIMDE_CHECKED_REINTERPRET_CAST(to, from, value) _Generic((value), to: (value), default: (_Generic((value), from: ((to) (value)))))
7063   #define SIMDE_CHECKED_STATIC_CAST(to, from, value) _Generic((value), to: (value), default: (_Generic((value), from: ((to) (value)))))
7064 #else
7065   #define SIMDE_CHECKED_REINTERPRET_CAST(to, from, value) HEDLEY_REINTERPRET_CAST(to, value)
7066   #define SIMDE_CHECKED_STATIC_CAST(to, from, value) HEDLEY_STATIC_CAST(to, value)
7067 #endif
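
/* Illustrative sketch (editor's example; the helper below is a hypothetical
 * name, not part of SIMDe): in C11 mode the _Generic expansion above only
 * compiles when `value` really has the stated `from` type (or already has
 * the `to` type), so an accidental change to the argument's type is caught
 * at compile time instead of being silently cast away. */
SIMDE_FUNCTION_ATTRIBUTES
size_t
simde_example_int_to_size_(int value) {
  return SIMDE_CHECKED_STATIC_CAST(size_t, int, value);
}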
7068 
7069 #if HEDLEY_HAS_WARNING("-Wfloat-equal")
7070 #  define SIMDE_DIAGNOSTIC_DISABLE_FLOAT_EQUAL _Pragma("clang diagnostic ignored \"-Wfloat-equal\"")
7071 #elif HEDLEY_GCC_VERSION_CHECK(3,0,0)
7072 #  define SIMDE_DIAGNOSTIC_DISABLE_FLOAT_EQUAL _Pragma("GCC diagnostic ignored \"-Wfloat-equal\"")
7073 #else
7074 #  define SIMDE_DIAGNOSTIC_DISABLE_FLOAT_EQUAL
7075 #endif
7076 
7077 /* Some functions can trade accuracy for speed.  For those functions
7078    you can control the trade-off using this macro.  Possible values:
7079 
7080    0: prefer speed
7081    1: reasonable trade-offs
7082    2: prefer accuracy */
7083 #if !defined(SIMDE_ACCURACY_PREFERENCE)
7084 #  define SIMDE_ACCURACY_PREFERENCE 1
7085 #endif
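
/* Illustrative sketch (editor's example; the helper below is a hypothetical
 * name, not part of SIMDe): consumers define SIMDE_ACCURACY_PREFERENCE
 * before including any SIMDe header, and implementations can branch on it
 * when a faster but less accurate path exists.  Here `approx` stands in for
 * a hardware reciprocal estimate which is either returned as-is or refined
 * with one Newton-Raphson step. */
SIMDE_FUNCTION_ATTRIBUTES
simde_float32
simde_example_refine_recip_(simde_float32 x, simde_float32 approx) {
  #if SIMDE_ACCURACY_PREFERENCE == 0
    (void) x;
    return approx;
  #else
    return approx * (SIMDE_FLOAT32_C(2.0) - x * approx);
  #endif
}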
7086 
7087 #if defined(__STDC_HOSTED__)
7088 #  define SIMDE_STDC_HOSTED __STDC_HOSTED__
7089 #else
7090 #  if \
7091      defined(HEDLEY_PGI_VERSION) || \
7092      defined(HEDLEY_MSVC_VERSION)
7093 #    define SIMDE_STDC_HOSTED 1
7094 #  else
7095 #    define SIMDE_STDC_HOSTED 0
7096 #  endif
7097 #endif
7098 
7099 /* Try to deal with environments without a standard library. */
7100 #if !defined(simde_memcpy)
7101   #if HEDLEY_HAS_BUILTIN(__builtin_memcpy)
7102     #define simde_memcpy(dest, src, n) __builtin_memcpy(dest, src, n)
7103   #endif
7104 #endif
7105 #if !defined(simde_memset)
7106   #if HEDLEY_HAS_BUILTIN(__builtin_memset)
7107     #define simde_memset(s, c, n) __builtin_memset(s, c, n)
7108   #endif
7109 #endif
7110 #if !defined(simde_memcmp)
7111   #if HEDLEY_HAS_BUILTIN(__builtin_memcmp)
7112     #define simde_memcmp(s1, s2, n) __builtin_memcmp(s1, s2, n)
7113   #endif
7114 #endif
7115 
7116 #if !defined(simde_memcpy) || !defined(simde_memset) || !defined(simde_memcmp)
7117   #if !defined(SIMDE_NO_STRING_H)
7118     #if defined(__has_include)
7119       #if !__has_include(<string.h>)
7120         #define SIMDE_NO_STRING_H
7121       #endif
7122     #elif (SIMDE_STDC_HOSTED == 0)
7123       #define SIMDE_NO_STRING_H
7124     #endif
7125   #endif
7126 
7127   #if !defined(SIMDE_NO_STRING_H)
7128     #include <string.h>
7129     #if !defined(simde_memcpy)
7130       #define simde_memcpy(dest, src, n) memcpy(dest, src, n)
7131     #endif
7132     #if !defined(simde_memset)
7133       #define simde_memset(s, c, n) memset(s, c, n)
7134     #endif
7135     #if !defined(simde_memcmp)
7136       #define simde_memcmp(s1, s2, n) memcmp(s1, s2, n)
7137     #endif
7138   #else
7139     /* These are meant to be portable, not fast.  If you're hitting them you
7140      * should think about providing your own (by defining the simde_memcpy
7141      * macro prior to including any SIMDe files) or submitting a patch to
7142      * SIMDe so we can detect your system-provided memcpy/memset, like by
7143      * adding your compiler to the checks for __builtin_memcpy and/or
7144      * __builtin_memset. */
7145     #if !defined(simde_memcpy)
7146       SIMDE_FUNCTION_ATTRIBUTES
7147       void
7148       simde_memcpy_(void* dest, const void* src, size_t len) {
7149         char* dest_ = HEDLEY_STATIC_CAST(char*, dest);
7150         const char* src_ = HEDLEY_STATIC_CAST(const char*, src);
7151         for (size_t i = 0 ; i < len ; i++) {
7152           dest_[i] = src_[i];
7153         }
7154       }
7155       #define simde_memcpy(dest, src, n) simde_memcpy_(dest, src, n)
7156     #endif
7157 
7158     #if !defined(simde_memset)
7159       SIMDE_FUNCTION_ATTRIBUTES
7160       void
7161       simde_memset_(void* s, int c, size_t len) {
7162         char* s_ = HEDLEY_STATIC_CAST(char*, s);
7163         char c_ = HEDLEY_STATIC_CAST(char, c);
7164         for (size_t i = 0 ; i < len ; i++) {
7165           s_[i] = c_;
7166         }
7167       }
7168       #define simde_memset(s, c, n) simde_memset_(s, c, n)
7169     #endif
7170 
7171     #if !defined(simde_memcmp)
7172       SIMDE_FUNCTION_ATTRIBUTES
7173       int
7174       simde_memcmp_(const void *s1, const void *s2, size_t n) {
7175         const unsigned char* s1_ = HEDLEY_STATIC_CAST(const unsigned char*, s1);
7176         const unsigned char* s2_ = HEDLEY_STATIC_CAST(const unsigned char*, s2);
7177         for (size_t i = 0 ; i < n ; i++) {
7178           if (s1_[i] != s2_[i]) {
7179             return (int) (s1_[i] - s2_[i]);
7180           }
7181         }
7182         return 0;
7183       }
7184     #define simde_memcmp(s1, s2, n) simde_memcmp_(s1, s2, n)
7185     #endif
7186   #endif
7187 #endif
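
/* For example, a freestanding consumer can point SIMDe at its own routines
 * by defining these macros before the first SIMDe include (sketch only;
 * my_memcpy/my_memset/my_memcmp are hypothetical functions):
 *
 *   #define simde_memcpy(dest, src, n)  my_memcpy((dest), (src), (n))
 *   #define simde_memset(s, c, n)       my_memset((s), (c), (n))
 *   #define simde_memcmp(s1, s2, n)     my_memcmp((s1), (s2), (n))
 *   #include "simde/x86/avx2.h"
 */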
7188 
7189 #if defined(FE_ALL_EXCEPT)
7190   #define SIMDE_HAVE_FENV_H
7191 #elif defined(__has_include)
7192   #if __has_include(<fenv.h>)
7193     #include <fenv.h>
7194     #define SIMDE_HAVE_FENV_H
7195   #endif
7196 #elif SIMDE_STDC_HOSTED == 1
7197   #include <fenv.h>
7198   #define SIMDE_HAVE_FENV_H
7199 #endif
7200 
7201 #if defined(EXIT_FAILURE)
7202   #define SIMDE_HAVE_STDLIB_H
7203 #elif defined(__has_include)
7204   #if __has_include(<stdlib.h>)
7205     #include <stdlib.h>
7206     #define SIMDE_HAVE_STDLIB_H
7207   #endif
7208 #elif SIMDE_STDC_HOSTED == 1
7209   #include <stdlib.h>
7210   #define SIMDE_HAVE_STDLIB_H
7211 #endif
7212 
7213 #if defined(__has_include)
7214 #  if defined(__cplusplus) && (__cplusplus >= 201103L) && __has_include(<cfenv>)
7215 #    include <cfenv>
7216 #  elif __has_include(<fenv.h>)
7217 #    include <fenv.h>
7218 #  endif
7219 #  if __has_include(<stdlib.h>)
7220 #    include <stdlib.h>
7221 #  endif
7222 #elif SIMDE_STDC_HOSTED == 1
7223 #  include <stdlib.h>
7224 #  include <fenv.h>
7225 #endif
7226 
7227 #define SIMDE_DEFINE_CONVERSION_FUNCTION_(Name, T_To, T_From) \
7228   static HEDLEY_ALWAYS_INLINE HEDLEY_CONST \
7229   T_To \
7230   Name (T_From value) { \
7231     T_To r; \
7232     simde_memcpy(&r, &value, sizeof(r)); \
7233     return r; \
7234   }
7235 
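/* Illustrative sketch (editor's example; the function name below is a
 * hypothetical one, not something SIMDe defines here): expanding the macro
 * yields a small type-punning helper that copies the bytes of a
 * simde_float32 into a uint32_t, avoiding the aliasing problems of a
 * pointer cast. */
SIMDE_DEFINE_CONVERSION_FUNCTION_(simde_example_float32_as_uint32_, uint32_t, simde_float32)
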
7236 /* AUTOMATICALLY GENERATED FILE, DO NOT MODIFY */
7237 /* e8b7a2ec175ceb3725ce0827ef9a6725b6309cc9 */
7238 /* :: Begin ../simde/simde/check.h :: */
7239 /* Check (assertions)
7240  * Portable Snippets - https://github.com/nemequ/portable-snippets
7241  * Created by Evan Nemerson <evan@nemerson.com>
7242  *
7243  *   To the extent possible under law, the authors have waived all
7244  *   copyright and related or neighboring rights to this code.  For
7245  *   details, see the Creative Commons Zero 1.0 Universal license at
7246  *   https://creativecommons.org/publicdomain/zero/1.0/
7247  *
7248  * SPDX-License-Identifier: CC0-1.0
7249  */
7250 
7251 #if !defined(SIMDE_CHECK_H)
7252 #define SIMDE_CHECK_H
7253 
7254 #if !defined(SIMDE_NDEBUG) && !defined(SIMDE_DEBUG)
7255 #  define SIMDE_NDEBUG 1
7256 #endif
7257 
7258 /* AUTOMATICALLY GENERATED FILE, DO NOT MODIFY */
7259 /* e8b7a2ec175ceb3725ce0827ef9a6725b6309cc9 */
7260 /* AUTOMATICALLY GENERATED FILE, DO NOT MODIFY */
7261 /* e8b7a2ec175ceb3725ce0827ef9a6725b6309cc9 */
7262 #include <stdint.h>
7263 
7264 #if !defined(_WIN32)
7265 #  define SIMDE_SIZE_MODIFIER "z"
7266 #  define SIMDE_CHAR_MODIFIER "hh"
7267 #  define SIMDE_SHORT_MODIFIER "h"
7268 #else
7269 #  if defined(_M_X64) || defined(__amd64__)
7270 #    define SIMDE_SIZE_MODIFIER "I64"
7271 #  else
7272 #    define SIMDE_SIZE_MODIFIER ""
7273 #  endif
7274 #  define SIMDE_CHAR_MODIFIER ""
7275 #  define SIMDE_SHORT_MODIFIER ""
7276 #endif
7277 
7278 #if defined(_MSC_VER) &&  (_MSC_VER >= 1500)
7279 #  define SIMDE_PUSH_DISABLE_MSVC_C4127_ __pragma(warning(push)) __pragma(warning(disable:4127))
7280 #  define SIMDE_POP_DISABLE_MSVC_C4127_ __pragma(warning(pop))
7281 #else
7282 #  define SIMDE_PUSH_DISABLE_MSVC_C4127_
7283 #  define SIMDE_POP_DISABLE_MSVC_C4127_
7284 #endif
7285 
7286 #if !defined(simde_errorf)
7287 #  if defined(__has_include)
7288 #    if __has_include(<stdio.h>)
7289 #      include <stdio.h>
7290 #    endif
7291 #  elif defined(SIMDE_STDC_HOSTED)
7292 #    if SIMDE_STDC_HOSTED == 1
7293 #      include <stdio.h>
7294 #    endif
7295 #  elif defined(__STDC_HOSTED__)
7296 #    if __STDC_HOSTED__ == 1
7297 #      include <stdio.h>
7298 #    endif
7299 #  endif
7300 
7301 /* AUTOMATICALLY GENERATED FILE, DO NOT MODIFY */
7302 /* e8b7a2ec175ceb3725ce0827ef9a6725b6309cc9 */
7303 /* :: Begin ../simde/simde/debug-trap.h :: */
7304 /* Debugging assertions and traps
7305  * Portable Snippets - https://github.com/nemequ/portable-snippets
7306  * Created by Evan Nemerson <evan@nemerson.com>
7307  *
7308  *   To the extent possible under law, the authors have waived all
7309  *   copyright and related or neighboring rights to this code.  For
7310  *   details, see the Creative Commons Zero 1.0 Universal license at
7311  *   https://creativecommons.org/publicdomain/zero/1.0/
7312  *
7313  * SPDX-License-Identifier: CC0-1.0
7314  */
7315 
7316 #if !defined(SIMDE_DEBUG_TRAP_H)
7317 #define SIMDE_DEBUG_TRAP_H
7318 
7319 #if !defined(SIMDE_NDEBUG) && defined(NDEBUG) && !defined(SIMDE_DEBUG)
7320 #  define SIMDE_NDEBUG 1
7321 #endif
7322 
7323 #if defined(__has_builtin) && !defined(__ibmxl__)
7324 #  if __has_builtin(__builtin_debugtrap)
7325 #    define simde_trap() __builtin_debugtrap()
7326 #  elif __has_builtin(__debugbreak)
7327 #    define simde_trap() __debugbreak()
7328 #  endif
7329 #endif
7330 #if !defined(simde_trap)
7331 #  if defined(_MSC_VER) || defined(__INTEL_COMPILER)
7332 #    define simde_trap() __debugbreak()
7333 #  elif defined(__ARMCC_VERSION)
7334 #    define simde_trap() __breakpoint(42)
7335 #  elif defined(__ibmxl__) || defined(__xlC__)
7336 #    include <builtins.h>
7337 #    define simde_trap() __trap(42)
7338 #  elif defined(__DMC__) && defined(_M_IX86)
7339      static inline void simde_trap(void) { __asm int 3h; }
7340 #  elif defined(__i386__) || defined(__x86_64__)
7341      static inline void simde_trap(void) { __asm__ __volatile__("int $03"); }
7342 #  elif defined(__thumb__)
7343      static inline void simde_trap(void) { __asm__ __volatile__(".inst 0xde01"); }
7344 #  elif defined(__aarch64__)
7345      static inline void simde_trap(void) { __asm__ __volatile__(".inst 0xd4200000"); }
7346 #  elif defined(__arm__)
7347      static inline void simde_trap(void) { __asm__ __volatile__(".inst 0xe7f001f0"); }
7348 #  elif defined (__alpha__) && !defined(__osf__)
7349      static inline void simde_trap(void) { __asm__ __volatile__("bpt"); }
7350 #  elif defined(_54_)
7351      static inline void simde_trap(void) { __asm__ __volatile__("ESTOP"); }
7352 #  elif defined(_55_)
7353      static inline void simde_trap(void) { __asm__ __volatile__(";\n .if (.MNEMONIC)\n ESTOP_1\n .else\n ESTOP_1()\n .endif\n NOP"); }
7354 #  elif defined(_64P_)
7355      static inline void simde_trap(void) { __asm__ __volatile__("SWBP 0"); }
7356 #  elif defined(_6x_)
7357      static inline void simde_trap(void) { __asm__ __volatile__("NOP\n .word 0x10000000"); }
7358 #  elif defined(__STDC_HOSTED__) && (__STDC_HOSTED__ == 0) && defined(__GNUC__)
7359 #    define simde_trap() __builtin_trap()
7360 #  else
7361 #    include <signal.h>
7362 #    if defined(SIGTRAP)
7363 #      define simde_trap() raise(SIGTRAP)
7364 #    else
7365 #      define simde_trap() raise(SIGABRT)
7366 #    endif
7367 #  endif
7368 #endif
7369 
7370 #if defined(HEDLEY_LIKELY)
7371 #  define SIMDE_DBG_LIKELY(expr) HEDLEY_LIKELY(expr)
7372 #elif defined(__GNUC__) && (__GNUC__ >= 3)
7373 #  define SIMDE_DBG_LIKELY(expr) __builtin_expect(!!(expr), 1)
7374 #else
7375 #  define SIMDE_DBG_LIKELY(expr) (!!(expr))
7376 #endif
7377 
7378 #if !defined(SIMDE_NDEBUG) || (SIMDE_NDEBUG == 0)
7379 #  define simde_dbg_assert(expr) do { \
7380     if (!SIMDE_DBG_LIKELY(expr)) { \
7381       simde_trap(); \
7382     } \
7383   } while (0)
7384 #else
7385 #  define simde_dbg_assert(expr)
7386 #endif
7387 
7388 #endif /* !defined(SIMDE_DEBUG_TRAP_H) */
7389 /* :: End ../simde/simde/debug-trap.h :: */
7390 
7391    HEDLEY_DIAGNOSTIC_PUSH
7392    SIMDE_DIAGNOSTIC_DISABLE_VARIADIC_MACROS_
7393 #  if defined(EOF)
7394 #    define simde_errorf(format, ...) (fprintf(stderr, format, __VA_ARGS__), abort())
7395 #  else
7396 #    define simde_errorf(format, ...) (simde_trap())
7397 #  endif
7398    HEDLEY_DIAGNOSTIC_POP
7399 #endif
7400 
7401 #define simde_error(msg) simde_errorf("%s", msg)
7402 
7403 #if defined(SIMDE_NDEBUG) || \
7404     (defined(__cplusplus) && (__cplusplus < 201103L)) || \
7405     (defined(__STDC__) && (__STDC__ < 199901L))
7406 #  if defined(SIMDE_CHECK_FAIL_DEFINED)
7407 #    define simde_assert(expr)
7408 #  else
7409 #    if defined(HEDLEY_ASSUME)
7410 #      define simde_assert(expr) HEDLEY_ASSUME(expr)
7411 #    elif HEDLEY_GCC_VERSION_CHECK(4,5,0)
7412 #      define simde_assert(expr) ((void) (!!(expr) ? 1 : (__builtin_unreachable(), 1)))
7413 #    elif HEDLEY_MSVC_VERSION_CHECK(13,10,0)
7414 #      define simde_assert(expr) __assume(expr)
7415 #    else
7416 #      define simde_assert(expr)
7417 #    endif
7418 #  endif
7419 #  define simde_assert_true(expr) simde_assert(expr)
7420 #  define simde_assert_false(expr) simde_assert(!(expr))
7421 #  define simde_assert_type_full(prefix, suffix, T, fmt, a, op, b) simde_assert(((a) op (b)))
7422 #  define simde_assert_double_equal(a, b, precision)
7423 #  define simde_assert_string_equal(a, b)
7424 #  define simde_assert_string_not_equal(a, b)
7425 #  define simde_assert_memory_equal(size, a, b)
7426 #  define simde_assert_memory_not_equal(size, a, b)
7427 #else
7428 #  define simde_assert(expr) \
7429     do { \
7430       if (!HEDLEY_LIKELY(expr)) { \
7431         simde_error("assertion failed: " #expr "\n"); \
7432       } \
7433       SIMDE_PUSH_DISABLE_MSVC_C4127_ \
7434     } while (0) \
7435     SIMDE_POP_DISABLE_MSVC_C4127_
7436 
7437 #  define simde_assert_true(expr) \
7438     do { \
7439       if (!HEDLEY_LIKELY(expr)) { \
7440         simde_error("assertion failed: " #expr " is not true\n"); \
7441       } \
7442       SIMDE_PUSH_DISABLE_MSVC_C4127_ \
7443     } while (0) \
7444     SIMDE_POP_DISABLE_MSVC_C4127_
7445 
7446 #  define simde_assert_false(expr) \
7447     do { \
7448       if (!HEDLEY_LIKELY(!(expr))) { \
7449         simde_error("assertion failed: " #expr " is not false\n"); \
7450       } \
7451       SIMDE_PUSH_DISABLE_MSVC_C4127_ \
7452     } while (0) \
7453     SIMDE_POP_DISABLE_MSVC_C4127_
7454 
7455 #  define simde_assert_type_full(prefix, suffix, T, fmt, a, op, b)   \
7456     do { \
7457       T simde_tmp_a_ = (a); \
7458       T simde_tmp_b_ = (b); \
7459       if (!(simde_tmp_a_ op simde_tmp_b_)) { \
7460         simde_errorf("assertion failed: %s %s %s (" prefix "%" fmt suffix " %s " prefix "%" fmt suffix ")\n", \
7461                      #a, #op, #b, simde_tmp_a_, #op, simde_tmp_b_); \
7462       } \
7463       SIMDE_PUSH_DISABLE_MSVC_C4127_ \
7464     } while (0) \
7465     SIMDE_POP_DISABLE_MSVC_C4127_
7466 
7467 #  define simde_assert_double_equal(a, b, precision) \
7468     do { \
7469       const double simde_tmp_a_ = (a); \
7470       const double simde_tmp_b_ = (b); \
7471       const double simde_tmp_diff_ = ((simde_tmp_a_ - simde_tmp_b_) < 0) ? \
7472         -(simde_tmp_a_ - simde_tmp_b_) : \
7473         (simde_tmp_a_ - simde_tmp_b_); \
7474       if (HEDLEY_UNLIKELY(simde_tmp_diff_ > 1e-##precision)) { \
7475         simde_errorf("assertion failed: %s == %s (%0." #precision "g == %0." #precision "g)\n", \
7476                      #a, #b, simde_tmp_a_, simde_tmp_b_); \
7477       } \
7478       SIMDE_PUSH_DISABLE_MSVC_C4127_ \
7479     } while (0) \
7480     SIMDE_POP_DISABLE_MSVC_C4127_
7481 
7482 #  include <string.h>
7483 #  define simde_assert_string_equal(a, b) \
7484     do { \
7485       const char* simde_tmp_a_ = a; \
7486       const char* simde_tmp_b_ = b; \
7487       if (HEDLEY_UNLIKELY(strcmp(simde_tmp_a_, simde_tmp_b_) != 0)) { \
7488         simde_errorf("assertion failed: string %s == %s (\"%s\" == \"%s\")\n", \
7489                      #a, #b, simde_tmp_a_, simde_tmp_b_); \
7490       } \
7491       SIMDE_PUSH_DISABLE_MSVC_C4127_ \
7492     } while (0) \
7493     SIMDE_POP_DISABLE_MSVC_C4127_
7494 
7495 #  define simde_assert_string_not_equal(a, b) \
7496     do { \
7497       const char* simde_tmp_a_ = a; \
7498       const char* simde_tmp_b_ = b; \
7499       if (HEDLEY_UNLIKELY(strcmp(simde_tmp_a_, simde_tmp_b_) == 0)) { \
7500         simde_errorf("assertion failed: string %s != %s (\"%s\" == \"%s\")\n", \
7501                      #a, #b, simde_tmp_a_, simde_tmp_b_); \
7502       } \
7503       SIMDE_PUSH_DISABLE_MSVC_C4127_ \
7504     } while (0) \
7505     SIMDE_POP_DISABLE_MSVC_C4127_
7506 
7507 #  define simde_assert_memory_equal(size, a, b) \
7508     do { \
7509       const unsigned char* simde_tmp_a_ = (const unsigned char*) (a); \
7510       const unsigned char* simde_tmp_b_ = (const unsigned char*) (b); \
7511       const size_t simde_tmp_size_ = (size); \
7512       if (HEDLEY_UNLIKELY(memcmp(simde_tmp_a_, simde_tmp_b_, simde_tmp_size_)) != 0) { \
7513         size_t simde_tmp_pos_; \
7514         for (simde_tmp_pos_ = 0 ; simde_tmp_pos_ < simde_tmp_size_ ; simde_tmp_pos_++) { \
7515           if (simde_tmp_a_[simde_tmp_pos_] != simde_tmp_b_[simde_tmp_pos_]) { \
7516             simde_errorf("assertion failed: memory %s == %s, at offset %" SIMDE_SIZE_MODIFIER "u\n", \
7517                          #a, #b, simde_tmp_pos_); \
7518             break; \
7519           } \
7520         } \
7521       } \
7522       SIMDE_PUSH_DISABLE_MSVC_C4127_ \
7523     } while (0) \
7524     SIMDE_POP_DISABLE_MSVC_C4127_
7525 
7526 #  define simde_assert_memory_not_equal(size, a, b) \
7527     do { \
7528       const unsigned char* simde_tmp_a_ = (const unsigned char*) (a); \
7529       const unsigned char* simde_tmp_b_ = (const unsigned char*) (b); \
7530       const size_t simde_tmp_size_ = (size); \
7531       if (HEDLEY_UNLIKELY(memcmp(simde_tmp_a_, simde_tmp_b_, simde_tmp_size_)) == 0) { \
7532         simde_errorf("assertion failed: memory %s != %s (%" SIMDE_SIZE_MODIFIER "u bytes)\n", \
7533                      #a, #b, simde_tmp_size_); \
7534       } \
7535       SIMDE_PUSH_DISABLE_MSVC_C4127_ \
7536     } while (0) \
7537     SIMDE_POP_DISABLE_MSVC_C4127_
7538 #endif
7539 
7540 #define simde_assert_type(T, fmt, a, op, b) \
7541   simde_assert_type_full("", "", T, fmt, a, op, b)
7542 
7543 #define simde_assert_char(a, op, b) \
7544   simde_assert_type_full("'\\x", "'", char, "02" SIMDE_CHAR_MODIFIER "x", a, op, b)
7545 #define simde_assert_uchar(a, op, b) \
7546   simde_assert_type_full("'\\x", "'", unsigned char, "02" SIMDE_CHAR_MODIFIER "x", a, op, b)
7547 #define simde_assert_short(a, op, b) \
7548   simde_assert_type(short, SIMDE_SHORT_MODIFIER "d", a, op, b)
7549 #define simde_assert_ushort(a, op, b) \
7550   simde_assert_type(unsigned short, SIMDE_SHORT_MODIFIER "u", a, op, b)
7551 #define simde_assert_int(a, op, b) \
7552   simde_assert_type(int, "d", a, op, b)
7553 #define simde_assert_uint(a, op, b) \
7554   simde_assert_type(unsigned int, "u", a, op, b)
7555 #define simde_assert_long(a, op, b) \
7556   simde_assert_type(long int, "ld", a, op, b)
7557 #define simde_assert_ulong(a, op, b) \
7558   simde_assert_type(unsigned long int, "lu", a, op, b)
7559 #define simde_assert_llong(a, op, b) \
7560   simde_assert_type(long long int, "lld", a, op, b)
7561 #define simde_assert_ullong(a, op, b) \
7562   simde_assert_type(unsigned long long int, "llu", a, op, b)
7563 
7564 #define simde_assert_size(a, op, b) \
7565   simde_assert_type(size_t, SIMDE_SIZE_MODIFIER "u", a, op, b)
7566 
7567 #define simde_assert_float(a, op, b) \
7568   simde_assert_type(float, "f", a, op, b)
7569 #define simde_assert_double(a, op, b) \
7570   simde_assert_type(double, "g", a, op, b)
7571 #define simde_assert_ptr(a, op, b) \
7572   simde_assert_type(const void*, "p", a, op, b)
7573 
7574 #define simde_assert_int8(a, op, b) \
7575   simde_assert_type(int8_t, PRIi8, a, op, b)
7576 #define simde_assert_uint8(a, op, b) \
7577   simde_assert_type(uint8_t, PRIu8, a, op, b)
7578 #define simde_assert_int16(a, op, b) \
7579   simde_assert_type(int16_t, PRIi16, a, op, b)
7580 #define simde_assert_uint16(a, op, b) \
7581   simde_assert_type(uint16_t, PRIu16, a, op, b)
7582 #define simde_assert_int32(a, op, b) \
7583   simde_assert_type(int32_t, PRIi32, a, op, b)
7584 #define simde_assert_uint32(a, op, b) \
7585   simde_assert_type(uint32_t, PRIu32, a, op, b)
7586 #define simde_assert_int64(a, op, b) \
7587   simde_assert_type(int64_t, PRIi64, a, op, b)
7588 #define simde_assert_uint64(a, op, b) \
7589   simde_assert_type(uint64_t, PRIu64, a, op, b)
7590 
7591 #define simde_assert_ptr_equal(a, b) \
7592   simde_assert_ptr(a, ==, b)
7593 #define simde_assert_ptr_not_equal(a, b) \
7594   simde_assert_ptr(a, !=, b)
7595 #define simde_assert_null(ptr) \
7596   simde_assert_ptr(ptr, ==, NULL)
7597 #define simde_assert_not_null(ptr) \
7598   simde_assert_ptr(ptr, !=, NULL)
7599 #define simde_assert_ptr_null(ptr) \
7600   simde_assert_ptr(ptr, ==, NULL)
7601 #define simde_assert_ptr_not_null(ptr) \
7602   simde_assert_ptr(ptr, !=, NULL)
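
/* Illustrative sketch (editor's example; the helper below is a hypothetical
 * name, not part of SIMDe's test machinery): the typed wrappers above expand
 * to simde_assert_type_full with the matching printf format modifiers, and
 * with SIMDE_NDEBUG (the default) they reduce to no-ops or optimizer hints. */
SIMDE_FUNCTION_ATTRIBUTES
void
simde_example_check_sum_(int a, int b, int expected_sum) {
  simde_assert_int(a + b, ==, expected_sum);
  simde_assert_size(sizeof(int), <=, sizeof(long));
}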
7603 
7604 #endif /* !defined(SIMDE_CHECK_H) */
7605 /* :: End ../simde/simde/check.h :: */
7606 
7607 /* GCC/clang have a bunch of functionality in builtins which we would
7608  * like to access, but the suffixes indicate whether they operate on
7609  * int, long, or long long, not fixed width types (e.g., int32_t).
7610  * We use these macros to attempt to map from fixed-width to the
7611  * names GCC uses.  Note that you should still cast the input(s) and
7612  * return values (to/from SIMDE_BUILTIN_TYPE_*_) since often even if
7613  * types are the same size they may not be compatible according to the
7614  * compiler.  For example, on x86 long and long long are generally
7615  * both 64 bits, but platforms vary on whether an int64_t is mapped
7616  * to a long or long long. */
7617 
7618 #include <limits.h>
7619 
7620 HEDLEY_DIAGNOSTIC_PUSH
7621 SIMDE_DIAGNOSTIC_DISABLE_CPP98_COMPAT_PEDANTIC_
7622 
7623 #if (INT8_MAX == INT_MAX) && (INT8_MIN == INT_MIN)
7624   #define SIMDE_BUILTIN_SUFFIX_8_
7625   #define SIMDE_BUILTIN_TYPE_8_ int
7626 #elif (INT8_MAX == LONG_MAX) && (INT8_MIN == LONG_MIN)
7627   #define SIMDE_BUILTIN_SUFFIX_8_ l
7628   #define SIMDE_BUILTIN_TYPE_8_ long
7629 #elif (INT8_MAX == LLONG_MAX) && (INT8_MIN == LLONG_MIN)
7630   #define SIMDE_BUILTIN_SUFFIX_8_ ll
7631   #define SIMDE_BUILTIN_TYPE_8_ long long
7632 #endif
7633 
7634 #if (INT16_MAX == INT_MAX) && (INT16_MIN == INT_MIN)
7635   #define SIMDE_BUILTIN_SUFFIX_16_
7636   #define SIMDE_BUILTIN_TYPE_16_ int
7637 #elif (INT16_MAX == LONG_MAX) && (INT16_MIN == LONG_MIN)
7638   #define SIMDE_BUILTIN_SUFFIX_16_ l
7639   #define SIMDE_BUILTIN_TYPE_16_ long
7640 #elif (INT16_MAX == LLONG_MAX) && (INT16_MIN == LLONG_MIN)
7641   #define SIMDE_BUILTIN_SUFFIX_16_ ll
7642   #define SIMDE_BUILTIN_TYPE_16_ long long
7643 #endif
7644 
7645 #if (INT32_MAX == INT_MAX) && (INT32_MIN == INT_MIN)
7646   #define SIMDE_BUILTIN_SUFFIX_32_
7647   #define SIMDE_BUILTIN_TYPE_32_ int
7648 #elif (INT32_MAX == LONG_MAX) && (INT32_MIN == LONG_MIN)
7649   #define SIMDE_BUILTIN_SUFFIX_32_ l
7650   #define SIMDE_BUILTIN_TYPE_32_ long
7651 #elif (INT32_MAX == LLONG_MAX) && (INT32_MIN == LLONG_MIN)
7652   #define SIMDE_BUILTIN_SUFFIX_32_ ll
7653   #define SIMDE_BUILTIN_TYPE_32_ long long
7654 #endif
7655 
7656 #if (INT64_MAX == INT_MAX) && (INT64_MIN == INT_MIN)
7657   #define SIMDE_BUILTIN_SUFFIX_64_
7658   #define SIMDE_BUILTIN_TYPE_64_ int
7659 #elif (INT64_MAX == LONG_MAX) && (INT64_MIN == LONG_MIN)
7660   #define SIMDE_BUILTIN_SUFFIX_64_ l
7661   #define SIMDE_BUILTIN_TYPE_64_ long
7662 #elif (INT64_MAX == LLONG_MAX) && (INT64_MIN == LLONG_MIN)
7663   #define SIMDE_BUILTIN_SUFFIX_64_ ll
7664   #define SIMDE_BUILTIN_TYPE_64_ long long
7665 #endif
7666 
7667 #if defined(SIMDE_BUILTIN_SUFFIX_8_)
7668   #define SIMDE_BUILTIN_8_(name) HEDLEY_CONCAT3(__builtin_, name, SIMDE_BUILTIN_SUFFIX_8_)
7669   #define SIMDE_BUILTIN_HAS_8_(name) HEDLEY_HAS_BUILTIN(HEDLEY_CONCAT3(__builtin_, name, SIMDE_BUILTIN_SUFFIX_8_))
7670 #else
7671   #define SIMDE_BUILTIN_HAS_8_(name) 0
7672 #endif
7673 #if defined(SIMDE_BUILTIN_SUFFIX_16_)
7674   #define SIMDE_BUILTIN_16_(name) HEDLEY_CONCAT3(__builtin_, name, SIMDE_BUILTIN_SUFFIX_16_)
7675   #define SIMDE_BUILTIN_HAS_16_(name) HEDLEY_HAS_BUILTIN(HEDLEY_CONCAT3(__builtin_, name, SIMDE_BUILTIN_SUFFIX_16_))
7676 #else
7677   #define SIMDE_BUILTIN_HAS_16_(name) 0
7678 #endif
7679 #if defined(SIMDE_BUILTIN_SUFFIX_32_)
7680   #define SIMDE_BUILTIN_32_(name) HEDLEY_CONCAT3(__builtin_, name, SIMDE_BUILTIN_SUFFIX_32_)
7681   #define SIMDE_BUILTIN_HAS_32_(name) HEDLEY_HAS_BUILTIN(HEDLEY_CONCAT3(__builtin_, name, SIMDE_BUILTIN_SUFFIX_32_))
7682 #else
7683   #define SIMDE_BUILTIN_HAS_32_(name) 0
7684 #endif
7685 #if defined(SIMDE_BUILTIN_SUFFIX_64_)
7686   #define SIMDE_BUILTIN_64_(name) HEDLEY_CONCAT3(__builtin_, name, SIMDE_BUILTIN_SUFFIX_64_)
7687   #define SIMDE_BUILTIN_HAS_64_(name) HEDLEY_HAS_BUILTIN(HEDLEY_CONCAT3(__builtin_, name, SIMDE_BUILTIN_SUFFIX_64_))
7688 #else
7689   #define SIMDE_BUILTIN_HAS_64_(name) 0
7690 #endif
7691 
7692 HEDLEY_DIAGNOSTIC_POP
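
/* Illustrative sketch (editor's example; the helper below is a hypothetical
 * name, not part of SIMDe): pick the __builtin_popcount* variant whose
 * suffix matches int64_t and, as the comment above recommends, cast the
 * argument through the matching SIMDE_BUILTIN_TYPE_64_ rather than assuming
 * uint64_t and unsigned long long are the same type. */
#if SIMDE_BUILTIN_HAS_64_(popcount)
SIMDE_FUNCTION_ATTRIBUTES
int
simde_example_popcount64_(uint64_t v) {
  return SIMDE_BUILTIN_64_(popcount)(HEDLEY_STATIC_CAST(unsigned SIMDE_BUILTIN_TYPE_64_, v));
}
#endif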
7693 
7694 /* Sometimes we run into problems with specific versions of compilers
7695    which make the native versions unusable for us.  Often this is due
7696    to missing functions, sometimes buggy implementations, etc.  These
7697    macros are how we check for specific bugs.  As they are fixed we'll
7698    start only defining them for problematic compiler versions. */
7699 
7700 #if !defined(SIMDE_IGNORE_COMPILER_BUGS)
7701 #  if defined(HEDLEY_GCC_VERSION)
7702 #    if !HEDLEY_GCC_VERSION_CHECK(4,9,0)
7703 #      define SIMDE_BUG_GCC_REV_208793
7704 #    endif
7705 #    if !HEDLEY_GCC_VERSION_CHECK(5,0,0)
7706 #      define SIMDE_BUG_GCC_BAD_MM_SRA_EPI32 /* TODO: find relevant bug or commit */
7707 #    endif
7708 #    if !HEDLEY_GCC_VERSION_CHECK(6,0,0)
7709 #      define SIMDE_BUG_GCC_SIZEOF_IMMEDIATE
7710 #    endif
7711 #    if !HEDLEY_GCC_VERSION_CHECK(4,6,0)
7712 #      define SIMDE_BUG_GCC_BAD_MM_EXTRACT_EPI8 /* TODO: find relevant bug or commit */
7713 #    endif
7714 #    if !HEDLEY_GCC_VERSION_CHECK(8,0,0)
7715 #      define SIMDE_BUG_GCC_REV_247851
7716 #    endif
7717 #    if !HEDLEY_GCC_VERSION_CHECK(10,0,0)
7718 #      define SIMDE_BUG_GCC_REV_274313
7719 #      define SIMDE_BUG_GCC_91341
7720 #    endif
7721 #    if !HEDLEY_GCC_VERSION_CHECK(9,0,0) && defined(SIMDE_ARCH_AARCH64)
7722 #      define SIMDE_BUG_GCC_ARM_SHIFT_SCALAR
7723 #    endif
7724 #    if !HEDLEY_GCC_VERSION_CHECK(9,0,0) && defined(SIMDE_ARCH_AARCH64)
7725 #      define SIMDE_BUG_GCC_BAD_VEXT_REV32
7726 #    endif
7727 #    if defined(SIMDE_ARCH_X86) && !defined(SIMDE_ARCH_AMD64)
7728 #      define SIMDE_BUG_GCC_94482
7729 #    endif
7730 #    if (defined(SIMDE_ARCH_X86) && !defined(SIMDE_ARCH_AMD64)) || defined(SIMDE_ARCH_ZARCH)
7731 #      define SIMDE_BUG_GCC_53784
7732 #    endif
7733 #    if defined(SIMDE_ARCH_X86) || defined(SIMDE_ARCH_AMD64)
7734 #      if HEDLEY_GCC_VERSION_CHECK(4,3,0) /* -Wsign-conversion */
7735 #        define SIMDE_BUG_GCC_95144
7736 #      endif
7737 #      if !HEDLEY_GCC_VERSION_CHECK(11,0,0)
7738 #        define SIMDE_BUG_GCC_95483
7739 #      endif
7740 #      define SIMDE_BUG_GCC_98521
7741 #    endif
7742 #    if !HEDLEY_GCC_VERSION_CHECK(9,4,0) && defined(SIMDE_ARCH_AARCH64)
7743 #      define SIMDE_BUG_GCC_94488
7744 #    endif
7745 #    if !HEDLEY_GCC_VERSION_CHECK(9,1,0) && defined(SIMDE_ARCH_AARCH64)
7746 #      define SIMDE_BUG_GCC_REV_264019
7747 #    endif
7748 #    if defined(SIMDE_ARCH_ARM)
7749 #      define SIMDE_BUG_GCC_95399
7750 #      define SIMDE_BUG_GCC_95471
7751 #    elif defined(SIMDE_ARCH_POWER)
7752 #      define SIMDE_BUG_GCC_95227
7753 #      define SIMDE_BUG_GCC_95782
7754 #    elif defined(SIMDE_ARCH_X86) || defined(SIMDE_ARCH_AMD64)
7755 #      if !HEDLEY_GCC_VERSION_CHECK(10,2,0) && !defined(__OPTIMIZE__)
7756 #        define SIMDE_BUG_GCC_96174
7757 #      endif
7758 #    elif defined(SIMDE_ARCH_ZARCH)
7759 #      if !HEDLEY_GCC_VERSION_CHECK(9,0,0)
7760 #        define SIMDE_BUG_GCC_95782
7761 #      endif
7762 #    endif
7763 #    define SIMDE_BUG_GCC_95399
7764 #  elif defined(__clang__)
7765 #    if defined(SIMDE_ARCH_AARCH64)
7766 #      define SIMDE_BUG_CLANG_45541
7767 #      define SIMDE_BUG_CLANG_46844
7768 #      define SIMDE_BUG_CLANG_48257
7769 #      if SIMDE_DETECT_CLANG_VERSION_CHECK(10,0,0) && SIMDE_DETECT_CLANG_VERSION_NOT(11,0,0)
7770 #        define SIMDE_BUG_CLANG_BAD_VI64_OPS
7771 #      endif
7772 #      if SIMDE_DETECT_CLANG_VERSION_NOT(9,0,0)
7773 #        define SIMDE_BUG_CLANG_GIT_4EC445B8
7774 #        define SIMDE_BUG_CLANG_REV_365298 /* 0464e07c8f6e3310c28eb210a4513bc2243c2a7e */
7775 #      endif
7776 #    endif
7777 #    if defined(SIMDE_ARCH_ARM)
7778 #      if !SIMDE_DETECT_CLANG_VERSION_CHECK(11,0,0)
7779 #        define SIMDE_BUG_CLANG_BAD_VGET_SET_LANE_TYPES
7780 #      endif
7781 #    endif
7782 #    if defined(SIMDE_ARCH_POWER)
7783 #      define SIMDE_BUG_CLANG_46770
7784 #    endif
7785 #    if defined(_ARCH_PWR9) && !SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0) && !defined(__OPTIMIZE__)
7786 #      define SIMDE_BUG_CLANG_POWER9_16x4_BAD_SHIFT
7787 #    endif
7788 #    if defined(SIMDE_ARCH_X86) || defined(SIMDE_ARCH_AMD64)
7789 #      if SIMDE_DETECT_CLANG_VERSION_NOT(5,0,0)
7790 #        define SIMDE_BUG_CLANG_REV_298042 /* 6afc436a7817a52e78ae7bcdc3faafd460124cac */
7791 #      endif
7792 #      if SIMDE_DETECT_CLANG_VERSION_NOT(3,7,0)
7793 #        define SIMDE_BUG_CLANG_REV_234560 /* b929ad7b1726a32650a8051f69a747fb6836c540 */
7794 #      endif
7795 #      if SIMDE_DETECT_CLANG_VERSION_CHECK(3,8,0) && SIMDE_DETECT_CLANG_VERSION_NOT(5,0,0)
7796 #        define SIMDE_BUG_CLANG_BAD_MADD
7797 #      endif
7798 #      if SIMDE_DETECT_CLANG_VERSION_CHECK(4,0,0) && SIMDE_DETECT_CLANG_VERSION_NOT(5,0,0)
7799 #        define SIMDE_BUG_CLANG_REV_299346 /* ac9959eb533a58482ea4da6c4db1e635a98de384 */
7800 #      endif
7801 #      if SIMDE_DETECT_CLANG_VERSION_NOT(8,0,0)
7802 #        define SIMDE_BUG_CLANG_REV_344862 /* eae26bf73715994c2bd145f9b6dc3836aa4ffd4f */
7803 #      endif
7804 #      if HEDLEY_HAS_WARNING("-Wsign-conversion") && SIMDE_DETECT_CLANG_VERSION_NOT(11,0,0)
7805 #        define SIMDE_BUG_CLANG_45931
7806 #      endif
7807 #      if HEDLEY_HAS_WARNING("-Wvector-conversion") && SIMDE_DETECT_CLANG_VERSION_NOT(11,0,0)
7808 #        define SIMDE_BUG_CLANG_44589
7809 #      endif
7810 #      define SIMDE_BUG_CLANG_48673
7811 #    endif
7812 #    define SIMDE_BUG_CLANG_45959
7813 #  elif defined(HEDLEY_MSVC_VERSION)
7814 #    if defined(SIMDE_ARCH_X86)
7815 #      define SIMDE_BUG_MSVC_ROUND_EXTRACT
7816 #    endif
7817 #  elif defined(HEDLEY_INTEL_VERSION)
7818 #    define SIMDE_BUG_INTEL_857088
7819 #  elif defined(HEDLEY_MCST_LCC_VERSION)
7820 #    define SIMDE_BUG_MCST_LCC_MISSING_AVX_LOAD_STORE_M128_FUNCS
7821 #  endif
7822 #endif
7823 
7824 /* GCC and Clang both have the same issue:
7825  * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95144
7826  * https://bugs.llvm.org/show_bug.cgi?id=45931
7827  * This is just an easy way to work around it.
7828  */
7829 #if \
7830     (HEDLEY_HAS_WARNING("-Wsign-conversion") && SIMDE_DETECT_CLANG_VERSION_NOT(11,0,0)) || \
7831     HEDLEY_GCC_VERSION_CHECK(4,3,0)
7832 #  define SIMDE_BUG_IGNORE_SIGN_CONVERSION(expr) (__extension__ ({ \
7833        HEDLEY_DIAGNOSTIC_PUSH  \
7834        _Pragma("GCC diagnostic ignored \"-Wsign-conversion\"") \
7835        __typeof__(expr) simde_bug_ignore_sign_conversion_v_= (expr); \
7836        HEDLEY_DIAGNOSTIC_POP  \
7837        simde_bug_ignore_sign_conversion_v_; \
7839      }))
7840 #else
7841 #  define SIMDE_BUG_IGNORE_SIGN_CONVERSION(expr) (expr)
7842 #endif
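
/* Illustrative sketch (editor's example; the helper below is a hypothetical
 * name, not part of SIMDe): wrap an expression that affected GCC/clang
 * versions flag with a spurious -Wsign-conversion diagnostic; on other
 * compilers the wrapper evaluates the expression unchanged. */
SIMDE_FUNCTION_ATTRIBUTES
uint32_t
simde_example_add_offset_(uint32_t base, int8_t offset) {
  return SIMDE_BUG_IGNORE_SIGN_CONVERSION(base + offset);
}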
7843 
7844 /* Usually the shift count is signed (for example, NEON or SSE).
7845  * OTOH, unsigned is good for PPC (vec_srl uses unsigned), and the only option for E2K.
7846  * Further info: https://github.com/simd-everywhere/simde/pull/700
7847  */
7848 #if defined(SIMDE_ARCH_E2K) || defined(SIMDE_ARCH_POWER)
7849   #define SIMDE_CAST_VECTOR_SHIFT_COUNT(width, value) HEDLEY_STATIC_CAST(uint##width##_t, (value))
7850 #else
7851   #define SIMDE_CAST_VECTOR_SHIFT_COUNT(width, value) HEDLEY_STATIC_CAST(int##width##_t, (value))
7852 #endif
7853 
7854 #endif /* !defined(SIMDE_COMMON_H) */
7855 /* :: End ../simde/simde/simde-common.h :: */
7856 
7857 HEDLEY_DIAGNOSTIC_PUSH
7858 SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
7859 
7860 #if defined(SIMDE_X86_MMX_NATIVE)
7861   #define SIMDE_X86_MMX_USE_NATIVE_TYPE
7862 #elif defined(SIMDE_X86_SSE_NATIVE)
7863   #define SIMDE_X86_MMX_USE_NATIVE_TYPE
7864 #endif
7865 
7866 #if defined(SIMDE_X86_MMX_USE_NATIVE_TYPE)
7867   #include <mmintrin.h>
7868 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
7869   #include <arm_neon.h>
7870 #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
7871   #include <loongson-mmiintrin.h>
7872 #endif
7873 
7874 #include <stdint.h>
7875 #include <limits.h>
7876 
7877 SIMDE_BEGIN_DECLS_
7878 
7879 typedef union {
7880   #if defined(SIMDE_VECTOR_SUBSCRIPT)
7881     SIMDE_ALIGN_TO_8 int8_t          i8 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
7882     SIMDE_ALIGN_TO_8 int16_t        i16 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
7883     SIMDE_ALIGN_TO_8 int32_t        i32 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
7884     SIMDE_ALIGN_TO_8 int64_t        i64 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
7885     SIMDE_ALIGN_TO_8 uint8_t         u8 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
7886     SIMDE_ALIGN_TO_8 uint16_t       u16 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
7887     SIMDE_ALIGN_TO_8 uint32_t       u32 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
7888     SIMDE_ALIGN_TO_8 uint64_t       u64 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
7889     SIMDE_ALIGN_TO_8 simde_float32  f32 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
7890     SIMDE_ALIGN_TO_8 int_fast32_t  i32f SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
7891     SIMDE_ALIGN_TO_8 uint_fast32_t u32f SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
7892   #else
7893     SIMDE_ALIGN_TO_8 int8_t          i8[8];
7894     SIMDE_ALIGN_TO_8 int16_t        i16[4];
7895     SIMDE_ALIGN_TO_8 int32_t        i32[2];
7896     SIMDE_ALIGN_TO_8 int64_t        i64[1];
7897     SIMDE_ALIGN_TO_8 uint8_t         u8[8];
7898     SIMDE_ALIGN_TO_8 uint16_t       u16[4];
7899     SIMDE_ALIGN_TO_8 uint32_t       u32[2];
7900     SIMDE_ALIGN_TO_8 uint64_t       u64[1];
7901     SIMDE_ALIGN_TO_8 simde_float32  f32[2];
7902     SIMDE_ALIGN_TO_8 int_fast32_t  i32f[8 / sizeof(int_fast32_t)];
7903     SIMDE_ALIGN_TO_8 uint_fast32_t u32f[8 / sizeof(uint_fast32_t)];
7904   #endif
7905 
7906   #if defined(SIMDE_X86_MMX_USE_NATIVE_TYPE)
7907     __m64          n;
7908   #endif
7909   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
7910     int8x8_t       neon_i8;
7911     int16x4_t      neon_i16;
7912     int32x2_t      neon_i32;
7913     int64x1_t      neon_i64;
7914     uint8x8_t      neon_u8;
7915     uint16x4_t     neon_u16;
7916     uint32x2_t     neon_u32;
7917     uint64x1_t     neon_u64;
7918     float32x2_t    neon_f32;
7919   #endif
7920   #if defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
7921     int8x8_t       mmi_i8;
7922     int16x4_t      mmi_i16;
7923     int32x2_t      mmi_i32;
7924     int64_t        mmi_i64;
7925     uint8x8_t      mmi_u8;
7926     uint16x4_t     mmi_u16;
7927     uint32x2_t     mmi_u32;
7928     uint64_t       mmi_u64;
7929   #endif
7930 } simde__m64_private;
7931 
7932 #if defined(SIMDE_X86_MMX_USE_NATIVE_TYPE)
7933   typedef __m64 simde__m64;
7934 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
7935   typedef int32x2_t simde__m64;
7936 #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
7937   typedef int32x2_t simde__m64;
7938 #elif defined(SIMDE_VECTOR_SUBSCRIPT)
7939   typedef int32_t simde__m64 SIMDE_ALIGN_TO_8 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
7940 #else
7941   typedef simde__m64_private simde__m64;
7942 #endif
7943 
7944 #if !defined(SIMDE_X86_MMX_USE_NATIVE_TYPE) && defined(SIMDE_ENABLE_NATIVE_ALIASES)
7945   #define SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES
7946   typedef simde__m64 __m64;
7947 #endif
7948 
7949 HEDLEY_STATIC_ASSERT(8 == sizeof(simde__m64), "__m64 size incorrect");
7950 HEDLEY_STATIC_ASSERT(8 == sizeof(simde__m64_private), "__m64 size incorrect");
7951 #if defined(SIMDE_CHECK_ALIGNMENT) && defined(SIMDE_ALIGN_OF)
7952 HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m64) == 8, "simde__m64 is not 8-byte aligned");
7953 HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m64_private) == 8, "simde__m64_private is not 8-byte aligned");
7954 #endif
7955 
7956 SIMDE_FUNCTION_ATTRIBUTES
7957 simde__m64
7958 simde__m64_from_private(simde__m64_private v) {
7959   simde__m64 r;
7960   simde_memcpy(&r, &v, sizeof(r));
7961   return r;
7962 }
7963 
7964 SIMDE_FUNCTION_ATTRIBUTES
7965 simde__m64_private
7966 simde__m64_to_private(simde__m64 v) {
7967   simde__m64_private r;
7968   simde_memcpy(&r, &v, sizeof(r));
7969   return r;
7970 }
7971 
7972 #define SIMDE_X86_GENERATE_CONVERSION_FUNCTION(simde_type, source_type, isax, fragment) \
7973   SIMDE_FUNCTION_ATTRIBUTES \
7974   simde__##simde_type \
7975   simde__##simde_type##_from_##isax##_##fragment(source_type value) { \
7976     simde__##simde_type##_private r_; \
7977     r_.isax##_##fragment = value; \
7978     return simde__##simde_type##_from_private(r_); \
7979   } \
7980   \
7981   SIMDE_FUNCTION_ATTRIBUTES \
7982   source_type \
7983   simde__##simde_type##_to_##isax##_##fragment(simde__##simde_type value) { \
7984     simde__##simde_type##_private r_ = simde__##simde_type##_to_private(value); \
7985     return r_.isax##_##fragment; \
7986   }
7987 
7988 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
7989   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, int8x8_t, neon, i8)
7990   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, int16x4_t, neon, i16)
7991   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, int32x2_t, neon, i32)
7992   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, int64x1_t, neon, i64)
7993   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, uint8x8_t, neon, u8)
7994   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, uint16x4_t, neon, u16)
7995   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, uint32x2_t, neon, u32)
7996   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, uint64x1_t, neon, u64)
7997   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, float32x2_t, neon, f32)
7998 #endif /* defined(SIMDE_ARM_NEON_A32V7_NATIVE) */
7999 
8000 #if defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
8001   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, int8x8_t, mmi, i8)
8002   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, int16x4_t, mmi, i16)
8003   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, int32x2_t, mmi, i32)
8004   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, int64_t, mmi, i64)
8005   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, uint8x8_t, mmi, u8)
8006   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, uint16x4_t, mmi, u16)
8007   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, uint32x2_t, mmi, u32)
8008   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, uint64_t, mmi, u64)
8009 #endif /* defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) */
8010 
8011 SIMDE_FUNCTION_ATTRIBUTES
8012 simde__m64
8013 simde_mm_add_pi8 (simde__m64 a, simde__m64 b) {
8014   #if defined(SIMDE_X86_MMX_NATIVE)
8015     return _mm_add_pi8(a, b);
8016   #else
8017     simde__m64_private r_;
8018     simde__m64_private a_ = simde__m64_to_private(a);
8019     simde__m64_private b_ = simde__m64_to_private(b);
8020 
8021     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
8022       r_.neon_i8 = vadd_s8(a_.neon_i8, b_.neon_i8);
8023     #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
8024       r_.mmi_i8 = paddb_s(a_.mmi_i8, b_.mmi_i8);
8025     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
8026       r_.i8 = a_.i8 + b_.i8;
8027     #else
8028       SIMDE_VECTORIZE
8029       for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
8030         r_.i8[i] = a_.i8[i] + b_.i8[i];
8031       }
8032     #endif
8033 
8034     return simde__m64_from_private(r_);
8035   #endif
8036 }
8037 #define simde_m_paddb(a, b) simde_mm_add_pi8(a, b)
8038 #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
8039 #  define _mm_add_pi8(a, b) simde_mm_add_pi8(a, b)
8040 #  define _m_paddb(a, b) simde_m_paddb(a, b)
8041 #endif
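
/* Illustrative caller-side sketch (editor's example; the helper below is a
 * hypothetical name, not part of SIMDe): load two 8-lane int8 vectors with
 * simde_memcpy, add them with the portable _mm_add_pi8 replacement, and
 * store the result, mirroring how the native MMX intrinsic would be used. */
SIMDE_FUNCTION_ATTRIBUTES
void
simde_example_add_pi8_arrays_(const int8_t a[8], const int8_t b[8], int8_t r[8]) {
  simde__m64 va, vb, vr;
  simde_memcpy(&va, a, sizeof(va));
  simde_memcpy(&vb, b, sizeof(vb));
  vr = simde_mm_add_pi8(va, vb);
  simde_memcpy(r, &vr, sizeof(vr));
}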
8042 
8043 SIMDE_FUNCTION_ATTRIBUTES
8044 simde__m64
8045 simde_mm_add_pi16 (simde__m64 a, simde__m64 b) {
8046   #if defined(SIMDE_X86_MMX_NATIVE)
8047     return _mm_add_pi16(a, b);
8048   #else
8049     simde__m64_private r_;
8050     simde__m64_private a_ = simde__m64_to_private(a);
8051     simde__m64_private b_ = simde__m64_to_private(b);
8052 
8053     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
8054       r_.neon_i16 = vadd_s16(a_.neon_i16, b_.neon_i16);
8055     #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
8056       r_.mmi_i16 = paddh_s(a_.mmi_i16, b_.mmi_i16);
8057     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
8058       r_.i16 = a_.i16 + b_.i16;
8059     #else
8060       SIMDE_VECTORIZE
8061       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
8062         r_.i16[i] = a_.i16[i] + b_.i16[i];
8063       }
8064     #endif
8065 
8066     return simde__m64_from_private(r_);
8067   #endif
8068 }
8069 #define simde_m_paddw(a, b) simde_mm_add_pi16(a, b)
8070 #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
8071 #  define _mm_add_pi16(a, b) simde_mm_add_pi16(a, b)
8072 #  define _m_paddw(a, b) simde_mm_add_pi16(a, b)
8073 #endif
8074 
8075 SIMDE_FUNCTION_ATTRIBUTES
8076 simde__m64
8077 simde_mm_add_pi32 (simde__m64 a, simde__m64 b) {
8078   #if defined(SIMDE_X86_MMX_NATIVE)
8079     return _mm_add_pi32(a, b);
8080   #else
8081     simde__m64_private r_;
8082     simde__m64_private a_ = simde__m64_to_private(a);
8083     simde__m64_private b_ = simde__m64_to_private(b);
8084 
8085     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
8086       r_.neon_i32 = vadd_s32(a_.neon_i32, b_.neon_i32);
8087     #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
8088       r_.mmi_i32 = paddw_s(a_.mmi_i32, b_.mmi_i32);
8089     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
8090       r_.i32 = a_.i32 + b_.i32;
8091     #else
8092       SIMDE_VECTORIZE
8093       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
8094         r_.i32[i] = a_.i32[i] + b_.i32[i];
8095       }
8096     #endif
8097 
8098     return simde__m64_from_private(r_);
8099   #endif
8100 }
8101 #define simde_m_paddd(a, b) simde_mm_add_pi32(a, b)
8102 #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
8103 #  define _mm_add_pi32(a, b) simde_mm_add_pi32(a, b)
8104 #  define _m_paddd(a, b) simde_mm_add_pi32(a, b)
8105 #endif
8106 
8107 SIMDE_FUNCTION_ATTRIBUTES
8108 simde__m64
8109 simde_mm_adds_pi8 (simde__m64 a, simde__m64 b) {
8110   #if defined(SIMDE_X86_MMX_NATIVE)
8111     return _mm_adds_pi8(a, b);
8112   #else
8113     simde__m64_private
8114       r_,
8115       a_ = simde__m64_to_private(a),
8116       b_ = simde__m64_to_private(b);
8117 
8118     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
8119       r_.neon_i8 = vqadd_s8(a_.neon_i8, b_.neon_i8);
8120     #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
8121       r_.mmi_i8 = paddsb(a_.mmi_i8, b_.mmi_i8);
8122     #else
8123       SIMDE_VECTORIZE
8124       for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
8125         if ((((b_.i8[i]) > 0) && ((a_.i8[i]) > (INT8_MAX - (b_.i8[i]))))) {
8126           r_.i8[i] = INT8_MAX;
8127         } else if ((((b_.i8[i]) < 0) && ((a_.i8[i]) < (INT8_MIN - (b_.i8[i]))))) {
8128           r_.i8[i] = INT8_MIN;
8129         } else {
8130           r_.i8[i] = (a_.i8[i]) + (b_.i8[i]);
8131         }
8132       }
8133     #endif
8134 
8135     return simde__m64_from_private(r_);
8136   #endif
8137 }
8138 #define simde_m_paddsb(a, b) simde_mm_adds_pi8(a, b)
8139 #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
8140 #  define _mm_adds_pi8(a, b) simde_mm_adds_pi8(a, b)
8141 #  define _m_paddsb(a, b) simde_mm_adds_pi8(a, b)
8142 #endif
8143 
8144 SIMDE_FUNCTION_ATTRIBUTES
8145 simde__m64
8146 simde_mm_adds_pu8 (simde__m64 a, simde__m64 b) {
8147   #if defined(SIMDE_X86_MMX_NATIVE)
8148     return _mm_adds_pu8(a, b);
8149   #else
8150     simde__m64_private r_;
8151     simde__m64_private a_ = simde__m64_to_private(a);
8152     simde__m64_private b_ = simde__m64_to_private(b);
8153 
8154     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
8155       r_.neon_u8 = vqadd_u8(a_.neon_u8, b_.neon_u8);
8156     #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
8157       r_.mmi_u8 = paddusb(a_.mmi_u8, b_.mmi_u8);
8158     #else
8159       SIMDE_VECTORIZE
8160       for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
8161         const uint_fast16_t x = HEDLEY_STATIC_CAST(uint_fast16_t, a_.u8[i]) + HEDLEY_STATIC_CAST(uint_fast16_t, b_.u8[i]);
8162         if (x > UINT8_MAX)
8163           r_.u8[i] = UINT8_MAX;
8164         else
8165           r_.u8[i] = HEDLEY_STATIC_CAST(uint8_t, x);
8166       }
8167     #endif
8168 
8169     return simde__m64_from_private(r_);
8170   #endif
8171 }
8172 #define simde_m_paddusb(a, b) simde_mm_adds_pu8(a, b)
8173 #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
8174 #  define _mm_adds_pu8(a, b) simde_mm_adds_pu8(a, b)
8175 #  define _m_paddusb(a, b) simde_mm_adds_pu8(a, b)
8176 #endif
8177 
8178 SIMDE_FUNCTION_ATTRIBUTES
8179 simde__m64
8180 simde_mm_adds_pi16 (simde__m64 a, simde__m64 b) {
8181   #if defined(SIMDE_X86_MMX_NATIVE)
8182     return _mm_adds_pi16(a, b);
8183   #else
8184     simde__m64_private r_;
8185     simde__m64_private a_ = simde__m64_to_private(a);
8186     simde__m64_private b_ = simde__m64_to_private(b);
8187 
8188     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
8189       r_.neon_i16 = vqadd_s16(a_.neon_i16, b_.neon_i16);
8190     #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
8191       r_.mmi_i16 = paddsh(a_.mmi_i16, b_.mmi_i16);
8192     #else
8193       SIMDE_VECTORIZE
8194       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
8195         if ((((b_.i16[i]) > 0) && ((a_.i16[i]) > (INT16_MAX - (b_.i16[i]))))) {
8196           r_.i16[i] = INT16_MAX;
8197         } else if ((((b_.i16[i]) < 0) && ((a_.i16[i]) < (INT16_MIN - (b_.i16[i]))))) {
8198           r_.i16[i] = INT16_MIN;
8199         } else {
8200           r_.i16[i] = (a_.i16[i]) + (b_.i16[i]);
8201         }
8202       }
8203     #endif
8204 
8205     return simde__m64_from_private(r_);
8206   #endif
8207 }
8208 #define simde_m_paddsw(a, b) simde_mm_adds_pi16(a, b)
8209 #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
8210 #  define _mm_adds_pi16(a, b) simde_mm_adds_pi16(a, b)
8211 #  define _m_paddsw(a, b) simde_mm_adds_pi16(a, b)
8212 #endif
8213 
8214 SIMDE_FUNCTION_ATTRIBUTES
8215 simde__m64
8216 simde_mm_adds_pu16 (simde__m64 a, simde__m64 b) {
8217   #if defined(SIMDE_X86_MMX_NATIVE)
8218     return _mm_adds_pu16(a, b);
8219   #else
8220     simde__m64_private r_;
8221     simde__m64_private a_ = simde__m64_to_private(a);
8222     simde__m64_private b_ = simde__m64_to_private(b);
8223 
8224     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
8225       r_.neon_u16 = vqadd_u16(a_.neon_u16, b_.neon_u16);
8226     #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
8227       r_.mmi_u16 = paddush(a_.mmi_u16, b_.mmi_u16);
8228     #else
8229       SIMDE_VECTORIZE
8230       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
8231         const uint32_t x = a_.u16[i] + b_.u16[i];
8232         if (x > UINT16_MAX)
8233           r_.u16[i] = UINT16_MAX;
8234         else
8235           r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, x);
8236       }
8237     #endif
8238 
8239     return simde__m64_from_private(r_);
8240   #endif
8241 }
8242 #define simde_m_paddusw(a, b) simde_mm_adds_pu16(a, b)
8243 #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
8244 #  define _mm_adds_pu16(a, b) simde_mm_adds_pu16(a, b)
8245 #  define _m_paddusw(a, b) simde_mm_adds_pu16(a, b)
8246 #endif
8247 
8248 SIMDE_FUNCTION_ATTRIBUTES
8249 simde__m64
8250 simde_mm_and_si64 (simde__m64 a, simde__m64 b) {
8251   #if defined(SIMDE_X86_MMX_NATIVE)
8252     return _mm_and_si64(a, b);
8253   #else
8254     simde__m64_private r_;
8255     simde__m64_private a_ = simde__m64_to_private(a);
8256     simde__m64_private b_ = simde__m64_to_private(b);
8257 
8258     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
8259       r_.neon_i32 = vand_s32(a_.neon_i32, b_.neon_i32);
8260     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
8261       r_.i64 = a_.i64 & b_.i64;
8262     #else
8263       r_.i64[0] = a_.i64[0] & b_.i64[0];
8264     #endif
8265 
8266     return simde__m64_from_private(r_);
8267   #endif
8268 }
8269 #define simde_m_pand(a, b) simde_mm_and_si64(a, b)
8270 #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
8271 #  define _mm_and_si64(a, b) simde_mm_and_si64(a, b)
8272 #  define _m_pand(a, b) simde_mm_and_si64(a, b)
8273 #endif
8274 
8275 SIMDE_FUNCTION_ATTRIBUTES
8276 simde__m64
8277 simde_mm_andnot_si64 (simde__m64 a, simde__m64 b) {
8278   #if defined(SIMDE_X86_MMX_NATIVE)
8279     return _mm_andnot_si64(a, b);
8280   #else
8281     simde__m64_private r_;
8282     simde__m64_private a_ = simde__m64_to_private(a);
8283     simde__m64_private b_ = simde__m64_to_private(b);
8284 
8285     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
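      /* vbic computes (first operand) & ~(second operand), so the arguments are swapped here. */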
8286       r_.neon_i32 = vbic_s32(b_.neon_i32, a_.neon_i32);
8287     #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
8288       r_.mmi_i32 = pandn_sw(a_.mmi_i32, b_.mmi_i32);
8289     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
8290       r_.i32f = ~a_.i32f & b_.i32f;
8291     #else
8292       r_.u64[0] = (~(a_.u64[0])) & (b_.u64[0]);
8293     #endif
8294 
8295     return simde__m64_from_private(r_);
8296   #endif
8297 }
8298 #define simde_m_pandn(a, b) simde_mm_andnot_si64(a, b)
8299 #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
8300 #  define _mm_andnot_si64(a, b) simde_mm_andnot_si64(a, b)
8301 #  define _m_pandn(a, b) simde_mm_andnot_si64(a, b)
8302 #endif
8303 
8304 SIMDE_FUNCTION_ATTRIBUTES
8305 simde__m64
8306 simde_mm_cmpeq_pi8 (simde__m64 a, simde__m64 b) {
8307   #if defined(SIMDE_X86_MMX_NATIVE)
8308     return _mm_cmpeq_pi8(a, b);
8309   #else
8310     simde__m64_private r_;
8311     simde__m64_private a_ = simde__m64_to_private(a);
8312     simde__m64_private b_ = simde__m64_to_private(b);
8313 
8314     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
8315       r_.neon_u8 = vceq_s8(a_.neon_i8, b_.neon_i8);
8316     #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
8317       r_.mmi_i8 = pcmpeqb_s(a_.mmi_i8, b_.mmi_i8);
8318     #else
8319       SIMDE_VECTORIZE
8320       for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
8321         r_.i8[i] = (a_.i8[i] == b_.i8[i]) ? ~INT8_C(0) : INT8_C(0);
8322       }
8323     #endif
8324 
8325     return simde__m64_from_private(r_);
8326   #endif
8327 }
8328 #define simde_m_pcmpeqb(a, b) simde_mm_cmpeq_pi8(a, b)
8329 #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
8330 #  define _mm_cmpeq_pi8(a, b) simde_mm_cmpeq_pi8(a, b)
8331 #  define _m_pcmpeqb(a, b) simde_mm_cmpeq_pi8(a, b)
8332 #endif
8333 
8334 SIMDE_FUNCTION_ATTRIBUTES
8335 simde__m64
8336 simde_mm_cmpeq_pi16 (simde__m64 a, simde__m64 b) {
8337   #if defined(SIMDE_X86_MMX_NATIVE)
8338     return _mm_cmpeq_pi16(a, b);
8339   #else
8340     simde__m64_private r_;
8341     simde__m64_private a_ = simde__m64_to_private(a);
8342     simde__m64_private b_ = simde__m64_to_private(b);
8343 
8344     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
8345       r_.neon_u16 = vceq_s16(a_.neon_i16, b_.neon_i16);
8346     #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
8347       r_.mmi_i16 = pcmpeqh_s(a_.mmi_i16, b_.mmi_i16);
8348     #else
8349       SIMDE_VECTORIZE
8350       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
8351         r_.i16[i] = (a_.i16[i] == b_.i16[i]) ? ~INT16_C(0) : INT16_C(0);
8352       }
8353     #endif
8354 
8355     return simde__m64_from_private(r_);
8356   #endif
8357 }
8358 #define simde_m_pcmpeqw(a, b) simde_mm_cmpeq_pi16(a, b)
8359 #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
8360 #  define _mm_cmpeq_pi16(a, b) simde_mm_cmpeq_pi16(a, b)
8361 #  define _m_pcmpeqw(a, b) simde_mm_cmpeq_pi16(a, b)
8362 #endif
8363 
8364 SIMDE_FUNCTION_ATTRIBUTES
8365 simde__m64
8366 simde_mm_cmpeq_pi32 (simde__m64 a, simde__m64 b) {
8367   #if defined(SIMDE_X86_MMX_NATIVE)
8368     return _mm_cmpeq_pi32(a, b);
8369   #else
8370     simde__m64_private r_;
8371     simde__m64_private a_ = simde__m64_to_private(a);
8372     simde__m64_private b_ = simde__m64_to_private(b);
8373 
8374     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
8375       r_.neon_u32 = vceq_s32(a_.neon_i32, b_.neon_i32);
8376     #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
8377       r_.mmi_i32 = pcmpeqw_s(a_.mmi_i32, b_.mmi_i32);
8378     #else
8379       SIMDE_VECTORIZE
8380       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
8381         r_.i32[i] = (a_.i32[i] == b_.i32[i]) ? ~INT32_C(0) : INT32_C(0);
8382       }
8383     #endif
8384 
8385     return simde__m64_from_private(r_);
8386   #endif
8387 }
8388 #define simde_m_pcmpeqd(a, b) simde_mm_cmpeq_pi32(a, b)
8389 #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
8390 #  define _mm_cmpeq_pi32(a, b) simde_mm_cmpeq_pi32(a, b)
8391 #  define _m_pcmpeqd(a, b) simde_mm_cmpeq_pi32(a, b)
8392 #endif
8393 
8394 SIMDE_FUNCTION_ATTRIBUTES
8395 simde__m64
8396 simde_mm_cmpgt_pi8 (simde__m64 a, simde__m64 b) {
8397   #if defined(SIMDE_X86_MMX_NATIVE)
8398     return _mm_cmpgt_pi8(a, b);
8399   #else
8400     simde__m64_private r_;
8401     simde__m64_private a_ = simde__m64_to_private(a);
8402     simde__m64_private b_ = simde__m64_to_private(b);
8403 
8404     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
8405       r_.neon_u8 = vcgt_s8(a_.neon_i8, b_.neon_i8);
8406     #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
8407       r_.mmi_i8 = pcmpgtb_s(a_.mmi_i8, b_.mmi_i8);
8408     #else
8409       SIMDE_VECTORIZE
8410       for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
8411         r_.i8[i] = (a_.i8[i] > b_.i8[i]) ? ~INT8_C(0) : INT8_C(0);
8412       }
8413     #endif
8414 
8415     return simde__m64_from_private(r_);
8416   #endif
8417 }
8418 #define simde_m_pcmpgtb(a, b) simde_mm_cmpgt_pi8(a, b)
8419 #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
8420 #  define _mm_cmpgt_pi8(a, b) simde_mm_cmpgt_pi8(a, b)
8421 #  define _m_pcmpgtb(a, b) simde_mm_cmpgt_pi8(a, b)
8422 #endif
8423 
8424 SIMDE_FUNCTION_ATTRIBUTES
8425 simde__m64
8426 simde_mm_cmpgt_pi16 (simde__m64 a, simde__m64 b) {
8427   #if defined(SIMDE_X86_MMX_NATIVE)
8428     return _mm_cmpgt_pi16(a, b);
8429   #else
8430     simde__m64_private r_;
8431     simde__m64_private a_ = simde__m64_to_private(a);
8432     simde__m64_private b_ = simde__m64_to_private(b);
8433 
8434     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
8435       r_.neon_u16 = vcgt_s16(a_.neon_i16, b_.neon_i16);
8436     #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
8437       r_.mmi_i16 = pcmpgth_s(a_.mmi_i16, b_.mmi_i16);
8438     #else
8439       SIMDE_VECTORIZE
8440       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
8441         r_.i16[i] = (a_.i16[i] > b_.i16[i]) ? ~INT16_C(0) : INT16_C(0);
8442       }
8443     #endif
8444 
8445     return simde__m64_from_private(r_);
8446   #endif
8447 }
8448 #define simde_m_pcmpgtw(a, b) simde_mm_cmpgt_pi16(a, b)
8449 #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
8450 #  define _mm_cmpgt_pi16(a, b) simde_mm_cmpgt_pi16(a, b)
8451 #  define _m_pcmpgtw(a, b) simde_mm_cmpgt_pi16(a, b)
8452 #endif
8453 
8454 SIMDE_FUNCTION_ATTRIBUTES
8455 simde__m64
8456 simde_mm_cmpgt_pi32 (simde__m64 a, simde__m64 b) {
8457   #if defined(SIMDE_X86_MMX_NATIVE)
8458     return _mm_cmpgt_pi32(a, b);
8459   #else
8460     simde__m64_private r_;
8461     simde__m64_private a_ = simde__m64_to_private(a);
8462     simde__m64_private b_ = simde__m64_to_private(b);
8463 
8464     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
8465       r_.neon_u32 = vcgt_s32(a_.neon_i32, b_.neon_i32);
8466     #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
8467       r_.mmi_i32 = pcmpgtw_s(a_.mmi_i32, b_.mmi_i32);
8468     #else
8469       SIMDE_VECTORIZE
8470       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
8471         r_.i32[i] = (a_.i32[i] > b_.i32[i]) ? ~INT32_C(0) : INT32_C(0);
8472       }
8473     #endif
8474 
8475     return simde__m64_from_private(r_);
8476   #endif
8477 }
8478 #define simde_m_pcmpgtd(a, b) simde_mm_cmpgt_pi32(a, b)
8479 #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
8480 #  define _mm_cmpgt_pi32(a, b) simde_mm_cmpgt_pi32(a, b)
8481 #  define _m_pcmpgtd(a, b) simde_mm_cmpgt_pi32(a, b)
8482 #endif
8483 
8484 SIMDE_FUNCTION_ATTRIBUTES
8485 int64_t
8486 simde_mm_cvtm64_si64 (simde__m64 a) {
8487   #if defined(SIMDE_X86_MMX_NATIVE) && defined(SIMDE_ARCH_AMD64) && !defined(__PGI)
8488     return _mm_cvtm64_si64(a);
8489   #else
8490     simde__m64_private a_ = simde__m64_to_private(a);
8491 
8492     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
8493       HEDLEY_DIAGNOSTIC_PUSH
8494       #if HEDLEY_HAS_WARNING("-Wvector-conversion") && SIMDE_DETECT_CLANG_VERSION_NOT(10,0,0)
8495         #pragma clang diagnostic ignored "-Wvector-conversion"
8496       #endif
8497       return vget_lane_s64(a_.neon_i64, 0);
8498       HEDLEY_DIAGNOSTIC_POP
8499     #else
8500       return a_.i64[0];
8501     #endif
8502   #endif
8503 }
8504 #define simde_m_to_int64(a) simde_mm_cvtm64_si64(a)
8505 #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_AMD64))
8506 #  define _mm_cvtm64_si64(a) simde_mm_cvtm64_si64(a)
8507 #  define _m_to_int64(a) simde_mm_cvtm64_si64(a)
8508 #endif
8509 
8510 SIMDE_FUNCTION_ATTRIBUTES
8511 simde__m64
8512 simde_mm_cvtsi32_si64 (int32_t a) {
8513   #if defined(SIMDE_X86_MMX_NATIVE)
8514     return _mm_cvtsi32_si64(a);
8515   #else
8516     simde__m64_private r_;
8517 
8518     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
8519       const int32_t av[sizeof(r_.neon_i32) / sizeof(r_.neon_i32[0])] = { a, 0 };
8520       r_.neon_i32 = vld1_s32(av);
8521     #else
8522       r_.i32[0] = a;
8523       r_.i32[1] = 0;
8524     #endif
8525 
8526     return simde__m64_from_private(r_);
8527   #endif
8528 }
8529 #define simde_m_from_int(a) simde_mm_cvtsi32_si64(a)
8530 #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
8531 #  define _mm_cvtsi32_si64(a) simde_mm_cvtsi32_si64(a)
8532 #  define _m_from_int(a) simde_mm_cvtsi32_si64(a)
8533 #endif
8534 
8535 SIMDE_FUNCTION_ATTRIBUTES
8536 simde__m64
8537 simde_mm_cvtsi64_m64 (int64_t a) {
8538   #if defined(SIMDE_X86_MMX_NATIVE) && defined(SIMDE_ARCH_AMD64) && !defined(__PGI)
8539     return _mm_cvtsi64_m64(a);
8540   #else
8541     simde__m64_private r_;
8542 
8543     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
8544       r_.neon_i64 = vld1_s64(&a);
8545     #else
8546       r_.i64[0] = a;
8547     #endif
8548 
8549     return simde__m64_from_private(r_);
8550   #endif
8551 }
8552 #define simde_m_from_int64(a) simde_mm_cvtsi64_m64(a)
8553 #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_AMD64))
8554 #  define _mm_cvtsi64_m64(a) simde_mm_cvtsi64_m64(a)
8555 #  define _m_from_int64(a) simde_mm_cvtsi64_m64(a)
8556 #endif
8557 
8558 SIMDE_FUNCTION_ATTRIBUTES
8559 int32_t
8560 simde_mm_cvtsi64_si32 (simde__m64 a) {
8561   #if defined(SIMDE_X86_MMX_NATIVE)
8562     return _mm_cvtsi64_si32(a);
8563   #else
8564     simde__m64_private a_ = simde__m64_to_private(a);
8565 
8566     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
8567       HEDLEY_DIAGNOSTIC_PUSH
8568       #if HEDLEY_HAS_WARNING("-Wvector-conversion") && SIMDE_DETECT_CLANG_VERSION_NOT(10,0,0)
8569         #pragma clang diagnostic ignored "-Wvector-conversion"
8570       #endif
8571       return vget_lane_s32(a_.neon_i32, 0);
8572       HEDLEY_DIAGNOSTIC_POP
8573     #else
8574       return a_.i32[0];
8575     #endif
8576   #endif
8577 }
8578 #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
8579 #  define _mm_cvtsi64_si32(a) simde_mm_cvtsi64_si32(a)
8580 #endif
8581 
8582 SIMDE_FUNCTION_ATTRIBUTES
8583 void
8584 simde_mm_empty (void) {
8585   #if defined(SIMDE_X86_MMX_NATIVE)
8586     _mm_empty();
8587   #else
8588     /* noop */
8589   #endif
8590 }
8591 #define simde_m_empty() simde_mm_empty()
8592 #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
8593 #  define _mm_empty() simde_mm_empty()
8594 #  define _m_empty() simde_mm_empty()
8595 #endif
8596 
8597 SIMDE_FUNCTION_ATTRIBUTES
8598 simde__m64
8599 simde_mm_madd_pi16 (simde__m64 a, simde__m64 b) {
8600   #if defined(SIMDE_X86_MMX_NATIVE)
8601     return _mm_madd_pi16(a, b);
8602   #else
8603     simde__m64_private r_;
8604     simde__m64_private a_ = simde__m64_to_private(a);
8605     simde__m64_private b_ = simde__m64_to_private(b);
8606 
8607     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
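      /* Widen to 32-bit products, then pairwise-add adjacent products. */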
8608       int32x4_t i1 = vmull_s16(a_.neon_i16, b_.neon_i16);
8609       r_.neon_i32 = vpadd_s32(vget_low_s32(i1), vget_high_s32(i1));
8610     #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
8611       r_.mmi_i32 = pmaddhw(a_.mmi_i16, b_.mmi_i16);
8612     #else
8613       SIMDE_VECTORIZE
8614       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i += 2) {
8615         r_.i32[i / 2] = (a_.i16[i] * b_.i16[i]) + (a_.i16[i + 1] * b_.i16[i + 1]);
8616       }
8617     #endif
8618 
8619     return simde__m64_from_private(r_);
8620   #endif
8621 }
8622 #define simde_m_pmaddwd(a, b) simde_mm_madd_pi16(a, b)
8623 #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
8624 #  define _mm_madd_pi16(a, b) simde_mm_madd_pi16(a, b)
8625 #  define _m_pmaddwd(a, b) simde_mm_madd_pi16(a, b)
8626 #endif
8627 
8628 SIMDE_FUNCTION_ATTRIBUTES
8629 simde__m64
8630 simde_mm_mulhi_pi16 (simde__m64 a, simde__m64 b) {
8631   #if defined(SIMDE_X86_MMX_NATIVE)
8632     return _mm_mulhi_pi16(a, b);
8633   #else
8634     simde__m64_private r_;
8635     simde__m64_private a_ = simde__m64_to_private(a);
8636     simde__m64_private b_ = simde__m64_to_private(b);
8637 
8638     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
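      /* Widen to 32-bit products, shift the high half down, then narrow back to 16 bits. */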
8639       const int32x4_t t1 = vmull_s16(a_.neon_i16, b_.neon_i16);
8640       const uint32x4_t t2 = vshrq_n_u32(vreinterpretq_u32_s32(t1), 16);
8641       const uint16x4_t t3 = vmovn_u32(t2);
8642       r_.neon_u16 = t3;
8643     #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
8644       r_.mmi_i16 = pmulhh(a_.mmi_i16, b_.mmi_i16);
8645     #else
8646       SIMDE_VECTORIZE
8647       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
8648         r_.i16[i] = HEDLEY_STATIC_CAST(int16_t, ((a_.i16[i] * b_.i16[i]) >> 16));
8649       }
8650     #endif
8651 
8652     return simde__m64_from_private(r_);
8653   #endif
8654 }
8655 #define simde_m_pmulhw(a, b) simde_mm_mulhi_pi16(a, b)
8656 #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
8657 #  define _mm_mulhi_pi16(a, b) simde_mm_mulhi_pi16(a, b)
8658 #  define _m_pmulhw(a, b) simde_mm_mulhi_pi16(a, b)
8659 #endif
8660 
8661 SIMDE_FUNCTION_ATTRIBUTES
8662 simde__m64
8663 simde_mm_mullo_pi16 (simde__m64 a, simde__m64 b) {
8664   #if defined(SIMDE_X86_MMX_NATIVE)
8665     return _mm_mullo_pi16(a, b);
8666   #else
8667     simde__m64_private r_;
8668     simde__m64_private a_ = simde__m64_to_private(a);
8669     simde__m64_private b_ = simde__m64_to_private(b);
8670 
8671     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
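      /* Widen to 32-bit products, then narrow again to keep only the low 16 bits. */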
8672       const int32x4_t t1 = vmull_s16(a_.neon_i16, b_.neon_i16);
8673       const uint16x4_t t2 = vmovn_u32(vreinterpretq_u32_s32(t1));
8674       r_.neon_u16 = t2;
8675     #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
8676       r_.mmi_i16 = pmullh(a_.mmi_i16, b_.mmi_i16);
8677     #else
8678       SIMDE_VECTORIZE
8679       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
8680         r_.i16[i] = HEDLEY_STATIC_CAST(int16_t, ((a_.i16[i] * b_.i16[i]) & 0xffff));
8681       }
8682     #endif
8683 
8684     return simde__m64_from_private(r_);
8685   #endif
8686 }
8687 #define simde_m_pmullw(a, b) simde_mm_mullo_pi16(a, b)
8688 #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
8689 #  define _mm_mullo_pi16(a, b) simde_mm_mullo_pi16(a, b)
8690 #  define _m_pmullw(a, b) simde_mm_mullo_pi16(a, b)
8691 #endif
8692 
8693 SIMDE_FUNCTION_ATTRIBUTES
8694 simde__m64
8695 simde_mm_or_si64 (simde__m64 a, simde__m64 b) {
8696   #if defined(SIMDE_X86_MMX_NATIVE)
8697     return _mm_or_si64(a, b);
8698   #else
8699     simde__m64_private r_;
8700     simde__m64_private a_ = simde__m64_to_private(a);
8701     simde__m64_private b_ = simde__m64_to_private(b);
8702 
8703     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
8704       r_.neon_i32 = vorr_s32(a_.neon_i32, b_.neon_i32);
8705     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
8706       r_.i64 = a_.i64 | b_.i64;
8707     #else
8708       r_.i64[0] = a_.i64[0] | b_.i64[0];
8709     #endif
8710 
8711     return simde__m64_from_private(r_);
8712   #endif
8713 }
8714 #define simde_m_por(a, b) simde_mm_or_si64(a, b)
8715 #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
8716 #  define _mm_or_si64(a, b) simde_mm_or_si64(a, b)
8717 #  define _m_por(a, b) simde_mm_or_si64(a, b)
8718 #endif
8719 
8720 SIMDE_FUNCTION_ATTRIBUTES
8721 simde__m64
8722 simde_mm_packs_pi16 (simde__m64 a, simde__m64 b) {
8723   #if defined(SIMDE_X86_MMX_NATIVE)
8724     return _mm_packs_pi16(a, b);
8725   #else
8726     simde__m64_private r_;
8727     simde__m64_private a_ = simde__m64_to_private(a);
8728     simde__m64_private b_ = simde__m64_to_private(b);
8729 
8730     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
8731       r_.neon_i8 = vqmovn_s16(vcombine_s16(a_.neon_i16, b_.neon_i16));
8732     #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
8733       r_.mmi_i8 = packsshb(a_.mmi_i16, b_.mmi_i16);
8734     #else
8735       SIMDE_VECTORIZE
8736       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
8737         if (a_.i16[i] < INT8_MIN) {
8738           r_.i8[i] = INT8_MIN;
8739         } else if (a_.i16[i] > INT8_MAX) {
8740           r_.i8[i] = INT8_MAX;
8741         } else {
8742           r_.i8[i] = HEDLEY_STATIC_CAST(int8_t, a_.i16[i]);
8743         }
8744       }
8745 
8746       SIMDE_VECTORIZE
8747       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
8748         if (b_.i16[i] < INT8_MIN) {
8749           r_.i8[i + 4] = INT8_MIN;
8750         } else if (b_.i16[i] > INT8_MAX) {
8751           r_.i8[i + 4] = INT8_MAX;
8752         } else {
8753           r_.i8[i + 4] = HEDLEY_STATIC_CAST(int8_t, b_.i16[i]);
8754         }
8755       }
8756     #endif
8757 
8758     return simde__m64_from_private(r_);
8759   #endif
8760 }
8761 #define simde_m_packsswb(a, b) simde_mm_packs_pi16(a, b)
8762 #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
8763 #  define _mm_packs_pi16(a, b) simde_mm_packs_pi16(a, b)
8764 #  define _m_packsswb(a, b) simde_mm_packs_pi16(a, b)
8765 #endif
8766 
8767 SIMDE_FUNCTION_ATTRIBUTES
8768 simde__m64
8769 simde_mm_packs_pi32 (simde__m64 a, simde__m64 b) {
8770   #if defined(SIMDE_X86_MMX_NATIVE)
8771     return _mm_packs_pi32(a, b);
8772   #else
8773     simde__m64_private r_;
8774     simde__m64_private a_ = simde__m64_to_private(a);
8775     simde__m64_private b_ = simde__m64_to_private(b);
8776 
8777     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
8778       r_.neon_i16 = vqmovn_s32(vcombine_s32(a_.neon_i32, b_.neon_i32));
8779     #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
8780       r_.mmi_i16 = packsswh(a_.mmi_i32, b_.mmi_i32);
8781     #else
8782       SIMDE_VECTORIZE
8783       for (size_t i = 0 ; i < (8 / sizeof(a_.i32[0])) ; i++) {
8784         if (a_.i32[i] < INT16_MIN) {
8785           r_.i16[i] = INT16_MIN;
8786         } else if (a_.i32[i] > INT16_MAX) {
8787           r_.i16[i] = INT16_MAX;
8788         } else {
8789           r_.i16[i] = HEDLEY_STATIC_CAST(int16_t, a_.i32[i]);
8790         }
8791       }
8792 
8793       SIMDE_VECTORIZE
8794       for (size_t i = 0 ; i < (8 / sizeof(b_.i32[0])) ; i++) {
8795         if (b_.i32[i] < INT16_MIN) {
8796           r_.i16[i + 2] = INT16_MIN;
8797         } else if (b_.i32[i] > INT16_MAX) {
8798           r_.i16[i + 2] = INT16_MAX;
8799         } else {
8800           r_.i16[i + 2] = HEDLEY_STATIC_CAST(int16_t, b_.i32[i]);
8801         }
8802       }
8803     #endif
8804 
8805     return simde__m64_from_private(r_);
8806   #endif
8807 }
8808 #define simde_m_packssdw(a, b) simde_mm_packs_pi32(a, b)
8809 #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
8810 #  define _mm_packs_pi32(a, b) simde_mm_packs_pi32(a, b)
8811 #  define _m_packssdw(a, b) simde_mm_packs_pi32(a, b)
8812 #endif
8813 
8814 SIMDE_FUNCTION_ATTRIBUTES
8815 simde__m64
8816 simde_mm_packs_pu16 (simde__m64 a, simde__m64 b) {
8817   #if defined(SIMDE_X86_MMX_NATIVE)
8818     return _mm_packs_pu16(a, b);
8819   #else
8820     simde__m64_private r_;
8821     simde__m64_private a_ = simde__m64_to_private(a);
8822     simde__m64_private b_ = simde__m64_to_private(b);
8823 
8824     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
8825       const int16x8_t t1 = vcombine_s16(a_.neon_i16, b_.neon_i16);
8826 
8827       /* Set elements which are < 0 to 0 */
8828       const int16x8_t t2 = vandq_s16(t1, vreinterpretq_s16_u16(vcgezq_s16(t1)));
8829 
8830       /* Vector with all s16 elements set to UINT8_MAX */
8831       const int16x8_t vmax = vmovq_n_s16(HEDLEY_STATIC_CAST(int16_t, UINT8_MAX));
8832 
8833       /* Elements which are within the acceptable range */
8834       const int16x8_t le_max = vandq_s16(t2, vreinterpretq_s16_u16(vcleq_s16(t2, vmax)));
8835       const int16x8_t gt_max = vandq_s16(vmax, vreinterpretq_s16_u16(vcgtq_s16(t2, vmax)));
8836 
8837       /* Final values as 16-bit integers */
8838       const int16x8_t values = vorrq_s16(le_max, gt_max);
8839 
8840       r_.neon_u8 = vmovn_u16(vreinterpretq_u16_s16(values));
8841     #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
8842       r_.mmi_u8 = packushb(a_.mmi_u16, b_.mmi_u16);
8843     #else
8844       SIMDE_VECTORIZE
8845       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
8846         if (a_.i16[i] > UINT8_MAX) {
8847           r_.u8[i] = UINT8_MAX;
8848         } else if (a_.i16[i] < 0) {
8849           r_.u8[i] = 0;
8850         } else {
8851           r_.u8[i] = HEDLEY_STATIC_CAST(uint8_t, a_.i16[i]);
8852         }
8853       }
8854 
8855       SIMDE_VECTORIZE
8856       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
8857         if (b_.i16[i] > UINT8_MAX) {
8858           r_.u8[i + 4] = UINT8_MAX;
8859         } else if (b_.i16[i] < 0) {
8860           r_.u8[i + 4] = 0;
8861         } else {
8862           r_.u8[i + 4] = HEDLEY_STATIC_CAST(uint8_t, b_.i16[i]);
8863         }
8864       }
8865     #endif
8866 
8867     return simde__m64_from_private(r_);
8868   #endif
8869 }
8870 #define simde_m_packuswb(a, b) simde_mm_packs_pu16(a, b)
8871 #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
8872 #  define _mm_packs_pu16(a, b) simde_mm_packs_pu16(a, b)
8873 #  define _m_packuswb(a, b) simde_mm_packs_pu16(a, b)
8874 #endif
8875 
8876 SIMDE_FUNCTION_ATTRIBUTES
8877 simde__m64
8878 simde_mm_set_pi8 (int8_t e7, int8_t e6, int8_t e5, int8_t e4, int8_t e3, int8_t e2, int8_t e1, int8_t e0) {
8879   #if defined(SIMDE_X86_MMX_NATIVE)
8880     return _mm_set_pi8(e7, e6, e5, e4, e3, e2, e1, e0);
8881   #else
8882     simde__m64_private r_;
8883 
8884     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
8885       const int8_t v[sizeof(r_.i8) / sizeof(r_.i8[0])] = { e0, e1, e2, e3, e4, e5, e6, e7 };
8886       r_.neon_i8 = vld1_s8(v);
8887     #else
8888       r_.i8[0] = e0;
8889       r_.i8[1] = e1;
8890       r_.i8[2] = e2;
8891       r_.i8[3] = e3;
8892       r_.i8[4] = e4;
8893       r_.i8[5] = e5;
8894       r_.i8[6] = e6;
8895       r_.i8[7] = e7;
8896     #endif
8897 
8898     return simde__m64_from_private(r_);
8899   #endif
8900 }
8901 #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
8902 #  define _mm_set_pi8(e7, e6, e5, e4, e3, e2, e1, e0) simde_mm_set_pi8(e7, e6, e5, e4, e3, e2, e1, e0)
8903 #endif
8904 
8905 SIMDE_FUNCTION_ATTRIBUTES
8906 simde__m64
8907 simde_x_mm_set_pu8 (uint8_t e7, uint8_t e6, uint8_t e5, uint8_t e4, uint8_t e3, uint8_t e2, uint8_t e1, uint8_t e0) {
8908   simde__m64_private r_;
8909 
8910   #if defined(SIMDE_X86_MMX_NATIVE)
8911     r_.n = _mm_set_pi8(
8912         HEDLEY_STATIC_CAST(int8_t, e7),
8913         HEDLEY_STATIC_CAST(int8_t, e6),
8914         HEDLEY_STATIC_CAST(int8_t, e5),
8915         HEDLEY_STATIC_CAST(int8_t, e4),
8916         HEDLEY_STATIC_CAST(int8_t, e3),
8917         HEDLEY_STATIC_CAST(int8_t, e2),
8918         HEDLEY_STATIC_CAST(int8_t, e1),
8919         HEDLEY_STATIC_CAST(int8_t, e0));
8920   #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
8921     const uint8_t v[sizeof(r_.u8) / sizeof(r_.u8[0])] = { e0, e1, e2, e3, e4, e5, e6, e7 };
8922     r_.neon_u8 = vld1_u8(v);
8923   #else
8924     r_.u8[0] = e0;
8925     r_.u8[1] = e1;
8926     r_.u8[2] = e2;
8927     r_.u8[3] = e3;
8928     r_.u8[4] = e4;
8929     r_.u8[5] = e5;
8930     r_.u8[6] = e6;
8931     r_.u8[7] = e7;
8932   #endif
8933 
8934   return simde__m64_from_private(r_);
8935 }
8936 
8937 SIMDE_FUNCTION_ATTRIBUTES
8938 simde__m64
8939 simde_mm_set_pi16 (int16_t e3, int16_t e2, int16_t e1, int16_t e0) {
8940   #if defined(SIMDE_X86_MMX_NATIVE)
8941     return _mm_set_pi16(e3, e2, e1, e0);
8942   #else
8943     simde__m64_private r_;
8944 
8945     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
8946       const int16_t v[sizeof(r_.i16) / sizeof(r_.i16[0])] = { e0, e1, e2, e3 };
8947       r_.neon_i16 = vld1_s16(v);
8948     #else
8949       r_.i16[0] = e0;
8950       r_.i16[1] = e1;
8951       r_.i16[2] = e2;
8952       r_.i16[3] = e3;
8953     #endif
8954 
8955     return simde__m64_from_private(r_);
8956   #endif
8957 }
8958 #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
8959 #  define _mm_set_pi16(e3, e2, e1, e0) simde_mm_set_pi16(e3, e2, e1, e0)
8960 #endif
8961 
8962 SIMDE_FUNCTION_ATTRIBUTES
8963 simde__m64
8964 simde_x_mm_set_pu16 (uint16_t e3, uint16_t e2, uint16_t e1, uint16_t e0) {
8965   simde__m64_private r_;
8966 
8967 #if defined(SIMDE_X86_MMX_NATIVE)
8968   r_.n = _mm_set_pi16(
8969       HEDLEY_STATIC_CAST(int16_t, e3),
8970       HEDLEY_STATIC_CAST(int16_t, e2),
8971       HEDLEY_STATIC_CAST(int16_t, e1),
8972       HEDLEY_STATIC_CAST(int16_t, e0)
8973     );
8974 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
8975   const uint16_t v[sizeof(r_.u16) / sizeof(r_.u16[0])] = { e0, e1, e2, e3 };
8976   r_.neon_u16 = vld1_u16(v);
8977 #else
8978   r_.u16[0] = e0;
8979   r_.u16[1] = e1;
8980   r_.u16[2] = e2;
8981   r_.u16[3] = e3;
8982 #endif
8983 
8984   return simde__m64_from_private(r_);
8985 }
8986 
8987 SIMDE_FUNCTION_ATTRIBUTES
8988 simde__m64
8989 simde_x_mm_set_pu32 (uint32_t e1, uint32_t e0) {
8990   simde__m64_private r_;
8991 
8992 #if defined(SIMDE_X86_MMX_NATIVE)
8993   r_.n = _mm_set_pi32(
8994       HEDLEY_STATIC_CAST(int32_t, e1),
8995       HEDLEY_STATIC_CAST(int32_t, e0));
8996 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
8997   const uint32_t v[sizeof(r_.u32) / sizeof(r_.u32[0])] = { e0, e1 };
8998   r_.neon_u32 = vld1_u32(v);
8999 #else
9000   r_.u32[0] = e0;
9001   r_.u32[1] = e1;
9002 #endif
9003 
9004   return simde__m64_from_private(r_);
9005 }
9006 
9007 SIMDE_FUNCTION_ATTRIBUTES
9008 simde__m64
9009 simde_mm_set_pi32 (int32_t e1, int32_t e0) {
9010   simde__m64_private r_;
9011 
9012 #if defined(SIMDE_X86_MMX_NATIVE)
9013   r_.n = _mm_set_pi32(e1, e0);
9014 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
9015   const int32_t v[sizeof(r_.i32) / sizeof(r_.i32[0])] = { e0, e1 };
9016   r_.neon_i32 = vld1_s32(v);
9017 #else
9018   r_.i32[0] = e0;
9019   r_.i32[1] = e1;
9020 #endif
9021 
9022   return simde__m64_from_private(r_);
9023 }
9024 #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
9025 #  define _mm_set_pi32(e1, e0) simde_mm_set_pi32(e1, e0)
9026 #endif
9027 
9028 SIMDE_FUNCTION_ATTRIBUTES
9029 simde__m64
9030 simde_x_mm_set_pi64 (int64_t e0) {
9031   simde__m64_private r_;
9032 
9033 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
9034   const int64_t v[sizeof(r_.i64) / sizeof(r_.i64[0])] = { e0 };
9035   r_.neon_i64 = vld1_s64(v);
9036 #else
9037   r_.i64[0] = e0;
9038 #endif
9039 
9040   return simde__m64_from_private(r_);
9041 }
9042 
9043 
9044 SIMDE_FUNCTION_ATTRIBUTES
9045 simde__m64
9046 simde_x_mm_set_f32x2 (simde_float32 e1, simde_float32 e0) {
9047   simde__m64_private r_;
9048 
9049 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
9050   const simde_float32 v[sizeof(r_.f32) / sizeof(r_.f32[0])] = { e0, e1 };
9051   r_.neon_f32 = vld1_f32(v);
9052 #else
9053   r_.f32[0] = e0;
9054   r_.f32[1] = e1;
9055 #endif
9056 
9057   return simde__m64_from_private(r_);
9058 }
9059 
9060 SIMDE_FUNCTION_ATTRIBUTES
9061 simde__m64
9062 simde_mm_set1_pi8 (int8_t a) {
9063   #if defined(SIMDE_X86_MMX_NATIVE)
9064     return _mm_set1_pi8(a);
9065   #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
9066     simde__m64_private r_;
9067     r_.neon_i8 = vmov_n_s8(a);
9068     return simde__m64_from_private(r_);
9069   #else
9070     return simde_mm_set_pi8(a, a, a, a, a, a, a, a);
9071   #endif
9072 }
9073 #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
9074 #  define _mm_set1_pi8(a) simde_mm_set1_pi8(a)
9075 #endif
9076 
9077 SIMDE_FUNCTION_ATTRIBUTES
9078 simde__m64
9079 simde_mm_set1_pi16 (int16_t a) {
9080   #if defined(SIMDE_X86_MMX_NATIVE)
9081     return _mm_set1_pi16(a);
9082   #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
9083     simde__m64_private r_;
9084     r_.neon_i16 = vmov_n_s16(a);
9085     return simde__m64_from_private(r_);
9086   #else
9087     return simde_mm_set_pi16(a, a, a, a);
9088   #endif
9089 }
9090 #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
9091 #  define _mm_set1_pi16(a) simde_mm_set1_pi16(a)
9092 #endif
9093 
9094 SIMDE_FUNCTION_ATTRIBUTES
9095 simde__m64
9096 simde_mm_set1_pi32 (int32_t a) {
9097   #if defined(SIMDE_X86_MMX_NATIVE)
9098     return _mm_set1_pi32(a);
9099   #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
9100     simde__m64_private r_;
9101     r_.neon_i32 = vmov_n_s32(a);
9102     return simde__m64_from_private(r_);
9103   #else
9104     return simde_mm_set_pi32(a, a);
9105   #endif
9106 }
9107 #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
9108 #  define _mm_set1_pi32(a) simde_mm_set1_pi32(a)
9109 #endif
9110 
9111 SIMDE_FUNCTION_ATTRIBUTES
9112 simde__m64
9113 simde_mm_setr_pi8 (int8_t e7, int8_t e6, int8_t e5, int8_t e4, int8_t e3, int8_t e2, int8_t e1, int8_t e0) {
9114   #if defined(SIMDE_X86_MMX_NATIVE)
9115     return _mm_setr_pi8(e7, e6, e5, e4, e3, e2, e1, e0);
9116   #else
9117     return simde_mm_set_pi8(e0, e1, e2, e3, e4, e5, e6, e7);
9118   #endif
9119 }
9120 #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
9121 #  define _mm_setr_pi8(e7, e6, e5, e4, e3, e2, e1, e0) simde_mm_setr_pi8(e7, e6, e5, e4, e3, e2, e1, e0)
9122 #endif
9123 
9124 SIMDE_FUNCTION_ATTRIBUTES
9125 simde__m64
9126 simde_mm_setr_pi16 (int16_t e3, int16_t e2, int16_t e1, int16_t e0) {
9127   #if defined(SIMDE_X86_MMX_NATIVE)
9128     return _mm_setr_pi16(e3, e2, e1, e0);
9129   #else
9130     return simde_mm_set_pi16(e0, e1, e2, e3);
9131   #endif
9132 }
9133 #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
9134 #  define _mm_setr_pi16(e3, e2, e1, e0) simde_mm_setr_pi16(e3, e2, e1, e0)
9135 #endif
9136 
9137 SIMDE_FUNCTION_ATTRIBUTES
9138 simde__m64
9139 simde_mm_setr_pi32 (int32_t e1, int32_t e0) {
9140   #if defined(SIMDE_X86_MMX_NATIVE)
9141     return _mm_setr_pi32(e1, e0);
9142   #else
9143     return simde_mm_set_pi32(e0, e1);
9144   #endif
9145 }
9146 #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
9147 #  define _mm_setr_pi32(e1, e0) simde_mm_setr_pi32(e1, e0)
9148 #endif
9149 
9150 SIMDE_FUNCTION_ATTRIBUTES
9151 simde__m64
9152 simde_mm_setzero_si64 (void) {
9153   #if defined(SIMDE_X86_MMX_NATIVE)
9154     return _mm_setzero_si64();
9155   #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
9156     simde__m64_private r_;
9157     r_.neon_u32 = vmov_n_u32(0);
9158     return simde__m64_from_private(r_);
9159   #else
9160     return simde_mm_set_pi32(0, 0);
9161   #endif
9162 }
9163 #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
9164 #  define _mm_setzero_si64() simde_mm_setzero_si64()
9165 #endif
9166 
9167 SIMDE_FUNCTION_ATTRIBUTES
9168 simde__m64
9169 simde_x_mm_load_si64 (const void* mem_addr) {
9170   simde__m64 r;
9171   simde_memcpy(&r, SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m64), sizeof(r));
9172   return r;
9173 }
9174 
9175 SIMDE_FUNCTION_ATTRIBUTES
9176 simde__m64
9177 simde_x_mm_loadu_si64 (const void* mem_addr) {
9178   simde__m64 r;
9179   simde_memcpy(&r, mem_addr, sizeof(r));
9180   return r;
9181 }
9182 
9183 SIMDE_FUNCTION_ATTRIBUTES
9184 void
9185 simde_x_mm_store_si64 (void* mem_addr, simde__m64 value) {
9186   simde_memcpy(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m64), &value, sizeof(value));
9187 }
9188 
9189 SIMDE_FUNCTION_ATTRIBUTES
9190 void
9191 simde_x_mm_storeu_si64 (void* mem_addr, simde__m64 value) {
9192   simde_memcpy(mem_addr, &value, sizeof(value));
9193 }
9194 
9195 SIMDE_FUNCTION_ATTRIBUTES
9196 simde__m64
9197 simde_x_mm_setone_si64 (void) {
9198   return simde_mm_set1_pi32(~INT32_C(0));
9199 }
9200 
9201 SIMDE_FUNCTION_ATTRIBUTES
9202 simde__m64
9203 simde_mm_sll_pi16 (simde__m64 a, simde__m64 count) {
9204   #if defined(SIMDE_X86_MMX_NATIVE)
9205     return _mm_sll_pi16(a, count);
9206   #else
9207     simde__m64_private r_;
9208     simde__m64_private a_ = simde__m64_to_private(a);
9209     simde__m64_private count_ = simde__m64_to_private(count);
9210 
9211     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
9212       HEDLEY_DIAGNOSTIC_PUSH
9213       #if HEDLEY_HAS_WARNING("-Wvector-conversion") && SIMDE_DETECT_CLANG_VERSION_NOT(10,0,0)
9214         #pragma clang diagnostic ignored "-Wvector-conversion"
9215       #endif
9216       r_.neon_i16 = vshl_s16(a_.neon_i16, vmov_n_s16(HEDLEY_STATIC_CAST(int16_t, vget_lane_u64(count_.neon_u64, 0))));
9217       HEDLEY_DIAGNOSTIC_POP
9218     #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && defined(SIMDE_BUG_CLANG_POWER9_16x4_BAD_SHIFT)
9219       if (HEDLEY_UNLIKELY(count_.u64[0] > 15))
9220         return simde_mm_setzero_si64();
9221 
9222       r_.i16 = a_.i16 << HEDLEY_STATIC_CAST(int16_t, count_.u64[0]);
9223     #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
9224       r_.i16 = a_.i16 << count_.u64[0];
9225     #else
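      /* As with the hardware instruction, a shift count above 15 zeroes every lane. */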
9226       if (HEDLEY_UNLIKELY(count_.u64[0] > 15)) {
9227         simde_memset(&r_, 0, sizeof(r_));
9228         return simde__m64_from_private(r_);
9229       }
9230 
9231       SIMDE_VECTORIZE
9232       for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
9233         r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, a_.u16[i] << count_.u64[0]);
9234       }
9235     #endif
9236 
9237     return simde__m64_from_private(r_);
9238   #endif
9239 }
9240 #define simde_m_psllw(a, count) simde_mm_sll_pi16(a, count)
9241 #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
9242 #  define _mm_sll_pi16(a, count) simde_mm_sll_pi16(a, count)
9243 #  define _m_psllw(a, count) simde_mm_sll_pi16(a, count)
9244 #endif
9245 
9246 SIMDE_FUNCTION_ATTRIBUTES
9247 simde__m64
9248 simde_mm_sll_pi32 (simde__m64 a, simde__m64 count) {
9249   #if defined(SIMDE_X86_MMX_NATIVE)
9250     return _mm_sll_pi32(a, count);
9251   #else
9252     simde__m64_private r_;
9253     simde__m64_private a_ = simde__m64_to_private(a);
9254     simde__m64_private count_ = simde__m64_to_private(count);
9255 
9256     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
9257       HEDLEY_DIAGNOSTIC_PUSH
9258       #if HEDLEY_HAS_WARNING("-Wvector-conversion") && SIMDE_DETECT_CLANG_VERSION_NOT(10,0,0)
9259         #pragma clang diagnostic ignored "-Wvector-conversion"
9260       #endif
9261       r_.neon_i32 = vshl_s32(a_.neon_i32, vmov_n_s32(HEDLEY_STATIC_CAST(int32_t, vget_lane_u64(count_.neon_u64, 0))));
9262       HEDLEY_DIAGNOSTIC_POP
9263     #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
9264       r_.i32 = a_.i32 << count_.u64[0];
9265     #else
9266       if (HEDLEY_UNLIKELY(count_.u64[0] > 31)) {
9267         simde_memset(&r_, 0, sizeof(r_));
9268         return simde__m64_from_private(r_);
9269       }
9270 
9271       SIMDE_VECTORIZE
9272       for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
9273         r_.u32[i] = a_.u32[i] << count_.u64[0];
9274       }
9275     #endif
9276 
9277     return simde__m64_from_private(r_);
9278   #endif
9279 }
9280 #define simde_m_pslld(a, count) simde_mm_sll_pi32(a, count)
9281 #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
9282 #  define _mm_sll_pi32(a, count) simde_mm_sll_pi32(a, count)
9283 #  define _m_pslld(a, count) simde_mm_sll_pi32(a, count)
9284 #endif
9285 
9286 SIMDE_FUNCTION_ATTRIBUTES
9287 simde__m64
9288 simde_mm_slli_pi16 (simde__m64 a, int count) {
9289   #if defined(SIMDE_X86_MMX_NATIVE) && !defined(__PGI)
9290     return _mm_slli_pi16(a, count);
9291   #else
9292     simde__m64_private r_;
9293     simde__m64_private a_ = simde__m64_to_private(a);
9294 
9295     #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && defined(SIMDE_BUG_CLANG_POWER9_16x4_BAD_SHIFT)
9296       if (HEDLEY_UNLIKELY(count > 15))
9297         return simde_mm_setzero_si64();
9298 
9299       r_.i16 = a_.i16 << HEDLEY_STATIC_CAST(int16_t, count);
9300     #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
9301       r_.i16 = a_.i16 << count;
9303     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
9304       r_.neon_i16 = vshl_s16(a_.neon_i16, vmov_n_s16((int16_t) count));
9305     #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
9306       r_.mmi_i16 = psllh_s(a_.mmi_i16, HEDLEY_STATIC_CAST(uint8_t, count));
9307     #else
9308       SIMDE_VECTORIZE
9309       for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
9310         r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, a_.u16[i] << count);
9311       }
9312     #endif
9313 
9314     return simde__m64_from_private(r_);
9315   #endif
9316 }
9317 #define simde_m_psllwi(a, count) simde_mm_slli_pi16(a, count)
9318 #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
9319 #  define _mm_slli_pi16(a, count) simde_mm_slli_pi16(a, count)
9320 #  define _m_psllwi(a, count) simde_mm_slli_pi16(a, count)
9321 #endif
9322 
9323 SIMDE_FUNCTION_ATTRIBUTES
9324 simde__m64
9325 simde_mm_slli_pi32 (simde__m64 a, int count) {
9326   #if defined(SIMDE_X86_MMX_NATIVE) && !defined(__PGI)
9327     return _mm_slli_pi32(a, count);
9328   #else
9329     simde__m64_private r_;
9330     simde__m64_private a_ = simde__m64_to_private(a);
9331 
9332     #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
9333       r_.i32 = a_.i32 << count;
9334     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
9335       r_.neon_i32 = vshl_s32(a_.neon_i32, vmov_n_s32((int32_t) count));
9336     #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
9337       r_.mmi_i32 = psllw_s(a_.mmi_i32, HEDLEY_STATIC_CAST(uint8_t, count));
9338     #else
9339       SIMDE_VECTORIZE
9340       for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
9341         r_.u32[i] = a_.u32[i] << count;
9342       }
9343     #endif
9344 
9345     return simde__m64_from_private(r_);
9346   #endif
9347 }
9348 #define simde_m_pslldi(a, b) simde_mm_slli_pi32(a, b)
9349 #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
9350 #  define _mm_slli_pi32(a, count) simde_mm_slli_pi32(a, count)
9351 #  define _m_pslldi(a, count) simde_mm_slli_pi32(a, count)
9352 #endif
9353 
9354 SIMDE_FUNCTION_ATTRIBUTES
9355 simde__m64
9356 simde_mm_slli_si64 (simde__m64 a, int count) {
9357   #if defined(SIMDE_X86_MMX_NATIVE)
9358     return _mm_slli_si64(a, count);
9359   #else
9360     simde__m64_private r_;
9361     simde__m64_private a_ = simde__m64_to_private(a);
9362 
9363     #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
9364       r_.i64 = a_.i64 << count;
9365     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
9366       r_.neon_i64 = vshl_s64(a_.neon_i64, vmov_n_s64((int64_t) count));
9367     #else
9368       r_.u64[0] = a_.u64[0] << count;
9369     #endif
9370 
9371     return simde__m64_from_private(r_);
9372   #endif
9373 }
9374 #define simde_m_psllqi(a, count) simde_mm_slli_si64(a, count)
9375 #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
9376 #  define _mm_slli_si64(a, count) simde_mm_slli_si64(a, count)
9377 #  define _m_psllqi(a, count) simde_mm_slli_si64(a, count)
9378 #endif
9379 
9380 SIMDE_FUNCTION_ATTRIBUTES
9381 simde__m64
9382 simde_mm_sll_si64 (simde__m64 a, simde__m64 count) {
9383   #if defined(SIMDE_X86_MMX_NATIVE)
9384     return _mm_sll_si64(a, count);
9385   #else
9386     simde__m64_private r_;
9387     simde__m64_private a_ = simde__m64_to_private(a);
9388     simde__m64_private count_ = simde__m64_to_private(count);
9389 
9390     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
9391       r_.neon_i64 = vshl_s64(a_.neon_i64, count_.neon_i64);
9392     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
9393       r_.i64 = a_.i64 << count_.i64;
9394     #else
9395       if (HEDLEY_UNLIKELY(count_.u64[0] > 63)) {
9396         simde_memset(&r_, 0, sizeof(r_));
9397         return simde__m64_from_private(r_);
9398       }
9399 
9400       r_.u64[0] = a_.u64[0] << count_.u64[0];
9401     #endif
9402 
9403     return simde__m64_from_private(r_);
9404   #endif
9405 }
9406 #define simde_m_psllq(a, count) simde_mm_sll_si64(a, count)
9407 #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
9408 #  define _mm_sll_si64(a, count) simde_mm_sll_si64(a, count)
9409 #  define _m_psllq(a, count) simde_mm_sll_si64(a, count)
9410 #endif
9411 
9412 SIMDE_FUNCTION_ATTRIBUTES
9413 simde__m64
9414 simde_mm_srl_pi16 (simde__m64 a, simde__m64 count) {
9415   #if defined(SIMDE_X86_MMX_NATIVE)
9416     return _mm_srl_pi16(a, count);
9417   #else
9418     simde__m64_private r_;
9419     simde__m64_private a_ = simde__m64_to_private(a);
9420     simde__m64_private count_ = simde__m64_to_private(count);
9421 
9422     #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && defined(SIMDE_BUG_CLANG_POWER9_16x4_BAD_SHIFT)
9423       if (HEDLEY_UNLIKELY(count_.u64[0] > 15))
9424         return simde_mm_setzero_si64();
9425 
9426       r_.u16 = a_.u16 >> HEDLEY_STATIC_CAST(uint16_t, count_.u64[0]);
9427     #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
9428       r_.u16 = a_.u16 >> count_.u64[0];
9429     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
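      /* NEON has no shift-right-by-vector, so shift left by the negated count instead. */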
9430       r_.neon_u16 = vshl_u16(a_.neon_u16, vmov_n_s16(-((int16_t) vget_lane_u64(count_.neon_u64, 0))));
9431     #else
9432       if (HEDLEY_UNLIKELY(count_.u64[0] > 15)) {
9433         simde_memset(&r_, 0, sizeof(r_));
9434         return simde__m64_from_private(r_);
9435       }
9436 
9437       SIMDE_VECTORIZE
9438       for (size_t i = 0 ; i < sizeof(r_.u16) / sizeof(r_.u16[0]) ; i++) {
9439         r_.u16[i] = a_.u16[i] >> count_.u64[0];
9440       }
9441     #endif
9442 
9443     return simde__m64_from_private(r_);
9444   #endif
9445 }
9446 #define simde_m_psrlw(a, count) simde_mm_srl_pi16(a, count)
9447 #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
9448 #  define _mm_srl_pi16(a, count) simde_mm_srl_pi16(a, count)
9449 #  define _m_psrlw(a, count) simde_mm_srl_pi16(a, count)
9450 #endif
9451 
9452 SIMDE_FUNCTION_ATTRIBUTES
9453 simde__m64
9454 simde_mm_srl_pi32 (simde__m64 a, simde__m64 count) {
9455   #if defined(SIMDE_X86_MMX_NATIVE)
9456     return _mm_srl_pi32(a, count);
9457   #else
9458     simde__m64_private r_;
9459     simde__m64_private a_ = simde__m64_to_private(a);
9460     simde__m64_private count_ = simde__m64_to_private(count);
9461 
9462     #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
9463       r_.u32 = a_.u32 >> count_.u64[0];
9464     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
9465       r_.neon_u32 = vshl_u32(a_.neon_u32, vmov_n_s32(-((int32_t) vget_lane_u64(count_.neon_u64, 0))));
9466     #else
9467       if (HEDLEY_UNLIKELY(count_.u64[0] > 31)) {
9468         simde_memset(&r_, 0, sizeof(r_));
9469         return simde__m64_from_private(r_);
9470       }
9471 
9472       SIMDE_VECTORIZE
9473       for (size_t i = 0 ; i < sizeof(r_.u32) / sizeof(r_.u32[0]) ; i++) {
9474         r_.u32[i] = a_.u32[i] >> count_.u64[0];
9475       }
9476     #endif
9477 
9478     return simde__m64_from_private(r_);
9479   #endif
9480 }
9481 #define simde_m_psrld(a, count) simde_mm_srl_pi32(a, count)
9482 #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
9483 #  define _mm_srl_pi32(a, count) simde_mm_srl_pi32(a, count)
9484 #  define _m_psrld(a, count) simde_mm_srl_pi32(a, count)
9485 #endif
9486 
9487 SIMDE_FUNCTION_ATTRIBUTES
9488 simde__m64
9489 simde_mm_srli_pi16 (simde__m64 a, int count) {
9490   #if defined(SIMDE_X86_MMX_NATIVE) && !defined(__PGI)
9491     return _mm_srli_pi16(a, count);
9492   #else
9493     simde__m64_private r_;
9494     simde__m64_private a_ = simde__m64_to_private(a);
9495 
9496     #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
9497       r_.u16 = a_.u16 >> count;
9498     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
9499       r_.neon_u16 = vshl_u16(a_.neon_u16, vmov_n_s16(-((int16_t) count)));
9500     #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
9501       r_.mmi_i16 = psrlh_s(a_.mmi_i16, HEDLEY_STATIC_CAST(uint8_t, count));
9502     #else
9503       SIMDE_VECTORIZE
9504       for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
9505         r_.u16[i] = a_.u16[i] >> count;
9506       }
9507     #endif
9508 
9509     return simde__m64_from_private(r_);
9510   #endif
9511 }
9512 #define simde_m_psrlwi(a, count) simde_mm_srli_pi16(a, count)
9513 #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
9514 #  define _mm_srli_pi16(a, count) simde_mm_srli_pi16(a, count)
9515 #  define _m_psrlwi(a, count) simde_mm_srli_pi16(a, count)
9516 #endif
9517 
9518 SIMDE_FUNCTION_ATTRIBUTES
9519 simde__m64
9520 simde_mm_srli_pi32 (simde__m64 a, int count) {
9521   #if defined(SIMDE_X86_MMX_NATIVE) && !defined(__PGI)
9522     return _mm_srli_pi32(a, count);
9523   #else
9524     simde__m64_private r_;
9525     simde__m64_private a_ = simde__m64_to_private(a);
9526 
9527     #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
9528       r_.u32 = a_.u32 >> count;
9529     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
9530       r_.neon_u32 = vshl_u32(a_.neon_u32, vmov_n_s32(-((int32_t) count)));
9531     #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
9532       r_.mmi_i32 = psrlw_s(a_.mmi_i32, HEDLEY_STATIC_CAST(uint8_t, count));
9533     #else
9534       SIMDE_VECTORIZE
9535       for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
9536         r_.u32[i] = a_.u32[i] >> count;
9537       }
9538     #endif
9539 
9540     return simde__m64_from_private(r_);
9541   #endif
9542 }
9543 #define simde_m_psrldi(a, count) simde_mm_srli_pi32(a, count)
9544 #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
9545 #  define _mm_srli_pi32(a, count) simde_mm_srli_pi32(a, count)
9546 #  define _m_psrldi(a, count) simde_mm_srli_pi32(a, count)
9547 #endif
9548 
9549 SIMDE_FUNCTION_ATTRIBUTES
9550 simde__m64
9551 simde_mm_srli_si64 (simde__m64 a, int count) {
9552   #if defined(SIMDE_X86_MMX_NATIVE) && !defined(__PGI)
9553     return _mm_srli_si64(a, count);
9554   #else
9555     simde__m64_private r_;
9556     simde__m64_private a_ = simde__m64_to_private(a);
9557 
9558     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
9559       r_.neon_u64 = vshl_u64(a_.neon_u64, vmov_n_s64(-count));
9560     #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
9561       r_.u64 = a_.u64 >> count;
9562     #else
9563       r_.u64[0] = a_.u64[0] >> count;
9564     #endif
9565 
9566     return simde__m64_from_private(r_);
9567   #endif
9568 }
9569 #define simde_m_psrlqi(a, count) simde_mm_srli_si64(a, count)
9570 #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
9571 #  define _mm_srli_si64(a, count) simde_mm_srli_si64(a, count)
9572 #  define _m_psrlqi(a, count) simde_mm_srli_si64(a, count)
9573 #endif
9574 
9575 SIMDE_FUNCTION_ATTRIBUTES
9576 simde__m64
9577 simde_mm_srl_si64 (simde__m64 a, simde__m64 count) {
9578   #if defined(SIMDE_X86_MMX_NATIVE)
9579     return _mm_srl_si64(a, count);
9580   #else
9581     simde__m64_private r_;
9582     simde__m64_private a_ = simde__m64_to_private(a);
9583     simde__m64_private count_ = simde__m64_to_private(count);
9584 
9585     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
9586       r_.neon_u64 = vshl_u64(a_.neon_u64, vneg_s64(count_.neon_i64));
9587     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
9588       r_.u64 = a_.u64 >> count_.u64;
9589     #else
9590       if (HEDLEY_UNLIKELY(count_.u64[0] > 63)) {
9591         simde_memset(&r_, 0, sizeof(r_));
9592         return simde__m64_from_private(r_);
9593       }
9594 
9595       r_.u64[0] = a_.u64[0] >> count_.u64[0];
9596     #endif
9597 
9598     return simde__m64_from_private(r_);
9599   #endif
9600 }
9601 #define simde_m_psrlq(a, count) simde_mm_srl_si64(a, count)
9602 #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
9603 #  define _mm_srl_si64(a, count) simde_mm_srl_si64(a, count)
9604 #  define _m_psrlq(a, count) simde_mm_srl_si64(a, count)
9605 #endif
9606 
9607 SIMDE_FUNCTION_ATTRIBUTES
9608 simde__m64
9609 simde_mm_srai_pi16 (simde__m64 a, int count) {
9610   #if defined(SIMDE_X86_MMX_NATIVE) && !defined(__PGI)
9611     return _mm_srai_pi16(a, count);
9612   #else
9613     simde__m64_private r_;
9614     simde__m64_private a_ = simde__m64_to_private(a);
9615 
9616     #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
9617       r_.i16 = a_.i16 >> (count & 0xff);
9618     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
9619       r_.neon_i16 = vshl_s16(a_.neon_i16, vmov_n_s16(-HEDLEY_STATIC_CAST(int16_t, count)));
9620     #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
9621       r_.mmi_i16 = psrah_s(a_.mmi_i16, count);
9622     #else
9623       SIMDE_VECTORIZE
9624       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
9625         r_.i16[i] = a_.i16[i] >> (count & 0xff);
9626       }
9627     #endif
9628 
9629     return simde__m64_from_private(r_);
9630   #endif
9631 }
9632 #define simde_m_psrawi(a, count) simde_mm_srai_pi16(a, count)
9633 #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
9634 #  define _mm_srai_pi16(a, count) simde_mm_srai_pi16(a, count)
9635 #  define _m_psrawi(a, count) simde_mm_srai_pi16(a, count)
9636 #endif
9637 
9638 SIMDE_FUNCTION_ATTRIBUTES
9639 simde__m64
9640 simde_mm_srai_pi32 (simde__m64 a, int count) {
9641   #if defined(SIMDE_X86_MMX_NATIVE) && !defined(__PGI)
9642     return _mm_srai_pi32(a, count);
9643   #else
9644     simde__m64_private r_;
9645     simde__m64_private a_ = simde__m64_to_private(a);
9646 
9647     #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
9648       r_.i32 = a_.i32 >> (count & 0xff);
9649     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
9650       r_.neon_i32 = vshl_s32(a_.neon_i32, vmov_n_s32(-HEDLEY_STATIC_CAST(int32_t, count)));
9651     #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
9652       r_.mmi_i32 = psraw_s(a_.mmi_i32, count);
9653     #else
9654       SIMDE_VECTORIZE
9655       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
9656         r_.i32[i] = a_.i32[i] >> (count & 0xff);
9657       }
9658     #endif
9659 
9660     return simde__m64_from_private(r_);
9661   #endif
9662 }
9663 #define simde_m_psradi(a, count) simde_mm_srai_pi32(a, count)
9664 #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
9665 #  define _mm_srai_pi32(a, count) simde_mm_srai_pi32(a, count)
9666 #  define _m_psradi(a, count) simde_mm_srai_pi32(a, count)
9667 #endif
9668 
9669 SIMDE_FUNCTION_ATTRIBUTES
9670 simde__m64
9671 simde_mm_sra_pi16 (simde__m64 a, simde__m64 count) {
9672   #if defined(SIMDE_X86_MMX_NATIVE)
9673     return _mm_sra_pi16(a, count);
9674   #else
9675     simde__m64_private r_;
9676     simde__m64_private a_ = simde__m64_to_private(a);
9677     simde__m64_private count_ = simde__m64_to_private(count);
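    /* Arithmetic shift counts saturate at 15; larger counts leave only the sign bits. */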
9678     const int cnt = HEDLEY_STATIC_CAST(int, (count_.i64[0] > 15 ? 15 : count_.i64[0]));
9679 
9680     #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
9681       r_.i16 = a_.i16 >> cnt;
9682     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
9683       r_.neon_i16 = vshl_s16(a_.neon_i16, vmov_n_s16(-HEDLEY_STATIC_CAST(int16_t, vget_lane_u64(count_.neon_u64, 0))));
9684     #else
9685       SIMDE_VECTORIZE
9686       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
9687         r_.i16[i] = a_.i16[i] >> cnt;
9688       }
9689     #endif
9690 
9691     return simde__m64_from_private(r_);
9692   #endif
9693 }
9694 #define simde_m_psraw(a, count) simde_mm_sra_pi16(a, count)
9695 #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
9696 #  define _mm_sra_pi16(a, count) simde_mm_sra_pi16(a, count)
9697 #  define _m_psraw(a, count) simde_mm_sra_pi16(a, count)
9698 #endif
9699 
9700 SIMDE_FUNCTION_ATTRIBUTES
9701 simde__m64
simde_mm_sra_pi32 (simde__m64 a, simde__m64 count) {
9703   #if defined(SIMDE_X86_MMX_NATIVE)
9704     return _mm_sra_pi32(a, count);
9705   #else
9706     simde__m64_private r_;
9707     simde__m64_private a_ = simde__m64_to_private(a);
9708     simde__m64_private count_ = simde__m64_to_private(count);
9709     const int32_t cnt = (count_.u64[0] > 31) ? 31 : HEDLEY_STATIC_CAST(int32_t, count_.u64[0]);
9710 
9711     #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
9712       r_.i32 = a_.i32 >> cnt;
9713     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
9714       r_.neon_i32 = vshl_s32(a_.neon_i32, vmov_n_s32(-HEDLEY_STATIC_CAST(int32_t, vget_lane_u64(count_.neon_u64, 0))));
9715     #else
9716       SIMDE_VECTORIZE
9717       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
9718         r_.i32[i] = a_.i32[i] >> cnt;
9719       }
9720     #endif
9721 
9722     return simde__m64_from_private(r_);
9723   #endif
9724 }
9725 #define simde_m_psrad(a, b) simde_mm_sra_pi32(a, b)
9726 #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
9727 #  define _mm_sra_pi32(a, count) simde_mm_sra_pi32(a, count)
9728 #  define _m_psrad(a, count) simde_mm_sra_pi32(a, count)
9729 #endif
9730 
9731 SIMDE_FUNCTION_ATTRIBUTES
9732 simde__m64
simde_mm_sub_pi8 (simde__m64 a, simde__m64 b) {
9734   #if defined(SIMDE_X86_MMX_NATIVE)
9735     return _mm_sub_pi8(a, b);
9736   #else
9737     simde__m64_private r_;
9738     simde__m64_private a_ = simde__m64_to_private(a);
9739     simde__m64_private b_ = simde__m64_to_private(b);
9740 
9741     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
9742       r_.neon_i8 = vsub_s8(a_.neon_i8, b_.neon_i8);
9743     #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
9744       r_.mmi_i8 = psubb_s(a_.mmi_i8, b_.mmi_i8);
9745     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
9746       r_.i8 = a_.i8 - b_.i8;
9747     #else
9748       SIMDE_VECTORIZE
9749       for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
9750         r_.i8[i] = a_.i8[i] - b_.i8[i];
9751       }
9752     #endif
9753 
9754     return simde__m64_from_private(r_);
9755   #endif
9756 }
9757 #define simde_m_psubb(a, b) simde_mm_sub_pi8(a, b)
9758 #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
9759 #  define _mm_sub_pi8(a, b) simde_mm_sub_pi8(a, b)
9760 #  define _m_psubb(a, b) simde_mm_sub_pi8(a, b)
9761 #endif
9762 
9763 SIMDE_FUNCTION_ATTRIBUTES
9764 simde__m64
simde_mm_sub_pi16 (simde__m64 a, simde__m64 b) {
9766   #if defined(SIMDE_X86_MMX_NATIVE)
9767     return _mm_sub_pi16(a, b);
9768   #else
9769     simde__m64_private r_;
9770     simde__m64_private a_ = simde__m64_to_private(a);
9771     simde__m64_private b_ = simde__m64_to_private(b);
9772 
9773     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
9774       r_.neon_i16 = vsub_s16(a_.neon_i16, b_.neon_i16);
9775     #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
9776       r_.mmi_i16 = psubh_s(a_.mmi_i16, b_.mmi_i16);
9777     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
9778       r_.i16 = a_.i16 - b_.i16;
9779     #else
9780       SIMDE_VECTORIZE
9781       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
9782         r_.i16[i] = a_.i16[i] - b_.i16[i];
9783       }
9784     #endif
9785 
9786     return simde__m64_from_private(r_);
9787   #endif
9788 }
9789 #define simde_m_psubw(a, b) simde_mm_sub_pi16(a, b)
9790 #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
9791 #  define _mm_sub_pi16(a, b) simde_mm_sub_pi16(a, b)
9792 #  define _m_psubw(a, b) simde_mm_sub_pi16(a, b)
9793 #endif
9794 
9795 SIMDE_FUNCTION_ATTRIBUTES
9796 simde__m64
simde_mm_sub_pi32 (simde__m64 a, simde__m64 b) {
9798   #if defined(SIMDE_X86_MMX_NATIVE)
9799     return _mm_sub_pi32(a, b);
9800   #else
9801     simde__m64_private r_;
9802     simde__m64_private a_ = simde__m64_to_private(a);
9803     simde__m64_private b_ = simde__m64_to_private(b);
9804 
9805     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
9806       r_.neon_i32 = vsub_s32(a_.neon_i32, b_.neon_i32);
9807     #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
9808       r_.mmi_i32 = psubw_s(a_.mmi_i32, b_.mmi_i32);
9809     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
9810       r_.i32 = a_.i32 - b_.i32;
9811     #else
9812       SIMDE_VECTORIZE
9813       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
9814         r_.i32[i] = a_.i32[i] - b_.i32[i];
9815       }
9816     #endif
9817 
9818     return simde__m64_from_private(r_);
9819   #endif
9820 }
9821 #define simde_m_psubd(a, b) simde_mm_sub_pi32(a, b)
9822 #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
9823 #  define _mm_sub_pi32(a, b) simde_mm_sub_pi32(a, b)
9824 #  define _m_psubd(a, b) simde_mm_sub_pi32(a, b)
9825 #endif
9826 
9827 SIMDE_FUNCTION_ATTRIBUTES
9828 simde__m64
simde_mm_subs_pi8 (simde__m64 a, simde__m64 b) {
9830   #if defined(SIMDE_X86_MMX_NATIVE)
9831     return _mm_subs_pi8(a, b);
9832   #else
9833     simde__m64_private r_;
9834     simde__m64_private a_ = simde__m64_to_private(a);
9835     simde__m64_private b_ = simde__m64_to_private(b);
9836 
9837     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
9838       r_.neon_i8 = vqsub_s8(a_.neon_i8, b_.neon_i8);
9839     #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
9840       r_.mmi_i8 = psubsb(a_.mmi_i8, b_.mmi_i8);
9841     #else
9842       SIMDE_VECTORIZE
9843       for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
9844         if (((b_.i8[i]) > 0 && (a_.i8[i]) < INT8_MIN + (b_.i8[i]))) {
9845           r_.i8[i] = INT8_MIN;
9846         } else if ((b_.i8[i]) < 0 && (a_.i8[i]) > INT8_MAX + (b_.i8[i])) {
9847           r_.i8[i] = INT8_MAX;
9848         } else {
9849           r_.i8[i] = (a_.i8[i]) - (b_.i8[i]);
9850         }
9851       }
9852     #endif
9853 
9854     return simde__m64_from_private(r_);
9855   #endif
9856 }
9857 #define simde_m_psubsb(a, b) simde_mm_subs_pi8(a, b)
9858 #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
9859 #  define _mm_subs_pi8(a, b) simde_mm_subs_pi8(a, b)
9860 #  define _m_psubsb(a, b) simde_mm_subs_pi8(a, b)
9861 #endif
9862 
9863 SIMDE_FUNCTION_ATTRIBUTES
9864 simde__m64
simde_mm_subs_pu8 (simde__m64 a, simde__m64 b) {
9866   #if defined(SIMDE_X86_MMX_NATIVE)
9867     return _mm_subs_pu8(a, b);
9868   #else
9869     simde__m64_private r_;
9870     simde__m64_private a_ = simde__m64_to_private(a);
9871     simde__m64_private b_ = simde__m64_to_private(b);
9872 
9873     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
9874       r_.neon_u8 = vqsub_u8(a_.neon_u8, b_.neon_u8);
9875     #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
9876       r_.mmi_u8 = psubusb(a_.mmi_u8, b_.mmi_u8);
9877     #else
9878       SIMDE_VECTORIZE
9879       for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
9880         const int32_t x = a_.u8[i] - b_.u8[i];
9881         if (x < 0) {
9882           r_.u8[i] = 0;
9883         } else if (x > UINT8_MAX) {
9884           r_.u8[i] = UINT8_MAX;
9885         } else {
9886           r_.u8[i] = HEDLEY_STATIC_CAST(uint8_t, x);
9887         }
9888       }
9889     #endif
9890 
9891     return simde__m64_from_private(r_);
9892   #endif
9893 }
9894 #define simde_m_psubusb(a, b) simde_mm_subs_pu8(a, b)
9895 #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
9896 #  define _mm_subs_pu8(a, b) simde_mm_subs_pu8(a, b)
9897 #  define _m_psubusb(a, b) simde_mm_subs_pu8(a, b)
9898 #endif
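
/* Illustrative usage (editor's addition): unsigned saturating subtraction
 * clamps at zero instead of wrapping around.  Sketch, using
 * simde_mm_set_pi8 defined earlier in this header:
 *
 *   simde__m64 a = simde_mm_set_pi8(10, 10, 10, 10, 10, 10, 10, 10);
 *   simde__m64 b = simde_mm_set_pi8( 3,  3,  3,  3, 25, 25, 25, 25);
 *   simde__m64 r = simde_mm_subs_pu8(a, b);
 *   // lanes where b > a saturate to 0; the remaining lanes hold 10 - 3 = 7
 */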
9899 
9900 SIMDE_FUNCTION_ATTRIBUTES
9901 simde__m64
simde_mm_subs_pi16 (simde__m64 a, simde__m64 b) {
9903   #if defined(SIMDE_X86_MMX_NATIVE)
9904     return _mm_subs_pi16(a, b);
9905   #else
9906     simde__m64_private r_;
9907     simde__m64_private a_ = simde__m64_to_private(a);
9908     simde__m64_private b_ = simde__m64_to_private(b);
9909 
9910     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
9911       r_.neon_i16 = vqsub_s16(a_.neon_i16, b_.neon_i16);
9912     #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
9913       r_.mmi_i16 = psubsh(a_.mmi_i16, b_.mmi_i16);
9914     #else
9915       SIMDE_VECTORIZE
9916       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
        if (((b_.i16[i]) > 0 && (a_.i16[i]) < INT16_MIN + (b_.i16[i]))) {
          r_.i16[i] = INT16_MIN;
9919         } else if ((b_.i16[i]) < 0 && (a_.i16[i]) > INT16_MAX + (b_.i16[i])) {
9920           r_.i16[i] = INT16_MAX;
9921         } else {
9922           r_.i16[i] = (a_.i16[i]) - (b_.i16[i]);
9923         }
9924       }
9925     #endif
9926 
9927     return simde__m64_from_private(r_);
9928   #endif
9929 }
9930 #define simde_m_psubsw(a, b) simde_mm_subs_pi16(a, b)
9931 #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
9932 #  define _mm_subs_pi16(a, b) simde_mm_subs_pi16(a, b)
9933 #  define _m_psubsw(a, b) simde_mm_subs_pi16(a, b)
9934 #endif
9935 
9936 SIMDE_FUNCTION_ATTRIBUTES
9937 simde__m64
simde_mm_subs_pu16 (simde__m64 a, simde__m64 b) {
9939   #if defined(SIMDE_X86_MMX_NATIVE)
9940     return _mm_subs_pu16(a, b);
9941   #else
9942     simde__m64_private r_;
9943     simde__m64_private a_ = simde__m64_to_private(a);
9944     simde__m64_private b_ = simde__m64_to_private(b);
9945 
9946     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
9947       r_.neon_u16 = vqsub_u16(a_.neon_u16, b_.neon_u16);
9948     #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
9949       r_.mmi_u16 = psubush(a_.mmi_u16, b_.mmi_u16);
9950     #else
9951       SIMDE_VECTORIZE
9952       for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
        const int32_t x = a_.u16[i] - b_.u16[i];
9954         if (x < 0) {
9955           r_.u16[i] = 0;
9956         } else if (x > UINT16_MAX) {
9957           r_.u16[i] = UINT16_MAX;
9958         } else {
9959           r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, x);
9960         }
9961       }
9962     #endif
9963 
9964     return simde__m64_from_private(r_);
9965   #endif
9966 }
9967 #define simde_m_psubusw(a, b) simde_mm_subs_pu16(a, b)
9968 #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
9969 #  define _mm_subs_pu16(a, b) simde_mm_subs_pu16(a, b)
9970 #  define _m_psubusw(a, b) simde_mm_subs_pu16(a, b)
9971 #endif
9972 
9973 SIMDE_FUNCTION_ATTRIBUTES
9974 simde__m64
simde_mm_unpackhi_pi8 (simde__m64 a, simde__m64 b) {
9976   #if defined(SIMDE_X86_MMX_NATIVE)
9977     return _mm_unpackhi_pi8(a, b);
9978   #else
9979     simde__m64_private r_;
9980     simde__m64_private a_ = simde__m64_to_private(a);
9981     simde__m64_private b_ = simde__m64_to_private(b);
9982 
9983     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
9984       r_.neon_i8 = vzip2_s8(a_.neon_i8, b_.neon_i8);
9985     #elif defined(SIMDE_SHUFFLE_VECTOR_)
9986       r_.i8 = SIMDE_SHUFFLE_VECTOR_(8, 8, a_.i8, b_.i8, 4, 12, 5, 13, 6, 14, 7, 15);
9987     #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
9988       r_.mmi_i8 = punpckhbh_s(a_.mmi_i8, b_.mmi_i8);
9989     #else
9990       r_.i8[0] = a_.i8[4];
9991       r_.i8[1] = b_.i8[4];
9992       r_.i8[2] = a_.i8[5];
9993       r_.i8[3] = b_.i8[5];
9994       r_.i8[4] = a_.i8[6];
9995       r_.i8[5] = b_.i8[6];
9996       r_.i8[6] = a_.i8[7];
9997       r_.i8[7] = b_.i8[7];
9998     #endif
9999 
10000     return simde__m64_from_private(r_);
10001   #endif
10002 }
10003 #define simde_m_punpckhbw(a, b) simde_mm_unpackhi_pi8(a, b)
10004 #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
10005 #  define _mm_unpackhi_pi8(a, b) simde_mm_unpackhi_pi8(a, b)
10006 #  define _m_punpckhbw(a, b) simde_mm_unpackhi_pi8(a, b)
10007 #endif
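
/* Illustrative usage (editor's addition): unpackhi interleaves the upper
 * four bytes of each operand (a4 b4 a5 b5 a6 b6 a7 b7), matching the
 * portable fallback above.  Sketch, using simde_mm_setr_pi8 defined earlier
 * in this header:
 *
 *   simde__m64 a = simde_mm_setr_pi8( 0,  1,  2,  3,  4,  5,  6,  7);
 *   simde__m64 b = simde_mm_setr_pi8(10, 11, 12, 13, 14, 15, 16, 17);
 *   simde__m64 r = simde_mm_unpackhi_pi8(a, b);
 *   // r, lowest byte first: 4, 14, 5, 15, 6, 16, 7, 17
 */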
10008 
10009 SIMDE_FUNCTION_ATTRIBUTES
10010 simde__m64
simde_mm_unpackhi_pi16 (simde__m64 a, simde__m64 b) {
10012   #if defined(SIMDE_X86_MMX_NATIVE)
10013     return _mm_unpackhi_pi16(a, b);
10014   #else
10015     simde__m64_private r_;
10016     simde__m64_private a_ = simde__m64_to_private(a);
10017     simde__m64_private b_ = simde__m64_to_private(b);
10018 
10019     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
10020       r_.neon_i16 = vzip2_s16(a_.neon_i16, b_.neon_i16);
10021     #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
10022       r_.mmi_i16 = punpckhhw_s(a_.mmi_i16, b_.mmi_i16);
10023     #elif defined(SIMDE_SHUFFLE_VECTOR_)
10024       r_.i16 = SIMDE_SHUFFLE_VECTOR_(16, 8, a_.i16, b_.i16, 2, 6, 3, 7);
10025     #else
10026       r_.i16[0] = a_.i16[2];
10027       r_.i16[1] = b_.i16[2];
10028       r_.i16[2] = a_.i16[3];
10029       r_.i16[3] = b_.i16[3];
10030     #endif
10031 
10032     return simde__m64_from_private(r_);
10033   #endif
10034 }
10035 #define simde_m_punpckhwd(a, b) simde_mm_unpackhi_pi16(a, b)
10036 #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
10037 #  define _mm_unpackhi_pi16(a, b) simde_mm_unpackhi_pi16(a, b)
10038 #  define _m_punpckhwd(a, b) simde_mm_unpackhi_pi16(a, b)
10039 #endif
10040 
10041 SIMDE_FUNCTION_ATTRIBUTES
10042 simde__m64
simde_mm_unpackhi_pi32 (simde__m64 a, simde__m64 b) {
10044   #if defined(SIMDE_X86_MMX_NATIVE)
10045     return _mm_unpackhi_pi32(a, b);
10046   #else
10047     simde__m64_private r_;
10048     simde__m64_private a_ = simde__m64_to_private(a);
10049     simde__m64_private b_ = simde__m64_to_private(b);
10050 
10051     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
10052       r_.neon_i32 = vzip2_s32(a_.neon_i32, b_.neon_i32);
10053     #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
10054       r_.mmi_i32 = punpckhwd_s(a_.mmi_i32, b_.mmi_i32);
10055     #elif defined(SIMDE_SHUFFLE_VECTOR_)
10056       r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 8, a_.i32, b_.i32, 1, 3);
10057     #else
10058       r_.i32[0] = a_.i32[1];
10059       r_.i32[1] = b_.i32[1];
10060     #endif
10061 
10062     return simde__m64_from_private(r_);
10063   #endif
10064 }
10065 #define simde_m_punpckhdq(a, b) simde_mm_unpackhi_pi32(a, b)
10066 #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
10067 #  define _mm_unpackhi_pi32(a, b) simde_mm_unpackhi_pi32(a, b)
10068 #  define _m_punpckhdq(a, b) simde_mm_unpackhi_pi32(a, b)
10069 #endif
10070 
10071 SIMDE_FUNCTION_ATTRIBUTES
10072 simde__m64
simde_mm_unpacklo_pi8 (simde__m64 a, simde__m64 b) {
10074   #if defined(SIMDE_X86_MMX_NATIVE)
10075     return _mm_unpacklo_pi8(a, b);
10076   #else
10077     simde__m64_private r_;
10078     simde__m64_private a_ = simde__m64_to_private(a);
10079     simde__m64_private b_ = simde__m64_to_private(b);
10080 
10081     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
10082       r_.neon_i8 = vzip1_s8(a_.neon_i8, b_.neon_i8);
10083     #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
10084       r_.mmi_i8 = punpcklbh_s(a_.mmi_i8, b_.mmi_i8);
10085     #elif defined(SIMDE_SHUFFLE_VECTOR_)
10086       r_.i8 = SIMDE_SHUFFLE_VECTOR_(8, 8, a_.i8, b_.i8, 0, 8, 1, 9, 2, 10, 3, 11);
10087     #else
10088       r_.i8[0] = a_.i8[0];
10089       r_.i8[1] = b_.i8[0];
10090       r_.i8[2] = a_.i8[1];
10091       r_.i8[3] = b_.i8[1];
10092       r_.i8[4] = a_.i8[2];
10093       r_.i8[5] = b_.i8[2];
10094       r_.i8[6] = a_.i8[3];
10095       r_.i8[7] = b_.i8[3];
10096     #endif
10097 
10098     return simde__m64_from_private(r_);
10099   #endif
10100 }
10101 #define simde_m_punpcklbw(a, b) simde_mm_unpacklo_pi8(a, b)
10102 #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
10103 #  define _mm_unpacklo_pi8(a, b) simde_mm_unpacklo_pi8(a, b)
10104 #  define _m_punpcklbw(a, b) simde_mm_unpacklo_pi8(a, b)
10105 #endif
10106 
10107 SIMDE_FUNCTION_ATTRIBUTES
10108 simde__m64
simde_mm_unpacklo_pi16 (simde__m64 a, simde__m64 b) {
10110   #if defined(SIMDE_X86_MMX_NATIVE)
10111     return _mm_unpacklo_pi16(a, b);
10112   #else
10113     simde__m64_private r_;
10114     simde__m64_private a_ = simde__m64_to_private(a);
10115     simde__m64_private b_ = simde__m64_to_private(b);
10116 
10117     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
10118       r_.neon_i16 = vzip1_s16(a_.neon_i16, b_.neon_i16);
10119     #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
10120       r_.mmi_i16 = punpcklhw_s(a_.mmi_i16, b_.mmi_i16);
10121     #elif defined(SIMDE_SHUFFLE_VECTOR_)
10122       r_.i16 = SIMDE_SHUFFLE_VECTOR_(16, 8, a_.i16, b_.i16, 0, 4, 1, 5);
10123     #else
10124       r_.i16[0] = a_.i16[0];
10125       r_.i16[1] = b_.i16[0];
10126       r_.i16[2] = a_.i16[1];
10127       r_.i16[3] = b_.i16[1];
10128     #endif
10129 
10130     return simde__m64_from_private(r_);
10131   #endif
10132 }
10133 #define simde_m_punpcklwd(a, b) simde_mm_unpacklo_pi16(a, b)
10134 #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
10135 #  define _mm_unpacklo_pi16(a, b) simde_mm_unpacklo_pi16(a, b)
10136 #  define _m_punpcklwd(a, b) simde_mm_unpacklo_pi16(a, b)
10137 #endif
10138 
10139 SIMDE_FUNCTION_ATTRIBUTES
10140 simde__m64
simde_mm_unpacklo_pi32 (simde__m64 a, simde__m64 b) {
10142   #if defined(SIMDE_X86_MMX_NATIVE)
10143     return _mm_unpacklo_pi32(a, b);
10144   #else
10145     simde__m64_private r_;
10146     simde__m64_private a_ = simde__m64_to_private(a);
10147     simde__m64_private b_ = simde__m64_to_private(b);
10148 
10149     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
10150       r_.neon_i32 = vzip1_s32(a_.neon_i32, b_.neon_i32);
10151     #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
10152       r_.mmi_i32 = punpcklwd_s(a_.mmi_i32, b_.mmi_i32);
10153     #elif defined(SIMDE_SHUFFLE_VECTOR_)
10154       r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 8, a_.i32, b_.i32, 0, 2);
10155     #else
10156       r_.i32[0] = a_.i32[0];
10157       r_.i32[1] = b_.i32[0];
10158     #endif
10159 
10160     return simde__m64_from_private(r_);
10161   #endif
10162 }
10163 #define simde_m_punpckldq(a, b) simde_mm_unpacklo_pi32(a, b)
10164 #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
10165 #  define _mm_unpacklo_pi32(a, b) simde_mm_unpacklo_pi32(a, b)
10166 #  define _m_punpckldq(a, b) simde_mm_unpacklo_pi32(a, b)
10167 #endif
10168 
10169 SIMDE_FUNCTION_ATTRIBUTES
10170 simde__m64
simde_mm_xor_si64 (simde__m64 a, simde__m64 b) {
10172   #if defined(SIMDE_X86_MMX_NATIVE)
10173     return _mm_xor_si64(a, b);
10174   #else
10175     simde__m64_private r_;
10176     simde__m64_private a_ = simde__m64_to_private(a);
10177     simde__m64_private b_ = simde__m64_to_private(b);
10178 
10179     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
10180       r_.neon_i32 = veor_s32(a_.neon_i32, b_.neon_i32);
10181     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
10182       r_.i32f = a_.i32f ^ b_.i32f;
10183     #else
10184       r_.u64[0] = a_.u64[0] ^ b_.u64[0];
10185     #endif
10186 
10187     return simde__m64_from_private(r_);
10188   #endif
10189 }
10190 #define simde_m_pxor(a, b) simde_mm_xor_si64(a, b)
10191 #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
10192 #  define _mm_xor_si64(a, b) simde_mm_xor_si64(a, b)
10193 #  define _m_pxor(a, b) simde_mm_xor_si64(a, b)
10194 #endif
10195 
10196 SIMDE_FUNCTION_ATTRIBUTES
10197 int32_t
simde_m_to_int (simde__m64 a) {
10199   #if defined(SIMDE_X86_MMX_NATIVE)
10200     return _m_to_int(a);
10201   #else
10202     simde__m64_private a_ = simde__m64_to_private(a);
10203 
10204     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
10205       HEDLEY_DIAGNOSTIC_PUSH
10206       #if HEDLEY_HAS_WARNING("-Wvector-conversion") && SIMDE_DETECT_CLANG_VERSION_NOT(10,0,0)
10207         #pragma clang diagnostic ignored "-Wvector-conversion"
10208       #endif
10209       return vget_lane_s32(a_.neon_i32, 0);
10210       HEDLEY_DIAGNOSTIC_POP
10211     #else
10212       return a_.i32[0];
10213     #endif
10214   #endif
10215 }
10216 #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
10217 #  define _m_to_int(a) simde_m_to_int(a)
10218 #endif
10219 
10220 SIMDE_END_DECLS_
10221 
10222 HEDLEY_DIAGNOSTIC_POP
10223 
10224 #endif /* !defined(SIMDE_X86_MMX_H) */
10225 /* :: End ../simde/simde/x86/mmx.h :: */
10226 
10227 #if defined(_WIN32)
10228   #include <windows.h>
10229 #endif
10230 
10231 HEDLEY_DIAGNOSTIC_PUSH
10232 SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
10233 SIMDE_BEGIN_DECLS_
10234 
10235 typedef union {
10236   #if defined(SIMDE_VECTOR_SUBSCRIPT)
10237     SIMDE_ALIGN_TO_16 int8_t          i8 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
10238     SIMDE_ALIGN_TO_16 int16_t        i16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
10239     SIMDE_ALIGN_TO_16 int32_t        i32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
10240     SIMDE_ALIGN_TO_16 int64_t        i64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
10241     SIMDE_ALIGN_TO_16 uint8_t         u8 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
10242     SIMDE_ALIGN_TO_16 uint16_t       u16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
10243     SIMDE_ALIGN_TO_16 uint32_t       u32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
10244     SIMDE_ALIGN_TO_16 uint64_t       u64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
10245     #if defined(SIMDE_HAVE_INT128_)
10246     SIMDE_ALIGN_TO_16 simde_int128  i128 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
10247     SIMDE_ALIGN_TO_16 simde_uint128 u128 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
10248     #endif
10249     SIMDE_ALIGN_TO_16 simde_float32  f32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
10250     SIMDE_ALIGN_TO_16 int_fast32_t  i32f SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
10251     SIMDE_ALIGN_TO_16 uint_fast32_t u32f SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
10252   #else
10253     SIMDE_ALIGN_TO_16 int8_t         i8[16];
10254     SIMDE_ALIGN_TO_16 int16_t        i16[8];
10255     SIMDE_ALIGN_TO_16 int32_t        i32[4];
10256     SIMDE_ALIGN_TO_16 int64_t        i64[2];
10257     SIMDE_ALIGN_TO_16 uint8_t        u8[16];
10258     SIMDE_ALIGN_TO_16 uint16_t       u16[8];
10259     SIMDE_ALIGN_TO_16 uint32_t       u32[4];
10260     SIMDE_ALIGN_TO_16 uint64_t       u64[2];
10261     #if defined(SIMDE_HAVE_INT128_)
10262     SIMDE_ALIGN_TO_16 simde_int128  i128[1];
10263     SIMDE_ALIGN_TO_16 simde_uint128 u128[1];
10264     #endif
10265     SIMDE_ALIGN_TO_16 simde_float32  f32[4];
10266     SIMDE_ALIGN_TO_16 int_fast32_t  i32f[16 / sizeof(int_fast32_t)];
10267     SIMDE_ALIGN_TO_16 uint_fast32_t u32f[16 / sizeof(uint_fast32_t)];
10268   #endif
10269 
10270     SIMDE_ALIGN_TO_16 simde__m64_private m64_private[2];
10271     SIMDE_ALIGN_TO_16 simde__m64         m64[2];
10272 
10273   #if defined(SIMDE_X86_SSE_NATIVE)
10274     SIMDE_ALIGN_TO_16 __m128         n;
10275   #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
10276     SIMDE_ALIGN_TO_16 int8x16_t      neon_i8;
10277     SIMDE_ALIGN_TO_16 int16x8_t      neon_i16;
10278     SIMDE_ALIGN_TO_16 int32x4_t      neon_i32;
10279     SIMDE_ALIGN_TO_16 int64x2_t      neon_i64;
10280     SIMDE_ALIGN_TO_16 uint8x16_t     neon_u8;
10281     SIMDE_ALIGN_TO_16 uint16x8_t     neon_u16;
10282     SIMDE_ALIGN_TO_16 uint32x4_t     neon_u32;
10283     SIMDE_ALIGN_TO_16 uint64x2_t     neon_u64;
10284     SIMDE_ALIGN_TO_16 float32x4_t    neon_f32;
10285     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
10286       SIMDE_ALIGN_TO_16 float64x2_t    neon_f64;
10287     #endif
10288   #elif defined(SIMDE_WASM_SIMD128_NATIVE)
10289     SIMDE_ALIGN_TO_16 v128_t         wasm_v128;
10290   #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
10291     SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned char)      altivec_u8;
10292     SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned short)     altivec_u16;
10293     SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned int)       altivec_u32;
10294     SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed char)        altivec_i8;
10295     SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed short)       altivec_i16;
10296     SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed int)         altivec_i32;
10297     SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(float)              altivec_f32;
10298     #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
10299       SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long) altivec_u64;
10300       SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed long long)   altivec_i64;
10301       SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(double)             altivec_f64;
10302     #endif
10303   #endif
10304 } simde__m128_private;
10305 
10306 #if defined(SIMDE_X86_SSE_NATIVE)
10307   typedef __m128 simde__m128;
10308 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
10309    typedef float32x4_t simde__m128;
10310 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
10311    typedef v128_t simde__m128;
10312 #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
10313    typedef SIMDE_POWER_ALTIVEC_VECTOR(float) simde__m128;
10314 #elif defined(SIMDE_VECTOR_SUBSCRIPT)
10315   typedef simde_float32 simde__m128 SIMDE_ALIGN_TO_16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
10316 #else
10317   typedef simde__m128_private simde__m128;
10318 #endif
10319 
10320 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
10321   typedef simde__m128 __m128;
10322 #endif
10323 
10324 HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128), "simde__m128 size incorrect");
10325 HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128_private), "simde__m128_private size incorrect");
10326 #if defined(SIMDE_CHECK_ALIGNMENT) && defined(SIMDE_ALIGN_OF)
10327 HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m128) == 16, "simde__m128 is not 16-byte aligned");
10328 HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m128_private) == 16, "simde__m128_private is not 16-byte aligned");
10329 #endif
10330 
10331 SIMDE_FUNCTION_ATTRIBUTES
10332 simde__m128
simde__m128_from_private(simde__m128_private v) {
10334   simde__m128 r;
10335   simde_memcpy(&r, &v, sizeof(r));
10336   return r;
10337 }
10338 
10339 SIMDE_FUNCTION_ATTRIBUTES
10340 simde__m128_private
simde__m128_to_private(simde__m128 v) {
10342   simde__m128_private r;
10343   simde_memcpy(&r, &v, sizeof(r));
10344   return r;
10345 }
10346 
10347 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, int8x16_t, neon, i8)
10349   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, int16x8_t, neon, i16)
10350   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, int32x4_t, neon, i32)
10351   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, int64x2_t, neon, i64)
10352   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, uint8x16_t, neon, u8)
10353   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, uint16x8_t, neon, u16)
10354   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, uint32x4_t, neon, u32)
10355   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, uint64x2_t, neon, u64)
10356   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, float32x4_t, neon, f32)
10357   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
10358     SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, float64x2_t, neon, f64)
10359   #endif
10360 #endif /* defined(SIMDE_ARM_NEON_A32V7_NATIVE) */
10361 
10362 #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
10363   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, SIMDE_POWER_ALTIVEC_VECTOR(signed char), altivec, i8)
10364   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, SIMDE_POWER_ALTIVEC_VECTOR(signed short), altivec, i16)
10365   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, SIMDE_POWER_ALTIVEC_VECTOR(signed int), altivec, i32)
10366   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), altivec, u8)
10367   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, SIMDE_POWER_ALTIVEC_VECTOR(unsigned short), altivec, u16)
10368   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, SIMDE_POWER_ALTIVEC_VECTOR(unsigned int), altivec, u32)
10369 
10370   #if defined(SIMDE_BUG_GCC_95782)
10371     SIMDE_FUNCTION_ATTRIBUTES
10372     SIMDE_POWER_ALTIVEC_VECTOR(float)
10373     simde__m128_to_altivec_f32(simde__m128 value) {
10374       simde__m128_private r_ = simde__m128_to_private(value);
10375       return r_.altivec_f32;
10376     }
10377 
10378     SIMDE_FUNCTION_ATTRIBUTES
10379     simde__m128
    simde__m128_from_altivec_f32(SIMDE_POWER_ALTIVEC_VECTOR(float) value) {
10381       simde__m128_private r_;
10382       r_.altivec_f32 = value;
10383       return simde__m128_from_private(r_);
10384     }
10385   #else
10386     SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, SIMDE_POWER_ALTIVEC_VECTOR(float), altivec, f32)
10387   #endif
10388 
10389   #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
10390     SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, SIMDE_POWER_ALTIVEC_VECTOR(signed long long), altivec, i64)
10391     SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long), altivec, u64)
10392   #endif
10393 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
10394   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, v128_t, wasm, v128);
10395 #endif /* defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) */
10396 
10397 enum {
10398   #if defined(SIMDE_X86_SSE_NATIVE)
10399     SIMDE_MM_ROUND_NEAREST     = _MM_ROUND_NEAREST,
10400     SIMDE_MM_ROUND_DOWN        = _MM_ROUND_DOWN,
10401     SIMDE_MM_ROUND_UP          = _MM_ROUND_UP,
10402     SIMDE_MM_ROUND_TOWARD_ZERO = _MM_ROUND_TOWARD_ZERO
10403   #else
10404     SIMDE_MM_ROUND_NEAREST     = 0x0000,
10405     SIMDE_MM_ROUND_DOWN        = 0x2000,
10406     SIMDE_MM_ROUND_UP          = 0x4000,
10407     SIMDE_MM_ROUND_TOWARD_ZERO = 0x6000
10408   #endif
10409 };
10410 
10411 #if defined(_MM_FROUND_TO_NEAREST_INT)
10412 #  define SIMDE_MM_FROUND_TO_NEAREST_INT _MM_FROUND_TO_NEAREST_INT
10413 #  define SIMDE_MM_FROUND_TO_NEG_INF     _MM_FROUND_TO_NEG_INF
10414 #  define SIMDE_MM_FROUND_TO_POS_INF     _MM_FROUND_TO_POS_INF
10415 #  define SIMDE_MM_FROUND_TO_ZERO        _MM_FROUND_TO_ZERO
10416 #  define SIMDE_MM_FROUND_CUR_DIRECTION  _MM_FROUND_CUR_DIRECTION
10417 
10418 #  define SIMDE_MM_FROUND_RAISE_EXC      _MM_FROUND_RAISE_EXC
10419 #  define SIMDE_MM_FROUND_NO_EXC         _MM_FROUND_NO_EXC
10420 #else
10421 #  define SIMDE_MM_FROUND_TO_NEAREST_INT 0x00
10422 #  define SIMDE_MM_FROUND_TO_NEG_INF     0x01
10423 #  define SIMDE_MM_FROUND_TO_POS_INF     0x02
10424 #  define SIMDE_MM_FROUND_TO_ZERO        0x03
10425 #  define SIMDE_MM_FROUND_CUR_DIRECTION  0x04
10426 
10427 #  define SIMDE_MM_FROUND_RAISE_EXC      0x00
10428 #  define SIMDE_MM_FROUND_NO_EXC         0x08
10429 #endif
10430 
10431 #define SIMDE_MM_FROUND_NINT \
10432   (SIMDE_MM_FROUND_TO_NEAREST_INT | SIMDE_MM_FROUND_RAISE_EXC)
10433 #define SIMDE_MM_FROUND_FLOOR \
10434   (SIMDE_MM_FROUND_TO_NEG_INF | SIMDE_MM_FROUND_RAISE_EXC)
10435 #define SIMDE_MM_FROUND_CEIL \
10436   (SIMDE_MM_FROUND_TO_POS_INF | SIMDE_MM_FROUND_RAISE_EXC)
10437 #define SIMDE_MM_FROUND_TRUNC \
10438   (SIMDE_MM_FROUND_TO_ZERO | SIMDE_MM_FROUND_RAISE_EXC)
10439 #define SIMDE_MM_FROUND_RINT \
10440   (SIMDE_MM_FROUND_CUR_DIRECTION | SIMDE_MM_FROUND_RAISE_EXC)
10441 #define SIMDE_MM_FROUND_NEARBYINT \
10442   (SIMDE_MM_FROUND_CUR_DIRECTION | SIMDE_MM_FROUND_NO_EXC)
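
/* Worked example (editor's addition): the composite names above are simply
 * ORs of a rounding direction and an exception flag.  With the fallback
 * values, SIMDE_MM_FROUND_FLOOR == (0x01 | 0x00) == 0x01 and
 * SIMDE_MM_FROUND_NEARBYINT == (0x04 | 0x08) == 0x0c, so masking with
 * ~SIMDE_MM_FROUND_NO_EXC recovers the direction bits alone. */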
10443 
10444 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) && !defined(_MM_FROUND_TO_NEAREST_INT)
10445 #  define _MM_FROUND_TO_NEAREST_INT SIMDE_MM_FROUND_TO_NEAREST_INT
10446 #  define _MM_FROUND_TO_NEG_INF SIMDE_MM_FROUND_TO_NEG_INF
10447 #  define _MM_FROUND_TO_POS_INF SIMDE_MM_FROUND_TO_POS_INF
10448 #  define _MM_FROUND_TO_ZERO SIMDE_MM_FROUND_TO_ZERO
10449 #  define _MM_FROUND_CUR_DIRECTION SIMDE_MM_FROUND_CUR_DIRECTION
10450 #  define _MM_FROUND_RAISE_EXC SIMDE_MM_FROUND_RAISE_EXC
10451 #  define _MM_FROUND_NINT SIMDE_MM_FROUND_NINT
10452 #  define _MM_FROUND_FLOOR SIMDE_MM_FROUND_FLOOR
10453 #  define _MM_FROUND_CEIL SIMDE_MM_FROUND_CEIL
10454 #  define _MM_FROUND_TRUNC SIMDE_MM_FROUND_TRUNC
10455 #  define _MM_FROUND_RINT SIMDE_MM_FROUND_RINT
10456 #  define _MM_FROUND_NEARBYINT SIMDE_MM_FROUND_NEARBYINT
10457 #endif
10458 
10459 #if defined(_MM_EXCEPT_INVALID)
10460 #  define SIMDE_MM_EXCEPT_INVALID _MM_EXCEPT_INVALID
10461 #else
10462 #  define SIMDE_MM_EXCEPT_INVALID (0x0001)
10463 #endif
10464 #if defined(_MM_EXCEPT_DENORM)
10465 #  define SIMDE_MM_EXCEPT_DENORM _MM_EXCEPT_DENORM
10466 #else
10467 #  define SIMDE_MM_EXCEPT_DENORM (0x0002)
10468 #endif
10469 #if defined(_MM_EXCEPT_DIV_ZERO)
10470 #  define SIMDE_MM_EXCEPT_DIV_ZERO _MM_EXCEPT_DIV_ZERO
10471 #else
10472 #  define SIMDE_MM_EXCEPT_DIV_ZERO (0x0004)
10473 #endif
10474 #if defined(_MM_EXCEPT_OVERFLOW)
10475 #  define SIMDE_MM_EXCEPT_OVERFLOW _MM_EXCEPT_OVERFLOW
10476 #else
10477 #  define SIMDE_MM_EXCEPT_OVERFLOW (0x0008)
10478 #endif
10479 #if defined(_MM_EXCEPT_UNDERFLOW)
10480 #  define SIMDE_MM_EXCEPT_UNDERFLOW _MM_EXCEPT_UNDERFLOW
10481 #else
10482 #  define SIMDE_MM_EXCEPT_UNDERFLOW (0x0010)
10483 #endif
10484 #if defined(_MM_EXCEPT_INEXACT)
10485 #  define SIMDE_MM_EXCEPT_INEXACT _MM_EXCEPT_INEXACT
10486 #else
10487 #  define SIMDE_MM_EXCEPT_INEXACT (0x0020)
10488 #endif
10489 #if defined(_MM_EXCEPT_MASK)
10490 #  define SIMDE_MM_EXCEPT_MASK _MM_EXCEPT_MASK
10491 #else
10492 #  define SIMDE_MM_EXCEPT_MASK \
10493      (SIMDE_MM_EXCEPT_INVALID | SIMDE_MM_EXCEPT_DENORM | \
10494       SIMDE_MM_EXCEPT_DIV_ZERO | SIMDE_MM_EXCEPT_OVERFLOW | \
10495       SIMDE_MM_EXCEPT_UNDERFLOW | SIMDE_MM_EXCEPT_INEXACT)
10496 #endif
10497 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
10498   #define _MM_EXCEPT_INVALID SIMDE_MM_EXCEPT_INVALID
10499   #define _MM_EXCEPT_DENORM SIMDE_MM_EXCEPT_DENORM
10500   #define _MM_EXCEPT_DIV_ZERO SIMDE_MM_EXCEPT_DIV_ZERO
10501   #define _MM_EXCEPT_OVERFLOW SIMDE_MM_EXCEPT_OVERFLOW
10502   #define _MM_EXCEPT_UNDERFLOW SIMDE_MM_EXCEPT_UNDERFLOW
10503   #define _MM_EXCEPT_INEXACT SIMDE_MM_EXCEPT_INEXACT
10504   #define _MM_EXCEPT_MASK SIMDE_MM_EXCEPT_MASK
10505 #endif
10506 
10507 #if defined(_MM_MASK_INVALID)
10508 #  define SIMDE_MM_MASK_INVALID _MM_MASK_INVALID
10509 #else
10510 #  define SIMDE_MM_MASK_INVALID (0x0080)
10511 #endif
10512 #if defined(_MM_MASK_DENORM)
10513 #  define SIMDE_MM_MASK_DENORM _MM_MASK_DENORM
10514 #else
10515 #  define SIMDE_MM_MASK_DENORM (0x0100)
10516 #endif
10517 #if defined(_MM_MASK_DIV_ZERO)
10518 #  define SIMDE_MM_MASK_DIV_ZERO _MM_MASK_DIV_ZERO
10519 #else
10520 #  define SIMDE_MM_MASK_DIV_ZERO (0x0200)
10521 #endif
10522 #if defined(_MM_MASK_OVERFLOW)
10523 #  define SIMDE_MM_MASK_OVERFLOW _MM_MASK_OVERFLOW
10524 #else
10525 #  define SIMDE_MM_MASK_OVERFLOW (0x0400)
10526 #endif
10527 #if defined(_MM_MASK_UNDERFLOW)
10528 #  define SIMDE_MM_MASK_UNDERFLOW _MM_MASK_UNDERFLOW
10529 #else
10530 #  define SIMDE_MM_MASK_UNDERFLOW (0x0800)
10531 #endif
10532 #if defined(_MM_MASK_INEXACT)
10533 #  define SIMDE_MM_MASK_INEXACT _MM_MASK_INEXACT
10534 #else
10535 #  define SIMDE_MM_MASK_INEXACT (0x1000)
10536 #endif
10537 #if defined(_MM_MASK_MASK)
10538 #  define SIMDE_MM_MASK_MASK _MM_MASK_MASK
10539 #else
10540 #  define SIMDE_MM_MASK_MASK \
10541      (SIMDE_MM_MASK_INVALID | SIMDE_MM_MASK_DENORM | \
10542       SIMDE_MM_MASK_DIV_ZERO | SIMDE_MM_MASK_OVERFLOW | \
10543       SIMDE_MM_MASK_UNDERFLOW | SIMDE_MM_MASK_INEXACT)
10544 #endif
10545 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
10546   #define _MM_MASK_INVALID SIMDE_MM_MASK_INVALID
10547   #define _MM_MASK_DENORM SIMDE_MM_MASK_DENORM
10548   #define _MM_MASK_DIV_ZERO SIMDE_MM_MASK_DIV_ZERO
10549   #define _MM_MASK_OVERFLOW SIMDE_MM_MASK_OVERFLOW
10550   #define _MM_MASK_UNDERFLOW SIMDE_MM_MASK_UNDERFLOW
10551   #define _MM_MASK_INEXACT SIMDE_MM_MASK_INEXACT
10552   #define _MM_MASK_MASK SIMDE_MM_MASK_MASK
10553 #endif
10554 
10555 #if defined(_MM_FLUSH_ZERO_MASK)
10556 #  define SIMDE_MM_FLUSH_ZERO_MASK _MM_FLUSH_ZERO_MASK
10557 #else
10558 #  define SIMDE_MM_FLUSH_ZERO_MASK (0x8000)
10559 #endif
10560 #if defined(_MM_FLUSH_ZERO_ON)
10561 #  define SIMDE_MM_FLUSH_ZERO_ON _MM_FLUSH_ZERO_ON
10562 #else
10563 #  define SIMDE_MM_FLUSH_ZERO_ON (0x8000)
10564 #endif
10565 #if defined(_MM_FLUSH_ZERO_OFF)
10566 #  define SIMDE_MM_FLUSH_ZERO_OFF _MM_FLUSH_ZERO_OFF
10567 #else
10568 #  define SIMDE_MM_FLUSH_ZERO_OFF (0x0000)
10569 #endif
10570 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
10571   #define _MM_FLUSH_ZERO_MASK SIMDE_MM_FLUSH_ZERO_MASK
10572   #define _MM_FLUSH_ZERO_ON SIMDE_MM_FLUSH_ZERO_ON
10573   #define _MM_FLUSH_ZERO_OFF SIMDE_MM_FLUSH_ZERO_OFF
10574 #endif
10575 
10576 SIMDE_FUNCTION_ATTRIBUTES
10577 unsigned int
SIMDE_MM_GET_ROUNDING_MODE(void) {
10579   #if defined(SIMDE_X86_SSE_NATIVE)
10580     return _MM_GET_ROUNDING_MODE();
10581   #elif defined(SIMDE_HAVE_FENV_H)
10582     unsigned int vfe_mode;
10583 
10584     switch (fegetround()) {
10585       #if defined(FE_TONEAREST)
10586         case FE_TONEAREST:
10587           vfe_mode = SIMDE_MM_ROUND_NEAREST;
10588           break;
10589       #endif
10590 
10591       #if defined(FE_TOWARDZERO)
10592         case FE_TOWARDZERO:
          vfe_mode = SIMDE_MM_ROUND_TOWARD_ZERO;
10594           break;
10595       #endif
10596 
10597       #if defined(FE_UPWARD)
10598         case FE_UPWARD:
10599           vfe_mode = SIMDE_MM_ROUND_UP;
10600           break;
10601       #endif
10602 
10603       #if defined(FE_DOWNWARD)
10604         case FE_DOWNWARD:
          vfe_mode = SIMDE_MM_ROUND_DOWN;
10606           break;
10607       #endif
10608 
10609       default:
10610         vfe_mode = SIMDE_MM_ROUND_NEAREST;
10611         break;
10612     }
10613 
10614     return vfe_mode;
10615   #else
10616     return SIMDE_MM_ROUND_NEAREST;
10617   #endif
10618 }
10619 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
10620   #define _MM_GET_ROUNDING_MODE() SIMDE_MM_GET_ROUNDING_MODE()
10621 #endif
10622 
10623 SIMDE_FUNCTION_ATTRIBUTES
10624 void
SIMDE_MM_SET_ROUNDING_MODE(unsigned int a) {
10626   #if defined(SIMDE_X86_SSE_NATIVE)
10627     _MM_SET_ROUNDING_MODE(a);
10628   #elif defined(SIMDE_HAVE_FENV_H)
10629     int fe_mode = FE_TONEAREST;
10630 
10631     switch (a) {
10632       #if defined(FE_TONEAREST)
10633         case SIMDE_MM_ROUND_NEAREST:
10634           fe_mode = FE_TONEAREST;
10635           break;
10636       #endif
10637 
10638       #if defined(FE_TOWARDZERO)
10639         case SIMDE_MM_ROUND_TOWARD_ZERO:
10640           fe_mode = FE_TOWARDZERO;
10641           break;
10642       #endif
10643 
10644       #if defined(FE_DOWNWARD)
10645         case SIMDE_MM_ROUND_DOWN:
10646           fe_mode = FE_DOWNWARD;
10647           break;
10648       #endif
10649 
10650       #if defined(FE_UPWARD)
10651         case SIMDE_MM_ROUND_UP:
10652           fe_mode = FE_UPWARD;
10653           break;
10654       #endif
10655 
10656       default:
10657         return;
10658     }
10659 
10660     fesetround(fe_mode);
10661   #else
10662     (void) a;
10663   #endif
10664 }
10665 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
10666   #define _MM_SET_ROUNDING_MODE(a) SIMDE_MM_SET_ROUNDING_MODE(a)
10667 #endif
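
/* Illustrative usage (editor's addition): on non-x86 targets the two helpers
 * above map to fenv.h, so a set followed by a get round-trips.  A minimal
 * sketch, assuming the C environment supports fenv rounding control:
 *
 *   SIMDE_MM_SET_ROUNDING_MODE(SIMDE_MM_ROUND_TOWARD_ZERO);
 *   unsigned int mode = SIMDE_MM_GET_ROUNDING_MODE();
 *   // mode == SIMDE_MM_ROUND_TOWARD_ZERO (0x6000)
 *   SIMDE_MM_SET_ROUNDING_MODE(SIMDE_MM_ROUND_NEAREST);  // restore default
 */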
10668 
10669 SIMDE_FUNCTION_ATTRIBUTES
10670 uint32_t
SIMDE_MM_GET_FLUSH_ZERO_MODE (void) {
10672   #if defined(SIMDE_X86_SSE_NATIVE)
10673     return _mm_getcsr() & _MM_FLUSH_ZERO_MASK;
10674   #else
10675     return SIMDE_MM_FLUSH_ZERO_OFF;
10676   #endif
10677 }
10678 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  #define _MM_GET_FLUSH_ZERO_MODE() SIMDE_MM_GET_FLUSH_ZERO_MODE()
10680 #endif
10681 
10682 SIMDE_FUNCTION_ATTRIBUTES
10683 void
SIMDE_MM_SET_FLUSH_ZERO_MODE (uint32_t a) {
10685   #if defined(SIMDE_X86_SSE_NATIVE)
10686     _MM_SET_FLUSH_ZERO_MODE(a);
10687   #else
10688     (void) a;
10689   #endif
10690 }
10691 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
10692   #define _MM_SET_FLUSH_ZERO_MODE(a) SIMDE_MM_SET_FLUSH_ZERO_MODE(a)
10693 #endif
10694 
10695 SIMDE_FUNCTION_ATTRIBUTES
10696 uint32_t
simde_mm_getcsr (void) {
10698   #if defined(SIMDE_X86_SSE_NATIVE)
10699     return _mm_getcsr();
10700   #else
10701     return SIMDE_MM_GET_ROUNDING_MODE();
10702   #endif
10703 }
10704 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
10705   #define _mm_getcsr() simde_mm_getcsr()
10706 #endif
10707 
10708 SIMDE_FUNCTION_ATTRIBUTES
10709 void
simde_mm_setcsr (uint32_t a) {
10711   #if defined(SIMDE_X86_SSE_NATIVE)
10712     _mm_setcsr(a);
10713   #else
10714     SIMDE_MM_SET_ROUNDING_MODE(HEDLEY_STATIC_CAST(unsigned int, a));
10715   #endif
10716 }
10717 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
10718   #define _mm_setcsr(a) simde_mm_setcsr(a)
10719 #endif
10720 
10721 SIMDE_FUNCTION_ATTRIBUTES
10722 simde__m128
simde_x_mm_round_ps (simde__m128 a, int rounding, int lax_rounding)
10724     SIMDE_REQUIRE_CONSTANT_RANGE(rounding, 0, 15)
10725     SIMDE_REQUIRE_CONSTANT_RANGE(lax_rounding, 0, 1) {
10726   simde__m128_private
10727     r_,
10728     a_ = simde__m128_to_private(a);
10729 
10730   (void) lax_rounding;
10731 
10732   /* For architectures which lack a current direction SIMD instruction.
10733    *
10734    * Note that NEON actually has a current rounding mode instruction,
10735    * but in ARMv8+ the rounding mode is ignored and nearest is always
10736    * used, so we treat ARMv7 as having a rounding mode but ARMv8 as
10737    * not. */
10738   #if \
10739       defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || \
10740       defined(SIMDE_ARM_NEON_A32V8)
10741     if ((rounding & 7) == SIMDE_MM_FROUND_CUR_DIRECTION)
      rounding = HEDLEY_STATIC_CAST(int, SIMDE_MM_GET_ROUNDING_MODE()) >> 13;
10743   #endif
10744 
10745   switch (rounding & ~SIMDE_MM_FROUND_NO_EXC) {
10746     case SIMDE_MM_FROUND_CUR_DIRECTION:
10747       #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_14_NATIVE)
10748         r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), vec_round(a_.altivec_f32));
10749       #elif defined(SIMDE_ARM_NEON_A32V8_NATIVE) && !defined(SIMDE_BUG_GCC_95399)
10750         r_.neon_f32 = vrndiq_f32(a_.neon_f32);
10751       #elif defined(simde_math_nearbyintf)
10752         SIMDE_VECTORIZE
10753         for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
10754           r_.f32[i] = simde_math_nearbyintf(a_.f32[i]);
10755         }
10756       #else
10757         HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd());
10758       #endif
10759       break;
10760 
10761     case SIMDE_MM_FROUND_TO_NEAREST_INT:
10762       #if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_14_NATIVE)
10763         r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), vec_rint(a_.altivec_f32));
10764       #elif defined(SIMDE_ARM_NEON_A32V8_NATIVE)
10765         r_.neon_f32 = vrndnq_f32(a_.neon_f32);
10766       #elif defined(simde_math_roundevenf)
10767         SIMDE_VECTORIZE
10768         for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
10769           r_.f32[i] = simde_math_roundevenf(a_.f32[i]);
10770         }
10771       #else
10772         HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd());
10773       #endif
10774       break;
10775 
10776     case SIMDE_MM_FROUND_TO_NEG_INF:
10777       #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_14_NATIVE)
10778         r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), vec_floor(a_.altivec_f32));
10779       #elif defined(SIMDE_ARM_NEON_A32V8_NATIVE)
10780         r_.neon_f32 = vrndmq_f32(a_.neon_f32);
10781       #elif defined(simde_math_floorf)
10782         SIMDE_VECTORIZE
10783         for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
10784           r_.f32[i] = simde_math_floorf(a_.f32[i]);
10785         }
10786       #else
10787         HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd());
10788       #endif
10789       break;
10790 
10791     case SIMDE_MM_FROUND_TO_POS_INF:
10792       #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_14_NATIVE)
10793         r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), vec_ceil(a_.altivec_f32));
10794       #elif defined(SIMDE_ARM_NEON_A32V8_NATIVE)
10795         r_.neon_f32 = vrndpq_f32(a_.neon_f32);
10796       #elif defined(simde_math_ceilf)
10797         SIMDE_VECTORIZE
10798         for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
10799           r_.f32[i] = simde_math_ceilf(a_.f32[i]);
10800         }
10801       #else
10802         HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd());
10803       #endif
10804       break;
10805 
10806     case SIMDE_MM_FROUND_TO_ZERO:
10807       #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_14_NATIVE)
10808         r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), vec_trunc(a_.altivec_f32));
10809       #elif defined(SIMDE_ARM_NEON_A32V8_NATIVE)
10810         r_.neon_f32 = vrndq_f32(a_.neon_f32);
10811       #elif defined(simde_math_truncf)
10812         SIMDE_VECTORIZE
10813         for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
10814           r_.f32[i] = simde_math_truncf(a_.f32[i]);
10815         }
10816       #else
10817         HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd());
10818       #endif
10819       break;
10820 
10821     default:
10822       HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd());
10823   }
10824 
10825   return simde__m128_from_private(r_);
10826 }
10827 #if defined(SIMDE_X86_SSE4_1_NATIVE)
10828   #define simde_mm_round_ps(a, rounding) _mm_round_ps((a), (rounding))
10829 #else
10830   #define simde_mm_round_ps(a, rounding) simde_x_mm_round_ps((a), (rounding), 0)
10831 #endif
10832 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
10833   #define _mm_round_ps(a, rounding) simde_mm_round_ps((a), (rounding))
10834 #endif
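
/* Illustrative usage (editor's addition): a sketch of rounding with an
 * explicit direction, using simde_mm_set_ps defined just below:
 *
 *   simde__m128 v = simde_mm_set_ps(2.5f, -1.5f, 1.5f, 0.5f);
 *   simde__m128 f = simde_mm_round_ps(v, SIMDE_MM_FROUND_TO_NEG_INF);
 *   // per lane (floor): 0.5f -> 0.0f, 1.5f -> 1.0f, -1.5f -> -2.0f, 2.5f -> 2.0f
 */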
10835 
10836 SIMDE_FUNCTION_ATTRIBUTES
10837 simde__m128
simde_mm_set_ps (simde_float32 e3, simde_float32 e2, simde_float32 e1, simde_float32 e0) {
10839   #if defined(SIMDE_X86_SSE_NATIVE)
10840     return _mm_set_ps(e3, e2, e1, e0);
10841   #else
10842     simde__m128_private r_;
10843 
10844     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
10845       SIMDE_ALIGN_TO_16 simde_float32 data[4] = { e0, e1, e2, e3 };
10846       r_.neon_f32 = vld1q_f32(data);
10847     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
10848       r_.wasm_v128 = wasm_f32x4_make(e0, e1, e2, e3);
10849     #else
10850       r_.f32[0] = e0;
10851       r_.f32[1] = e1;
10852       r_.f32[2] = e2;
10853       r_.f32[3] = e3;
10854     #endif
10855 
10856     return simde__m128_from_private(r_);
10857   #endif
10858 }
10859 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
10860 #  define _mm_set_ps(e3, e2, e1, e0) simde_mm_set_ps(e3, e2, e1, e0)
10861 #endif
10862 
10863 SIMDE_FUNCTION_ATTRIBUTES
10864 simde__m128
simde_mm_set_ps1 (simde_float32 a) {
10866   #if defined(SIMDE_X86_SSE_NATIVE)
10867     return _mm_set_ps1(a);
10868   #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
10869     return vdupq_n_f32(a);
10870   #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_14_NATIVE)
10871     (void) a;
10872     return vec_splats(a);
10873   #else
10874     return simde_mm_set_ps(a, a, a, a);
10875   #endif
10876 }
10877 #define simde_mm_set1_ps(a) simde_mm_set_ps1(a)
10878 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
10879 #  define _mm_set_ps1(a) simde_mm_set_ps1(a)
10880 #  define _mm_set1_ps(a) simde_mm_set1_ps(a)
10881 #endif
10882 
10883 SIMDE_FUNCTION_ATTRIBUTES
10884 simde__m128
simde_mm_move_ss (simde__m128 a, simde__m128 b) {
10886   #if defined(SIMDE_X86_SSE_NATIVE)
10887     return _mm_move_ss(a, b);
10888   #else
10889     simde__m128_private
10890       r_,
10891       a_ = simde__m128_to_private(a),
10892       b_ = simde__m128_to_private(b);
10893 
10894     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
10895       r_.neon_f32 = vsetq_lane_f32(vgetq_lane_f32(b_.neon_f32, 0), a_.neon_f32, 0);
10896     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
10897       SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) m = {
10898         16, 17, 18, 19,
10899         4,  5,  6,  7,
10900         8,  9, 10, 11,
10901         12, 13, 14, 15
10902       };
10903       r_.altivec_f32 = vec_perm(a_.altivec_f32, b_.altivec_f32, m);
10904     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
10905       r_.wasm_v128 = wasm_v8x16_shuffle(b_.wasm_v128, a_.wasm_v128, 0, 1, 2, 3, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
10906     #elif defined(SIMDE_SHUFFLE_VECTOR_)
10907       r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, b_.f32, 4, 1, 2, 3);
10908     #else
10909       r_.f32[0] = b_.f32[0];
10910       r_.f32[1] = a_.f32[1];
10911       r_.f32[2] = a_.f32[2];
10912       r_.f32[3] = a_.f32[3];
10913     #endif
10914 
10915     return simde__m128_from_private(r_);
10916   #endif
10917 }
10918 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
10919 #  define _mm_move_ss(a, b) simde_mm_move_ss((a), (b))
10920 #endif
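
/* Illustrative usage (editor's addition): _mm_move_ss only replaces the
 * lowest lane of the first operand.  Sketch:
 *
 *   simde__m128 a = simde_mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);    // lanes 3..0
 *   simde__m128 b = simde_mm_set_ps(40.0f, 30.0f, 20.0f, 10.0f);
 *   simde__m128 r = simde_mm_move_ss(a, b);
 *   // r, low lane to high: 10.0f, 2.0f, 3.0f, 4.0f
 */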
10921 
10922 SIMDE_FUNCTION_ATTRIBUTES
10923 simde__m128
simde_mm_add_ps (simde__m128 a, simde__m128 b) {
10925   #if defined(SIMDE_X86_SSE_NATIVE)
10926     return _mm_add_ps(a, b);
10927   #else
10928     simde__m128_private
10929       r_,
10930       a_ = simde__m128_to_private(a),
10931       b_ = simde__m128_to_private(b);
10932 
10933     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
10934       r_.neon_f32 = vaddq_f32(a_.neon_f32, b_.neon_f32);
10935     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
10936       r_.wasm_v128 = wasm_f32x4_add(a_.wasm_v128, b_.wasm_v128);
10937     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
10938       r_.altivec_f32 = vec_add(a_.altivec_f32, b_.altivec_f32);
10939     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
10940       r_.f32 = a_.f32 + b_.f32;
10941     #else
10942       SIMDE_VECTORIZE
10943       for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
10944         r_.f32[i] = a_.f32[i] + b_.f32[i];
10945       }
10946     #endif
10947 
10948     return simde__m128_from_private(r_);
10949   #endif
10950 }
10951 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
10952 #  define _mm_add_ps(a, b) simde_mm_add_ps((a), (b))
10953 #endif
10954 
10955 SIMDE_FUNCTION_ATTRIBUTES
10956 simde__m128
simde_mm_add_ss (simde__m128 a, simde__m128 b) {
10958   #if defined(SIMDE_X86_SSE_NATIVE)
10959     return _mm_add_ss(a, b);
10960   #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
10961     return simde_mm_move_ss(a, simde_mm_add_ps(a, b));
10962   #else
10963     simde__m128_private
10964       r_,
10965       a_ = simde__m128_to_private(a),
10966       b_ = simde__m128_to_private(b);
10967 
10968     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
10969       float32_t b0 = vgetq_lane_f32(b_.neon_f32, 0);
10970       float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0);
10971       // the upper values in the result must be the remnants of <a>.
10972       r_.neon_f32 = vaddq_f32(a_.neon_f32, value);
10973     #else
10974       r_.f32[0] = a_.f32[0] + b_.f32[0];
10975       r_.f32[1] = a_.f32[1];
10976       r_.f32[2] = a_.f32[2];
10977       r_.f32[3] = a_.f32[3];
10978     #endif
10979 
10980     return simde__m128_from_private(r_);
10981   #endif
10982 }
10983 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
10984 #  define _mm_add_ss(a, b) simde_mm_add_ss((a), (b))
10985 #endif
10986 
10987 SIMDE_FUNCTION_ATTRIBUTES
10988 simde__m128
simde_mm_and_ps (simde__m128 a, simde__m128 b) {
10990   #if defined(SIMDE_X86_SSE_NATIVE)
10991     return _mm_and_ps(a, b);
10992   #else
10993     simde__m128_private
10994       r_,
10995       a_ = simde__m128_to_private(a),
10996       b_ = simde__m128_to_private(b);
10997 
10998     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
10999       r_.neon_i32 = vandq_s32(a_.neon_i32, b_.neon_i32);
11000     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
11001       r_.wasm_v128 = wasm_v128_and(a_.wasm_v128, b_.wasm_v128);
11002     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
11003       r_.i32 = a_.i32 & b_.i32;
11004     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
11005       r_.altivec_f32 = vec_and(a_.altivec_f32, b_.altivec_f32);
11006     #else
11007       SIMDE_VECTORIZE
11008       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
11009         r_.i32[i] = a_.i32[i] & b_.i32[i];
11010       }
11011     #endif
11012 
11013     return simde__m128_from_private(r_);
11014   #endif
11015 }
11016 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
11017 #  define _mm_and_ps(a, b) simde_mm_and_ps((a), (b))
11018 #endif
11019 
11020 SIMDE_FUNCTION_ATTRIBUTES
11021 simde__m128
simde_mm_andnot_ps (simde__m128 a, simde__m128 b) {
11023   #if defined(SIMDE_X86_SSE_NATIVE)
11024     return _mm_andnot_ps(a, b);
11025   #else
11026     simde__m128_private
11027       r_,
11028       a_ = simde__m128_to_private(a),
11029       b_ = simde__m128_to_private(b);
11030 
11031     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
11032       r_.neon_i32 = vbicq_s32(b_.neon_i32, a_.neon_i32);
11033     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
11034       r_.wasm_v128 = wasm_v128_andnot(b_.wasm_v128, a_.wasm_v128);
11035     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_14_NATIVE)
11036       r_.altivec_f32 = vec_andc(b_.altivec_f32, a_.altivec_f32);
11037     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
11038       r_.i32 = ~a_.i32 & b_.i32;
11039     #else
11040       SIMDE_VECTORIZE
11041       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
11042         r_.i32[i] = ~(a_.i32[i]) & b_.i32[i];
11043       }
11044     #endif
11045 
11046     return simde__m128_from_private(r_);
11047   #endif
11048 }
11049 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
11050 #  define _mm_andnot_ps(a, b) simde_mm_andnot_ps((a), (b))
11051 #endif
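
/* Illustrative usage (editor's addition): as with the native intrinsic, the
 * first operand is the one that gets complemented (r = ~a & b), which is why
 * the NEON/AltiVec/WASM branches above pass their operands swapped.  Sketch
 * using the sign bit as the mask to clear:
 *
 *   simde__m128 sign_mask = simde_mm_set_ps1(-0.0f);
 *   simde__m128 x = simde_mm_set_ps1(-3.0f);
 *   simde__m128 abs_x = simde_mm_andnot_ps(sign_mask, x);
 *   // abs_x holds 3.0f in every lane: the sign bit has been cleared
 */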
11052 
11053 SIMDE_FUNCTION_ATTRIBUTES
11054 simde__m128
simde_mm_xor_ps (simde__m128 a, simde__m128 b) {
11056   #if defined(SIMDE_X86_SSE_NATIVE)
11057     return _mm_xor_ps(a, b);
11058   #else
11059     simde__m128_private
11060       r_,
11061       a_ = simde__m128_to_private(a),
11062       b_ = simde__m128_to_private(b);
11063 
11064     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
11065       r_.neon_i32 = veorq_s32(a_.neon_i32, b_.neon_i32);
11066     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
11067       r_.wasm_v128 = wasm_v128_xor(a_.wasm_v128, b_.wasm_v128);
11068     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
11069       r_.altivec_i32 = vec_xor(a_.altivec_i32, b_.altivec_i32);
11070     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
11071       r_.i32f = a_.i32f ^ b_.i32f;
11072     #else
11073       SIMDE_VECTORIZE
11074       for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
11075         r_.u32[i] = a_.u32[i] ^ b_.u32[i];
11076       }
11077     #endif
11078 
11079     return simde__m128_from_private(r_);
11080   #endif
11081 }
11082 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
11083 #  define _mm_xor_ps(a, b) simde_mm_xor_ps((a), (b))
11084 #endif
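
/* Illustrative usage (editor's addition): XORing with the sign-bit pattern
 * negates every lane.  Sketch:
 *
 *   simde__m128 x = simde_mm_set_ps1(2.5f);
 *   simde__m128 neg_x = simde_mm_xor_ps(x, simde_mm_set_ps1(-0.0f));
 *   // neg_x holds -2.5f in every lane
 */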
11085 
11086 SIMDE_FUNCTION_ATTRIBUTES
11087 simde__m128
simde_mm_or_ps (simde__m128 a, simde__m128 b) {
11089   #if defined(SIMDE_X86_SSE_NATIVE)
11090     return _mm_or_ps(a, b);
11091   #else
11092     simde__m128_private
11093       r_,
11094       a_ = simde__m128_to_private(a),
11095       b_ = simde__m128_to_private(b);
11096 
11097     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
11098       r_.neon_i32 = vorrq_s32(a_.neon_i32, b_.neon_i32);
11099     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
11100       r_.wasm_v128 = wasm_v128_or(a_.wasm_v128, b_.wasm_v128);
11101     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
11102       r_.altivec_i32 = vec_or(a_.altivec_i32, b_.altivec_i32);
11103     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
11104       r_.i32f = a_.i32f | b_.i32f;
11105     #else
11106       SIMDE_VECTORIZE
11107       for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
11108         r_.u32[i] = a_.u32[i] | b_.u32[i];
11109       }
11110     #endif
11111 
11112     return simde__m128_from_private(r_);
11113   #endif
11114 }
11115 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
11116 #  define _mm_or_ps(a, b) simde_mm_or_ps((a), (b))
11117 #endif
11118 
11119 SIMDE_FUNCTION_ATTRIBUTES
11120 simde__m128
11121 simde_x_mm_not_ps(simde__m128 a) {
11122   #if defined(SIMDE_X86_AVX512VL_NATIVE)
11123     __m128i ai = _mm_castps_si128(a);
11124     return _mm_castsi128_ps(_mm_ternarylogic_epi32(ai, ai, ai, 0x55));
11125   #elif defined(SIMDE_X86_SSE2_NATIVE)
11126     /* Note: we use ints instead of floats because we don't want cmpeq
11127      * to return false for (NaN, NaN) */
11128     __m128i ai = _mm_castps_si128(a);
11129     return _mm_castsi128_ps(_mm_andnot_si128(ai, _mm_cmpeq_epi32(ai, ai)));
11130   #else
11131     simde__m128_private
11132       r_,
11133       a_ = simde__m128_to_private(a);
11134 
11135     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
11136       r_.neon_i32 = vmvnq_s32(a_.neon_i32);
11137     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
11138       r_.altivec_i32 = vec_nor(a_.altivec_i32, a_.altivec_i32);
11139     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
11140       r_.wasm_v128 = wasm_v128_not(a_.wasm_v128);
11141     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
11142       r_.i32 = ~a_.i32;
11143     #else
11144       SIMDE_VECTORIZE
11145       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
11146         r_.i32[i] = ~(a_.i32[i]);
11147       }
11148     #endif
11149 
11150     return simde__m128_from_private(r_);
11151   #endif
11152 }
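
/* Illustrative sketch, not part of the generated SIMDe API: the scalar
 * identity behind the SSE2 fallback above.  cmpeq(x, x) on integer lanes
 * yields all-ones (unlike a float compare, it cannot be fooled by NaN), and
 * andnot(a, all_ones) equals ~a.  The AVX-512 path encodes the same NOT via
 * the ternary-logic immediate 0x55.  Guarded out so it never affects
 * compilation of this header. */
#if 0
static uint32_t simde_example_not_u32(uint32_t a) {
  uint32_t all_ones = ~UINT32_C(0);  /* what cmpeq(x, x) produces per lane */
  return ~a & all_ones;              /* andnot(a, all_ones) == ~a */
}
#endif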
11153 
11154 SIMDE_FUNCTION_ATTRIBUTES
11155 simde__m128
11156 simde_x_mm_select_ps(simde__m128 a, simde__m128 b, simde__m128 mask) {
11157   /* This function is for when you want to blend two elements together
11158    * according to a mask.  It is similar to _mm_blendv_ps, except that
11159    * it is undefined whether the blend is based on the highest bit in
11160    * each lane (like blendv) or just bitwise operations.  This allows
11161    * us to implement the function efficiently everywhere.
11162    *
11163    * Basically, you promise that all the lanes in mask are either 0 or
11164    * ~0. */
11165   #if defined(SIMDE_X86_SSE4_1_NATIVE)
11166     return _mm_blendv_ps(a, b, mask);
11167   #else
11168     simde__m128_private
11169       r_,
11170       a_ = simde__m128_to_private(a),
11171       b_ = simde__m128_to_private(b),
11172       mask_ = simde__m128_to_private(mask);
11173 
11174     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
11175       r_.neon_i32 = vbslq_s32(mask_.neon_u32, b_.neon_i32, a_.neon_i32);
11176     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
11177       r_.wasm_v128 = wasm_v128_bitselect(b_.wasm_v128, a_.wasm_v128, mask_.wasm_v128);
11178     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
11179       r_.altivec_i32 = vec_sel(a_.altivec_i32, b_.altivec_i32, mask_.altivec_u32);
11180     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
11181       r_.i32 = a_.i32 ^ ((a_.i32 ^ b_.i32) & mask_.i32);
11182     #else
11183       SIMDE_VECTORIZE
11184       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
11185         r_.i32[i] = a_.i32[i] ^ ((a_.i32[i] ^ b_.i32[i]) & mask_.i32[i]);
11186       }
11187     #endif
11188 
11189     return simde__m128_from_private(r_);
11190   #endif
11191 }
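
/* Illustrative usage sketch, not part of the generated SIMDe API: the mask
 * passed to simde_x_mm_select_ps must have each 32-bit lane either all zeros
 * or all ones (e.g. the output of a comparison).  The portable fallback then
 * reduces to the bitwise identity a ^ ((a ^ b) & mask): all-ones lanes take
 * b, all-zero lanes keep a.  Guarded out so it never affects compilation. */
#if 0
static uint32_t simde_example_select_lane(uint32_t a, uint32_t b, uint32_t mask) {
  /* mask is assumed to be either 0x00000000 or 0xFFFFFFFF */
  return a ^ ((a ^ b) & mask);
}
#endif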
11192 
11193 SIMDE_FUNCTION_ATTRIBUTES
11194 simde__m64
11195 simde_mm_avg_pu16 (simde__m64 a, simde__m64 b) {
11196   #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
11197     return _mm_avg_pu16(a, b);
11198   #else
11199     simde__m64_private
11200       r_,
11201       a_ = simde__m64_to_private(a),
11202       b_ = simde__m64_to_private(b);
11203 
11204     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
11205       r_.neon_u16 = vrhadd_u16(b_.neon_u16, a_.neon_u16);
11206     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && defined(SIMDE_CONVERT_VECTOR_)
11207       uint32_t wa SIMDE_VECTOR(16);
11208       uint32_t wb SIMDE_VECTOR(16);
11209       uint32_t wr SIMDE_VECTOR(16);
11210       SIMDE_CONVERT_VECTOR_(wa, a_.u16);
11211       SIMDE_CONVERT_VECTOR_(wb, b_.u16);
11212       wr = (wa + wb + 1) >> 1;
11213       SIMDE_CONVERT_VECTOR_(r_.u16, wr);
11214     #else
11215       SIMDE_VECTORIZE
11216       for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
11217         r_.u16[i] = (a_.u16[i] + b_.u16[i] + 1) >> 1;
11218       }
11219     #endif
11220 
11221     return simde__m64_from_private(r_);
11222   #endif
11223 }
11224 #define simde_m_pavgw(a, b) simde_mm_avg_pu16(a, b)
11225 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
11226 #  define _mm_avg_pu16(a, b) simde_mm_avg_pu16(a, b)
11227 #  define _m_pavgw(a, b) simde_mm_avg_pu16(a, b)
11228 #endif
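
/* Illustrative sketch, not part of the generated SIMDe API: the rounding
 * average used above.  Widening to 32 bits before adding means a + b + 1
 * cannot overflow, matching PAVGW's (a + b + 1) >> 1 semantics; for example
 * avg(65535, 65535) is still 65535.  Guarded out so it never affects
 * compilation. */
#if 0
static uint16_t simde_example_avg_u16(uint16_t a, uint16_t b) {
  uint32_t wa = a, wb = b;                            /* widen first */
  return HEDLEY_STATIC_CAST(uint16_t, (wa + wb + 1) >> 1);
}
#endif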
11229 
11230 SIMDE_FUNCTION_ATTRIBUTES
11231 simde__m64
11232 simde_mm_avg_pu8 (simde__m64 a, simde__m64 b) {
11233   #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
11234     return _mm_avg_pu8(a, b);
11235   #else
11236     simde__m64_private
11237       r_,
11238       a_ = simde__m64_to_private(a),
11239       b_ = simde__m64_to_private(b);
11240 
11241     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
11242       r_.neon_u8 = vrhadd_u8(b_.neon_u8, a_.neon_u8);
11243     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && defined(SIMDE_CONVERT_VECTOR_)
11244       uint16_t wa SIMDE_VECTOR(16);
11245       uint16_t wb SIMDE_VECTOR(16);
11246       uint16_t wr SIMDE_VECTOR(16);
11247       SIMDE_CONVERT_VECTOR_(wa, a_.u8);
11248       SIMDE_CONVERT_VECTOR_(wb, b_.u8);
11249       wr = (wa + wb + 1) >> 1;
11250       SIMDE_CONVERT_VECTOR_(r_.u8, wr);
11251     #else
11252       SIMDE_VECTORIZE
11253       for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
11254         r_.u8[i] = (a_.u8[i] + b_.u8[i] + 1) >> 1;
11255       }
11256     #endif
11257 
11258     return simde__m64_from_private(r_);
11259   #endif
11260 }
11261 #define simde_m_pavgb(a, b) simde_mm_avg_pu8(a, b)
11262 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
11263 #  define _mm_avg_pu8(a, b) simde_mm_avg_pu8(a, b)
11264 #  define _m_pavgb(a, b) simde_mm_avg_pu8(a, b)
11265 #endif
11266 
11267 SIMDE_FUNCTION_ATTRIBUTES
11268 simde__m128
11269 simde_x_mm_abs_ps(simde__m128 a) {
11270   #if defined(SIMDE_X86_SSE_NATIVE)
11271     simde_float32 mask_;
11272     uint32_t u32_ = UINT32_C(0x7FFFFFFF);
11273     simde_memcpy(&mask_, &u32_, sizeof(u32_));
11274     return _mm_and_ps(_mm_set1_ps(mask_), a);
11275   #else
11276     simde__m128_private
11277       r_,
11278       a_ = simde__m128_to_private(a);
11279 
11280     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
11281       r_.neon_f32 = vabsq_f32(a_.neon_f32);
11282     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_14_NATIVE)
11283       r_.altivec_f32 = vec_abs(a_.altivec_f32);
11284     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
11285       r_.wasm_v128 = wasm_f32x4_abs(a_.wasm_v128);
11286     #else
11287       SIMDE_VECTORIZE
11288       for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
11289         r_.f32[i] = simde_math_fabsf(a_.f32[i]);
11290       }
11291     #endif
11292 
11293     return simde__m128_from_private(r_);
11294   #endif
11295 }
11296 
11297 SIMDE_FUNCTION_ATTRIBUTES
11298 simde__m128
11299 simde_mm_cmpeq_ps (simde__m128 a, simde__m128 b) {
11300   #if defined(SIMDE_X86_SSE_NATIVE)
11301     return _mm_cmpeq_ps(a, b);
11302   #else
11303     simde__m128_private
11304       r_,
11305       a_ = simde__m128_to_private(a),
11306       b_ = simde__m128_to_private(b);
11307 
11308     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
11309       r_.neon_u32 = vceqq_f32(a_.neon_f32, b_.neon_f32);
11310     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
11311       r_.wasm_v128 = wasm_f32x4_eq(a_.wasm_v128, b_.wasm_v128);
11312     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_14_NATIVE)
11313       r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), vec_cmpeq(a_.altivec_f32, b_.altivec_f32));
11314     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
11315       r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), a_.f32 == b_.f32);
11316     #else
11317       SIMDE_VECTORIZE
11318       for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
11319         r_.u32[i] = (a_.f32[i] == b_.f32[i]) ? ~UINT32_C(0) : UINT32_C(0);
11320       }
11321     #endif
11322 
11323     return simde__m128_from_private(r_);
11324   #endif
11325 }
11326 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
11327 #  define _mm_cmpeq_ps(a, b) simde_mm_cmpeq_ps((a), (b))
11328 #endif
11329 
11330 SIMDE_FUNCTION_ATTRIBUTES
11331 simde__m128
11332 simde_mm_cmpeq_ss (simde__m128 a, simde__m128 b) {
11333   #if defined(SIMDE_X86_SSE_NATIVE)
11334     return _mm_cmpeq_ss(a, b);
11335   #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
11336     return simde_mm_move_ss(a, simde_mm_cmpeq_ps(a, b));
11337   #else
11338     simde__m128_private
11339       r_,
11340       a_ = simde__m128_to_private(a),
11341       b_ = simde__m128_to_private(b);
11342 
11343     r_.u32[0] = (a_.f32[0] == b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0);
11344     SIMDE_VECTORIZE
11345     for (size_t i = 1 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
11346       r_.u32[i] = a_.u32[i];
11347     }
11348 
11349     return simde__m128_from_private(r_);
11350   #endif
11351 }
11352 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
11353 #  define _mm_cmpeq_ss(a, b) simde_mm_cmpeq_ss((a), (b))
11354 #endif
11355 
11356 SIMDE_FUNCTION_ATTRIBUTES
11357 simde__m128
11358 simde_mm_cmpge_ps (simde__m128 a, simde__m128 b) {
11359   #if defined(SIMDE_X86_SSE_NATIVE)
11360     return _mm_cmpge_ps(a, b);
11361   #else
11362     simde__m128_private
11363       r_,
11364       a_ = simde__m128_to_private(a),
11365       b_ = simde__m128_to_private(b);
11366 
11367     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
11368       r_.neon_u32 = vcgeq_f32(a_.neon_f32, b_.neon_f32);
11369     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
11370       r_.wasm_v128 = wasm_f32x4_ge(a_.wasm_v128, b_.wasm_v128);
11371     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
11372       r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), vec_cmpge(a_.altivec_f32, b_.altivec_f32));
11373     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
11374       r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), (a_.f32 >= b_.f32));
11375     #else
11376       SIMDE_VECTORIZE
11377       for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
11378         r_.u32[i] = (a_.f32[i] >= b_.f32[i]) ? ~UINT32_C(0) : UINT32_C(0);
11379       }
11380     #endif
11381 
11382     return simde__m128_from_private(r_);
11383   #endif
11384 }
11385 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
11386 #  define _mm_cmpge_ps(a, b) simde_mm_cmpge_ps((a), (b))
11387 #endif
11388 
11389 SIMDE_FUNCTION_ATTRIBUTES
11390 simde__m128
11391 simde_mm_cmpge_ss (simde__m128 a, simde__m128 b) {
11392   #if defined(SIMDE_X86_SSE_NATIVE) && !defined(__PGI)
11393     return _mm_cmpge_ss(a, b);
11394   #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
11395     return simde_mm_move_ss(a, simde_mm_cmpge_ps(a, b));
11396   #else
11397     simde__m128_private
11398       r_,
11399       a_ = simde__m128_to_private(a),
11400       b_ = simde__m128_to_private(b);
11401 
11402     r_.u32[0] = (a_.f32[0] >= b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0);
11403     SIMDE_VECTORIZE
11404     for (size_t i = 1 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
11405       r_.u32[i] = a_.u32[i];
11406     }
11407 
11408     return simde__m128_from_private(r_);
11409   #endif
11410 }
11411 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
11412 #  define _mm_cmpge_ss(a, b) simde_mm_cmpge_ss((a), (b))
11413 #endif
11414 
11415 SIMDE_FUNCTION_ATTRIBUTES
11416 simde__m128
11417 simde_mm_cmpgt_ps (simde__m128 a, simde__m128 b) {
11418   #if defined(SIMDE_X86_SSE_NATIVE)
11419     return _mm_cmpgt_ps(a, b);
11420   #else
11421     simde__m128_private
11422       r_,
11423       a_ = simde__m128_to_private(a),
11424       b_ = simde__m128_to_private(b);
11425 
11426     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
11427       r_.neon_u32 = vcgtq_f32(a_.neon_f32, b_.neon_f32);
11428     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
11429       r_.wasm_v128 = wasm_f32x4_gt(a_.wasm_v128, b_.wasm_v128);
11430     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
11431       r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), vec_cmpgt(a_.altivec_f32, b_.altivec_f32));
11432     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
11433       r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), (a_.f32 > b_.f32));
11434     #else
11435       SIMDE_VECTORIZE
11436       for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
11437         r_.u32[i] = (a_.f32[i] > b_.f32[i]) ? ~UINT32_C(0) : UINT32_C(0);
11438       }
11439     #endif
11440 
11441     return simde__m128_from_private(r_);
11442   #endif
11443 }
11444 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
11445 #  define _mm_cmpgt_ps(a, b) simde_mm_cmpgt_ps((a), (b))
11446 #endif
11447 
11448 SIMDE_FUNCTION_ATTRIBUTES
11449 simde__m128
11450 simde_mm_cmpgt_ss (simde__m128 a, simde__m128 b) {
11451   #if defined(SIMDE_X86_SSE_NATIVE) && !defined(__PGI)
11452     return _mm_cmpgt_ss(a, b);
11453   #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
11454     return simde_mm_move_ss(a, simde_mm_cmpgt_ps(a, b));
11455   #else
11456     simde__m128_private
11457       r_,
11458       a_ = simde__m128_to_private(a),
11459       b_ = simde__m128_to_private(b);
11460 
11461     r_.u32[0] = (a_.f32[0] > b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0);
11462     SIMDE_VECTORIZE
11463     for (size_t i = 1 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
11464       r_.u32[i] = a_.u32[i];
11465     }
11466 
11467     return simde__m128_from_private(r_);
11468   #endif
11469 }
11470 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
11471 #  define _mm_cmpgt_ss(a, b) simde_mm_cmpgt_ss((a), (b))
11472 #endif
11473 
11474 SIMDE_FUNCTION_ATTRIBUTES
11475 simde__m128
11476 simde_mm_cmple_ps (simde__m128 a, simde__m128 b) {
11477   #if defined(SIMDE_X86_SSE_NATIVE)
11478     return _mm_cmple_ps(a, b);
11479   #else
11480     simde__m128_private
11481       r_,
11482       a_ = simde__m128_to_private(a),
11483       b_ = simde__m128_to_private(b);
11484 
11485     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
11486       r_.neon_u32 = vcleq_f32(a_.neon_f32, b_.neon_f32);
11487     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
11488       r_.wasm_v128 = wasm_f32x4_le(a_.wasm_v128, b_.wasm_v128);
11489     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
11490       r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), vec_cmple(a_.altivec_f32, b_.altivec_f32));
11491     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
11492       r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), (a_.f32 <= b_.f32));
11493     #else
11494       SIMDE_VECTORIZE
11495       for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
11496         r_.u32[i] = (a_.f32[i] <= b_.f32[i]) ? ~UINT32_C(0) : UINT32_C(0);
11497       }
11498     #endif
11499 
11500     return simde__m128_from_private(r_);
11501   #endif
11502 }
11503 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
11504 #  define _mm_cmple_ps(a, b) simde_mm_cmple_ps((a), (b))
11505 #endif
11506 
11507 SIMDE_FUNCTION_ATTRIBUTES
11508 simde__m128
11509 simde_mm_cmple_ss (simde__m128 a, simde__m128 b) {
11510   #if defined(SIMDE_X86_SSE_NATIVE)
11511     return _mm_cmple_ss(a, b);
11512   #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
11513     return simde_mm_move_ss(a, simde_mm_cmple_ps(a, b));
11514   #else
11515     simde__m128_private
11516       r_,
11517       a_ = simde__m128_to_private(a),
11518       b_ = simde__m128_to_private(b);
11519 
11520     r_.u32[0] = (a_.f32[0] <= b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0);
11521     SIMDE_VECTORIZE
11522     for (size_t i = 1 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
11523       r_.u32[i] = a_.u32[i];
11524     }
11525 
11526     return simde__m128_from_private(r_);
11527   #endif
11528 }
11529 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
11530 #  define _mm_cmple_ss(a, b) simde_mm_cmple_ss((a), (b))
11531 #endif
11532 
11533 SIMDE_FUNCTION_ATTRIBUTES
11534 simde__m128
11535 simde_mm_cmplt_ps (simde__m128 a, simde__m128 b) {
11536   #if defined(SIMDE_X86_SSE_NATIVE)
11537     return _mm_cmplt_ps(a, b);
11538   #else
11539     simde__m128_private
11540       r_,
11541       a_ = simde__m128_to_private(a),
11542       b_ = simde__m128_to_private(b);
11543 
11544     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
11545       r_.neon_u32 = vcltq_f32(a_.neon_f32, b_.neon_f32);
11546     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
11547       r_.wasm_v128 = wasm_f32x4_lt(a_.wasm_v128, b_.wasm_v128);
11548     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
11549       r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), vec_cmplt(a_.altivec_f32, b_.altivec_f32));
11550     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
11551       r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), (a_.f32 < b_.f32));
11552     #else
11553       SIMDE_VECTORIZE
11554       for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
11555         r_.u32[i] = (a_.f32[i] < b_.f32[i]) ? ~UINT32_C(0) : UINT32_C(0);
11556       }
11557     #endif
11558 
11559     return simde__m128_from_private(r_);
11560   #endif
11561 }
11562 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
11563 #  define _mm_cmplt_ps(a, b) simde_mm_cmplt_ps((a), (b))
11564 #endif
11565 
11566 SIMDE_FUNCTION_ATTRIBUTES
11567 simde__m128
11568 simde_mm_cmplt_ss (simde__m128 a, simde__m128 b) {
11569   #if defined(SIMDE_X86_SSE_NATIVE)
11570     return _mm_cmplt_ss(a, b);
11571   #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
11572     return simde_mm_move_ss(a, simde_mm_cmplt_ps(a, b));
11573   #else
11574     simde__m128_private
11575       r_,
11576       a_ = simde__m128_to_private(a),
11577       b_ = simde__m128_to_private(b);
11578 
11579     r_.u32[0] = (a_.f32[0] < b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0);
11580     SIMDE_VECTORIZE
11581     for (size_t i = 1 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
11582       r_.u32[i] = a_.u32[i];
11583     }
11584 
11585     return simde__m128_from_private(r_);
11586   #endif
11587 }
11588 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
11589 #  define _mm_cmplt_ss(a, b) simde_mm_cmplt_ss((a), (b))
11590 #endif
11591 
11592 SIMDE_FUNCTION_ATTRIBUTES
11593 simde__m128
11594 simde_mm_cmpneq_ps (simde__m128 a, simde__m128 b) {
11595   #if defined(SIMDE_X86_SSE_NATIVE)
11596     return _mm_cmpneq_ps(a, b);
11597   #else
11598     simde__m128_private
11599       r_,
11600       a_ = simde__m128_to_private(a),
11601       b_ = simde__m128_to_private(b);
11602 
11603     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
11604       r_.neon_u32 = vmvnq_u32(vceqq_f32(a_.neon_f32, b_.neon_f32));
11605     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
11606       r_.wasm_v128 = wasm_f32x4_ne(a_.wasm_v128, b_.wasm_v128);
11607     #elif defined(SIMDE_POWER_ALTIVEC_P9_NATIVE) && SIMDE_ARCH_POWER_CHECK(900) && !defined(HEDLEY_IBM_VERSION)
11608       /* vec_cmpne(SIMDE_POWER_ALTIVEC_VECTOR(float), SIMDE_POWER_ALTIVEC_VECTOR(float))
11609         is missing from XL C/C++ v16.1.1,
11610         though the documentation (table 89 on page 432 of the IBM XL C/C++ for
11611         Linux Compiler Reference, Version 16.1.1) shows that it should be
11612         present.  Both GCC and clang support it. */
11613       r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), vec_cmpne(a_.altivec_f32, b_.altivec_f32));
11614     #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_14_NATIVE)
11615       r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), vec_cmpeq(a_.altivec_f32, b_.altivec_f32));
11616       r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), vec_nor(r_.altivec_f32, r_.altivec_f32));
11617     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
11618       r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), (a_.f32 != b_.f32));
11619     #else
11620       SIMDE_VECTORIZE
11621       for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
11622         r_.u32[i] = (a_.f32[i] != b_.f32[i]) ? ~UINT32_C(0) : UINT32_C(0);
11623       }
11624     #endif
11625 
11626     return simde__m128_from_private(r_);
11627   #endif
11628 }
11629 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
11630 #  define _mm_cmpneq_ps(a, b) simde_mm_cmpneq_ps((a), (b))
11631 #endif
11632 
11633 SIMDE_FUNCTION_ATTRIBUTES
11634 simde__m128
11635 simde_mm_cmpneq_ss (simde__m128 a, simde__m128 b) {
11636   #if defined(SIMDE_X86_SSE_NATIVE)
11637     return _mm_cmpneq_ss(a, b);
11638   #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
11639     return simde_mm_move_ss(a, simde_mm_cmpneq_ps(a, b));
11640   #else
11641     simde__m128_private
11642       r_,
11643       a_ = simde__m128_to_private(a),
11644       b_ = simde__m128_to_private(b);
11645 
11646     r_.u32[0] = (a_.f32[0] != b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0);
11647     SIMDE_VECTORIZE
11648     for (size_t i = 1 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
11649       r_.u32[i] = a_.u32[i];
11650     }
11651 
11652     return simde__m128_from_private(r_);
11653   #endif
11654 }
11655 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
11656 #  define _mm_cmpneq_ss(a, b) simde_mm_cmpneq_ss((a), (b))
11657 #endif
11658 
11659 SIMDE_FUNCTION_ATTRIBUTES
11660 simde__m128
11661 simde_mm_cmpnge_ps (simde__m128 a, simde__m128 b) {
11662   return simde_mm_cmplt_ps(a, b);
11663 }
11664 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
11665 #  define _mm_cmpnge_ps(a, b) simde_mm_cmpnge_ps((a), (b))
11666 #endif
11667 
11668 SIMDE_FUNCTION_ATTRIBUTES
11669 simde__m128
11670 simde_mm_cmpnge_ss (simde__m128 a, simde__m128 b) {
11671   return simde_mm_cmplt_ss(a, b);
11672 }
11673 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
11674 #  define _mm_cmpnge_ss(a, b) simde_mm_cmpnge_ss((a), (b))
11675 #endif
11676 
11677 SIMDE_FUNCTION_ATTRIBUTES
11678 simde__m128
11679 simde_mm_cmpngt_ps (simde__m128 a, simde__m128 b) {
11680   return simde_mm_cmple_ps(a, b);
11681 }
11682 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
11683 #  define _mm_cmpngt_ps(a, b) simde_mm_cmpngt_ps((a), (b))
11684 #endif
11685 
11686 SIMDE_FUNCTION_ATTRIBUTES
11687 simde__m128
11688 simde_mm_cmpngt_ss (simde__m128 a, simde__m128 b) {
11689   return simde_mm_cmple_ss(a, b);
11690 }
11691 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
11692 #  define _mm_cmpngt_ss(a, b) simde_mm_cmpngt_ss((a), (b))
11693 #endif
11694 
11695 SIMDE_FUNCTION_ATTRIBUTES
11696 simde__m128
11697 simde_mm_cmpnle_ps (simde__m128 a, simde__m128 b) {
11698   return simde_mm_cmpgt_ps(a, b);
11699 }
11700 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
11701 #  define _mm_cmpnle_ps(a, b) simde_mm_cmpnle_ps((a), (b))
11702 #endif
11703 
11704 SIMDE_FUNCTION_ATTRIBUTES
11705 simde__m128
11706 simde_mm_cmpnle_ss (simde__m128 a, simde__m128 b) {
11707   return simde_mm_cmpgt_ss(a, b);
11708 }
11709 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
11710 #  define _mm_cmpnle_ss(a, b) simde_mm_cmpnle_ss((a), (b))
11711 #endif
11712 
11713 SIMDE_FUNCTION_ATTRIBUTES
11714 simde__m128
11715 simde_mm_cmpnlt_ps (simde__m128 a, simde__m128 b) {
11716   return simde_mm_cmpge_ps(a, b);
11717 }
11718 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
11719 #  define _mm_cmpnlt_ps(a, b) simde_mm_cmpnlt_ps((a), (b))
11720 #endif
11721 
11722 SIMDE_FUNCTION_ATTRIBUTES
11723 simde__m128
11724 simde_mm_cmpnlt_ss (simde__m128 a, simde__m128 b) {
11725   return simde_mm_cmpge_ss(a, b);
11726 }
11727 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
11728 #  define _mm_cmpnlt_ss(a, b) simde_mm_cmpnlt_ss((a), (b))
11729 #endif
11730 
11731 SIMDE_FUNCTION_ATTRIBUTES
11732 simde__m128
11733 simde_mm_cmpord_ps (simde__m128 a, simde__m128 b) {
11734   #if defined(SIMDE_X86_SSE_NATIVE)
11735     return _mm_cmpord_ps(a, b);
11736   #elif defined(SIMDE_WASM_SIMD128_NATIVE)
11737     return wasm_v128_and(wasm_f32x4_eq(a, a), wasm_f32x4_eq(b, b));
11738   #else
11739     simde__m128_private
11740       r_,
11741       a_ = simde__m128_to_private(a),
11742       b_ = simde__m128_to_private(b);
11743 
11744     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
11745       /* Note: NEON does not have an ordered-compare builtin, so we
11746         compare a == a and b == b to detect NaNs, then AND the two
11747         results to produce the final mask. */
11748       uint32x4_t ceqaa = vceqq_f32(a_.neon_f32, a_.neon_f32);
11749       uint32x4_t ceqbb = vceqq_f32(b_.neon_f32, b_.neon_f32);
11750       r_.neon_u32 = vandq_u32(ceqaa, ceqbb);
11751     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
11752       r_.wasm_v128 = wasm_v128_and(wasm_f32x4_eq(a_.wasm_v128, a_.wasm_v128), wasm_f32x4_eq(b_.wasm_v128, b_.wasm_v128));
11753     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
11754       r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float),
11755           vec_and(vec_cmpeq(a_.altivec_f32, a_.altivec_f32), vec_cmpeq(b_.altivec_f32, b_.altivec_f32)));
11756     #elif defined(simde_math_isnanf)
11757       SIMDE_VECTORIZE
11758       for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
11759         r_.u32[i] = (simde_math_isnanf(a_.f32[i]) || simde_math_isnanf(b_.f32[i])) ? UINT32_C(0) : ~UINT32_C(0);
11760       }
11761     #else
11762       HEDLEY_UNREACHABLE();
11763     #endif
11764 
11765     return simde__m128_from_private(r_);
11766   #endif
11767 }
11768 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
11769 #  define _mm_cmpord_ps(a, b) simde_mm_cmpord_ps((a), (b))
11770 #endif
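
/* Illustrative sketch, not part of the generated SIMDe API: ordered
 * comparison exploits the fact that a NaN never compares equal to itself,
 * so (a == a) && (b == b) is true exactly when neither operand is NaN.
 * Guarded out so it never affects compilation. */
#if 0
static int simde_example_is_ordered(float a, float b) {
  return (a == a) && (b == b);  /* false whenever either operand is NaN */
}
#endif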
11771 
11772 SIMDE_FUNCTION_ATTRIBUTES
11773 simde__m128
11774 simde_mm_cmpunord_ps (simde__m128 a, simde__m128 b) {
11775   #if defined(SIMDE_X86_SSE_NATIVE)
11776     return _mm_cmpunord_ps(a, b);
11777   #elif defined(SIMDE_WASM_SIMD128_NATIVE)
11778     return wasm_v128_or(wasm_f32x4_ne(a, a), wasm_f32x4_ne(b, b));
11779   #else
11780     simde__m128_private
11781       r_,
11782       a_ = simde__m128_to_private(a),
11783       b_ = simde__m128_to_private(b);
11784 
11785     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
11786       uint32x4_t ceqaa = vceqq_f32(a_.neon_f32, a_.neon_f32);
11787       uint32x4_t ceqbb = vceqq_f32(b_.neon_f32, b_.neon_f32);
11788       r_.neon_u32 = vmvnq_u32(vandq_u32(ceqaa, ceqbb));
11789     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
11790       r_.wasm_v128 = wasm_v128_or(wasm_f32x4_ne(a_.wasm_v128, a_.wasm_v128), wasm_f32x4_ne(b_.wasm_v128, b_.wasm_v128));
11791     #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
11792       r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float),
11793           vec_nand(vec_cmpeq(a_.altivec_f32, a_.altivec_f32), vec_cmpeq(b_.altivec_f32, b_.altivec_f32)));
11794     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
11795       r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float),
11796           vec_and(vec_cmpeq(a_.altivec_f32, a_.altivec_f32), vec_cmpeq(b_.altivec_f32, b_.altivec_f32)));
11797       r_.altivec_f32 = vec_nor(r_.altivec_f32, r_.altivec_f32);
11798     #elif defined(simde_math_isnanf)
11799       SIMDE_VECTORIZE
11800       for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
11801         r_.u32[i] = (simde_math_isnanf(a_.f32[i]) || simde_math_isnanf(b_.f32[i])) ? ~UINT32_C(0) : UINT32_C(0);
11802       }
11803     #else
11804       HEDLEY_UNREACHABLE();
11805     #endif
11806 
11807     return simde__m128_from_private(r_);
11808   #endif
11809 }
11810 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
11811 #  define _mm_cmpunord_ps(a, b) simde_mm_cmpunord_ps((a), (b))
11812 #endif
11813 
11814 SIMDE_FUNCTION_ATTRIBUTES
11815 simde__m128
11816 simde_mm_cmpunord_ss (simde__m128 a, simde__m128 b) {
11817   #if defined(SIMDE_X86_SSE_NATIVE) && !defined(__PGI)
11818     return _mm_cmpunord_ss(a, b);
11819   #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
11820     return simde_mm_move_ss(a, simde_mm_cmpunord_ps(a, b));
11821   #else
11822     simde__m128_private
11823       r_,
11824       a_ = simde__m128_to_private(a),
11825       b_ = simde__m128_to_private(b);
11826 
11827     #if defined(simde_math_isnanf)
11828       r_.u32[0] = (simde_math_isnanf(a_.f32[0]) || simde_math_isnanf(b_.f32[0])) ? ~UINT32_C(0) : UINT32_C(0);
11829       SIMDE_VECTORIZE
11830       for (size_t i = 1 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
11831         r_.u32[i] = a_.u32[i];
11832       }
11833     #else
11834       HEDLEY_UNREACHABLE();
11835     #endif
11836 
11837     return simde__m128_from_private(r_);
11838   #endif
11839 }
11840 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
11841 #  define _mm_cmpunord_ss(a, b) simde_mm_cmpunord_ss((a), (b))
11842 #endif
11843 
11844 SIMDE_FUNCTION_ATTRIBUTES
11845 int
11846 simde_mm_comieq_ss (simde__m128 a, simde__m128 b) {
11847   #if defined(SIMDE_X86_SSE_NATIVE)
11848     return _mm_comieq_ss(a, b);
11849   #else
11850     simde__m128_private
11851       a_ = simde__m128_to_private(a),
11852       b_ = simde__m128_to_private(b);
11853 
11854     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
11855       uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32);
11856       uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32);
11857       uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan));
11858       uint32x4_t a_eq_b = vceqq_f32(a_.neon_f32, b_.neon_f32);
11859       return !!(vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_eq_b), 0) != 0);
11860     #else
11861       return a_.f32[0] == b_.f32[0];
11862     #endif
11863   #endif
11864 }
11865 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
11866 #  define _mm_comieq_ss(a, b) simde_mm_comieq_ss((a), (b))
11867 #endif
11868 
11869 SIMDE_FUNCTION_ATTRIBUTES
11870 int
11871 simde_mm_comige_ss (simde__m128 a, simde__m128 b) {
11872   #if defined(SIMDE_X86_SSE_NATIVE)
11873     return _mm_comige_ss(a, b);
11874   #else
11875     simde__m128_private
11876       a_ = simde__m128_to_private(a),
11877       b_ = simde__m128_to_private(b);
11878 
11879     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
11880       uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32);
11881       uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32);
11882       uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
11883       uint32x4_t a_ge_b = vcgeq_f32(a_.neon_f32, b_.neon_f32);
11884       return !!(vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_ge_b), 0) != 0);
11885     #else
11886       return a_.f32[0] >= b_.f32[0];
11887     #endif
11888   #endif
11889 }
11890 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
11891 #  define _mm_comige_ss(a, b) simde_mm_comige_ss((a), (b))
11892 #endif
11893 
11894 SIMDE_FUNCTION_ATTRIBUTES
11895 int
11896 simde_mm_comigt_ss (simde__m128 a, simde__m128 b) {
11897   #if defined(SIMDE_X86_SSE_NATIVE)
11898     return _mm_comigt_ss(a, b);
11899   #else
11900     simde__m128_private
11901       a_ = simde__m128_to_private(a),
11902       b_ = simde__m128_to_private(b);
11903 
11904     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
11905       uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32);
11906       uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32);
11907       uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
11908       uint32x4_t a_gt_b = vcgtq_f32(a_.neon_f32, b_.neon_f32);
11909       return !!(vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_gt_b), 0) != 0);
11910     #else
11911       return a_.f32[0] > b_.f32[0];
11912     #endif
11913   #endif
11914 }
11915 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
11916 #  define _mm_comigt_ss(a, b) simde_mm_comigt_ss((a), (b))
11917 #endif
11918 
11919 SIMDE_FUNCTION_ATTRIBUTES
11920 int
11921 simde_mm_comile_ss (simde__m128 a, simde__m128 b) {
11922   #if defined(SIMDE_X86_SSE_NATIVE)
11923     return _mm_comile_ss(a, b);
11924   #else
11925     simde__m128_private
11926       a_ = simde__m128_to_private(a),
11927       b_ = simde__m128_to_private(b);
11928 
11929     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
11930       uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32);
11931       uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32);
11932       uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan));
11933       uint32x4_t a_le_b = vcleq_f32(a_.neon_f32, b_.neon_f32);
11934       return !!(vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_le_b), 0) != 0);
11935     #else
11936       return a_.f32[0] <= b_.f32[0];
11937     #endif
11938   #endif
11939 }
11940 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
11941 #  define _mm_comile_ss(a, b) simde_mm_comile_ss((a), (b))
11942 #endif
11943 
11944 SIMDE_FUNCTION_ATTRIBUTES
11945 int
11946 simde_mm_comilt_ss (simde__m128 a, simde__m128 b) {
11947   #if defined(SIMDE_X86_SSE_NATIVE)
11948     return _mm_comilt_ss(a, b);
11949   #else
11950     simde__m128_private
11951       a_ = simde__m128_to_private(a),
11952       b_ = simde__m128_to_private(b);
11953 
11954     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
11955       uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32);
11956       uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32);
11957       uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan));
11958       uint32x4_t a_lt_b = vcltq_f32(a_.neon_f32, b_.neon_f32);
11959       return !!(vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_lt_b), 0) != 0);
11960     #else
11961       return a_.f32[0] < b_.f32[0];
11962     #endif
11963   #endif
11964 }
11965 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
11966 #  define _mm_comilt_ss(a, b) simde_mm_comilt_ss((a), (b))
11967 #endif
11968 
11969 SIMDE_FUNCTION_ATTRIBUTES
11970 int
11971 simde_mm_comineq_ss (simde__m128 a, simde__m128 b) {
11972   #if defined(SIMDE_X86_SSE_NATIVE)
11973     return _mm_comineq_ss(a, b);
11974   #else
11975     simde__m128_private
11976       a_ = simde__m128_to_private(a),
11977       b_ = simde__m128_to_private(b);
11978 
11979     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
11980       uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32);
11981       uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32);
11982       uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
11983       uint32x4_t a_neq_b = vmvnq_u32(vceqq_f32(a_.neon_f32, b_.neon_f32));
11984       return !!(vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_neq_b), 0) != 0);
11985     #else
11986       return a_.f32[0] != b_.f32[0];
11987     #endif
11988   #endif
11989 }
11990 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
11991 #  define _mm_comineq_ss(a, b) simde_mm_comineq_ss((a), (b))
11992 #endif
11993 
11994 SIMDE_FUNCTION_ATTRIBUTES
11995 simde__m128
11996 simde_x_mm_copysign_ps(simde__m128 dest, simde__m128 src) {
11997   simde__m128_private
11998     r_,
11999     dest_ = simde__m128_to_private(dest),
12000     src_ = simde__m128_to_private(src);
12001 
12002   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
12003     const uint32x4_t sign_pos = vreinterpretq_u32_f32(vdupq_n_f32(-SIMDE_FLOAT32_C(0.0)));
12004     r_.neon_u32 = vbslq_u32(sign_pos, src_.neon_u32, dest_.neon_u32);
12005   #elif defined(SIMDE_WASM_SIMD128_NATIVE)
12006     const v128_t sign_pos = wasm_f32x4_splat(-0.0f);
12007     r_.wasm_v128 = wasm_v128_bitselect(src_.wasm_v128, dest_.wasm_v128, sign_pos);
12008   #elif defined(SIMDE_POWER_ALTIVEC_P9_NATIVE)
12009     #if !defined(HEDLEY_IBM_VERSION)
12010       r_.altivec_f32 = vec_cpsgn(dest_.altivec_f32, src_.altivec_f32);
12011     #else
12012       r_.altivec_f32 = vec_cpsgn(src_.altivec_f32, dest_.altivec_f32);
12013     #endif
12014   #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_14_NATIVE)
12015     const SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) sign_pos = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned int), vec_splats(-0.0f));
12016     r_.altivec_f32 = vec_sel(dest_.altivec_f32, src_.altivec_f32, sign_pos);
12017   #elif defined(SIMDE_IEEE754_STORAGE)
12018     (void) src_;
12019     (void) dest_;
12020     simde__m128 sign_pos = simde_mm_set1_ps(-0.0f);
12021     r_ = simde__m128_to_private(simde_mm_xor_ps(dest, simde_mm_and_ps(simde_mm_xor_ps(dest, src), sign_pos)));
12022   #else
12023     SIMDE_VECTORIZE
12024     for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
12025       r_.f32[i] = simde_math_copysignf(dest_.f32[i], src_.f32[i]);
12026     }
12027   #endif
12028 
12029   return simde__m128_from_private(r_);
12030 }
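
/* Illustrative sketch, not part of the generated SIMDe API: the
 * SIMDE_IEEE754_STORAGE fallback above copies only the sign bit of src into
 * dest via dest ^ ((dest ^ src) & sign_mask), where sign_mask is the bit
 * pattern of -0.0f.  A scalar equivalent, assuming 32-bit IEEE-754 floats;
 * guarded out so it never affects compilation of this header. */
#if 0
static float simde_example_copysignf(float dest, float src) {
  uint32_t d, s;
  simde_memcpy(&d, &dest, sizeof(d));
  simde_memcpy(&s, &src, sizeof(s));
  d ^= (d ^ s) & UINT32_C(0x80000000);  /* take the sign bit from src */
  simde_memcpy(&dest, &d, sizeof(dest));
  return dest;
}
#endif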
12031 
12032 SIMDE_FUNCTION_ATTRIBUTES
12033 simde__m128
12034 simde_x_mm_xorsign_ps(simde__m128 dest, simde__m128 src) {
12035   return simde_mm_xor_ps(simde_mm_and_ps(simde_mm_set1_ps(-0.0f), src), dest);
12036 }
12037 
12038 SIMDE_FUNCTION_ATTRIBUTES
12039 simde__m128
12040 simde_mm_cvt_pi2ps (simde__m128 a, simde__m64 b) {
12041   #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
12042     return _mm_cvt_pi2ps(a, b);
12043   #else
12044     simde__m128_private
12045       r_,
12046       a_ = simde__m128_to_private(a);
12047     simde__m64_private b_ = simde__m64_to_private(b);
12048 
12049     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
12050       r_.neon_f32 = vcombine_f32(vcvt_f32_s32(b_.neon_i32), vget_high_f32(a_.neon_f32));
12051     #elif defined(SIMDE_CONVERT_VECTOR_)
12052       SIMDE_CONVERT_VECTOR_(r_.m64_private[0].f32, b_.i32);
12053       r_.m64_private[1] = a_.m64_private[1];
12054     #else
12055       r_.f32[0] = (simde_float32) b_.i32[0];
12056       r_.f32[1] = (simde_float32) b_.i32[1];
12057       r_.i32[2] = a_.i32[2];
12058       r_.i32[3] = a_.i32[3];
12059     #endif
12060 
12061     return simde__m128_from_private(r_);
12062   #endif
12063 }
12064 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
12065 #  define _mm_cvt_pi2ps(a, b) simde_mm_cvt_pi2ps((a), (b))
12066 #endif
12067 
12068 SIMDE_FUNCTION_ATTRIBUTES
12069 simde__m64
12070 simde_mm_cvt_ps2pi (simde__m128 a) {
12071   #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
12072     return _mm_cvt_ps2pi(a);
12073   #else
12074     simde__m64_private r_;
12075     simde__m128_private a_;
12076 
12077   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
12078     a_ = simde__m128_to_private(simde_mm_round_ps(a, SIMDE_MM_FROUND_CUR_DIRECTION));
12079     r_.neon_i32 = vcvt_s32_f32(vget_low_f32(a_.neon_f32));
12080   #elif defined(SIMDE_CONVERT_VECTOR_) && SIMDE_NATURAL_VECTOR_SIZE_GE(128)
12081     a_ = simde__m128_to_private(simde_mm_round_ps(a, SIMDE_MM_FROUND_CUR_DIRECTION));
12082     SIMDE_CONVERT_VECTOR_(r_.i32, a_.m64_private[0].f32);
12083   #else
12084     a_ = simde__m128_to_private(a);
12085 
12086     SIMDE_VECTORIZE
12087     for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
12088       r_.i32[i] = HEDLEY_STATIC_CAST(int32_t, simde_math_nearbyintf(a_.f32[i]));
12089     }
12090   #endif
12091 
12092     return simde__m64_from_private(r_);
12093   #endif
12094 }
12095 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
12096 #  define _mm_cvt_ps2pi(a) simde_mm_cvt_ps2pi((a))
12097 #endif
12098 
12099 SIMDE_FUNCTION_ATTRIBUTES
12100 simde__m128
12101 simde_mm_cvt_si2ss (simde__m128 a, int32_t b) {
12102   #if defined(SIMDE_X86_SSE_NATIVE)
12103     return _mm_cvt_si2ss(a, b);
12104   #else
12105     simde__m128_private
12106       r_,
12107       a_ = simde__m128_to_private(a);
12108 
12109     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
12110       r_.neon_f32 = vsetq_lane_f32(HEDLEY_STATIC_CAST(float, b), a_.neon_f32, 0);
12111     #else
12112       r_.f32[0] = HEDLEY_STATIC_CAST(simde_float32, b);
12113       r_.i32[1] = a_.i32[1];
12114       r_.i32[2] = a_.i32[2];
12115       r_.i32[3] = a_.i32[3];
12116     #endif
12117 
12118     return simde__m128_from_private(r_);
12119   #endif
12120 }
12121 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
12122 #  define _mm_cvt_si2ss(a, b) simde_mm_cvt_si2ss((a), b)
12123 #endif
12124 
12125 SIMDE_FUNCTION_ATTRIBUTES
12126 int32_t
12127 simde_mm_cvt_ss2si (simde__m128 a) {
12128   #if defined(SIMDE_X86_SSE_NATIVE)
12129     return _mm_cvt_ss2si(a);
12130   #elif defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_FAST_CONVERSION_RANGE) && !defined(SIMDE_BUG_GCC_95399)
12131     return vgetq_lane_s32(vcvtnq_s32_f32(simde__m128_to_neon_f32(a)), 0);
12132   #else
12133     simde__m128_private a_ = simde__m128_to_private(simde_mm_round_ps(a, SIMDE_MM_FROUND_CUR_DIRECTION));
12134     #if !defined(SIMDE_FAST_CONVERSION_RANGE)
12135       return ((a_.f32[0] > HEDLEY_STATIC_CAST(simde_float32, INT32_MIN)) &&
12136           (a_.f32[0] < HEDLEY_STATIC_CAST(simde_float32, INT32_MAX))) ?
12137         SIMDE_CONVERT_FTOI(int32_t, a_.f32[0]) : INT32_MIN;
12138     #else
12139       return SIMDE_CONVERT_FTOI(int32_t, a_.f32[0]);
12140     #endif
12141   #endif
12142 }
12143 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
12144 #  define _mm_cvt_ss2si(a) simde_mm_cvt_ss2si((a))
12145 #endif
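
/* Illustrative sketch, not part of the generated SIMDe API: when
 * SIMDE_FAST_CONVERSION_RANGE is not defined, the fallback above maps NaN
 * and out-of-range inputs to INT32_MIN, mirroring the x86 "integer
 * indefinite" result of CVTSS2SI.  Guarded out so it never affects
 * compilation. */
#if 0
static int32_t simde_example_cvt_f32_i32(simde_float32 v) {
  if (!(v > HEDLEY_STATIC_CAST(simde_float32, INT32_MIN) &&
        v < HEDLEY_STATIC_CAST(simde_float32, INT32_MAX)))
    return INT32_MIN;  /* NaN or out of range */
  return SIMDE_CONVERT_FTOI(int32_t, v);
}
#endif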
12146 
12147 SIMDE_FUNCTION_ATTRIBUTES
12148 simde__m128
12149 simde_mm_cvtpi16_ps (simde__m64 a) {
12150   #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
12151     return _mm_cvtpi16_ps(a);
12152   #else
12153     simde__m128_private r_;
12154     simde__m64_private a_ = simde__m64_to_private(a);
12155 
12156     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
12157       r_.neon_f32 = vcvtq_f32_s32(vmovl_s16(a_.neon_i16));
12158     #elif defined(SIMDE_CONVERT_VECTOR_)
12159       SIMDE_CONVERT_VECTOR_(r_.f32, a_.i16);
12160     #else
12161       SIMDE_VECTORIZE
12162       for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
12163         simde_float32 v = a_.i16[i];
12164         r_.f32[i] = v;
12165       }
12166     #endif
12167 
12168     return simde__m128_from_private(r_);
12169   #endif
12170 }
12171 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
12172 #  define _mm_cvtpi16_ps(a) simde_mm_cvtpi16_ps(a)
12173 #endif
12174 
12175 SIMDE_FUNCTION_ATTRIBUTES
12176 simde__m128
12177 simde_mm_cvtpi32_ps (simde__m128 a, simde__m64 b) {
12178   #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
12179     return _mm_cvtpi32_ps(a, b);
12180   #else
12181     simde__m128_private
12182       r_,
12183       a_ = simde__m128_to_private(a);
12184     simde__m64_private b_ = simde__m64_to_private(b);
12185 
12186     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
12187       r_.neon_f32 = vcombine_f32(vcvt_f32_s32(b_.neon_i32), vget_high_f32(a_.neon_f32));
12188     #elif defined(SIMDE_CONVERT_VECTOR_)
12189       SIMDE_CONVERT_VECTOR_(r_.m64_private[0].f32, b_.i32);
12190       r_.m64_private[1] = a_.m64_private[1];
12191     #else
12192       r_.f32[0] = (simde_float32) b_.i32[0];
12193       r_.f32[1] = (simde_float32) b_.i32[1];
12194       r_.i32[2] = a_.i32[2];
12195       r_.i32[3] = a_.i32[3];
12196     #endif
12197 
12198     return simde__m128_from_private(r_);
12199   #endif
12200 }
12201 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
12202 #  define _mm_cvtpi32_ps(a, b) simde_mm_cvtpi32_ps((a), b)
12203 #endif
12204 
12205 SIMDE_FUNCTION_ATTRIBUTES
12206 simde__m128
12207 simde_mm_cvtpi32x2_ps (simde__m64 a, simde__m64 b) {
12208   #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
12209     return _mm_cvtpi32x2_ps(a, b);
12210   #else
12211     simde__m128_private r_;
12212     simde__m64_private
12213       a_ = simde__m64_to_private(a),
12214       b_ = simde__m64_to_private(b);
12215 
12216     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
12217       r_.neon_f32 = vcvtq_f32_s32(vcombine_s32(a_.neon_i32, b_.neon_i32));
12218     #elif defined(SIMDE_CONVERT_VECTOR_)
12219       SIMDE_CONVERT_VECTOR_(r_.m64_private[0].f32, a_.i32);
12220       SIMDE_CONVERT_VECTOR_(r_.m64_private[1].f32, b_.i32);
12221     #else
12222       r_.f32[0] = (simde_float32) a_.i32[0];
12223       r_.f32[1] = (simde_float32) a_.i32[1];
12224       r_.f32[2] = (simde_float32) b_.i32[0];
12225       r_.f32[3] = (simde_float32) b_.i32[1];
12226     #endif
12227 
12228     return simde__m128_from_private(r_);
12229   #endif
12230 }
12231 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
12232 #  define _mm_cvtpi32x2_ps(a, b) simde_mm_cvtpi32x2_ps(a, b)
12233 #endif
12234 
12235 SIMDE_FUNCTION_ATTRIBUTES
12236 simde__m128
12237 simde_mm_cvtpi8_ps (simde__m64 a) {
12238   #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
12239     return _mm_cvtpi8_ps(a);
12240   #else
12241     simde__m128_private r_;
12242     simde__m64_private a_ = simde__m64_to_private(a);
12243 
12244     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
12245       r_.neon_f32 = vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(a_.neon_i8))));
12246     #else
12247       r_.f32[0] = HEDLEY_STATIC_CAST(simde_float32, a_.i8[0]);
12248       r_.f32[1] = HEDLEY_STATIC_CAST(simde_float32, a_.i8[1]);
12249       r_.f32[2] = HEDLEY_STATIC_CAST(simde_float32, a_.i8[2]);
12250       r_.f32[3] = HEDLEY_STATIC_CAST(simde_float32, a_.i8[3]);
12251     #endif
12252 
12253     return simde__m128_from_private(r_);
12254   #endif
12255 }
12256 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
12257 #  define _mm_cvtpi8_ps(a) simde_mm_cvtpi8_ps(a)
12258 #endif
12259 
12260 SIMDE_FUNCTION_ATTRIBUTES
12261 simde__m64
12262 simde_mm_cvtps_pi16 (simde__m128 a) {
12263   #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
12264     return _mm_cvtps_pi16(a);
12265   #else
12266     simde__m64_private r_;
12267     simde__m128_private a_ = simde__m128_to_private(a);
12268 
12269     #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && !defined(SIMDE_BUG_GCC_95399)
12270       r_.neon_i16 = vmovn_s32(vcvtq_s32_f32(vrndiq_f32(a_.neon_f32)));
12271     #else
12272       SIMDE_VECTORIZE
12273       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
12274         r_.i16[i] = SIMDE_CONVERT_FTOI(int16_t, simde_math_roundf(a_.f32[i]));
12275       }
12276     #endif
12277 
12278     return simde__m64_from_private(r_);
12279   #endif
12280 }
12281 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
12282 #  define _mm_cvtps_pi16(a) simde_mm_cvtps_pi16((a))
12283 #endif
12284 
12285 SIMDE_FUNCTION_ATTRIBUTES
12286 simde__m64
12287 simde_mm_cvtps_pi32 (simde__m128 a) {
12288   #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
12289     return _mm_cvtps_pi32(a);
12290   #else
12291     simde__m64_private r_;
12292     simde__m128_private a_ = simde__m128_to_private(a);
12293 
12294     #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_FAST_CONVERSION_RANGE) && !defined(SIMDE_BUG_GCC_95399)
12295       r_.neon_i32 = vcvt_s32_f32(vget_low_f32(vrndiq_f32(a_.neon_f32)));
12296     #else
12297       SIMDE_VECTORIZE
12298       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
12299         simde_float32 v = simde_math_roundf(a_.f32[i]);
12300         #if !defined(SIMDE_FAST_CONVERSION_RANGE)
12301           r_.i32[i] = ((v > HEDLEY_STATIC_CAST(simde_float32, INT32_MIN)) && (v < HEDLEY_STATIC_CAST(simde_float32, INT32_MAX))) ?
12302             SIMDE_CONVERT_FTOI(int32_t, v) : INT32_MIN;
12303         #else
12304           r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, v);
12305         #endif
12306       }
12307     #endif
12308 
12309     return simde__m64_from_private(r_);
12310   #endif
12311 }
12312 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
12313 #  define _mm_cvtps_pi32(a) simde_mm_cvtps_pi32((a))
12314 #endif
12315 
12316 SIMDE_FUNCTION_ATTRIBUTES
12317 simde__m64
12318 simde_mm_cvtps_pi8 (simde__m128 a) {
12319   #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
12320     return _mm_cvtps_pi8(a);
12321   #else
12322     simde__m64_private r_;
12323     simde__m128_private a_ = simde__m128_to_private(a);
12324 
12325     #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && !defined(SIMDE_BUG_GCC_95471)
12326       /* Clamp the input to [INT8_MIN, INT8_MAX], round, convert to i32, narrow to
12327       * i16, combine with an all-zero vector of i16 (which will become the upper
12328       * half), narrow to i8. */
12329       float32x4_t max = vdupq_n_f32(HEDLEY_STATIC_CAST(simde_float32, INT8_MAX));
12330       float32x4_t min = vdupq_n_f32(HEDLEY_STATIC_CAST(simde_float32, INT8_MIN));
12331       float32x4_t values = vrndnq_f32(vmaxq_f32(vminq_f32(max, a_.neon_f32), min));
12332       r_.neon_i8 = vmovn_s16(vcombine_s16(vmovn_s32(vcvtq_s32_f32(values)), vdup_n_s16(0)));
12333     #else
12334       SIMDE_VECTORIZE
12335       for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) {
12336         if (a_.f32[i] > HEDLEY_STATIC_CAST(simde_float32, INT8_MAX))
12337           r_.i8[i] = INT8_MAX;
12338         else if (a_.f32[i] <  HEDLEY_STATIC_CAST(simde_float32, INT8_MIN))
12339           r_.i8[i] = INT8_MIN;
12340         else
12341           r_.i8[i] = SIMDE_CONVERT_FTOI(int8_t, simde_math_roundf(a_.f32[i]));
12342       }
12343       /* Note: the upper half is undefined */
12344     #endif
12345 
12346     return simde__m64_from_private(r_);
12347   #endif
12348 }
12349 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
12350 #  define _mm_cvtps_pi8(a) simde_mm_cvtps_pi8((a))
12351 #endif
12352 
12353 SIMDE_FUNCTION_ATTRIBUTES
12354 simde__m128
12355 simde_mm_cvtpu16_ps (simde__m64 a) {
12356   #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
12357     return _mm_cvtpu16_ps(a);
12358   #else
12359     simde__m128_private r_;
12360     simde__m64_private a_ = simde__m64_to_private(a);
12361 
12362     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
12363       r_.neon_f32 = vcvtq_f32_u32(vmovl_u16(a_.neon_u16));
12364     #elif defined(SIMDE_CONVERT_VECTOR_)
12365       SIMDE_CONVERT_VECTOR_(r_.f32, a_.u16);
12366     #else
12367       SIMDE_VECTORIZE
12368       for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
12369         r_.f32[i] = (simde_float32) a_.u16[i];
12370       }
12371     #endif
12372 
12373     return simde__m128_from_private(r_);
12374   #endif
12375 }
12376 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
12377 #  define _mm_cvtpu16_ps(a) simde_mm_cvtpu16_ps(a)
12378 #endif
12379 
12380 SIMDE_FUNCTION_ATTRIBUTES
12381 simde__m128
12382 simde_mm_cvtpu8_ps (simde__m64 a) {
12383   #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
12384     return _mm_cvtpu8_ps(a);
12385   #else
12386     simde__m128_private r_;
12387     simde__m64_private a_ = simde__m64_to_private(a);
12388 
12389     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
12390       r_.neon_f32 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(a_.neon_u8))));
12391     #else
12392       SIMDE_VECTORIZE
12393       for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
12394         r_.f32[i] = HEDLEY_STATIC_CAST(simde_float32, a_.u8[i]);
12395       }
12396     #endif
12397 
12398     return simde__m128_from_private(r_);
12399   #endif
12400 }
12401 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
12402 #  define _mm_cvtpu8_ps(a) simde_mm_cvtpu8_ps(a)
12403 #endif
12404 
12405 SIMDE_FUNCTION_ATTRIBUTES
12406 simde__m128
12407 simde_mm_cvtsi32_ss (simde__m128 a, int32_t b) {
12408   #if defined(SIMDE_X86_SSE_NATIVE)
12409     return _mm_cvtsi32_ss(a, b);
12410   #else
12411     simde__m128_private r_;
12412     simde__m128_private a_ = simde__m128_to_private(a);
12413 
12414     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
12415       r_.neon_f32 = vsetq_lane_f32(HEDLEY_STATIC_CAST(float32_t, b), a_.neon_f32, 0);
12416     #else
12417       r_ = a_;
12418       r_.f32[0] = HEDLEY_STATIC_CAST(simde_float32, b);
12419     #endif
12420 
12421     return simde__m128_from_private(r_);
12422   #endif
12423 }
12424 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
12425 #  define _mm_cvtsi32_ss(a, b) simde_mm_cvtsi32_ss((a), b)
12426 #endif
12427 
12428 SIMDE_FUNCTION_ATTRIBUTES
12429 simde__m128
12430 simde_mm_cvtsi64_ss (simde__m128 a, int64_t b) {
12431   #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_ARCH_AMD64)
12432     #if !defined(__PGI)
12433       return _mm_cvtsi64_ss(a, b);
12434     #else
12435       return _mm_cvtsi64x_ss(a, b);
12436     #endif
12437   #else
12438     simde__m128_private r_;
12439     simde__m128_private a_ = simde__m128_to_private(a);
12440 
12441     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
12442       r_.neon_f32 = vsetq_lane_f32(HEDLEY_STATIC_CAST(float32_t, b), a_.neon_f32, 0);
12443     #else
12444       r_ = a_;
12445       r_.f32[0] = HEDLEY_STATIC_CAST(simde_float32, b);
12446     #endif
12447 
12448     return simde__m128_from_private(r_);
12449   #endif
12450 }
12451 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_AMD64))
12452 #  define _mm_cvtsi64_ss(a, b) simde_mm_cvtsi64_ss((a), b)
12453 #endif
12454 
12455 SIMDE_FUNCTION_ATTRIBUTES
12456 simde_float32
12457 simde_mm_cvtss_f32 (simde__m128 a) {
12458   #if defined(SIMDE_X86_SSE_NATIVE)
12459     return _mm_cvtss_f32(a);
12460   #else
12461     simde__m128_private a_ = simde__m128_to_private(a);
12462     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
12463       return vgetq_lane_f32(a_.neon_f32, 0);
12464     #else
12465       return a_.f32[0];
12466     #endif
12467   #endif
12468 }
12469 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
12470 #  define _mm_cvtss_f32(a) simde_mm_cvtss_f32((a))
12471 #endif
12472 
12473 SIMDE_FUNCTION_ATTRIBUTES
12474 int32_t
12475 simde_mm_cvtss_si32 (simde__m128 a) {
12476   return simde_mm_cvt_ss2si(a);
12477 }
12478 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
12479 #  define _mm_cvtss_si32(a) simde_mm_cvtss_si32((a))
12480 #endif
12481 
12482 SIMDE_FUNCTION_ATTRIBUTES
12483 int64_t
12484 simde_mm_cvtss_si64 (simde__m128 a) {
12485   #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_ARCH_AMD64)
12486     #if !defined(__PGI)
12487       return _mm_cvtss_si64(a);
12488     #else
12489       return _mm_cvtss_si64x(a);
12490     #endif
12491   #else
12492     simde__m128_private a_ = simde__m128_to_private(a);
12493     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
12494       return SIMDE_CONVERT_FTOI(int64_t, simde_math_roundf(vgetq_lane_f32(a_.neon_f32, 0)));
12495     #else
12496       return SIMDE_CONVERT_FTOI(int64_t, simde_math_roundf(a_.f32[0]));
12497     #endif
12498   #endif
12499 }
12500 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_AMD64))
12501 #  define _mm_cvtss_si64(a) simde_mm_cvtss_si64((a))
12502 #endif
12503 
12504 SIMDE_FUNCTION_ATTRIBUTES
12505 simde__m64
12506 simde_mm_cvtt_ps2pi (simde__m128 a) {
12507   #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
12508     return _mm_cvtt_ps2pi(a);
12509   #else
12510     simde__m64_private r_;
12511     simde__m128_private a_ = simde__m128_to_private(a);
12512 
12513     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_FAST_CONVERSION_RANGE)
12514       r_.neon_i32 = vcvt_s32_f32(vget_low_f32(a_.neon_f32));
12515     #else
12516       SIMDE_VECTORIZE
12517       for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
12518         simde_float32 v = a_.f32[i];
12519         #if !defined(SIMDE_FAST_CONVERSION_RANGE)
12520           r_.i32[i] = ((v > HEDLEY_STATIC_CAST(simde_float32, INT32_MIN)) && (v < HEDLEY_STATIC_CAST(simde_float32, INT32_MAX))) ?
12521             SIMDE_CONVERT_FTOI(int32_t, v) : INT32_MIN;
12522         #else
12523           r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, v);
12524         #endif
12525       }
12526     #endif
12527 
12528     return simde__m64_from_private(r_);
12529   #endif
12530 }
12531 #define simde_mm_cvttps_pi32(a) simde_mm_cvtt_ps2pi(a)
12532 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
12533 #  define _mm_cvtt_ps2pi(a) simde_mm_cvtt_ps2pi((a))
12534 #  define _mm_cvttps_pi32(a) simde_mm_cvttps_pi32((a))
12535 #endif
12536 
12537 SIMDE_FUNCTION_ATTRIBUTES
12538 int32_t
12539 simde_mm_cvtt_ss2si (simde__m128 a) {
12540   #if defined(SIMDE_X86_SSE_NATIVE)
12541     return _mm_cvtt_ss2si(a);
12542   #else
12543     simde__m128_private a_ = simde__m128_to_private(a);
12544 
12545     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_FAST_CONVERSION_RANGE)
12546       return SIMDE_CONVERT_FTOI(int32_t, vgetq_lane_f32(a_.neon_f32, 0));
12547     #else
12548       simde_float32 v = a_.f32[0];
12549       #if !defined(SIMDE_FAST_CONVERSION_RANGE)
12550         return ((v > HEDLEY_STATIC_CAST(simde_float32, INT32_MIN)) && (v < HEDLEY_STATIC_CAST(simde_float32, INT32_MAX))) ?
12551           SIMDE_CONVERT_FTOI(int32_t, v) : INT32_MIN;
12552       #else
12553         return SIMDE_CONVERT_FTOI(int32_t, v);
12554       #endif
12555     #endif
12556   #endif
12557 }
12558 #define simde_mm_cvttss_si32(a) simde_mm_cvtt_ss2si((a))
12559 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
12560 #  define _mm_cvtt_ss2si(a) simde_mm_cvtt_ss2si((a))
12561 #  define _mm_cvttss_si32(a) simde_mm_cvtt_ss2si((a))
12562 #endif
12563 
12564 SIMDE_FUNCTION_ATTRIBUTES
12565 int64_t
12566 simde_mm_cvttss_si64 (simde__m128 a) {
12567   #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_ARCH_AMD64) && !defined(_MSC_VER)
12568     #if defined(__PGI)
12569       return _mm_cvttss_si64x(a);
12570     #else
12571       return _mm_cvttss_si64(a);
12572     #endif
12573   #else
12574     simde__m128_private a_ = simde__m128_to_private(a);
12575 
12576     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
12577       return SIMDE_CONVERT_FTOI(int64_t, vgetq_lane_f32(a_.neon_f32, 0));
12578     #else
12579       return SIMDE_CONVERT_FTOI(int64_t, a_.f32[0]);
12580     #endif
12581   #endif
12582 }
12583 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_AMD64))
12584 #  define _mm_cvttss_si64(a) simde_mm_cvttss_si64((a))
12585 #endif
12586 
12587 SIMDE_FUNCTION_ATTRIBUTES
12588 simde__m128
12589 simde_mm_cmpord_ss (simde__m128 a, simde__m128 b) {
12590   #if defined(SIMDE_X86_SSE_NATIVE)
12591     return _mm_cmpord_ss(a, b);
12592   #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
12593     return simde_mm_move_ss(a, simde_mm_cmpord_ps(a, b));
12594   #else
12595     simde__m128_private
12596       r_,
12597       a_ = simde__m128_to_private(a);
12598 
12599     #if defined(simde_math_isnanf)
12600       r_.u32[0] = (simde_math_isnanf(simde_mm_cvtss_f32(a)) || simde_math_isnanf(simde_mm_cvtss_f32(b))) ? UINT32_C(0) : ~UINT32_C(0);
12601       SIMDE_VECTORIZE
12602       for (size_t i = 1 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
12603         r_.u32[i] = a_.u32[i];
12604       }
12605     #else
12606       HEDLEY_UNREACHABLE();
12607     #endif
12608 
12609     return simde__m128_from_private(r_);
12610   #endif
12611 }
12612 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
12613 #  define _mm_cmpord_ss(a, b) simde_mm_cmpord_ss((a), (b))
12614 #endif
12615 
12616 SIMDE_FUNCTION_ATTRIBUTES
12617 simde__m128
12618 simde_mm_div_ps (simde__m128 a, simde__m128 b) {
12619   #if defined(SIMDE_X86_SSE_NATIVE)
12620     return _mm_div_ps(a, b);
12621   #else
12622     simde__m128_private
12623       r_,
12624       a_ = simde__m128_to_private(a),
12625       b_ = simde__m128_to_private(b);
12626 
12627     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
12628       r_.neon_f32 = vdivq_f32(a_.neon_f32, b_.neon_f32);
12629     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
12630       float32x4_t recip0 = vrecpeq_f32(b_.neon_f32);
12631       float32x4_t recip1 = vmulq_f32(recip0, vrecpsq_f32(recip0, b_.neon_f32));
12632       r_.neon_f32 = vmulq_f32(a_.neon_f32, recip1);
12633     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
12634       r_.wasm_v128 =  wasm_f32x4_div(a_.wasm_v128, b_.wasm_v128);
12635     #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
12636       r_.altivec_f32 = vec_div(a_.altivec_f32, b_.altivec_f32);
12637     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
12638       r_.f32 = a_.f32 / b_.f32;
12639     #else
12640       SIMDE_VECTORIZE
12641       for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
12642         r_.f32[i] = a_.f32[i] / b_.f32[i];
12643       }
12644     #endif
12645 
12646     return simde__m128_from_private(r_);
12647   #endif
12648 }
12649 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
12650 #  define _mm_div_ps(a, b) simde_mm_div_ps((a), (b))
12651 #endif
12652 
12653 SIMDE_FUNCTION_ATTRIBUTES
12654 simde__m128
12655 simde_mm_div_ss (simde__m128 a, simde__m128 b) {
12656   #if defined(SIMDE_X86_SSE_NATIVE)
12657     return _mm_div_ss(a, b);
12658   #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
12659     return simde_mm_move_ss(a, simde_mm_div_ps(a, b));
12660   #else
12661     simde__m128_private
12662       r_,
12663       a_ = simde__m128_to_private(a),
12664       b_ = simde__m128_to_private(b);
12665 
12666     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
12667       float32_t value =
12668               vgetq_lane_f32(simde__m128_to_private(simde_mm_div_ps(a, b)).neon_f32, 0);
12669       r_.neon_f32 = vsetq_lane_f32(value, a_.neon_f32, 0);
12670     #else
12671       r_.f32[0] = a_.f32[0] / b_.f32[0];
12672       SIMDE_VECTORIZE
12673       for (size_t i = 1 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
12674         r_.f32[i] = a_.f32[i];
12675       }
12676     #endif
12677 
12678     return simde__m128_from_private(r_);
12679   #endif
12680 }
12681 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
12682 #  define _mm_div_ss(a, b) simde_mm_div_ss((a), (b))
12683 #endif
12684 
12685 SIMDE_FUNCTION_ATTRIBUTES
12686 int16_t
12687 simde_mm_extract_pi16 (simde__m64 a, const int imm8)
12688     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 3) {
12689   simde__m64_private a_ = simde__m64_to_private(a);
12690   return a_.i16[imm8];
12691 }
12692 #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) && !defined(HEDLEY_PGI_VERSION)
12693 #  if defined(SIMDE_BUG_CLANG_44589)
12694 #    define simde_mm_extract_pi16(a, imm8) ( \
12695          HEDLEY_DIAGNOSTIC_PUSH \
12696          _Pragma("clang diagnostic ignored \"-Wvector-conversion\"") \
12697          HEDLEY_STATIC_CAST(int16_t, _mm_extract_pi16((a), (imm8))) \
12698          HEDLEY_DIAGNOSTIC_POP \
12699        )
12700 #  else
12701 #    define simde_mm_extract_pi16(a, imm8) HEDLEY_STATIC_CAST(int16_t, _mm_extract_pi16(a, imm8))
12702 #  endif
12703 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
12704 #  define simde_mm_extract_pi16(a, imm8) vget_lane_s16(simde__m64_to_private(a).neon_i16, imm8)
12705 #endif
12706 #define simde_m_pextrw(a, imm8) simde_mm_extract_pi16(a, imm8)
12707 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
12708 #  define _mm_extract_pi16(a, imm8) simde_mm_extract_pi16((a), (imm8))
12709 #  define _m_pextrw(a, imm8) simde_mm_extract_pi16((a), (imm8))
12710 #endif
12711 
12712 SIMDE_FUNCTION_ATTRIBUTES
12713 simde__m64
12714 simde_mm_insert_pi16 (simde__m64 a, int16_t i, const int imm8)
12715     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 3) {
12716   simde__m64_private
12717     r_,
12718     a_ = simde__m64_to_private(a);
12719 
12720   r_.i64[0] = a_.i64[0];
12721   r_.i16[imm8] = i;
12722 
12723   return simde__m64_from_private(r_);
12724 }
12725 #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) && !defined(__PGI)
12726 #  if defined(SIMDE_BUG_CLANG_44589)
12727 #    define simde_mm_insert_pi16(a, i, imm8) ( \
12728          HEDLEY_DIAGNOSTIC_PUSH \
12729          _Pragma("clang diagnostic ignored \"-Wvector-conversion\"") \
12730         (_mm_insert_pi16((a), (i), (imm8))) \
12731          HEDLEY_DIAGNOSTIC_POP \
12732        )
12733 #  else
12734 #    define simde_mm_insert_pi16(a, i, imm8) _mm_insert_pi16(a, i, imm8)
12735 #  endif
12736 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
12737 #  define simde_mm_insert_pi16(a, i, imm8) simde__m64_from_neon_i16(vset_lane_s16((i), simde__m64_to_neon_i16(a), (imm8)))
12738 #endif
12739 #define simde_m_pinsrw(a, i, imm8) (simde_mm_insert_pi16(a, i, imm8))
12740 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
12741 #  define _mm_insert_pi16(a, i, imm8) simde_mm_insert_pi16(a, i, imm8)
12742 #  define _m_pinsrw(a, i, imm8) simde_mm_insert_pi16(a, i, imm8)
12743 #endif
12744 
12745 SIMDE_FUNCTION_ATTRIBUTES
12746 simde__m128
12747 simde_mm_load_ps (simde_float32 const mem_addr[HEDLEY_ARRAY_PARAM(4)]) {
12748 #if defined(SIMDE_X86_SSE_NATIVE)
12749   return _mm_load_ps(mem_addr);
12750 #else
12751   simde__m128_private r_;
12752 
12753   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
12754     r_.neon_f32 = vld1q_f32(mem_addr);
12755   #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
12756     r_.altivec_f32 = vec_vsx_ld(0, mem_addr);
12757   #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
12758     r_.altivec_f32 = vec_ld(0, mem_addr);
12759   #else
12760     simde_memcpy(&r_, SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m128), sizeof(r_));
12761   #endif
12762 
12763   return simde__m128_from_private(r_);
12764 #endif
12765 }
12766 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
12767 #  define _mm_load_ps(mem_addr) simde_mm_load_ps(mem_addr)
12768 #endif
12769 
12770 SIMDE_FUNCTION_ATTRIBUTES
12771 simde__m128
12772 simde_mm_load1_ps (simde_float32 const* mem_addr) {
12773   #if defined(SIMDE_X86_SSE_NATIVE)
12774     return _mm_load_ps1(mem_addr);
12775   #else
12776     simde__m128_private r_;
12777 
12778     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
12779       r_.neon_f32 = vld1q_dup_f32(mem_addr);
12780     #else
12781       r_ = simde__m128_to_private(simde_mm_set1_ps(*mem_addr));
12782     #endif
12783 
12784     return simde__m128_from_private(r_);
12785   #endif
12786 }
12787 #define simde_mm_load_ps1(mem_addr) simde_mm_load1_ps(mem_addr)
12788 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
12789 #  define _mm_load_ps1(mem_addr) simde_mm_load1_ps(mem_addr)
12790 #  define _mm_load1_ps(mem_addr) simde_mm_load1_ps(mem_addr)
12791 #endif
12792 
12793 SIMDE_FUNCTION_ATTRIBUTES
12794 simde__m128
12795 simde_mm_load_ss (simde_float32 const* mem_addr) {
12796   #if defined(SIMDE_X86_SSE_NATIVE)
12797     return _mm_load_ss(mem_addr);
12798   #else
12799     simde__m128_private r_;
12800 
12801     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
12802       r_.neon_f32 = vsetq_lane_f32(*mem_addr, vdupq_n_f32(0), 0);
12803     #else
12804       r_.f32[0] = *mem_addr;
12805       r_.i32[1] = 0;
12806       r_.i32[2] = 0;
12807       r_.i32[3] = 0;
12808     #endif
12809 
12810     return simde__m128_from_private(r_);
12811   #endif
12812 }
12813 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
12814 #  define _mm_load_ss(mem_addr) simde_mm_load_ss(mem_addr)
12815 #endif
12816 
12817 SIMDE_FUNCTION_ATTRIBUTES
12818 simde__m128
12819 simde_mm_loadh_pi (simde__m128 a, simde__m64 const* mem_addr) {
12820   #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
12821     return _mm_loadh_pi(a, HEDLEY_REINTERPRET_CAST(__m64 const*, mem_addr));
12822   #else
12823     simde__m128_private
12824       r_,
12825       a_ = simde__m128_to_private(a);
12826 
12827   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
12828     r_.neon_f32 = vcombine_f32(vget_low_f32(a_.neon_f32), vld1_f32(HEDLEY_REINTERPRET_CAST(const float32_t*, mem_addr)));
12829   #else
12830     simde__m64_private b_ = *HEDLEY_REINTERPRET_CAST(simde__m64_private const*, mem_addr);
12831     r_.f32[0] = a_.f32[0];
12832     r_.f32[1] = a_.f32[1];
12833     r_.f32[2] = b_.f32[0];
12834     r_.f32[3] = b_.f32[1];
12835   #endif
12836 
12837     return simde__m128_from_private(r_);
12838   #endif
12839 }
12840 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
12841   #if HEDLEY_HAS_WARNING("-Wold-style-cast")
12842     #define _mm_loadh_pi(a, mem_addr) simde_mm_loadh_pi((a), HEDLEY_REINTERPRET_CAST(simde__m64 const*, (mem_addr)))
12843   #else
12844     #define _mm_loadh_pi(a, mem_addr) simde_mm_loadh_pi((a), (simde__m64 const*) (mem_addr))
12845   #endif
12846 #endif
12847 
12848 /* The SSE documentation says that there are no alignment requirements
12849    for mem_addr.  Unfortunately they used the __m64 type for the argument
12850    which is supposed to be 8-byte aligned, so some compilers (like clang
12851    with -Wcast-align) will generate a warning if you try to cast, say,
12852    a simde_float32* to a simde__m64* for this function.
12853 
12854    I think the choice of argument type is unfortunate, but I do think we
12855    need to stick to it here.  If there is demand I can always add something
12856    like simde_x_mm_loadl_f32(simde__m128, simde_float32 mem_addr[2]) */
12857 SIMDE_FUNCTION_ATTRIBUTES
12858 simde__m128
12859 simde_mm_loadl_pi (simde__m128 a, simde__m64 const* mem_addr) {
12860   #if defined(SIMDE_X86_SSE_NATIVE)
12861     return _mm_loadl_pi(a, HEDLEY_REINTERPRET_CAST(__m64 const*, mem_addr));
12862   #else
12863     simde__m128_private
12864       r_,
12865       a_ = simde__m128_to_private(a);
12866 
12867     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
12868       r_.neon_f32 = vcombine_f32(vld1_f32(
12869         HEDLEY_REINTERPRET_CAST(const float32_t*, mem_addr)), vget_high_f32(a_.neon_f32));
12870     #else
12871       simde__m64_private b_;
12872       simde_memcpy(&b_, mem_addr, sizeof(b_));
12873       r_.i32[0] = b_.i32[0];
12874       r_.i32[1] = b_.i32[1];
12875       r_.i32[2] = a_.i32[2];
12876       r_.i32[3] = a_.i32[3];
12877     #endif
12878 
12879     return simde__m128_from_private(r_);
12880   #endif
12881 }
12882 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
12883   #if HEDLEY_HAS_WARNING("-Wold-style-cast")
12884     #define _mm_loadl_pi(a, mem_addr) simde_mm_loadl_pi((a), HEDLEY_REINTERPRET_CAST(simde__m64 const*, (mem_addr)))
12885   #else
12886     #define _mm_loadl_pi(a, mem_addr) simde_mm_loadl_pi((a), (simde__m64 const*) (mem_addr))
12887   #endif
12888 #endif
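/* Illustrative caller-side sketch (an added note, not part of the upstream
   API): because mem_addr carries no alignment requirement yet is typed
   simde__m64 const*, copying the two floats through a temporary simde__m64
   avoids the -Wcast-align warning discussed in the note above.  The helper
   and variable names below are hypothetical.

     static simde__m128
     example_loadl_two_floats (simde__m128 a, const simde_float32 src[2]) {
       simde__m64 tmp;
       simde_memcpy(&tmp, src, sizeof(tmp));  // two 32-bit floats == 8 bytes
       return simde_mm_loadl_pi(a, &tmp);     // replaces the low two lanes of a
     }
*/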
12889 
12890 SIMDE_FUNCTION_ATTRIBUTES
12891 simde__m128
12892 simde_mm_loadr_ps (simde_float32 const mem_addr[HEDLEY_ARRAY_PARAM(4)]) {
12893   #if defined(SIMDE_X86_SSE_NATIVE)
12894     return _mm_loadr_ps(mem_addr);
12895   #else
12896     simde__m128_private
12897       r_,
12898       v_ = simde__m128_to_private(simde_mm_load_ps(mem_addr));
12899 
12900     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
12901       r_.neon_f32 = vrev64q_f32(v_.neon_f32);
12902       r_.neon_f32 = vextq_f32(r_.neon_f32, r_.neon_f32, 2);
12903     #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) && defined(__PPC64__)
12904       r_.altivec_f32 = vec_reve(v_.altivec_f32);
12905     #elif defined(SIMDE_SHUFFLE_VECTOR_)
12906       r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, v_.f32, v_.f32, 3, 2, 1, 0);
12907     #else
12908       r_.f32[0] = v_.f32[3];
12909       r_.f32[1] = v_.f32[2];
12910       r_.f32[2] = v_.f32[1];
12911       r_.f32[3] = v_.f32[0];
12912     #endif
12913 
12914     return simde__m128_from_private(r_);
12915   #endif
12916 }
12917 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
12918 #  define _mm_loadr_ps(mem_addr) simde_mm_loadr_ps(mem_addr)
12919 #endif
12920 
12921 SIMDE_FUNCTION_ATTRIBUTES
12922 simde__m128
12923 simde_mm_loadu_ps (simde_float32 const mem_addr[HEDLEY_ARRAY_PARAM(4)]) {
12924   #if defined(SIMDE_X86_SSE_NATIVE)
12925     return _mm_loadu_ps(mem_addr);
12926   #else
12927     simde__m128_private r_;
12928 
12929     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
12930       r_.neon_f32 = vld1q_f32(HEDLEY_REINTERPRET_CAST(const float32_t*, mem_addr));
12931     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
12932       r_.wasm_v128 = wasm_v128_load(mem_addr);
12933     #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) && defined(__PPC64__)
12934       r_.altivec_f32 = vec_vsx_ld(0, mem_addr);
12935     #else
12936       simde_memcpy(&r_, mem_addr, sizeof(r_));
12937     #endif
12938 
12939     return simde__m128_from_private(r_);
12940   #endif
12941 }
12942 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
12943 #  define _mm_loadu_ps(mem_addr) simde_mm_loadu_ps(mem_addr)
12944 #endif
12945 
12946 SIMDE_FUNCTION_ATTRIBUTES
12947 void
12948 simde_mm_maskmove_si64 (simde__m64 a, simde__m64 mask, int8_t* mem_addr) {
12949   #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
12950     _mm_maskmove_si64(a, mask, HEDLEY_REINTERPRET_CAST(char*, mem_addr));
12951   #else
12952     simde__m64_private
12953       a_ = simde__m64_to_private(a),
12954       mask_ = simde__m64_to_private(mask);
12955 
12956     SIMDE_VECTORIZE
12957     for (size_t i = 0 ; i < (sizeof(a_.i8) / sizeof(a_.i8[0])) ; i++)
12958       if (mask_.i8[i] < 0)
12959         mem_addr[i] = a_.i8[i];
12960   #endif
12961 }
12962 #define simde_m_maskmovq(a, mask, mem_addr) simde_mm_maskmove_si64(a, mask, mem_addr)
12963 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
12964 #  define _mm_maskmove_si64(a, mask, mem_addr) simde_mm_maskmove_si64((a), (mask), SIMDE_CHECKED_REINTERPRET_CAST(int8_t*, char*, (mem_addr)))
12965 #  define _m_maskmovq(a, mask, mem_addr) simde_mm_maskmove_si64((a), (mask), SIMDE_CHECKED_REINTERPRET_CAST(int8_t*, char*, (mem_addr)))
12966 #endif
12967 
12968 SIMDE_FUNCTION_ATTRIBUTES
12969 simde__m64
12970 simde_mm_max_pi16 (simde__m64 a, simde__m64 b) {
12971   #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
12972     return _mm_max_pi16(a, b);
12973   #else
12974     simde__m64_private
12975       r_,
12976       a_ = simde__m64_to_private(a),
12977       b_ = simde__m64_to_private(b);
12978 
12979     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
12980       r_.neon_i16 = vmax_s16(a_.neon_i16, b_.neon_i16);
12981     #else
12982       SIMDE_VECTORIZE
12983       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
12984         r_.i16[i] = (a_.i16[i] > b_.i16[i]) ? a_.i16[i] : b_.i16[i];
12985       }
12986     #endif
12987 
12988     return simde__m64_from_private(r_);
12989   #endif
12990 }
12991 #define simde_m_pmaxsw(a, b) simde_mm_max_pi16(a, b)
12992 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
12993 #  define _mm_max_pi16(a, b) simde_mm_max_pi16(a, b)
12994 #  define _m_pmaxsw(a, b) simde_mm_max_pi16(a, b)
12995 #endif
12996 
12997 SIMDE_FUNCTION_ATTRIBUTES
12998 simde__m128
12999 simde_mm_max_ps (simde__m128 a, simde__m128 b) {
13000   #if defined(SIMDE_X86_SSE_NATIVE)
13001     return _mm_max_ps(a, b);
13002   #else
13003     simde__m128_private
13004       r_,
13005       a_ = simde__m128_to_private(a),
13006       b_ = simde__m128_to_private(b);
13007 
13008     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_FAST_NANS)
13009       r_.neon_f32 = vmaxq_f32(a_.neon_f32, b_.neon_f32);
13010     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
13011       r_.neon_f32 = vbslq_f32(vcgtq_f32(a_.neon_f32, b_.neon_f32), a_.neon_f32, b_.neon_f32);
13012     #elif defined(SIMDE_WASM_SIMD128_NATIVE) && defined(SIMDE_FAST_NANS)
13013       r_.wasm_v128 = wasm_f32x4_max(a_.wasm_v128, b_.wasm_v128);
13014     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
13015       r_.wasm_v128 = wasm_v128_bitselect(a_.wasm_v128, b_.wasm_v128, wasm_f32x4_gt(a_.wasm_v128, b_.wasm_v128));
13016     #elif (defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_14_NATIVE)) && defined(SIMDE_FAST_NANS)
13017       r_.altivec_f32 = vec_max(a_.altivec_f32, b_.altivec_f32);
13018     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_14_NATIVE)
13019       r_.altivec_f32 = vec_sel(b_.altivec_f32, a_.altivec_f32, vec_cmpgt(a_.altivec_f32, b_.altivec_f32));
13020     #else
13021       SIMDE_VECTORIZE
13022       for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
13023         r_.f32[i] = (a_.f32[i] > b_.f32[i]) ? a_.f32[i] : b_.f32[i];
13024       }
13025     #endif
13026 
13027     return simde__m128_from_private(r_);
13028   #endif
13029 }
13030 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
13031 #  define _mm_max_ps(a, b) simde_mm_max_ps((a), (b))
13032 #endif
13033 
13034 SIMDE_FUNCTION_ATTRIBUTES
13035 simde__m64
13036 simde_mm_max_pu8 (simde__m64 a, simde__m64 b) {
13037   #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
13038     return _mm_max_pu8(a, b);
13039   #else
13040     simde__m64_private
13041       r_,
13042       a_ = simde__m64_to_private(a),
13043       b_ = simde__m64_to_private(b);
13044 
13045     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
13046       r_.neon_u8 = vmax_u8(a_.neon_u8, b_.neon_u8);
13047     #else
13048       SIMDE_VECTORIZE
13049       for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
13050         r_.u8[i] = (a_.u8[i] > b_.u8[i]) ? a_.u8[i] : b_.u8[i];
13051       }
13052     #endif
13053 
13054     return simde__m64_from_private(r_);
13055   #endif
13056 }
13057 #define simde_m_pmaxub(a, b) simde_mm_max_pu8(a, b)
13058 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
13059 #  define _mm_max_pu8(a, b) simde_mm_max_pu8(a, b)
13060 #  define _m_pmaxub(a, b) simde_mm_max_pu8(a, b)
13061 #endif
13062 
13063 SIMDE_FUNCTION_ATTRIBUTES
13064 simde__m128
13065 simde_mm_max_ss (simde__m128 a, simde__m128 b) {
13066   #if defined(SIMDE_X86_SSE_NATIVE)
13067     return _mm_max_ss(a, b);
13068   #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
13069     return simde_mm_move_ss(a, simde_mm_max_ps(a, b));
13070   #else
13071     simde__m128_private
13072       r_,
13073       a_ = simde__m128_to_private(a),
13074       b_ = simde__m128_to_private(b);
13075 
13076     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
13077       float32_t value = vgetq_lane_f32(vmaxq_f32(a_.neon_f32, b_.neon_f32), 0);
13078       r_.neon_f32 = vsetq_lane_f32(value, a_.neon_f32, 0);
13079     #else
13080       r_.f32[0] = (a_.f32[0] > b_.f32[0]) ? a_.f32[0] : b_.f32[0];
13081       r_.f32[1] = a_.f32[1];
13082       r_.f32[2] = a_.f32[2];
13083       r_.f32[3] = a_.f32[3];
13084     #endif
13085 
13086     return simde__m128_from_private(r_);
13087   #endif
13088 }
13089 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
13090 #  define _mm_max_ss(a, b) simde_mm_max_ss((a), (b))
13091 #endif
13092 
13093 SIMDE_FUNCTION_ATTRIBUTES
13094 simde__m64
13095 simde_mm_min_pi16 (simde__m64 a, simde__m64 b) {
13096   #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
13097     return _mm_min_pi16(a, b);
13098   #else
13099     simde__m64_private
13100       r_,
13101       a_ = simde__m64_to_private(a),
13102       b_ = simde__m64_to_private(b);
13103 
13104     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
13105       r_.neon_i16 = vmin_s16(a_.neon_i16, b_.neon_i16);
13106     #else
13107       SIMDE_VECTORIZE
13108       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
13109         r_.i16[i] = (a_.i16[i] < b_.i16[i]) ? a_.i16[i] : b_.i16[i];
13110       }
13111     #endif
13112 
13113     return simde__m64_from_private(r_);
13114   #endif
13115 }
13116 #define simde_m_pminsw(a, b) simde_mm_min_pi16(a, b)
13117 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
13118 #  define _mm_min_pi16(a, b) simde_mm_min_pi16(a, b)
13119 #  define _m_pminsw(a, b) simde_mm_min_pi16(a, b)
13120 #endif
13121 
13122 SIMDE_FUNCTION_ATTRIBUTES
13123 simde__m128
13124 simde_mm_min_ps (simde__m128 a, simde__m128 b) {
13125   #if defined(SIMDE_X86_SSE_NATIVE)
13126     return _mm_min_ps(a, b);
13127   #elif defined(SIMDE_FAST_NANS) && defined(SIMDE_ARM_NEON_A32V7_NATIVE)
13128     return simde__m128_from_neon_f32(vminq_f32(simde__m128_to_neon_f32(a), simde__m128_to_neon_f32(b)));
13129   #elif defined(SIMDE_WASM_SIMD128_NATIVE)
13130     simde__m128_private
13131       r_,
13132       a_ = simde__m128_to_private(a),
13133       b_ = simde__m128_to_private(b);
13134     #if defined(SIMDE_FAST_NANS)
13135       r_.wasm_v128 = wasm_f32x4_min(a_.wasm_v128, b_.wasm_v128);
13136     #else
13137       r_.wasm_v128 = wasm_v128_bitselect(a_.wasm_v128, b_.wasm_v128, wasm_f32x4_lt(a_.wasm_v128, b_.wasm_v128));
13138     #endif
13139     return simde__m128_from_private(r_);
13140   #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_14_NATIVE)
13141     simde__m128_private
13142       r_,
13143       a_ = simde__m128_to_private(a),
13144       b_ = simde__m128_to_private(b);
13145 
13146     #if defined(SIMDE_FAST_NANS)
13147       r_.altivec_f32 = vec_min(a_.altivec_f32, b_.altivec_f32);
13148     #else
13149       r_.altivec_f32 = vec_sel(b_.altivec_f32, a_.altivec_f32, vec_cmpgt(b_.altivec_f32, a_.altivec_f32));
13150     #endif
13151 
13152     return simde__m128_from_private(r_);
13153   #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
13154     simde__m128 mask = simde_mm_cmplt_ps(a, b);
13155     return simde_mm_or_ps(simde_mm_and_ps(mask, a), simde_mm_andnot_ps(mask, b));
13156   #else
13157     simde__m128_private
13158       r_,
13159       a_ = simde__m128_to_private(a),
13160       b_ = simde__m128_to_private(b);
13161 
13162     SIMDE_VECTORIZE
13163     for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
13164       r_.f32[i] = (a_.f32[i] < b_.f32[i]) ? a_.f32[i] : b_.f32[i];
13165     }
13166 
13167     return simde__m128_from_private(r_);
13168   #endif
13169 }
13170 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
13171 #  define _mm_min_ps(a, b) simde_mm_min_ps((a), (b))
13172 #endif
13173 
13174 SIMDE_FUNCTION_ATTRIBUTES
13175 simde__m64
13176 simde_mm_min_pu8 (simde__m64 a, simde__m64 b) {
13177   #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
13178     return _mm_min_pu8(a, b);
13179   #else
13180     simde__m64_private
13181       r_,
13182       a_ = simde__m64_to_private(a),
13183       b_ = simde__m64_to_private(b);
13184 
13185     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
13186       r_.neon_u8 = vmin_u8(a_.neon_u8, b_.neon_u8);
13187     #else
13188       SIMDE_VECTORIZE
13189       for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
13190         r_.u8[i] = (a_.u8[i] < b_.u8[i]) ? a_.u8[i] : b_.u8[i];
13191       }
13192     #endif
13193 
13194     return simde__m64_from_private(r_);
13195   #endif
13196 }
13197 #define simde_m_pminub(a, b) simde_mm_min_pu8(a, b)
13198 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
13199 #  define _mm_min_pu8(a, b) simde_mm_min_pu8(a, b)
13200 #  define _m_pminub(a, b) simde_mm_min_pu8(a, b)
13201 #endif
13202 
13203 SIMDE_FUNCTION_ATTRIBUTES
13204 simde__m128
13205 simde_mm_min_ss (simde__m128 a, simde__m128 b) {
13206   #if defined(SIMDE_X86_SSE_NATIVE)
13207     return _mm_min_ss(a, b);
13208   #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
13209     return simde_mm_move_ss(a, simde_mm_min_ps(a, b));
13210   #else
13211     simde__m128_private
13212       r_,
13213       a_ = simde__m128_to_private(a),
13214       b_ = simde__m128_to_private(b);
13215 
13216     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
13217       float32_t value = vgetq_lane_f32(vminq_f32(a_.neon_f32, b_.neon_f32), 0);
13218       r_.neon_f32 = vsetq_lane_f32(value, a_.neon_f32, 0);
13219     #else
13220       r_.f32[0] = (a_.f32[0] < b_.f32[0]) ? a_.f32[0] : b_.f32[0];
13221       r_.f32[1] = a_.f32[1];
13222       r_.f32[2] = a_.f32[2];
13223       r_.f32[3] = a_.f32[3];
13224     #endif
13225 
13226     return simde__m128_from_private(r_);
13227   #endif
13228 }
13229 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
13230 #  define _mm_min_ss(a, b) simde_mm_min_ss((a), (b))
13231 #endif
13232 
13233 SIMDE_FUNCTION_ATTRIBUTES
13234 simde__m128
13235 simde_mm_movehl_ps (simde__m128 a, simde__m128 b) {
13236   #if defined(SIMDE_X86_SSE_NATIVE)
13237     return _mm_movehl_ps(a, b);
13238   #else
13239     simde__m128_private
13240       r_,
13241       a_ = simde__m128_to_private(a),
13242       b_ = simde__m128_to_private(b);
13243 
13244     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
13245       float32x2_t a32 = vget_high_f32(a_.neon_f32);
13246       float32x2_t b32 = vget_high_f32(b_.neon_f32);
13247       r_.neon_f32 = vcombine_f32(b32, a32);
13248     #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
13249       r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float),
13250           vec_mergel(b_.altivec_i64, a_.altivec_i64));
13251     #elif defined(SIMDE_SHUFFLE_VECTOR_)
13252       r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, b_.f32, 6, 7, 2, 3);
13253     #else
13254       r_.f32[0] = b_.f32[2];
13255       r_.f32[1] = b_.f32[3];
13256       r_.f32[2] = a_.f32[2];
13257       r_.f32[3] = a_.f32[3];
13258     #endif
13259 
13260     return simde__m128_from_private(r_);
13261   #endif
13262 }
13263 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
13264 #  define _mm_movehl_ps(a, b) simde_mm_movehl_ps((a), (b))
13265 #endif
13266 
13267 SIMDE_FUNCTION_ATTRIBUTES
13268 simde__m128
13269 simde_mm_movelh_ps (simde__m128 a, simde__m128 b) {
13270   #if defined(SIMDE_X86_SSE_NATIVE)
13271     return _mm_movelh_ps(a, b);
13272   #else
13273     simde__m128_private
13274       r_,
13275       a_ = simde__m128_to_private(a),
13276       b_ = simde__m128_to_private(b);
13277 
13278     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
13279       float32x2_t a10 = vget_low_f32(a_.neon_f32);
13280       float32x2_t b10 = vget_low_f32(b_.neon_f32);
13281       r_.neon_f32 = vcombine_f32(a10, b10);
13282     #elif defined(SIMDE_SHUFFLE_VECTOR_)
13283       r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, b_.f32, 0, 1, 4, 5);
13284     #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
13285       r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float),
13286           vec_mergeh(a_.altivec_i64, b_.altivec_i64));
13287     #else
13288       r_.f32[0] = a_.f32[0];
13289       r_.f32[1] = a_.f32[1];
13290       r_.f32[2] = b_.f32[0];
13291       r_.f32[3] = b_.f32[1];
13292     #endif
13293 
13294     return simde__m128_from_private(r_);
13295   #endif
13296 }
13297 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
13298 #  define _mm_movelh_ps(a, b) simde_mm_movelh_ps((a), (b))
13299 #endif
13300 
13301 SIMDE_FUNCTION_ATTRIBUTES
13302 int
13303 simde_mm_movemask_pi8 (simde__m64 a) {
13304   #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
13305     return _mm_movemask_pi8(a);
13306   #else
13307     simde__m64_private a_ = simde__m64_to_private(a);
13308     int r = 0;
13309 
13310     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
13311       uint8x8_t input = a_.neon_u8;
13312       const int8_t xr[8] = {-7, -6, -5, -4, -3, -2, -1, 0};
13313       const uint8x8_t mask_and = vdup_n_u8(0x80);
13314       const int8x8_t mask_shift = vld1_s8(xr);
13315       const uint8x8_t mask_result = vshl_u8(vand_u8(input, mask_and), mask_shift);
13316       uint8x8_t lo = mask_result;
13317       r = vaddv_u8(lo);
13318     #else
13319       const size_t nmemb = sizeof(a_.i8) / sizeof(a_.i8[0]);
13320       SIMDE_VECTORIZE_REDUCTION(|:r)
13321       for (size_t i = 0 ; i < nmemb ; i++) {
13322         r |= (a_.u8[nmemb - 1 - i] >> 7) << (nmemb - 1 - i);
13323       }
13324     #endif
13325 
13326     return r;
13327   #endif
13328 }
13329 #define simde_m_pmovmskb(a) simde_mm_movemask_pi8(a)
13330 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
13331 #  define _mm_movemask_pi8(a) simde_mm_movemask_pi8(a)
13332 #  define _m_pmovmskb(a) simde_mm_movemask_pi8(a)
13333 #endif
13334 
13335 SIMDE_FUNCTION_ATTRIBUTES
13336 int
13337 simde_mm_movemask_ps (simde__m128 a) {
13338   #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
13339     return _mm_movemask_ps(a);
13340   #else
13341     int r = 0;
13342     simde__m128_private a_ = simde__m128_to_private(a);
13343 
13344     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
13345       static const int32_t shift_amount[] = { 0, 1, 2, 3 };
13346       const int32x4_t shift = vld1q_s32(shift_amount);
13347       uint32x4_t tmp = vshrq_n_u32(a_.neon_u32, 31);
13348       return HEDLEY_STATIC_CAST(int, vaddvq_u32(vshlq_u32(tmp, shift)));
13349     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
13350       // Shift out everything but the sign bits with a 32-bit unsigned shift right.
13351       uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(a_.neon_u32, 31));
13352       // Merge the two pairs together with a 64-bit unsigned shift right + add.
13353       uint8x16_t paired = vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31));
13354       // Extract the result.
13355       return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2);
13356     #else
13357       SIMDE_VECTORIZE_REDUCTION(|:r)
13358       for (size_t i = 0 ; i < sizeof(a_.u32) / sizeof(a_.u32[0]) ; i++) {
13359         r |= (a_.u32[i] >> ((sizeof(a_.u32[i]) * CHAR_BIT) - 1)) << i;
13360       }
13361     #endif
13362 
13363     return r;
13364   #endif
13365 }
13366 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
13367 #  define _mm_movemask_ps(a) simde_mm_movemask_ps((a))
13368 #endif
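/* Worked example (added note): the mask packs the four sign bits into bits
   0..3 of the result, lane 0 in the least-significant bit.  For example:

     simde__m128 v = simde_mm_set_ps(4.0f, -3.0f, 2.0f, -1.0f); // lanes 3..0
     int m = simde_mm_movemask_ps(v); // lanes 0 and 2 are negative -> 0x5
*/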
13369 
13370 SIMDE_FUNCTION_ATTRIBUTES
13371 simde__m128
13372 simde_mm_mul_ps (simde__m128 a, simde__m128 b) {
13373   #if defined(SIMDE_X86_SSE_NATIVE)
13374     return _mm_mul_ps(a, b);
13375   #else
13376     simde__m128_private
13377       r_,
13378       a_ = simde__m128_to_private(a),
13379       b_ = simde__m128_to_private(b);
13380 
13381     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
13382       r_.neon_f32 = vmulq_f32(a_.neon_f32, b_.neon_f32);
13383     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
13384       r_.wasm_v128 = wasm_f32x4_mul(a_.wasm_v128, b_.wasm_v128);
13385     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
13386       r_.f32 = a_.f32 * b_.f32;
13387     #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
13388       r_.altivec_f32 = vec_mul(a_.altivec_f32, b_.altivec_f32);
13389     #else
13390       SIMDE_VECTORIZE
13391       for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
13392         r_.f32[i] = a_.f32[i] * b_.f32[i];
13393       }
13394     #endif
13395 
13396     return simde__m128_from_private(r_);
13397   #endif
13398 }
13399 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
13400 #  define _mm_mul_ps(a, b) simde_mm_mul_ps((a), (b))
13401 #endif
13402 
13403 SIMDE_FUNCTION_ATTRIBUTES
13404 simde__m128
13405 simde_mm_mul_ss (simde__m128 a, simde__m128 b) {
13406   #if defined(SIMDE_X86_SSE_NATIVE)
13407     return _mm_mul_ss(a, b);
13408   #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
13409     return simde_mm_move_ss(a, simde_mm_mul_ps(a, b));
13410   #else
13411     simde__m128_private
13412       r_,
13413       a_ = simde__m128_to_private(a),
13414       b_ = simde__m128_to_private(b);
13415 
13416     r_.f32[0] = a_.f32[0] * b_.f32[0];
13417     r_.f32[1] = a_.f32[1];
13418     r_.f32[2] = a_.f32[2];
13419     r_.f32[3] = a_.f32[3];
13420 
13421     return simde__m128_from_private(r_);
13422   #endif
13423 }
13424 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
13425 #  define _mm_mul_ss(a, b) simde_mm_mul_ss((a), (b))
13426 #endif
13427 
13428 SIMDE_FUNCTION_ATTRIBUTES
13429 simde__m64
13430 simde_mm_mulhi_pu16 (simde__m64 a, simde__m64 b) {
13431   #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
13432     return _mm_mulhi_pu16(a, b);
13433   #else
13434     simde__m64_private
13435       r_,
13436       a_ = simde__m64_to_private(a),
13437       b_ = simde__m64_to_private(b);
13438 
13439     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
13440       const uint32x4_t t1 = vmull_u16(a_.neon_u16, b_.neon_u16);
13441       const uint32x4_t t2 = vshrq_n_u32(t1, 16);
13442       const uint16x4_t t3 = vmovn_u32(t2);
13443       r_.neon_u16 = t3;
13444     #else
13445       SIMDE_VECTORIZE
13446       for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
13447         r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, ((HEDLEY_STATIC_CAST(uint32_t, a_.u16[i]) * HEDLEY_STATIC_CAST(uint32_t, b_.u16[i])) >> UINT32_C(16)));
13448       }
13449     #endif
13450 
13451     return simde__m64_from_private(r_);
13452   #endif
13453 }
13454 #define simde_m_pmulhuw(a, b) simde_mm_mulhi_pu16(a, b)
13455 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
13456 #  define _mm_mulhi_pu16(a, b) simde_mm_mulhi_pu16(a, b)
13457 #  define _m_pmulhuw(a, b) simde_mm_mulhi_pu16(a, b)
13458 #endif
13459 
13460 #if defined(SIMDE_X86_SSE_NATIVE) && defined(HEDLEY_GCC_VERSION)
13461   #define SIMDE_MM_HINT_NTA  HEDLEY_STATIC_CAST(enum _mm_hint, 0)
13462   #define SIMDE_MM_HINT_T0   HEDLEY_STATIC_CAST(enum _mm_hint, 1)
13463   #define SIMDE_MM_HINT_T1   HEDLEY_STATIC_CAST(enum _mm_hint, 2)
13464   #define SIMDE_MM_HINT_T2   HEDLEY_STATIC_CAST(enum _mm_hint, 3)
13465   #define SIMDE_MM_HINT_ENTA HEDLEY_STATIC_CAST(enum _mm_hint, 4)
13466   #define SIMDE_MM_HINT_ET0  HEDLEY_STATIC_CAST(enum _mm_hint, 5)
13467   #define SIMDE_MM_HINT_ET1  HEDLEY_STATIC_CAST(enum _mm_hint, 6)
13468   #define SIMDE_MM_HINT_ET2  HEDLEY_STATIC_CAST(enum _mm_hint, 7)
13469 #else
13470   #define SIMDE_MM_HINT_NTA  0
13471   #define SIMDE_MM_HINT_T0   1
13472   #define SIMDE_MM_HINT_T1   2
13473   #define SIMDE_MM_HINT_T2   3
13474   #define SIMDE_MM_HINT_ENTA 4
13475   #define SIMDE_MM_HINT_ET0  5
13476   #define SIMDE_MM_HINT_ET1  6
13477   #define SIMDE_MM_HINT_ET2  7
13478 #endif
13479 
13480 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
13481   HEDLEY_DIAGNOSTIC_PUSH
13482   #if HEDLEY_HAS_WARNING("-Wreserved-id-macro")
13483     _Pragma("clang diagnostic ignored \"-Wreserved-id-macro\"")
13484   #endif
13485   #undef  _MM_HINT_NTA
13486   #define _MM_HINT_NTA  SIMDE_MM_HINT_NTA
13487   #undef  _MM_HINT_T0
13488   #define _MM_HINT_T0   SIMDE_MM_HINT_T0
13489   #undef  _MM_HINT_T1
13490   #define _MM_HINT_T1   SIMDE_MM_HINT_T1
13491   #undef  _MM_HINT_T2
13492   #define _MM_HINT_T2   SIMDE_MM_HINT_T2
13493   #undef  _MM_HINT_ENTA
13494   #define _MM_HINT_ENTA SIMDE_MM_HINT_ENTA
13495   #undef  _MM_HINT_ET0
13496   #define _MM_HINT_ET0  SIMDE_MM_HINT_ET0
13497   #undef  _MM_HINT_ET1
13498   #define _MM_HINT_ET1  SIMDE_MM_HINT_ET1
13499   #undef  _MM_HINT_ET2
13500   #define _MM_HINT_ET2  SIMDE_MM_HINT_ET2
13501   HEDLEY_DIAGNOSTIC_POP
13502 #endif
13503 
13504 SIMDE_FUNCTION_ATTRIBUTES
13505 void
13506 simde_mm_prefetch (char const* p, int i) {
13507   #if defined(HEDLEY_GCC_VERSION)
13508     __builtin_prefetch(p);
13509   #else
13510     (void) p;
13511   #endif
13512 
13513   (void) i;
13514 }
13515 #if defined(SIMDE_X86_SSE_NATIVE)
13516   #if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(10,0,0) /* https://reviews.llvm.org/D71718 */
13517     #define simde_mm_prefetch(p, i) \
13518       (__extension__({ \
13519         HEDLEY_DIAGNOSTIC_PUSH \
13520         HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL \
13521         _mm_prefetch((p), (i)); \
13522         HEDLEY_DIAGNOSTIC_POP \
13523       }))
13524   #else
13525     #define simde_mm_prefetch(p, i) _mm_prefetch(p, i)
13526   #endif
13527 #endif
13528 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
13529   #define _mm_prefetch(p, i) simde_mm_prefetch(p, i)
13530 #endif
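/* Usage sketch (added note): when native SSE is unavailable this forwards to
   __builtin_prefetch on GCC and otherwise degrades to a no-op, so it is
   always safe to call.  The buffer below is hypothetical.

     char buffer[256];
     simde_mm_prefetch(buffer, SIMDE_MM_HINT_T0); // hint: keep in all cache levels
*/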
13531 
13532 SIMDE_FUNCTION_ATTRIBUTES
13533 simde__m128
13534 simde_x_mm_negate_ps(simde__m128 a) {
13535   #if defined(SIMDE_X86_SSE_NATIVE)
13536     return simde_mm_xor_ps(a, _mm_set1_ps(SIMDE_FLOAT32_C(-0.0)));
13537   #else
13538     simde__m128_private
13539       r_,
13540       a_ = simde__m128_to_private(a);
13541 
13542     #if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) && \
13543         (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,1,0))
13544       r_.altivec_f32 = vec_neg(a_.altivec_f32);
13545     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
13546       r_.neon_f32 = vnegq_f32(a_.neon_f32);
13547     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
13548       r_.wasm_v128 = wasm_f32x4_neg(a_.wasm_v128);
13549     #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
13550       r_.altivec_f32 = vec_neg(a_.altivec_f32);
13551     #elif defined(SIMDE_VECTOR_NEGATE)
13552       r_.f32 = -a_.f32;
13553     #else
13554       SIMDE_VECTORIZE
13555       for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
13556         r_.f32[i] = -a_.f32[i];
13557       }
13558     #endif
13559 
13560     return simde__m128_from_private(r_);
13561   #endif
13562 }
13563 
13564 SIMDE_FUNCTION_ATTRIBUTES
13565 simde__m128
13566 simde_mm_rcp_ps (simde__m128 a) {
13567   #if defined(SIMDE_X86_SSE_NATIVE)
13568     return _mm_rcp_ps(a);
13569   #else
13570     simde__m128_private
13571       r_,
13572       a_ = simde__m128_to_private(a);
13573 
13574     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
13575       float32x4_t recip = vrecpeq_f32(a_.neon_f32);
13576 
13577       #if SIMDE_ACCURACY_PREFERENCE > 0
13578         for (int i = 0; i < SIMDE_ACCURACY_PREFERENCE ; ++i) {
13579           recip = vmulq_f32(recip, vrecpsq_f32(recip, a_.neon_f32));
13580         }
13581       #endif
13582 
13583       r_.neon_f32 = recip;
13584     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
13585       r_.wasm_v128 = wasm_f32x4_div(simde_mm_set1_ps(1.0f), a_.wasm_v128);
13586     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
13587       r_.altivec_f32 = vec_re(a_.altivec_f32);
13588     #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
13589       r_.f32 = 1.0f / a_.f32;
13590     #elif defined(SIMDE_IEEE754_STORAGE)
13591       /* https://stackoverflow.com/questions/12227126/division-as-multiply-and-lut-fast-float-division-reciprocal/12228234#12228234 */
13592       SIMDE_VECTORIZE
13593       for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
13594         int32_t ix;
13595         simde_float32 fx = a_.f32[i];
13596         simde_memcpy(&ix, &fx, sizeof(ix));
13597         int32_t x = INT32_C(0x7EF311C3) - ix;
13598         simde_float32 temp;
13599         simde_memcpy(&temp, &x, sizeof(temp));
13600         r_.f32[i] = temp * (SIMDE_FLOAT32_C(2.0) - temp * fx);
13601       }
13602     #else
13603       SIMDE_VECTORIZE
13604       for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
13605         r_.f32[i] = 1.0f / a_.f32[i];
13606       }
13607     #endif
13608 
13609     return simde__m128_from_private(r_);
13610   #endif
13611 }
13612 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
13613 #  define _mm_rcp_ps(a) simde_mm_rcp_ps((a))
13614 #endif
13615 
13616 SIMDE_FUNCTION_ATTRIBUTES
13617 simde__m128
13618 simde_mm_rcp_ss (simde__m128 a) {
13619   #if defined(SIMDE_X86_SSE_NATIVE)
13620     return _mm_rcp_ss(a);
13621   #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
13622     return simde_mm_move_ss(a, simde_mm_rcp_ps(a));
13623   #else
13624     simde__m128_private
13625       r_,
13626       a_ = simde__m128_to_private(a);
13627 
13628     r_.f32[0] = 1.0f / a_.f32[0];
13629     r_.f32[1] = a_.f32[1];
13630     r_.f32[2] = a_.f32[2];
13631     r_.f32[3] = a_.f32[3];
13632 
13633     return simde__m128_from_private(r_);
13634   #endif
13635 }
13636 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
13637 #  define _mm_rcp_ss(a) simde_mm_rcp_ss((a))
13638 #endif
13639 
13640 SIMDE_FUNCTION_ATTRIBUTES
13641 simde__m128
13642 simde_mm_rsqrt_ps (simde__m128 a) {
13643   #if defined(SIMDE_X86_SSE_NATIVE)
13644     return _mm_rsqrt_ps(a);
13645   #else
13646     simde__m128_private
13647       r_,
13648       a_ = simde__m128_to_private(a);
13649 
13650     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
13651       r_.neon_f32 = vrsqrteq_f32(a_.neon_f32);
13652     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
13653       r_.altivec_f32 = vec_rsqrte(a_.altivec_f32);
13654     #elif defined(SIMDE_IEEE754_STORAGE)
13655       /* https://basesandframes.files.wordpress.com/2020/04/even_faster_math_functions_green_2020.pdf
13656         Pages 100 - 103 */
13657       SIMDE_VECTORIZE
13658       for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
13659         #if SIMDE_ACCURACY_PREFERENCE <= 0
13660           r_.i32[i] = INT32_C(0x5F37624F) - (a_.i32[i] >> 1);
13661         #else
13662           simde_float32 x = a_.f32[i];
13663           simde_float32 xhalf = SIMDE_FLOAT32_C(0.5) * x;
13664           int32_t ix;
13665 
13666           simde_memcpy(&ix, &x, sizeof(ix));
13667 
13668           #if SIMDE_ACCURACY_PREFERENCE == 1
13669             ix = INT32_C(0x5F375A82) - (ix >> 1);
13670           #else
13671             ix = INT32_C(0x5F37599E) - (ix >> 1);
13672           #endif
13673 
13674           simde_memcpy(&x, &ix, sizeof(x));
13675 
13676           #if SIMDE_ACCURACY_PREFERENCE >= 2
13677             x = x * (SIMDE_FLOAT32_C(1.5008909) - xhalf * x * x);
13678           #endif
13679           x = x * (SIMDE_FLOAT32_C(1.5008909) - xhalf * x * x);
13680 
13681           r_.f32[i] = x;
13682         #endif
13683       }
13684     #elif defined(simde_math_sqrtf)
13685       SIMDE_VECTORIZE
13686       for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
13687         r_.f32[i] = 1.0f / simde_math_sqrtf(a_.f32[i]);
13688       }
13689     #else
13690       HEDLEY_UNREACHABLE();
13691     #endif
13692 
13693     return simde__m128_from_private(r_);
13694   #endif
13695 }
13696 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
13697 #  define _mm_rsqrt_ps(a) simde_mm_rsqrt_ps((a))
13698 #endif
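/* Added note: the integer expressions above are the classic fast
   reciprocal-square-root estimate (the shifted bit pattern subtracted from a
   magic constant), refined with Newton-Raphson steps of the form
   x = x * (1.5 - 0.5 * a * x * x); 1.5008909 appears to be a tuned variant
   of the 1.5 term (see the reference above).  A scalar sketch using one of
   the same constants, with a hypothetical helper name:

     static simde_float32 example_rsqrt (simde_float32 a) {
       simde_float32 x = a, xhalf = 0.5f * a;
       int32_t i;
       simde_memcpy(&i, &x, sizeof(i));
       i = INT32_C(0x5F37599E) - (i >> 1);   // initial estimate of 1/sqrt(a)
       simde_memcpy(&x, &i, sizeof(x));
       return x * (1.5f - xhalf * x * x);    // one refinement step
     }
*/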
13699 
13700 SIMDE_FUNCTION_ATTRIBUTES
13701 simde__m128
13702 simde_mm_rsqrt_ss (simde__m128 a) {
13703   #if defined(SIMDE_X86_SSE_NATIVE)
13704     return _mm_rsqrt_ss(a);
13705   #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
13706     return simde_mm_move_ss(a, simde_mm_rsqrt_ps(a));
13707   #else
13708     simde__m128_private
13709       r_,
13710       a_ = simde__m128_to_private(a);
13711 
13712   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
13713       r_.neon_f32 = vsetq_lane_f32(vgetq_lane_f32(simde__m128_to_private(simde_mm_rsqrt_ps(a)).neon_f32, 0), a_.neon_f32, 0);
13714   #elif defined(SIMDE_IEEE754_STORAGE)
13715     {
13716       #if SIMDE_ACCURACY_PREFERENCE <= 0
13717         r_.i32[0] = INT32_C(0x5F37624F) - (a_.i32[0] >> 1);
13718       #else
13719         simde_float32 x = a_.f32[0];
13720         simde_float32 xhalf = SIMDE_FLOAT32_C(0.5) * x;
13721         int32_t ix;
13722 
13723         simde_memcpy(&ix, &x, sizeof(ix));
13724 
13725         #if SIMDE_ACCURACY_PREFERENCE == 1
13726           ix = INT32_C(0x5F375A82) - (ix >> 1);
13727         #else
13728           ix = INT32_C(0x5F37599E) - (ix >> 1);
13729         #endif
13730 
13731         simde_memcpy(&x, &ix, sizeof(x));
13732 
13733         #if SIMDE_ACCURACY_PREFERENCE >= 2
13734           x = x * (SIMDE_FLOAT32_C(1.5008909) - xhalf * x * x);
13735         #endif
13736         x = x * (SIMDE_FLOAT32_C(1.5008909) - xhalf * x * x);
13737 
13738         r_.f32[0] = x;
13739       #endif
13740     }
13741     r_.f32[1] = a_.f32[1];
13742     r_.f32[2] = a_.f32[2];
13743     r_.f32[3] = a_.f32[3];
13744   #elif defined(simde_math_sqrtf)
13745     r_.f32[0] = 1.0f / simde_math_sqrtf(a_.f32[0]);
13746     r_.f32[1] = a_.f32[1];
13747     r_.f32[2] = a_.f32[2];
13748     r_.f32[3] = a_.f32[3];
13749   #else
13750     HEDLEY_UNREACHABLE();
13751   #endif
13752 
13753     return simde__m128_from_private(r_);
13754   #endif
13755 }
13756 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
13757 #  define _mm_rsqrt_ss(a) simde_mm_rsqrt_ss((a))
13758 #endif
13759 
13760 SIMDE_FUNCTION_ATTRIBUTES
13761 simde__m64
13762 simde_mm_sad_pu8 (simde__m64 a, simde__m64 b) {
13763   #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
13764     return _mm_sad_pu8(a, b);
13765   #else
13766     simde__m64_private
13767       r_,
13768       a_ = simde__m64_to_private(a),
13769       b_ = simde__m64_to_private(b);
13770 
13771     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
13772       uint16x4_t t = vpaddl_u8(vabd_u8(a_.neon_u8, b_.neon_u8));
13773       uint16_t r0 = t[0] + t[1] + t[2] + t[3];
13774       r_.neon_u16 = vset_lane_u16(r0, vdup_n_u16(0), 0);
13775     #else
13776       uint16_t sum = 0;
13777 
13778       #if defined(SIMDE_HAVE_STDLIB_H)
13779         SIMDE_VECTORIZE_REDUCTION(+:sum)
13780         for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
13781           sum += HEDLEY_STATIC_CAST(uint8_t, abs(a_.u8[i] - b_.u8[i]));
13782         }
13783 
13784         r_.i16[0] = HEDLEY_STATIC_CAST(int16_t, sum);
13785         r_.i16[1] = 0;
13786         r_.i16[2] = 0;
13787         r_.i16[3] = 0;
13788       #else
13789         HEDLEY_UNREACHABLE();
13790       #endif
13791     #endif
13792 
13793     return simde__m64_from_private(r_);
13794   #endif
13795 }
13796 #define simde_m_psadbw(a, b) simde_mm_sad_pu8(a, b)
13797 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
13798 #  define _mm_sad_pu8(a, b) simde_mm_sad_pu8(a, b)
13799 #  define _m_psadbw(a, b) simde_mm_sad_pu8(a, b)
13800 #endif
13801 
13802 SIMDE_FUNCTION_ATTRIBUTES
13803 simde__m128
13804 simde_mm_set_ss (simde_float32 a) {
13805   #if defined(SIMDE_X86_SSE_NATIVE)
13806     return _mm_set_ss(a);
13807   #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
13808     return vsetq_lane_f32(a, vdupq_n_f32(SIMDE_FLOAT32_C(0.0)), 0);
13809   #else
13810     return simde_mm_set_ps(SIMDE_FLOAT32_C(0.0), SIMDE_FLOAT32_C(0.0), SIMDE_FLOAT32_C(0.0), a);
13811   #endif
13812 }
13813 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
13814 #  define _mm_set_ss(a) simde_mm_set_ss(a)
13815 #endif
13816 
13817 SIMDE_FUNCTION_ATTRIBUTES
13818 simde__m128
13819 simde_mm_setr_ps (simde_float32 e3, simde_float32 e2, simde_float32 e1, simde_float32 e0) {
13820   #if defined(SIMDE_X86_SSE_NATIVE)
13821     return _mm_setr_ps(e3, e2, e1, e0);
13822   #else
13823     return simde_mm_set_ps(e0, e1, e2, e3);
13824   #endif
13825 }
13826 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
13827 #  define _mm_setr_ps(e3, e2, e1, e0) simde_mm_setr_ps(e3, e2, e1, e0)
13828 #endif
13829 
13830 SIMDE_FUNCTION_ATTRIBUTES
13831 simde__m128
13832 simde_mm_setzero_ps (void) {
13833   #if defined(SIMDE_X86_SSE_NATIVE)
13834     return _mm_setzero_ps();
13835   #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
13836     return vdupq_n_f32(SIMDE_FLOAT32_C(0.0));
13837   #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
13838     return vec_splats(SIMDE_FLOAT32_C(0.0));
13839   #else
13840     simde__m128 r;
13841     simde_memset(&r, 0, sizeof(r));
13842     return r;
13843   #endif
13844 }
13845 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
13846 #  define _mm_setzero_ps() simde_mm_setzero_ps()
13847 #endif
13848 
13849 #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
13850 HEDLEY_DIAGNOSTIC_PUSH
13851 SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_
13852 #endif
13853 
13854 SIMDE_FUNCTION_ATTRIBUTES
13855 simde__m128
13856 simde_mm_undefined_ps (void) {
13857   simde__m128_private r_;
13858 
13859   #if defined(SIMDE_HAVE_UNDEFINED128)
13860     r_.n = _mm_undefined_ps();
13861   #elif !defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
13862     r_ = simde__m128_to_private(simde_mm_setzero_ps());
13863   #endif
13864 
13865   return simde__m128_from_private(r_);
13866 }
13867 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
13868 #  define _mm_undefined_ps() simde_mm_undefined_ps()
13869 #endif
13870 
13871 #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
13872 HEDLEY_DIAGNOSTIC_POP
13873 #endif
13874 
13875 SIMDE_FUNCTION_ATTRIBUTES
13876 simde__m128
13877 simde_x_mm_setone_ps (void) {
13878   simde__m128 t = simde_mm_setzero_ps();
13879   return simde_mm_cmpeq_ps(t, t);
13880 }
13881 
13882 SIMDE_FUNCTION_ATTRIBUTES
13883 void
13884 simde_mm_sfence (void) {
13885     /* TODO: Use Hedley. */
13886   #if defined(SIMDE_X86_SSE_NATIVE)
13887     _mm_sfence();
13888   #elif defined(__GNUC__) && ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 7))
13889     __atomic_thread_fence(__ATOMIC_SEQ_CST);
13890   #elif !defined(__INTEL_COMPILER) && defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) && !defined(__STDC_NO_ATOMICS__)
13891     #if defined(__GNUC__) && (__GNUC__ == 4) && (__GNUC_MINOR__ < 9)
13892       __atomic_thread_fence(__ATOMIC_SEQ_CST);
13893     #else
13894       atomic_thread_fence(memory_order_seq_cst);
13895     #endif
13896   #elif defined(_MSC_VER)
13897     MemoryBarrier();
13898   #elif HEDLEY_HAS_EXTENSION(c_atomic)
13899     __c11_atomic_thread_fence(__ATOMIC_SEQ_CST);
13900   #elif defined(__GNUC__) && ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 1))
13901     __sync_synchronize();
13902   #elif defined(_OPENMP)
13903     #pragma omp critical(simde_mm_sfence_)
13904     { }
13905   #endif
13906 }
13907 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
13908 #  define _mm_sfence() simde_mm_sfence()
13909 #endif
13910 
13911 #define SIMDE_MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))
13912 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
13913 #  define _MM_SHUFFLE(z, y, x, w) SIMDE_MM_SHUFFLE(z, y, x, w)
13914 #endif
13915 
13916 #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) && !defined(__PGI)
13917 #  define simde_mm_shuffle_pi16(a, imm8) _mm_shuffle_pi16(a, imm8)
13918 #elif defined(SIMDE_SHUFFLE_VECTOR_)
13919 #  define simde_mm_shuffle_pi16(a, imm8) (__extension__ ({ \
13920       const simde__m64_private simde__tmp_a_ = simde__m64_to_private(a); \
13921       simde__m64_from_private((simde__m64_private) { .i16 = \
13922         SIMDE_SHUFFLE_VECTOR_(16, 8, \
13923           (simde__tmp_a_).i16, \
13924           (simde__tmp_a_).i16, \
13925           (((imm8)     ) & 3), \
13926           (((imm8) >> 2) & 3), \
13927           (((imm8) >> 4) & 3), \
13928           (((imm8) >> 6) & 3)) }); }))
13929 #else
13930 SIMDE_FUNCTION_ATTRIBUTES
13931 simde__m64
13932 simde_mm_shuffle_pi16 (simde__m64 a, const int imm8)
13933     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
13934   simde__m64_private r_;
13935   simde__m64_private a_ = simde__m64_to_private(a);
13936 
13937   for (size_t i = 0 ; i < sizeof(r_.i16) / sizeof(r_.i16[0]) ; i++) {
13938     r_.i16[i] = a_.i16[(imm8 >> (i * 2)) & 3];
13939   }
13940 
13941 HEDLEY_DIAGNOSTIC_PUSH
13942 #if HEDLEY_HAS_WARNING("-Wconditional-uninitialized")
13943 #  pragma clang diagnostic ignored "-Wconditional-uninitialized"
13944 #endif
13945   return simde__m64_from_private(r_);
13946 HEDLEY_DIAGNOSTIC_POP
13947 }
13948 #endif
13949 #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) && !defined(__PGI)
13950 #  define simde_m_pshufw(a, imm8) _m_pshufw(a, imm8)
13951 #else
13952 #  define simde_m_pshufw(a, imm8) simde_mm_shuffle_pi16(a, imm8)
13953 #endif
13954 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
13955 #  define _mm_shuffle_pi16(a, imm8) simde_mm_shuffle_pi16(a, imm8)
13956 #  define _m_pshufw(a, imm8) simde_mm_shuffle_pi16(a, imm8)
13957 #endif
13958 
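/* _mm_shuffle_ps: result lanes 0-1 are selected from a and lanes 2-3 from b, each by a 2-bit field of imm8. */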
13959 #if defined(SIMDE_X86_SSE_NATIVE) && !defined(__PGI)
13960 #  define simde_mm_shuffle_ps(a, b, imm8) _mm_shuffle_ps(a, b, imm8)
13961 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
13962   #define simde_mm_shuffle_ps(a, b, imm8)                                   \
13963      __extension__({                                                        \
13964         float32x4_t ret;                                                   \
13965         ret = vmovq_n_f32(                                                 \
13966             vgetq_lane_f32(a, (imm8) & (0x3)));     \
13967         ret = vsetq_lane_f32(                                              \
13968             vgetq_lane_f32(a, ((imm8) >> 2) & 0x3), \
13969             ret, 1);                                                       \
13970         ret = vsetq_lane_f32(                                              \
13971             vgetq_lane_f32(b, ((imm8) >> 4) & 0x3), \
13972             ret, 2);                                                       \
13973         ret = vsetq_lane_f32(                                              \
13974             vgetq_lane_f32(b, ((imm8) >> 6) & 0x3), \
13975             ret, 3);                                                                    \
13976     })
13977 #elif defined(SIMDE_SHUFFLE_VECTOR_)
13978 #  define simde_mm_shuffle_ps(a, b, imm8) (__extension__ ({ \
13979       simde__m128_from_private((simde__m128_private) { .f32 = \
13980         SIMDE_SHUFFLE_VECTOR_(32, 16, \
13981           simde__m128_to_private(a).f32, \
13982           simde__m128_to_private(b).f32, \
13983           (((imm8)     ) & 3), \
13984           (((imm8) >> 2) & 3), \
13985           (((imm8) >> 4) & 3) + 4, \
13986           (((imm8) >> 6) & 3) + 4) }); }))
13987 #else
13988 SIMDE_FUNCTION_ATTRIBUTES
13989 simde__m128
13990 simde_mm_shuffle_ps (simde__m128 a, simde__m128 b, const int imm8)
13991     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
13992   simde__m128_private
13993     r_,
13994     a_ = simde__m128_to_private(a),
13995     b_ = simde__m128_to_private(b);
13996 
13997   r_.f32[0] = a_.f32[(imm8 >> 0) & 3];
13998   r_.f32[1] = a_.f32[(imm8 >> 2) & 3];
13999   r_.f32[2] = b_.f32[(imm8 >> 4) & 3];
14000   r_.f32[3] = b_.f32[(imm8 >> 6) & 3];
14001 
14002   return simde__m128_from_private(r_);
14003 }
14004 #endif
14005 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
14006 #  define _mm_shuffle_ps(a, b, imm8) simde_mm_shuffle_ps((a), (b), imm8)
14007 #endif
14008 
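/* _mm_sqrt_ps: element-wise square root. The ARMv7 NEON path refines the vrsqrteq_f32
 * estimate with Newton-Raphson iterations controlled by SIMDE_ACCURACY_PREFERENCE. */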
14009 SIMDE_FUNCTION_ATTRIBUTES
14010 simde__m128
14011 simde_mm_sqrt_ps (simde__m128 a) {
14012   #if defined(SIMDE_X86_SSE_NATIVE)
14013     return _mm_sqrt_ps(a);
14014   #else
14015     simde__m128_private
14016       r_,
14017       a_ = simde__m128_to_private(a);
14018 
14019     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
14020       r_.neon_f32 = vsqrtq_f32(a_.neon_f32);
14021     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
14022       float32x4_t est = vrsqrteq_f32(a_.neon_f32);
14023       for (int i = 0 ; i <= SIMDE_ACCURACY_PREFERENCE ; i++) {
14024         est = vmulq_f32(vrsqrtsq_f32(vmulq_f32(a_.neon_f32, est), est), est);
14025       }
14026       r_.neon_f32 = vmulq_f32(a_.neon_f32, est);
14027     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
14028       r_.wasm_v128 = wasm_f32x4_sqrt(a_.wasm_v128);
14029     #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_14_NATIVE)
14030       r_.altivec_f32 = vec_sqrt(a_.altivec_f32);
14031     #elif defined(simde_math_sqrt)
14032       SIMDE_VECTORIZE
14033       for (size_t i = 0 ; i < sizeof(r_.f32) / sizeof(r_.f32[0]) ; i++) {
14034         r_.f32[i] = simde_math_sqrtf(a_.f32[i]);
14035       }
14036     #else
14037       HEDLEY_UNREACHABLE();
14038     #endif
14039 
14040     return simde__m128_from_private(r_);
14041   #endif
14042 }
14043 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
14044 #  define _mm_sqrt_ps(a) simde_mm_sqrt_ps((a))
14045 #endif
14046 
14047 SIMDE_FUNCTION_ATTRIBUTES
14048 simde__m128
14049 simde_mm_sqrt_ss (simde__m128 a) {
14050   #if defined(SIMDE_X86_SSE_NATIVE)
14051     return _mm_sqrt_ss(a);
14052   #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
14053     return simde_mm_move_ss(a, simde_mm_sqrt_ps(a));
14054   #else
14055     simde__m128_private
14056       r_,
14057       a_ = simde__m128_to_private(a);
14058 
14059     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
14060       float32_t value =
14061             vgetq_lane_f32(simde__m128_to_private(simde_mm_sqrt_ps(a)).neon_f32, 0);
14062       r_.neon_f32 = vsetq_lane_f32(value, a_.neon_f32, 0);
14063     #elif defined(simde_math_sqrtf)
14064       r_.f32[0] = simde_math_sqrtf(a_.f32[0]);
14065       r_.f32[1] = a_.f32[1];
14066       r_.f32[2] = a_.f32[2];
14067       r_.f32[3] = a_.f32[3];
14068     #else
14069       HEDLEY_UNREACHABLE();
14070     #endif
14071 
14072     return simde__m128_from_private(r_);
14073   #endif
14074 }
14075 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
14076 #  define _mm_sqrt_ss(a) simde_mm_sqrt_ss((a))
14077 #endif
14078 
14079 SIMDE_FUNCTION_ATTRIBUTES
14080 void
14081 simde_mm_store_ps (simde_float32 mem_addr[4], simde__m128 a) {
14082   #if defined(SIMDE_X86_SSE_NATIVE)
14083     _mm_store_ps(mem_addr, a);
14084   #else
14085     simde__m128_private a_ = simde__m128_to_private(a);
14086 
14087     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
14088       vst1q_f32(mem_addr, a_.neon_f32);
14089     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
14090       vec_st(a_.altivec_f32, 0, mem_addr);
14091     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
14092       wasm_v128_store(mem_addr, a_.wasm_v128);
14093     #else
14094       simde_memcpy(mem_addr, &a_, sizeof(a));
14095     #endif
14096   #endif
14097 }
14098 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
14099 #  define _mm_store_ps(mem_addr, a) simde_mm_store_ps(SIMDE_CHECKED_REINTERPRET_CAST(float*, simde_float32*, mem_addr), (a))
14100 #endif
14101 
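/* _mm_store1_ps / _mm_store_ps1: broadcast lane 0 of a to all four elements of the 16-byte-aligned destination. */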
14102 SIMDE_FUNCTION_ATTRIBUTES
14103 void
14104 simde_mm_store1_ps (simde_float32 mem_addr[4], simde__m128 a) {
14105   simde_float32* mem_addr_ = SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m128);
14106 
14107   #if defined(SIMDE_X86_SSE_NATIVE)
14108     _mm_store_ps1(mem_addr_, a);
14109   #else
14110     simde__m128_private a_ = simde__m128_to_private(a);
14111 
14112     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
14113       vst1q_f32(mem_addr_, vdupq_lane_f32(vget_low_f32(a_.neon_f32), 0));
14114     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
14115       wasm_v128_store(mem_addr_, wasm_v32x4_shuffle(a_.wasm_v128, a_.wasm_v128, 0, 0, 0, 0));
14116     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
14117       vec_st(vec_splat(a_.altivec_f32, 0), 0, mem_addr_);
14118     #elif defined(SIMDE_SHUFFLE_VECTOR_)
14119       simde__m128_private tmp_;
14120       tmp_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, a_.f32, 0, 0, 0, 0);
14121       simde_mm_store_ps(mem_addr_, simde__m128_from_private(tmp_));
14122     #else
14123       SIMDE_VECTORIZE_ALIGNED(mem_addr_:16)
14124       for (size_t i = 0 ; i < sizeof(a_.f32) / sizeof(a_.f32[0]) ; i++) {
14125         mem_addr_[i] = a_.f32[0];
14126       }
14127     #endif
14128   #endif
14129 }
14130 #define simde_mm_store_ps1(mem_addr, a) simde_mm_store1_ps(mem_addr, a)
14131 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
14132 #  define _mm_store_ps1(mem_addr, a) simde_mm_store1_ps(SIMDE_CHECKED_REINTERPRET_CAST(float*, simde_float32*, mem_addr), (a))
14133 #  define _mm_store1_ps(mem_addr, a) simde_mm_store1_ps(SIMDE_CHECKED_REINTERPRET_CAST(float*, simde_float32*, mem_addr), (a))
14134 #endif
14135 
14136 SIMDE_FUNCTION_ATTRIBUTES
14137 void
14138 simde_mm_store_ss (simde_float32* mem_addr, simde__m128 a) {
14139   #if defined(SIMDE_X86_SSE_NATIVE)
14140     _mm_store_ss(mem_addr, a);
14141   #else
14142     simde__m128_private a_ = simde__m128_to_private(a);
14143 
14144     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
14145       vst1q_lane_f32(mem_addr, a_.neon_f32, 0);
14146     #else
14147       *mem_addr = a_.f32[0];
14148     #endif
14149   #endif
14150 }
14151 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
14152 #  define _mm_store_ss(mem_addr, a) simde_mm_store_ss(SIMDE_CHECKED_REINTERPRET_CAST(float*, simde_float32*, mem_addr), (a))
14153 #endif
14154 
14155 SIMDE_FUNCTION_ATTRIBUTES
14156 void
14157 simde_mm_storeh_pi (simde__m64* mem_addr, simde__m128 a) {
14158   #if defined(SIMDE_X86_SSE_NATIVE)
14159     _mm_storeh_pi(HEDLEY_REINTERPRET_CAST(__m64*, mem_addr), a);
14160   #else
14161     simde__m128_private a_ = simde__m128_to_private(a);
14162 
14163     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
14164       vst1_f32(HEDLEY_REINTERPRET_CAST(float32_t*, mem_addr), vget_high_f32(a_.neon_f32));
14165     #else
14166       simde_memcpy(mem_addr, &(a_.m64[1]), sizeof(a_.m64[1]));
14167     #endif
14168   #endif
14169 }
14170 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
14171 #  define _mm_storeh_pi(mem_addr, a) simde_mm_storeh_pi(mem_addr, (a))
14172 #endif
14173 
14174 SIMDE_FUNCTION_ATTRIBUTES
14175 void
14176 simde_mm_storel_pi (simde__m64* mem_addr, simde__m128 a) {
14177   #if defined(SIMDE_X86_SSE_NATIVE)
14178     _mm_storel_pi(HEDLEY_REINTERPRET_CAST(__m64*, mem_addr), a);
14179   #else
14180     simde__m64_private* dest_ = HEDLEY_REINTERPRET_CAST(simde__m64_private*, mem_addr);
14181     simde__m128_private a_ = simde__m128_to_private(a);
14182 
14183     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
14184       dest_->neon_f32 = vget_low_f32(a_.neon_f32);
14185     #else
14186       dest_->f32[0] = a_.f32[0];
14187       dest_->f32[1] = a_.f32[1];
14188     #endif
14189   #endif
14190 }
14191 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
14192 #  define _mm_storel_pi(mem_addr, a) simde_mm_storel_pi(mem_addr, (a))
14193 #endif
14194 
14195 SIMDE_FUNCTION_ATTRIBUTES
14196 void
14197 simde_mm_storer_ps (simde_float32 mem_addr[4], simde__m128 a) {
14198   #if defined(SIMDE_X86_SSE_NATIVE)
14199     _mm_storer_ps(mem_addr, a);
14200   #else
14201     simde__m128_private a_ = simde__m128_to_private(a);
14202 
14203     #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
14204       vec_st(vec_reve(a_.altivec_f32), 0, mem_addr);
14205     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
14206       float32x4_t tmp = vrev64q_f32(a_.neon_f32);
14207       vst1q_f32(mem_addr, vextq_f32(tmp, tmp, 2));
14208     #elif defined(SIMDE_SHUFFLE_VECTOR_)
14209       a_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, a_.f32, 3, 2, 1, 0);
14210       simde_mm_store_ps(mem_addr, simde__m128_from_private(a_));
14211     #else
14212       SIMDE_VECTORIZE_ALIGNED(mem_addr:16)
14213       for (size_t i = 0 ; i < sizeof(a_.f32) / sizeof(a_.f32[0]) ; i++) {
14214         mem_addr[i] = a_.f32[((sizeof(a_.f32) / sizeof(a_.f32[0])) - 1) - i];
14215       }
14216     #endif
14217   #endif
14218 }
14219 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
14220 #  define _mm_storer_ps(mem_addr, a) simde_mm_storer_ps(SIMDE_CHECKED_REINTERPRET_CAST(float*, simde_float32*, mem_addr), (a))
14221 #endif
14222 
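/* _mm_storeu_ps: store four floats to memory with no alignment requirement. */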
14223 SIMDE_FUNCTION_ATTRIBUTES
14224 void
14225 simde_mm_storeu_ps (simde_float32 mem_addr[4], simde__m128 a) {
14226   #if defined(SIMDE_X86_SSE_NATIVE)
14227     _mm_storeu_ps(mem_addr, a);
14228   #else
14229     simde__m128_private a_ = simde__m128_to_private(a);
14230 
14231     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
14232       vst1q_f32(mem_addr, a_.neon_f32);
14233     #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
14234       vec_vsx_st(a_.altivec_f32, 0, mem_addr);
14235     #else
14236       simde_memcpy(mem_addr, &a_, sizeof(a_));
14237     #endif
14238   #endif
14239 }
14240 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
14241 #  define _mm_storeu_ps(mem_addr, a) simde_mm_storeu_ps(SIMDE_CHECKED_REINTERPRET_CAST(float*, simde_float32*, mem_addr), (a))
14242 #endif
14243 
14244 SIMDE_FUNCTION_ATTRIBUTES
14245 simde__m128
14246 simde_mm_sub_ps (simde__m128 a, simde__m128 b) {
14247   #if defined(SIMDE_X86_SSE_NATIVE)
14248     return _mm_sub_ps(a, b);
14249   #else
14250     simde__m128_private
14251       r_,
14252       a_ = simde__m128_to_private(a),
14253       b_ = simde__m128_to_private(b);
14254 
14255     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
14256       r_.neon_f32 = vsubq_f32(a_.neon_f32, b_.neon_f32);
14257     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
14258       r_.wasm_v128 = wasm_f32x4_sub(a_.wasm_v128, b_.wasm_v128);
14259     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
14260       r_.altivec_f32 = vec_sub(a_.altivec_f32, b_.altivec_f32);
14261     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
14262       r_.f32 = a_.f32 - b_.f32;
14263     #else
14264       SIMDE_VECTORIZE
14265       for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
14266         r_.f32[i] = a_.f32[i] - b_.f32[i];
14267       }
14268     #endif
14269 
14270     return simde__m128_from_private(r_);
14271   #endif
14272 }
14273 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
14274 #  define _mm_sub_ps(a, b) simde_mm_sub_ps((a), (b))
14275 #endif
14276 
14277 SIMDE_FUNCTION_ATTRIBUTES
14278 simde__m128
14279 simde_mm_sub_ss (simde__m128 a, simde__m128 b) {
14280   #if defined(SIMDE_X86_SSE_NATIVE)
14281     return _mm_sub_ss(a, b);
14282   #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
14283     return simde_mm_move_ss(a, simde_mm_sub_ps(a, b));
14284   #else
14285     simde__m128_private
14286       r_,
14287       a_ = simde__m128_to_private(a),
14288       b_ = simde__m128_to_private(b);
14289 
14290     r_.f32[0] = a_.f32[0] - b_.f32[0];
14291     r_.f32[1] = a_.f32[1];
14292     r_.f32[2] = a_.f32[2];
14293     r_.f32[3] = a_.f32[3];
14294 
14295     return simde__m128_from_private(r_);
14296   #endif
14297 }
14298 
14299 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
14300 #  define _mm_sub_ss(a, b) simde_mm_sub_ss((a), (b))
14301 #endif
14302 
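/* _mm_ucomi*_ss: scalar (lane 0) comparisons that treat NaN as "unordered" rather than
 * raising an invalid-operation exception; the fenv fallback saves and restores the
 * floating-point environment so a comparison against NaN does not leave exception flags set. */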
14303 SIMDE_FUNCTION_ATTRIBUTES
14304 int
14305 simde_mm_ucomieq_ss (simde__m128 a, simde__m128 b) {
14306   #if defined(SIMDE_X86_SSE_NATIVE)
14307     return _mm_ucomieq_ss(a, b);
14308   #else
14309     simde__m128_private
14310       a_ = simde__m128_to_private(a),
14311       b_ = simde__m128_to_private(b);
14312     int r;
14313 
14314     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
14315       uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32);
14316       uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32);
14317       uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan));
14318       uint32x4_t a_eq_b = vceqq_f32(a_.neon_f32, b_.neon_f32);
14319       r = !!(vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_eq_b), 0) != 0);
14320     #elif defined(SIMDE_HAVE_FENV_H)
14321       fenv_t envp;
14322       int x = feholdexcept(&envp);
14323       r = a_.f32[0] == b_.f32[0];
14324       if (HEDLEY_LIKELY(x == 0))
14325         fesetenv(&envp);
14326     #else
14327       r = a_.f32[0] == b_.f32[0];
14328     #endif
14329 
14330     return r;
14331   #endif
14332 }
14333 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
14334 #  define _mm_ucomieq_ss(a, b) simde_mm_ucomieq_ss((a), (b))
14335 #endif
14336 
14337 SIMDE_FUNCTION_ATTRIBUTES
14338 int
14339 simde_mm_ucomige_ss (simde__m128 a, simde__m128 b) {
14340   #if defined(SIMDE_X86_SSE_NATIVE)
14341     return _mm_ucomige_ss(a, b);
14342   #else
14343     simde__m128_private
14344       a_ = simde__m128_to_private(a),
14345       b_ = simde__m128_to_private(b);
14346     int r;
14347 
14348     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
14349       uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32);
14350       uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32);
14351       uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
14352       uint32x4_t a_ge_b = vcgeq_f32(a_.neon_f32, b_.neon_f32);
14353       r = !!(vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_ge_b), 0) != 0);
14354     #elif defined(SIMDE_HAVE_FENV_H)
14355       fenv_t envp;
14356       int x = feholdexcept(&envp);
14357       r = a_.f32[0] >= b_.f32[0];
14358       if (HEDLEY_LIKELY(x == 0))
14359         fesetenv(&envp);
14360     #else
14361       r = a_.f32[0] >= b_.f32[0];
14362     #endif
14363 
14364     return r;
14365   #endif
14366 }
14367 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
14368 #  define _mm_ucomige_ss(a, b) simde_mm_ucomige_ss((a), (b))
14369 #endif
14370 
14371 SIMDE_FUNCTION_ATTRIBUTES
14372 int
14373 simde_mm_ucomigt_ss (simde__m128 a, simde__m128 b) {
14374   #if defined(SIMDE_X86_SSE_NATIVE)
14375     return _mm_ucomigt_ss(a, b);
14376   #else
14377     simde__m128_private
14378       a_ = simde__m128_to_private(a),
14379       b_ = simde__m128_to_private(b);
14380     int r;
14381 
14382     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
14383       uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32);
14384       uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32);
14385       uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
14386       uint32x4_t a_gt_b = vcgtq_f32(a_.neon_f32, b_.neon_f32);
14387       r = !!(vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_gt_b), 0) != 0);
14388     #elif defined(SIMDE_HAVE_FENV_H)
14389       fenv_t envp;
14390       int x = feholdexcept(&envp);
14391       r = a_.f32[0] > b_.f32[0];
14392       if (HEDLEY_LIKELY(x == 0))
14393         fesetenv(&envp);
14394     #else
14395       r = a_.f32[0] > b_.f32[0];
14396     #endif
14397 
14398     return r;
14399   #endif
14400 }
14401 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
14402 #  define _mm_ucomigt_ss(a, b) simde_mm_ucomigt_ss((a), (b))
14403 #endif
14404 
14405 SIMDE_FUNCTION_ATTRIBUTES
14406 int
14407 simde_mm_ucomile_ss (simde__m128 a, simde__m128 b) {
14408   #if defined(SIMDE_X86_SSE_NATIVE)
14409     return _mm_ucomile_ss(a, b);
14410   #else
14411     simde__m128_private
14412       a_ = simde__m128_to_private(a),
14413       b_ = simde__m128_to_private(b);
14414     int r;
14415 
14416     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
14417       uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32);
14418       uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32);
14419       uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan));
14420       uint32x4_t a_le_b = vcleq_f32(a_.neon_f32, b_.neon_f32);
14421       r = !!(vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_le_b), 0) != 0);
14422     #elif defined(SIMDE_HAVE_FENV_H)
14423       fenv_t envp;
14424       int x = feholdexcept(&envp);
14425       r = a_.f32[0] <= b_.f32[0];
14426       if (HEDLEY_LIKELY(x == 0))
14427         fesetenv(&envp);
14428     #else
14429       r = a_.f32[0] <= b_.f32[0];
14430     #endif
14431 
14432     return r;
14433   #endif
14434 }
14435 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
14436 #  define _mm_ucomile_ss(a, b) simde_mm_ucomile_ss((a), (b))
14437 #endif
14438 
14439 SIMDE_FUNCTION_ATTRIBUTES
14440 int
14441 simde_mm_ucomilt_ss (simde__m128 a, simde__m128 b) {
14442   #if defined(SIMDE_X86_SSE_NATIVE)
14443     return _mm_ucomilt_ss(a, b);
14444   #else
14445     simde__m128_private
14446       a_ = simde__m128_to_private(a),
14447       b_ = simde__m128_to_private(b);
14448     int r;
14449 
14450     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
14451       uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32);
14452       uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32);
14453       uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan));
14454       uint32x4_t a_lt_b = vcltq_f32(a_.neon_f32, b_.neon_f32);
14455       r = !!(vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_lt_b), 0) != 0);
14456     #elif defined(SIMDE_HAVE_FENV_H)
14457       fenv_t envp;
14458       int x = feholdexcept(&envp);
14459       r = a_.f32[0] < b_.f32[0];
14460       if (HEDLEY_LIKELY(x == 0))
14461         fesetenv(&envp);
14462     #else
14463       r = a_.f32[0] < b_.f32[0];
14464     #endif
14465 
14466     return r;
14467   #endif
14468 }
14469 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
14470 #  define _mm_ucomilt_ss(a, b) simde_mm_ucomilt_ss((a), (b))
14471 #endif
14472 
14473 SIMDE_FUNCTION_ATTRIBUTES
14474 int
14475 simde_mm_ucomineq_ss (simde__m128 a, simde__m128 b) {
14476   #if defined(SIMDE_X86_SSE_NATIVE)
14477     return _mm_ucomineq_ss(a, b);
14478   #else
14479     simde__m128_private
14480       a_ = simde__m128_to_private(a),
14481       b_ = simde__m128_to_private(b);
14482     int r;
14483 
14484     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
14485       uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32);
14486       uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32);
14487       uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
14488       uint32x4_t a_neq_b = vmvnq_u32(vceqq_f32(a_.neon_f32, b_.neon_f32));
14489       r = !!(vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_neq_b), 0) != 0);
14490     #elif defined(SIMDE_HAVE_FENV_H)
14491       fenv_t envp;
14492       int x = feholdexcept(&envp);
14493       r = a_.f32[0] != b_.f32[0];
14494       if (HEDLEY_LIKELY(x == 0))
14495         fesetenv(&envp);
14496     #else
14497       r = a_.f32[0] != b_.f32[0];
14498     #endif
14499 
14500     return r;
14501   #endif
14502 }
14503 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
14504 #  define _mm_ucomineq_ss(a, b) simde_mm_ucomineq_ss((a), (b))
14505 #endif
14506 
14507 #if defined(SIMDE_X86_SSE_NATIVE)
14508 #  if defined(__has_builtin)
14509 #    if __has_builtin(__builtin_ia32_undef128)
14510 #      define SIMDE_HAVE_UNDEFINED128
14511 #    endif
14512 #  elif !defined(__PGI) && !defined(SIMDE_BUG_GCC_REV_208793) && !defined(_MSC_VER)
14513 #    define SIMDE_HAVE_UNDEFINED128
14514 #  endif
14515 #endif
14516 
14517 #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
14518   HEDLEY_DIAGNOSTIC_PUSH
14519   SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_
14520 #endif
14521 
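/* _mm_unpackhi_ps: interleave the upper halves of a and b -> { a[2], b[2], a[3], b[3] }. */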
14522 SIMDE_FUNCTION_ATTRIBUTES
14523 simde__m128
14524 simde_mm_unpackhi_ps (simde__m128 a, simde__m128 b) {
14525   #if defined(SIMDE_X86_SSE_NATIVE)
14526     return _mm_unpackhi_ps(a, b);
14527   #else
14528     simde__m128_private
14529       r_,
14530       a_ = simde__m128_to_private(a),
14531       b_ = simde__m128_to_private(b);
14532 
14533     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
14534       r_.neon_f32 = vzip2q_f32(a_.neon_f32, b_.neon_f32);
14535     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
14536       float32x2_t a1 = vget_high_f32(a_.neon_f32);
14537       float32x2_t b1 = vget_high_f32(b_.neon_f32);
14538       float32x2x2_t result = vzip_f32(a1, b1);
14539       r_.neon_f32 = vcombine_f32(result.val[0], result.val[1]);
14540     #elif defined(SIMDE_SHUFFLE_VECTOR_)
14541       r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, b_.f32, 2, 6, 3, 7);
14542     #else
14543       r_.f32[0] = a_.f32[2];
14544       r_.f32[1] = b_.f32[2];
14545       r_.f32[2] = a_.f32[3];
14546       r_.f32[3] = b_.f32[3];
14547     #endif
14548 
14549     return simde__m128_from_private(r_);
14550   #endif
14551 }
14552 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
14553 #  define _mm_unpackhi_ps(a, b) simde_mm_unpackhi_ps((a), (b))
14554 #endif
14555 
14556 SIMDE_FUNCTION_ATTRIBUTES
14557 simde__m128
14558 simde_mm_unpacklo_ps (simde__m128 a, simde__m128 b) {
14559   #if defined(SIMDE_X86_SSE_NATIVE)
14560     return _mm_unpacklo_ps(a, b);
14561   #else
14562     simde__m128_private
14563       r_,
14564       a_ = simde__m128_to_private(a),
14565       b_ = simde__m128_to_private(b);
14566 
14567     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
14568       r_.neon_f32 = vzip1q_f32(a_.neon_f32, b_.neon_f32);
14569     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
14570       r_.altivec_f32 = vec_mergeh(a_.altivec_f32, b_.altivec_f32);
14571     #elif defined(SIMDE_SHUFFLE_VECTOR_)
14572       r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, b_.f32, 0, 4, 1, 5);
14573     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
14574       float32x2_t a1 = vget_low_f32(a_.neon_f32);
14575       float32x2_t b1 = vget_low_f32(b_.neon_f32);
14576       float32x2x2_t result = vzip_f32(a1, b1);
14577       r_.neon_f32 = vcombine_f32(result.val[0], result.val[1]);
14578     #else
14579       r_.f32[0] = a_.f32[0];
14580       r_.f32[1] = b_.f32[0];
14581       r_.f32[2] = a_.f32[1];
14582       r_.f32[3] = b_.f32[1];
14583     #endif
14584 
14585     return simde__m128_from_private(r_);
14586   #endif
14587 }
14588 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
14589 #  define _mm_unpacklo_ps(a, b) simde_mm_unpacklo_ps((a), (b))
14590 #endif
14591 
14592 SIMDE_FUNCTION_ATTRIBUTES
14593 void
14594 simde_mm_stream_pi (simde__m64* mem_addr, simde__m64 a) {
14595   #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
14596     _mm_stream_pi(HEDLEY_REINTERPRET_CAST(__m64*, mem_addr), a);
14597   #else
14598     simde__m64_private*
14599       dest = HEDLEY_REINTERPRET_CAST(simde__m64_private*, mem_addr),
14600       a_ = simde__m64_to_private(a);
14601 
14602     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
14603       dest->i64[0] = vget_lane_s64(a_.neon_i64, 0);
14604     #else
14605       dest->i64[0] = a_.i64[0];
14606     #endif
14607   #endif
14608 }
14609 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
14610 #  define _mm_stream_pi(mem_addr, a) simde_mm_stream_pi(mem_addr, (a))
14611 #endif
14612 
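/* _mm_stream_ps: non-temporal (cache-bypassing) store where supported; otherwise an ordinary aligned store. */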
14613 SIMDE_FUNCTION_ATTRIBUTES
14614 void
14615 simde_mm_stream_ps (simde_float32 mem_addr[4], simde__m128 a) {
14616   #if defined(SIMDE_X86_SSE_NATIVE)
14617     _mm_stream_ps(mem_addr, a);
14618   #elif HEDLEY_HAS_BUILTIN(__builtin_nontemporal_store) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
14619     simde__m128_private a_ = simde__m128_to_private(a);
14620     __builtin_nontemporal_store(a_.f32, SIMDE_ALIGN_CAST(__typeof__(a_.f32)*, mem_addr));
14621   #else
14622     simde_mm_store_ps(mem_addr, a);
14623   #endif
14624 }
14625 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
14626 #  define _mm_stream_ps(mem_addr, a) simde_mm_stream_ps(SIMDE_CHECKED_REINTERPRET_CAST(float*, simde_float32*, mem_addr), (a))
14627 #endif
14628 
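/* SIMDE_MM_TRANSPOSE4_PS: transpose the 4x4 matrix held in four row vectors, updating the rows in place. */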
14629 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
14630 #define SIMDE_MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
14631   do {                                                  \
14632         float32x4x2_t ROW01 = vtrnq_f32(row0, row1);      \
14633         float32x4x2_t ROW23 = vtrnq_f32(row2, row3);      \
14634         row0 = vcombine_f32(vget_low_f32(ROW01.val[0]),   \
14635                             vget_low_f32(ROW23.val[0]));  \
14636         row1 = vcombine_f32(vget_low_f32(ROW01.val[1]),   \
14637                             vget_low_f32(ROW23.val[1]));  \
14638         row2 = vcombine_f32(vget_high_f32(ROW01.val[0]),  \
14639                             vget_high_f32(ROW23.val[0])); \
14640         row3 = vcombine_f32(vget_high_f32(ROW01.val[1]),  \
14641                             vget_high_f32(ROW23.val[1])); \
14642     } while (0)
14643 #else
14644 #define SIMDE_MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
14645   do { \
14646     simde__m128 tmp3, tmp2, tmp1, tmp0; \
14647     tmp0 = simde_mm_unpacklo_ps((row0), (row1)); \
14648     tmp2 = simde_mm_unpacklo_ps((row2), (row3)); \
14649     tmp1 = simde_mm_unpackhi_ps((row0), (row1)); \
14650     tmp3 = simde_mm_unpackhi_ps((row2), (row3)); \
14651     row0 = simde_mm_movelh_ps(tmp0, tmp2); \
14652     row1 = simde_mm_movehl_ps(tmp2, tmp0); \
14653     row2 = simde_mm_movelh_ps(tmp1, tmp3); \
14654     row3 = simde_mm_movehl_ps(tmp3, tmp1); \
14655   } while (0)
14656 #endif
14657 #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
14658 #  define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) SIMDE_MM_TRANSPOSE4_PS(row0, row1, row2, row3)
14659 #endif
14660 
14661 SIMDE_END_DECLS_
14662 
14663 HEDLEY_DIAGNOSTIC_POP
14664 
14665 #endif /* !defined(SIMDE_X86_SSE_H) */
14666 /* :: End ../simde/simde/x86/sse.h :: */
14667 #if !defined(SIMDE_X86_AVX_H)
14668 #define SIMDE_X86_AVX_H
14669 
14670 /* AUTOMATICALLY GENERATED FILE, DO NOT MODIFY */
14671 /* e8b7a2ec175ceb3725ce0827ef9a6725b6309cc9 */
14672 /* :: Begin ../simde/simde/x86/sse4.2.h :: */
14673 /* SPDX-License-Identifier: MIT
14674  *
14675  * Permission is hereby granted, free of charge, to any person
14676  * obtaining a copy of this software and associated documentation
14677  * files (the "Software"), to deal in the Software without
14678  * restriction, including without limitation the rights to use, copy,
14679  * modify, merge, publish, distribute, sublicense, and/or sell copies
14680  * of the Software, and to permit persons to whom the Software is
14681  * furnished to do so, subject to the following conditions:
14682  *
14683  * The above copyright notice and this permission notice shall be
14684  * included in all copies or substantial portions of the Software.
14685  *
14686  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
14687  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
14688  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
14689  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
14690  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
14691  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
14692  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
14693  * SOFTWARE.
14694  *
14695  * Copyright:
14696  *   2017      Evan Nemerson <evan@nemerson.com>
14697  *   2020      Hidayat Khan <huk2209@gmail.com>
14698  */
14699 
14700 #if !defined(SIMDE_X86_SSE4_2_H)
14701 #define SIMDE_X86_SSE4_2_H
14702 
14703 /* AUTOMATICALLY GENERATED FILE, DO NOT MODIFY */
14704 /* e8b7a2ec175ceb3725ce0827ef9a6725b6309cc9 */
14705 /* :: Begin ../simde/simde/x86/sse4.1.h :: */
14706 /* SPDX-License-Identifier: MIT
14707  *
14708  * Permission is hereby granted, free of charge, to any person
14709  * obtaining a copy of this software and associated documentation
14710  * files (the "Software"), to deal in the Software without
14711  * restriction, including without limitation the rights to use, copy,
14712  * modify, merge, publish, distribute, sublicense, and/or sell copies
14713  * of the Software, and to permit persons to whom the Software is
14714  * furnished to do so, subject to the following conditions:
14715  *
14716  * The above copyright notice and this permission notice shall be
14717  * included in all copies or substantial portions of the Software.
14718  *
14719  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
14720  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
14721  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
14722  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
14723  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
14724  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
14725  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
14726  * SOFTWARE.
14727  *
14728  * Copyright:
14729  *   2017-2020 Evan Nemerson <evan@nemerson.com>
14730  */
14731 
14732 /* AUTOMATICALLY GENERATED FILE, DO NOT MODIFY */
14733 /* e8b7a2ec175ceb3725ce0827ef9a6725b6309cc9 */
14734 #if !defined(SIMDE_X86_SSE4_1_H)
14735 #define SIMDE_X86_SSE4_1_H
14736 
14737 /* AUTOMATICALLY GENERATED FILE, DO NOT MODIFY */
14738 /* e8b7a2ec175ceb3725ce0827ef9a6725b6309cc9 */
14739 /* :: Begin ../simde/simde/x86/ssse3.h :: */
14740 /* SPDX-License-Identifier: MIT
14741  *
14742  * Permission is hereby granted, free of charge, to any person
14743  * obtaining a copy of this software and associated documentation
14744  * files (the "Software"), to deal in the Software without
14745  * restriction, including without limitation the rights to use, copy,
14746  * modify, merge, publish, distribute, sublicense, and/or sell copies
14747  * of the Software, and to permit persons to whom the Software is
14748  * furnished to do so, subject to the following conditions:
14749  *
14750  * The above copyright notice and this permission notice shall be
14751  * included in all copies or substantial portions of the Software.
14752  *
14753  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
14754  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
14755  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
14756  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
14757  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
14758  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
14759  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
14760  * SOFTWARE.
14761  *
14762  * Copyright:
14763  *   2017-2020 Evan Nemerson <evan@nemerson.com>
14764  */
14765 
14766 #if !defined(SIMDE_X86_SSSE3_H)
14767 #define SIMDE_X86_SSSE3_H
14768 
14769 /* AUTOMATICALLY GENERATED FILE, DO NOT MODIFY */
14770 /* e8b7a2ec175ceb3725ce0827ef9a6725b6309cc9 */
14771 /* :: Begin ../simde/simde/x86/sse3.h :: */
14772 /* SPDX-License-Identifier: MIT
14773  *
14774  * Permission is hereby granted, free of charge, to any person
14775  * obtaining a copy of this software and associated documentation
14776  * files (the "Software"), to deal in the Software without
14777  * restriction, including without limitation the rights to use, copy,
14778  * modify, merge, publish, distribute, sublicense, and/or sell copies
14779  * of the Software, and to permit persons to whom the Software is
14780  * furnished to do so, subject to the following conditions:
14781  *
14782  * The above copyright notice and this permission notice shall be
14783  * included in all copies or substantial portions of the Software.
14784  *
14785  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
14786  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
14787  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
14788  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
14789  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
14790  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
14791  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
14792  * SOFTWARE.
14793  *
14794  * Copyright:
14795  *   2017-2020 Evan Nemerson <evan@nemerson.com>
14796  */
14797 
14798 #if !defined(SIMDE_X86_SSE3_H)
14799 #define SIMDE_X86_SSE3_H
14800 
14801 /* AUTOMATICALLY GENERATED FILE, DO NOT MODIFY */
14802 /* e8b7a2ec175ceb3725ce0827ef9a6725b6309cc9 */
14803 /* :: Begin ../simde/simde/x86/sse2.h :: */
14804 /* SPDX-License-Identifier: MIT
14805  *
14806  * Permission is hereby granted, free of charge, to any person
14807  * obtaining a copy of this software and associated documentation
14808  * files (the "Software"), to deal in the Software without
14809  * restriction, including without limitation the rights to use, copy,
14810  * modify, merge, publish, distribute, sublicense, and/or sell copies
14811  * of the Software, and to permit persons to whom the Software is
14812  * furnished to do so, subject to the following conditions:
14813  *
14814  * The above copyright notice and this permission notice shall be
14815  * included in all copies or substantial portions of the Software.
14816  *
14817  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
14818  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
14819  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
14820  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
14821  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
14822  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
14823  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
14824  * SOFTWARE.
14825  *
14826  * Copyright:
14827  *   2017-2020 Evan Nemerson <evan@nemerson.com>
14828  *   2015-2017 John W. Ratcliff <jratcliffscarab@gmail.com>
14829  *   2015      Brandon Rowlett <browlett@nvidia.com>
14830  *   2015      Ken Fast <kfast@gdeb.com>
14831  *   2017      Hasindu Gamaarachchi <hasindu@unsw.edu.au>
14832  *   2018      Jeff Daily <jeff.daily@amd.com>
14833  */
14834 
14835 #if !defined(SIMDE_X86_SSE2_H)
14836 #define SIMDE_X86_SSE2_H
14837 
14838 /* AUTOMATICALLY GENERATED FILE, DO NOT MODIFY */
14839 /* e8b7a2ec175ceb3725ce0827ef9a6725b6309cc9 */
14840 
14841 HEDLEY_DIAGNOSTIC_PUSH
14842 SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
14843 SIMDE_BEGIN_DECLS_
14844 
14845 typedef union {
14846   #if defined(SIMDE_VECTOR_SUBSCRIPT)
14847     SIMDE_ALIGN_TO_16 int8_t          i8 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
14848     SIMDE_ALIGN_TO_16 int16_t        i16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
14849     SIMDE_ALIGN_TO_16 int32_t        i32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
14850     SIMDE_ALIGN_TO_16 int64_t        i64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
14851     SIMDE_ALIGN_TO_16 uint8_t         u8 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
14852     SIMDE_ALIGN_TO_16 uint16_t       u16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
14853     SIMDE_ALIGN_TO_16 uint32_t       u32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
14854     SIMDE_ALIGN_TO_16 uint64_t       u64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
14855     #if defined(SIMDE_HAVE_INT128_)
14856     SIMDE_ALIGN_TO_16 simde_int128  i128 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
14857     SIMDE_ALIGN_TO_16 simde_uint128 u128 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
14858     #endif
14859     SIMDE_ALIGN_TO_16 simde_float32  f32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
14860     SIMDE_ALIGN_TO_16 simde_float64  f64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
14861 
14862     SIMDE_ALIGN_TO_16 int_fast32_t  i32f SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
14863     SIMDE_ALIGN_TO_16 uint_fast32_t u32f SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
14864   #else
14865     SIMDE_ALIGN_TO_16 int8_t         i8[16];
14866     SIMDE_ALIGN_TO_16 int16_t        i16[8];
14867     SIMDE_ALIGN_TO_16 int32_t        i32[4];
14868     SIMDE_ALIGN_TO_16 int64_t        i64[2];
14869     SIMDE_ALIGN_TO_16 uint8_t        u8[16];
14870     SIMDE_ALIGN_TO_16 uint16_t       u16[8];
14871     SIMDE_ALIGN_TO_16 uint32_t       u32[4];
14872     SIMDE_ALIGN_TO_16 uint64_t       u64[2];
14873     #if defined(SIMDE_HAVE_INT128_)
14874     SIMDE_ALIGN_TO_16 simde_int128  i128[1];
14875     SIMDE_ALIGN_TO_16 simde_uint128 u128[1];
14876     #endif
14877     SIMDE_ALIGN_TO_16 simde_float32  f32[4];
14878     SIMDE_ALIGN_TO_16 simde_float64  f64[2];
14879 
14880     SIMDE_ALIGN_TO_16 int_fast32_t  i32f[16 / sizeof(int_fast32_t)];
14881     SIMDE_ALIGN_TO_16 uint_fast32_t u32f[16 / sizeof(uint_fast32_t)];
14882   #endif
14883 
14884     SIMDE_ALIGN_TO_16 simde__m64_private m64_private[2];
14885     SIMDE_ALIGN_TO_16 simde__m64         m64[2];
14886 
14887   #if defined(SIMDE_X86_SSE2_NATIVE)
14888     SIMDE_ALIGN_TO_16 __m128i        n;
14889   #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
14890     SIMDE_ALIGN_TO_16 int8x16_t      neon_i8;
14891     SIMDE_ALIGN_TO_16 int16x8_t      neon_i16;
14892     SIMDE_ALIGN_TO_16 int32x4_t      neon_i32;
14893     SIMDE_ALIGN_TO_16 int64x2_t      neon_i64;
14894     SIMDE_ALIGN_TO_16 uint8x16_t     neon_u8;
14895     SIMDE_ALIGN_TO_16 uint16x8_t     neon_u16;
14896     SIMDE_ALIGN_TO_16 uint32x4_t     neon_u32;
14897     SIMDE_ALIGN_TO_16 uint64x2_t     neon_u64;
14898     #if defined(__ARM_FP16_FORMAT_IEEE)
14899     SIMDE_ALIGN_TO_16 float16x8_t    neon_f16;
14900     #endif
14901     SIMDE_ALIGN_TO_16 float32x4_t    neon_f32;
14902     #if defined(SIMDE_ARCH_AARCH64)
14903     SIMDE_ALIGN_TO_16 float64x2_t    neon_f64;
14904     #endif
14905   #elif defined(SIMDE_WASM_SIMD128_NATIVE)
14906     SIMDE_ALIGN_TO_16 v128_t         wasm_v128;
14907   #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
14908     SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed char)          altivec_i8;
14909     SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed short)         altivec_i16;
14910     SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed int)           altivec_i32;
14911     #if defined(__UINT_FAST32_TYPE__) && (defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE))
14912       SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(__INT_FAST32_TYPE__)  altivec_i32f;
14913     #else
14914       SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed int)           altivec_i32f;
14915     #endif
14916     SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned char)        altivec_u8;
14917     SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned short)       altivec_u16;
14918     SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned int)         altivec_u32;
14919     #if defined(__UINT_FAST32_TYPE__) && (defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE))
14920       SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(__UINT_FAST32_TYPE__) altivec_u32f;
14921     #else
14922       SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned int)         altivec_u32f;
14923     #endif
14924       SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(float)                altivec_f32;
14925     #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
14926       SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed long long)   altivec_i64;
14927       SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long) altivec_u64;
14928       SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(double)             altivec_f64;
14929     #endif
14930   #endif
14931 } simde__m128i_private;
14932 
14933 typedef union {
14934   #if defined(SIMDE_VECTOR_SUBSCRIPT)
14935     SIMDE_ALIGN_TO_16 int8_t          i8 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
14936     SIMDE_ALIGN_TO_16 int16_t        i16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
14937     SIMDE_ALIGN_TO_16 int32_t        i32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
14938     SIMDE_ALIGN_TO_16 int64_t        i64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
14939     SIMDE_ALIGN_TO_16 uint8_t         u8 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
14940     SIMDE_ALIGN_TO_16 uint16_t       u16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
14941     SIMDE_ALIGN_TO_16 uint32_t       u32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
14942     SIMDE_ALIGN_TO_16 uint64_t       u64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
14943     SIMDE_ALIGN_TO_16 simde_float32  f32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
14944     SIMDE_ALIGN_TO_16 simde_float64  f64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
14945     SIMDE_ALIGN_TO_16 int_fast32_t  i32f SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
14946     SIMDE_ALIGN_TO_16 uint_fast32_t u32f SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
14947   #else
14948     SIMDE_ALIGN_TO_16 int8_t         i8[16];
14949     SIMDE_ALIGN_TO_16 int16_t        i16[8];
14950     SIMDE_ALIGN_TO_16 int32_t        i32[4];
14951     SIMDE_ALIGN_TO_16 int64_t        i64[2];
14952     SIMDE_ALIGN_TO_16 uint8_t        u8[16];
14953     SIMDE_ALIGN_TO_16 uint16_t       u16[8];
14954     SIMDE_ALIGN_TO_16 uint32_t       u32[4];
14955     SIMDE_ALIGN_TO_16 uint64_t       u64[2];
14956     SIMDE_ALIGN_TO_16 simde_float32  f32[4];
14957     SIMDE_ALIGN_TO_16 simde_float64  f64[2];
14958     SIMDE_ALIGN_TO_16 int_fast32_t  i32f[16 / sizeof(int_fast32_t)];
14959     SIMDE_ALIGN_TO_16 uint_fast32_t u32f[16 / sizeof(uint_fast32_t)];
14960   #endif
14961 
14962     SIMDE_ALIGN_TO_16 simde__m64_private m64_private[2];
14963     SIMDE_ALIGN_TO_16 simde__m64         m64[2];
14964 
14965   #if defined(SIMDE_X86_SSE2_NATIVE)
14966     SIMDE_ALIGN_TO_16 __m128d        n;
14967   #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
14968     SIMDE_ALIGN_TO_16 int8x16_t      neon_i8;
14969     SIMDE_ALIGN_TO_16 int16x8_t      neon_i16;
14970     SIMDE_ALIGN_TO_16 int32x4_t      neon_i32;
14971     SIMDE_ALIGN_TO_16 int64x2_t      neon_i64;
14972     SIMDE_ALIGN_TO_16 uint8x16_t     neon_u8;
14973     SIMDE_ALIGN_TO_16 uint16x8_t     neon_u16;
14974     SIMDE_ALIGN_TO_16 uint32x4_t     neon_u32;
14975     SIMDE_ALIGN_TO_16 uint64x2_t     neon_u64;
14976     SIMDE_ALIGN_TO_16 float32x4_t    neon_f32;
14977     #if defined(SIMDE_ARCH_AARCH64)
14978     SIMDE_ALIGN_TO_16 float64x2_t    neon_f64;
14979     #endif
14980   #elif defined(SIMDE_WASM_SIMD128_NATIVE)
14981     SIMDE_ALIGN_TO_16 v128_t         wasm_v128;
14982   #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
14983     SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed char)          altivec_i8;
14984     SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed short)         altivec_i16;
14985     SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed int)           altivec_i32;
14986     #if defined(__INT_FAST32_TYPE__) && (defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE))
14987       SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(__INT_FAST32_TYPE__)  altivec_i32f;
14988     #else
14989       SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed int)           altivec_i32f;
14990     #endif
14991     SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned char)        altivec_u8;
14992     SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned short)       altivec_u16;
14993     SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned int)         altivec_u32;
14994     #if defined(__UINT_FAST32_TYPE__) && (defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE))
14995       SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(__UINT_FAST32_TYPE__) altivec_u32f;
14996     #else
14997       SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned int)         altivec_u32f;
14998     #endif
14999     SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(float)                altivec_f32;
15000     #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
15001       SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed long long)   altivec_i64;
15002       SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long) altivec_u64;
15003       SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(double)             altivec_f64;
15004     #endif
15005   #endif
15006 } simde__m128d_private;
15007 
15008 #if defined(SIMDE_X86_SSE2_NATIVE)
15009   typedef __m128i simde__m128i;
15010   typedef __m128d simde__m128d;
15011 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
15012    typedef int64x2_t simde__m128i;
15013 #  if defined(SIMDE_ARCH_AARCH64)
15014      typedef float64x2_t simde__m128d;
15015 #  elif defined(SIMDE_VECTOR_SUBSCRIPT)
15016      typedef simde_float64 simde__m128d SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
15017 #  else
15018      typedef simde__m128d_private simde__m128d;
15019 #  endif
15020 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
15021    typedef v128_t simde__m128i;
15022    typedef v128_t simde__m128d;
15023 #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
15024   typedef SIMDE_POWER_ALTIVEC_VECTOR(float) simde__m128i;
15025   #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
15026      typedef SIMDE_POWER_ALTIVEC_VECTOR(double) simde__m128d;
15027   #else
15028      typedef simde__m128d_private simde__m128d;
15029   #endif
15030 #elif defined(SIMDE_VECTOR_SUBSCRIPT)
15031   typedef int64_t simde__m128i SIMDE_ALIGN_TO_16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
15032   typedef simde_float64 simde__m128d SIMDE_ALIGN_TO_16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
15033 #else
15034   typedef simde__m128i_private simde__m128i;
15035   typedef simde__m128d_private simde__m128d;
15036 #endif
15037 
15038 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
15039   typedef simde__m128i __m128i;
15040   typedef simde__m128d __m128d;
15041 #endif
15042 
15043 HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128i), "simde__m128i size incorrect");
15044 HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128i_private), "simde__m128i_private size incorrect");
15045 HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128d), "simde__m128d size incorrect");
15046 HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128d_private), "simde__m128d_private size incorrect");
15047 #if defined(SIMDE_CHECK_ALIGNMENT) && defined(SIMDE_ALIGN_OF)
15048 HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m128i) == 16, "simde__m128i is not 16-byte aligned");
15049 HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m128i_private) == 16, "simde__m128i_private is not 16-byte aligned");
15050 HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m128d) == 16, "simde__m128d is not 16-byte aligned");
15051 HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m128d_private) == 16, "simde__m128d_private is not 16-byte aligned");
15052 #endif
15053 
15054 SIMDE_FUNCTION_ATTRIBUTES
15055 simde__m128i
15056 simde__m128i_from_private(simde__m128i_private v) {
15057   simde__m128i r;
15058   simde_memcpy(&r, &v, sizeof(r));
15059   return r;
15060 }
15061 
15062 SIMDE_FUNCTION_ATTRIBUTES
15063 simde__m128i_private
15064 simde__m128i_to_private(simde__m128i v) {
15065   simde__m128i_private r;
15066   simde_memcpy(&r, &v, sizeof(r));
15067   return r;
15068 }
15069 
15070 SIMDE_FUNCTION_ATTRIBUTES
15071 simde__m128d
15072 simde__m128d_from_private(simde__m128d_private v) {
15073   simde__m128d r;
15074   simde_memcpy(&r, &v, sizeof(r));
15075   return r;
15076 }
15077 
15078 SIMDE_FUNCTION_ATTRIBUTES
15079 simde__m128d_private
15080 simde__m128d_to_private(simde__m128d v) {
15081   simde__m128d_private r;
15082   simde_memcpy(&r, &v, sizeof(r));
15083   return r;
15084 }
15085 
15086 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
15087   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, int8x16_t, neon, i8)
15088   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, int16x8_t, neon, i16)
15089   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, int32x4_t, neon, i32)
15090   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, int64x2_t, neon, i64)
15091   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, uint8x16_t, neon, u8)
15092   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, uint16x8_t, neon, u16)
15093   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, uint32x4_t, neon, u32)
15094   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, uint64x2_t, neon, u64)
15095   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, float32x4_t, neon, f32)
15096   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
15097     SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, float64x2_t, neon, f64)
15098   #endif
15099 #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
15100   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(signed char), altivec, i8)
15101   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(signed short), altivec, i16)
15102   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(signed int), altivec, i32)
15103   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), altivec, u8)
15104   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(unsigned short), altivec, u16)
15105   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(unsigned int), altivec, u32)
15106   #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
15107     SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long), altivec, u64)
15108     SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(signed long long), altivec, i64)
15109   #endif
15110 #endif /* defined(SIMDE_ARM_NEON_A32V7_NATIVE) */
15111 
15112 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
15113   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, int8x16_t, neon, i8)
15114   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, int16x8_t, neon, i16)
15115   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, int32x4_t, neon, i32)
15116   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, int64x2_t, neon, i64)
15117   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, uint8x16_t, neon, u8)
15118   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, uint16x8_t, neon, u16)
15119   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, uint32x4_t, neon, u32)
15120   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, uint64x2_t, neon, u64)
15121   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, float32x4_t, neon, f32)
15122   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
15123     SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, float64x2_t, neon, f64)
15124   #endif
15125 #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
15126   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(signed char), altivec, i8)
15127   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(signed short), altivec, i16)
15128   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(signed int), altivec, i32)
15129   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), altivec, u8)
15130   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(unsigned short), altivec, u16)
15131   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(unsigned int), altivec, u32)
15132   #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
15133     SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long), altivec, u64)
15134     SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(signed long long), altivec, i64)
15135     #if defined(SIMDE_BUG_GCC_95782)
15136       SIMDE_FUNCTION_ATTRIBUTES
15137       SIMDE_POWER_ALTIVEC_VECTOR(double)
15138       simde__m128d_to_altivec_f64(simde__m128d value) {
15139         simde__m128d_private r_ = simde__m128d_to_private(value);
15140         return r_.altivec_f64;
15141       }
15142 
15143       SIMDE_FUNCTION_ATTRIBUTES
15144       simde__m128d
15145       simde__m128d_from_altivec_f64(SIMDE_POWER_ALTIVEC_VECTOR(double) value) {
15146         simde__m128d_private r_;
15147         r_.altivec_f64 = value;
15148         return simde__m128d_from_private(r_);
15149       }
15150     #else
15151       SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(double), altivec, f64)
15152     #endif
15153   #endif
15154 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
15155   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, v128_t, wasm, v128)
15156   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, v128_t, wasm, v128)
15157 #endif /* defined(SIMDE_ARM_NEON_A32V7_NATIVE) */
15158 
15159 SIMDE_FUNCTION_ATTRIBUTES
15160 simde__m128d
15161 simde_mm_set_pd (simde_float64 e1, simde_float64 e0) {
15162   #if defined(SIMDE_X86_SSE2_NATIVE)
15163     return _mm_set_pd(e1, e0);
15164   #else
15165     simde__m128d_private r_;
15166 
15167     #if defined(SIMDE_WASM_SIMD128_NATIVE)
15168       r_.wasm_v128 = wasm_f64x2_make(e0, e1);
15169     #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
15170       SIMDE_ALIGN_TO_16 simde_float64 data[2] = { e0, e1 };
15171       r_.neon_f64 = vld1q_f64(data);
15172     #else
15173       r_.f64[0] = e0;
15174       r_.f64[1] = e1;
15175     #endif
15176 
15177     return simde__m128d_from_private(r_);
15178   #endif
15179 }
15180 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
15181   #define _mm_set_pd(e1, e0) simde_mm_set_pd(e1, e0)
15182 #endif
15183 
15184 SIMDE_FUNCTION_ATTRIBUTES
15185 simde__m128d
15186 simde_mm_set1_pd (simde_float64 a) {
15187   #if defined(SIMDE_X86_SSE2_NATIVE)
15188     return _mm_set1_pd(a);
15189   #else
15190     simde__m128d_private r_;
15191 
15192     #if defined(SIMDE_WASM_SIMD128_NATIVE)
15193       r_.wasm_v128 = wasm_f64x2_splat(a);
15194     #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
15195       r_.neon_f64 = vdupq_n_f64(a);
15196     #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
15197       r_.altivec_f64 = vec_splats(HEDLEY_STATIC_CAST(double, a));
15198     #else
15199       SIMDE_VECTORIZE
15200       for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
15201         r_.f64[i] = a;
15202       }
15203     #endif
15204 
15205     return simde__m128d_from_private(r_);
15206   #endif
15207 }
15208 #define simde_mm_set_pd1(a) simde_mm_set1_pd(a)
15209 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
15210   #define _mm_set1_pd(a) simde_mm_set1_pd(a)
15211   #define _mm_set_pd1(a) simde_mm_set1_pd(a)
15212 #endif
15213 
15214 SIMDE_FUNCTION_ATTRIBUTES
15215 simde__m128d
15216 simde_x_mm_abs_pd(simde__m128d a) {
15217   #if defined(SIMDE_X86_SSE2_NATIVE)
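    /* Absolute value by masking off the IEEE-754 sign bit: bit 63 of the
     * 0x7FFF... mask is clear, so ANDing preserves the magnitude and drops
     * the sign. */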
15218     simde_float64 mask_;
15219     uint64_t u64_ = UINT64_C(0x7FFFFFFFFFFFFFFF);
15220     simde_memcpy(&mask_, &u64_, sizeof(u64_));
15221     return _mm_and_pd(_mm_set1_pd(mask_), a);
15222   #else
15223     simde__m128d_private
15224       r_,
15225       a_ = simde__m128d_to_private(a);
15226 
15227     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
15228       r_.neon_f64 = vabsq_f64(a_.neon_f64);
15229     #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
15230       r_.altivec_f64 = vec_abs(a_.altivec_f64);
15231     #else
15232       SIMDE_VECTORIZE
15233       for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
15234         r_.f64[i] = simde_math_fabs(a_.f64[i]);
15235       }
15236     #endif
15237 
15238     return simde__m128d_from_private(r_);
15239   #endif
15240 }
15241 
15242 SIMDE_FUNCTION_ATTRIBUTES
15243 simde__m128d
15244 simde_x_mm_not_pd(simde__m128d a) {
15245   #if defined(SIMDE_X86_AVX512VL_NATIVE)
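    /* Truth table 0x55 (0b01010101) is ~A, so ternarylogic computes the
     * bitwise NOT of the register in a single instruction. */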
15246     __m128i ai = _mm_castpd_si128(a);
15247     return _mm_castsi128_pd(_mm_ternarylogic_epi64(ai, ai, ai, 0x55));
15248   #else
15249     simde__m128d_private
15250       r_,
15251       a_ = simde__m128d_to_private(a);
15252 
15253     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
15254       r_.neon_i32 = vmvnq_s32(a_.neon_i32);
15255     #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
15256       r_.altivec_f64 = vec_nor(a_.altivec_f64, a_.altivec_f64);
15257     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
15258       r_.altivec_i32 = vec_nor(a_.altivec_i32, a_.altivec_i32);
15259     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
15260       r_.wasm_v128 = wasm_v128_not(a_.wasm_v128);
15261     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
15262       r_.i32f = ~a_.i32f;
15263     #else
15264       SIMDE_VECTORIZE
15265       for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
15266         r_.i32f[i] = ~(a_.i32f[i]);
15267       }
15268     #endif
15269 
15270     return simde__m128d_from_private(r_);
15271   #endif
15272 }
15273 
15274 SIMDE_FUNCTION_ATTRIBUTES
15275 simde__m128d
15276 simde_x_mm_select_pd(simde__m128d a, simde__m128d b, simde__m128d mask) {
15277   /* This function is for when you want to blend two elements together
15278    * according to a mask.  It is similar to _mm_blendv_pd, except that
15279    * it is undefined whether the blend is based on the highest bit in
15280    * each lane (like blendv) or just bitwise operations.  This allows
15281    * us to implement the function efficiently everywhere.
15282    *
15283    * Basically, you promise that all the lanes in mask are either 0 or
15284    * ~0. */
15285   #if defined(SIMDE_X86_SSE4_1_NATIVE)
15286     return _mm_blendv_pd(a, b, mask);
15287   #else
15288     simde__m128d_private
15289       r_,
15290       a_ = simde__m128d_to_private(a),
15291       b_ = simde__m128d_to_private(b),
15292       mask_ = simde__m128d_to_private(mask);
15293 
15294     #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
15295       r_.i64 = a_.i64 ^ ((a_.i64 ^ b_.i64) & mask_.i64);
15296     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
15297       r_.neon_i64 = vbslq_s64(mask_.neon_u64, b_.neon_i64, a_.neon_i64);
15298     #else
15299       SIMDE_VECTORIZE
15300       for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
15301         r_.i64[i] = a_.i64[i] ^ ((a_.i64[i] ^ b_.i64[i]) & mask_.i64[i]);
15302       }
15303     #endif
15304 
15305     return simde__m128d_from_private(r_);
15306   #endif
15307 }
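
/* Illustrative usage (not part of the generated header): the mask is expected
 * to come from a comparison, so every lane is already 0 or ~0.  A sketch:
 *
 *   simde__m128d lo   = simde_mm_set_pd(1.0, 2.0);
 *   simde__m128d hi   = simde_mm_set_pd(3.0, 4.0);
 *   simde__m128d mask = simde_mm_cmpeq_pd(lo, lo);            // all lanes ~0
 *   simde__m128d r    = simde_x_mm_select_pd(lo, hi, mask);   // == hi
 */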
15308 
15309 SIMDE_FUNCTION_ATTRIBUTES
15310 simde__m128i
15311 simde_mm_add_epi8 (simde__m128i a, simde__m128i b) {
15312   #if defined(SIMDE_X86_SSE2_NATIVE)
15313     return _mm_add_epi8(a, b);
15314   #else
15315     simde__m128i_private
15316       r_,
15317       a_ = simde__m128i_to_private(a),
15318       b_ = simde__m128i_to_private(b);
15319 
15320     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
15321       r_.neon_i8 = vaddq_s8(a_.neon_i8, b_.neon_i8);
15322     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
15323       r_.altivec_i8 = vec_add(a_.altivec_i8, b_.altivec_i8);
15324     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
15325       r_.wasm_v128 = wasm_i8x16_add(a_.wasm_v128, b_.wasm_v128);
15326     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
15327       r_.i8 = a_.i8 + b_.i8;
15328     #else
15329       SIMDE_VECTORIZE
15330       for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
15331         r_.i8[i] = a_.i8[i] + b_.i8[i];
15332       }
15333     #endif
15334 
15335     return simde__m128i_from_private(r_);
15336   #endif
15337 }
15338 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
15339   #define _mm_add_epi8(a, b) simde_mm_add_epi8(a, b)
15340 #endif
15341 
15342 SIMDE_FUNCTION_ATTRIBUTES
15343 simde__m128i
15344 simde_mm_add_epi16 (simde__m128i a, simde__m128i b) {
15345   #if defined(SIMDE_X86_SSE2_NATIVE)
15346     return _mm_add_epi16(a, b);
15347   #else
15348     simde__m128i_private
15349       r_,
15350       a_ = simde__m128i_to_private(a),
15351       b_ = simde__m128i_to_private(b);
15352 
15353     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
15354       r_.neon_i16 = vaddq_s16(a_.neon_i16, b_.neon_i16);
15355     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
15356       r_.altivec_i16 = vec_add(a_.altivec_i16, b_.altivec_i16);
15357     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
15358       r_.wasm_v128 = wasm_i16x8_add(a_.wasm_v128, b_.wasm_v128);
15359     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
15360       r_.i16 = a_.i16 + b_.i16;
15361     #else
15362       SIMDE_VECTORIZE
15363       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
15364         r_.i16[i] = a_.i16[i] + b_.i16[i];
15365       }
15366     #endif
15367 
15368     return simde__m128i_from_private(r_);
15369   #endif
15370 }
15371 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
15372   #define _mm_add_epi16(a, b) simde_mm_add_epi16(a, b)
15373 #endif
15374 
15375 SIMDE_FUNCTION_ATTRIBUTES
15376 simde__m128i
15377 simde_mm_add_epi32 (simde__m128i a, simde__m128i b) {
15378   #if defined(SIMDE_X86_SSE2_NATIVE)
15379     return _mm_add_epi32(a, b);
15380   #else
15381     simde__m128i_private
15382       r_,
15383       a_ = simde__m128i_to_private(a),
15384       b_ = simde__m128i_to_private(b);
15385 
15386     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
15387       r_.neon_i32 = vaddq_s32(a_.neon_i32, b_.neon_i32);
15388     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
15389       r_.altivec_i32 = vec_add(a_.altivec_i32, b_.altivec_i32);
15390     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
15391       r_.wasm_v128 = wasm_i32x4_add(a_.wasm_v128, b_.wasm_v128);
15392     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
15393       r_.i32 = a_.i32 + b_.i32;
15394     #else
15395       SIMDE_VECTORIZE
15396       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
15397         r_.i32[i] = a_.i32[i] + b_.i32[i];
15398       }
15399     #endif
15400 
15401     return simde__m128i_from_private(r_);
15402   #endif
15403 }
15404 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
15405   #define _mm_add_epi32(a, b) simde_mm_add_epi32(a, b)
15406 #endif
15407 
15408 SIMDE_FUNCTION_ATTRIBUTES
15409 simde__m128i
15410 simde_mm_add_epi64 (simde__m128i a, simde__m128i b) {
15411   #if defined(SIMDE_X86_SSE2_NATIVE)
15412     return _mm_add_epi64(a, b);
15413   #else
15414     simde__m128i_private
15415       r_,
15416       a_ = simde__m128i_to_private(a),
15417       b_ = simde__m128i_to_private(b);
15418 
15419     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
15420       r_.neon_i64 = vaddq_s64(a_.neon_i64, b_.neon_i64);
15421     #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
15422       r_.altivec_i64 = vec_add(a_.altivec_i64, b_.altivec_i64);
15423     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
15424       r_.wasm_v128 = wasm_i64x2_add(a_.wasm_v128, b_.wasm_v128);
15425     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
15426       r_.i64 = a_.i64 + b_.i64;
15427     #else
15428       SIMDE_VECTORIZE
15429       for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
15430         r_.i64[i] = a_.i64[i] + b_.i64[i];
15431       }
15432     #endif
15433 
15434     return simde__m128i_from_private(r_);
15435   #endif
15436 }
15437 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
15438   #define _mm_add_epi64(a, b) simde_mm_add_epi64(a, b)
15439 #endif
15440 
15441 SIMDE_FUNCTION_ATTRIBUTES
15442 simde__m128d
15443 simde_mm_add_pd (simde__m128d a, simde__m128d b) {
15444   #if defined(SIMDE_X86_SSE2_NATIVE)
15445     return _mm_add_pd(a, b);
15446   #else
15447     simde__m128d_private
15448       r_,
15449       a_ = simde__m128d_to_private(a),
15450       b_ = simde__m128d_to_private(b);
15451 
15452     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
15453       r_.neon_f64 = vaddq_f64(a_.neon_f64, b_.neon_f64);
15454     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
15455       r_.wasm_v128 = wasm_f64x2_add(a_.wasm_v128, b_.wasm_v128);
15456     #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
15457       r_.altivec_f64 = vec_add(a_.altivec_f64, b_.altivec_f64);
15460     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
15461       r_.f64 = a_.f64 + b_.f64;
15462     #else
15463       SIMDE_VECTORIZE
15464       for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
15465         r_.f64[i] = a_.f64[i] + b_.f64[i];
15466       }
15467     #endif
15468 
15469     return simde__m128d_from_private(r_);
15470   #endif
15471 }
15472 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
15473   #define _mm_add_pd(a, b) simde_mm_add_pd(a, b)
15474 #endif
15475 
15476 SIMDE_FUNCTION_ATTRIBUTES
15477 simde__m128d
15478 simde_mm_move_sd (simde__m128d a, simde__m128d b) {
15479   #if defined(SIMDE_X86_SSE2_NATIVE)
15480     return _mm_move_sd(a, b);
15481   #else
15482     simde__m128d_private
15483       r_,
15484       a_ = simde__m128d_to_private(a),
15485       b_ = simde__m128d_to_private(b);
15486 
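    /* _mm_move_sd semantics: the low lane comes from b, the high lane from a. */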
15487     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
15488       r_.neon_f64 = vsetq_lane_f64(vgetq_lane_f64(b_.neon_f64, 0), a_.neon_f64, 0);
15489     #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
15490       #if defined(HEDLEY_IBM_VERSION)
15491         r_.altivec_f64 = vec_xxpermdi(a_.altivec_f64, b_.altivec_f64, 1);
15492       #else
15493         r_.altivec_f64 = vec_xxpermdi(b_.altivec_f64, a_.altivec_f64, 1);
15494       #endif
15495     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
15496       r_.wasm_v128 = wasm_v64x2_shuffle(a_.wasm_v128, b_.wasm_v128, 2, 1);
15497     #elif defined(SIMDE_SHUFFLE_VECTOR_)
15498       r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64, b_.f64, 2, 1);
15499     #else
15500       r_.f64[0] = b_.f64[0];
15501       r_.f64[1] = a_.f64[1];
15502     #endif
15503 
15504     return simde__m128d_from_private(r_);
15505   #endif
15506 }
15507 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
15508   #define _mm_move_sd(a, b) simde_mm_move_sd(a, b)
15509 #endif
15510 
15511 SIMDE_FUNCTION_ATTRIBUTES
15512 simde__m128d
15513 simde_mm_add_sd (simde__m128d a, simde__m128d b) {
15514   #if defined(SIMDE_X86_SSE2_NATIVE)
15515     return _mm_add_sd(a, b);
15516   #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
15517     return simde_mm_move_sd(a, simde_mm_add_pd(a, b));
15518   #else
15519     simde__m128d_private
15520       r_,
15521       a_ = simde__m128d_to_private(a),
15522       b_ = simde__m128d_to_private(b);
15523 
15524     r_.f64[0] = a_.f64[0] + b_.f64[0];
15525     r_.f64[1] = a_.f64[1];
15526 
15527     return simde__m128d_from_private(r_);
15528   #endif
15529 }
15530 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
15531   #define _mm_add_sd(a, b) simde_mm_add_sd(a, b)
15532 #endif
15533 
15534 SIMDE_FUNCTION_ATTRIBUTES
15535 simde__m64
15536 simde_mm_add_si64 (simde__m64 a, simde__m64 b) {
15537   #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
15538     return _mm_add_si64(a, b);
15539   #else
15540     simde__m64_private
15541       r_,
15542       a_ = simde__m64_to_private(a),
15543       b_ = simde__m64_to_private(b);
15544 
15545     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
15546       r_.neon_i64 = vadd_s64(a_.neon_i64, b_.neon_i64);
15547     #else
15548       r_.i64[0] = a_.i64[0] + b_.i64[0];
15549     #endif
15550 
15551     return simde__m64_from_private(r_);
15552   #endif
15553 }
15554 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
15555   #define _mm_add_si64(a, b) simde_mm_add_si64(a, b)
15556 #endif
15557 
15558 SIMDE_FUNCTION_ATTRIBUTES
15559 simde__m128i
15560 simde_mm_adds_epi8 (simde__m128i a, simde__m128i b) {
15561   #if defined(SIMDE_X86_SSE2_NATIVE)
15562     return _mm_adds_epi8(a, b);
15563   #else
15564     simde__m128i_private
15565       r_,
15566       a_ = simde__m128i_to_private(a),
15567       b_ = simde__m128i_to_private(b);
15568 
15569     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
15570       r_.neon_i8 = vqaddq_s8(a_.neon_i8, b_.neon_i8);
15571     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
15572       r_.wasm_v128 = wasm_i8x16_add_saturate(a_.wasm_v128, b_.wasm_v128);
15573     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
15574       r_.altivec_i8 = vec_adds(a_.altivec_i8, b_.altivec_i8);
15575     #else
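      /* Portable fallback: widen to at least 16 bits so the sum cannot
       * overflow, then clamp to [INT8_MIN, INT8_MAX]. */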
15576       SIMDE_VECTORIZE
15577       for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
15578         const int_fast16_t tmp =
15579           HEDLEY_STATIC_CAST(int_fast16_t, a_.i8[i]) +
15580           HEDLEY_STATIC_CAST(int_fast16_t, b_.i8[i]);
15581         r_.i8[i] = HEDLEY_STATIC_CAST(int8_t, ((tmp < INT8_MAX) ? ((tmp > INT8_MIN) ? tmp : INT8_MIN) : INT8_MAX));
15582       }
15583     #endif
15584 
15585     return simde__m128i_from_private(r_);
15586   #endif
15587 }
15588 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
15589   #define _mm_adds_epi8(a, b) simde_mm_adds_epi8(a, b)
15590 #endif
15591 
15592 SIMDE_FUNCTION_ATTRIBUTES
15593 simde__m128i
15594 simde_mm_adds_epi16 (simde__m128i a, simde__m128i b) {
15595   #if defined(SIMDE_X86_SSE2_NATIVE)
15596     return _mm_adds_epi16(a, b);
15597   #else
15598     simde__m128i_private
15599       r_,
15600       a_ = simde__m128i_to_private(a),
15601       b_ = simde__m128i_to_private(b);
15602 
15603     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
15604       r_.neon_i16 = vqaddq_s16(a_.neon_i16, b_.neon_i16);
15605     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
15606       r_.wasm_v128 = wasm_i16x8_add_saturate(a_.wasm_v128, b_.wasm_v128);
15607     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
15608       r_.altivec_i16 = vec_adds(a_.altivec_i16, b_.altivec_i16);
15609     #else
15610       SIMDE_VECTORIZE
15611       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
15612         const int_fast32_t tmp =
15613           HEDLEY_STATIC_CAST(int_fast32_t, a_.i16[i]) +
15614           HEDLEY_STATIC_CAST(int_fast32_t, b_.i16[i]);
15615         r_.i16[i] = HEDLEY_STATIC_CAST(int16_t, ((tmp < INT16_MAX) ? ((tmp > INT16_MIN) ? tmp : INT16_MIN) : INT16_MAX));
15616       }
15617     #endif
15618 
15619     return simde__m128i_from_private(r_);
15620   #endif
15621 }
15622 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
15623   #define _mm_adds_epi16(a, b) simde_mm_adds_epi16(a, b)
15624 #endif
15625 
15626 SIMDE_FUNCTION_ATTRIBUTES
15627 simde__m128i
15628 simde_mm_adds_epu8 (simde__m128i a, simde__m128i b) {
15629   #if defined(SIMDE_X86_SSE2_NATIVE)
15630     return _mm_adds_epu8(a, b);
15631   #else
15632     simde__m128i_private
15633       r_,
15634       a_ = simde__m128i_to_private(a),
15635       b_ = simde__m128i_to_private(b);
15636 
15637     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
15638       r_.neon_u8 = vqaddq_u8(a_.neon_u8, b_.neon_u8);
15639     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
15640       r_.wasm_v128 = wasm_u8x16_add_saturate(a_.wasm_v128, b_.wasm_v128);
15641     #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
15642       r_.altivec_u8 = vec_adds(a_.altivec_u8, b_.altivec_u8);
15643     #else
15644       SIMDE_VECTORIZE
15645       for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
15646         r_.u8[i] = ((UINT8_MAX - a_.u8[i]) > b_.u8[i]) ? (a_.u8[i] + b_.u8[i]) : UINT8_MAX;
15647       }
15648     #endif
15649 
15650     return simde__m128i_from_private(r_);
15651   #endif
15652 }
15653 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
15654   #define _mm_adds_epu8(a, b) simde_mm_adds_epu8(a, b)
15655 #endif
15656 
15657 SIMDE_FUNCTION_ATTRIBUTES
15658 simde__m128i
15659 simde_mm_adds_epu16 (simde__m128i a, simde__m128i b) {
15660   #if defined(SIMDE_X86_SSE2_NATIVE)
15661     return _mm_adds_epu16(a, b);
15662   #else
15663     simde__m128i_private
15664       r_,
15665       a_ = simde__m128i_to_private(a),
15666       b_ = simde__m128i_to_private(b);
15667 
15668     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
15669       r_.neon_u16 = vqaddq_u16(a_.neon_u16, b_.neon_u16);
15670     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
15671       r_.wasm_v128 = wasm_u16x8_add_saturate(a_.wasm_v128, b_.wasm_v128);
15672     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
15673       r_.altivec_u16 = vec_adds(a_.altivec_u16, b_.altivec_u16);
15674     #else
15675       SIMDE_VECTORIZE
15676       for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
15677         r_.u16[i] = ((UINT16_MAX - a_.u16[i]) > b_.u16[i]) ? (a_.u16[i] + b_.u16[i]) : UINT16_MAX;
15678       }
15679     #endif
15680 
15681     return simde__m128i_from_private(r_);
15682   #endif
15683 }
15684 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
15685   #define _mm_adds_epu16(a, b) simde_mm_adds_epu16(a, b)
15686 #endif
15687 
15688 SIMDE_FUNCTION_ATTRIBUTES
15689 simde__m128d
15690 simde_mm_and_pd (simde__m128d a, simde__m128d b) {
15691   #if defined(SIMDE_X86_SSE2_NATIVE)
15692     return _mm_and_pd(a, b);
15693   #else
15694     simde__m128d_private
15695       r_,
15696       a_ = simde__m128d_to_private(a),
15697       b_ = simde__m128d_to_private(b);
15698 
15699     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
15700       r_.neon_i32 = vandq_s32(a_.neon_i32, b_.neon_i32);
15701     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
15702       r_.wasm_v128 = wasm_v128_and(a_.wasm_v128, b_.wasm_v128);
15703     #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
15704       r_.altivec_f64 = vec_and(a_.altivec_f64, b_.altivec_f64);
15705     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
15706       r_.i32f = a_.i32f & b_.i32f;
15707     #else
15708       SIMDE_VECTORIZE
15709       for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
15710         r_.i32f[i] = a_.i32f[i] & b_.i32f[i];
15711       }
15712     #endif
15713 
15714     return simde__m128d_from_private(r_);
15715   #endif
15716 }
15717 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
15718   #define _mm_and_pd(a, b) simde_mm_and_pd(a, b)
15719 #endif
15720 
15721 SIMDE_FUNCTION_ATTRIBUTES
15722 simde__m128i
15723 simde_mm_and_si128 (simde__m128i a, simde__m128i b) {
15724   #if defined(SIMDE_X86_SSE2_NATIVE)
15725     return _mm_and_si128(a, b);
15726   #else
15727     simde__m128i_private
15728       r_,
15729       a_ = simde__m128i_to_private(a),
15730       b_ = simde__m128i_to_private(b);
15731 
15732     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
15733       r_.neon_i32 = vandq_s32(b_.neon_i32, a_.neon_i32);
15734     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
15735       r_.altivec_u32f = vec_and(a_.altivec_u32f, b_.altivec_u32f);
15736     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
15737       r_.i32f = a_.i32f & b_.i32f;
15738     #else
15739       SIMDE_VECTORIZE
15740       for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
15741         r_.i32f[i] = a_.i32f[i] & b_.i32f[i];
15742       }
15743     #endif
15744 
15745     return simde__m128i_from_private(r_);
15746   #endif
15747 }
15748 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
15749   #define _mm_and_si128(a, b) simde_mm_and_si128(a, b)
15750 #endif
15751 
15752 SIMDE_FUNCTION_ATTRIBUTES
15753 simde__m128d
15754 simde_mm_andnot_pd (simde__m128d a, simde__m128d b) {
15755   #if defined(SIMDE_X86_SSE2_NATIVE)
15756     return _mm_andnot_pd(a, b);
15757   #else
15758     simde__m128d_private
15759       r_,
15760       a_ = simde__m128d_to_private(a),
15761       b_ = simde__m128d_to_private(b);
15762 
15763     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
15764       r_.neon_i32 = vbicq_s32(b_.neon_i32, a_.neon_i32);
15765     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
15766       r_.wasm_v128 = wasm_v128_andnot(b_.wasm_v128, a_.wasm_v128);
15767     #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
15768       r_.altivec_f64 = vec_andc(b_.altivec_f64, a_.altivec_f64);
15769     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
15770       r_.altivec_i32f = vec_andc(b_.altivec_i32f, a_.altivec_i32f);
15771     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
15772       r_.i32f = ~a_.i32f & b_.i32f;
15773     #else
15774       SIMDE_VECTORIZE
15775       for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) {
15776         r_.u64[i] = ~a_.u64[i] & b_.u64[i];
15777       }
15778     #endif
15779 
15780     return simde__m128d_from_private(r_);
15781   #endif
15782 }
15783 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
15784   #define _mm_andnot_pd(a, b) simde_mm_andnot_pd(a, b)
15785 #endif
15786 
15787 SIMDE_FUNCTION_ATTRIBUTES
15788 simde__m128i
15789 simde_mm_andnot_si128 (simde__m128i a, simde__m128i b) {
15790   #if defined(SIMDE_X86_SSE2_NATIVE)
15791     return _mm_andnot_si128(a, b);
15792   #else
15793     simde__m128i_private
15794       r_,
15795       a_ = simde__m128i_to_private(a),
15796       b_ = simde__m128i_to_private(b);
15797 
15798     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
15799       r_.neon_i32 = vbicq_s32(b_.neon_i32, a_.neon_i32);
15800     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
15801       r_.altivec_i32 = vec_andc(b_.altivec_i32, a_.altivec_i32);
15802     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
15803       r_.i32f = ~a_.i32f & b_.i32f;
15804     #else
15805       SIMDE_VECTORIZE
15806       for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
15807         r_.i32f[i] = ~(a_.i32f[i]) & b_.i32f[i];
15808       }
15809     #endif
15810 
15811     return simde__m128i_from_private(r_);
15812   #endif
15813 }
15814 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
15815   #define _mm_andnot_si128(a, b) simde_mm_andnot_si128(a, b)
15816 #endif
15817 
15818 SIMDE_FUNCTION_ATTRIBUTES
15819 simde__m128d
15820 simde_mm_xor_pd (simde__m128d a, simde__m128d b) {
15821   #if defined(SIMDE_X86_SSE2_NATIVE)
15822     return _mm_xor_pd(a, b);
15823   #else
15824     simde__m128d_private
15825       r_,
15826       a_ = simde__m128d_to_private(a),
15827       b_ = simde__m128d_to_private(b);
15828 
15829     #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
15830       r_.i32f = a_.i32f ^ b_.i32f;
15831     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
15832       r_.wasm_v128 = wasm_v128_xor(a_.wasm_v128, b_.wasm_v128);
15833     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
15834       r_.neon_i64 = veorq_s64(a_.neon_i64, b_.neon_i64);
15835     #else
15836       SIMDE_VECTORIZE
15837       for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
15838         r_.i32f[i] = a_.i32f[i] ^ b_.i32f[i];
15839       }
15840     #endif
15841 
15842     return simde__m128d_from_private(r_);
15843   #endif
15844 }
15845 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
15846   #define _mm_xor_pd(a, b) simde_mm_xor_pd(a, b)
15847 #endif
15848 
15849 SIMDE_FUNCTION_ATTRIBUTES
15850 simde__m128i
15851 simde_mm_avg_epu8 (simde__m128i a, simde__m128i b) {
15852   #if defined(SIMDE_X86_SSE2_NATIVE)
15853     return _mm_avg_epu8(a, b);
15854   #else
15855     simde__m128i_private
15856       r_,
15857       a_ = simde__m128i_to_private(a),
15858       b_ = simde__m128i_to_private(b);
15859 
15860     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
15861       r_.neon_u8 = vrhaddq_u8(b_.neon_u8, a_.neon_u8);
15862     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
15863       r_.wasm_v128 = wasm_u8x16_avgr(a_.wasm_v128, b_.wasm_v128);
15864     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
15865       r_.altivec_u8 = vec_avg(a_.altivec_u8, b_.altivec_u8);
15866     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && defined(SIMDE_CONVERT_VECTOR_)
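      /* Widen to 16 bits so the intermediate sum cannot overflow, take the
       * rounding average (a + b + 1) >> 1, then narrow back to 8 bits. */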
15867       uint16_t wa SIMDE_VECTOR(32);
15868       uint16_t wb SIMDE_VECTOR(32);
15869       uint16_t wr SIMDE_VECTOR(32);
15870       SIMDE_CONVERT_VECTOR_(wa, a_.u8);
15871       SIMDE_CONVERT_VECTOR_(wb, b_.u8);
15872       wr = (wa + wb + 1) >> 1;
15873       SIMDE_CONVERT_VECTOR_(r_.u8, wr);
15874     #else
15875       SIMDE_VECTORIZE
15876       for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
15877         r_.u8[i] = (a_.u8[i] + b_.u8[i] + 1) >> 1;
15878       }
15879     #endif
15880 
15881     return simde__m128i_from_private(r_);
15882   #endif
15883 }
15884 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
15885   #define _mm_avg_epu8(a, b) simde_mm_avg_epu8(a, b)
15886 #endif
15887 
15888 SIMDE_FUNCTION_ATTRIBUTES
15889 simde__m128i
15890 simde_mm_avg_epu16 (simde__m128i a, simde__m128i b) {
15891   #if defined(SIMDE_X86_SSE2_NATIVE)
15892     return _mm_avg_epu16(a, b);
15893   #else
15894     simde__m128i_private
15895       r_,
15896       a_ = simde__m128i_to_private(a),
15897       b_ = simde__m128i_to_private(b);
15898 
15899     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
15900       r_.neon_u16 = vrhaddq_u16(b_.neon_u16, a_.neon_u16);
15901     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
15902       r_.wasm_v128 = wasm_u16x8_avgr(a_.wasm_v128, b_.wasm_v128);
15903     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
15904       r_.altivec_u16 = vec_avg(a_.altivec_u16, b_.altivec_u16);
15905     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && defined(SIMDE_CONVERT_VECTOR_)
15906       uint32_t wa SIMDE_VECTOR(32);
15907       uint32_t wb SIMDE_VECTOR(32);
15908       uint32_t wr SIMDE_VECTOR(32);
15909       SIMDE_CONVERT_VECTOR_(wa, a_.u16);
15910       SIMDE_CONVERT_VECTOR_(wb, b_.u16);
15911       wr = (wa + wb + 1) >> 1;
15912       SIMDE_CONVERT_VECTOR_(r_.u16, wr);
15913     #else
15914       SIMDE_VECTORIZE
15915       for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
15916         r_.u16[i] = (a_.u16[i] + b_.u16[i] + 1) >> 1;
15917       }
15918     #endif
15919 
15920     return simde__m128i_from_private(r_);
15921   #endif
15922 }
15923 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
15924   #define _mm_avg_epu16(a, b) simde_mm_avg_epu16(a, b)
15925 #endif
15926 
15927 SIMDE_FUNCTION_ATTRIBUTES
15928 simde__m128i
15929 simde_mm_setzero_si128 (void) {
15930   #if defined(SIMDE_X86_SSE2_NATIVE)
15931     return _mm_setzero_si128();
15932   #else
15933     simde__m128i_private r_;
15934 
15935     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
15936       r_.neon_i32 = vdupq_n_s32(0);
15937     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
15938       r_.altivec_i32 = vec_splats(HEDLEY_STATIC_CAST(signed int, 0));
15939     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
15940       r_.wasm_v128 = wasm_i32x4_splat(INT32_C(0));
15941     #elif defined(SIMDE_VECTOR_SUBSCRIPT)
15942       r_.i32 = __extension__ (__typeof__(r_.i32)) { 0, 0, 0, 0 };
15943     #else
15944       SIMDE_VECTORIZE
15945       for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
15946         r_.i32f[i] = 0;
15947       }
15948     #endif
15949 
15950     return simde__m128i_from_private(r_);
15951   #endif
15952 }
15953 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
15954   #define _mm_setzero_si128() (simde_mm_setzero_si128())
15955 #endif
15956 
15957 SIMDE_FUNCTION_ATTRIBUTES
15958 simde__m128i
15959 simde_mm_bslli_si128 (simde__m128i a, const int imm8)
15960     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255)  {
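  /* Shifts the full 128-bit value left by imm8 *bytes*, shifting in zeroes;
   * any count above 15 produces zero, matching _mm_slli_si128. */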
15961   simde__m128i_private
15962     r_,
15963     a_ = simde__m128i_to_private(a);
15964 
15965   if (HEDLEY_UNLIKELY((imm8 & ~15))) {
15966     return simde_mm_setzero_si128();
15967   }
15968 
15969   #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) && defined(SIMDE_ENDIAN_ORDER)
15970     r_.altivec_i8 =
15971       #if (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
15972         vec_slo
15973       #else /* SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_BIG */
15974         vec_sro
15975       #endif
15976         (a_.altivec_i8, vec_splats(HEDLEY_STATIC_CAST(unsigned char, imm8 * 8)));
15977   #elif defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
15978     r_.altivec_i8 = vec_srb(a_.altivec_i8, vec_splats(HEDLEY_STATIC_CAST(unsigned char, (imm8 & 15) << 3)));
15979   #elif defined(SIMDE_HAVE_INT128_) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
15980     r_.u128[0] = a_.u128[0] << (imm8 * 8);
15981   #else
15982     r_ = simde__m128i_to_private(simde_mm_setzero_si128());
15983     for (int i = imm8 ; i < HEDLEY_STATIC_CAST(int, sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
15984       r_.i8[i] = a_.i8[i - imm8];
15985     }
15986   #endif
15987 
15988   return simde__m128i_from_private(r_);
15989 }
15990 #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
15991   #define simde_mm_bslli_si128(a, imm8) _mm_slli_si128(a, imm8)
15992 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__)
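  /* vextq_s8(zero, a, 16 - imm8) extracts 16 bytes from the concatenation
   * {zero, a} starting at byte 16 - imm8: imm8 zero bytes followed by the low
   * bytes of a, i.e. a shifted left by imm8 bytes. */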
15993   #define simde_mm_bslli_si128(a, imm8) \
15994   simde__m128i_from_neon_i8(((imm8) <= 0) ? simde__m128i_to_neon_i8(a) : (((imm8) > 15) ? (vdupq_n_s8(0)) : (vextq_s8(vdupq_n_s8(0), simde__m128i_to_neon_i8(a), 16 - (imm8)))))
15995 #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) && !defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
15996   #define simde_mm_bslli_si128(a, imm8) (__extension__ ({ \
15997     const simde__m128i_private simde__tmp_a_ = simde__m128i_to_private(a); \
15998     const simde__m128i_private simde__tmp_z_ = simde__m128i_to_private(simde_mm_setzero_si128()); \
15999     simde__m128i_private simde__tmp_r_; \
16000     if (HEDLEY_UNLIKELY(imm8 > 15)) { \
16001       simde__tmp_r_ = simde__m128i_to_private(simde_mm_setzero_si128()); \
16002     } else { \
16003       simde__tmp_r_.i8 = \
16004         SIMDE_SHUFFLE_VECTOR_(8, 16, \
16005           simde__tmp_z_.i8, \
16006           (simde__tmp_a_).i8, \
16007           HEDLEY_STATIC_CAST(int8_t, (16 - imm8) & 31), \
16008           HEDLEY_STATIC_CAST(int8_t, (17 - imm8) & 31), \
16009           HEDLEY_STATIC_CAST(int8_t, (18 - imm8) & 31), \
16010           HEDLEY_STATIC_CAST(int8_t, (19 - imm8) & 31), \
16011           HEDLEY_STATIC_CAST(int8_t, (20 - imm8) & 31), \
16012           HEDLEY_STATIC_CAST(int8_t, (21 - imm8) & 31), \
16013           HEDLEY_STATIC_CAST(int8_t, (22 - imm8) & 31), \
16014           HEDLEY_STATIC_CAST(int8_t, (23 - imm8) & 31), \
16015           HEDLEY_STATIC_CAST(int8_t, (24 - imm8) & 31), \
16016           HEDLEY_STATIC_CAST(int8_t, (25 - imm8) & 31), \
16017           HEDLEY_STATIC_CAST(int8_t, (26 - imm8) & 31), \
16018           HEDLEY_STATIC_CAST(int8_t, (27 - imm8) & 31), \
16019           HEDLEY_STATIC_CAST(int8_t, (28 - imm8) & 31), \
16020           HEDLEY_STATIC_CAST(int8_t, (29 - imm8) & 31), \
16021           HEDLEY_STATIC_CAST(int8_t, (30 - imm8) & 31), \
16022           HEDLEY_STATIC_CAST(int8_t, (31 - imm8) & 31)); \
16023     } \
16024     simde__m128i_from_private(simde__tmp_r_); }))
16025 #endif
16026 #define simde_mm_slli_si128(a, imm8) simde_mm_bslli_si128(a, imm8)
16027 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
16028   #define _mm_bslli_si128(a, imm8) simde_mm_bslli_si128(a, imm8)
16029   #define _mm_slli_si128(a, imm8) simde_mm_bslli_si128(a, imm8)
16030 #endif
16031 
16032 SIMDE_FUNCTION_ATTRIBUTES
16033 simde__m128i
16034 simde_mm_bsrli_si128 (simde__m128i a, const int imm8)
16035     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255)  {
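  /* Shifts the full 128-bit value right by imm8 *bytes*, shifting in zeroes;
   * any count above 15 produces zero, matching _mm_srli_si128. */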
16036   simde__m128i_private
16037     r_,
16038     a_ = simde__m128i_to_private(a);
16039 
16040   if (HEDLEY_UNLIKELY((imm8 & ~15))) {
16041     return simde_mm_setzero_si128();
16042   }
16043 
16044   #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) && defined(SIMDE_ENDIAN_ORDER)
16045     r_.altivec_i8 =
16046     #if (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
16047       vec_sro
16048     #else /* SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_BIG */
16049       vec_slo
16050     #endif
16051         (a_.altivec_i8, vec_splats(HEDLEY_STATIC_CAST(unsigned char, imm8 * 8)));
16052   #elif defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
16053     r_.altivec_i8 = vec_slb(a_.altivec_i8, vec_splats(HEDLEY_STATIC_CAST(unsigned char, (imm8 & 15) << 3)));
16054   #else
16055     SIMDE_VECTORIZE
16056     for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
16057       const int e = HEDLEY_STATIC_CAST(int, i) + imm8;
16058       r_.i8[i] = (e < 16) ? a_.i8[e] : 0;
16059     }
16060   #endif
16061 
16062   return simde__m128i_from_private(r_);
16063 }
16064 #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
16065   #define simde_mm_bsrli_si128(a, imm8) _mm_srli_si128(a, imm8)
16066 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__)
16067   #define simde_mm_bsrli_si128(a, imm8) \
16068   simde__m128i_from_neon_i8(((imm8 < 0) || (imm8 > 15)) ? vdupq_n_s8(0) : (vextq_s8(simde__m128i_to_private(a).neon_i8, vdupq_n_s8(0), ((imm8 & 15) != 0) ? imm8 : (imm8 & 15))))
16069 #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) && !defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
16070   #define simde_mm_bsrli_si128(a, imm8) (__extension__ ({ \
16071     const simde__m128i_private simde__tmp_a_ = simde__m128i_to_private(a); \
16072     const simde__m128i_private simde__tmp_z_ = simde__m128i_to_private(simde_mm_setzero_si128()); \
16073     simde__m128i_private simde__tmp_r_ = simde__m128i_to_private(a); \
16074     if (HEDLEY_UNLIKELY(imm8 > 15)) { \
16075       simde__tmp_r_ = simde__m128i_to_private(simde_mm_setzero_si128()); \
16076     } else { \
16077       simde__tmp_r_.i8 = \
16078       SIMDE_SHUFFLE_VECTOR_(8, 16, \
16079         simde__tmp_z_.i8, \
16080         (simde__tmp_a_).i8, \
16081         HEDLEY_STATIC_CAST(int8_t, (imm8 + 16) & 31), \
16082         HEDLEY_STATIC_CAST(int8_t, (imm8 + 17) & 31), \
16083         HEDLEY_STATIC_CAST(int8_t, (imm8 + 18) & 31), \
16084         HEDLEY_STATIC_CAST(int8_t, (imm8 + 19) & 31), \
16085         HEDLEY_STATIC_CAST(int8_t, (imm8 + 20) & 31), \
16086         HEDLEY_STATIC_CAST(int8_t, (imm8 + 21) & 31), \
16087         HEDLEY_STATIC_CAST(int8_t, (imm8 + 22) & 31), \
16088         HEDLEY_STATIC_CAST(int8_t, (imm8 + 23) & 31), \
16089         HEDLEY_STATIC_CAST(int8_t, (imm8 + 24) & 31), \
16090         HEDLEY_STATIC_CAST(int8_t, (imm8 + 25) & 31), \
16091         HEDLEY_STATIC_CAST(int8_t, (imm8 + 26) & 31), \
16092         HEDLEY_STATIC_CAST(int8_t, (imm8 + 27) & 31), \
16093         HEDLEY_STATIC_CAST(int8_t, (imm8 + 28) & 31), \
16094         HEDLEY_STATIC_CAST(int8_t, (imm8 + 29) & 31), \
16095         HEDLEY_STATIC_CAST(int8_t, (imm8 + 30) & 31), \
16096         HEDLEY_STATIC_CAST(int8_t, (imm8 + 31) & 31)); \
16097     } \
16098     simde__m128i_from_private(simde__tmp_r_); }))
16099 #endif
16100 #define simde_mm_srli_si128(a, imm8) simde_mm_bsrli_si128((a), (imm8))
16101 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
16102   #define _mm_bsrli_si128(a, imm8) simde_mm_bsrli_si128((a), (imm8))
16103   #define _mm_srli_si128(a, imm8) simde_mm_bsrli_si128((a), (imm8))
16104 #endif
16105 
16106 SIMDE_FUNCTION_ATTRIBUTES
16107 void
16108 simde_mm_clflush (void const* p) {
16109   #if defined(SIMDE_X86_SSE2_NATIVE)
16110     _mm_clflush(p);
16111   #else
16112     (void) p;
16113   #endif
16114 }
16115 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
16116   #define _mm_clflush(p) simde_mm_clflush(p)
16117 #endif
16118 
16119 SIMDE_FUNCTION_ATTRIBUTES
16120 int
16121 simde_mm_comieq_sd (simde__m128d a, simde__m128d b) {
16122   #if defined(SIMDE_X86_SSE2_NATIVE)
16123     return _mm_comieq_sd(a, b);
16124   #else
16125     simde__m128d_private
16126       a_ = simde__m128d_to_private(a),
16127       b_ = simde__m128d_to_private(b);
16128     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
16129       return !!vgetq_lane_u64(vceqq_f64(a_.neon_f64, b_.neon_f64), 0);
16130     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
16131       return wasm_f64x2_extract_lane(a_.wasm_v128, 0) == wasm_f64x2_extract_lane(b_.wasm_v128, 0);
16132     #else
16133       return a_.f64[0] == b_.f64[0];
16134     #endif
16135   #endif
16136 }
16137 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
16138   #define _mm_comieq_sd(a, b) simde_mm_comieq_sd(a, b)
16139 #endif
16140 
16141 SIMDE_FUNCTION_ATTRIBUTES
16142 int
16143 simde_mm_comige_sd (simde__m128d a, simde__m128d b) {
16144   #if defined(SIMDE_X86_SSE2_NATIVE)
16145     return _mm_comige_sd(a, b);
16146   #else
16147     simde__m128d_private
16148       a_ = simde__m128d_to_private(a),
16149       b_ = simde__m128d_to_private(b);
16150     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
16151       return !!vgetq_lane_u64(vcgeq_f64(a_.neon_f64, b_.neon_f64), 0);
16152     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
16153       return wasm_f64x2_extract_lane(a_.wasm_v128, 0) >= wasm_f64x2_extract_lane(b_.wasm_v128, 0);
16154     #else
16155       return a_.f64[0] >= b_.f64[0];
16156     #endif
16157   #endif
16158 }
16159 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
16160   #define _mm_comige_sd(a, b) simde_mm_comige_sd(a, b)
16161 #endif
16162 
16163 SIMDE_FUNCTION_ATTRIBUTES
16164 int
16165 simde_mm_comigt_sd (simde__m128d a, simde__m128d b) {
16166   #if defined(SIMDE_X86_SSE2_NATIVE)
16167     return _mm_comigt_sd(a, b);
16168   #else
16169     simde__m128d_private
16170       a_ = simde__m128d_to_private(a),
16171       b_ = simde__m128d_to_private(b);
16172     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
16173       return !!vgetq_lane_u64(vcgtq_f64(a_.neon_f64, b_.neon_f64), 0);
16174     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
16175       return wasm_f64x2_extract_lane(a_.wasm_v128, 0) > wasm_f64x2_extract_lane(b_.wasm_v128, 0);
16176     #else
16177       return a_.f64[0] > b_.f64[0];
16178     #endif
16179   #endif
16180 }
16181 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
16182   #define _mm_comigt_sd(a, b) simde_mm_comigt_sd(a, b)
16183 #endif
16184 
16185 SIMDE_FUNCTION_ATTRIBUTES
16186 int
16187 simde_mm_comile_sd (simde__m128d a, simde__m128d b) {
16188   #if defined(SIMDE_X86_SSE2_NATIVE)
16189     return _mm_comile_sd(a, b);
16190   #else
16191     simde__m128d_private
16192       a_ = simde__m128d_to_private(a),
16193       b_ = simde__m128d_to_private(b);
16194     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
16195       return !!vgetq_lane_u64(vcleq_f64(a_.neon_f64, b_.neon_f64), 0);
16196     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
16197       return wasm_f64x2_extract_lane(a_.wasm_v128, 0) <= wasm_f64x2_extract_lane(b_.wasm_v128, 0);
16198     #else
16199       return a_.f64[0] <= b_.f64[0];
16200     #endif
16201   #endif
16202 }
16203 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
16204   #define _mm_comile_sd(a, b) simde_mm_comile_sd(a, b)
16205 #endif
16206 
16207 SIMDE_FUNCTION_ATTRIBUTES
16208 int
16209 simde_mm_comilt_sd (simde__m128d a, simde__m128d b) {
16210   #if defined(SIMDE_X86_SSE2_NATIVE)
16211     return _mm_comilt_sd(a, b);
16212   #else
16213     simde__m128d_private
16214       a_ = simde__m128d_to_private(a),
16215       b_ = simde__m128d_to_private(b);
16216     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
16217       return !!vgetq_lane_u64(vcltq_f64(a_.neon_f64, b_.neon_f64), 0);
16218     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
16219       return wasm_f64x2_extract_lane(a_.wasm_v128, 0) < wasm_f64x2_extract_lane(b_.wasm_v128, 0);
16220     #else
16221       return a_.f64[0] < b_.f64[0];
16222     #endif
16223   #endif
16224 }
16225 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
16226   #define _mm_comilt_sd(a, b) simde_mm_comilt_sd(a, b)
16227 #endif
16228 
16229 SIMDE_FUNCTION_ATTRIBUTES
16230 int
16231 simde_mm_comineq_sd (simde__m128d a, simde__m128d b) {
16232   #if defined(SIMDE_X86_SSE2_NATIVE)
16233     return _mm_comineq_sd(a, b);
16234   #else
16235     simde__m128d_private
16236       a_ = simde__m128d_to_private(a),
16237       b_ = simde__m128d_to_private(b);
16238     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
16239       return !vgetq_lane_u64(vceqq_f64(a_.neon_f64, b_.neon_f64), 0);
16240     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
16241       return wasm_f64x2_extract_lane(a_.wasm_v128, 0) != wasm_f64x2_extract_lane(b_.wasm_v128, 0);
16242     #else
16243       return a_.f64[0] != b_.f64[0];
16244     #endif
16245   #endif
16246 }
16247 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
16248   #define _mm_comineq_sd(a, b) simde_mm_comineq_sd(a, b)
16249 #endif
16250 
16251 SIMDE_FUNCTION_ATTRIBUTES
16252 simde__m128d
16253 simde_x_mm_copysign_pd(simde__m128d dest, simde__m128d src) {
16254   simde__m128d_private
16255     r_,
16256     dest_ = simde__m128d_to_private(dest),
16257     src_ = simde__m128d_to_private(src);
16258 
16259   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
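    /* -0.0 has only the sign bit set; bit-selecting with it takes the sign
     * bit from src and the magnitude bits from dest. */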
16260     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
16261       uint64x2_t sign_pos = vreinterpretq_u64_f64(vdupq_n_f64(-SIMDE_FLOAT64_C(0.0)));
16262     #else
16263       simde_float64 dbl_nz = -SIMDE_FLOAT64_C(0.0);
16264       uint64_t u64_nz;
16265       simde_memcpy(&u64_nz, &dbl_nz, sizeof(u64_nz));
16266       uint64x2_t sign_pos = vdupq_n_u64(u64_nz);
16267     #endif
16268     r_.neon_u64 = vbslq_u64(sign_pos, src_.neon_u64, dest_.neon_u64);
16269   #elif defined(SIMDE_POWER_ALTIVEC_P9_NATIVE)
16270     #if !defined(HEDLEY_IBM_VERSION)
16271       r_.altivec_f64 = vec_cpsgn(dest_.altivec_f64, src_.altivec_f64);
16272     #else
16273       r_.altivec_f64 = vec_cpsgn(src_.altivec_f64, dest_.altivec_f64);
16274     #endif
16275   #elif defined(simde_math_copysign)
16276     SIMDE_VECTORIZE
16277     for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
16278       r_.f64[i] = simde_math_copysign(dest_.f64[i], src_.f64[i]);
16279     }
16280   #else
16281     simde__m128d sgnbit = simde_mm_set1_pd(-SIMDE_FLOAT64_C(0.0));
16282     return simde_mm_xor_pd(simde_mm_and_pd(sgnbit, src), simde_mm_andnot_pd(sgnbit, dest));
16283   #endif
16284 
16285   return simde__m128d_from_private(r_);
16286 }
16287 
16288 SIMDE_FUNCTION_ATTRIBUTES
16289 simde__m128d
16290 simde_x_mm_xorsign_pd(simde__m128d dest, simde__m128d src) {
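  /* Flip the sign of dest wherever src is negative: XOR dest with src's sign bit. */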
16291   return simde_mm_xor_pd(simde_mm_and_pd(simde_mm_set1_pd(-0.0), src), dest);
16292 }
16293 
16294 SIMDE_FUNCTION_ATTRIBUTES
16295 simde__m128
16296 simde_mm_castpd_ps (simde__m128d a) {
16297   #if defined(SIMDE_X86_SSE2_NATIVE)
16298     return _mm_castpd_ps(a);
16299   #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
16300     return vreinterpretq_f32_f64(a);
16301   #else
16302     simde__m128 r;
16303     simde_memcpy(&r, &a, sizeof(a));
16304     return r;
16305   #endif
16306 }
16307 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
16308   #define _mm_castpd_ps(a) simde_mm_castpd_ps(a)
16309 #endif
16310 
16311 SIMDE_FUNCTION_ATTRIBUTES
16312 simde__m128i
16313 simde_mm_castpd_si128 (simde__m128d a) {
16314   #if defined(SIMDE_X86_SSE2_NATIVE)
16315     return _mm_castpd_si128(a);
16316   #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
16317     return vreinterpretq_s64_f64(a);
16318   #else
16319     simde__m128i r;
16320     simde_memcpy(&r, &a, sizeof(a));
16321     return r;
16322   #endif
16323 }
16324 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
16325   #define _mm_castpd_si128(a) simde_mm_castpd_si128(a)
16326 #endif
16327 
16328 SIMDE_FUNCTION_ATTRIBUTES
16329 simde__m128d
16330 simde_mm_castps_pd (simde__m128 a) {
16331   #if defined(SIMDE_X86_SSE2_NATIVE)
16332     return _mm_castps_pd(a);
16333   #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
16334     return vreinterpretq_f64_f32(a);
16335   #else
16336     simde__m128d r;
16337     simde_memcpy(&r, &a, sizeof(a));
16338     return r;
16339   #endif
16340 }
16341 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
16342   #define _mm_castps_pd(a) simde_mm_castps_pd(a)
16343 #endif
16344 
16345 SIMDE_FUNCTION_ATTRIBUTES
16346 simde__m128i
16347 simde_mm_castps_si128 (simde__m128 a) {
16348   #if defined(SIMDE_X86_SSE2_NATIVE)
16349     return _mm_castps_si128(a);
16350   #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
16351     return simde__m128i_from_neon_i32(simde__m128_to_private(a).neon_i32);
16352   #else
16353     simde__m128i r;
16354     simde_memcpy(&r, &a, sizeof(a));
16355     return r;
16356   #endif
16357 }
16358 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
16359   #define _mm_castps_si128(a) simde_mm_castps_si128(a)
16360 #endif
16361 
16362 SIMDE_FUNCTION_ATTRIBUTES
16363 simde__m128d
16364 simde_mm_castsi128_pd (simde__m128i a) {
16365   #if defined(SIMDE_X86_SSE2_NATIVE)
16366     return _mm_castsi128_pd(a);
16367   #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
16368     return vreinterpretq_f64_s64(a);
16369   #else
16370     simde__m128d r;
16371     simde_memcpy(&r, &a, sizeof(a));
16372     return r;
16373   #endif
16374 }
16375 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
16376   #define _mm_castsi128_pd(a) simde_mm_castsi128_pd(a)
16377 #endif
16378 
16379 SIMDE_FUNCTION_ATTRIBUTES
16380 simde__m128
16381 simde_mm_castsi128_ps (simde__m128i a) {
16382   #if defined(SIMDE_X86_SSE2_NATIVE)
16383     return _mm_castsi128_ps(a);
16384   #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
16385     return HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), a);
16386   #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
16387     return simde__m128_from_neon_i32(simde__m128i_to_private(a).neon_i32);
16388   #else
16389     simde__m128 r;
16390     simde_memcpy(&r, &a, sizeof(a));
16391     return r;
16392   #endif
16393 }
16394 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
16395   #define _mm_castsi128_ps(a) simde_mm_castsi128_ps(a)
16396 #endif
16397 
16398 SIMDE_FUNCTION_ATTRIBUTES
16399 simde__m128i
16400 simde_mm_cmpeq_epi8 (simde__m128i a, simde__m128i b) {
16401   #if defined(SIMDE_X86_SSE2_NATIVE)
16402     return _mm_cmpeq_epi8(a, b);
16403   #else
16404     simde__m128i_private
16405       r_,
16406       a_ = simde__m128i_to_private(a),
16407       b_ = simde__m128i_to_private(b);
16408 
16409     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
16410       r_.neon_u8 = vceqq_s8(b_.neon_i8, a_.neon_i8);
16411     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
16412       r_.wasm_v128 = wasm_i8x16_eq(a_.wasm_v128, b_.wasm_v128);
16413     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
16414       r_.altivec_i8 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed char), vec_cmpeq(a_.altivec_i8, b_.altivec_i8));
16415     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
16416       r_.i8 = HEDLEY_STATIC_CAST(__typeof__(r_.i8), (a_.i8 == b_.i8));
16417     #else
16418       SIMDE_VECTORIZE
16419       for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
16420         r_.i8[i] = (a_.i8[i] == b_.i8[i]) ? ~INT8_C(0) : INT8_C(0);
16421       }
16422     #endif
16423 
16424     return simde__m128i_from_private(r_);
16425   #endif
16426 }
16427 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
16428   #define _mm_cmpeq_epi8(a, b) simde_mm_cmpeq_epi8(a, b)
16429 #endif
16430 
16431 SIMDE_FUNCTION_ATTRIBUTES
16432 simde__m128i
16433 simde_mm_cmpeq_epi16 (simde__m128i a, simde__m128i b) {
16434   #if defined(SIMDE_X86_SSE2_NATIVE)
16435     return _mm_cmpeq_epi16(a, b);
16436   #else
16437     simde__m128i_private
16438       r_,
16439       a_ = simde__m128i_to_private(a),
16440       b_ = simde__m128i_to_private(b);
16441 
16442     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
16443       r_.neon_u16 = vceqq_s16(b_.neon_i16, a_.neon_i16);
16444     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
16445       r_.wasm_v128 = wasm_i16x8_eq(a_.wasm_v128, b_.wasm_v128);
16446     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
16447       r_.altivec_i16 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed short), vec_cmpeq(a_.altivec_i16, b_.altivec_i16));
16448     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
16449       r_.i16 = (a_.i16 == b_.i16);
16450     #else
16451       SIMDE_VECTORIZE
16452       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
16453         r_.i16[i] = (a_.i16[i] == b_.i16[i]) ? ~INT16_C(0) : INT16_C(0);
16454       }
16455     #endif
16456 
16457     return simde__m128i_from_private(r_);
16458   #endif
16459 }
16460 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
16461   #define _mm_cmpeq_epi16(a, b) simde_mm_cmpeq_epi16(a, b)
16462 #endif
16463 
16464 SIMDE_FUNCTION_ATTRIBUTES
16465 simde__m128i
16466 simde_mm_cmpeq_epi32 (simde__m128i a, simde__m128i b) {
16467   #if defined(SIMDE_X86_SSE2_NATIVE)
16468     return _mm_cmpeq_epi32(a, b);
16469   #else
16470     simde__m128i_private
16471       r_,
16472       a_ = simde__m128i_to_private(a),
16473       b_ = simde__m128i_to_private(b);
16474 
16475     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
16476       r_.neon_u32 = vceqq_s32(b_.neon_i32, a_.neon_i32);
16477     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
16478       r_.wasm_v128 = wasm_i32x4_eq(a_.wasm_v128, b_.wasm_v128);
16479     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
16480       r_.altivec_i32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed int), vec_cmpeq(a_.altivec_i32, b_.altivec_i32));
16481     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
16482       r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), a_.i32 == b_.i32);
16483     #else
16484       SIMDE_VECTORIZE
16485       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
16486         r_.i32[i] = (a_.i32[i] == b_.i32[i]) ? ~INT32_C(0) : INT32_C(0);
16487       }
16488     #endif
16489 
16490     return simde__m128i_from_private(r_);
16491   #endif
16492 }
16493 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
16494   #define _mm_cmpeq_epi32(a, b) simde_mm_cmpeq_epi32(a, b)
16495 #endif
16496 
16497 SIMDE_FUNCTION_ATTRIBUTES
16498 simde__m128d
16499 simde_mm_cmpeq_pd (simde__m128d a, simde__m128d b) {
16500   #if defined(SIMDE_X86_SSE2_NATIVE)
16501     return _mm_cmpeq_pd(a, b);
16502   #else
16503     simde__m128d_private
16504       r_,
16505       a_ = simde__m128d_to_private(a),
16506       b_ = simde__m128d_to_private(b);
16507 
16508     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
16509       r_.neon_u64 = vceqq_f64(a_.neon_f64, b_.neon_f64);
16510     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
16511       r_.wasm_v128 = wasm_f64x2_eq(a_.wasm_v128, b_.wasm_v128);
16512     #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
16513       r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_cmpeq(a_.altivec_f64, b_.altivec_f64));
16514     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
16515       r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 == b_.f64));
16516     #else
16517       SIMDE_VECTORIZE
16518       for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
16519         r_.u64[i] = (a_.f64[i] == b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
16520       }
16521     #endif
16522 
16523     return simde__m128d_from_private(r_);
16524   #endif
16525 }
16526 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
16527   #define _mm_cmpeq_pd(a, b) simde_mm_cmpeq_pd(a, b)
16528 #endif
16529 
16530 SIMDE_FUNCTION_ATTRIBUTES
16531 simde__m128d
16532 simde_mm_cmpeq_sd (simde__m128d a, simde__m128d b) {
16533   #if defined(SIMDE_X86_SSE2_NATIVE)
16534     return _mm_cmpeq_sd(a, b);
16535   #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
16536     return simde_mm_move_sd(a, simde_mm_cmpeq_pd(a, b));
16537   #else
16538     simde__m128d_private
16539       r_,
16540       a_ = simde__m128d_to_private(a),
16541       b_ = simde__m128d_to_private(b);
16542 
16543     r_.u64[0] = (a_.u64[0] == b_.u64[0]) ? ~UINT64_C(0) : 0;
16544     r_.u64[1] = a_.u64[1];
16545 
16546     return simde__m128d_from_private(r_);
16547   #endif
16548 }
16549 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
16550   #define _mm_cmpeq_sd(a, b) simde_mm_cmpeq_sd(a, b)
16551 #endif
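/* The *_sd comparisons operate on the low double only: lane 0 receives the
 * all-ones/all-zeros mask and lane 1 is copied through from a, mirroring the
 * scalar SSE2 semantics.  Illustrative sketch with hypothetical values,
 * assuming simde_mm_set_pd from elsewhere in this header:
 *
 *   simde__m128d a = simde_mm_set_pd(7.0, 1.0);
 *   simde__m128d b = simde_mm_set_pd(9.0, 1.0);
 *   simde__m128d m = simde_mm_cmpeq_sd(a, b);
 *   // lane 0 of m: all ones (1.0 == 1.0); lane 1 of m: 7.0, copied from a
 */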
16552 
16553 SIMDE_FUNCTION_ATTRIBUTES
16554 simde__m128d
16555 simde_mm_cmpneq_pd (simde__m128d a, simde__m128d b) {
16556   #if defined(SIMDE_X86_SSE2_NATIVE)
16557     return _mm_cmpneq_pd(a, b);
16558   #else
16559     simde__m128d_private
16560       r_,
16561       a_ = simde__m128d_to_private(a),
16562       b_ = simde__m128d_to_private(b);
16563 
16564     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
16565       r_.neon_u32 = vmvnq_u32(vreinterpretq_u32_u64(vceqq_f64(b_.neon_f64, a_.neon_f64)));
16566     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
16567       r_.wasm_v128 = wasm_f64x2_ne(a_.wasm_v128, b_.wasm_v128);
16568     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
16569       r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 != b_.f64));
16570     #else
16571       SIMDE_VECTORIZE
16572       for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
16573         r_.u64[i] = (a_.f64[i] != b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
16574       }
16575     #endif
16576 
16577     return simde__m128d_from_private(r_);
16578   #endif
16579 }
16580 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
16581   #define _mm_cmpneq_pd(a, b) simde_mm_cmpneq_pd(a, b)
16582 #endif
16583 
16584 SIMDE_FUNCTION_ATTRIBUTES
16585 simde__m128d
16586 simde_mm_cmpneq_sd (simde__m128d a, simde__m128d b) {
16587   #if defined(SIMDE_X86_SSE2_NATIVE)
16588     return _mm_cmpneq_sd(a, b);
16589   #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
16590     return simde_mm_move_sd(a, simde_mm_cmpneq_pd(a, b));
16591   #else
16592     simde__m128d_private
16593       r_,
16594       a_ = simde__m128d_to_private(a),
16595       b_ = simde__m128d_to_private(b);
16596 
16597     r_.u64[0] = (a_.f64[0] != b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
16598     r_.u64[1] = a_.u64[1];
16599 
16600 
16601     return simde__m128d_from_private(r_);
16602   #endif
16603 }
16604 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
16605   #define _mm_cmpneq_sd(a, b) simde_mm_cmpneq_sd(a, b)
16606 #endif
16607 
16608 SIMDE_FUNCTION_ATTRIBUTES
16609 simde__m128i
16610 simde_mm_cmplt_epi8 (simde__m128i a, simde__m128i b) {
16611   #if defined(SIMDE_X86_SSE2_NATIVE)
16612     return _mm_cmplt_epi8(a, b);
16613   #else
16614     simde__m128i_private
16615       r_,
16616       a_ = simde__m128i_to_private(a),
16617       b_ = simde__m128i_to_private(b);
16618 
16619     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
16620       r_.neon_u8 = vcltq_s8(a_.neon_i8, b_.neon_i8);
16621     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
16622       r_.altivec_i8 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed char),vec_cmplt(a_.altivec_i8, b_.altivec_i8));
16623     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
16624       r_.wasm_v128 = wasm_i8x16_lt(a_.wasm_v128, b_.wasm_v128);
16625     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
16626       r_.i8 = HEDLEY_STATIC_CAST(__typeof__(r_.i8), (a_.i8 < b_.i8));
16627     #else
16628       SIMDE_VECTORIZE
16629       for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
16630         r_.i8[i] = (a_.i8[i] < b_.i8[i]) ? ~INT8_C(0) : INT8_C(0);
16631       }
16632     #endif
16633 
16634     return simde__m128i_from_private(r_);
16635   #endif
16636 }
16637 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
16638   #define _mm_cmplt_epi8(a, b) simde_mm_cmplt_epi8(a, b)
16639 #endif
16640 
16641 SIMDE_FUNCTION_ATTRIBUTES
16642 simde__m128i
16643 simde_mm_cmplt_epi16 (simde__m128i a, simde__m128i b) {
16644   #if defined(SIMDE_X86_SSE2_NATIVE)
16645     return _mm_cmplt_epi16(a, b);
16646   #else
16647     simde__m128i_private
16648       r_,
16649       a_ = simde__m128i_to_private(a),
16650       b_ = simde__m128i_to_private(b);
16651 
16652     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
16653       r_.neon_u16 = vcltq_s16(a_.neon_i16, b_.neon_i16);
16654     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
16655       r_.altivec_i16 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed short), vec_cmplt(a_.altivec_i16, b_.altivec_i16));
16656     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
16657       r_.wasm_v128 = wasm_i16x8_lt(a_.wasm_v128, b_.wasm_v128);
16658     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
16659       r_.i16 = HEDLEY_STATIC_CAST(__typeof__(r_.i16), (a_.i16 < b_.i16));
16660     #else
16661       SIMDE_VECTORIZE
16662       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
16663         r_.i16[i] = (a_.i16[i] < b_.i16[i]) ? ~INT16_C(0) : INT16_C(0);
16664       }
16665     #endif
16666 
16667     return simde__m128i_from_private(r_);
16668   #endif
16669 }
16670 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
16671   #define _mm_cmplt_epi16(a, b) simde_mm_cmplt_epi16(a, b)
16672 #endif
16673 
16674 SIMDE_FUNCTION_ATTRIBUTES
16675 simde__m128i
16676 simde_mm_cmplt_epi32 (simde__m128i a, simde__m128i b) {
16677   #if defined(SIMDE_X86_SSE2_NATIVE)
16678     return _mm_cmplt_epi32(a, b);
16679   #else
16680     simde__m128i_private
16681       r_,
16682       a_ = simde__m128i_to_private(a),
16683       b_ = simde__m128i_to_private(b);
16684 
16685     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
16686       r_.neon_u32 = vcltq_s32(a_.neon_i32, b_.neon_i32);
16687     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
16688       r_.altivec_i32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed int), vec_cmplt(a_.altivec_i32, b_.altivec_i32));
16689     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
16690       r_.wasm_v128 = wasm_i32x4_lt(a_.wasm_v128, b_.wasm_v128);
16691     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
16692       r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), (a_.i32 < b_.i32));
16693     #else
16694       SIMDE_VECTORIZE
16695       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
16696         r_.i32[i] = (a_.i32[i] < b_.i32[i]) ? ~INT32_C(0) : INT32_C(0);
16697       }
16698     #endif
16699 
16700     return simde__m128i_from_private(r_);
16701   #endif
16702 }
16703 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
16704   #define _mm_cmplt_epi32(a, b) simde_mm_cmplt_epi32(a, b)
16705 #endif
16706 
16707 SIMDE_FUNCTION_ATTRIBUTES
16708 simde__m128d
16709 simde_mm_cmplt_pd (simde__m128d a, simde__m128d b) {
16710   #if defined(SIMDE_X86_SSE2_NATIVE)
16711     return _mm_cmplt_pd(a, b);
16712   #else
16713     simde__m128d_private
16714       r_,
16715       a_ = simde__m128d_to_private(a),
16716       b_ = simde__m128d_to_private(b);
16717 
16718     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
16719       r_.neon_u64 = vcltq_f64(a_.neon_f64, b_.neon_f64);
16720     #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
16721       r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_cmplt(a_.altivec_f64, b_.altivec_f64));
16722     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
16723       r_.wasm_v128 = wasm_f64x2_lt(a_.wasm_v128, b_.wasm_v128);
16724     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
16725       r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 < b_.f64));
16726     #else
16727       SIMDE_VECTORIZE
16728       for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
16729         r_.u64[i] = (a_.f64[i] < b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
16730       }
16731     #endif
16732 
16733     return simde__m128d_from_private(r_);
16734   #endif
16735 }
16736 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
16737   #define _mm_cmplt_pd(a, b) simde_mm_cmplt_pd(a, b)
16738 #endif
16739 
16740 SIMDE_FUNCTION_ATTRIBUTES
16741 simde__m128d
16742 simde_mm_cmplt_sd (simde__m128d a, simde__m128d b) {
16743   #if defined(SIMDE_X86_SSE2_NATIVE)
16744     return _mm_cmplt_sd(a, b);
16745   #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
16746     return simde_mm_move_sd(a, simde_mm_cmplt_pd(a, b));
16747   #else
16748     simde__m128d_private
16749       r_,
16750       a_ = simde__m128d_to_private(a),
16751       b_ = simde__m128d_to_private(b);
16752 
16753     r_.u64[0] = (a_.f64[0] < b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
16754     r_.u64[1] = a_.u64[1];
16755 
16756     return simde__m128d_from_private(r_);
16757   #endif
16758 }
16759 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
16760   #define _mm_cmplt_sd(a, b) simde_mm_cmplt_sd(a, b)
16761 #endif
16762 
16763 SIMDE_FUNCTION_ATTRIBUTES
16764 simde__m128d
16765 simde_mm_cmple_pd (simde__m128d a, simde__m128d b) {
16766   #if defined(SIMDE_X86_SSE2_NATIVE)
16767     return _mm_cmple_pd(a, b);
16768   #else
16769     simde__m128d_private
16770       r_,
16771       a_ = simde__m128d_to_private(a),
16772       b_ = simde__m128d_to_private(b);
16773 
16774     #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
16775       r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 <= b_.f64));
16776     #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
16777       r_.neon_u64 = vcleq_f64(a_.neon_f64, b_.neon_f64);
16778     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
16779       r_.wasm_v128 = wasm_f64x2_le(a_.wasm_v128, b_.wasm_v128);
16780     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
16781       r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_cmple(a_.altivec_f64, b_.altivec_f64));
16782     #else
16783       SIMDE_VECTORIZE
16784       for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
16785         r_.u64[i] = (a_.f64[i] <= b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
16786       }
16787     #endif
16788 
16789     return simde__m128d_from_private(r_);
16790   #endif
16791 }
16792 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
16793   #define _mm_cmple_pd(a, b) simde_mm_cmple_pd(a, b)
16794 #endif
16795 
16796 SIMDE_FUNCTION_ATTRIBUTES
16797 simde__m128d
16798 simde_mm_cmple_sd (simde__m128d a, simde__m128d b) {
16799   #if defined(SIMDE_X86_SSE2_NATIVE)
16800     return _mm_cmple_sd(a, b);
16801   #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
16802     return simde_mm_move_sd(a, simde_mm_cmple_pd(a, b));
16803   #else
16804     simde__m128d_private
16805       r_,
16806       a_ = simde__m128d_to_private(a),
16807       b_ = simde__m128d_to_private(b);
16808 
16809     r_.u64[0] = (a_.f64[0] <= b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
16810     r_.u64[1] = a_.u64[1];
16811 
16812     return simde__m128d_from_private(r_);
16813   #endif
16814 }
16815 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
16816   #define _mm_cmple_sd(a, b) simde_mm_cmple_sd(a, b)
16817 #endif
16818 
16819 SIMDE_FUNCTION_ATTRIBUTES
16820 simde__m128i
16821 simde_mm_cmpgt_epi8 (simde__m128i a, simde__m128i b) {
16822   #if defined(SIMDE_X86_SSE2_NATIVE)
16823     return _mm_cmpgt_epi8(a, b);
16824   #else
16825     simde__m128i_private
16826       r_,
16827       a_ = simde__m128i_to_private(a),
16828       b_ = simde__m128i_to_private(b);
16829 
16830     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
16831       r_.neon_u8 = vcgtq_s8(a_.neon_i8, b_.neon_i8);
16832     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
16833       r_.wasm_v128 = wasm_i8x16_gt(a_.wasm_v128, b_.wasm_v128);
16834     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
16835       r_.altivec_i8 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed char), vec_cmpgt(a_.altivec_i8, b_.altivec_i8));
16836     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
16837       r_.i8 = HEDLEY_STATIC_CAST(__typeof__(r_.i8), (a_.i8 > b_.i8));
16838     #else
16839       SIMDE_VECTORIZE
16840       for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
16841         r_.i8[i] = (a_.i8[i] > b_.i8[i]) ? ~INT8_C(0) : INT8_C(0);
16842       }
16843     #endif
16844 
16845     return simde__m128i_from_private(r_);
16846   #endif
16847 }
16848 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
16849   #define _mm_cmpgt_epi8(a, b) simde_mm_cmpgt_epi8(a, b)
16850 #endif
16851 
16852 SIMDE_FUNCTION_ATTRIBUTES
16853 simde__m128i
16854 simde_mm_cmpgt_epi16 (simde__m128i a, simde__m128i b) {
16855   #if defined(SIMDE_X86_SSE2_NATIVE)
16856     return _mm_cmpgt_epi16(a, b);
16857   #else
16858     simde__m128i_private
16859       r_,
16860       a_ = simde__m128i_to_private(a),
16861       b_ = simde__m128i_to_private(b);
16862 
16863     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
16864       r_.neon_u16 = vcgtq_s16(a_.neon_i16, b_.neon_i16);
16865     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
16866       r_.wasm_v128 = wasm_i16x8_gt(a_.wasm_v128, b_.wasm_v128);
16867     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
16868       r_.altivec_i16 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed short), vec_cmpgt(a_.altivec_i16, b_.altivec_i16));
16869     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
16870       r_.i16 = HEDLEY_STATIC_CAST(__typeof__(r_.i16), (a_.i16 > b_.i16));
16871     #else
16872       SIMDE_VECTORIZE
16873       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
16874         r_.i16[i] = (a_.i16[i] > b_.i16[i]) ? ~INT16_C(0) : INT16_C(0);
16875       }
16876     #endif
16877 
16878     return simde__m128i_from_private(r_);
16879   #endif
16880 }
16881 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
16882   #define _mm_cmpgt_epi16(a, b) simde_mm_cmpgt_epi16(a, b)
16883 #endif
16884 
16885 SIMDE_FUNCTION_ATTRIBUTES
16886 simde__m128i
16887 simde_mm_cmpgt_epi32 (simde__m128i a, simde__m128i b) {
16888   #if defined(SIMDE_X86_SSE2_NATIVE)
16889     return _mm_cmpgt_epi32(a, b);
16890   #else
16891     simde__m128i_private
16892       r_,
16893       a_ = simde__m128i_to_private(a),
16894       b_ = simde__m128i_to_private(b);
16895 
16896     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
16897       r_.neon_u32 = vcgtq_s32(a_.neon_i32, b_.neon_i32);
16898     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
16899       r_.wasm_v128 = wasm_i32x4_gt(a_.wasm_v128, b_.wasm_v128);
16900     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
16901       r_.altivec_i32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed int), vec_cmpgt(a_.altivec_i32, b_.altivec_i32));
16902     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
16903       r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), (a_.i32 > b_.i32));
16904     #else
16905       SIMDE_VECTORIZE
16906       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
16907         r_.i32[i] = (a_.i32[i] > b_.i32[i]) ? ~INT32_C(0) : INT32_C(0);
16908       }
16909     #endif
16910 
16911     return simde__m128i_from_private(r_);
16912   #endif
16913 }
16914 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
16915   #define _mm_cmpgt_epi32(a, b) simde_mm_cmpgt_epi32(a, b)
16916 #endif
16917 
16918 SIMDE_FUNCTION_ATTRIBUTES
16919 simde__m128d
16920 simde_mm_cmpgt_pd (simde__m128d a, simde__m128d b) {
16921   #if defined(SIMDE_X86_SSE2_NATIVE)
16922     return _mm_cmpgt_pd(a, b);
16923   #else
16924     simde__m128d_private
16925       r_,
16926       a_ = simde__m128d_to_private(a),
16927       b_ = simde__m128d_to_private(b);
16928 
16929     #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
16930       r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 > b_.f64));
16931     #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
16932       r_.neon_u64 = vcgtq_f64(a_.neon_f64, b_.neon_f64);
16933     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
16934       r_.wasm_v128 = wasm_f64x2_gt(a_.wasm_v128, b_.wasm_v128);
16935     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
16936       r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_cmpgt(a_.altivec_f64, b_.altivec_f64));
16937     #else
16938       SIMDE_VECTORIZE
16939       for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
16940         r_.u64[i] = (a_.f64[i] > b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
16941       }
16942     #endif
16943 
16944     return simde__m128d_from_private(r_);
16945   #endif
16946 }
16947 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
16948   #define _mm_cmpgt_pd(a, b) simde_mm_cmpgt_pd(a, b)
16949 #endif
16950 
16951 SIMDE_FUNCTION_ATTRIBUTES
16952 simde__m128d
16953 simde_mm_cmpgt_sd (simde__m128d a, simde__m128d b) {
16954   #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
16955     return _mm_cmpgt_sd(a, b);
16956   #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
16957     return simde_mm_move_sd(a, simde_mm_cmpgt_pd(a, b));
16958   #else
16959     simde__m128d_private
16960       r_,
16961       a_ = simde__m128d_to_private(a),
16962       b_ = simde__m128d_to_private(b);
16963 
16964     r_.u64[0] = (a_.f64[0] > b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
16965     r_.u64[1] = a_.u64[1];
16966 
16967     return simde__m128d_from_private(r_);
16968   #endif
16969 }
16970 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
16971   #define _mm_cmpgt_sd(a, b) simde_mm_cmpgt_sd(a, b)
16972 #endif
16973 
16974 SIMDE_FUNCTION_ATTRIBUTES
16975 simde__m128d
16976 simde_mm_cmpge_pd (simde__m128d a, simde__m128d b) {
16977   #if defined(SIMDE_X86_SSE2_NATIVE)
16978     return _mm_cmpge_pd(a, b);
16979   #else
16980     simde__m128d_private
16981       r_,
16982       a_ = simde__m128d_to_private(a),
16983       b_ = simde__m128d_to_private(b);
16984 
16985     #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
16986       r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 >= b_.f64));
16987     #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
16988       r_.neon_u64 = vcgeq_f64(a_.neon_f64, b_.neon_f64);
16989     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
16990       r_.wasm_v128 = wasm_f64x2_ge(a_.wasm_v128, b_.wasm_v128);
16991     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
16992       r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_cmpge(a_.altivec_f64, b_.altivec_f64));
16993     #else
16994       SIMDE_VECTORIZE
16995       for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
16996         r_.u64[i] = (a_.f64[i] >= b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
16997       }
16998     #endif
16999 
17000     return simde__m128d_from_private(r_);
17001   #endif
17002 }
17003 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
17004   #define _mm_cmpge_pd(a, b) simde_mm_cmpge_pd(a, b)
17005 #endif
17006 
17007 SIMDE_FUNCTION_ATTRIBUTES
17008 simde__m128d
17009 simde_mm_cmpge_sd (simde__m128d a, simde__m128d b) {
17010   #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
17011     return _mm_cmpge_sd(a, b);
17012   #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
17013     return simde_mm_move_sd(a, simde_mm_cmpge_pd(a, b));
17014   #else
17015     simde__m128d_private
17016       r_,
17017       a_ = simde__m128d_to_private(a),
17018       b_ = simde__m128d_to_private(b);
17019 
17020     r_.u64[0] = (a_.f64[0] >= b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
17021     r_.u64[1] = a_.u64[1];
17022 
17023     return simde__m128d_from_private(r_);
17024   #endif
17025 }
17026 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
17027   #define _mm_cmpge_sd(a, b) simde_mm_cmpge_sd(a, b)
17028 #endif
17029 
17030 SIMDE_FUNCTION_ATTRIBUTES
17031 simde__m128d
17032 simde_mm_cmpngt_pd (simde__m128d a, simde__m128d b) {
17033   #if defined(SIMDE_X86_SSE2_NATIVE)
17034     return _mm_cmpngt_pd(a, b);
17035   #else
17036     return simde_mm_cmple_pd(a, b);
17037   #endif
17038 }
17039 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
17040   #define _mm_cmpngt_pd(a, b) simde_mm_cmpngt_pd(a, b)
17041 #endif
17042 
17043 SIMDE_FUNCTION_ATTRIBUTES
17044 simde__m128d
17045 simde_mm_cmpngt_sd (simde__m128d a, simde__m128d b) {
17046   #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
17047     return _mm_cmpngt_sd(a, b);
17048   #else
17049     return simde_mm_cmple_sd(a, b);
17050   #endif
17051 }
17052 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
17053   #define _mm_cmpngt_sd(a, b) simde_mm_cmpngt_sd(a, b)
17054 #endif
17055 
17056 SIMDE_FUNCTION_ATTRIBUTES
17057 simde__m128d
17058 simde_mm_cmpnge_pd (simde__m128d a, simde__m128d b) {
17059   #if defined(SIMDE_X86_SSE2_NATIVE)
17060     return _mm_cmpnge_pd(a, b);
17061   #else
17062     return simde_mm_cmplt_pd(a, b);
17063   #endif
17064 }
17065 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
17066   #define _mm_cmpnge_pd(a, b) simde_mm_cmpnge_pd(a, b)
17067 #endif
17068 
17069 SIMDE_FUNCTION_ATTRIBUTES
17070 simde__m128d
17071 simde_mm_cmpnge_sd (simde__m128d a, simde__m128d b) {
17072   #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
17073     return _mm_cmpnge_sd(a, b);
17074   #else
17075     return simde_mm_cmplt_sd(a, b);
17076   #endif
17077 }
17078 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
17079   #define _mm_cmpnge_sd(a, b) simde_mm_cmpnge_sd(a, b)
17080 #endif
17081 
17082 SIMDE_FUNCTION_ATTRIBUTES
17083 simde__m128d
17084 simde_mm_cmpnlt_pd (simde__m128d a, simde__m128d b) {
17085   #if defined(SIMDE_X86_SSE2_NATIVE)
17086     return _mm_cmpnlt_pd(a, b);
17087   #else
17088     return simde_mm_cmpge_pd(a, b);
17089   #endif
17090 }
17091 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
17092   #define _mm_cmpnlt_pd(a, b) simde_mm_cmpnlt_pd(a, b)
17093 #endif
17094 
17095 SIMDE_FUNCTION_ATTRIBUTES
17096 simde__m128d
17097 simde_mm_cmpnlt_sd (simde__m128d a, simde__m128d b) {
17098   #if defined(SIMDE_X86_SSE2_NATIVE)
17099     return _mm_cmpnlt_sd(a, b);
17100   #else
17101     return simde_mm_cmpge_sd(a, b);
17102   #endif
17103 }
17104 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
17105   #define _mm_cmpnlt_sd(a, b) simde_mm_cmpnlt_sd(a, b)
17106 #endif
17107 
17108 SIMDE_FUNCTION_ATTRIBUTES
17109 simde__m128d
17110 simde_mm_cmpnle_pd (simde__m128d a, simde__m128d b) {
17111   #if defined(SIMDE_X86_SSE2_NATIVE)
17112     return _mm_cmpnle_pd(a, b);
17113   #else
17114     return simde_mm_cmpgt_pd(a, b);
17115   #endif
17116 }
17117 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
17118   #define _mm_cmpnle_pd(a, b) simde_mm_cmpnle_pd(a, b)
17119 #endif
17120 
17121 SIMDE_FUNCTION_ATTRIBUTES
17122 simde__m128d
17123 simde_mm_cmpnle_sd (simde__m128d a, simde__m128d b) {
17124   #if defined(SIMDE_X86_SSE2_NATIVE)
17125     return _mm_cmpnle_sd(a, b);
17126   #else
17127     return simde_mm_cmpgt_sd(a, b);
17128   #endif
17129 }
17130 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
17131   #define _mm_cmpnle_sd(a, b) simde_mm_cmpnle_sd(a, b)
17132 #endif
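/* The cmpngt/cmpnge/cmpnlt/cmpnle fallbacks above are expressed through
 * their ordered complements (e.g. "not greater than" as "less than or
 * equal").  For NaN inputs the native NGT/NGE/NLT/NLE predicates report
 * true while the ordered complements report false, so the portable paths
 * may differ from hardware when NaNs are involved. */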
17133 
17134 SIMDE_FUNCTION_ATTRIBUTES
17135 simde__m128d
17136 simde_mm_cmpord_pd (simde__m128d a, simde__m128d b) {
17137   #if defined(SIMDE_X86_SSE2_NATIVE)
17138     return _mm_cmpord_pd(a, b);
17139   #else
17140     simde__m128d_private
17141       r_,
17142       a_ = simde__m128d_to_private(a),
17143       b_ = simde__m128d_to_private(b);
17144 
17145     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
17146       /* Note: NEON does not have an ordered-compare builtin, so
17147          compare a == a and b == b to detect NaN lanes, then
17148          AND the two results to get the final mask. */
17149       uint64x2_t ceqaa = vceqq_f64(a_.neon_f64, a_.neon_f64);
17150       uint64x2_t ceqbb = vceqq_f64(b_.neon_f64, b_.neon_f64);
17151       r_.neon_u64 = vandq_u64(ceqaa, ceqbb);
17152     #elif defined(simde_math_isnan)
17153       SIMDE_VECTORIZE
17154       for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
17155         r_.u64[i] = (!simde_math_isnan(a_.f64[i]) && !simde_math_isnan(b_.f64[i])) ? ~UINT64_C(0) : UINT64_C(0);
17156       }
17157     #else
17158       HEDLEY_UNREACHABLE();
17159     #endif
17160 
17161     return simde__m128d_from_private(r_);
17162   #endif
17163 }
17164 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
17165   #define _mm_cmpord_pd(a, b) simde_mm_cmpord_pd(a, b)
17166 #endif
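/* simde_mm_cmpord_pd yields all ones only in lanes where neither input is
 * NaN, which makes it handy for scrubbing NaNs.  A minimal sketch, assuming
 * simde_mm_and_pd from elsewhere in this header:
 *
 *   simde__m128d cleaned = simde_mm_and_pd(x, simde_mm_cmpord_pd(x, x));
 *   // NaN lanes of x become +0.0, all other lanes are left untouched
 */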
17167 
17168 SIMDE_FUNCTION_ATTRIBUTES
17169 simde_float64
17170 simde_mm_cvtsd_f64 (simde__m128d a) {
17171   #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
17172     return _mm_cvtsd_f64(a);
17173   #else
17174     simde__m128d_private a_ = simde__m128d_to_private(a);
17175     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
17176       return HEDLEY_STATIC_CAST(simde_float64, vgetq_lane_f64(a_.neon_f64, 0));
17177     #else
17178       return a_.f64[0];
17179     #endif
17180   #endif
17181 }
17182 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
17183   #define _mm_cvtsd_f64(a) simde_mm_cvtsd_f64(a)
17184 #endif
17185 
17186 SIMDE_FUNCTION_ATTRIBUTES
17187 simde__m128d
17188 simde_mm_cmpord_sd (simde__m128d a, simde__m128d b) {
17189   #if defined(SIMDE_X86_SSE2_NATIVE)
17190     return _mm_cmpord_sd(a, b);
17191   #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
17192     return simde_mm_move_sd(a, simde_mm_cmpord_pd(a, b));
17193   #else
17194     simde__m128d_private
17195       r_,
17196       a_ = simde__m128d_to_private(a),
17197       b_ = simde__m128d_to_private(b);
17198 
17199     #if defined(simde_math_isnan)
17200       r_.u64[0] = (!simde_math_isnan(a_.f64[0]) && !simde_math_isnan(b_.f64[0])) ? ~UINT64_C(0) : UINT64_C(0);
17201       r_.u64[1] = a_.u64[1];
17202     #else
17203       HEDLEY_UNREACHABLE();
17204     #endif
17205 
17206     return simde__m128d_from_private(r_);
17207   #endif
17208 }
17209 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
17210   #define _mm_cmpord_sd(a, b) simde_mm_cmpord_sd(a, b)
17211 #endif
17212 
17213 SIMDE_FUNCTION_ATTRIBUTES
17214 simde__m128d
17215 simde_mm_cmpunord_pd (simde__m128d a, simde__m128d b) {
17216   #if defined(SIMDE_X86_SSE2_NATIVE)
17217     return _mm_cmpunord_pd(a, b);
17218   #else
17219     simde__m128d_private
17220       r_,
17221       a_ = simde__m128d_to_private(a),
17222       b_ = simde__m128d_to_private(b);
17223 
17224     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
17225       uint64x2_t ceqaa = vceqq_f64(a_.neon_f64, a_.neon_f64);
17226       uint64x2_t ceqbb = vceqq_f64(b_.neon_f64, b_.neon_f64);
17227       r_.neon_u64 = vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(vandq_u64(ceqaa, ceqbb))));
17228     #elif defined(simde_math_isnan)
17229       SIMDE_VECTORIZE
17230       for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
17231         r_.u64[i] = (simde_math_isnan(a_.f64[i]) || simde_math_isnan(b_.f64[i])) ? ~UINT64_C(0) : UINT64_C(0);
17232       }
17233     #else
17234       HEDLEY_UNREACHABLE();
17235     #endif
17236 
17237     return simde__m128d_from_private(r_);
17238   #endif
17239 }
17240 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
17241   #define _mm_cmpunord_pd(a, b) simde_mm_cmpunord_pd(a, b)
17242 #endif
17243 
17244 SIMDE_FUNCTION_ATTRIBUTES
17245 simde__m128d
17246 simde_mm_cmpunord_sd (simde__m128d a, simde__m128d b) {
17247   #if defined(SIMDE_X86_SSE2_NATIVE)
17248     return _mm_cmpunord_sd(a, b);
17249   #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
17250     return simde_mm_move_sd(a, simde_mm_cmpunord_pd(a, b));
17251   #else
17252     simde__m128d_private
17253       r_,
17254       a_ = simde__m128d_to_private(a),
17255       b_ = simde__m128d_to_private(b);
17256 
17257     #if defined(simde_math_isnan)
17258       r_.u64[0] = (simde_math_isnan(a_.f64[0]) || simde_math_isnan(b_.f64[0])) ? ~UINT64_C(0) : UINT64_C(0);
17259       r_.u64[1] = a_.u64[1];
17260     #else
17261       HEDLEY_UNREACHABLE();
17262     #endif
17263 
17264     return simde__m128d_from_private(r_);
17265   #endif
17266 }
17267 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
17268   #define _mm_cmpunord_sd(a, b) simde_mm_cmpunord_sd(a, b)
17269 #endif
17270 
17271 SIMDE_FUNCTION_ATTRIBUTES
17272 simde__m128d
17273 simde_mm_cvtepi32_pd (simde__m128i a) {
17274   #if defined(SIMDE_X86_SSE2_NATIVE)
17275     return _mm_cvtepi32_pd(a);
17276   #else
17277     simde__m128d_private r_;
17278     simde__m128i_private a_ = simde__m128i_to_private(a);
17279 
17280     #if defined(SIMDE_CONVERT_VECTOR_)
17281       SIMDE_CONVERT_VECTOR_(r_.f64, a_.m64_private[0].i32);
17282     #else
17283       SIMDE_VECTORIZE
17284       for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
17285         r_.f64[i] = (simde_float64) a_.i32[i];
17286       }
17287     #endif
17288 
17289     return simde__m128d_from_private(r_);
17290   #endif
17291 }
17292 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
17293   #define _mm_cvtepi32_pd(a) simde_mm_cvtepi32_pd(a)
17294 #endif
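/* Note that _mm_cvtepi32_pd widens only the two low 32-bit lanes of its
 * input (the portable path reads a_.m64_private[0].i32 accordingly); the
 * upper two lanes of the source are ignored. */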
17295 
17296 SIMDE_FUNCTION_ATTRIBUTES
17297 simde__m128
17298 simde_mm_cvtepi32_ps (simde__m128i a) {
17299   #if defined(SIMDE_X86_SSE2_NATIVE)
17300     return _mm_cvtepi32_ps(a);
17301   #else
17302     simde__m128_private r_;
17303     simde__m128i_private a_ = simde__m128i_to_private(a);
17304 
17305     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
17306       r_.neon_f32 = vcvtq_f32_s32(a_.neon_i32);
17307     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
17308       r_.wasm_v128 = wasm_f32x4_convert_i32x4(a_.wasm_v128);
17309     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
17310       HEDLEY_DIAGNOSTIC_PUSH
17311       #if HEDLEY_HAS_WARNING("-Wc11-extensions")
17312         #pragma clang diagnostic ignored "-Wc11-extensions"
17313       #endif
17314       r_.altivec_f32 = vec_ctf(a_.altivec_i32, 0);
17315       HEDLEY_DIAGNOSTIC_POP
17316     #elif defined(SIMDE_CONVERT_VECTOR_)
17317       SIMDE_CONVERT_VECTOR_(r_.f32, a_.i32);
17318     #else
17319       SIMDE_VECTORIZE
17320       for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
17321         r_.f32[i] = (simde_float32) a_.i32[i];
17322       }
17323     #endif
17324 
17325     return simde__m128_from_private(r_);
17326   #endif
17327 }
17328 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
17329   #define _mm_cvtepi32_ps(a) simde_mm_cvtepi32_ps(a)
17330 #endif
17331 
17332 SIMDE_FUNCTION_ATTRIBUTES
17333 simde__m64
17334 simde_mm_cvtpd_pi32 (simde__m128d a) {
17335   #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
17336     return _mm_cvtpd_pi32(a);
17337   #else
17338     simde__m64_private r_;
17339     simde__m128d_private a_ = simde__m128d_to_private(a);
17340 
17341     SIMDE_VECTORIZE
17342     for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
17343       simde_float64 v = simde_math_round(a_.f64[i]);
17344       #if defined(SIMDE_FAST_CONVERSION_RANGE)
17345         r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, v);
17346       #else
17347         r_.i32[i] = ((v > HEDLEY_STATIC_CAST(simde_float64, INT32_MIN)) && (v < HEDLEY_STATIC_CAST(simde_float64, INT32_MAX))) ?
17348           SIMDE_CONVERT_FTOI(int32_t, v) : INT32_MIN;
17349       #endif
17350     }
17351 
17352     return simde__m64_from_private(r_);
17353   #endif
17354 }
17355 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
17356   #define _mm_cvtpd_pi32(a) simde_mm_cvtpd_pi32(a)
17357 #endif
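/* Rounding note: the native instruction honours the current MXCSR rounding
 * mode, while the portable path above uses simde_math_round (ties away from
 * zero), so halfway cases may differ.  Unless SIMDE_FAST_CONVERSION_RANGE is
 * defined, out-of-range inputs are clamped to INT32_MIN, matching the x86
 * "integer indefinite" result. */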
17358 
17359 SIMDE_FUNCTION_ATTRIBUTES
17360 simde__m128i
17361 simde_mm_cvtpd_epi32 (simde__m128d a) {
17362   #if defined(SIMDE_X86_SSE2_NATIVE)
17363     return _mm_cvtpd_epi32(a);
17364   #else
17365     simde__m128i_private r_;
17366 
17367     r_.m64[0] = simde_mm_cvtpd_pi32(a);
17368     r_.m64[1] = simde_mm_setzero_si64();
17369 
17370     return simde__m128i_from_private(r_);
17371   #endif
17372 }
17373 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
17374   #define _mm_cvtpd_epi32(a) simde_mm_cvtpd_epi32(a)
17375 #endif
17376 
17377 SIMDE_FUNCTION_ATTRIBUTES
17378 simde__m128
17379 simde_mm_cvtpd_ps (simde__m128d a) {
17380   #if defined(SIMDE_X86_SSE2_NATIVE)
17381     return _mm_cvtpd_ps(a);
17382   #else
17383     simde__m128_private r_;
17384     simde__m128d_private a_ = simde__m128d_to_private(a);
17385 
17386     #if defined(SIMDE_CONVERT_VECTOR_)
17387       SIMDE_CONVERT_VECTOR_(r_.m64_private[0].f32, a_.f64);
17388       r_.m64_private[1] = simde__m64_to_private(simde_mm_setzero_si64());
17389     #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
17390       r_.neon_f32 = vreinterpretq_f32_f64(vcombine_f64(vreinterpret_f64_f32(vcvtx_f32_f64(a_.neon_f64)), vdup_n_f64(0)));
17391     #else
17392       SIMDE_VECTORIZE
17393       for (size_t i = 0 ; i < (sizeof(a_.f64) / sizeof(a_.f64[0])) ; i++) {
17394         r_.f32[i] = (simde_float32) a_.f64[i];
17395       }
17396       simde_memset(&(r_.m64_private[1]), 0, sizeof(r_.m64_private[1]));
17397     #endif
17398 
17399     return simde__m128_from_private(r_);
17400   #endif
17401 }
17402 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
17403   #define _mm_cvtpd_ps(a) simde_mm_cvtpd_ps(a)
17404 #endif
17405 
17406 SIMDE_FUNCTION_ATTRIBUTES
17407 simde__m128d
17408 simde_mm_cvtpi32_pd (simde__m64 a) {
17409   #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
17410     return _mm_cvtpi32_pd(a);
17411   #else
17412     simde__m128d_private r_;
17413     simde__m64_private a_ = simde__m64_to_private(a);
17414 
17415     #if defined(SIMDE_CONVERT_VECTOR_)
17416       SIMDE_CONVERT_VECTOR_(r_.f64, a_.i32);
17417     #else
17418       SIMDE_VECTORIZE
17419       for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
17420         r_.f64[i] = (simde_float64) a_.i32[i];
17421       }
17422     #endif
17423 
17424     return simde__m128d_from_private(r_);
17425   #endif
17426 }
17427 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
17428   #define _mm_cvtpi32_pd(a) simde_mm_cvtpi32_pd(a)
17429 #endif
17430 
17431 SIMDE_FUNCTION_ATTRIBUTES
17432 simde__m128i
17433 simde_mm_cvtps_epi32 (simde__m128 a) {
17434   #if defined(SIMDE_X86_SSE2_NATIVE)
17435     return _mm_cvtps_epi32(a);
17436   #else
17437     simde__m128i_private r_;
17438     simde__m128_private a_ = simde__m128_to_private(a);
17439 
17440     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_FAST_CONVERSION_RANGE)
17441       r_.neon_i32 = vcvtnq_s32_f32(a_.neon_f32);
17442     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_FAST_CONVERSION_RANGE) && defined(SIMDE_FAST_ROUND_TIES) && !defined(SIMDE_BUG_GCC_95399)
17443       r_.neon_i32 = vcvtnq_s32_f32(a_.neon_f32);
17444     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) && defined(SIMDE_FAST_CONVERSION_RANGE) && defined(SIMDE_FAST_ROUND_TIES)
17445       HEDLEY_DIAGNOSTIC_PUSH
17446       SIMDE_DIAGNOSTIC_DISABLE_C11_EXTENSIONS_
17447       SIMDE_DIAGNOSTIC_DISABLE_VECTOR_CONVERSION_
17448       r_.altivec_i32 = vec_cts(a_.altivec_f32, 1);
17449       HEDLEY_DIAGNOSTIC_POP
17450     #else
17451       a_ = simde__m128_to_private(simde_x_mm_round_ps(a, SIMDE_MM_FROUND_TO_NEAREST_INT, 1));
17452       SIMDE_VECTORIZE
17453       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
17454         simde_float32 v = simde_math_roundf(a_.f32[i]);
17455         #if defined(SIMDE_FAST_CONVERSION_RANGE)
17456           r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, v);
17457         #else
17458           r_.i32[i] = ((v > HEDLEY_STATIC_CAST(simde_float32, INT32_MIN)) && (v < HEDLEY_STATIC_CAST(simde_float32, INT32_MAX))) ?
17459             SIMDE_CONVERT_FTOI(int32_t, v) : INT32_MIN;
17460         #endif
17461       }
17462     #endif
17463 
17464     return simde__m128i_from_private(r_);
17465   #endif
17466 }
17467 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
17468   #define _mm_cvtps_epi32(a) simde_mm_cvtps_epi32(a)
17469 #endif
17470 
17471 SIMDE_FUNCTION_ATTRIBUTES
17472 simde__m128d
17473 simde_mm_cvtps_pd (simde__m128 a) {
17474   #if defined(SIMDE_X86_SSE2_NATIVE)
17475     return _mm_cvtps_pd(a);
17476   #else
17477     simde__m128d_private r_;
17478     simde__m128_private a_ = simde__m128_to_private(a);
17479 
17480     #if defined(SIMDE_CONVERT_VECTOR_)
17481       SIMDE_CONVERT_VECTOR_(r_.f64, a_.m64_private[0].f32);
17482     #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
17483       r_.neon_f64 = vcvt_f64_f32(vget_low_f32(a_.neon_f32));
17484     #else
17485       SIMDE_VECTORIZE
17486       for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
17487         r_.f64[i] = a_.f32[i];
17488       }
17489     #endif
17490 
17491     return simde__m128d_from_private(r_);
17492   #endif
17493 }
17494 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
17495   #define _mm_cvtps_pd(a) simde_mm_cvtps_pd(a)
17496 #endif
17497 
17498 SIMDE_FUNCTION_ATTRIBUTES
17499 int32_t
17500 simde_mm_cvtsd_si32 (simde__m128d a) {
17501   #if defined(SIMDE_X86_SSE2_NATIVE)
17502     return _mm_cvtsd_si32(a);
17503   #else
17504     simde__m128d_private a_ = simde__m128d_to_private(a);
17505 
17506     simde_float64 v = simde_math_round(a_.f64[0]);
17507     #if defined(SIMDE_FAST_CONVERSION_RANGE)
17508       return SIMDE_CONVERT_FTOI(int32_t, v);
17509     #else
17510       return ((v > HEDLEY_STATIC_CAST(simde_float64, INT32_MIN)) && (v < HEDLEY_STATIC_CAST(simde_float64, INT32_MAX))) ?
17511         SIMDE_CONVERT_FTOI(int32_t, v) : INT32_MIN;
17512     #endif
17513   #endif
17514 }
17515 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
17516   #define _mm_cvtsd_si32(a) simde_mm_cvtsd_si32(a)
17517 #endif
17518 
17519 SIMDE_FUNCTION_ATTRIBUTES
17520 int64_t
17521 simde_mm_cvtsd_si64 (simde__m128d a) {
17522   #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
17523     #if defined(__PGI)
17524       return _mm_cvtsd_si64x(a);
17525     #else
17526       return _mm_cvtsd_si64(a);
17527     #endif
17528   #else
17529     simde__m128d_private a_ = simde__m128d_to_private(a);
17530     return SIMDE_CONVERT_FTOI(int64_t, simde_math_round(a_.f64[0]));
17531   #endif
17532 }
17533 #define simde_mm_cvtsd_si64x(a) simde_mm_cvtsd_si64(a)
17534 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_AMD64))
17535   #define _mm_cvtsd_si64(a) simde_mm_cvtsd_si64(a)
17536   #define _mm_cvtsd_si64x(a) simde_mm_cvtsd_si64x(a)
17537 #endif
17538 
17539 SIMDE_FUNCTION_ATTRIBUTES
17540 simde__m128
17541 simde_mm_cvtsd_ss (simde__m128 a, simde__m128d b) {
17542   #if defined(SIMDE_X86_SSE2_NATIVE)
17543     return _mm_cvtsd_ss(a, b);
17544   #else
17545     simde__m128_private
17546       r_,
17547       a_ = simde__m128_to_private(a);
17548     simde__m128d_private b_ = simde__m128d_to_private(b);
17549 
17550     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
17551       r_.neon_f32 = vsetq_lane_f32(vcvtxd_f32_f64(vgetq_lane_f64(b_.neon_f64, 0)), a_.neon_f32, 0);
17552     #else
17553       r_.f32[0] = HEDLEY_STATIC_CAST(simde_float32, b_.f64[0]);
17554 
17555       SIMDE_VECTORIZE
17556       for (size_t i = 1 ; i < (sizeof(r_) / sizeof(r_.i32[0])) ; i++) {
17557         r_.i32[i] = a_.i32[i];
17558       }
17559     #endif
17560     return simde__m128_from_private(r_);
17561   #endif
17562 }
17563 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
17564   #define _mm_cvtsd_ss(a, b) simde_mm_cvtsd_ss(a, b)
17565 #endif
17566 
17567 SIMDE_FUNCTION_ATTRIBUTES
17568 int16_t
17569 simde_x_mm_cvtsi128_si16 (simde__m128i a) {
17570   simde__m128i_private
17571     a_ = simde__m128i_to_private(a);
17572 
17573   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
17574     return vgetq_lane_s16(a_.neon_i16, 0);
17575   #elif defined(SIMDE_WASM_SIMD128_NATIVE)
17576     return HEDLEY_STATIC_CAST(int16_t, wasm_i16x8_extract_lane(a_.wasm_v128, 0));
17577   #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
17578     #if defined(SIMDE_BUG_GCC_95227)
17579       (void) a_;
17580     #endif
17581     return vec_extract(a_.altivec_i16, 0);
17582   #else
17583     return a_.i16[0];
17584   #endif
17585 }
17586 
17587 SIMDE_FUNCTION_ATTRIBUTES
17588 int32_t
17589 simde_mm_cvtsi128_si32 (simde__m128i a) {
17590   #if defined(SIMDE_X86_SSE2_NATIVE)
17591     return _mm_cvtsi128_si32(a);
17592   #else
17593     simde__m128i_private
17594       a_ = simde__m128i_to_private(a);
17595 
17596     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
17597       return vgetq_lane_s32(a_.neon_i32, 0);
17598     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
17599       return HEDLEY_STATIC_CAST(int32_t, wasm_i32x4_extract_lane(a_.wasm_v128, 0));
17600     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
17601       #if defined(SIMDE_BUG_GCC_95227)
17602         (void) a_;
17603       #endif
17604       return vec_extract(a_.altivec_i32, 0);
17605     #else
17606       return a_.i32[0];
17607     #endif
17608   #endif
17609 }
17610 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
17611   #define _mm_cvtsi128_si32(a) simde_mm_cvtsi128_si32(a)
17612 #endif
17613 
17614 SIMDE_FUNCTION_ATTRIBUTES
17615 int64_t
17616 simde_mm_cvtsi128_si64 (simde__m128i a) {
17617   #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
17618     #if defined(__PGI)
17619       return _mm_cvtsi128_si64x(a);
17620     #else
17621       return _mm_cvtsi128_si64(a);
17622     #endif
17623   #else
17624     simde__m128i_private a_ = simde__m128i_to_private(a);
17625   #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) && !defined(HEDLEY_IBM_VERSION)
17626     return vec_extract(HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed long long), a_.i64), 0);
17627   #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
17628     return vgetq_lane_s64(a_.neon_i64, 0);
17629   #elif defined(SIMDE_WASM_SIMD128_NATIVE)
17630     return HEDLEY_STATIC_CAST(int64_t, wasm_i64x2_extract_lane(a_.wasm_v128, 0));
17631   #endif
17632     return a_.i64[0];
17633   #endif
17634 }
17635 #define simde_mm_cvtsi128_si64x(a) simde_mm_cvtsi128_si64(a)
17636 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_AMD64))
17637   #define _mm_cvtsi128_si64(a) simde_mm_cvtsi128_si64(a)
17638   #define _mm_cvtsi128_si64x(a) simde_mm_cvtsi128_si64x(a)
17639 #endif
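/* The *_si64x spellings are alternate names for the same operation; as the
 * guards above show, some compilers (e.g. PGI) only expose that form of the
 * native intrinsic, so both aliases are provided. */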
17640 
17641 SIMDE_FUNCTION_ATTRIBUTES
17642 simde__m128d
17643 simde_mm_cvtsi32_sd (simde__m128d a, int32_t b) {
17644   #if defined(SIMDE_X86_SSE2_NATIVE)
17645     return _mm_cvtsi32_sd(a, b);
17646   #else
17647     simde__m128d_private r_;
17648     simde__m128d_private a_ = simde__m128d_to_private(a);
17649 
17650     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
17651       r_.neon_f64 = vsetq_lane_f64(HEDLEY_STATIC_CAST(float64_t, b), a_.neon_f64, 0);
17652     #else
17653       r_.f64[0] = HEDLEY_STATIC_CAST(simde_float64, b);
17654       r_.i64[1] = a_.i64[1];
17655     #endif
17656 
17657     return simde__m128d_from_private(r_);
17658   #endif
17659 }
17660 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
17661   #define _mm_cvtsi32_sd(a, b) simde_mm_cvtsi32_sd(a, b)
17662 #endif
17663 
17664 SIMDE_FUNCTION_ATTRIBUTES
17665 simde__m128i
17666 simde_x_mm_cvtsi16_si128 (int16_t a) {
17667   simde__m128i_private r_;
17668 
17669   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
17670     r_.neon_i16 = vsetq_lane_s16(a, vdupq_n_s16(0), 0);
17671   #elif defined(SIMDE_WASM_SIMD128_NATIVE)
17672     r_.wasm_v128 = wasm_i16x8_make(a, 0, 0, 0, 0, 0, 0, 0);
17673   #else
17674     r_.i16[0] = a;
17675     r_.i16[1] = 0;
17676     r_.i16[2] = 0;
17677     r_.i16[3] = 0;
17678     r_.i16[4] = 0;
17679     r_.i16[5] = 0;
17680     r_.i16[6] = 0;
17681     r_.i16[7] = 0;
17682   #endif
17683 
17684   return simde__m128i_from_private(r_);
17685 }
17686 
17687 SIMDE_FUNCTION_ATTRIBUTES
17688 simde__m128i
17689 simde_mm_cvtsi32_si128 (int32_t a) {
17690   #if defined(SIMDE_X86_SSE2_NATIVE)
17691     return _mm_cvtsi32_si128(a);
17692   #else
17693     simde__m128i_private r_;
17694 
17695     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
17696       r_.neon_i32 = vsetq_lane_s32(a, vdupq_n_s32(0), 0);
17697     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
17698       r_.wasm_v128 = wasm_i32x4_make(a, 0, 0, 0);
17699     #else
17700       r_.i32[0] = a;
17701       r_.i32[1] = 0;
17702       r_.i32[2] = 0;
17703       r_.i32[3] = 0;
17704     #endif
17705 
17706     return simde__m128i_from_private(r_);
17707   #endif
17708 }
17709 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
17710   #define _mm_cvtsi32_si128(a) simde_mm_cvtsi32_si128(a)
17711 #endif
17712 
17713 SIMDE_FUNCTION_ATTRIBUTES
17714 simde__m128d
17715 simde_mm_cvtsi64_sd (simde__m128d a, int64_t b) {
17716   #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
17717     #if !defined(__PGI)
17718       return _mm_cvtsi64_sd(a, b);
17719     #else
17720       return _mm_cvtsi64x_sd(a, b);
17721     #endif
17722   #else
17723     simde__m128d_private
17724       r_,
17725       a_ = simde__m128d_to_private(a);
17726 
17727     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
17728       r_.neon_f64 = vsetq_lane_f64(HEDLEY_STATIC_CAST(float64_t, b), a_.neon_f64, 0);
17729     #else
17730       r_.f64[0] = HEDLEY_STATIC_CAST(simde_float64, b);
17731       r_.f64[1] = a_.f64[1];
17732     #endif
17733 
17734     return simde__m128d_from_private(r_);
17735   #endif
17736 }
17737 #define simde_mm_cvtsi64x_sd(a, b) simde_mm_cvtsi64_sd(a, b)
17738 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_AMD64))
17739   #define _mm_cvtsi64_sd(a, b) simde_mm_cvtsi64_sd(a, b)
17740   #define _mm_cvtsi64x_sd(a, b) simde_mm_cvtsi64x_sd(a, b)
17741 #endif
17742 
17743 SIMDE_FUNCTION_ATTRIBUTES
17744 simde__m128i
17745 simde_mm_cvtsi64_si128 (int64_t a) {
17746   #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
17747     #if !defined(__PGI)
17748       return _mm_cvtsi64_si128(a);
17749     #else
17750       return _mm_cvtsi64x_si128(a);
17751     #endif
17752   #else
17753     simde__m128i_private r_;
17754 
17755     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
17756       r_.neon_i64 = vsetq_lane_s64(a, vdupq_n_s64(0), 0);
17757     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
17758       r_.wasm_v128 = wasm_i64x2_make(a, 0);
17759     #else
17760       r_.i64[0] = a;
17761       r_.i64[1] = 0;
17762     #endif
17763 
17764     return simde__m128i_from_private(r_);
17765   #endif
17766 }
17767 #define simde_mm_cvtsi64x_si128(a) simde_mm_cvtsi64_si128(a)
17768 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_AMD64))
17769   #define _mm_cvtsi64_si128(a) simde_mm_cvtsi64_si128(a)
17770   #define _mm_cvtsi64x_si128(a) simde_mm_cvtsi64x_si128(a)
17771 #endif
17772 
17773 SIMDE_FUNCTION_ATTRIBUTES
17774 simde__m128d
17775 simde_mm_cvtss_sd (simde__m128d a, simde__m128 b) {
17776   #if defined(SIMDE_X86_SSE2_NATIVE)
17777     return _mm_cvtss_sd(a, b);
17778   #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
17779     float64x2_t temp = vcvt_f64_f32(vset_lane_f32(vgetq_lane_f32(simde__m128_to_private(b).neon_f32, 0), vdup_n_f32(0), 0));
17780     return vsetq_lane_f64(vgetq_lane_f64(simde__m128d_to_private(a).neon_f64, 1), temp, 1);
17781   #else
17782     simde__m128d_private
17783       a_ = simde__m128d_to_private(a);
17784     simde__m128_private b_ = simde__m128_to_private(b);
17785 
17786     a_.f64[0] = HEDLEY_STATIC_CAST(simde_float64, b_.f32[0]);
17787 
17788     return simde__m128d_from_private(a_);
17789   #endif
17790 }
17791 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
17792   #define _mm_cvtss_sd(a, b) simde_mm_cvtss_sd(a, b)
17793 #endif
17794 
17795 SIMDE_FUNCTION_ATTRIBUTES
17796 simde__m64
17797 simde_mm_cvttpd_pi32 (simde__m128d a) {
17798   #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
17799     return _mm_cvttpd_pi32(a);
17800   #else
17801     simde__m64_private r_;
17802     simde__m128d_private a_ = simde__m128d_to_private(a);
17803 
17804     #if defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FAST_CONVERSION_RANGE)
17805       SIMDE_CONVERT_VECTOR_(r_.i32, a_.f64);
17806     #else
17807       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
17808         simde_float64 v = a_.f64[i];
17809         #if defined(SIMDE_FAST_CONVERSION_RANGE)
17810           r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, v);
17811         #else
17812           r_.i32[i] = ((v > HEDLEY_STATIC_CAST(simde_float64, INT32_MIN)) && (v < HEDLEY_STATIC_CAST(simde_float64, INT32_MAX))) ?
17813             SIMDE_CONVERT_FTOI(int32_t, v) : INT32_MIN;
17814         #endif
17815       }
17816     #endif
17817 
17818     return simde__m64_from_private(r_);
17819   #endif
17820 }
17821 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
17822   #define _mm_cvttpd_pi32(a) simde_mm_cvttpd_pi32(a)
17823 #endif
17824 
17825 SIMDE_FUNCTION_ATTRIBUTES
17826 simde__m128i
17827 simde_mm_cvttpd_epi32 (simde__m128d a) {
17828   #if defined(SIMDE_X86_SSE2_NATIVE)
17829     return _mm_cvttpd_epi32(a);
17830   #else
17831     simde__m128i_private r_;
17832 
17833     r_.m64[0] = simde_mm_cvttpd_pi32(a);
17834     r_.m64[1] = simde_mm_setzero_si64();
17835 
17836     return simde__m128i_from_private(r_);
17837   #endif
17838 }
17839 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
17840   #define _mm_cvttpd_epi32(a) simde_mm_cvttpd_epi32(a)
17841 #endif
17842 
17843 SIMDE_FUNCTION_ATTRIBUTES
17844 simde__m128i
17845 simde_mm_cvttps_epi32 (simde__m128 a) {
17846   #if defined(SIMDE_X86_SSE2_NATIVE)
17847     return _mm_cvttps_epi32(a);
17848   #else
17849     simde__m128i_private r_;
17850     simde__m128_private a_ = simde__m128_to_private(a);
17851 
17852     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_FAST_CONVERSION_RANGE)
17853       r_.neon_i32 = vcvtq_s32_f32(a_.neon_f32);
17854     #elif defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FAST_CONVERSION_RANGE)
17855       SIMDE_CONVERT_VECTOR_(r_.i32, a_.f32);
17856     #else
17857       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
17858         simde_float32 v = a_.f32[i];
17859         #if defined(SIMDE_FAST_CONVERSION_RANGE)
17860           r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, v);
17861         #else
17862           r_.i32[i] = ((v > HEDLEY_STATIC_CAST(simde_float32, INT32_MIN)) && (v < HEDLEY_STATIC_CAST(simde_float32, INT32_MAX))) ?
17863             SIMDE_CONVERT_FTOI(int32_t, v) : INT32_MIN;
17864         #endif
17865       }
17866     #endif
17867 
17868     return simde__m128i_from_private(r_);
17869   #endif
17870 }
17871 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
17872   #define _mm_cvttps_epi32(a) simde_mm_cvttps_epi32(a)
17873 #endif
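/* The cvtt* conversions truncate toward zero instead of rounding (the
 * portable path converts the value directly, without simde_math_roundf).
 * Like the rounding variants, they fall back to INT32_MIN for values
 * outside the int32_t range unless SIMDE_FAST_CONVERSION_RANGE is defined. */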
17874 
17875 SIMDE_FUNCTION_ATTRIBUTES
17876 int32_t
17877 simde_mm_cvttsd_si32 (simde__m128d a) {
17878   #if defined(SIMDE_X86_SSE2_NATIVE)
17879     return _mm_cvttsd_si32(a);
17880   #else
17881     simde__m128d_private a_ = simde__m128d_to_private(a);
17882     simde_float64 v = a_.f64[0];
17883     #if defined(SIMDE_FAST_CONVERSION_RANGE)
17884       return SIMDE_CONVERT_FTOI(int32_t, v);
17885     #else
17886       return ((v > HEDLEY_STATIC_CAST(simde_float64, INT32_MIN)) && (v < HEDLEY_STATIC_CAST(simde_float64, INT32_MAX))) ?
17887         SIMDE_CONVERT_FTOI(int32_t, v) : INT32_MIN;
17888     #endif
17889   #endif
17890 }
17891 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
17892   #define _mm_cvttsd_si32(a) simde_mm_cvttsd_si32(a)
17893 #endif
17894 
17895 SIMDE_FUNCTION_ATTRIBUTES
17896 int64_t
17897 simde_mm_cvttsd_si64 (simde__m128d a) {
17898   #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
17899     #if !defined(__PGI)
17900       return _mm_cvttsd_si64(a);
17901     #else
17902       return _mm_cvttsd_si64x(a);
17903     #endif
17904   #else
17905     simde__m128d_private a_ = simde__m128d_to_private(a);
17906     return SIMDE_CONVERT_FTOI(int64_t, a_.f64[0]);
17907   #endif
17908 }
17909 #define simde_mm_cvttsd_si64x(a) simde_mm_cvttsd_si64(a)
17910 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_AMD64))
17911   #define _mm_cvttsd_si64(a) simde_mm_cvttsd_si64(a)
17912   #define _mm_cvttsd_si64x(a) simde_mm_cvttsd_si64x(a)
17913 #endif
17914 
17915 SIMDE_FUNCTION_ATTRIBUTES
17916 simde__m128d
17917 simde_mm_div_pd (simde__m128d a, simde__m128d b) {
17918   #if defined(SIMDE_X86_SSE2_NATIVE)
17919     return _mm_div_pd(a, b);
17920   #else
17921     simde__m128d_private
17922       r_,
17923       a_ = simde__m128d_to_private(a),
17924       b_ = simde__m128d_to_private(b);
17925 
17926     #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
17927       r_.f64 = a_.f64 / b_.f64;
17928     #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
17929       r_.neon_f64 = vdivq_f64(a_.neon_f64, b_.neon_f64);
17930     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
17931       r_.wasm_v128 =  wasm_f64x2_div(a_.wasm_v128, b_.wasm_v128);
17932     #else
17933       SIMDE_VECTORIZE
17934       for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
17935         r_.f64[i] = a_.f64[i] / b_.f64[i];
17936       }
17937     #endif
17938 
17939     return simde__m128d_from_private(r_);
17940   #endif
17941 }
17942 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
17943   #define _mm_div_pd(a, b) simde_mm_div_pd(a, b)
17944 #endif
17945 
17946 SIMDE_FUNCTION_ATTRIBUTES
17947 simde__m128d
17948 simde_mm_div_sd (simde__m128d a, simde__m128d b) {
17949   #if defined(SIMDE_X86_SSE2_NATIVE)
17950     return _mm_div_sd(a, b);
17951   #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
17952     return simde_mm_move_sd(a, simde_mm_div_pd(a, b));
17953   #else
17954     simde__m128d_private
17955       r_,
17956       a_ = simde__m128d_to_private(a),
17957       b_ = simde__m128d_to_private(b);
17958 
17959     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
17960       float64x2_t temp = vdivq_f64(a_.neon_f64, b_.neon_f64);
17961       r_.neon_f64 = vsetq_lane_f64(vgetq_lane_f64(a_.neon_f64, 1), temp, 1);
17962     #else
17963       r_.f64[0] = a_.f64[0] / b_.f64[0];
17964       r_.f64[1] = a_.f64[1];
17965     #endif
17966 
17967     return simde__m128d_from_private(r_);
17968   #endif
17969 }
17970 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
17971   #define _mm_div_sd(a, b) simde_mm_div_sd(a, b)
17972 #endif
17973 
17974 SIMDE_FUNCTION_ATTRIBUTES
17975 int32_t
17976 simde_mm_extract_epi16 (simde__m128i a, const int imm8)
17977     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 7)  {
17978   uint16_t r;
17979   simde__m128i_private a_ = simde__m128i_to_private(a);
17980 
17981   #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
17982     #if defined(SIMDE_BUG_GCC_95227)
17983       (void) a_;
17984       (void) imm8;
17985     #endif
17986     r = HEDLEY_STATIC_CAST(uint16_t, vec_extract(a_.altivec_i16, imm8));
17987   #else
17988     r = a_.u16[imm8 & 7];
17989   #endif
17990 
17991   return  HEDLEY_STATIC_CAST(int32_t, r);
17992 }
17993 #if defined(SIMDE_X86_SSE2_NATIVE) && (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(4,6,0))
17994   #define simde_mm_extract_epi16(a, imm8) _mm_extract_epi16(a, imm8)
17995 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
17996   #define simde_mm_extract_epi16(a, imm8) (HEDLEY_STATIC_CAST(int32_t, vgetq_lane_s16(simde__m128i_to_private(a).neon_i16, (imm8))) & (INT32_C(0x0000ffff)))
17997 #endif
17998 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
17999   #define _mm_extract_epi16(a, imm8) simde_mm_extract_epi16(a, imm8)
18000 #endif
18001 
18002 SIMDE_FUNCTION_ATTRIBUTES
18003 simde__m128i
18004 simde_mm_insert_epi16 (simde__m128i a, int16_t i, const int imm8)
18005     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 7)  {
18006   simde__m128i_private a_ = simde__m128i_to_private(a);
18007   a_.i16[imm8 & 7] = i;
18008   return simde__m128i_from_private(a_);
18009 }
18010 #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
18011   #define simde_mm_insert_epi16(a, i, imm8) _mm_insert_epi16((a), (i), (imm8))
18012 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
18013   #define simde_mm_insert_epi16(a, i, imm8) simde__m128i_from_neon_i16(vsetq_lane_s16((i), simde__m128i_to_neon_i16(a), (imm8)))
18014 #endif
18015 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
18016   #define _mm_insert_epi16(a, i, imm8) simde_mm_insert_epi16(a, i, imm8)
18017 #endif
18018 
18019 SIMDE_FUNCTION_ATTRIBUTES
18020 simde__m128d
18021 simde_mm_load_pd (simde_float64 const mem_addr[HEDLEY_ARRAY_PARAM(2)]) {
18022   #if defined(SIMDE_X86_SSE2_NATIVE)
18023     return _mm_load_pd(mem_addr);
18024   #else
18025     simde__m128d_private r_;
18026 
18027     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
18028       r_.neon_f64 = vld1q_f64(mem_addr);
18029     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
18030       r_.neon_u32 = vld1q_u32(HEDLEY_REINTERPRET_CAST(uint32_t const*, mem_addr));
18031     #else
18032       simde_memcpy(&r_, SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m128d), sizeof(r_));
18033     #endif
18034 
18035     return simde__m128d_from_private(r_);
18036   #endif
18037 }
18038 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
18039   #define _mm_load_pd(mem_addr) simde_mm_load_pd(mem_addr)
18040 #endif
18041 
18042 SIMDE_FUNCTION_ATTRIBUTES
18043 simde__m128d
18044 simde_mm_load1_pd (simde_float64 const* mem_addr) {
18045   #if defined(SIMDE_X86_SSE2_NATIVE)
18046     return _mm_load1_pd(mem_addr);
18047   #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
18048     return simde__m128d_from_neon_f64(vld1q_dup_f64(mem_addr));
18049   #elif defined(SIMDE_WASM_SIMD128_NATIVE)
18050     return simde__m128d_from_wasm_v128(wasm_v64x2_load_splat(mem_addr));
18051   #else
18052     return simde_mm_set1_pd(*mem_addr);
18053   #endif
18054 }
18055 #define simde_mm_load_pd1(mem_addr) simde_mm_load1_pd(mem_addr)
18056 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
18057   #define _mm_load_pd1(mem_addr) simde_mm_load1_pd(mem_addr)
18058   #define _mm_load1_pd(mem_addr) simde_mm_load1_pd(mem_addr)
18059 #endif
18060 
18061 SIMDE_FUNCTION_ATTRIBUTES
18062 simde__m128d
18063 simde_mm_load_sd (simde_float64 const* mem_addr) {
18064   #if defined(SIMDE_X86_SSE2_NATIVE)
18065     return _mm_load_sd(mem_addr);
18066   #else
18067     simde__m128d_private r_;
18068 
18069     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
18070       r_.neon_f64 = vsetq_lane_f64(*mem_addr, vdupq_n_f64(0), 0);
18071     #else
18072       r_.f64[0] = *mem_addr;
18073       r_.u64[1] = UINT64_C(0);
18074     #endif
18075 
18076     return simde__m128d_from_private(r_);
18077   #endif
18078 }
18079 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
18080   #define _mm_load_sd(mem_addr) simde_mm_load_sd(mem_addr)
18081 #endif
18082 
18083 SIMDE_FUNCTION_ATTRIBUTES
18084 simde__m128i
18085 simde_mm_load_si128 (simde__m128i const* mem_addr) {
18086   #if defined(SIMDE_X86_SSE2_NATIVE)
18087     return _mm_load_si128(HEDLEY_REINTERPRET_CAST(__m128i const*, mem_addr));
18088   #else
18089     simde__m128i_private r_;
18090 
18091     #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
18092       r_.altivec_i32 = vec_ld(0, HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(int) const*, mem_addr));
18093     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
18094       r_.neon_i32 = vld1q_s32(HEDLEY_REINTERPRET_CAST(int32_t const*, mem_addr));
18095     #else
18096       simde_memcpy(&r_, SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m128i), sizeof(simde__m128i));
18097     #endif
18098 
18099     return simde__m128i_from_private(r_);
18100   #endif
18101 }
18102 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
18103   #define _mm_load_si128(mem_addr) simde_mm_load_si128(mem_addr)
18104 #endif
18105 
18106 SIMDE_FUNCTION_ATTRIBUTES
18107 simde__m128d
18108 simde_mm_loadh_pd (simde__m128d a, simde_float64 const* mem_addr) {
18109   #if defined(SIMDE_X86_SSE2_NATIVE)
18110     return _mm_loadh_pd(a, mem_addr);
18111   #else
18112     simde__m128d_private
18113       r_,
18114       a_ = simde__m128d_to_private(a);
18115 
18116     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
18117       r_.neon_f64 = vcombine_f64(vget_low_f64(a_.neon_f64), vld1_f64(HEDLEY_REINTERPRET_CAST(const float64_t*, mem_addr)));
18118     #else
18119       simde_float64 t;
18120 
18121       simde_memcpy(&t, mem_addr, sizeof(t));
18122       r_.f64[0] = a_.f64[0];
18123       r_.f64[1] = t;
18124     #endif
18125 
18126     return simde__m128d_from_private(r_);
18127   #endif
18128 }
18129 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
18130   #define _mm_loadh_pd(a, mem_addr) simde_mm_loadh_pd(a, mem_addr)
18131 #endif
18132 
18133 SIMDE_FUNCTION_ATTRIBUTES
18134 simde__m128i
18135 simde_mm_loadl_epi64 (simde__m128i const* mem_addr) {
18136   #if defined(SIMDE_X86_SSE2_NATIVE)
18137     return _mm_loadl_epi64(mem_addr);
18138   #else
18139     simde__m128i_private r_;
18140 
18141     int64_t value;
18142     simde_memcpy(&value, mem_addr, sizeof(value));
18143 
18144     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
18145       r_.neon_i64 = vcombine_s64(vld1_s64(HEDLEY_REINTERPRET_CAST(int64_t const *, mem_addr)), vdup_n_s64(0));
18146     #else
18147       r_.i64[0] = value;
18148       r_.i64[1] = 0;
18149     #endif
18150 
18151     return simde__m128i_from_private(r_);
18152   #endif
18153 }
18154 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
18155   #define _mm_loadl_epi64(mem_addr) simde_mm_loadl_epi64(mem_addr)
18156 #endif
18157 
18158 SIMDE_FUNCTION_ATTRIBUTES
18159 simde__m128d
18160 simde_mm_loadl_pd (simde__m128d a, simde_float64 const* mem_addr) {
18161   #if defined(SIMDE_X86_SSE2_NATIVE)
18162     return _mm_loadl_pd(a, mem_addr);
18163   #else
18164     simde__m128d_private
18165       r_,
18166       a_ = simde__m128d_to_private(a);
18167 
18168     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
18169       r_.neon_f64 = vcombine_f64(vld1_f64(
18170         HEDLEY_REINTERPRET_CAST(const float64_t*, mem_addr)), vget_high_f64(a_.neon_f64));
18171     #else
18172       r_.f64[0] = *mem_addr;
18173       r_.u64[1] = a_.u64[1];
18174     #endif
18175 
18176     return simde__m128d_from_private(r_);
18177   #endif
18178 }
18179 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
18180   #define _mm_loadl_pd(a, mem_addr) simde_mm_loadl_pd(a, mem_addr)
18181 #endif
18182 
18183 SIMDE_FUNCTION_ATTRIBUTES
18184 simde__m128d
18185 simde_mm_loadr_pd (simde_float64 const mem_addr[HEDLEY_ARRAY_PARAM(2)]) {
18186   #if defined(SIMDE_X86_SSE2_NATIVE)
18187     return _mm_loadr_pd(mem_addr);
18188   #else
18189     simde__m128d_private
18190       r_;
18191 
18192     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
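      /* Load both doubles, then rotate the vector by one 64-bit lane to swap them into reversed order. */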
18193       r_.neon_f64 = vld1q_f64(mem_addr);
18194       r_.neon_f64 = vextq_f64(r_.neon_f64, r_.neon_f64, 1);
18195     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
18196       r_.neon_i64 = vld1q_s64(HEDLEY_REINTERPRET_CAST(int64_t const *, mem_addr));
18197       r_.neon_i64 = vextq_s64(r_.neon_i64, r_.neon_i64, 1);
18198     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
18199       v128_t tmp = wasm_v128_load(mem_addr);
18200       r_.wasm_v128 = wasm_v64x2_shuffle(tmp, tmp, 1, 0);
18201     #else
18202       r_.f64[0] = mem_addr[1];
18203       r_.f64[1] = mem_addr[0];
18204     #endif
18205 
18206     return simde__m128d_from_private(r_);
18207   #endif
18208 }
18209 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
18210   #define _mm_loadr_pd(mem_addr) simde_mm_loadr_pd(mem_addr)
18211 #endif
18212 
18213 SIMDE_FUNCTION_ATTRIBUTES
18214 simde__m128d
18215 simde_mm_loadu_pd (simde_float64 const mem_addr[HEDLEY_ARRAY_PARAM(2)]) {
18216   #if defined(SIMDE_X86_SSE2_NATIVE)
18217     return _mm_loadu_pd(mem_addr);
18218   #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
18219     return vld1q_f64(mem_addr);
18220   #else
18221     simde__m128d_private r_;
18222 
18223     simde_memcpy(&r_, mem_addr, sizeof(r_));
18224 
18225     return simde__m128d_from_private(r_);
18226   #endif
18227 }
18228 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
18229   #define _mm_loadu_pd(mem_addr) simde_mm_loadu_pd(mem_addr)
18230 #endif
18231 
18232 SIMDE_FUNCTION_ATTRIBUTES
18233 simde__m128i
18234 simde_mm_loadu_epi8(void const * mem_addr) {
18235   #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) && !defined(SIMDE_BUG_GCC_95483) && !defined(SIMDE_BUG_CLANG_REV_344862)
18236     return _mm_loadu_epi8(mem_addr);
18237   #elif defined(SIMDE_X86_SSE2_NATIVE)
18238     return _mm_loadu_si128(SIMDE_ALIGN_CAST(__m128i const *, mem_addr));
18239   #else
18240     simde__m128i_private r_;
18241 
18242     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
18243       r_.neon_i8 = vld1q_s8(HEDLEY_REINTERPRET_CAST(int8_t const*, mem_addr));
18244     #else
18245       simde_memcpy(&r_, mem_addr, sizeof(r_));
18246     #endif
18247 
18248     return simde__m128i_from_private(r_);
18249   #endif
18250 }
18251 #define simde_x_mm_loadu_epi8(mem_addr) simde_mm_loadu_epi8(mem_addr)
18252 #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && (defined(SIMDE_BUG_GCC_95483) || defined(SIMDE_BUG_CLANG_REV_344862)))
18253   #undef _mm_loadu_epi8
18254   #define _mm_loadu_epi8(a) simde_mm_loadu_epi8(a)
18255 #endif
18256 
18257 SIMDE_FUNCTION_ATTRIBUTES
18258 simde__m128i
18259 simde_mm_loadu_epi16(void const * mem_addr) {
18260   #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) && !defined(SIMDE_BUG_GCC_95483) && !defined(SIMDE_BUG_CLANG_REV_344862)
18261     return _mm_loadu_epi16(mem_addr);
18262   #elif defined(SIMDE_X86_SSE2_NATIVE)
18263     return _mm_loadu_si128(SIMDE_ALIGN_CAST(__m128i const *, mem_addr));
18264   #else
18265     simde__m128i_private r_;
18266 
18267     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
18268       r_.neon_i16 = vreinterpretq_s16_s8(vld1q_s8(HEDLEY_REINTERPRET_CAST(int8_t const*, mem_addr)));
18269     #else
18270       simde_memcpy(&r_, mem_addr, sizeof(r_));
18271     #endif
18272 
18273     return simde__m128i_from_private(r_);
18274   #endif
18275 }
18276 #define simde_x_mm_loadu_epi16(mem_addr) simde_mm_loadu_epi16(mem_addr)
18277 #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && (defined(SIMDE_BUG_GCC_95483) || defined(SIMDE_BUG_CLANG_REV_344862)))
18278   #undef _mm_loadu_epi16
18279   #define _mm_loadu_epi16(a) simde_mm_loadu_epi16(a)
18280 #endif
18281 
18282 SIMDE_FUNCTION_ATTRIBUTES
18283 simde__m128i
18284 simde_mm_loadu_epi32(void const * mem_addr) {
18285   #if defined(SIMDE_X86_AVX512VL_NATIVE) && !defined(SIMDE_BUG_GCC_95483) && !defined(SIMDE_BUG_CLANG_REV_344862)
18286     return _mm_loadu_epi32(mem_addr);
18287   #elif defined(SIMDE_X86_SSE2_NATIVE)
18288     return _mm_loadu_si128(SIMDE_ALIGN_CAST(__m128i const *, mem_addr));
18289   #else
18290     simde__m128i_private r_;
18291 
18292     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
18293       r_.neon_i32 = vreinterpretq_s32_s8(vld1q_s8(HEDLEY_REINTERPRET_CAST(int8_t const*, mem_addr)));
18294     #else
18295       simde_memcpy(&r_, mem_addr, sizeof(r_));
18296     #endif
18297 
18298     return simde__m128i_from_private(r_);
18299   #endif
18300 }
18301 #define simde_x_mm_loadu_epi32(mem_addr) simde_mm_loadu_epi32(mem_addr)
18302 #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && (defined(SIMDE_BUG_GCC_95483) || defined(SIMDE_BUG_CLANG_REV_344862)))
18303   #undef _mm_loadu_epi32
18304   #define _mm_loadu_epi32(a) simde_mm_loadu_epi32(a)
18305 #endif
18306 
18307 SIMDE_FUNCTION_ATTRIBUTES
18308 simde__m128i
18309 simde_mm_loadu_epi64(void const * mem_addr) {
18310   #if defined(SIMDE_X86_AVX512VL_NATIVE) && !defined(SIMDE_BUG_GCC_95483) && !defined(SIMDE_BUG_CLANG_REV_344862)
18311     return _mm_loadu_epi64(mem_addr);
18312   #elif defined(SIMDE_X86_SSE2_NATIVE)
18313     return _mm_loadu_si128(SIMDE_ALIGN_CAST(__m128i const *, mem_addr));
18314   #else
18315     simde__m128i_private r_;
18316 
18317     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
18318       r_.neon_i64 = vreinterpretq_s64_s8(vld1q_s8(HEDLEY_REINTERPRET_CAST(int8_t const*, mem_addr)));
18319     #else
18320       simde_memcpy(&r_, mem_addr, sizeof(r_));
18321     #endif
18322 
18323     return simde__m128i_from_private(r_);
18324   #endif
18325 }
18326 #define simde_x_mm_loadu_epi64(mem_addr) simde_mm_loadu_epi64(mem_addr)
18327 #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && (defined(SIMDE_BUG_GCC_95483) || defined(SIMDE_BUG_CLANG_REV_344862)))
18328   #undef _mm_loadu_epi64
18329   #define _mm_loadu_epi64(a) simde_mm_loadu_epi64(a)
18330 #endif
18331 
18332 SIMDE_FUNCTION_ATTRIBUTES
18333 simde__m128i
18334 simde_mm_loadu_si128 (void const* mem_addr) {
18335   #if defined(SIMDE_X86_SSE2_NATIVE)
18336     return _mm_loadu_si128(HEDLEY_STATIC_CAST(__m128i const*, mem_addr));
18337   #else
18338     simde__m128i_private r_;
18339 
18340     #if HEDLEY_GNUC_HAS_ATTRIBUTE(may_alias,3,3,0)
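      /* Reading through a packed, may_alias struct lets the compiler emit an unaligned vector load without undefined behaviour for misaligned or aliasing pointers. */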
18341       HEDLEY_DIAGNOSTIC_PUSH
18342       SIMDE_DIAGNOSTIC_DISABLE_PACKED_
18343       struct simde_mm_loadu_si128_s {
18344         __typeof__(r_) v;
18345       } __attribute__((__packed__, __may_alias__));
18346       r_ = HEDLEY_REINTERPRET_CAST(const struct simde_mm_loadu_si128_s *, mem_addr)->v;
18347       HEDLEY_DIAGNOSTIC_POP
18348     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
18349       r_.neon_i8 = vld1q_s8(HEDLEY_REINTERPRET_CAST(int8_t const*, mem_addr));
18350     #else
18351       simde_memcpy(&r_, mem_addr, sizeof(r_));
18352     #endif
18353 
18354     return simde__m128i_from_private(r_);
18355   #endif
18356 }
18357 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
18358   #define _mm_loadu_si128(mem_addr) simde_mm_loadu_si128(mem_addr)
18359 #endif
18360 
18361 SIMDE_FUNCTION_ATTRIBUTES
18362 simde__m128i
18363 simde_mm_madd_epi16 (simde__m128i a, simde__m128i b) {
18364   #if defined(SIMDE_X86_SSE2_NATIVE)
18365     return _mm_madd_epi16(a, b);
18366   #else
18367     simde__m128i_private
18368       r_,
18369       a_ = simde__m128i_to_private(a),
18370       b_ = simde__m128i_to_private(b);
18371 
18372     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
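      /* Widening 16x16->32 multiplies of the low and high halves, then a pairwise add folds adjacent products into the four 32-bit results. */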
18373       int32x4_t pl = vmull_s16(vget_low_s16(a_.neon_i16),  vget_low_s16(b_.neon_i16));
18374       int32x4_t ph = vmull_high_s16(a_.neon_i16, b_.neon_i16);
18375       r_.neon_i32 = vpaddq_s32(pl, ph);
18376     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
18377       int32x4_t pl = vmull_s16(vget_low_s16(a_.neon_i16),  vget_low_s16(b_.neon_i16));
18378       int32x4_t ph = vmull_s16(vget_high_s16(a_.neon_i16), vget_high_s16(b_.neon_i16));
18379       int32x2_t rl = vpadd_s32(vget_low_s32(pl), vget_high_s32(pl));
18380       int32x2_t rh = vpadd_s32(vget_low_s32(ph), vget_high_s32(ph));
18381       r_.neon_i32 = vcombine_s32(rl, rh);
18382     #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
18383       static const SIMDE_POWER_ALTIVEC_VECTOR(int) tz = { 0, 0, 0, 0 };
18384       r_.altivec_i32 = vec_msum(a_.altivec_i16, b_.altivec_i16, tz);
18385     #else
18386       SIMDE_VECTORIZE
18387       for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.i16[0])) ; i += 2) {
18388         r_.i32[i / 2] = (a_.i16[i] * b_.i16[i]) + (a_.i16[i + 1] * b_.i16[i + 1]);
18389       }
18390     #endif
18391 
18392     return simde__m128i_from_private(r_);
18393   #endif
18394 }
18395 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
18396   #define _mm_madd_epi16(a, b) simde_mm_madd_epi16(a, b)
18397 #endif
18398 
18399 SIMDE_FUNCTION_ATTRIBUTES
18400 void
18401 simde_mm_maskmoveu_si128 (simde__m128i a, simde__m128i mask, int8_t mem_addr[HEDLEY_ARRAY_PARAM(16)]) {
18402   #if defined(SIMDE_X86_SSE2_NATIVE)
18403     _mm_maskmoveu_si128(a, mask, HEDLEY_REINTERPRET_CAST(char*, mem_addr));
18404   #else
18405     simde__m128i_private
18406       a_ = simde__m128i_to_private(a),
18407       mask_ = simde__m128i_to_private(mask);
18408 
18409     for (size_t i = 0 ; i < (sizeof(a_.i8) / sizeof(a_.i8[0])) ; i++) {
18410       if (mask_.u8[i] & 0x80) {
18411         mem_addr[i] = a_.i8[i];
18412       }
18413     }
18414   #endif
18415 }
18416 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
18417   #define _mm_maskmoveu_si128(a, mask, mem_addr) simde_mm_maskmoveu_si128((a), (mask), SIMDE_CHECKED_REINTERPRET_CAST(int8_t*, char*, (mem_addr)))
18418 #endif
18419 
18420 SIMDE_FUNCTION_ATTRIBUTES
18421 int32_t
18422 simde_mm_movemask_epi8 (simde__m128i a) {
18423   #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__INTEL_COMPILER)
18424     /* ICC has trouble with _mm_movemask_epi8 at -O2 and above: */
18425     return _mm_movemask_epi8(a);
18426   #else
18427     int32_t r = 0;
18428     simde__m128i_private a_ = simde__m128i_to_private(a);
18429 
18430     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
18431       // Use increasingly wide shifts+adds to collect the sign bits
18432       // together.
18433       // Since the widening shifts would be rather confusing to follow in little endian, everything
18434       // will be illustrated in big endian order instead. This has a different result - the bits
18435       // would actually be reversed on a big endian machine.
18436 
18437       // Starting input (only half the elements are shown):
18438       // 89 ff 1d c0 00 10 99 33
18439       uint8x16_t input = a_.neon_u8;
18440 
18441       // Shift out everything but the sign bits with an unsigned shift right.
18442       //
18443       // Bytes of the vector:
18444       // 89 ff 1d c0 00 10 99 33
18445       // \  \  \  \  \  \  \  \    high_bits = (uint16x4_t)(input >> 7)
18446       //  |  |  |  |  |  |  |  |
18447       // 01 01 00 01 00 00 01 00
18448       //
18449       // Bits of first important lane(s):
18450       // 10001001 (89)
18451       // \______
18452       //        |
18453       // 00000001 (01)
18454       uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7));
18455 
18456       // Merge the even lanes together with a 16-bit unsigned shift right + add.
18457       // 'xx' represents garbage data which will be ignored in the final result.
18458       // In the important bytes, the add functions like a binary OR.
18459       //
18460       // 01 01 00 01 00 00 01 00
18461       //  \_ |  \_ |  \_ |  \_ |   paired16 = (uint32x4_t)(input + (input >> 7))
18462       //    \|    \|    \|    \|
18463       // xx 03 xx 01 xx 00 xx 02
18464       //
18465       // 00000001 00000001 (01 01)
18466       //        \_______ |
18467       //                \|
18468       // xxxxxxxx xxxxxx11 (xx 03)
18469       uint32x4_t paired16 = vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));
18470 
18471       // Repeat with a wider 32-bit shift + add.
18472       // xx 03 xx 01 xx 00 xx 02
18473       //     \____ |     \____ |  paired32 = (uint64x1_t)(paired16 + (paired16 >> 14))
18474       //          \|          \|
18475       // xx xx xx 0d xx xx xx 02
18476       //
18477       // 00000011 00000001 (03 01)
18478       //        \\_____ ||
18479       //         '----.\||
18480       // xxxxxxxx xxxx1101 (xx 0d)
18481       uint64x2_t paired32 = vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));
18482 
18483       // Last, an even wider 64-bit shift + add to get our result in the low 8 bit lanes.
18484       // xx xx xx 0d xx xx xx 02
18485       //            \_________ |   paired64 = (uint8x8_t)(paired32 + (paired32 >> 28))
18486       //                      \|
18487       // xx xx xx xx xx xx xx d2
18488       //
18489       // 00001101 00000010 (0d 02)
18490       //     \   \___ |  |
18491       //      '---.  \|  |
18492       // xxxxxxxx 11010010 (xx d2)
18493       uint8x16_t paired64 = vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));
18494 
18495       // Extract the low 8 bits from each 64-bit lane with 2 8-bit extracts.
18496       // xx xx xx xx xx xx xx d2
18497       //                      ||  return paired64[0]
18498       //                      d2
18499       // Note: Little endian would return the correct value 4b (01001011) instead.
18500       r = vgetq_lane_u8(paired64, 0) | (HEDLEY_STATIC_CAST(int32_t, vgetq_lane_u8(paired64, 8)) << 8);
18501     #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) && !defined(HEDLEY_IBM_VERSION) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
18502       static const SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) perm = { 120, 112, 104, 96, 88, 80, 72, 64, 56, 48, 40, 32, 24, 16, 8, 0 };
18503       r = HEDLEY_STATIC_CAST(int32_t, vec_extract(vec_vbpermq(a_.altivec_u8, perm), 1));
18504     #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) && !defined(HEDLEY_IBM_VERSION) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_BIG)
18505       static const SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) perm = { 120, 112, 104, 96, 88, 80, 72, 64, 56, 48, 40, 32, 24, 16, 8, 0 };
18506       r = HEDLEY_STATIC_CAST(int32_t, vec_extract(vec_vbpermq(a_.altivec_u8, perm), 14));
18507     #else
18508       SIMDE_VECTORIZE_REDUCTION(|:r)
18509       for (size_t i = 0 ; i < (sizeof(a_.u8) / sizeof(a_.u8[0])) ; i++) {
18510         r |= (a_.u8[15 - i] >> 7) << (15 - i);
18511       }
18512     #endif
18513 
18514     return r;
18515   #endif
18516 }
18517 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
18518   #define _mm_movemask_epi8(a) simde_mm_movemask_epi8(a)
18519 #endif
18520 
18521 SIMDE_FUNCTION_ATTRIBUTES
18522 int32_t
18523 simde_mm_movemask_pd (simde__m128d a) {
18524   #if defined(SIMDE_X86_SSE2_NATIVE)
18525     return _mm_movemask_pd(a);
18526   #else
18527     int32_t r = 0;
18528     simde__m128d_private a_ = simde__m128d_to_private(a);
18529 
18530     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
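      /* Shift each 64-bit lane right by 63 to isolate its sign bit, shift lane i left by i, then sum the lanes to form the 2-bit mask. */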
18531       static const int64_t shift_amount[] = { 0, 1 };
18532       const int64x2_t shift = vld1q_s64(shift_amount);
18533       uint64x2_t tmp = vshrq_n_u64(a_.neon_u64, 63);
18534       return HEDLEY_STATIC_CAST(int32_t, vaddvq_u64(vshlq_u64(tmp, shift)));
18535     #else
18536       SIMDE_VECTORIZE_REDUCTION(|:r)
18537       for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) {
18538         r |= (a_.u64[i] >> 63) << i;
18539       }
18540     #endif
18541 
18542     return r;
18543   #endif
18544 }
18545 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
18546   #define _mm_movemask_pd(a) simde_mm_movemask_pd(a)
18547 #endif
18548 
18549 SIMDE_FUNCTION_ATTRIBUTES
18550 simde__m64
18551 simde_mm_movepi64_pi64 (simde__m128i a) {
18552   #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
18553     return _mm_movepi64_pi64(a);
18554   #else
18555     simde__m64_private r_;
18556     simde__m128i_private a_ = simde__m128i_to_private(a);
18557 
18558     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
18559       r_.neon_i64 = vget_low_s64(a_.neon_i64);
18560     #else
18561       r_.i64[0] = a_.i64[0];
18562     #endif
18563 
18564     return simde__m64_from_private(r_);
18565   #endif
18566 }
18567 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
18568   #define _mm_movepi64_pi64(a) simde_mm_movepi64_pi64(a)
18569 #endif
18570 
18571 SIMDE_FUNCTION_ATTRIBUTES
18572 simde__m128i
18573 simde_mm_movpi64_epi64 (simde__m64 a) {
18574   #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
18575     return _mm_movpi64_epi64(a);
18576   #else
18577     simde__m128i_private r_;
18578     simde__m64_private a_ = simde__m64_to_private(a);
18579 
18580     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
18581       r_.neon_i64 = vcombine_s64(a_.neon_i64, vdup_n_s64(0));
18582     #else
18583       r_.i64[0] = a_.i64[0];
18584       r_.i64[1] = 0;
18585     #endif
18586 
18587     return simde__m128i_from_private(r_);
18588   #endif
18589 }
18590 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
18591   #define _mm_movpi64_epi64(a) simde_mm_movpi64_epi64(a)
18592 #endif
18593 
18594 SIMDE_FUNCTION_ATTRIBUTES
18595 simde__m128i
18596 simde_mm_min_epi16 (simde__m128i a, simde__m128i b) {
18597   #if defined(SIMDE_X86_SSE2_NATIVE)
18598     return _mm_min_epi16(a, b);
18599   #else
18600     simde__m128i_private
18601       r_,
18602       a_ = simde__m128i_to_private(a),
18603       b_ = simde__m128i_to_private(b);
18604 
18605     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
18606       r_.neon_i16 = vminq_s16(a_.neon_i16, b_.neon_i16);
18607     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
18608       r_.wasm_v128 = wasm_i16x8_min(a_.wasm_v128, b_.wasm_v128);
18609     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
18610       r_.altivec_i16 = vec_min(a_.altivec_i16, b_.altivec_i16);
18611     #else
18612       SIMDE_VECTORIZE
18613       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
18614         r_.i16[i] = (a_.i16[i] < b_.i16[i]) ? a_.i16[i] : b_.i16[i];
18615       }
18616     #endif
18617 
18618     return simde__m128i_from_private(r_);
18619   #endif
18620 }
18621 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
18622   #define _mm_min_epi16(a, b) simde_mm_min_epi16(a, b)
18623 #endif
18624 
18625 SIMDE_FUNCTION_ATTRIBUTES
18626 simde__m128i
18627 simde_mm_min_epu8 (simde__m128i a, simde__m128i b) {
18628   #if defined(SIMDE_X86_SSE2_NATIVE)
18629     return _mm_min_epu8(a, b);
18630   #else
18631     simde__m128i_private
18632       r_,
18633       a_ = simde__m128i_to_private(a),
18634       b_ = simde__m128i_to_private(b);
18635 
18636     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
18637       r_.neon_u8 = vminq_u8(a_.neon_u8, b_.neon_u8);
18638     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
18639       r_.wasm_v128 = wasm_u8x16_min(a_.wasm_v128, b_.wasm_v128);
18640     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
18641       r_.altivec_u8 = vec_min(a_.altivec_u8, b_.altivec_u8);
18642     #else
18643       SIMDE_VECTORIZE
18644       for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
18645         r_.u8[i] = (a_.u8[i] < b_.u8[i]) ? a_.u8[i] : b_.u8[i];
18646       }
18647     #endif
18648 
18649     return simde__m128i_from_private(r_);
18650   #endif
18651 }
18652 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
18653   #define _mm_min_epu8(a, b) simde_mm_min_epu8(a, b)
18654 #endif
18655 
18656 SIMDE_FUNCTION_ATTRIBUTES
18657 simde__m128d
18658 simde_mm_min_pd (simde__m128d a, simde__m128d b) {
18659   #if defined(SIMDE_X86_SSE2_NATIVE)
18660     return _mm_min_pd(a, b);
18661   #else
18662     simde__m128d_private
18663       r_,
18664       a_ = simde__m128d_to_private(a),
18665       b_ = simde__m128d_to_private(b);
18666 
18667     #if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
18668       r_.altivec_f64 = vec_min(a_.altivec_f64, b_.altivec_f64);
18669     #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
18670       r_.neon_f64 = vminq_f64(a_.neon_f64, b_.neon_f64);
18671     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
18672       r_.wasm_v128 = wasm_f64x2_min(a_.wasm_v128, b_.wasm_v128);
18673     #else
18674       SIMDE_VECTORIZE
18675       for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
18676         r_.f64[i] = (a_.f64[i] < b_.f64[i]) ? a_.f64[i] : b_.f64[i];
18677       }
18678     #endif
18679 
18680     return simde__m128d_from_private(r_);
18681   #endif
18682 }
18683 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
18684   #define _mm_min_pd(a, b) simde_mm_min_pd(a, b)
18685 #endif
18686 
18687 SIMDE_FUNCTION_ATTRIBUTES
18688 simde__m128d
18689 simde_mm_min_sd (simde__m128d a, simde__m128d b) {
18690   #if defined(SIMDE_X86_SSE2_NATIVE)
18691     return _mm_min_sd(a, b);
18692   #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
18693     return simde_mm_move_sd(a, simde_mm_min_pd(a, b));
18694   #else
18695     simde__m128d_private
18696       r_,
18697       a_ = simde__m128d_to_private(a),
18698       b_ = simde__m128d_to_private(b);
18699 
18700     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
18701       float64x2_t temp = vminq_f64(a_.neon_f64, b_.neon_f64);
18702       r_.neon_f64 = vsetq_lane_f64(vgetq_lane_f64(a_.neon_f64, 1), temp, 1);
18703     #else
18704       r_.f64[0] = (a_.f64[0] < b_.f64[0]) ? a_.f64[0] : b_.f64[0];
18705       r_.f64[1] = a_.f64[1];
18706     #endif
18707 
18708     return simde__m128d_from_private(r_);
18709   #endif
18710 }
18711 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
18712   #define _mm_min_sd(a, b) simde_mm_min_sd(a, b)
18713 #endif
18714 
18715 SIMDE_FUNCTION_ATTRIBUTES
18716 simde__m128i
18717 simde_mm_max_epi16 (simde__m128i a, simde__m128i b) {
18718   #if defined(SIMDE_X86_SSE2_NATIVE)
18719     return _mm_max_epi16(a, b);
18720   #else
18721     simde__m128i_private
18722       r_,
18723       a_ = simde__m128i_to_private(a),
18724       b_ = simde__m128i_to_private(b);
18725 
18726     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
18727       r_.neon_i16 = vmaxq_s16(a_.neon_i16, b_.neon_i16);
18728     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
18729       r_.wasm_v128 = wasm_i16x8_max(a_.wasm_v128, b_.wasm_v128);
18730     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
18731       r_.altivec_i16 = vec_max(a_.altivec_i16, b_.altivec_i16);
18732     #else
18733       SIMDE_VECTORIZE
18734       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
18735         r_.i16[i] = (a_.i16[i] > b_.i16[i]) ? a_.i16[i] : b_.i16[i];
18736       }
18737     #endif
18738 
18739     return simde__m128i_from_private(r_);
18740   #endif
18741 }
18742 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
18743   #define _mm_max_epi16(a, b) simde_mm_max_epi16(a, b)
18744 #endif
18745 
18746 SIMDE_FUNCTION_ATTRIBUTES
18747 simde__m128i
18748 simde_mm_max_epu8 (simde__m128i a, simde__m128i b) {
18749   #if defined(SIMDE_X86_SSE2_NATIVE)
18750     return _mm_max_epu8(a, b);
18751   #else
18752     simde__m128i_private
18753       r_,
18754       a_ = simde__m128i_to_private(a),
18755       b_ = simde__m128i_to_private(b);
18756 
18757     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
18758       r_.neon_u8 = vmaxq_u8(a_.neon_u8, b_.neon_u8);
18759     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
18760       r_.wasm_v128 = wasm_u8x16_max(a_.wasm_v128, b_.wasm_v128);
18761     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
18762       r_.altivec_u8 = vec_max(a_.altivec_u8, b_.altivec_u8);
18763     #else
18764       SIMDE_VECTORIZE
18765       for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
18766         r_.u8[i] = (a_.u8[i] > b_.u8[i]) ? a_.u8[i] : b_.u8[i];
18767       }
18768     #endif
18769 
18770     return simde__m128i_from_private(r_);
18771   #endif
18772 }
18773 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
18774   #define _mm_max_epu8(a, b) simde_mm_max_epu8(a, b)
18775 #endif
18776 
18777 SIMDE_FUNCTION_ATTRIBUTES
18778 simde__m128d
18779 simde_mm_max_pd (simde__m128d a, simde__m128d b) {
18780   #if defined(SIMDE_X86_SSE2_NATIVE)
18781     return _mm_max_pd(a, b);
18782   #else
18783     simde__m128d_private
18784       r_,
18785       a_ = simde__m128d_to_private(a),
18786       b_ = simde__m128d_to_private(b);
18787 
18788     #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
18789       r_.altivec_f64 = vec_max(a_.altivec_f64, b_.altivec_f64);
18790     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
18791       r_.wasm_v128 = wasm_f64x2_max(a_.wasm_v128, b_.wasm_v128);
18792     #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
18793       r_.neon_f64 = vmaxq_f64(a_.neon_f64, b_.neon_f64);
18794     #else
18795       SIMDE_VECTORIZE
18796       for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
18797         r_.f64[i] = (a_.f64[i] > b_.f64[i]) ? a_.f64[i] : b_.f64[i];
18798       }
18799     #endif
18800 
18801     return simde__m128d_from_private(r_);
18802   #endif
18803 }
18804 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
18805   #define _mm_max_pd(a, b) simde_mm_max_pd(a, b)
18806 #endif
18807 
18808 SIMDE_FUNCTION_ATTRIBUTES
18809 simde__m128d
18810 simde_mm_max_sd (simde__m128d a, simde__m128d b) {
18811   #if defined(SIMDE_X86_SSE2_NATIVE)
18812     return _mm_max_sd(a, b);
18813   #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
18814     return simde_mm_move_sd(a, simde_mm_max_pd(a, b));
18815   #else
18816     simde__m128d_private
18817       r_,
18818       a_ = simde__m128d_to_private(a),
18819       b_ = simde__m128d_to_private(b);
18820 
18821     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
18822       float64x2_t temp = vmaxq_f64(a_.neon_f64, b_.neon_f64);
18823       r_.neon_f64 = vsetq_lane_f64(vgetq_lane_f64(a_.neon_f64, 1), temp, 1);
18824     #else
18825       r_.f64[0] = (a_.f64[0] > b_.f64[0]) ? a_.f64[0] : b_.f64[0];
18826       r_.f64[1] = a_.f64[1];
18827     #endif
18828 
18829     return simde__m128d_from_private(r_);
18830   #endif
18831 }
18832 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
18833   #define _mm_max_sd(a, b) simde_mm_max_sd(a, b)
18834 #endif
18835 
18836 SIMDE_FUNCTION_ATTRIBUTES
18837 simde__m128i
18838 simde_mm_move_epi64 (simde__m128i a) {
18839   #if defined(SIMDE_X86_SSE2_NATIVE)
18840     return _mm_move_epi64(a);
18841   #else
18842     simde__m128i_private
18843       r_,
18844       a_ = simde__m128i_to_private(a);
18845 
18846     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
18847       r_.neon_i64 = vsetq_lane_s64(0, a_.neon_i64, 1);
18848     #else
18849       r_.i64[0] = a_.i64[0];
18850       r_.i64[1] = 0;
18851     #endif
18852 
18853     return simde__m128i_from_private(r_);
18854   #endif
18855 }
18856 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
18857   #define _mm_move_epi64(a) simde_mm_move_epi64(a)
18858 #endif
18859 
18860 SIMDE_FUNCTION_ATTRIBUTES
18861 simde__m128i
18862 simde_mm_mul_epu32 (simde__m128i a, simde__m128i b) {
18863   #if defined(SIMDE_X86_SSE2_NATIVE)
18864     return _mm_mul_epu32(a, b);
18865   #else
18866     simde__m128i_private
18867       r_,
18868       a_ = simde__m128i_to_private(a),
18869       b_ = simde__m128i_to_private(b);
18870 
18871     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
18872       uint32x2_t a_lo = vmovn_u64(a_.neon_u64);
18873       uint32x2_t b_lo = vmovn_u64(b_.neon_u64);
18874       r_.neon_u64 = vmull_u32(a_lo, b_lo);
18875     #elif defined(SIMDE_SHUFFLE_VECTOR_) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
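      /* Interleave the even 32-bit lanes of each input with zeros (indices 0, 4, 2, 6 against a zero vector); reinterpreted as
       * 64-bit lanes on little-endian this is the even lanes zero-extended, so a plain 64-bit multiply yields the 32x32->64-bit products. */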
18876       __typeof__(a_.u32) z = { 0, };
18877       a_.u32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.u32, z, 0, 4, 2, 6);
18878       b_.u32 = SIMDE_SHUFFLE_VECTOR_(32, 16, b_.u32, z, 0, 4, 2, 6);
18879       r_.u64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u64), a_.u32) *
18880                HEDLEY_REINTERPRET_CAST(__typeof__(r_.u64), b_.u32);
18881     #else
18882       SIMDE_VECTORIZE
18883       for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) {
18884         r_.u64[i] = HEDLEY_STATIC_CAST(uint64_t, a_.u32[i * 2]) * HEDLEY_STATIC_CAST(uint64_t, b_.u32[i * 2]);
18885       }
18886     #endif
18887 
18888     return simde__m128i_from_private(r_);
18889   #endif
18890 }
18891 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
18892   #define _mm_mul_epu32(a, b) simde_mm_mul_epu32(a, b)
18893 #endif
18894 
18895 SIMDE_FUNCTION_ATTRIBUTES
18896 simde__m128i
18897 simde_x_mm_mul_epi64 (simde__m128i a, simde__m128i b) {
18898   simde__m128i_private
18899     r_,
18900     a_ = simde__m128i_to_private(a),
18901     b_ = simde__m128i_to_private(b);
18902 
18903   #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
18904     r_.i64 = a_.i64 * b_.i64;
  /* Note: NEON has no 64-bit integer multiply, so there is no dedicated AArch64 branch here; the vector-extension and scalar paths cover it. */
18907   #else
18908     SIMDE_VECTORIZE
18909     for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
18910       r_.i64[i] = a_.i64[i] * b_.i64[i];
18911     }
18912   #endif
18913 
18914   return simde__m128i_from_private(r_);
18915 }
18916 
18917 SIMDE_FUNCTION_ATTRIBUTES
18918 simde__m128i
18919 simde_x_mm_mod_epi64 (simde__m128i a, simde__m128i b) {
18920   simde__m128i_private
18921     r_,
18922     a_ = simde__m128i_to_private(a),
18923     b_ = simde__m128i_to_private(b);
18924 
18925   #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
18926     r_.i64 = a_.i64 % b_.i64;
18927   #else
18928     SIMDE_VECTORIZE
18929     for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
18930       r_.i64[i] = a_.i64[i] % b_.i64[i];
18931     }
18932   #endif
18933 
18934   return simde__m128i_from_private(r_);
18935 }
18936 
18937 SIMDE_FUNCTION_ATTRIBUTES
18938 simde__m128d
18939 simde_mm_mul_pd (simde__m128d a, simde__m128d b) {
18940   #if defined(SIMDE_X86_SSE2_NATIVE)
18941     return _mm_mul_pd(a, b);
18942   #else
18943     simde__m128d_private
18944       r_,
18945       a_ = simde__m128d_to_private(a),
18946       b_ = simde__m128d_to_private(b);
18947 
18948     #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
18949       r_.f64 = a_.f64 * b_.f64;
18950     #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
18951       r_.neon_f64 = vmulq_f64(a_.neon_f64, b_.neon_f64);
18952     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
18953       r_.wasm_v128 = wasm_f64x2_mul(a_.wasm_v128, b_.wasm_v128);
18954     #else
18955       SIMDE_VECTORIZE
18956       for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
18957         r_.f64[i] = a_.f64[i] * b_.f64[i];
18958       }
18959     #endif
18960 
18961     return simde__m128d_from_private(r_);
18962   #endif
18963 }
18964 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
18965   #define _mm_mul_pd(a, b) simde_mm_mul_pd(a, b)
18966 #endif
18967 
18968 SIMDE_FUNCTION_ATTRIBUTES
18969 simde__m128d
18970 simde_mm_mul_sd (simde__m128d a, simde__m128d b) {
18971   #if defined(SIMDE_X86_SSE2_NATIVE)
18972     return _mm_mul_sd(a, b);
18973   #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
18974     return simde_mm_move_sd(a, simde_mm_mul_pd(a, b));
18975   #else
18976     simde__m128d_private
18977       r_,
18978       a_ = simde__m128d_to_private(a),
18979       b_ = simde__m128d_to_private(b);
18980 
18981     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
18982       float64x2_t temp = vmulq_f64(a_.neon_f64, b_.neon_f64);
18983       r_.neon_f64 = vsetq_lane_f64(vgetq_lane_f64(a_.neon_f64, 1), temp, 1);
18984     #else
18985       r_.f64[0] = a_.f64[0] * b_.f64[0];
18986       r_.f64[1] = a_.f64[1];
18987     #endif
18988 
18989     return simde__m128d_from_private(r_);
18990   #endif
18991 }
18992 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
18993   #define _mm_mul_sd(a, b) simde_mm_mul_sd(a, b)
18994 #endif
18995 
18996 SIMDE_FUNCTION_ATTRIBUTES
18997 simde__m64
18998 simde_mm_mul_su32 (simde__m64 a, simde__m64 b) {
18999   #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) && !defined(__PGI)
19000     return _mm_mul_su32(a, b);
19001   #else
19002     simde__m64_private
19003       r_,
19004       a_ = simde__m64_to_private(a),
19005       b_ = simde__m64_to_private(b);
19006 
19007     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
19008       r_.u64[0] = vget_lane_u64(vget_low_u64(vmull_u32(vreinterpret_u32_s64(a_.neon_i64), vreinterpret_u32_s64(b_.neon_i64))), 0);
19009     #else
19010       r_.u64[0] = HEDLEY_STATIC_CAST(uint64_t, a_.u32[0]) * HEDLEY_STATIC_CAST(uint64_t, b_.u32[0]);
19011     #endif
19012 
19013     return simde__m64_from_private(r_);
19014   #endif
19015 }
19016 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
19017   #define _mm_mul_su32(a, b) simde_mm_mul_su32(a, b)
19018 #endif
19019 
19020 SIMDE_FUNCTION_ATTRIBUTES
19021 simde__m128i
19022 simde_mm_mulhi_epi16 (simde__m128i a, simde__m128i b) {
19023   #if defined(SIMDE_X86_SSE2_NATIVE)
19024     return _mm_mulhi_epi16(a, b);
19025   #else
19026     simde__m128i_private
19027       r_,
19028       a_ = simde__m128i_to_private(a),
19029       b_ = simde__m128i_to_private(b);
19030 
19031     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
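      /* Full 16x16->32 widening multiplies; the unzip afterwards keeps only the high 16 bits of each product. */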
19032       int16x4_t a3210 = vget_low_s16(a_.neon_i16);
19033       int16x4_t b3210 = vget_low_s16(b_.neon_i16);
19034       int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */
19035       #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
19036         int32x4_t ab7654 = vmull_high_s16(a_.neon_i16, b_.neon_i16);
19037         r_.neon_i16 = vuzp2q_s16(vreinterpretq_s16_s32(ab3210), vreinterpretq_s16_s32(ab7654));
19038       #else
19039         int16x4_t a7654 = vget_high_s16(a_.neon_i16);
19040         int16x4_t b7654 = vget_high_s16(b_.neon_i16);
19041         int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */
19042         uint16x8x2_t rv = vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654));
19043         r_.neon_u16 = rv.val[1];
19044       #endif
19045     #else
19046       SIMDE_VECTORIZE
19047       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
19048         r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, (HEDLEY_STATIC_CAST(uint32_t, HEDLEY_STATIC_CAST(int32_t, a_.i16[i]) * HEDLEY_STATIC_CAST(int32_t, b_.i16[i])) >> 16));
19049       }
19050     #endif
19051 
19052     return simde__m128i_from_private(r_);
19053   #endif
19054 }
19055 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
19056   #define _mm_mulhi_epi16(a, b) simde_mm_mulhi_epi16(a, b)
19057 #endif
19058 
19059 SIMDE_FUNCTION_ATTRIBUTES
19060 simde__m128i
19061 simde_mm_mulhi_epu16 (simde__m128i a, simde__m128i b) {
19062   #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
19063     return _mm_mulhi_epu16(a, b);
19064   #else
19065     simde__m128i_private
19066       r_,
19067       a_ = simde__m128i_to_private(a),
19068       b_ = simde__m128i_to_private(b);
19069 
19070     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
19071       uint16x4_t a3210 = vget_low_u16(a_.neon_u16);
19072       uint16x4_t b3210 = vget_low_u16(b_.neon_u16);
19073       uint32x4_t ab3210 = vmull_u16(a3210, b3210); /* 3333222211110000 */
19074       #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
19075         uint32x4_t ab7654 = vmull_high_u16(a_.neon_u16, b_.neon_u16);
19076         r_.neon_u16 = vuzp2q_u16(vreinterpretq_u16_u32(ab3210), vreinterpretq_u16_u32(ab7654));
19077       #else
19078         uint16x4_t a7654 = vget_high_u16(a_.neon_u16);
19079         uint16x4_t b7654 = vget_high_u16(b_.neon_u16);
19080         uint32x4_t ab7654 = vmull_u16(a7654, b7654); /* 7777666655554444 */
19081         uint16x8x2_t neon_r = vuzpq_u16(vreinterpretq_u16_u32(ab3210), vreinterpretq_u16_u32(ab7654));
19082         r_.neon_u16 = neon_r.val[1];
19083       #endif
19084     #else
19085       SIMDE_VECTORIZE
19086       for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
19087         r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, HEDLEY_STATIC_CAST(uint32_t, a_.u16[i]) * HEDLEY_STATIC_CAST(uint32_t, b_.u16[i]) >> 16);
19088       }
19089     #endif
19090 
19091     return simde__m128i_from_private(r_);
19092   #endif
19093 }
19094 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
19095   #define _mm_mulhi_epu16(a, b) simde_mm_mulhi_epu16(a, b)
19096 #endif
19097 
19098 SIMDE_FUNCTION_ATTRIBUTES
19099 simde__m128i
19100 simde_mm_mullo_epi16 (simde__m128i a, simde__m128i b) {
19101   #if defined(SIMDE_X86_SSE2_NATIVE)
19102     return _mm_mullo_epi16(a, b);
19103   #else
19104     simde__m128i_private
19105       r_,
19106       a_ = simde__m128i_to_private(a),
19107       b_ = simde__m128i_to_private(b);
19108 
19109     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
19110       r_.neon_i16 = vmulq_s16(a_.neon_i16, b_.neon_i16);
19111     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
19112       (void) a_;
19113       (void) b_;
19114       r_.altivec_i16 = vec_mul(a_.altivec_i16, b_.altivec_i16);
19115     #else
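      /* Multiply as unsigned to avoid signed-overflow undefined behaviour; the low 16 bits of the product are the same either way. */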
19116       SIMDE_VECTORIZE
19117       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
19118         r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, HEDLEY_STATIC_CAST(uint32_t, a_.u16[i]) * HEDLEY_STATIC_CAST(uint32_t, b_.u16[i]));
19119       }
19120     #endif
19121 
19122     return simde__m128i_from_private(r_);
19123   #endif
19124 }
19125 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
19126   #define _mm_mullo_epi16(a, b) simde_mm_mullo_epi16(a, b)
19127 #endif
19128 
19129 SIMDE_FUNCTION_ATTRIBUTES
19130 simde__m128d
19131 simde_mm_or_pd (simde__m128d a, simde__m128d b) {
19132   #if defined(SIMDE_X86_SSE2_NATIVE)
19133     return _mm_or_pd(a, b);
19134   #else
19135     simde__m128d_private
19136       r_,
19137       a_ = simde__m128d_to_private(a),
19138       b_ = simde__m128d_to_private(b);
19139 
19140     #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
19141       r_.i32f = a_.i32f | b_.i32f;
19142     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
19143       r_.wasm_v128 = wasm_v128_or(a_.wasm_v128, b_.wasm_v128);
19144     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
19145       r_.neon_i64 = vorrq_s64(a_.neon_i64, b_.neon_i64);
19146     #else
19147       SIMDE_VECTORIZE
19148       for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
19149         r_.i32f[i] = a_.i32f[i] | b_.i32f[i];
19150       }
19151     #endif
19152 
19153     return simde__m128d_from_private(r_);
19154   #endif
19155 }
19156 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
19157   #define _mm_or_pd(a, b) simde_mm_or_pd(a, b)
19158 #endif
19159 
19160 SIMDE_FUNCTION_ATTRIBUTES
19161 simde__m128i
19162 simde_mm_or_si128 (simde__m128i a, simde__m128i b) {
19163   #if defined(SIMDE_X86_SSE2_NATIVE)
19164     return _mm_or_si128(a, b);
19165   #else
19166     simde__m128i_private
19167       r_,
19168       a_ = simde__m128i_to_private(a),
19169       b_ = simde__m128i_to_private(b);
19170 
19171     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
19172       r_.neon_i32 = vorrq_s32(a_.neon_i32, b_.neon_i32);
19173     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
19174       r_.altivec_i32 = vec_or(a_.altivec_i32, b_.altivec_i32);
19175     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
19176       r_.i32f = a_.i32f | b_.i32f;
19177     #else
19178       SIMDE_VECTORIZE
19179       for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
19180         r_.i32f[i] = a_.i32f[i] | b_.i32f[i];
19181       }
19182     #endif
19183 
19184     return simde__m128i_from_private(r_);
19185   #endif
19186 }
19187 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
19188   #define _mm_or_si128(a, b) simde_mm_or_si128(a, b)
19189 #endif
19190 
19191 SIMDE_FUNCTION_ATTRIBUTES
19192 simde__m128i
19193 simde_mm_packs_epi16 (simde__m128i a, simde__m128i b) {
19194   #if defined(SIMDE_X86_SSE2_NATIVE)
19195     return _mm_packs_epi16(a, b);
19196   #else
19197     simde__m128i_private
19198       r_,
19199       a_ = simde__m128i_to_private(a),
19200       b_ = simde__m128i_to_private(b);
19201 
19202     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
19203       r_.neon_i8 = vcombine_s8(vqmovn_s16(a_.neon_i16), vqmovn_s16(b_.neon_i16));
19204     #else
19205       SIMDE_VECTORIZE
19206       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
19207         r_.i8[i]     = (a_.i16[i] > INT8_MAX) ? INT8_MAX : ((a_.i16[i] < INT8_MIN) ? INT8_MIN : HEDLEY_STATIC_CAST(int8_t, a_.i16[i]));
19208         r_.i8[i + 8] = (b_.i16[i] > INT8_MAX) ? INT8_MAX : ((b_.i16[i] < INT8_MIN) ? INT8_MIN : HEDLEY_STATIC_CAST(int8_t, b_.i16[i]));
19209       }
19210     #endif
19211 
19212     return simde__m128i_from_private(r_);
19213   #endif
19214 }
19215 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
19216   #define _mm_packs_epi16(a, b) simde_mm_packs_epi16(a, b)
19217 #endif
19218 
19219 SIMDE_FUNCTION_ATTRIBUTES
19220 simde__m128i
19221 simde_mm_packs_epi32 (simde__m128i a, simde__m128i b) {
19222   #if defined(SIMDE_X86_SSE2_NATIVE)
19223     return _mm_packs_epi32(a, b);
19224   #else
19225     simde__m128i_private
19226       r_,
19227       a_ = simde__m128i_to_private(a),
19228       b_ = simde__m128i_to_private(b);
19229 
19230     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
19231       r_.neon_i16 = vcombine_s16(vqmovn_s32(a_.neon_i32), vqmovn_s32(b_.neon_i32));
19232     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
19233       r_.altivec_i16 = vec_packs(a_.altivec_i32, b_.altivec_i32);
19234     #else
19235       SIMDE_VECTORIZE
19236       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
19237         r_.i16[i]     = (a_.i32[i] > INT16_MAX) ? INT16_MAX : ((a_.i32[i] < INT16_MIN) ? INT16_MIN : HEDLEY_STATIC_CAST(int16_t, a_.i32[i]));
19238         r_.i16[i + 4] = (b_.i32[i] > INT16_MAX) ? INT16_MAX : ((b_.i32[i] < INT16_MIN) ? INT16_MIN : HEDLEY_STATIC_CAST(int16_t, b_.i32[i]));
19239       }
19240     #endif
19241 
19242     return simde__m128i_from_private(r_);
19243   #endif
19244 }
19245 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
19246   #define _mm_packs_epi32(a, b) simde_mm_packs_epi32(a, b)
19247 #endif
19248 
19249 SIMDE_FUNCTION_ATTRIBUTES
19250 simde__m128i
19251 simde_mm_packus_epi16 (simde__m128i a, simde__m128i b) {
19252   #if defined(SIMDE_X86_SSE2_NATIVE)
19253     return _mm_packus_epi16(a, b);
19254   #else
19255     simde__m128i_private
19256       r_,
19257       a_ = simde__m128i_to_private(a),
19258       b_ = simde__m128i_to_private(b);
19259 
19260     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
19261       r_.neon_u8 = vcombine_u8(vqmovun_s16(a_.neon_i16), vqmovun_s16(b_.neon_i16));
19262     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
19263       r_.altivec_u8 = vec_packsu(a_.altivec_i16, b_.altivec_i16);
19264     #else
19265       SIMDE_VECTORIZE
19266       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
19267         r_.u8[i]     = (a_.i16[i] > UINT8_MAX) ? UINT8_MAX : ((a_.i16[i] < 0) ? UINT8_C(0) : HEDLEY_STATIC_CAST(uint8_t, a_.i16[i]));
19268         r_.u8[i + 8] = (b_.i16[i] > UINT8_MAX) ? UINT8_MAX : ((b_.i16[i] < 0) ? UINT8_C(0) : HEDLEY_STATIC_CAST(uint8_t, b_.i16[i]));
19269       }
19270     #endif
19271 
19272     return simde__m128i_from_private(r_);
19273   #endif
19274 }
19275 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
19276   #define _mm_packus_epi16(a, b) simde_mm_packus_epi16(a, b)
19277 #endif
19278 
19279 SIMDE_FUNCTION_ATTRIBUTES
19280 void
19281 simde_mm_pause (void) {
19282   #if defined(SIMDE_X86_SSE2_NATIVE)
19283     _mm_pause();
19284   #endif
19285 }
19286 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
19287   #define _mm_pause() (simde_mm_pause())
19288 #endif
19289 
19290 SIMDE_FUNCTION_ATTRIBUTES
19291 simde__m128i
19292 simde_mm_sad_epu8 (simde__m128i a, simde__m128i b) {
19293   #if defined(SIMDE_X86_SSE2_NATIVE)
19294     return _mm_sad_epu8(a, b);
19295   #else
19296     simde__m128i_private
19297       r_,
19298       a_ = simde__m128i_to_private(a),
19299       b_ = simde__m128i_to_private(b);
19300 
19301     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
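      /* Per-byte absolute differences, then pairwise widening adds reduce each 8-byte half to a single 64-bit sum, as PSADBW does. */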
19302       const uint16x8_t t = vpaddlq_u8(vabdq_u8(a_.neon_u8, b_.neon_u8));
19303       r_.neon_u64 = vcombine_u64(
19304         vpaddl_u32(vpaddl_u16(vget_low_u16(t))),
19305         vpaddl_u32(vpaddl_u16(vget_high_u16(t))));
19306     #else
19307       for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
19308         uint16_t tmp = 0;
19309         SIMDE_VECTORIZE_REDUCTION(+:tmp)
19310         for (size_t j = 0 ; j < ((sizeof(r_.u8) / sizeof(r_.u8[0])) / 2) ; j++) {
19311           const size_t e = j + (i * 8);
19312           tmp += (a_.u8[e] > b_.u8[e]) ? (a_.u8[e] - b_.u8[e]) : (b_.u8[e] - a_.u8[e]);
19313         }
19314         r_.i64[i] = tmp;
19315       }
19316     #endif
19317 
19318     return simde__m128i_from_private(r_);
19319   #endif
19320 }
19321 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
19322   #define _mm_sad_epu8(a, b) simde_mm_sad_epu8(a, b)
19323 #endif
19324 
19325 SIMDE_FUNCTION_ATTRIBUTES
19326 simde__m128i
19327 simde_mm_set_epi8 (int8_t e15, int8_t e14, int8_t e13, int8_t e12,
19328        int8_t e11, int8_t e10, int8_t  e9, int8_t  e8,
19329        int8_t  e7, int8_t  e6, int8_t  e5, int8_t  e4,
19330        int8_t  e3, int8_t  e2, int8_t  e1, int8_t  e0) {
19331 
19332   #if defined(SIMDE_X86_SSE2_NATIVE)
19333     return _mm_set_epi8(
19334       e15, e14, e13, e12, e11, e10,  e9,  e8,
19335        e7,  e6,  e5,  e4,  e3,  e2,  e1,  e0);
19336   #else
19337     simde__m128i_private r_;
19338 
19339     #if defined(SIMDE_WASM_SIMD128_NATIVE)
19340       r_.wasm_v128 = wasm_i8x16_make(
19341          e0,  e1,  e2,  e3,  e4,  e5,  e6,  e7,
19342          e8,  e9, e10, e11, e12, e13, e14, e15);
19343     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
19344       SIMDE_ALIGN_LIKE_16(int8x16_t) int8_t data[16] = {
19345         e0,  e1,  e2,  e3,
19346         e4,  e5,  e6,  e7,
19347         e8,  e9,  e10, e11,
19348         e12, e13, e14, e15};
19349       r_.neon_i8 = vld1q_s8(data);
19350     #else
19351       r_.i8[ 0] =  e0;
19352       r_.i8[ 1] =  e1;
19353       r_.i8[ 2] =  e2;
19354       r_.i8[ 3] =  e3;
19355       r_.i8[ 4] =  e4;
19356       r_.i8[ 5] =  e5;
19357       r_.i8[ 6] =  e6;
19358       r_.i8[ 7] =  e7;
19359       r_.i8[ 8] =  e8;
19360       r_.i8[ 9] =  e9;
19361       r_.i8[10] = e10;
19362       r_.i8[11] = e11;
19363       r_.i8[12] = e12;
19364       r_.i8[13] = e13;
19365       r_.i8[14] = e14;
19366       r_.i8[15] = e15;
19367     #endif
19368 
19369     return simde__m128i_from_private(r_);
19370   #endif
19371 }
19372 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
19373   #define _mm_set_epi8(e15, e14, e13, e12, e11, e10,  e9,  e8,  e7,  e6,  e5,  e4,  e3,  e2,  e1,  e0) simde_mm_set_epi8(e15, e14, e13, e12, e11, e10,  e9,  e8,  e7,  e6,  e5,  e4,  e3,  e2,  e1,  e0)
19374 #endif
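
/* Illustrative sketch (not part of the generated header): as with the native
 * intrinsic, arguments are listed most-significant element first, so e0 is
 * stored in byte lane 0.  For example:
 *
 *   simde__m128i v = simde_mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8,
 *                                       7,  6,  5,  4,  3,  2, 1, 0);
 *   // byte lane 0 of v holds 0 and byte lane 15 holds 15
 */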
19375 
19376 SIMDE_FUNCTION_ATTRIBUTES
19377 simde__m128i
19378 simde_mm_set_epi16 (int16_t e7, int16_t e6, int16_t e5, int16_t e4,
19379         int16_t e3, int16_t e2, int16_t e1, int16_t e0) {
19380   #if defined(SIMDE_X86_SSE2_NATIVE)
19381     return _mm_set_epi16(e7, e6, e5, e4, e3, e2, e1, e0);
19382   #else
19383     simde__m128i_private r_;
19384 
19385     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
19386       SIMDE_ALIGN_LIKE_16(int16x8_t) int16_t data[8] = { e0, e1, e2, e3, e4, e5, e6, e7 };
19387       r_.neon_i16 = vld1q_s16(data);
19388     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
19389       r_.wasm_v128 = wasm_i16x8_make(e0, e1, e2, e3, e4, e5, e6, e7);
19390     #else
19391       r_.i16[0] = e0;
19392       r_.i16[1] = e1;
19393       r_.i16[2] = e2;
19394       r_.i16[3] = e3;
19395       r_.i16[4] = e4;
19396       r_.i16[5] = e5;
19397       r_.i16[6] = e6;
19398       r_.i16[7] = e7;
19399     #endif
19400 
19401     return simde__m128i_from_private(r_);
19402   #endif
19403 }
19404 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
19405   #define _mm_set_epi16(e7,  e6,  e5,  e4,  e3,  e2,  e1,  e0) simde_mm_set_epi16(e7,  e6,  e5,  e4,  e3,  e2,  e1,  e0)
19406 #endif
19407 
19408 SIMDE_FUNCTION_ATTRIBUTES
19409 simde__m128i
19410 simde_mm_loadu_si16 (void const* mem_addr) {
19411   #if defined(SIMDE_X86_SSE2_NATIVE) && ( \
19412       SIMDE_DETECT_CLANG_VERSION_CHECK(8,0,0) || \
19413       HEDLEY_GCC_VERSION_CHECK(11,0,0) || \
19414       HEDLEY_INTEL_VERSION_CHECK(20,21,1))
19415     return _mm_loadu_si16(mem_addr);
19416   #else
19417     int16_t val;
19418     simde_memcpy(&val, mem_addr, sizeof(val));
19419     return simde_x_mm_cvtsi16_si128(val);
19420   #endif
19421 }
19422 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
19423   #define _mm_loadu_si16(mem_addr) simde_mm_loadu_si16(mem_addr)
19424 #endif
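
/* Illustrative sketch (not part of the generated header): because the
 * fallback goes through simde_memcpy, mem_addr needs neither alignment nor
 * an int16_t object behind it.  Assuming a little-endian target:
 *
 *   unsigned char buf[2] = { 0x34, 0x12 };
 *   simde__m128i v = simde_mm_loadu_si16(buf);
 *   // the low 16 bits of v hold 0x1234; the remaining bits are zero
 */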
19425 
19426 SIMDE_FUNCTION_ATTRIBUTES
19427 simde__m128i
19428 simde_mm_set_epi32 (int32_t e3, int32_t e2, int32_t e1, int32_t e0) {
19429   #if defined(SIMDE_X86_SSE2_NATIVE)
19430     return _mm_set_epi32(e3, e2, e1, e0);
19431   #else
19432     simde__m128i_private r_;
19433 
19434     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
19435       SIMDE_ALIGN_LIKE_16(int32x4_t) int32_t data[4] = { e0, e1, e2, e3 };
19436       r_.neon_i32 = vld1q_s32(data);
19437     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
19438       r_.wasm_v128 = wasm_i32x4_make(e0, e1, e2, e3);
19439     #else
19440       r_.i32[0] = e0;
19441       r_.i32[1] = e1;
19442       r_.i32[2] = e2;
19443       r_.i32[3] = e3;
19444     #endif
19445 
19446     return simde__m128i_from_private(r_);
19447   #endif
19448 }
19449 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
19450   #define _mm_set_epi32(e3,  e2,  e1,  e0) simde_mm_set_epi32(e3,  e2,  e1,  e0)
19451 #endif
19452 
19453 SIMDE_FUNCTION_ATTRIBUTES
19454 simde__m128i
19455 simde_mm_loadu_si32 (void const* mem_addr) {
19456   #if defined(SIMDE_X86_SSE2_NATIVE) && ( \
19457       SIMDE_DETECT_CLANG_VERSION_CHECK(8,0,0) || \
19458       HEDLEY_GCC_VERSION_CHECK(11,0,0) || \
19459       HEDLEY_INTEL_VERSION_CHECK(20,21,1))
19460     return _mm_loadu_si32(mem_addr);
19461   #else
19462     int32_t val;
19463     simde_memcpy(&val, mem_addr, sizeof(val));
19464     return simde_mm_cvtsi32_si128(val);
19465   #endif
19466 }
19467 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
19468   #define _mm_loadu_si32(mem_addr) simde_mm_loadu_si32(mem_addr)
19469 #endif
19470 
19471 SIMDE_FUNCTION_ATTRIBUTES
19472 simde__m128i
19473 simde_mm_set_epi64 (simde__m64 e1, simde__m64 e0) {
19474   #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
19475     return _mm_set_epi64(e1, e0);
19476   #else
19477     simde__m128i_private r_;
19478 
19479     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
19480       r_.neon_i64 = vcombine_s64(simde__m64_to_neon_i64(e0), simde__m64_to_neon_i64(e1));
19481     #else
19482       r_.m64[0] = e0;
19483       r_.m64[1] = e1;
19484     #endif
19485 
19486     return simde__m128i_from_private(r_);
19487   #endif
19488 }
19489 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
19490   #define _mm_set_epi64(e1, e0) (simde_mm_set_epi64((e1), (e0)))
19491 #endif
19492 
19493 SIMDE_FUNCTION_ATTRIBUTES
19494 simde__m128i
19495 simde_mm_set_epi64x (int64_t e1, int64_t e0) {
19496   #if defined(SIMDE_X86_SSE2_NATIVE) && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,0,0))
19497     return _mm_set_epi64x(e1, e0);
19498   #else
19499     simde__m128i_private r_;
19500 
19501     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
19502       SIMDE_ALIGN_LIKE_16(int64x2_t) int64_t data[2] = {e0, e1};
19503       r_.neon_i64 = vld1q_s64(data);
19504     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
19505       r_.wasm_v128 = wasm_i64x2_make(e0, e1);
19506     #else
19507       r_.i64[0] = e0;
19508       r_.i64[1] = e1;
19509     #endif
19510 
19511     return simde__m128i_from_private(r_);
19512   #endif
19513 }
19514 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
19515   #define _mm_set_epi64x(e1, e0) simde_mm_set_epi64x(e1, e0)
19516 #endif
19517 
19518 SIMDE_FUNCTION_ATTRIBUTES
19519 simde__m128i
19520 simde_mm_loadu_si64 (void const* mem_addr) {
19521   #if defined(SIMDE_X86_SSE2_NATIVE) && ( \
19522       SIMDE_DETECT_CLANG_VERSION_CHECK(8,0,0) || \
19523       HEDLEY_GCC_VERSION_CHECK(11,0,0) || \
19524       HEDLEY_INTEL_VERSION_CHECK(20,21,1))
19525     return _mm_loadu_si64(mem_addr);
19526   #else
19527     int64_t val;
19528     simde_memcpy(&val, mem_addr, sizeof(val));
19529     return simde_mm_cvtsi64_si128(val);
19530   #endif
19531 }
19532 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
19533   #define _mm_loadu_si64(mem_addr) simde_mm_loadu_si64(mem_addr)
19534 #endif
19535 
19536 SIMDE_FUNCTION_ATTRIBUTES
19537 simde__m128i
19538 simde_x_mm_set_epu8 (uint8_t e15, uint8_t e14, uint8_t e13, uint8_t e12,
19539          uint8_t e11, uint8_t e10, uint8_t  e9, uint8_t  e8,
19540          uint8_t  e7, uint8_t  e6, uint8_t  e5, uint8_t  e4,
19541          uint8_t  e3, uint8_t  e2, uint8_t  e1, uint8_t  e0) {
19542   #if defined(SIMDE_X86_SSE2_NATIVE)
19543     return _mm_set_epi8(
19544       HEDLEY_STATIC_CAST(char, e15), HEDLEY_STATIC_CAST(char, e14), HEDLEY_STATIC_CAST(char, e13), HEDLEY_STATIC_CAST(char, e12),
19545       HEDLEY_STATIC_CAST(char, e11), HEDLEY_STATIC_CAST(char, e10), HEDLEY_STATIC_CAST(char,  e9), HEDLEY_STATIC_CAST(char,  e8),
19546       HEDLEY_STATIC_CAST(char,  e7), HEDLEY_STATIC_CAST(char,  e6), HEDLEY_STATIC_CAST(char,  e5), HEDLEY_STATIC_CAST(char,  e4),
19547       HEDLEY_STATIC_CAST(char,  e3), HEDLEY_STATIC_CAST(char,  e2), HEDLEY_STATIC_CAST(char,  e1), HEDLEY_STATIC_CAST(char,  e0));
19548   #else
19549     simde__m128i_private r_;
19550 
19551     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
19552       SIMDE_ALIGN_LIKE_16(uint8x16_t) uint8_t data[16] = {
19553         e0,  e1,  e2,  e3,
19554         e4,  e5,  e6,  e7,
19555         e8,  e9,  e10, e11,
19556         e12, e13, e14, e15};
19557       r_.neon_u8 = vld1q_u8(data);
19558     #else
19559       r_.u8[ 0] =  e0; r_.u8[ 1] =  e1; r_.u8[ 2] =  e2; r_.u8[ 3] =  e3;
19560       r_.u8[ 4] =  e4; r_.u8[ 5] =  e5; r_.u8[ 6] =  e6; r_.u8[ 7] =  e7;
19561       r_.u8[ 8] =  e8; r_.u8[ 9] =  e9; r_.u8[10] = e10; r_.u8[11] = e11;
19562       r_.u8[12] = e12; r_.u8[13] = e13; r_.u8[14] = e14; r_.u8[15] = e15;
19563     #endif
19564 
19565     return simde__m128i_from_private(r_);
19566   #endif
19567 }
19568 
19569 SIMDE_FUNCTION_ATTRIBUTES
19570 simde__m128i
19571 simde_x_mm_set_epu16 (uint16_t e7, uint16_t e6, uint16_t e5, uint16_t e4,
19572           uint16_t e3, uint16_t e2, uint16_t e1, uint16_t e0) {
19573   #if defined(SIMDE_X86_SSE2_NATIVE)
19574     return _mm_set_epi16(
19575       HEDLEY_STATIC_CAST(short,  e7), HEDLEY_STATIC_CAST(short,  e6), HEDLEY_STATIC_CAST(short,  e5), HEDLEY_STATIC_CAST(short,  e4),
19576       HEDLEY_STATIC_CAST(short,  e3), HEDLEY_STATIC_CAST(short,  e2), HEDLEY_STATIC_CAST(short,  e1), HEDLEY_STATIC_CAST(short,  e0));
19577   #else
19578     simde__m128i_private r_;
19579 
19580     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
19581       SIMDE_ALIGN_LIKE_16(uint16x8_t) uint16_t data[8] = { e0, e1, e2, e3, e4, e5, e6, e7 };
19582       r_.neon_u16 = vld1q_u16(data);
19583     #else
19584       r_.u16[0] = e0; r_.u16[1] = e1; r_.u16[2] = e2; r_.u16[3] = e3;
19585       r_.u16[4] = e4; r_.u16[5] = e5; r_.u16[6] = e6; r_.u16[7] = e7;
19586     #endif
19587 
19588     return simde__m128i_from_private(r_);
19589   #endif
19590 }
19591 
19592 SIMDE_FUNCTION_ATTRIBUTES
19593 simde__m128i
19594 simde_x_mm_set_epu32 (uint32_t e3, uint32_t e2, uint32_t e1, uint32_t e0) {
19595   #if defined(SIMDE_X86_SSE2_NATIVE)
19596     return _mm_set_epi32(
19597       HEDLEY_STATIC_CAST(int,  e3), HEDLEY_STATIC_CAST(int,  e2), HEDLEY_STATIC_CAST(int,  e1), HEDLEY_STATIC_CAST(int,  e0));
19598   #else
19599     simde__m128i_private r_;
19600 
19601     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
19602       SIMDE_ALIGN_LIKE_16(uint32x4_t) uint32_t data[4] = { e0, e1, e2, e3 };
19603       r_.neon_u32 = vld1q_u32(data);
19604     #else
19605       r_.u32[0] = e0;
19606       r_.u32[1] = e1;
19607       r_.u32[2] = e2;
19608       r_.u32[3] = e3;
19609     #endif
19610 
19611     return simde__m128i_from_private(r_);
19612   #endif
19613 }
19614 
19615 SIMDE_FUNCTION_ATTRIBUTES
19616 simde__m128i
19617 simde_x_mm_set_epu64x (uint64_t e1, uint64_t e0) {
19618   #if defined(SIMDE_X86_SSE2_NATIVE) && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,0,0))
19619     return _mm_set_epi64x(HEDLEY_STATIC_CAST(int64_t,  e1), HEDLEY_STATIC_CAST(int64_t,  e0));
19620   #else
19621     simde__m128i_private r_;
19622 
19623     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
19624       SIMDE_ALIGN_LIKE_16(uint64x2_t) uint64_t data[2] = {e0, e1};
19625       r_.neon_u64 = vld1q_u64(data);
19626     #else
19627       r_.u64[0] = e0;
19628       r_.u64[1] = e1;
19629     #endif
19630 
19631     return simde__m128i_from_private(r_);
19632   #endif
19633 }
19634 
19635 SIMDE_FUNCTION_ATTRIBUTES
19636 simde__m128d
19637 simde_mm_set_sd (simde_float64 a) {
19638   #if defined(SIMDE_X86_SSE2_NATIVE)
19639     return _mm_set_sd(a);
19640   #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
19641     return vsetq_lane_f64(a, vdupq_n_f64(SIMDE_FLOAT64_C(0.0)), 0);
19642   #else
19643     return simde_mm_set_pd(SIMDE_FLOAT64_C(0.0), a);
19644   #endif
19645 }
19646 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
19647   #define _mm_set_sd(a) simde_mm_set_sd(a)
19648 #endif
19649 
19650 SIMDE_FUNCTION_ATTRIBUTES
19651 simde__m128i
19652 simde_mm_set1_epi8 (int8_t a) {
19653   #if defined(SIMDE_X86_SSE2_NATIVE)
19654     return _mm_set1_epi8(a);
19655   #else
19656     simde__m128i_private r_;
19657 
19658     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
19659       r_.neon_i8 = vdupq_n_s8(a);
19660     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
19661       r_.wasm_v128 = wasm_i8x16_splat(a);
19662     #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
19663       r_.altivec_i8 = vec_splats(HEDLEY_STATIC_CAST(signed char, a));
19664     #else
19665       SIMDE_VECTORIZE
19666       for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
19667         r_.i8[i] = a;
19668       }
19669     #endif
19670 
19671     return simde__m128i_from_private(r_);
19672   #endif
19673 }
19674 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
19675   #define _mm_set1_epi8(a) simde_mm_set1_epi8(a)
19676 #endif
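
/* Illustrative sketch (not part of the generated header): the set1 family
 * broadcasts a single scalar to every lane.  For example:
 *
 *   simde__m128i v = simde_mm_set1_epi8(0x42);
 *   // all 16 byte lanes of v hold 0x42
 */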
19677 
19678 SIMDE_FUNCTION_ATTRIBUTES
19679 simde__m128i
19680 simde_mm_set1_epi16 (int16_t a) {
19681   #if defined(SIMDE_X86_SSE2_NATIVE)
19682     return _mm_set1_epi16(a);
19683   #else
19684     simde__m128i_private r_;
19685 
19686     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
19687       r_.neon_i16 = vdupq_n_s16(a);
19688     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
19689       r_.wasm_v128 = wasm_i16x8_splat(a);
19690     #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
19691       r_.altivec_i16 = vec_splats(HEDLEY_STATIC_CAST(signed short, a));
19692     #else
19693       SIMDE_VECTORIZE
19694       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
19695         r_.i16[i] = a;
19696       }
19697     #endif
19698 
19699     return simde__m128i_from_private(r_);
19700   #endif
19701 }
19702 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
19703   #define _mm_set1_epi16(a) simde_mm_set1_epi16(a)
19704 #endif
19705 
19706 SIMDE_FUNCTION_ATTRIBUTES
19707 simde__m128i
19708 simde_mm_set1_epi32 (int32_t a) {
19709   #if defined(SIMDE_X86_SSE2_NATIVE)
19710     return _mm_set1_epi32(a);
19711   #else
19712     simde__m128i_private r_;
19713 
19714     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
19715       r_.neon_i32 = vdupq_n_s32(a);
19716     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
19717       r_.wasm_v128 = wasm_i32x4_splat(a);
19718     #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
19719       r_.altivec_i32 = vec_splats(HEDLEY_STATIC_CAST(signed int, a));
19720     #else
19721       SIMDE_VECTORIZE
19722       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
19723         r_.i32[i] = a;
19724       }
19725     #endif
19726 
19727     return simde__m128i_from_private(r_);
19728   #endif
19729 }
19730 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
19731   #define _mm_set1_epi32(a) simde_mm_set1_epi32(a)
19732 #endif
19733 
19734 SIMDE_FUNCTION_ATTRIBUTES
19735 simde__m128i
19736 simde_mm_set1_epi64x (int64_t a) {
19737   #if defined(SIMDE_X86_SSE2_NATIVE) && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,0,0))
19738     return _mm_set1_epi64x(a);
19739   #else
19740     simde__m128i_private r_;
19741 
19742     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
19743       r_.neon_i64 = vdupq_n_s64(a);
19744     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
19745       r_.wasm_v128 = wasm_i64x2_splat(a);
19746     #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
19747       r_.altivec_i64 = vec_splats(HEDLEY_STATIC_CAST(signed long long, a));
19748     #else
19749       SIMDE_VECTORIZE
19750       for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
19751         r_.i64[i] = a;
19752       }
19753     #endif
19754 
19755     return simde__m128i_from_private(r_);
19756   #endif
19757 }
19758 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
19759   #define _mm_set1_epi64x(a) simde_mm_set1_epi64x(a)
19760 #endif
19761 
19762 SIMDE_FUNCTION_ATTRIBUTES
19763 simde__m128i
19764 simde_mm_set1_epi64 (simde__m64 a) {
19765   #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
19766     return _mm_set1_epi64(a);
19767   #else
19768     simde__m64_private a_ = simde__m64_to_private(a);
19769     return simde_mm_set1_epi64x(a_.i64[0]);
19770   #endif
19771 }
19772 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
19773   #define _mm_set1_epi64(a) simde_mm_set1_epi64(a)
19774 #endif
19775 
19776 SIMDE_FUNCTION_ATTRIBUTES
19777 simde__m128i
19778 simde_x_mm_set1_epu8 (uint8_t value) {
19779   #if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
19780     return simde__m128i_from_altivec_u8(vec_splats(HEDLEY_STATIC_CAST(unsigned char, value)));
19781   #else
19782     return simde_mm_set1_epi8(HEDLEY_STATIC_CAST(int8_t, value));
19783   #endif
19784 }
19785 
19786 SIMDE_FUNCTION_ATTRIBUTES
19787 simde__m128i
19788 simde_x_mm_set1_epu16 (uint16_t value) {
19789   #if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
19790     return simde__m128i_from_altivec_u16(vec_splats(HEDLEY_STATIC_CAST(unsigned short, value)));
19791   #else
19792     return simde_mm_set1_epi16(HEDLEY_STATIC_CAST(int16_t, value));
19793   #endif
19794 }
19795 
19796 SIMDE_FUNCTION_ATTRIBUTES
19797 simde__m128i
19798 simde_x_mm_set1_epu32 (uint32_t value) {
19799   #if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
19800     return simde__m128i_from_altivec_u32(vec_splats(HEDLEY_STATIC_CAST(unsigned int, value)));
19801   #else
19802     return simde_mm_set1_epi32(HEDLEY_STATIC_CAST(int32_t, value));
19803   #endif
19804 }
19805 
19806 SIMDE_FUNCTION_ATTRIBUTES
19807 simde__m128i
19808 simde_x_mm_set1_epu64 (uint64_t value) {
19809   #if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
19810     return simde__m128i_from_altivec_u64(vec_splats(HEDLEY_STATIC_CAST(unsigned long long, value)));
19811   #else
19812     return simde_mm_set1_epi64x(HEDLEY_STATIC_CAST(int64_t, value));
19813   #endif
19814 }
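
/* Note (not part of the generated header): the simde_x_mm_set*_epu* helpers
 * above are SIMDe-specific conveniences for unsigned element types; they
 * have no direct _mm_* counterpart and simply forward to the signed setters
 * through casts, e.g.
 *
 *   simde__m128i v = simde_x_mm_set1_epu8(UINT8_C(0xff));  // every byte 0xff
 */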
19815 
19816 SIMDE_FUNCTION_ATTRIBUTES
19817 simde__m128i
19818 simde_mm_setr_epi8 (int8_t e15, int8_t e14, int8_t e13, int8_t e12,
19819         int8_t e11, int8_t e10, int8_t  e9, int8_t  e8,
19820         int8_t  e7, int8_t  e6, int8_t  e5, int8_t  e4,
19821         int8_t  e3, int8_t  e2, int8_t  e1, int8_t  e0) {
19822   #if defined(SIMDE_X86_SSE2_NATIVE)
19823     return _mm_setr_epi8(
19824       e15, e14, e13, e12, e11, e10,  e9,    e8,
19825       e7,  e6,  e5,  e4,  e3,  e2,  e1,  e0);
19826   #else
19827     return simde_mm_set_epi8(
19828       e0, e1, e2, e3, e4, e5, e6, e7,
19829       e8, e9, e10, e11, e12, e13, e14, e15);
19830   #endif
19831 }
19832 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
19833   #define _mm_setr_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0) simde_mm_setr_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0)
19834 #endif
19835 
19836 SIMDE_FUNCTION_ATTRIBUTES
19837 simde__m128i
19838 simde_mm_setr_epi16 (int16_t e7, int16_t e6, int16_t e5, int16_t e4,
19839          int16_t e3, int16_t e2, int16_t e1, int16_t e0) {
19840   #if defined(SIMDE_X86_SSE2_NATIVE)
19841     return _mm_setr_epi16(e7,  e6,  e5,  e4,  e3,  e2,  e1,  e0);
19842   #else
19843     return simde_mm_set_epi16(e0, e1, e2, e3, e4, e5, e6, e7);
19844   #endif
19845 }
19846 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
19847   #define _mm_setr_epi16(e7, e6, e5, e4, e3, e2, e1, e0) simde_mm_setr_epi16(e7, e6, e5, e4, e3, e2, e1, e0)
19848 #endif
19849 
19850 SIMDE_FUNCTION_ATTRIBUTES
19851 simde__m128i
19852 simde_mm_setr_epi32 (int32_t e3, int32_t e2, int32_t e1, int32_t e0) {
19853   #if defined(SIMDE_X86_SSE2_NATIVE)
19854     return _mm_setr_epi32(e3, e2, e1, e0);
19855   #else
19856     return simde_mm_set_epi32(e0, e1, e2, e3);
19857   #endif
19858 }
19859 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
19860   #define _mm_setr_epi32(e3, e2, e1, e0) simde_mm_setr_epi32(e3, e2, e1, e0)
19861 #endif
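
/* Illustrative sketch (not part of the generated header): the setr variants
 * take their arguments in memory order, so the first argument lands in lane
 * 0, the reverse of the plain set variants.  For example:
 *
 *   simde__m128i a = simde_mm_setr_epi32(1, 2, 3, 4);
 *   simde__m128i b = simde_mm_set_epi32(4, 3, 2, 1);
 *   // a and b are bitwise identical: lane 0 holds 1, lane 3 holds 4
 */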
19862 
19863 SIMDE_FUNCTION_ATTRIBUTES
19864 simde__m128i
19865 simde_mm_setr_epi64 (simde__m64 e1, simde__m64 e0) {
19866   #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
19867     return _mm_setr_epi64(e1, e0);
19868   #else
19869     return simde_mm_set_epi64(e0, e1);
19870   #endif
19871 }
19872 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
19873   #define _mm_setr_epi64(e1, e0) (simde_mm_setr_epi64((e1), (e0)))
19874 #endif
19875 
19876 SIMDE_FUNCTION_ATTRIBUTES
19877 simde__m128d
19878 simde_mm_setr_pd (simde_float64 e1, simde_float64 e0) {
19879   #if defined(SIMDE_X86_SSE2_NATIVE)
19880     return _mm_setr_pd(e1, e0);
19881   #else
19882     return simde_mm_set_pd(e0, e1);
19883   #endif
19884 }
19885 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
19886   #define _mm_setr_pd(e1, e0) simde_mm_setr_pd(e1, e0)
19887 #endif
19888 
19889 SIMDE_FUNCTION_ATTRIBUTES
19890 simde__m128d
19891 simde_mm_setzero_pd (void) {
19892   #if defined(SIMDE_X86_SSE2_NATIVE)
19893     return _mm_setzero_pd();
19894   #else
19895     return simde_mm_castsi128_pd(simde_mm_setzero_si128());
19896   #endif
19897 }
19898 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
19899   #define _mm_setzero_pd() simde_mm_setzero_pd()
19900 #endif
19901 
19902 #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
19903 HEDLEY_DIAGNOSTIC_PUSH
19904 SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_
19905 #endif
19906 
19907 SIMDE_FUNCTION_ATTRIBUTES
19908 simde__m128d
19909 simde_mm_undefined_pd (void) {
19910   simde__m128d_private r_;
19911 
19912   #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE__HAVE_UNDEFINED128)
19913     r_.n = _mm_undefined_pd();
19914   #elif !defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
19915     r_ = simde__m128d_to_private(simde_mm_setzero_pd());
19916   #endif
19917 
19918   return simde__m128d_from_private(r_);
19919 }
19920 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
19921   #define _mm_undefined_pd() simde_mm_undefined_pd()
19922 #endif
19923 
19924 SIMDE_FUNCTION_ATTRIBUTES
19925 simde__m128i
19926 simde_mm_undefined_si128 (void) {
19927   simde__m128i_private r_;
19928 
19929   #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE__HAVE_UNDEFINED128)
19930     r_.n = _mm_undefined_si128();
19931   #elif !defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
19932     r_ = simde__m128i_to_private(simde_mm_setzero_si128());
19933   #endif
19934 
19935   return simde__m128i_from_private(r_);
19936 }
19937 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
19938   #define _mm_undefined_si128() (simde_mm_undefined_si128())
19939 #endif
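
/* Note (not part of the generated header): unless the uninitialized-value
 * diagnostic suppression above is in effect, the portable fallbacks return a
 * zeroed vector, so the contents are only truly unspecified when the native
 * _mm_undefined_* intrinsics are used.  Either way, callers must not rely on
 * the value:
 *
 *   simde__m128i scratch = simde_mm_undefined_si128();  // contents unspecified
 */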
19940 
19941 #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
19942 HEDLEY_DIAGNOSTIC_POP
19943 #endif
19944 
19945 SIMDE_FUNCTION_ATTRIBUTES
19946 simde__m128d
19947 simde_x_mm_setone_pd (void) {
19948   return simde_mm_castps_pd(simde_x_mm_setone_ps());
19949 }
19950 
19951 SIMDE_FUNCTION_ATTRIBUTES
19952 simde__m128i
19953 simde_x_mm_setone_si128 (void) {
19954   return simde_mm_castps_si128(simde_x_mm_setone_ps());
19955 }
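
/* Illustrative sketch (not part of the generated header): the setone helpers
 * are SIMDe-specific and produce a vector with every bit set, which is handy
 * as an all-ones mask.  For example:
 *
 *   simde__m128i v    = simde_mm_set1_epi32(0x0f0f0f0f);
 *   simde__m128i ones = simde_x_mm_setone_si128();
 *   simde__m128i inv  = simde_mm_xor_si128(v, ones);     // bitwise NOT of v
 */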
19956 
19957 SIMDE_FUNCTION_ATTRIBUTES
19958 simde__m128i
19959 simde_mm_shuffle_epi32 (simde__m128i a, const int imm8)
19960     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255)  {
19961   simde__m128i_private
19962     r_,
19963     a_ = simde__m128i_to_private(a);
19964 
19965   for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
19966     r_.i32[i] = a_.i32[(imm8 >> (i * 2)) & 3];
19967   }
19968 
19969   return simde__m128i_from_private(r_);
19970 }
19971 #if defined(SIMDE_X86_SSE2_NATIVE)
19972   #define simde_mm_shuffle_epi32(a, imm8) _mm_shuffle_epi32((a), (imm8))
19973 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
19974   #define simde_mm_shuffle_epi32(a, imm8)                                   \
19975     __extension__({                                                         \
19976         int32x4_t ret;                                                      \
19977         ret = vmovq_n_s32(                                                  \
19978             vgetq_lane_s32(vreinterpretq_s32_s64(a), (imm8) & (0x3)));     \
19979         ret = vsetq_lane_s32(                                               \
19980             vgetq_lane_s32(vreinterpretq_s32_s64(a), ((imm8) >> 2) & 0x3), \
19981             ret, 1);                                                        \
19982         ret = vsetq_lane_s32(                                               \
19983             vgetq_lane_s32(vreinterpretq_s32_s64(a), ((imm8) >> 4) & 0x3), \
19984             ret, 2);                                                        \
19985         ret = vsetq_lane_s32(                                               \
19986             vgetq_lane_s32(vreinterpretq_s32_s64(a), ((imm8) >> 6) & 0x3), \
19987             ret, 3);                                                        \
19988         vreinterpretq_s64_s32(ret);                                       \
19989     })
19990 #elif defined(SIMDE_SHUFFLE_VECTOR_)
19991   #define simde_mm_shuffle_epi32(a, imm8) (__extension__ ({ \
19992       const simde__m128i_private simde__tmp_a_ = simde__m128i_to_private(a); \
19993       simde__m128i_from_private((simde__m128i_private) { .i32 = \
19994         SIMDE_SHUFFLE_VECTOR_(32, 16, \
19995           (simde__tmp_a_).i32, \
19996           (simde__tmp_a_).i32, \
19997           ((imm8)     ) & 3, \
19998           ((imm8) >> 2) & 3, \
19999           ((imm8) >> 4) & 3, \
20000           ((imm8) >> 6) & 3) }); }))
20001 #endif
20002 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
20003   #define _mm_shuffle_epi32(a, imm8) simde_mm_shuffle_epi32(a, imm8)
20004 #endif
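
/* Illustrative sketch (not part of the generated header): each 2-bit field
 * of imm8 selects the source lane for the corresponding result lane, so
 * _MM_SHUFFLE(3, 2, 1, 0) (0xE4) is the identity and 0x1B reverses the
 * lanes.  For example:
 *
 *   simde__m128i v = simde_mm_set_epi32(3, 2, 1, 0);   // lane i holds i
 *   simde__m128i r = simde_mm_shuffle_epi32(v, 0x1B);  // (0<<6)|(1<<4)|(2<<2)|3
 *   // lane 0 of r holds 3, lane 1 holds 2, lane 2 holds 1, lane 3 holds 0
 */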
20005 
20006 SIMDE_FUNCTION_ATTRIBUTES
20007 simde__m128d
20008 simde_mm_shuffle_pd (simde__m128d a, simde__m128d b, const int imm8)
20009     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 3)  {
20010   simde__m128d_private
20011     r_,
20012     a_ = simde__m128d_to_private(a),
20013     b_ = simde__m128d_to_private(b);
20014 
20015   r_.f64[0] = ((imm8 & 1) == 0) ? a_.f64[0] : a_.f64[1];
20016   r_.f64[1] = ((imm8 & 2) == 0) ? b_.f64[0] : b_.f64[1];
20017 
20018   return simde__m128d_from_private(r_);
20019 }
20020 #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
20021   #define simde_mm_shuffle_pd(a, b, imm8) _mm_shuffle_pd((a), (b), (imm8))
20022 #elif defined(SIMDE_SHUFFLE_VECTOR_)
20023   #define simde_mm_shuffle_pd(a, b, imm8) (__extension__ ({ \
20024       simde__m128d_from_private((simde__m128d_private) { .f64 = \
20025         SIMDE_SHUFFLE_VECTOR_(64, 16, \
20026           simde__m128d_to_private(a).f64, \
20027           simde__m128d_to_private(b).f64, \
20028           (((imm8)     ) & 1), \
20029           (((imm8) >> 1) & 1) + 2) }); }))
20030 #endif
20031 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
20032   #define _mm_shuffle_pd(a, b, imm8) simde_mm_shuffle_pd(a, b, imm8)
20033 #endif
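
/* Illustrative sketch (not part of the generated header): bit 0 of imm8
 * selects which element of a becomes the lower result element, and bit 1
 * selects which element of b becomes the upper one.  For example:
 *
 *   simde__m128d a = simde_mm_set_pd(2.0, 1.0);        // a: {1.0, 2.0}
 *   simde__m128d b = simde_mm_set_pd(4.0, 3.0);        // b: {3.0, 4.0}
 *   simde__m128d r = simde_mm_shuffle_pd(a, b, 1);     // r: {2.0, 3.0}
 */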
20034 
20035 SIMDE_FUNCTION_ATTRIBUTES
20036 simde__m128i
20037 simde_mm_shufflehi_epi16 (simde__m128i a, const int imm8)
20038     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255)  {
20039   simde__m128i_private
20040     r_,
20041     a_ = simde__m128i_to_private(a);
20042 
20043   SIMDE_VECTORIZE
20044   for (size_t i = 0 ; i < ((sizeof(a_.i16) / sizeof(a_.i16[0])) / 2) ; i++) {
20045     r_.i16[i] = a_.i16[i];
20046   }
20047   for (size_t i = ((sizeof(a_.i16) / sizeof(a_.i16[0])) / 2) ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
20048     r_.i16[i] = a_.i16[((imm8 >> ((i - 4) * 2)) & 3) + 4];
20049   }
20050 
20051   return simde__m128i_from_private(r_);
20052 }
20053 #if defined(SIMDE_X86_SSE2_NATIVE)
20054   #define simde_mm_shufflehi_epi16(a, imm8) _mm_shufflehi_epi16((a), (imm8))
20055 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
20056   #define simde_mm_shufflehi_epi16(a, imm8) \
20057     __extension__({                                                            \
20058         int16x8_t ret = vreinterpretq_s16_s64(a);                            \
20059         int16x4_t highBits = vget_high_s16(ret);                               \
20060         ret = vsetq_lane_s16(vget_lane_s16(highBits, (imm8) & (0x3)), ret, 4);  \
20061         ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm8) >> 2) & 0x3), ret, \
20062                              5);                                               \
20063         ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm8) >> 4) & 0x3), ret, \
20064                              6);                                               \
20065         ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm8) >> 6) & 0x3), ret, \
20066                              7);                                               \
20067         vreinterpretq_s64_s16(ret);                                          \
20068     })
20069 #elif defined(SIMDE_SHUFFLE_VECTOR_)
20070   #define simde_mm_shufflehi_epi16(a, imm8) (__extension__ ({ \
20071       const simde__m128i_private simde__tmp_a_ = simde__m128i_to_private(a); \
20072       simde__m128i_from_private((simde__m128i_private) { .i16 = \
20073         SIMDE_SHUFFLE_VECTOR_(16, 16, \
20074           (simde__tmp_a_).i16, \
20075           (simde__tmp_a_).i16, \
20076           0, 1, 2, 3, \
20077           (((imm8)     ) & 3) + 4, \
20078           (((imm8) >> 2) & 3) + 4, \
20079           (((imm8) >> 4) & 3) + 4, \
20080           (((imm8) >> 6) & 3) + 4) }); }))
20081 #endif
20082 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
20083   #define _mm_shufflehi_epi16(a, imm8) simde_mm_shufflehi_epi16(a, imm8)
20084 #endif
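
/* Illustrative sketch (not part of the generated header): the four low
 * 16-bit lanes pass through unchanged; each 2-bit field of imm8 picks one of
 * the four high lanes for the corresponding high result lane.  For example:
 *
 *   simde__m128i v = simde_mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0); // lane i holds i
 *   simde__m128i r = simde_mm_shufflehi_epi16(v, 0x00);
 *   // lanes 0..3 of r stay 0..3, lanes 4..7 all hold lane 4's value, 4
 */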
20085 
20086 SIMDE_FUNCTION_ATTRIBUTES
20087 simde__m128i
20088 simde_mm_shufflelo_epi16 (simde__m128i a, const int imm8)
20089     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255)  {
20090   simde__m128i_private
20091     r_,
20092     a_ = simde__m128i_to_private(a);
20093 
20094   for (size_t i = 0 ; i < ((sizeof(r_.i16) / sizeof(r_.i16[0])) / 2) ; i++) {
20095     r_.i16[i] = a_.i16[((imm8 >> (i * 2)) & 3)];
20096   }
20097   SIMDE_VECTORIZE
20098   for (size_t i = ((sizeof(a_.i16) / sizeof(a_.i16[0])) / 2) ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
20099     r_.i16[i] = a_.i16[i];
20100   }
20101 
20102   return simde__m128i_from_private(r_);
20103 }
20104 #if defined(SIMDE_X86_SSE2_NATIVE)
20105   #define simde_mm_shufflelo_epi16(a, imm8) _mm_shufflelo_epi16((a), (imm8))
20106 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
20107   #define simde_mm_shufflelo_epi16(a, imm8)                                  \
20108     __extension__({                                                           \
20109         int16x8_t ret = vreinterpretq_s16_s64(a);                           \
20110         int16x4_t lowBits = vget_low_s16(ret);                                \
20111         ret = vsetq_lane_s16(vget_lane_s16(lowBits, (imm8) & (0x3)), ret, 0);  \
20112         ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm8) >> 2) & 0x3), ret, \
20113                              1);                                              \
20114         ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm8) >> 4) & 0x3), ret, \
20115                              2);                                              \
20116         ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm8) >> 6) & 0x3), ret, \
20117                              3);                                              \
20118         vreinterpretq_s64_s16(ret);                                         \
20119     })
20120 #elif defined(SIMDE_SHUFFLE_VECTOR_)
20121   #define simde_mm_shufflelo_epi16(a, imm8) (__extension__ ({ \
20122       const simde__m128i_private simde__tmp_a_ = simde__m128i_to_private(a); \
20123       simde__m128i_from_private((simde__m128i_private) { .i16 = \
20124         SIMDE_SHUFFLE_VECTOR_(16, 16, \
20125           (simde__tmp_a_).i16, \
20126           (simde__tmp_a_).i16, \
20127           (((imm8)     ) & 3), \
20128           (((imm8) >> 2) & 3), \
20129           (((imm8) >> 4) & 3), \
20130           (((imm8) >> 6) & 3), \
20131           4, 5, 6, 7) }); }))
20132 #endif
20133 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
20134   #define _mm_shufflelo_epi16(a, imm8) simde_mm_shufflelo_epi16(a, imm8)
20135 #endif
20136 
20137 SIMDE_FUNCTION_ATTRIBUTES
20138 simde__m128i
20139 simde_mm_sll_epi16 (simde__m128i a, simde__m128i count) {
20140   #if defined(SIMDE_X86_SSE2_NATIVE)
20141     return _mm_sll_epi16(a, count);
20142   #else
20143     simde__m128i_private
20144       r_,
20145       a_ = simde__m128i_to_private(a),
20146       count_ = simde__m128i_to_private(count);
20147 
20148     if (count_.u64[0] > 15)
20149       return simde_mm_setzero_si128();
20150 
20151     #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
20152       r_.u16 = (a_.u16 << count_.u64[0]);
20153     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
20154       r_.neon_u16 = vshlq_u16(a_.neon_u16, vdupq_n_s16(HEDLEY_STATIC_CAST(int16_t, count_.u64[0])));
20155     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
20156       r_.wasm_v128 = ((wasm_i64x2_extract_lane(count_.wasm_v128, 0) < 16) ? wasm_i16x8_shl(a_.wasm_v128, HEDLEY_STATIC_CAST(int32_t, wasm_i64x2_extract_lane(count_.wasm_v128, 0))) : wasm_i16x8_const(0,0,0,0,0,0,0,0));
20157     #else
20158       SIMDE_VECTORIZE
20159       for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
20160         r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, (a_.u16[i] << count_.u64[0]));
20161       }
20162     #endif
20163 
20164     return simde__m128i_from_private(r_);
20165   #endif
20166 }
20167 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
20168   #define _mm_sll_epi16(a, count) simde_mm_sll_epi16((a), (count))
20169 #endif
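
/* Illustrative sketch (not part of the generated header): for the vector
 * shift-count forms the amount is taken from the low 64 bits of `count`, and
 * any amount greater than 15 (for 16-bit lanes) yields all zeros.  For
 * example:
 *
 *   simde__m128i a     = simde_mm_set1_epi16(1);
 *   simde__m128i count = simde_mm_cvtsi32_si128(3);    // shift by 3
 *   simde__m128i r     = simde_mm_sll_epi16(a, count); // every lane holds 8
 */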
20170 
20171 SIMDE_FUNCTION_ATTRIBUTES
20172 simde__m128i
20173 simde_mm_sll_epi32 (simde__m128i a, simde__m128i count) {
20174   #if defined(SIMDE_X86_SSE2_NATIVE)
20175     return _mm_sll_epi32(a, count);
20176   #else
20177     simde__m128i_private
20178       r_,
20179       a_ = simde__m128i_to_private(a),
20180       count_ = simde__m128i_to_private(count);
20181 
20182     if (count_.u64[0] > 31)
20183       return simde_mm_setzero_si128();
20184 
20185     #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
20186       r_.u32 = (a_.u32 << count_.u64[0]);
20187     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
20188       r_.neon_u32 = vshlq_u32(a_.neon_u32, vdupq_n_s32(HEDLEY_STATIC_CAST(int32_t, count_.u64[0])));
20189     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
20190       r_.wasm_v128 = ((wasm_i64x2_extract_lane(count_.wasm_v128, 0) < 32) ? wasm_i32x4_shl(a_.wasm_v128, HEDLEY_STATIC_CAST(int32_t, wasm_i64x2_extract_lane(count_.wasm_v128, 0))) : wasm_i32x4_const(0,0,0,0));
20191     #else
20192       SIMDE_VECTORIZE
20193       for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
20194         r_.u32[i] = HEDLEY_STATIC_CAST(uint32_t, (a_.u32[i] << count_.u64[0]));
20195       }
20196     #endif
20197 
20198     return simde__m128i_from_private(r_);
20199   #endif
20200 }
20201 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
20202   #define _mm_sll_epi32(a, count) (simde_mm_sll_epi32(a, (count)))
20203 #endif
20204 
20205 SIMDE_FUNCTION_ATTRIBUTES
20206 simde__m128i
20207 simde_mm_sll_epi64 (simde__m128i a, simde__m128i count) {
20208   #if defined(SIMDE_X86_SSE2_NATIVE)
20209     return _mm_sll_epi64(a, count);
20210   #else
20211     simde__m128i_private
20212       r_,
20213       a_ = simde__m128i_to_private(a),
20214       count_ = simde__m128i_to_private(count);
20215 
20216     if (count_.u64[0] > 63)
20217       return simde_mm_setzero_si128();
20218 
20219     const int_fast16_t s = HEDLEY_STATIC_CAST(int_fast16_t, count_.u64[0]);
20220     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
20221       r_.neon_u64 = vshlq_u64(a_.neon_u64, vdupq_n_s64(HEDLEY_STATIC_CAST(int64_t, s)));
20222     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
20223       r_.wasm_v128 = (s < 64) ? wasm_i64x2_shl(a_.wasm_v128, s) : wasm_i64x2_const(0,0);
20224     #else
20225       #if !defined(SIMDE_BUG_GCC_94488)
20226         SIMDE_VECTORIZE
20227       #endif
20228       for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) {
20229         r_.u64[i] = a_.u64[i] << s;
20230       }
20231     #endif
20232 
20233     return simde__m128i_from_private(r_);
20234   #endif
20235 }
20236 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
20237   #define _mm_sll_epi64(a, count) (simde_mm_sll_epi64(a, (count)))
20238 #endif
20239 
20240 SIMDE_FUNCTION_ATTRIBUTES
20241 simde__m128d
20242 simde_mm_sqrt_pd (simde__m128d a) {
20243   #if defined(SIMDE_X86_SSE2_NATIVE)
20244     return _mm_sqrt_pd(a);
20245   #else
20246     simde__m128d_private
20247       r_,
20248       a_ = simde__m128d_to_private(a);
20249 
20250     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
20251       r_.neon_f64 = vsqrtq_f64(a_.neon_f64);
20252     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
20253       r_.wasm_v128 = wasm_f64x2_sqrt(a_.wasm_v128);
20254     #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
20255       r_.altivec_f64 = vec_sqrt(a_.altivec_f64);
20256     #elif defined(simde_math_sqrt)
20257       SIMDE_VECTORIZE
20258       for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
20259         r_.f64[i] = simde_math_sqrt(a_.f64[i]);
20260       }
20261     #else
20262       HEDLEY_UNREACHABLE();
20263     #endif
20264 
20265     return simde__m128d_from_private(r_);
20266   #endif
20267 }
20268 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
20269   #define _mm_sqrt_pd(a) simde_mm_sqrt_pd(a)
20270 #endif
20271 
20272 SIMDE_FUNCTION_ATTRIBUTES
20273 simde__m128d
20274 simde_mm_sqrt_sd (simde__m128d a, simde__m128d b) {
20275   #if defined(SIMDE_X86_SSE2_NATIVE)
20276     return _mm_sqrt_sd(a, b);
20277   #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
20278     return simde_mm_move_sd(a, simde_mm_sqrt_pd(b));
20279   #else
20280     simde__m128d_private
20281       r_,
20282       a_ = simde__m128d_to_private(a),
20283       b_ = simde__m128d_to_private(b);
20284 
20285     #if defined(simde_math_sqrt)
20286       r_.f64[0] = simde_math_sqrt(b_.f64[0]);
20287       r_.f64[1] = a_.f64[1];
20288     #else
20289       HEDLEY_UNREACHABLE();
20290     #endif
20291 
20292     return simde__m128d_from_private(r_);
20293   #endif
20294 }
20295 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
20296   #define _mm_sqrt_sd(a, b) simde_mm_sqrt_sd(a, b)
20297 #endif
20298 
20299 SIMDE_FUNCTION_ATTRIBUTES
20300 simde__m128i
20301 simde_mm_srl_epi16 (simde__m128i a, simde__m128i count) {
20302   #if defined(SIMDE_X86_SSE2_NATIVE)
20303     return _mm_srl_epi16(a, count);
20304   #else
20305     simde__m128i_private
20306       r_,
20307       a_ = simde__m128i_to_private(a),
20308       count_ = simde__m128i_to_private(count);
20309 
20310     const int cnt = HEDLEY_STATIC_CAST(int, (count_.i64[0] > 16 ? 16 : count_.i64[0]));
20311 
20312     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
20313       r_.neon_u16 = vshlq_u16(a_.neon_u16, vdupq_n_s16(HEDLEY_STATIC_CAST(int16_t, -cnt)));
20314     #else
20315       SIMDE_VECTORIZE
20316       for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
20317         r_.u16[i] = a_.u16[i] >> cnt;
20318       }
20319     #endif
20320 
20321     return simde__m128i_from_private(r_);
20322   #endif
20323 }
20324 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
20325   #define _mm_srl_epi16(a, count) (simde_mm_srl_epi16(a, (count)))
20326 #endif
20327 
20328 SIMDE_FUNCTION_ATTRIBUTES
20329 simde__m128i
20330 simde_mm_srl_epi32 (simde__m128i a, simde__m128i count) {
20331   #if defined(SIMDE_X86_SSE2_NATIVE)
20332     return _mm_srl_epi32(a, count);
20333   #else
20334     simde__m128i_private
20335       r_,
20336       a_ = simde__m128i_to_private(a),
20337       count_ = simde__m128i_to_private(count);
20338 
20339     const int cnt = HEDLEY_STATIC_CAST(int, (count_.i64[0] > 32 ? 32 : count_.i64[0]));
20340 
20341     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
20342       r_.neon_u32 = vshlq_u32(a_.neon_u32, vdupq_n_s32(HEDLEY_STATIC_CAST(int32_t, -cnt)));
20343     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
20344       r_.wasm_v128 = wasm_u32x4_shr(a_.wasm_v128, cnt);
20345     #else
20346       SIMDE_VECTORIZE
20347       for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
20348         r_.u32[i] = a_.u32[i] >> cnt;
20349       }
20350     #endif
20351 
20352     return simde__m128i_from_private(r_);
20353   #endif
20354 }
20355 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
20356   #define _mm_srl_epi32(a, count) (simde_mm_srl_epi32(a, (count)))
20357 #endif
20358 
20359 SIMDE_FUNCTION_ATTRIBUTES
20360 simde__m128i
20361 simde_mm_srl_epi64 (simde__m128i a, simde__m128i count) {
20362   #if defined(SIMDE_X86_SSE2_NATIVE)
20363     return _mm_srl_epi64(a, count);
20364   #else
20365     simde__m128i_private
20366       r_,
20367       a_ = simde__m128i_to_private(a),
20368       count_ = simde__m128i_to_private(count);
20369 
20370     const int cnt = HEDLEY_STATIC_CAST(int, (count_.i64[0] > 64 ? 64 : count_.i64[0]));
20371 
20372     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
20373       r_.neon_u64 = vshlq_u64(a_.neon_u64, vdupq_n_s64(HEDLEY_STATIC_CAST(int64_t, -cnt)));
20374     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
20375       r_.wasm_v128 = wasm_u64x2_shr(a_.wasm_v128, cnt);
20376     #else
20377       #if !defined(SIMDE_BUG_GCC_94488)
20378         SIMDE_VECTORIZE
20379       #endif
20380       for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) {
20381         r_.u64[i] = a_.u64[i] >> cnt;
20382       }
20383     #endif
20384 
20385     return simde__m128i_from_private(r_);
20386   #endif
20387 }
20388 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
20389   #define _mm_srl_epi64(a, count) (simde_mm_srl_epi64(a, (count)))
20390 #endif
20391 
20392 SIMDE_FUNCTION_ATTRIBUTES
20393 simde__m128i
20394 simde_mm_srai_epi16 (simde__m128i a, const int imm8)
20395     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
20396   /* MSVC requires a range of (0, 255). */
20397   simde__m128i_private
20398     r_,
20399     a_ = simde__m128i_to_private(a);
20400 
20401   const int cnt = (imm8 & ~15) ? 15 : imm8;
20402 
20403   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
20404     r_.neon_i16 = vshlq_s16(a_.neon_i16, vdupq_n_s16(HEDLEY_STATIC_CAST(int16_t, -cnt)));
20405   #elif defined(SIMDE_WASM_SIMD128_NATIVE)
20406     r_.wasm_v128 = wasm_i16x8_shr(a_.wasm_v128, cnt);
20407   #else
20408     SIMDE_VECTORIZE
20409     for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.i16[0])) ; i++) {
20410       r_.i16[i] = a_.i16[i] >> cnt;
20411     }
20412   #endif
20413 
20414   return simde__m128i_from_private(r_);
20415 }
20416 #if defined(SIMDE_X86_SSE2_NATIVE)
20417   #define simde_mm_srai_epi16(a, imm8) _mm_srai_epi16((a), (imm8))
20418 #endif
20419 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
20420   #define _mm_srai_epi16(a, imm8) simde_mm_srai_epi16(a, imm8)
20421 #endif
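
/* Illustrative sketch (not part of the generated header): the arithmetic
 * right shift replicates the sign bit, and counts of 16 or more are clamped
 * to 15 rather than zeroing the result.  For example:
 *
 *   simde__m128i a = simde_mm_set1_epi16(-32);
 *   simde__m128i r = simde_mm_srai_epi16(a, 2);    // every lane holds -8
 *   simde__m128i s = simde_mm_srai_epi16(a, 200);  // every lane holds -1
 */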
20422 
20423 SIMDE_FUNCTION_ATTRIBUTES
20424 simde__m128i
20425 simde_mm_srai_epi32 (simde__m128i a, const int imm8)
20426     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
20427   /* MSVC requires a range of (0, 255). */
20428   simde__m128i_private
20429     r_,
20430     a_ = simde__m128i_to_private(a);
20431 
20432   const int cnt = (imm8 & ~31) ? 31 : imm8;
20433 
20434   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
20435     r_.neon_i32 = vshlq_s32(a_.neon_i32, vdupq_n_s32(-cnt));
20436   #elif defined(SIMDE_WASM_SIMD128_NATIVE)
20437     r_.wasm_v128 = wasm_i32x4_shr(a_.wasm_v128, cnt);
20438   #else
20439     SIMDE_VECTORIZE
20440     for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.i32[0])) ; i++) {
20441       r_.i32[i] = a_.i32[i] >> cnt;
20442     }
20443   #endif
20444 
20445   return simde__m128i_from_private(r_);
20446 }
20447 #if defined(SIMDE_X86_SSE2_NATIVE)
20448   #define simde_mm_srai_epi32(a, imm8) _mm_srai_epi32((a), (imm8))
20449 #endif
20450 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
20451   #define _mm_srai_epi32(a, imm8) simde_mm_srai_epi32(a, imm8)
20452 #endif
20453 
20454 SIMDE_FUNCTION_ATTRIBUTES
20455 simde__m128i
20456 simde_mm_sra_epi16 (simde__m128i a, simde__m128i count) {
20457   #if defined(SIMDE_X86_SSE2_NATIVE)
20458     return _mm_sra_epi16(a, count);
20459   #else
20460     simde__m128i_private
20461       r_,
20462       a_ = simde__m128i_to_private(a),
20463       count_ = simde__m128i_to_private(count);
20464 
20465     const int cnt = HEDLEY_STATIC_CAST(int, (count_.i64[0] > 15 ? 15 : count_.i64[0]));
20466 
20467     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
20468       r_.neon_i16 = vshlq_s16(a_.neon_i16, vdupq_n_s16(HEDLEY_STATIC_CAST(int16_t, -cnt)));
20469     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
20470       r_.wasm_v128 = wasm_i16x8_shr(a_.wasm_v128, cnt);
20471     #else
20472       SIMDE_VECTORIZE
20473       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
20474         r_.i16[i] = a_.i16[i] >> cnt;
20475       }
20476     #endif
20477 
20478     return simde__m128i_from_private(r_);
20479   #endif
20480 }
20481 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
20482   #define _mm_sra_epi16(a, count) (simde_mm_sra_epi16(a, count))
20483 #endif
20484 
20485 SIMDE_FUNCTION_ATTRIBUTES
20486 simde__m128i
20487 simde_mm_sra_epi32 (simde__m128i a, simde__m128i count) {
20488   #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_MM_SRA_EPI32)
20489     return _mm_sra_epi32(a, count);
20490   #else
20491     simde__m128i_private
20492       r_,
20493       a_ = simde__m128i_to_private(a),
20494       count_ = simde__m128i_to_private(count);
20495 
20496     const int cnt = count_.u64[0] > 31 ? 31 : HEDLEY_STATIC_CAST(int, count_.u64[0]);
20497 
20498     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
20499       r_.neon_i32 = vshlq_s32(a_.neon_i32, vdupq_n_s32(HEDLEY_STATIC_CAST(int32_t, -cnt)));
20500     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
20501       r_.wasm_v128 = wasm_i32x4_shr(a_.wasm_v128, cnt);
20502     #else
20503       SIMDE_VECTORIZE
20504       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
20505         r_.i32[i] = a_.i32[i] >> cnt;
20506       }
20507     #endif
20508 
20509     return simde__m128i_from_private(r_);
20510   #endif
20511 }
20512 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
20513   #define _mm_sra_epi32(a, count) (simde_mm_sra_epi32(a, (count)))
20514 #endif
20515 
20516 SIMDE_FUNCTION_ATTRIBUTES
20517 simde__m128i
20518 simde_mm_slli_epi16 (simde__m128i a, const int imm8)
20519     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255)  {
20520   if (HEDLEY_UNLIKELY((imm8 > 15))) {
20521     return simde_mm_setzero_si128();
20522   }
20523 
20524   simde__m128i_private
20525     r_,
20526     a_ = simde__m128i_to_private(a);
20527 
20528   #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
20529     r_.i16 = a_.i16 << SIMDE_CAST_VECTOR_SHIFT_COUNT(8, imm8 & 0xff);
20530   #else
20531     const int s = (imm8 > HEDLEY_STATIC_CAST(int, sizeof(r_.i16[0]) * CHAR_BIT) - 1) ? 0 : imm8;
20532     SIMDE_VECTORIZE
20533     for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
20534       r_.i16[i] = HEDLEY_STATIC_CAST(int16_t, a_.i16[i] << s);
20535     }
20536   #endif
20537 
20538   return simde__m128i_from_private(r_);
20539 }
20540 #if defined(SIMDE_X86_SSE2_NATIVE)
20541   #define simde_mm_slli_epi16(a, imm8) _mm_slli_epi16(a, imm8)
20542 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
20543   #define simde_mm_slli_epi16(a, imm8) \
20544      (__extension__ ({ \
20545         simde__m128i ret; \
20546         if ((imm8) <= 0) { \
20547             ret = a; \
20548         } else if ((imm8) > 15) { \
20549             ret = simde_mm_setzero_si128(); \
20550         } else { \
20551             ret = simde__m128i_from_neon_i16( \
20552                 vshlq_n_s16(simde__m128i_to_neon_i16(a), ((imm8) & 15))); \
20553         } \
20554         ret; \
20555     }))
20556 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
20557   #define simde_mm_slli_epi16(a, imm8) \
20558     ((imm8 < 16) ? wasm_i16x8_shl(simde__m128i_to_private(a).wasm_v128, imm8) : wasm_i16x8_const(0,0,0,0,0,0,0,0))
20559 #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
20560   #define simde_mm_slli_epi16(a, imm8) \
20561     ((imm8 & ~15) ? simde_mm_setzero_si128() : simde__m128i_from_altivec_i16(vec_sl(simde__m128i_to_altivec_i16(a), vec_splat_u16(HEDLEY_STATIC_CAST(unsigned short, imm8)))))
20562 #endif
20563 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
20564   #define _mm_slli_epi16(a, imm8) simde_mm_slli_epi16(a, imm8)
20565 #endif
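
/* Illustrative sketch (not part of the generated header): the immediate
 * logical shifts return zero once the count reaches the element width, in
 * contrast to the arithmetic shifts above, which clamp the count.  For
 * example:
 *
 *   simde__m128i a = simde_mm_set1_epi16(1);
 *   simde__m128i r = simde_mm_slli_epi16(a, 3);    // every lane holds 8
 *   simde__m128i z = simde_mm_slli_epi16(a, 16);   // every lane holds 0
 */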
20566 
20567 SIMDE_FUNCTION_ATTRIBUTES
20568 simde__m128i
20569 simde_mm_slli_epi32 (simde__m128i a, const int imm8)
20570     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255)  {
20571   if (HEDLEY_UNLIKELY((imm8 > 31))) {
20572     return simde_mm_setzero_si128();
20573   }
20574   simde__m128i_private
20575     r_,
20576     a_ = simde__m128i_to_private(a);
20577 
20578   #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
20579     r_.i32 = a_.i32 << imm8;
20580   #else
20581     SIMDE_VECTORIZE
20582     for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
20583       r_.i32[i] = a_.i32[i] << (imm8 & 0xff);
20584     }
20585   #endif
20586 
20587   return simde__m128i_from_private(r_);
20588 }
20589 #if defined(SIMDE_X86_SSE2_NATIVE)
20590   #define simde_mm_slli_epi32(a, imm8) _mm_slli_epi32(a, imm8)
20591 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
20592   #define simde_mm_slli_epi32(a, imm8) \
20593      (__extension__ ({ \
20594        simde__m128i ret; \
20595        if ((imm8) <= 0) { \
20596          ret = a; \
20597        } else if ((imm8) > 31) { \
20598          ret = simde_mm_setzero_si128(); \
20599        } else { \
20600          ret = simde__m128i_from_neon_i32( \
20601            vshlq_n_s32(simde__m128i_to_neon_i32(a), ((imm8) & 31))); \
20602        } \
20603        ret; \
20604     }))
20605 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
20606   #define simde_mm_slli_epi32(a, imm8) \
20607     ((imm8 < 32) ? wasm_i32x4_shl(simde__m128i_to_private(a).wasm_v128, imm8) : wasm_i32x4_const(0,0,0,0))
20608 #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
20609   #define simde_mm_slli_epi32(a, imm8) \
20610      (__extension__ ({ \
20611        simde__m128i ret; \
20612        if ((imm8) <= 0) { \
20613          ret = a; \
20614        } else if ((imm8) > 31) { \
20615          ret = simde_mm_setzero_si128(); \
20616        } else { \
20617          ret = simde__m128i_from_altivec_i32( \
20618            vec_sl(simde__m128i_to_altivec_i32(a), \
20619              vec_splats(HEDLEY_STATIC_CAST(unsigned int, (imm8) & 31)))); \
20620        } \
20621        ret; \
20622      }))
20623 #endif
20624 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
20625   #define _mm_slli_epi32(a, imm8) simde_mm_slli_epi32(a, imm8)
20626 #endif
20627 
20628 SIMDE_FUNCTION_ATTRIBUTES
20629 simde__m128i
20630 simde_mm_slli_epi64 (simde__m128i a, const int imm8)
20631     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255)  {
20632   if (HEDLEY_UNLIKELY((imm8 > 63))) {
20633     return simde_mm_setzero_si128();
20634   }
20635   simde__m128i_private
20636     r_,
20637     a_ = simde__m128i_to_private(a);
20638 
20639   #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
20640     r_.i64 = a_.i64 << imm8;
20641   #else
20642     SIMDE_VECTORIZE
20643     for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
20644       r_.i64[i] = a_.i64[i] << (imm8 & 0xff);
20645     }
20646   #endif
20647 
20648   return simde__m128i_from_private(r_);
20649 }
20650 #if defined(SIMDE_X86_SSE2_NATIVE)
20651   #define simde_mm_slli_epi64(a, imm8) _mm_slli_epi64(a, imm8)
20652 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
20653   #define simde_mm_slli_epi64(a, imm8) \
20654      (__extension__ ({ \
20655         simde__m128i ret; \
20656         if ((imm8) <= 0) { \
20657             ret = a; \
20658         } else if ((imm8) > 63) { \
20659             ret = simde_mm_setzero_si128(); \
20660         } else { \
20661             ret = simde__m128i_from_neon_i64( \
20662                 vshlq_n_s64(simde__m128i_to_neon_i64(a), ((imm8) & 63))); \
20663         } \
20664         ret; \
20665     }))
20666 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
20667   #define simde_mm_slli_epi64(a, imm8) \
20668     ((imm8 < 64) ? wasm_i64x2_shl(simde__m128i_to_private(a).wasm_v128, imm8) : wasm_i64x2_const(0,0))
20669 #endif
20670 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
20671   #define _mm_slli_epi64(a, imm8) simde_mm_slli_epi64(a, imm8)
20672 #endif
20673 
20674 SIMDE_FUNCTION_ATTRIBUTES
20675 simde__m128i
20676 simde_mm_srli_epi16 (simde__m128i a, const int imm8)
20677     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255)  {
20678   if (HEDLEY_UNLIKELY((imm8 > 15))) {
20679     return simde_mm_setzero_si128();
20680   }
20681   simde__m128i_private
20682     r_,
20683     a_ = simde__m128i_to_private(a);
20684 
20685   #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
20686     r_.u16 = a_.u16 >> SIMDE_CAST_VECTOR_SHIFT_COUNT(8, imm8);
20687   #else
20688     SIMDE_VECTORIZE
20689     for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
20690       r_.u16[i] = a_.u16[i] >> (imm8 & 0xff);
20691     }
20692   #endif
20693 
20694   return simde__m128i_from_private(r_);
20695 }
20696 #if defined(SIMDE_X86_SSE2_NATIVE)
20697   #define simde_mm_srli_epi16(a, imm8) _mm_srli_epi16(a, imm8)
20698 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
20699   #define simde_mm_srli_epi16(a, imm8) \
20700      (__extension__ ({ \
20701         simde__m128i ret; \
20702         if ((imm8) <= 0) { \
20703             ret = a; \
20704         } else if ((imm8) > 15) { \
20705             ret = simde_mm_setzero_si128(); \
20706         } else { \
20707             ret = simde__m128i_from_neon_u16( \
20708                 vshrq_n_u16(simde__m128i_to_neon_u16(a), (((imm8) & 15) | (((imm8) & 15) == 0)))); \
20709         } \
20710         ret; \
20711     }))
20712 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
20713   #define simde_mm_srli_epi16(a, imm8) \
20714     ((imm8 < 16) ? wasm_u16x8_shr(simde__m128i_to_private(a).wasm_v128, imm8) : wasm_i16x8_const(0,0,0,0,0,0,0,0))
20715 #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
20716   #define simde_mm_srli_epi16(a, imm8) \
20717     ((imm8 & ~15) ? simde_mm_setzero_si128() : simde__m128i_from_altivec_i16(vec_sr(simde__m128i_to_altivec_i16(a), vec_splat_u16(HEDLEY_STATIC_CAST(unsigned short, imm8)))))
20718 #endif
20719 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
20720   #define _mm_srli_epi16(a, imm8) simde_mm_srli_epi16(a, imm8)
20721 #endif
20722 
20723 SIMDE_FUNCTION_ATTRIBUTES
20724 simde__m128i
20725 simde_mm_srli_epi32 (simde__m128i a, const int imm8)
20726     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255)  {
20727   if (HEDLEY_UNLIKELY((imm8 > 31))) {
20728     return simde_mm_setzero_si128();
20729   }
20730   simde__m128i_private
20731     r_,
20732     a_ = simde__m128i_to_private(a);
20733 
20734   #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
20735     r_.u32 = a_.u32 >> SIMDE_CAST_VECTOR_SHIFT_COUNT(8, imm8 & 0xff);
20736   #else
20737     SIMDE_VECTORIZE
20738     for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
20739       r_.u32[i] = a_.u32[i] >> (imm8 & 0xff);
20740     }
20741   #endif
20742 
20743   return simde__m128i_from_private(r_);
20744 }
20745 #if defined(SIMDE_X86_SSE2_NATIVE)
20746   #define simde_mm_srli_epi32(a, imm8) _mm_srli_epi32(a, imm8)
20747 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
20748   #define simde_mm_srli_epi32(a, imm8) \
20749     (__extension__ ({ \
20750         simde__m128i ret; \
20751         if ((imm8) <= 0) { \
20752             ret = a; \
20753         } else if ((imm8) > 31) { \
20754             ret = simde_mm_setzero_si128(); \
20755         } else { \
20756             ret = simde__m128i_from_neon_u32( \
20757               vshrq_n_u32(simde__m128i_to_neon_u32(a), (((imm8) & 31) | (((imm8) & 31) == 0)))); \
20758         } \
20759         ret; \
20760     }))
20761 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
20762   #define simde_mm_srli_epi32(a, imm8) \
20763     ((imm8 < 32) ? wasm_u32x4_shr(simde__m128i_to_private(a).wasm_v128, imm8) : wasm_i32x4_const(0,0,0,0))
20764 #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
20765   #define simde_mm_srli_epi32(a, imm8) \
20766     (__extension__ ({ \
20767         simde__m128i ret; \
20768         if ((imm8) <= 0) { \
20769             ret = a; \
20770         } else if ((imm8) > 31) { \
20771             ret = simde_mm_setzero_si128(); \
20772         } else { \
20773             ret = simde__m128i_from_altivec_i32( \
20774               vec_sr(simde__m128i_to_altivec_i32(a), \
20775                 vec_splats(HEDLEY_STATIC_CAST(unsigned int, (imm8) & 31)))); \
20776         } \
20777         ret; \
20778     }))
20779 #endif
20780 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
20781   #define _mm_srli_epi32(a, imm8) simde_mm_srli_epi32(a, imm8)
20782 #endif
20783 
20784 SIMDE_FUNCTION_ATTRIBUTES
20785 simde__m128i
20786 simde_mm_srli_epi64 (simde__m128i a, const int imm8)
20787     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255)  {
20788   simde__m128i_private
20789     r_,
20790     a_ = simde__m128i_to_private(a);
20791 
20792   if (HEDLEY_UNLIKELY((imm8 & 63) != imm8))
20793     return simde_mm_setzero_si128();
20794 
20795   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
20796     r_.neon_u64 = vshlq_u64(a_.neon_u64, vdupq_n_s64(-imm8));
20797   #else
20798     #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_94488)
20799       r_.u64 = a_.u64 >> SIMDE_CAST_VECTOR_SHIFT_COUNT(8, imm8);
20800     #else
20801       SIMDE_VECTORIZE
20802       for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
20803         r_.u64[i] = a_.u64[i] >> imm8;
20804       }
20805     #endif
20806   #endif
20807 
20808   return simde__m128i_from_private(r_);
20809 }
20810 #if defined(SIMDE_X86_SSE2_NATIVE)
20811   #define simde_mm_srli_epi64(a, imm8) _mm_srli_epi64(a, imm8)
20812 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
20813   #define simde_mm_srli_epi64(a, imm8) \
20814     (__extension__ ({ \
20815         simde__m128i ret; \
20816         if ((imm8) <= 0) { \
20817             ret = a; \
20818         } else if ((imm8) > 63) { \
20819             ret = simde_mm_setzero_si128(); \
20820         } else { \
20821             ret = simde__m128i_from_neon_u64( \
20822               vshrq_n_u64(simde__m128i_to_neon_u64(a), (((imm8) & 63) | (((imm8) & 63) == 0)))); \
20823         } \
20824         ret; \
20825     }))
20826 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
20827   #define simde_mm_srli_epi64(a, imm8) \
20828     ((imm8 < 64) ? wasm_u64x2_shr(simde__m128i_to_private(a).wasm_v128, imm8) : wasm_i64x2_const(0,0))
20829 #endif
20830 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
20831   #define _mm_srli_epi64(a, imm8) simde_mm_srli_epi64(a, imm8)
20832 #endif
20833 
20834 SIMDE_FUNCTION_ATTRIBUTES
20835 void
20836 simde_mm_store_pd (simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)], simde__m128d a) {
20837   #if defined(SIMDE_X86_SSE2_NATIVE)
20838     _mm_store_pd(mem_addr, a);
20839   #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
20840     vst1q_f64(mem_addr, simde__m128d_to_private(a).neon_f64);
20841   #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
20842     vst1q_s64(HEDLEY_REINTERPRET_CAST(int64_t*, mem_addr), simde__m128d_to_private(a).neon_i64);
20843   #else
20844     simde_memcpy(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m128d), &a, sizeof(a));
20845   #endif
20846 }
20847 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
20848   #define _mm_store_pd(mem_addr, a) simde_mm_store_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
20849 #endif
20850 
20851 SIMDE_FUNCTION_ATTRIBUTES
20852 void
20853 simde_mm_store1_pd (simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)], simde__m128d a) {
20854   #if defined(SIMDE_X86_SSE2_NATIVE)
20855     _mm_store1_pd(mem_addr, a);
20856   #else
20857     simde__m128d_private a_ = simde__m128d_to_private(a);
20858 
20859     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
20860       vst1q_f64(mem_addr, vdupq_laneq_f64(a_.neon_f64, 0));
20861     #else
20862       mem_addr[0] = a_.f64[0];
20863       mem_addr[1] = a_.f64[0];
20864     #endif
20865   #endif
20866 }
20867 #define simde_mm_store_pd1(mem_addr, a) simde_mm_store1_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
20868 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
20869   #define _mm_store1_pd(mem_addr, a) simde_mm_store1_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
20870   #define _mm_store_pd1(mem_addr, a) simde_mm_store_pd1(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
20871 #endif
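/* Illustrative note (not part of the upstream SIMDe sources): simde_mm_store1_pd
 * and its simde_mm_store_pd1 alias duplicate the low double of `a` into both
 * elements of mem_addr, which must be 16-byte aligned for the native
 * _mm_store1_pd path. For example, storing simde_mm_set_pd(2.0, 1.0) writes
 * 1.0 to both mem_addr[0] and mem_addr[1]. */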
20872 
20873 SIMDE_FUNCTION_ATTRIBUTES
20874 void
20875 simde_mm_store_sd (simde_float64* mem_addr, simde__m128d a) {
20876   #if defined(SIMDE_X86_SSE2_NATIVE)
20877     _mm_store_sd(mem_addr, a);
20878   #else
20879     simde__m128d_private a_ = simde__m128d_to_private(a);
20880 
20881     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
20882       const simde_float64 v = vgetq_lane_f64(a_.neon_f64, 0);
20883       simde_memcpy(mem_addr, &v, sizeof(v));
20884     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
20885       const int64_t v = vgetq_lane_s64(a_.neon_i64, 0);
20886       simde_memcpy(HEDLEY_REINTERPRET_CAST(int64_t*, mem_addr), &v, sizeof(v));
20887     #else
20888       simde_float64 v = a_.f64[0];
20889       simde_memcpy(mem_addr, &v, sizeof(simde_float64));
20890     #endif
20891   #endif
20892 }
20893 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
20894   #define _mm_store_sd(mem_addr, a) simde_mm_store_sd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
20895 #endif
20896 
20897 SIMDE_FUNCTION_ATTRIBUTES
20898 void
20899 simde_mm_store_si128 (simde__m128i* mem_addr, simde__m128i a) {
20900   #if defined(SIMDE_X86_SSE2_NATIVE)
20901     _mm_store_si128(HEDLEY_STATIC_CAST(__m128i*, mem_addr), a);
20902   #else
20903     simde__m128i_private a_ = simde__m128i_to_private(a);
20904 
20905     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
20906       vst1q_s32(HEDLEY_REINTERPRET_CAST(int32_t*, mem_addr), a_.neon_i32);
20907     #else
20908       simde_memcpy(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m128i), &a_, sizeof(a_));
20909     #endif
20910   #endif
20911 }
20912 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
20913   #define _mm_store_si128(mem_addr, a) simde_mm_store_si128(mem_addr, a)
20914 #endif
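/* Illustrative note (not part of the upstream SIMDe sources): simde_mm_store_si128
 * mirrors _mm_store_si128 and therefore expects mem_addr to be 16-byte aligned;
 * simde_mm_storeu_si128 below is the unaligned counterpart and is the safer
 * choice for arbitrary buffers. */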
20915 
20916 SIMDE_FUNCTION_ATTRIBUTES
20917 void
20918 simde_mm_storeh_pd (simde_float64* mem_addr, simde__m128d a) {
20919   #if defined(SIMDE_X86_SSE2_NATIVE)
20920     _mm_storeh_pd(mem_addr, a);
20921   #else
20922     simde__m128d_private a_ = simde__m128d_to_private(a);
20923 
20924     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
20925       *mem_addr = vgetq_lane_f64(a_.neon_f64, 1);
20926     #else
20927       *mem_addr = a_.f64[1];
20928     #endif
20929   #endif
20930 }
20931 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
20932   #define _mm_storeh_pd(mem_addr, a) simde_mm_storeh_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
20933 #endif
20934 
20935 SIMDE_FUNCTION_ATTRIBUTES
20936 void
20937 simde_mm_storel_epi64 (simde__m128i* mem_addr, simde__m128i a) {
20938   #if defined(SIMDE_X86_SSE2_NATIVE)
20939     _mm_storel_epi64(HEDLEY_STATIC_CAST(__m128i*, mem_addr), a);
20940   #else
20941     simde__m128i_private a_ = simde__m128i_to_private(a);
20942     int64_t tmp;
20943 
20944     /* memcpy to prevent aliasing, tmp because we can't take the
20945      * address of a vector element. */
20946 
20947     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
20948       tmp = vgetq_lane_s64(a_.neon_i64, 0);
20949     #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
20950       #if defined(SIMDE_BUG_GCC_95227)
20951         (void) a_;
20952       #endif
20953       tmp = vec_extract(a_.altivec_i64, 0);
20954     #else
20955       tmp = a_.i64[0];
20956     #endif
20957 
20958     simde_memcpy(mem_addr, &tmp, sizeof(tmp));
20959   #endif
20960 }
20961 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
20962   #define _mm_storel_epi64(mem_addr, a) simde_mm_storel_epi64(mem_addr, a)
20963 #endif
20964 
20965 SIMDE_FUNCTION_ATTRIBUTES
20966 void
20967 simde_mm_storel_pd (simde_float64* mem_addr, simde__m128d a) {
20968   #if defined(SIMDE_X86_SSE2_NATIVE)
20969     _mm_storel_pd(mem_addr, a);
20970   #else
20971     simde__m128d_private a_ = simde__m128d_to_private(a);
20972 
20973     simde_float64 tmp;
20974     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
20975       tmp = vgetq_lane_f64(a_.neon_f64, 0);
20976     #else
20977       tmp = a_.f64[0];
20978     #endif
20979     simde_memcpy(mem_addr, &tmp, sizeof(tmp));
20980   #endif
20981 }
20982 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
20983   #define _mm_storel_pd(mem_addr, a) simde_mm_storel_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
20984 #endif
20985 
20986 SIMDE_FUNCTION_ATTRIBUTES
20987 void
20988 simde_mm_storer_pd (simde_float64 mem_addr[2], simde__m128d a) {
20989   #if defined(SIMDE_X86_SSE2_NATIVE)
20990     _mm_storer_pd(mem_addr, a);
20991   #else
20992     simde__m128d_private a_ = simde__m128d_to_private(a);
20993 
20994     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
20995       vst1q_s64(HEDLEY_REINTERPRET_CAST(int64_t*, mem_addr), vextq_s64(a_.neon_i64, a_.neon_i64, 1));
20996     #elif defined(SIMDE_SHUFFLE_VECTOR_)
20997       a_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64, a_.f64, 1, 0);
20998       simde_mm_store_pd(mem_addr, simde__m128d_from_private(a_));
20999     #else
21000       mem_addr[0] = a_.f64[1];
21001       mem_addr[1] = a_.f64[0];
21002     #endif
21003   #endif
21004 }
21005 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
21006   #define _mm_storer_pd(mem_addr, a) simde_mm_storer_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
21007 #endif
21008 
21009 SIMDE_FUNCTION_ATTRIBUTES
21010 void
21011 simde_mm_storeu_pd (simde_float64* mem_addr, simde__m128d a) {
21012   #if defined(SIMDE_X86_SSE2_NATIVE)
21013     _mm_storeu_pd(mem_addr, a);
21014   #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
21015     vst1q_f64(mem_addr, simde__m128d_to_private(a).neon_f64);
21016   #else
21017     simde_memcpy(mem_addr, &a, sizeof(a));
21018   #endif
21019 }
21020 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
21021   #define _mm_storeu_pd(mem_addr, a) simde_mm_storeu_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
21022 #endif
21023 
21024 SIMDE_FUNCTION_ATTRIBUTES
21025 void
21026 simde_mm_storeu_si128 (void* mem_addr, simde__m128i a) {
21027   #if defined(SIMDE_X86_SSE2_NATIVE)
21028     _mm_storeu_si128(HEDLEY_STATIC_CAST(__m128i*, mem_addr), a);
21029   #else
21030     simde_memcpy(mem_addr, &a, sizeof(a));
21031   #endif
21032 }
21033 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
21034   #define _mm_storeu_si128(mem_addr, a) simde_mm_storeu_si128(mem_addr, a)
21035 #endif
21036 
21037 SIMDE_FUNCTION_ATTRIBUTES
21038 void
21039 simde_mm_storeu_si16 (void* mem_addr, simde__m128i a) {
21040   #if defined(SIMDE_X86_SSE2_NATIVE) && ( \
21041       SIMDE_DETECT_CLANG_VERSION_CHECK(8,0,0) || \
21042       HEDLEY_GCC_VERSION_CHECK(11,0,0) || \
21043       HEDLEY_INTEL_VERSION_CHECK(20,21,1))
21044     _mm_storeu_si16(mem_addr, a);
21045   #else
21046     int16_t val = simde_x_mm_cvtsi128_si16(a);
21047     simde_memcpy(mem_addr, &val, sizeof(val));
21048   #endif
21049 }
21050 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
21051   #define _mm_storeu_si16(mem_addr, a) simde_mm_storeu_si16(mem_addr, a)
21052 #endif
21053 
21054 SIMDE_FUNCTION_ATTRIBUTES
21055 void
21056 simde_mm_storeu_si32 (void* mem_addr, simde__m128i a) {
21057   #if defined(SIMDE_X86_SSE2_NATIVE) && ( \
21058       SIMDE_DETECT_CLANG_VERSION_CHECK(8,0,0) || \
21059       HEDLEY_GCC_VERSION_CHECK(11,0,0) || \
21060       HEDLEY_INTEL_VERSION_CHECK(20,21,1))
21061     _mm_storeu_si32(mem_addr, a);
21062   #else
21063     int32_t val = simde_mm_cvtsi128_si32(a);
21064     simde_memcpy(mem_addr, &val, sizeof(val));
21065   #endif
21066 }
21067 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
21068   #define _mm_storeu_si32(mem_addr, a) simde_mm_storeu_si32(mem_addr, a)
21069 #endif
21070 
21071 SIMDE_FUNCTION_ATTRIBUTES
21072 void
21073 simde_mm_storeu_si64 (void* mem_addr, simde__m128i a) {
21074   #if defined(SIMDE_X86_SSE2_NATIVE) && ( \
21075       SIMDE_DETECT_CLANG_VERSION_CHECK(8,0,0) || \
21076       HEDLEY_GCC_VERSION_CHECK(11,0,0) || \
21077       HEDLEY_INTEL_VERSION_CHECK(20,21,1))
21078     _mm_storeu_si64(mem_addr, a);
21079   #else
21080     int64_t val = simde_mm_cvtsi128_si64(a);
21081     simde_memcpy(mem_addr, &val, sizeof(val));
21082   #endif
21083 }
21084 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
21085   #define _mm_storeu_si64(mem_addr, a) simde_mm_storeu_si64(mem_addr, a)
21086 #endif
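/* Illustrative sketch (not part of the upstream SIMDe sources): the
 * simde_mm_storeu_si16/si32/si64 helpers write only the low 16, 32 or 64 bits
 * of the vector to an address with no alignment requirement. The native
 * _mm_storeu_si16/si32/si64 intrinsics are comparatively recent, which is why
 * the guards above also check compiler versions; otherwise the memcpy
 * fallback is used. A minimal use, assuming an including translation unit:
 *
 *   uint8_t buf[8];
 *   simde_mm_storeu_si64(buf, simde_mm_set1_epi64x(INT64_C(42)));
 *   // buf now holds the in-memory representation of the 64-bit value 42
 */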
21087 
21088 SIMDE_FUNCTION_ATTRIBUTES
21089 void
21090 simde_mm_stream_pd (simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)], simde__m128d a) {
21091   #if defined(SIMDE_X86_SSE2_NATIVE)
21092     _mm_stream_pd(mem_addr, a);
21093   #else
21094     simde_memcpy(mem_addr, &a, sizeof(a));
21095   #endif
21096 }
21097 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
21098   #define _mm_stream_pd(mem_addr, a) simde_mm_stream_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
21099 #endif
21100 
21101 SIMDE_FUNCTION_ATTRIBUTES
21102 void
21103 simde_mm_stream_si128 (simde__m128i* mem_addr, simde__m128i a) {
21104   #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
21105     _mm_stream_si128(HEDLEY_STATIC_CAST(__m128i*, mem_addr), a);
21106   #else
21107     simde_memcpy(mem_addr, &a, sizeof(a));
21108   #endif
21109 }
21110 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
21111   #define _mm_stream_si128(mem_addr, a) simde_mm_stream_si128(mem_addr, a)
21112 #endif
21113 
21114 SIMDE_FUNCTION_ATTRIBUTES
21115 void
21116 simde_mm_stream_si32 (int32_t* mem_addr, int32_t a) {
21117   #if defined(SIMDE_X86_SSE2_NATIVE)
21118     _mm_stream_si32(mem_addr, a);
21119   #else
21120     *mem_addr = a;
21121   #endif
21122 }
21123 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
21124   #define _mm_stream_si32(mem_addr, a) simde_mm_stream_si32(mem_addr, a)
21125 #endif
21126 
21127 SIMDE_FUNCTION_ATTRIBUTES
21128 void
21129 simde_mm_stream_si64 (int64_t* mem_addr, int64_t a) {
21130   #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64) && !defined(HEDLEY_MSVC_VERSION)
21131     _mm_stream_si64(SIMDE_CHECKED_REINTERPRET_CAST(long long int*, int64_t*, mem_addr), a);
21132   #else
21133     *mem_addr = a;
21134   #endif
21135 }
21136 #define simde_mm_stream_si64x(mem_addr, a) simde_mm_stream_si64(mem_addr, a)
21137 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_AMD64))
21138   #define _mm_stream_si64(mem_addr, a) simde_mm_stream_si64(SIMDE_CHECKED_REINTERPRET_CAST(int64_t*, __int64*, mem_addr), a)
21139   #define _mm_stream_si64x(mem_addr, a) simde_mm_stream_si64(SIMDE_CHECKED_REINTERPRET_CAST(int64_t*, __int64*, mem_addr), a)
21140 #endif
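/* Illustrative note (not part of the upstream SIMDe sources): the
 * simde_mm_stream_* wrappers only emit non-temporal stores when the
 * corresponding native instructions are available; every fallback above
 * degrades to an ordinary store (memcpy or plain assignment), so the
 * cache-bypass hint is a best-effort optimization rather than a guarantee. */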
21141 
21142 SIMDE_FUNCTION_ATTRIBUTES
21143 simde__m128i
21144 simde_mm_sub_epi8 (simde__m128i a, simde__m128i b) {
21145   #if defined(SIMDE_X86_SSE2_NATIVE)
21146     return _mm_sub_epi8(a, b);
21147   #else
21148     simde__m128i_private
21149       r_,
21150       a_ = simde__m128i_to_private(a),
21151       b_ = simde__m128i_to_private(b);
21152 
21153     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
21154       r_.neon_i8 = vsubq_s8(a_.neon_i8, b_.neon_i8);
21155     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
21156       r_.i8 = a_.i8 - b_.i8;
21157     #else
21158       SIMDE_VECTORIZE
21159       for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
21160         r_.i8[i] = a_.i8[i] - b_.i8[i];
21161       }
21162     #endif
21163 
21164     return simde__m128i_from_private(r_);
21165   #endif
21166 }
21167 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
21168   #define _mm_sub_epi8(a, b) simde_mm_sub_epi8(a, b)
21169 #endif
21170 
21171 SIMDE_FUNCTION_ATTRIBUTES
21172 simde__m128i
21173 simde_mm_sub_epi16 (simde__m128i a, simde__m128i b) {
21174   #if defined(SIMDE_X86_SSE2_NATIVE)
21175     return _mm_sub_epi16(a, b);
21176   #else
21177     simde__m128i_private
21178       r_,
21179       a_ = simde__m128i_to_private(a),
21180       b_ = simde__m128i_to_private(b);
21181 
21182     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
21183       r_.neon_i16 = vsubq_s16(a_.neon_i16, b_.neon_i16);
21184     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
21185       r_.i16 = a_.i16 - b_.i16;
21186     #else
21187       SIMDE_VECTORIZE
21188       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
21189         r_.i16[i] = a_.i16[i] - b_.i16[i];
21190       }
21191     #endif
21192 
21193     return simde__m128i_from_private(r_);
21194   #endif
21195 }
21196 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
21197   #define _mm_sub_epi16(a, b) simde_mm_sub_epi16(a, b)
21198 #endif
21199 
21200 SIMDE_FUNCTION_ATTRIBUTES
21201 simde__m128i
21202 simde_mm_sub_epi32 (simde__m128i a, simde__m128i b) {
21203   #if defined(SIMDE_X86_SSE2_NATIVE)
21204     return _mm_sub_epi32(a, b);
21205   #else
21206     simde__m128i_private
21207       r_,
21208       a_ = simde__m128i_to_private(a),
21209       b_ = simde__m128i_to_private(b);
21210 
21211     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
21212       r_.neon_i32 = vsubq_s32(a_.neon_i32, b_.neon_i32);
21213     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
21214       r_.i32 = a_.i32 - b_.i32;
21215     #else
21216       SIMDE_VECTORIZE
21217       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
21218         r_.i32[i] = a_.i32[i] - b_.i32[i];
21219       }
21220     #endif
21221 
21222     return simde__m128i_from_private(r_);
21223   #endif
21224 }
21225 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
21226   #define _mm_sub_epi32(a, b) simde_mm_sub_epi32(a, b)
21227 #endif
21228 
21229 SIMDE_FUNCTION_ATTRIBUTES
21230 simde__m128i
21231 simde_mm_sub_epi64 (simde__m128i a, simde__m128i b) {
21232   #if defined(SIMDE_X86_SSE2_NATIVE)
21233     return _mm_sub_epi64(a, b);
21234   #else
21235     simde__m128i_private
21236       r_,
21237       a_ = simde__m128i_to_private(a),
21238       b_ = simde__m128i_to_private(b);
21239 
21240     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
21241       r_.neon_i64 = vsubq_s64(a_.neon_i64, b_.neon_i64);
21242     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
21243       r_.i64 = a_.i64 - b_.i64;
21244     #else
21245       SIMDE_VECTORIZE
21246       for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
21247         r_.i64[i] = a_.i64[i] - b_.i64[i];
21248       }
21249     #endif
21250 
21251     return simde__m128i_from_private(r_);
21252   #endif
21253 }
21254 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
21255   #define _mm_sub_epi64(a, b) simde_mm_sub_epi64(a, b)
21256 #endif
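/* Illustrative sketch (not part of the upstream SIMDe sources): the
 * simde_mm_sub_epi* family performs element-wise two's-complement subtraction
 * with wraparound, exactly like the native instructions. Assuming an
 * including translation unit:
 *
 *   simde__m128i r = simde_mm_sub_epi8(simde_mm_set1_epi8(0),
 *                                      simde_mm_set1_epi8(1));
 *   // every 8-bit lane of r is -1 (0xFF); no saturation is applied
 */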
21257 
21258 SIMDE_FUNCTION_ATTRIBUTES
21259 simde__m128i
21260 simde_x_mm_sub_epu32 (simde__m128i a, simde__m128i b) {
21261   simde__m128i_private
21262     r_,
21263     a_ = simde__m128i_to_private(a),
21264     b_ = simde__m128i_to_private(b);
21265 
21266   #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
21267     r_.u32 = a_.u32 - b_.u32;
21268   #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
21269     r_.neon_u32 = vsubq_u32(a_.neon_u32, b_.neon_u32);
21270   #else
21271     SIMDE_VECTORIZE
21272     for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
21273       r_.u32[i] = a_.u32[i] - b_.u32[i];
21274     }
21275   #endif
21276 
21277   return simde__m128i_from_private(r_);
21278 }
21279 
21280 SIMDE_FUNCTION_ATTRIBUTES
21281 simde__m128d
21282 simde_mm_sub_pd (simde__m128d a, simde__m128d b) {
21283   #if defined(SIMDE_X86_SSE2_NATIVE)
21284     return _mm_sub_pd(a, b);
21285   #else
21286     simde__m128d_private
21287       r_,
21288       a_ = simde__m128d_to_private(a),
21289       b_ = simde__m128d_to_private(b);
21290 
21291     #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
21292       r_.f64 = a_.f64 - b_.f64;
21293     #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
21294       r_.neon_f64 = vsubq_f64(a_.neon_f64, b_.neon_f64);
21295     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
21296       r_.wasm_v128 = wasm_f64x2_sub(a_.wasm_v128, b_.wasm_v128);
21297     #else
21298       SIMDE_VECTORIZE
21299       for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
21300         r_.f64[i] = a_.f64[i] - b_.f64[i];
21301       }
21302     #endif
21303 
21304     return simde__m128d_from_private(r_);
21305   #endif
21306 }
21307 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
21308   #define _mm_sub_pd(a, b) simde_mm_sub_pd(a, b)
21309 #endif
21310 
21311 SIMDE_FUNCTION_ATTRIBUTES
21312 simde__m128d
21313 simde_mm_sub_sd (simde__m128d a, simde__m128d b) {
21314   #if defined(SIMDE_X86_SSE2_NATIVE)
21315     return _mm_sub_sd(a, b);
21316   #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
21317     return simde_mm_move_sd(a, simde_mm_sub_pd(a, b));
21318   #else
21319     simde__m128d_private
21320       r_,
21321       a_ = simde__m128d_to_private(a),
21322       b_ = simde__m128d_to_private(b);
21323 
21324     r_.f64[0] = a_.f64[0] - b_.f64[0];
21325     r_.f64[1] = a_.f64[1];
21326 
21327     return simde__m128d_from_private(r_);
21328   #endif
21329 }
21330 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
21331   #define _mm_sub_sd(a, b) simde_mm_sub_sd(a, b)
21332 #endif
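/* Illustrative note (not part of the upstream SIMDe sources): simde_mm_sub_sd
 * only subtracts the low (scalar) double; the upper lane of the result is
 * copied from `a`. For example, with a = simde_mm_set_pd(10.0, 5.0) and
 * b = simde_mm_set_pd(20.0, 2.0), the result has low lane 3.0 (5.0 - 2.0)
 * and high lane 10.0 (taken from a). */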
21333 
21334 SIMDE_FUNCTION_ATTRIBUTES
21335 simde__m64
21336 simde_mm_sub_si64 (simde__m64 a, simde__m64 b) {
21337   #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
21338     return _mm_sub_si64(a, b);
21339   #else
21340     simde__m64_private
21341       r_,
21342       a_ = simde__m64_to_private(a),
21343       b_ = simde__m64_to_private(b);
21344 
21345     #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
21346       r_.i64 = a_.i64 - b_.i64;
21347     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
21348       r_.neon_i64 = vsub_s64(a_.neon_i64, b_.neon_i64);
21349     #else
21350       r_.i64[0] = a_.i64[0] - b_.i64[0];
21351     #endif
21352 
21353     return simde__m64_from_private(r_);
21354   #endif
21355 }
21356 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
21357   #define _mm_sub_si64(a, b) simde_mm_sub_si64(a, b)
21358 #endif
21359 
21360 SIMDE_FUNCTION_ATTRIBUTES
21361 simde__m128i
21362 simde_mm_subs_epi8 (simde__m128i a, simde__m128i b) {
21363   #if defined(SIMDE_X86_SSE2_NATIVE)
21364     return _mm_subs_epi8(a, b);
21365   #else
21366     simde__m128i_private
21367       r_,
21368       a_ = simde__m128i_to_private(a),
21369       b_ = simde__m128i_to_private(b);
21370 
21371     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
21372       r_.neon_i8 = vqsubq_s8(a_.neon_i8, b_.neon_i8);
21373     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
21374       r_.wasm_v128 = wasm_i8x16_sub_saturate(a_.wasm_v128, b_.wasm_v128);
21375     #else
21376       SIMDE_VECTORIZE
21377       for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.i8[0])) ; i++) {
21378         if (((b_.i8[i]) > 0 && (a_.i8[i]) < INT8_MIN + (b_.i8[i]))) {
21379           r_.i8[i] = INT8_MIN;
21380         } else if ((b_.i8[i]) < 0 && (a_.i8[i]) > INT8_MAX + (b_.i8[i])) {
21381           r_.i8[i] = INT8_MAX;
21382         } else {
21383           r_.i8[i] = (a_.i8[i]) - (b_.i8[i]);
21384         }
21385       }
21386     #endif
21387 
21388     return simde__m128i_from_private(r_);
21389   #endif
21390 }
21391 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
21392   #define _mm_subs_epi8(a, b) simde_mm_subs_epi8(a, b)
21393 #endif
21394 
21395 SIMDE_FUNCTION_ATTRIBUTES
21396 simde__m128i
21397 simde_mm_subs_epi16 (simde__m128i a, simde__m128i b) {
21398   #if defined(SIMDE_X86_SSE2_NATIVE)
21399     return _mm_subs_epi16(a, b);
21400   #else
21401     simde__m128i_private
21402       r_,
21403       a_ = simde__m128i_to_private(a),
21404       b_ = simde__m128i_to_private(b);
21405 
21406     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
21407       r_.neon_i16 = vqsubq_s16(a_.neon_i16, b_.neon_i16);
21408     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
21409       r_.wasm_v128 = wasm_i16x8_sub_saturate(a_.wasm_v128, b_.wasm_v128);
21410     #else
21411       SIMDE_VECTORIZE
21412       for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.i16[0])) ; i++) {
21413         if (((b_.i16[i]) > 0 && (a_.i16[i]) < INT16_MIN + (b_.i16[i]))) {
21414           r_.i16[i] = INT16_MIN;
21415         } else if ((b_.i16[i]) < 0 && (a_.i16[i]) > INT16_MAX + (b_.i16[i])) {
21416           r_.i16[i] = INT16_MAX;
21417         } else {
21418           r_.i16[i] = (a_.i16[i]) - (b_.i16[i]);
21419         }
21420       }
21421     #endif
21422 
21423     return simde__m128i_from_private(r_);
21424   #endif
21425 }
21426 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
21427   #define _mm_subs_epi16(a, b) simde_mm_subs_epi16(a, b)
21428 #endif
21429 
21430 SIMDE_FUNCTION_ATTRIBUTES
21431 simde__m128i
21432 simde_mm_subs_epu8 (simde__m128i a, simde__m128i b) {
21433   #if defined(SIMDE_X86_SSE2_NATIVE)
21434     return _mm_subs_epu8(a, b);
21435   #else
21436     simde__m128i_private
21437       r_,
21438       a_ = simde__m128i_to_private(a),
21439       b_ = simde__m128i_to_private(b);
21440 
21441     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
21442       r_.neon_u8 = vqsubq_u8(a_.neon_u8, b_.neon_u8);
21443     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
21444       r_.wasm_v128 = wasm_u8x16_sub_saturate(a_.wasm_v128, b_.wasm_v128);
21445     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
21446       r_.altivec_u8 = vec_subs(a_.altivec_u8, b_.altivec_u8);
21447     #else
21448       SIMDE_VECTORIZE
21449       for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.i8[0])) ; i++) {
21450         const int32_t x = a_.u8[i] - b_.u8[i];
21451         if (x < 0) {
21452           r_.u8[i] = 0;
21453         } else if (x > UINT8_MAX) {
21454           r_.u8[i] = UINT8_MAX;
21455         } else {
21456           r_.u8[i] = HEDLEY_STATIC_CAST(uint8_t, x);
21457         }
21458       }
21459     #endif
21460 
21461     return simde__m128i_from_private(r_);
21462   #endif
21463 }
21464 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
21465   #define _mm_subs_epu8(a, b) simde_mm_subs_epu8(a, b)
21466 #endif
21467 
21468 SIMDE_FUNCTION_ATTRIBUTES
21469 simde__m128i
21470 simde_mm_subs_epu16 (simde__m128i a, simde__m128i b) {
21471   #if defined(SIMDE_X86_SSE2_NATIVE)
21472     return _mm_subs_epu16(a, b);
21473   #else
21474     simde__m128i_private
21475       r_,
21476       a_ = simde__m128i_to_private(a),
21477       b_ = simde__m128i_to_private(b);
21478 
21479     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
21480       r_.neon_u16 = vqsubq_u16(a_.neon_u16, b_.neon_u16);
21481     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
21482       r_.wasm_v128 = wasm_u16x8_sub_saturate(a_.wasm_v128, b_.wasm_v128);
21483     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
21484       r_.altivec_u16 = vec_subs(a_.altivec_u16, b_.altivec_u16);
21485     #else
21486       SIMDE_VECTORIZE
21487       for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.i16[0])) ; i++) {
21488         const int32_t x = a_.u16[i] - b_.u16[i];
21489         if (x < 0) {
21490           r_.u16[i] = 0;
21491         } else if (x > UINT16_MAX) {
21492           r_.u16[i] = UINT16_MAX;
21493         } else {
21494           r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, x);
21495         }
21496       }
21497     #endif
21498 
21499     return simde__m128i_from_private(r_);
21500   #endif
21501 }
21502 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
21503   #define _mm_subs_epu16(a, b) simde_mm_subs_epu16(a, b)
21504 #endif
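/* Illustrative sketch (not part of the upstream SIMDe sources): the
 * simde_mm_subs_* variants saturate instead of wrapping. Assuming an
 * including translation unit:
 *
 *   simde__m128i r = simde_mm_subs_epu8(simde_mm_set1_epi8(10),
 *                                       simde_mm_set1_epi8(20));
 *   // every unsigned 8-bit lane saturates to 0 instead of wrapping to 246
 */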
21505 
21506 SIMDE_FUNCTION_ATTRIBUTES
21507 int
21508 simde_mm_ucomieq_sd (simde__m128d a, simde__m128d b) {
21509   #if defined(SIMDE_X86_SSE2_NATIVE)
21510     return _mm_ucomieq_sd(a, b);
21511   #else
21512     simde__m128d_private
21513       a_ = simde__m128d_to_private(a),
21514       b_ = simde__m128d_to_private(b);
21515     int r;
21516 
21517     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
21518       uint64x2_t a_not_nan = vceqq_f64(a_.neon_f64, a_.neon_f64);
21519       uint64x2_t b_not_nan = vceqq_f64(b_.neon_f64, b_.neon_f64);
21520       uint64x2_t a_or_b_nan = vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(vandq_u64(a_not_nan, b_not_nan))));
21521       uint64x2_t a_eq_b = vceqq_f64(a_.neon_f64, b_.neon_f64);
21522       r = !!(vgetq_lane_u64(vorrq_u64(a_or_b_nan, a_eq_b), 0) != 0);
21523     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
21524       return wasm_f64x2_extract_lane(a_.wasm_v128, 0) == wasm_f64x2_extract_lane(b_.wasm_v128, 0);
21525     #elif defined(SIMDE_HAVE_FENV_H)
21526       fenv_t envp;
21527       int x = feholdexcept(&envp);
21528       r =  a_.f64[0] == b_.f64[0];
21529       if (HEDLEY_LIKELY(x == 0))
21530         fesetenv(&envp);
21531     #else
21532       r =  a_.f64[0] == b_.f64[0];
21533     #endif
21534 
21535     return r;
21536   #endif
21537 }
21538 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
21539   #define _mm_ucomieq_sd(a, b) simde_mm_ucomieq_sd(a, b)
21540 #endif
21541 
21542 SIMDE_FUNCTION_ATTRIBUTES
21543 int
21544 simde_mm_ucomige_sd (simde__m128d a, simde__m128d b) {
21545   #if defined(SIMDE_X86_SSE2_NATIVE)
21546     return _mm_ucomige_sd(a, b);
21547   #else
21548     simde__m128d_private
21549       a_ = simde__m128d_to_private(a),
21550       b_ = simde__m128d_to_private(b);
21551     int r;
21552 
21553     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
21554       uint64x2_t a_not_nan = vceqq_f64(a_.neon_f64, a_.neon_f64);
21555       uint64x2_t b_not_nan = vceqq_f64(b_.neon_f64, b_.neon_f64);
21556       uint64x2_t a_and_b_not_nan = vandq_u64(a_not_nan, b_not_nan);
21557       uint64x2_t a_ge_b = vcgeq_f64(a_.neon_f64, b_.neon_f64);
21558       r = !!(vgetq_lane_u64(vandq_u64(a_and_b_not_nan, a_ge_b), 0) != 0);
21559     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
21560       return wasm_f64x2_extract_lane(a_.wasm_v128, 0) >= wasm_f64x2_extract_lane(b_.wasm_v128, 0);
21561     #elif defined(SIMDE_HAVE_FENV_H)
21562       fenv_t envp;
21563       int x = feholdexcept(&envp);
21564       r = a_.f64[0] >= b_.f64[0];
21565       if (HEDLEY_LIKELY(x == 0))
21566         fesetenv(&envp);
21567     #else
21568       r = a_.f64[0] >= b_.f64[0];
21569     #endif
21570 
21571     return r;
21572   #endif
21573 }
21574 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
21575   #define _mm_ucomige_sd(a, b) simde_mm_ucomige_sd(a, b)
21576 #endif
21577 
21578 SIMDE_FUNCTION_ATTRIBUTES
21579 int
21580 simde_mm_ucomigt_sd (simde__m128d a, simde__m128d b) {
21581   #if defined(SIMDE_X86_SSE2_NATIVE)
21582     return _mm_ucomigt_sd(a, b);
21583   #else
21584     simde__m128d_private
21585       a_ = simde__m128d_to_private(a),
21586       b_ = simde__m128d_to_private(b);
21587     int r;
21588 
21589     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
21590       uint64x2_t a_not_nan = vceqq_f64(a_.neon_f64, a_.neon_f64);
21591       uint64x2_t b_not_nan = vceqq_f64(b_.neon_f64, b_.neon_f64);
21592       uint64x2_t a_and_b_not_nan = vandq_u64(a_not_nan, b_not_nan);
21593       uint64x2_t a_gt_b = vcgtq_f64(a_.neon_f64, b_.neon_f64);
21594       r = !!(vgetq_lane_u64(vandq_u64(a_and_b_not_nan, a_gt_b), 0) != 0);
21595     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
21596       return wasm_f64x2_extract_lane(a_.wasm_v128, 0) > wasm_f64x2_extract_lane(b_.wasm_v128, 0);
21597     #elif defined(SIMDE_HAVE_FENV_H)
21598       fenv_t envp;
21599       int x = feholdexcept(&envp);
21600       r = a_.f64[0] > b_.f64[0];
21601       if (HEDLEY_LIKELY(x == 0))
21602         fesetenv(&envp);
21603     #else
21604       r = a_.f64[0] > b_.f64[0];
21605     #endif
21606 
21607     return r;
21608   #endif
21609 }
21610 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
21611   #define _mm_ucomigt_sd(a, b) simde_mm_ucomigt_sd(a, b)
21612 #endif
21613 
21614 SIMDE_FUNCTION_ATTRIBUTES
21615 int
21616 simde_mm_ucomile_sd (simde__m128d a, simde__m128d b) {
21617   #if defined(SIMDE_X86_SSE2_NATIVE)
21618     return _mm_ucomile_sd(a, b);
21619   #else
21620     simde__m128d_private
21621       a_ = simde__m128d_to_private(a),
21622       b_ = simde__m128d_to_private(b);
21623     int r;
21624 
21625     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
21626       uint64x2_t a_not_nan = vceqq_f64(a_.neon_f64, a_.neon_f64);
21627       uint64x2_t b_not_nan = vceqq_f64(b_.neon_f64, b_.neon_f64);
21628       uint64x2_t a_or_b_nan = vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(vandq_u64(a_not_nan, b_not_nan))));
21629       uint64x2_t a_le_b = vcleq_f64(a_.neon_f64, b_.neon_f64);
21630       r = !!(vgetq_lane_u64(vorrq_u64(a_or_b_nan, a_le_b), 0) != 0);
21631     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
21632       return wasm_f64x2_extract_lane(a_.wasm_v128, 0) <= wasm_f64x2_extract_lane(b_.wasm_v128, 0);
21633     #elif defined(SIMDE_HAVE_FENV_H)
21634       fenv_t envp;
21635       int x = feholdexcept(&envp);
21636       r = a_.f64[0] <= b_.f64[0];
21637       if (HEDLEY_LIKELY(x == 0))
21638         fesetenv(&envp);
21639     #else
21640       r = a_.f64[0] <= b_.f64[0];
21641     #endif
21642 
21643     return r;
21644   #endif
21645 }
21646 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
21647   #define _mm_ucomile_sd(a, b) simde_mm_ucomile_sd(a, b)
21648 #endif
21649 
21650 SIMDE_FUNCTION_ATTRIBUTES
21651 int
21652 simde_mm_ucomilt_sd (simde__m128d a, simde__m128d b) {
21653   #if defined(SIMDE_X86_SSE2_NATIVE)
21654     return _mm_ucomilt_sd(a, b);
21655   #else
21656     simde__m128d_private
21657       a_ = simde__m128d_to_private(a),
21658       b_ = simde__m128d_to_private(b);
21659     int r;
21660 
21661     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
21662       uint64x2_t a_not_nan = vceqq_f64(a_.neon_f64, a_.neon_f64);
21663       uint64x2_t b_not_nan = vceqq_f64(b_.neon_f64, b_.neon_f64);
21664       uint64x2_t a_or_b_nan = vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(vandq_u64(a_not_nan, b_not_nan))));
21665       uint64x2_t a_lt_b = vcltq_f64(a_.neon_f64, b_.neon_f64);
21666       r = !!(vgetq_lane_u64(vorrq_u64(a_or_b_nan, a_lt_b), 0) != 0);
21667     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
21668       return wasm_f64x2_extract_lane(a_.wasm_v128, 0) < wasm_f64x2_extract_lane(b_.wasm_v128, 0);
21669     #elif defined(SIMDE_HAVE_FENV_H)
21670       fenv_t envp;
21671       int x = feholdexcept(&envp);
21672       r = a_.f64[0] < b_.f64[0];
21673       if (HEDLEY_LIKELY(x == 0))
21674         fesetenv(&envp);
21675     #else
21676       r = a_.f64[0] < b_.f64[0];
21677     #endif
21678 
21679     return r;
21680   #endif
21681 }
21682 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
21683   #define _mm_ucomilt_sd(a, b) simde_mm_ucomilt_sd(a, b)
21684 #endif
21685 
21686 SIMDE_FUNCTION_ATTRIBUTES
21687 int
21688 simde_mm_ucomineq_sd (simde__m128d a, simde__m128d b) {
21689   #if defined(SIMDE_X86_SSE2_NATIVE)
21690     return _mm_ucomineq_sd(a, b);
21691   #else
21692     simde__m128d_private
21693       a_ = simde__m128d_to_private(a),
21694       b_ = simde__m128d_to_private(b);
21695     int r;
21696 
21697     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
21698       uint64x2_t a_not_nan = vceqq_f64(a_.neon_f64, a_.neon_f64);
21699       uint64x2_t b_not_nan = vceqq_f64(b_.neon_f64, b_.neon_f64);
21700       uint64x2_t a_and_b_not_nan = vandq_u64(a_not_nan, b_not_nan);
21701       uint64x2_t a_neq_b = vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(vceqq_f64(a_.neon_f64, b_.neon_f64))));
21702       r = !!(vgetq_lane_u64(vandq_u64(a_and_b_not_nan, a_neq_b), 0) != 0);
21703     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
21704       return wasm_f64x2_extract_lane(a_.wasm_v128, 0) != wasm_f64x2_extract_lane(b_.wasm_v128, 0);
21705     #elif defined(SIMDE_HAVE_FENV_H)
21706       fenv_t envp;
21707       int x = feholdexcept(&envp);
21708       r = a_.f64[0] != b_.f64[0];
21709       if (HEDLEY_LIKELY(x == 0))
21710         fesetenv(&envp);
21711     #else
21712       r = a_.f64[0] != b_.f64[0];
21713     #endif
21714 
21715     return r;
21716   #endif
21717 }
21718 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
21719   #define _mm_ucomineq_sd(a, b) simde_mm_ucomineq_sd(a, b)
21720 #endif
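/* Illustrative note (not part of the upstream SIMDe sources): the
 * simde_mm_ucomi*_sd helpers compare only the low doubles and follow the x86
 * "unordered" convention visible in the NEON paths above: when either operand
 * is NaN, the eq/le/lt variants report 1 while ge/gt/neq report 0. The
 * fenv-based fallbacks call feholdexcept/fesetenv so that any
 * invalid-operation flag raised by the scalar comparison is discarded,
 * approximating the quiet (non-signalling) behaviour of UCOMISD. */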
21721 
21722 #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
21723   HEDLEY_DIAGNOSTIC_PUSH
21724   SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_
21725 #endif
21726 
21727 #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
21728   HEDLEY_DIAGNOSTIC_POP
21729 #endif
21730 
21731 SIMDE_FUNCTION_ATTRIBUTES
21732 void
21733 simde_mm_lfence (void) {
21734   #if defined(SIMDE_X86_SSE2_NATIVE)
21735     _mm_lfence();
21736   #else
21737     simde_mm_sfence();
21738   #endif
21739 }
21740 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
21741   #define _mm_lfence() simde_mm_lfence()
21742 #endif
21743 
21744 SIMDE_FUNCTION_ATTRIBUTES
21745 void
21746 simde_mm_mfence (void) {
21747   #if defined(SIMDE_X86_SSE2_NATIVE)
21748     _mm_mfence();
21749   #else
21750     simde_mm_sfence();
21751   #endif
21752 }
21753 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
21754   #define _mm_mfence() simde_mm_mfence()
21755 #endif
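/* Illustrative note (not part of the upstream SIMDe sources): when SSE2 is not
 * available, simde_mm_lfence and simde_mm_mfence both fall back to
 * simde_mm_sfence, i.e. whatever barrier that wrapper provides on the target;
 * callers should not rely on finer-grained ordering than that in the
 * portable paths. */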
21756 
21757 SIMDE_FUNCTION_ATTRIBUTES
21758 simde__m128i
21759 simde_mm_unpackhi_epi8 (simde__m128i a, simde__m128i b) {
21760   #if defined(SIMDE_X86_SSE2_NATIVE)
21761     return _mm_unpackhi_epi8(a, b);
21762   #else
21763     simde__m128i_private
21764       r_,
21765       a_ = simde__m128i_to_private(a),
21766       b_ = simde__m128i_to_private(b);
21767 
21768     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
21769       r_.neon_i8 = vzip2q_s8(a_.neon_i8, b_.neon_i8);
21770     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
21771       int8x8_t a1 = vreinterpret_s8_s16(vget_high_s16(a_.neon_i16));
21772       int8x8_t b1 = vreinterpret_s8_s16(vget_high_s16(b_.neon_i16));
21773       int8x8x2_t result = vzip_s8(a1, b1);
21774       r_.neon_i8 = vcombine_s8(result.val[0], result.val[1]);
21775     #elif defined(SIMDE_SHUFFLE_VECTOR_)
21776       r_.i8 = SIMDE_SHUFFLE_VECTOR_(8, 16, a_.i8, b_.i8, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
21777     #else
21778       SIMDE_VECTORIZE
21779       for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.i8[0])) / 2) ; i++) {
21780         r_.i8[(i * 2)]     = a_.i8[i + ((sizeof(r_) / sizeof(r_.i8[0])) / 2)];
21781         r_.i8[(i * 2) + 1] = b_.i8[i + ((sizeof(r_) / sizeof(r_.i8[0])) / 2)];
21782       }
21783     #endif
21784 
21785     return simde__m128i_from_private(r_);
21786   #endif
21787 }
21788 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
21789   #define _mm_unpackhi_epi8(a, b) simde_mm_unpackhi_epi8(a, b)
21790 #endif
21791 
21792 SIMDE_FUNCTION_ATTRIBUTES
21793 simde__m128i
21794 simde_mm_unpackhi_epi16 (simde__m128i a, simde__m128i b) {
21795   #if defined(SIMDE_X86_SSE2_NATIVE)
21796     return _mm_unpackhi_epi16(a, b);
21797   #else
21798     simde__m128i_private
21799       r_,
21800       a_ = simde__m128i_to_private(a),
21801       b_ = simde__m128i_to_private(b);
21802 
21803     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
21804       r_.neon_i16 = vzip2q_s16(a_.neon_i16, b_.neon_i16);
21805     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
21806       int16x4_t a1 = vget_high_s16(a_.neon_i16);
21807       int16x4_t b1 = vget_high_s16(b_.neon_i16);
21808       int16x4x2_t result = vzip_s16(a1, b1);
21809       r_.neon_i16 = vcombine_s16(result.val[0], result.val[1]);
21810     #elif defined(SIMDE_SHUFFLE_VECTOR_)
21811       r_.i16 = SIMDE_SHUFFLE_VECTOR_(16, 16, a_.i16, b_.i16, 4, 12, 5, 13, 6, 14, 7, 15);
21812     #else
21813       SIMDE_VECTORIZE
21814       for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.i16[0])) / 2) ; i++) {
21815         r_.i16[(i * 2)]     = a_.i16[i + ((sizeof(r_) / sizeof(r_.i16[0])) / 2)];
21816         r_.i16[(i * 2) + 1] = b_.i16[i + ((sizeof(r_) / sizeof(r_.i16[0])) / 2)];
21817       }
21818     #endif
21819 
21820     return simde__m128i_from_private(r_);
21821   #endif
21822 }
21823 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
21824   #define _mm_unpackhi_epi16(a, b) simde_mm_unpackhi_epi16(a, b)
21825 #endif
21826 
21827 SIMDE_FUNCTION_ATTRIBUTES
21828 simde__m128i
21829 simde_mm_unpackhi_epi32 (simde__m128i a, simde__m128i b) {
21830   #if defined(SIMDE_X86_SSE2_NATIVE)
21831     return _mm_unpackhi_epi32(a, b);
21832   #else
21833     simde__m128i_private
21834       r_,
21835       a_ = simde__m128i_to_private(a),
21836       b_ = simde__m128i_to_private(b);
21837 
21838     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
21839       r_.neon_i32 = vzip2q_s32(a_.neon_i32, b_.neon_i32);
21840     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
21841       int32x2_t a1 = vget_high_s32(a_.neon_i32);
21842       int32x2_t b1 = vget_high_s32(b_.neon_i32);
21843       int32x2x2_t result = vzip_s32(a1, b1);
21844       r_.neon_i32 = vcombine_s32(result.val[0], result.val[1]);
21845     #elif defined(SIMDE_SHUFFLE_VECTOR_)
21846       r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.i32, b_.i32, 2, 6, 3, 7);
21847     #else
21848       SIMDE_VECTORIZE
21849       for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.i32[0])) / 2) ; i++) {
21850         r_.i32[(i * 2)]     = a_.i32[i + ((sizeof(r_) / sizeof(r_.i32[0])) / 2)];
21851         r_.i32[(i * 2) + 1] = b_.i32[i + ((sizeof(r_) / sizeof(r_.i32[0])) / 2)];
21852       }
21853     #endif
21854 
21855     return simde__m128i_from_private(r_);
21856   #endif
21857 }
21858 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
21859   #define _mm_unpackhi_epi32(a, b) simde_mm_unpackhi_epi32(a, b)
21860 #endif
21861 
21862 SIMDE_FUNCTION_ATTRIBUTES
21863 simde__m128i
21864 simde_mm_unpackhi_epi64 (simde__m128i a, simde__m128i b) {
21865   #if defined(SIMDE_X86_SSE2_NATIVE)
21866     return _mm_unpackhi_epi64(a, b);
21867   #else
21868     simde__m128i_private
21869       r_,
21870       a_ = simde__m128i_to_private(a),
21871       b_ = simde__m128i_to_private(b);
21872 
21873     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
21874       int64x1_t a_h = vget_high_s64(a_.neon_i64);
21875       int64x1_t b_h = vget_high_s64(b_.neon_i64);
21876       r_.neon_i64 = vcombine_s64(a_h, b_h);
21877     #elif defined(SIMDE_SHUFFLE_VECTOR_)
21878       r_.i64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.i64, b_.i64, 1, 3);
21879     #else
21880       SIMDE_VECTORIZE
21881       for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.i64[0])) / 2) ; i++) {
21882         r_.i64[(i * 2)]     = a_.i64[i + ((sizeof(r_) / sizeof(r_.i64[0])) / 2)];
21883         r_.i64[(i * 2) + 1] = b_.i64[i + ((sizeof(r_) / sizeof(r_.i64[0])) / 2)];
21884       }
21885     #endif
21886 
21887     return simde__m128i_from_private(r_);
21888   #endif
21889 }
21890 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
21891   #define _mm_unpackhi_epi64(a, b) simde_mm_unpackhi_epi64(a, b)
21892 #endif
21893 
21894 SIMDE_FUNCTION_ATTRIBUTES
21895 simde__m128d
21896 simde_mm_unpackhi_pd (simde__m128d a, simde__m128d b) {
21897   #if defined(SIMDE_X86_SSE2_NATIVE)
21898     return _mm_unpackhi_pd(a, b);
21899   #else
21900     simde__m128d_private
21901       r_,
21902       a_ = simde__m128d_to_private(a),
21903       b_ = simde__m128d_to_private(b);
21904 
21905     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
21906       float64x1_t a_l = vget_high_f64(a_.f64);
21907       float64x1_t b_l = vget_high_f64(b_.f64);
21908       r_.neon_f64 = vcombine_f64(a_l, b_l);
21909     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
21910       r_.wasm_v128 = wasm_v64x2_shuffle(a_.wasm_v128, b_.wasm_v128, 1, 3);
21911     #elif defined(SIMDE_SHUFFLE_VECTOR_)
21912       r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64, b_.f64, 1, 3);
21913     #else
21914       SIMDE_VECTORIZE
21915       for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.f64[0])) / 2) ; i++) {
21916         r_.f64[(i * 2)]     = a_.f64[i + ((sizeof(r_) / sizeof(r_.f64[0])) / 2)];
21917         r_.f64[(i * 2) + 1] = b_.f64[i + ((sizeof(r_) / sizeof(r_.f64[0])) / 2)];
21918       }
21919     #endif
21920 
21921     return simde__m128d_from_private(r_);
21922   #endif
21923 }
21924 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
21925   #define _mm_unpackhi_pd(a, b) simde_mm_unpackhi_pd(a, b)
21926 #endif
21927 
21928 SIMDE_FUNCTION_ATTRIBUTES
21929 simde__m128i
21930 simde_mm_unpacklo_epi8 (simde__m128i a, simde__m128i b) {
21931   #if defined(SIMDE_X86_SSE2_NATIVE)
21932     return _mm_unpacklo_epi8(a, b);
21933   #else
21934     simde__m128i_private
21935       r_,
21936       a_ = simde__m128i_to_private(a),
21937       b_ = simde__m128i_to_private(b);
21938 
21939     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
21940       r_.neon_i8 = vzip1q_s8(a_.neon_i8, b_.neon_i8);
21941     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
21942       int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(a_.neon_i16));
21943       int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(b_.neon_i16));
21944       int8x8x2_t result = vzip_s8(a1, b1);
21945       r_.neon_i8 = vcombine_s8(result.val[0], result.val[1]);
21946     #elif defined(SIMDE_SHUFFLE_VECTOR_)
21947       r_.i8 = SIMDE_SHUFFLE_VECTOR_(8, 16, a_.i8, b_.i8, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
21948     #else
21949       SIMDE_VECTORIZE
21950       for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.i8[0])) / 2) ; i++) {
21951         r_.i8[(i * 2)]     = a_.i8[i];
21952         r_.i8[(i * 2) + 1] = b_.i8[i];
21953       }
21954     #endif
21955 
21956     return simde__m128i_from_private(r_);
21957   #endif
21958 }
21959 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
21960   #define _mm_unpacklo_epi8(a, b) simde_mm_unpacklo_epi8(a, b)
21961 #endif
21962 
21963 SIMDE_FUNCTION_ATTRIBUTES
21964 simde__m128i
21965 simde_mm_unpacklo_epi16 (simde__m128i a, simde__m128i b) {
21966   #if defined(SIMDE_X86_SSE2_NATIVE)
21967     return _mm_unpacklo_epi16(a, b);
21968   #else
21969     simde__m128i_private
21970       r_,
21971       a_ = simde__m128i_to_private(a),
21972       b_ = simde__m128i_to_private(b);
21973 
21974     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
21975       r_.neon_i16 = vzip1q_s16(a_.neon_i16, b_.neon_i16);
21976     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
21977       int16x4_t a1 = vget_low_s16(a_.neon_i16);
21978       int16x4_t b1 = vget_low_s16(b_.neon_i16);
21979       int16x4x2_t result = vzip_s16(a1, b1);
21980       r_.neon_i16 = vcombine_s16(result.val[0], result.val[1]);
21981     #elif defined(SIMDE_SHUFFLE_VECTOR_)
21982       r_.i16 = SIMDE_SHUFFLE_VECTOR_(16, 16, a_.i16, b_.i16, 0, 8, 1, 9, 2, 10, 3, 11);
21983     #else
21984       SIMDE_VECTORIZE
21985       for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.i16[0])) / 2) ; i++) {
21986         r_.i16[(i * 2)]     = a_.i16[i];
21987         r_.i16[(i * 2) + 1] = b_.i16[i];
21988       }
21989     #endif
21990 
21991     return simde__m128i_from_private(r_);
21992   #endif
21993 }
21994 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
21995   #define _mm_unpacklo_epi16(a, b) simde_mm_unpacklo_epi16(a, b)
21996 #endif
21997 
21998 SIMDE_FUNCTION_ATTRIBUTES
21999 simde__m128i
22000 simde_mm_unpacklo_epi32 (simde__m128i a, simde__m128i b) {
22001   #if defined(SIMDE_X86_SSE2_NATIVE)
22002     return _mm_unpacklo_epi32(a, b);
22003   #else
22004     simde__m128i_private
22005       r_,
22006       a_ = simde__m128i_to_private(a),
22007       b_ = simde__m128i_to_private(b);
22008 
22009     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
22010       r_.neon_i32 = vzip1q_s32(a_.neon_i32, b_.neon_i32);
22011     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
22012       int32x2_t a1 = vget_low_s32(a_.neon_i32);
22013       int32x2_t b1 = vget_low_s32(b_.neon_i32);
22014       int32x2x2_t result = vzip_s32(a1, b1);
22015       r_.neon_i32 = vcombine_s32(result.val[0], result.val[1]);
22016     #elif defined(SIMDE_SHUFFLE_VECTOR_)
22017       r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.i32, b_.i32, 0, 4, 1, 5);
22018     #else
22019       SIMDE_VECTORIZE
22020       for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.i32[0])) / 2) ; i++) {
22021         r_.i32[(i * 2)]     = a_.i32[i];
22022         r_.i32[(i * 2) + 1] = b_.i32[i];
22023       }
22024     #endif
22025 
22026     return simde__m128i_from_private(r_);
22027   #endif
22028 }
22029 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
22030   #define _mm_unpacklo_epi32(a, b) simde_mm_unpacklo_epi32(a, b)
22031 #endif
22032 
22033 SIMDE_FUNCTION_ATTRIBUTES
22034 simde__m128i
22035 simde_mm_unpacklo_epi64 (simde__m128i a, simde__m128i b) {
22036   #if defined(SIMDE_X86_SSE2_NATIVE)
22037     return _mm_unpacklo_epi64(a, b);
22038   #else
22039     simde__m128i_private
22040       r_,
22041       a_ = simde__m128i_to_private(a),
22042       b_ = simde__m128i_to_private(b);
22043 
22044     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
22045       int64x1_t a_l = vget_low_s64(a_.neon_i64);
22046       int64x1_t b_l = vget_low_s64(b_.neon_i64);
22047       r_.neon_i64 = vcombine_s64(a_l, b_l);
22048     #elif defined(SIMDE_SHUFFLE_VECTOR_)
22049       r_.i64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.i64, b_.i64, 0, 2);
22050     #else
22051       SIMDE_VECTORIZE
22052       for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.i64[0])) / 2) ; i++) {
22053         r_.i64[(i * 2)]     = a_.i64[i];
22054         r_.i64[(i * 2) + 1] = b_.i64[i];
22055       }
22056     #endif
22057 
22058     return simde__m128i_from_private(r_);
22059   #endif
22060 }
22061 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
22062   #define _mm_unpacklo_epi64(a, b) simde_mm_unpacklo_epi64(a, b)
22063 #endif
22064 
22065 SIMDE_FUNCTION_ATTRIBUTES
22066 simde__m128d
22067 simde_mm_unpacklo_pd (simde__m128d a, simde__m128d b) {
22068   #if defined(SIMDE_X86_SSE2_NATIVE)
22069     return _mm_unpacklo_pd(a, b);
22070   #else
22071     simde__m128d_private
22072       r_,
22073       a_ = simde__m128d_to_private(a),
22074       b_ = simde__m128d_to_private(b);
22075 
22076     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
22077       float64x1_t a_l = vget_low_f64(a_.f64);
22078       float64x1_t b_l = vget_low_f64(b_.f64);
22079       r_.neon_f64 = vcombine_f64(a_l, b_l);
22080     #elif defined(SIMDE_SHUFFLE_VECTOR_)
22081       r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64, b_.f64, 0, 2);
22082     #else
22083       SIMDE_VECTORIZE
22084       for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.f64[0])) / 2) ; i++) {
22085         r_.f64[(i * 2)]     = a_.f64[i];
22086         r_.f64[(i * 2) + 1] = b_.f64[i];
22087       }
22088     #endif
22089 
22090     return simde__m128d_from_private(r_);
22091   #endif
22092 }
22093 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
22094   #define _mm_unpacklo_pd(a, b) simde_mm_unpacklo_pd(a, b)
22095 #endif
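/* Illustrative sketch (not part of the upstream SIMDe sources): the
 * unpackhi/unpacklo helpers interleave the upper or lower halves of their
 * inputs. Assuming an including translation unit, and recalling that
 * simde_mm_set_epi32(e3, e2, e1, e0) places e0 in lane 0:
 *
 *   simde__m128i a  = simde_mm_set_epi32(3, 2, 1, 0);   // lanes {0,1,2,3}
 *   simde__m128i b  = simde_mm_set_epi32(7, 6, 5, 4);   // lanes {4,5,6,7}
 *   simde__m128i lo = simde_mm_unpacklo_epi32(a, b);    // lanes {0,4,1,5}
 *   simde__m128i hi = simde_mm_unpackhi_epi32(a, b);    // lanes {2,6,3,7}
 */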
22096 
22097 SIMDE_FUNCTION_ATTRIBUTES
22098 simde__m128d
22099 simde_x_mm_negate_pd(simde__m128d a) {
22100   #if defined(SIMDE_X86_SSE2_NATIVE)
22101     return simde_mm_xor_pd(a, _mm_set1_pd(SIMDE_FLOAT64_C(-0.0)));
22102   #else
22103     simde__m128d_private
22104       r_,
22105       a_ = simde__m128d_to_private(a);
22106 
22107     #if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) && \
22108         (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,1,0))
22109       r_.altivec_f64 = vec_neg(a_.altivec_f64);
22110     #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
22111       r_.neon_f64 = vnegq_f64(a_.neon_f64);
22112     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
22113       r_.wasm_v128 = wasm_f64x2_neg(a_.wasm_v128);
22114     #elif defined(SIMDE_VECTOR_NEGATE)
22115       r_.f64 = -a_.f64;
22116     #else
22117       SIMDE_VECTORIZE
22118       for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
22119         r_.f64[i] = -a_.f64[i];
22120       }
22121     #endif
22122 
22123     return simde__m128d_from_private(r_);
22124   #endif
22125 }
22126 
22127 SIMDE_FUNCTION_ATTRIBUTES
22128 simde__m128i
22129 simde_mm_xor_si128 (simde__m128i a, simde__m128i b) {
22130   #if defined(SIMDE_X86_SSE2_NATIVE)
22131     return _mm_xor_si128(a, b);
22132   #else
22133     simde__m128i_private
22134       r_,
22135       a_ = simde__m128i_to_private(a),
22136       b_ = simde__m128i_to_private(b);
22137 
22138     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
22139       r_.neon_i32 = veorq_s32(a_.neon_i32, b_.neon_i32);
22140     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
22141       r_.altivec_i32 = vec_xor(a_.altivec_i32, b_.altivec_i32);
22142     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
22143       r_.i32f = a_.i32f ^ b_.i32f;
22144     #else
22145       SIMDE_VECTORIZE
22146       for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
22147         r_.i32f[i] = a_.i32f[i] ^ b_.i32f[i];
22148       }
22149     #endif
22150 
22151     return simde__m128i_from_private(r_);
22152   #endif
22153 }
22154 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
22155   #define _mm_xor_si128(a, b) simde_mm_xor_si128(a, b)
22156 #endif
22157 
22158 SIMDE_FUNCTION_ATTRIBUTES
22159 simde__m128i
22160 simde_x_mm_not_si128 (simde__m128i a) {
22161   #if defined(SIMDE_X86_AVX512VL_NATIVE)
22162     return _mm_ternarylogic_epi32(a, a, a, 0x55);
22163   #else
22164     simde__m128i_private
22165       r_,
22166       a_ = simde__m128i_to_private(a);
22167 
22168     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
22169       r_.neon_i32 = vmvnq_s32(a_.neon_i32);
22170     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
22171       r_.altivec_i32 = vec_nor(a_.altivec_i32, a_.altivec_i32);
22172     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
22173       r_.wasm_v128 = wasm_v128_not(a_.wasm_v128);
22174     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
22175       r_.i32f = ~a_.i32f;
22176     #else
22177       SIMDE_VECTORIZE
22178       for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
22179         r_.i32f[i] = ~(a_.i32f[i]);
22180       }
22181     #endif
22182 
22183     return simde__m128i_from_private(r_);
22184   #endif
22185 }
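
/* Note on the AVX-512VL path above: _mm_ternarylogic_epi32 treats its
 * immediate as a 3-input truth table, and 0x55 is the table for "NOT of
 * the third operand"; passing the same register for all three operands
 * therefore yields the bitwise complement in a single instruction. */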
22186 
22187 #define SIMDE_MM_SHUFFLE2(x, y) (((x) << 1) | (y))
22188 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
22189   #define _MM_SHUFFLE2(x, y) SIMDE_MM_SHUFFLE2(x, y)
22190 #endif
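
/* Informal illustration: SIMDE_MM_SHUFFLE2 packs two one-bit lane
 * selectors into the immediate typically passed to simde_mm_shuffle_pd,
 * the first argument selecting the lane taken from b (high result lane)
 * and the second the lane taken from a (low result lane).  For example,
 * SIMDE_MM_SHUFFLE2(1, 0) evaluates to 2 (binary 10). */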
22191 
22192 SIMDE_END_DECLS_
22193 
22194 HEDLEY_DIAGNOSTIC_POP
22195 
22196 #endif /* !defined(SIMDE_X86_SSE2_H) */
22197 /* :: End ../simde/simde/x86/sse2.h :: */
22198 
22199 HEDLEY_DIAGNOSTIC_PUSH
22200 SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
22201 SIMDE_BEGIN_DECLS_
22202 
22203 SIMDE_FUNCTION_ATTRIBUTES
22204 simde__m128i
22205 simde_x_mm_deinterleaveeven_epi16 (simde__m128i a, simde__m128i b) {
22206   simde__m128i_private
22207     r_,
22208     a_ = simde__m128i_to_private(a),
22209     b_ = simde__m128i_to_private(b);
22210 
22211   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
22212     r_.neon_i16 = vuzp1q_s16(a_.neon_i16, b_.neon_i16);
22213   #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
22214     int16x8x2_t t = vuzpq_s16(a_.neon_i16, b_.neon_i16);
22215     r_.neon_i16 = t.val[0];
22216   #elif defined(SIMDE_SHUFFLE_VECTOR_)
22217     r_.i16 = SIMDE_SHUFFLE_VECTOR_(16, 16, a_.i16, b_.i16, 0, 2, 4, 6, 8, 10, 12, 14);
22218   #else
22219     const size_t halfway_point = (sizeof(r_.i16) / sizeof(r_.i16[0])) / 2;
22220     for(size_t i = 0 ; i < halfway_point ; i++) {
22221       r_.i16[i] = a_.i16[2 * i];
22222       r_.i16[i + halfway_point] = b_.i16[2 * i];
22223     }
22224   #endif
22225 
22226   return simde__m128i_from_private(r_);
22227 }
22228 
22229 SIMDE_FUNCTION_ATTRIBUTES
22230 simde__m128i
22231 simde_x_mm_deinterleaveodd_epi16 (simde__m128i a, simde__m128i b) {
22232   simde__m128i_private
22233     r_,
22234     a_ = simde__m128i_to_private(a),
22235     b_ = simde__m128i_to_private(b);
22236 
22237   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
22238     r_.neon_i16 = vuzp2q_s16(a_.neon_i16, b_.neon_i16);
22239   #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
22240     int16x8x2_t t = vuzpq_s16(a_.neon_i16, b_.neon_i16);
22241     r_.neon_i16 = t.val[1];
22242   #elif defined(SIMDE_SHUFFLE_VECTOR_)
22243     r_.i16 = SIMDE_SHUFFLE_VECTOR_(16, 16, a_.i16, b_.i16, 1, 3, 5, 7, 9, 11, 13, 15);
22244   #else
22245     const size_t halfway_point = (sizeof(r_.i16) / sizeof(r_.i16[0])) / 2;
22246     for(size_t i = 0 ; i < halfway_point ; i++) {
22247       r_.i16[i] = a_.i16[2 * i + 1];
22248       r_.i16[i + halfway_point] = b_.i16[2 * i + 1];
22249     }
22250   #endif
22251 
22252   return simde__m128i_from_private(r_);
22253 }
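
/* A minimal worked example of the two helpers above (values are
 * illustrative): with a = {a0..a7} and b = {b0..b7},
 *   simde_x_mm_deinterleaveeven_epi16(a, b) -> {a0,a2,a4,a6, b0,b2,b4,b6}
 *   simde_x_mm_deinterleaveodd_epi16(a, b)  -> {a1,a3,a5,a7, b1,b3,b5,b7}
 * which is the split the horizontal add/sub implementations below rely on. */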
22254 
22255 SIMDE_FUNCTION_ATTRIBUTES
22256 simde__m128i
22257 simde_x_mm_deinterleaveeven_epi32 (simde__m128i a, simde__m128i b) {
22258   simde__m128i_private
22259     r_,
22260     a_ = simde__m128i_to_private(a),
22261     b_ = simde__m128i_to_private(b);
22262 
22263   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
22264     r_.neon_i32 = vuzp1q_s32(a_.neon_i32, b_.neon_i32);
22265   #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
22266     int32x4x2_t t = vuzpq_s32(a_.neon_i32, b_.neon_i32);
22267     r_.neon_i32 = t.val[0];
22268   #elif defined(SIMDE_SHUFFLE_VECTOR_)
22269     r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.i32, b_.i32, 0, 2, 4, 6);
22270   #else
22271     const size_t halfway_point = (sizeof(r_.i32) / sizeof(r_.i32[0])) / 2;
22272     for(size_t i = 0 ; i < halfway_point ; i++) {
22273       r_.i32[i] = a_.i32[2 * i];
22274       r_.i32[i + halfway_point] = b_.i32[2 * i];
22275     }
22276   #endif
22277 
22278   return simde__m128i_from_private(r_);
22279 }
22280 
22281 SIMDE_FUNCTION_ATTRIBUTES
22282 simde__m128i
22283 simde_x_mm_deinterleaveodd_epi32 (simde__m128i a, simde__m128i b) {
22284   simde__m128i_private
22285     r_,
22286     a_ = simde__m128i_to_private(a),
22287     b_ = simde__m128i_to_private(b);
22288 
22289   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
22290     r_.neon_i32 = vuzp2q_s32(a_.neon_i32, b_.neon_i32);
22291   #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
22292     int32x4x2_t t = vuzpq_s32(a_.neon_i32, b_.neon_i32);
22293     r_.neon_i32 = t.val[1];
22294   #elif defined(SIMDE_SHUFFLE_VECTOR_)
22295     r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.i32, b_.i32, 1, 3, 5, 7);
22296   #else
22297     const size_t halfway_point = (sizeof(r_.i32) / sizeof(r_.i32[0])) / 2;
22298     for(size_t i = 0 ; i < halfway_point ; i++) {
22299       r_.i32[i] = a_.i32[2 * i + 1];
22300       r_.i32[i + halfway_point] = b_.i32[2 * i + 1];
22301     }
22302   #endif
22303 
22304   return simde__m128i_from_private(r_);
22305 }
22306 
22307 SIMDE_FUNCTION_ATTRIBUTES
22308 simde__m128
22309 simde_x_mm_deinterleaveeven_ps (simde__m128 a, simde__m128 b) {
22310   simde__m128_private
22311     r_,
22312     a_ = simde__m128_to_private(a),
22313     b_ = simde__m128_to_private(b);
22314 
22315   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
22316     r_.neon_f32 = vuzp1q_f32(a_.neon_f32, b_.neon_f32);
22317   #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
22318     float32x4x2_t t = vuzpq_f32(a_.neon_f32, b_.neon_f32);
22319     r_.neon_f32 = t.val[0];
22320   #elif defined(SIMDE_SHUFFLE_VECTOR_)
22321     r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, b_.f32, 0, 2, 4, 6);
22322   #else
22323     const size_t halfway_point = (sizeof(r_.f32) / sizeof(r_.f32[0])) / 2;
22324     for(size_t i = 0 ; i < halfway_point ; i++) {
22325       r_.f32[i] = a_.f32[2 * i];
22326       r_.f32[i + halfway_point] = b_.f32[2 * i];
22327     }
22328   #endif
22329 
22330   return simde__m128_from_private(r_);
22331 }
22332 
22333 SIMDE_FUNCTION_ATTRIBUTES
22334 simde__m128
22335 simde_x_mm_deinterleaveodd_ps (simde__m128 a, simde__m128 b) {
22336   simde__m128_private
22337     r_,
22338     a_ = simde__m128_to_private(a),
22339     b_ = simde__m128_to_private(b);
22340 
22341   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
22342     r_.neon_f32 = vuzp2q_f32(a_.neon_f32, b_.neon_f32);
22343   #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
22344     float32x4x2_t t = vuzpq_f32(a_.neon_f32, b_.neon_f32);
22345     r_.neon_f32 = t.val[1];
22346   #elif defined(SIMDE_SHUFFLE_VECTOR_)
22347     r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, b_.f32, 1, 3, 5, 7);
22348   #else
22349     const size_t halfway_point = (sizeof(r_.f32) / sizeof(r_.f32[0])) / 2;
22350     for(size_t i = 0 ; i < halfway_point ; i++) {
22351       r_.f32[i] = a_.f32[2 * i + 1];
22352       r_.f32[i + halfway_point] = b_.f32[2 * i + 1];
22353     }
22354   #endif
22355 
22356   return simde__m128_from_private(r_);
22357 }
22358 
22359 SIMDE_FUNCTION_ATTRIBUTES
22360 simde__m128d
22361 simde_x_mm_deinterleaveeven_pd (simde__m128d a, simde__m128d b) {
22362   simde__m128d_private
22363     r_,
22364     a_ = simde__m128d_to_private(a),
22365     b_ = simde__m128d_to_private(b);
22366 
22367   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
22368     r_.neon_f64 = vuzp1q_f64(a_.neon_f64, b_.neon_f64);
22369   #elif defined(SIMDE_SHUFFLE_VECTOR_)
22370     r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64, b_.f64, 0, 2);
22371   #else
22372     const size_t halfway_point = (sizeof(r_.f64) / sizeof(r_.f64[0])) / 2;
22373     for(size_t i = 0 ; i < halfway_point ; i++) {
22374       r_.f64[i] = a_.f64[2 * i];
22375       r_.f64[i + halfway_point] = b_.f64[2 * i];
22376     }
22377   #endif
22378 
22379   return simde__m128d_from_private(r_);
22380 }
22381 
22382 SIMDE_FUNCTION_ATTRIBUTES
22383 simde__m128d
22384 simde_x_mm_deinterleaveodd_pd (simde__m128d a, simde__m128d b) {
22385   simde__m128d_private
22386     r_,
22387     a_ = simde__m128d_to_private(a),
22388     b_ = simde__m128d_to_private(b);
22389 
22390   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
22391     r_.neon_f64 = vuzp2q_f64(a_.neon_f64, b_.neon_f64);
22392   #elif defined(SIMDE_SHUFFLE_VECTOR_)
22393     r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64, b_.f64, 1, 3);
22394   #else
22395     const size_t halfway_point = (sizeof(r_.f64) / sizeof(r_.f64[0])) / 2;
22396     for(size_t i = 0 ; i < halfway_point ; i++) {
22397       r_.f64[i] = a_.f64[2 * i + 1];
22398       r_.f64[i + halfway_point] = b_.f64[2 * i + 1];
22399     }
22400   #endif
22401 
22402   return simde__m128d_from_private(r_);
22403 }
22404 
22405 SIMDE_FUNCTION_ATTRIBUTES
22406 simde__m128d
22407 simde_mm_addsub_pd (simde__m128d a, simde__m128d b) {
22408   #if defined(SIMDE_X86_SSE3_NATIVE)
22409     return _mm_addsub_pd(a, b);
22410   #else
22411     simde__m128d_private
22412       r_,
22413       a_ = simde__m128d_to_private(a),
22414       b_ = simde__m128d_to_private(b);
22415 
22416     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
22417       float64x2_t rs = vsubq_f64(a_.neon_f64, b_.neon_f64);
22418       float64x2_t ra = vaddq_f64(a_.neon_f64, b_.neon_f64);
22419       return vcombine_f64(vget_low_f64(rs), vget_high_f64(ra));
22420     #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_SHUFFLE_VECTOR_)
22421       r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64 - b_.f64, a_.f64 + b_.f64, 0, 3);
22422     #else
22423       for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i += 2) {
22424         r_.f64[  i  ] = a_.f64[  i  ] - b_.f64[  i  ];
22425         r_.f64[1 + i] = a_.f64[1 + i] + b_.f64[1 + i];
22426       }
22427     #endif
22428 
22429     return simde__m128d_from_private(r_);
22430   #endif
22431 }
22432 #if defined(SIMDE_X86_SSE3_ENABLE_NATIVE_ALIASES)
22433 #  define _mm_addsub_pd(a, b) simde_mm_addsub_pd(a, b)
22434 #endif
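
/* Usage sketch (illustrative values): simde_mm_addsub_pd subtracts in the
 * even lane and adds in the odd lane, so with a = {1.0, 2.0} and
 * b = {10.0, 20.0} the result is {1.0 - 10.0, 2.0 + 20.0} = {-9.0, 22.0}. */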
22435 
22436 SIMDE_FUNCTION_ATTRIBUTES
22437 simde__m128
22438 simde_mm_addsub_ps (simde__m128 a, simde__m128 b) {
22439   #if defined(SIMDE_X86_SSE3_NATIVE)
22440     return _mm_addsub_ps(a, b);
22441   #else
22442     simde__m128_private
22443       r_,
22444       a_ = simde__m128_to_private(a),
22445       b_ = simde__m128_to_private(b);
22446 
22447     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
22448       float32x4_t rs = vsubq_f32(a_.neon_f32, b_.neon_f32);
22449       float32x4_t ra = vaddq_f32(a_.neon_f32, b_.neon_f32);
22450       return vtrn2q_f32(vreinterpretq_f32_s32(vrev64q_s32(vreinterpretq_s32_f32(rs))), ra);
22451     #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_SHUFFLE_VECTOR_)
22452       r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32 - b_.f32, a_.f32 + b_.f32, 0, 5, 2, 7);
22453     #else
22454       for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i += 2) {
22455         r_.f32[  i  ] = a_.f32[  i  ] - b_.f32[  i  ];
22456         r_.f32[1 + i] = a_.f32[1 + i] + b_.f32[1 + i];
22457       }
22458     #endif
22459 
22460     return simde__m128_from_private(r_);
22461   #endif
22462 }
22463 #if defined(SIMDE_X86_SSE3_ENABLE_NATIVE_ALIASES)
22464 #  define _mm_addsub_ps(a, b) simde_mm_addsub_ps(a, b)
22465 #endif
22466 
22467 SIMDE_FUNCTION_ATTRIBUTES
22468 simde__m128d
22469 simde_mm_hadd_pd (simde__m128d a, simde__m128d b) {
22470   #if defined(SIMDE_X86_SSE3_NATIVE)
22471     return _mm_hadd_pd(a, b);
22472   #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
22473     return simde__m128d_from_neon_f64(vpaddq_f64(simde__m128d_to_neon_f64(a), simde__m128d_to_neon_f64(b)));
22474   #else
22475     return simde_mm_add_pd(simde_x_mm_deinterleaveeven_pd(a, b), simde_x_mm_deinterleaveodd_pd(a, b));
22476   #endif
22477 }
22478 #if defined(SIMDE_X86_SSE3_ENABLE_NATIVE_ALIASES)
22479 #  define _mm_hadd_pd(a, b) simde_mm_hadd_pd(a, b)
22480 #endif
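
/* Worked example: simde_mm_hadd_pd sums adjacent pairs, so with
 * a = {1.0, 2.0} and b = {10.0, 20.0} the result is
 * {a[0] + a[1], b[0] + b[1]} = {3.0, 30.0}. */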
22481 
22482 SIMDE_FUNCTION_ATTRIBUTES
22483 simde__m128
22484 simde_mm_hadd_ps (simde__m128 a, simde__m128 b) {
22485   #if defined(SIMDE_X86_SSE3_NATIVE)
22486     return _mm_hadd_ps(a, b);
22487   #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
22488     return simde__m128_from_neon_f32(vpaddq_f32(simde__m128_to_neon_f32(a), simde__m128_to_neon_f32(b)));
22489   #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
22490     float32x4x2_t t = vuzpq_f32(simde__m128_to_neon_f32(a), simde__m128_to_neon_f32(b));
22491     return simde__m128_from_neon_f32(vaddq_f32(t.val[0], t.val[1]));
22492   #else
22493     return simde_mm_add_ps(simde_x_mm_deinterleaveeven_ps(a, b), simde_x_mm_deinterleaveodd_ps(a, b));
22494   #endif
22495 }
22496 #if defined(SIMDE_X86_SSE3_ENABLE_NATIVE_ALIASES)
22497 #  define _mm_hadd_ps(a, b) simde_mm_hadd_ps(a, b)
22498 #endif
22499 
22500 SIMDE_FUNCTION_ATTRIBUTES
22501 simde__m128d
22502 simde_mm_hsub_pd (simde__m128d a, simde__m128d b) {
22503   #if defined(SIMDE_X86_SSE3_NATIVE)
22504     return _mm_hsub_pd(a, b);
22505   #else
22506     return simde_mm_sub_pd(simde_x_mm_deinterleaveeven_pd(a, b), simde_x_mm_deinterleaveodd_pd(a, b));
22507   #endif
22508 }
22509 #if defined(SIMDE_X86_SSE3_ENABLE_NATIVE_ALIASES)
22510 #  define _mm_hsub_pd(a, b) simde_mm_hsub_pd(a, b)
22511 #endif
22512 
22513 SIMDE_FUNCTION_ATTRIBUTES
22514 simde__m128
22515 simde_mm_hsub_ps (simde__m128 a, simde__m128 b) {
22516   #if defined(SIMDE_X86_SSE3_NATIVE)
22517     return _mm_hsub_ps(a, b);
22518   #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
22519     float32x4x2_t t = vuzpq_f32(simde__m128_to_neon_f32(a), simde__m128_to_neon_f32(b));
22520     return simde__m128_from_neon_f32(vaddq_f32(t.val[0], vnegq_f32(t.val[1])));
22521   #else
22522     return simde_mm_sub_ps(simde_x_mm_deinterleaveeven_ps(a, b), simde_x_mm_deinterleaveodd_ps(a, b));
22523   #endif
22524 }
22525 #if defined(SIMDE_X86_SSE3_ENABLE_NATIVE_ALIASES)
22526 #  define _mm_hsub_ps(a, b) simde_mm_hsub_ps(a, b)
22527 #endif
22528 
22529 SIMDE_FUNCTION_ATTRIBUTES
22530 simde__m128i
22531 simde_mm_lddqu_si128 (simde__m128i const* mem_addr) {
22532   #if defined(SIMDE_X86_SSE3_NATIVE)
22533     return _mm_lddqu_si128(mem_addr);
22534   #else
22535     simde__m128i_private r_;
22536 
22537     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
22538       r_.neon_i32 = vld1q_s32(HEDLEY_REINTERPRET_CAST(int32_t const*, mem_addr));
22539     #else
22540       simde_memcpy(&r_, mem_addr, sizeof(r_));
22541     #endif
22542 
22543     return simde__m128i_from_private(r_);
22544   #endif
22545 }
22546 #if defined(SIMDE_X86_SSE3_ENABLE_NATIVE_ALIASES)
22547 #  define _mm_lddqu_si128(mem_addr) simde_mm_lddqu_si128(mem_addr)
22548 #endif
22549 
22550 SIMDE_FUNCTION_ATTRIBUTES
22551 simde__m128d
22552 simde_mm_loaddup_pd (simde_float64 const* mem_addr) {
22553   #if defined(SIMDE_X86_SSE3_NATIVE)
22554     return _mm_loaddup_pd(mem_addr);
22555   #else
22556     simde__m128d_private r_;
22557 
22558     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
22559       r_.neon_f64 = vdupq_n_f64(*mem_addr);
22560     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
22561       r_.neon_i64 = vdupq_n_s64(*HEDLEY_REINTERPRET_CAST(int64_t const*, mem_addr));
22562     #else
22563       r_.f64[0] = *mem_addr;
22564       r_.f64[1] = *mem_addr;
22565     #endif
22566 
22567     return simde__m128d_from_private(r_);
22568   #endif
22569 }
22570 #if defined(SIMDE_X86_SSE3_ENABLE_NATIVE_ALIASES)
22571 #  define _mm_loaddup_pd(mem_addr) simde_mm_loaddup_pd(mem_addr)
22572 #endif
22573 
22574 SIMDE_FUNCTION_ATTRIBUTES
22575 simde__m128d
22576 simde_mm_movedup_pd (simde__m128d a) {
22577   #if defined(SIMDE_X86_SSE3_NATIVE)
22578     return _mm_movedup_pd(a);
22579   #else
22580     simde__m128d_private
22581       r_,
22582       a_ = simde__m128d_to_private(a);
22583 
22584     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
22585       r_.neon_f64 = vdupq_laneq_f64(a_.neon_f64, 0);
22586     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
22587       r_.wasm_v128 = wasm_v64x2_shuffle(a_.wasm_v128, a_.wasm_v128, 0, 0);
22588     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_SHUFFLE_VECTOR_)
22589       r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64, a_.f64, 0, 0);
22590     #else
22591       r_.f64[0] = a_.f64[0];
22592       r_.f64[1] = a_.f64[0];
22593     #endif
22594 
22595     return simde__m128d_from_private(r_);
22596   #endif
22597 }
22598 #if defined(SIMDE_X86_SSE3_ENABLE_NATIVE_ALIASES)
22599 #  define _mm_movedup_pd(a) simde_mm_movedup_pd(a)
22600 #endif
22601 
22602 SIMDE_FUNCTION_ATTRIBUTES
22603 simde__m128
22604 simde_mm_movehdup_ps (simde__m128 a) {
22605   #if defined(SIMDE_X86_SSE3_NATIVE)
22606     return _mm_movehdup_ps(a);
22607   #else
22608     simde__m128_private
22609       r_,
22610       a_ = simde__m128_to_private(a);
22611 
22612     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
22613       r_.neon_f32 = vtrn2q_f32(a_.neon_f32, a_.neon_f32);
22614     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
22615       r_.wasm_v128 = wasm_v32x4_shuffle(a_.wasm_v128, a_.wasm_v128, 1, 1, 3, 3);
22616     #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_SHUFFLE_VECTOR_)
22617       r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, a_.f32, 1, 1, 3, 3);
22618     #else
22619       r_.f32[0] = a_.f32[1];
22620       r_.f32[1] = a_.f32[1];
22621       r_.f32[2] = a_.f32[3];
22622       r_.f32[3] = a_.f32[3];
22623     #endif
22624 
22625     return simde__m128_from_private(r_);
22626   #endif
22627 }
22628 #if defined(SIMDE_X86_SSE3_ENABLE_NATIVE_ALIASES)
22629 #  define _mm_movehdup_ps(a) simde_mm_movehdup_ps(a)
22630 #endif
22631 
22632 SIMDE_FUNCTION_ATTRIBUTES
22633 simde__m128
22634 simde_mm_moveldup_ps (simde__m128 a) {
22635   #if defined(SIMDE_X86_SSE3_NATIVE)
22636     return _mm_moveldup_ps(a);
22637   #else
22638     simde__m128_private
22639       r_,
22640       a_ = simde__m128_to_private(a);
22641 
22642     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
22643       r_.neon_f32 = vtrn1q_f32(a_.neon_f32, a_.neon_f32);
22644     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
22645       r_.wasm_v128 = wasm_v32x4_shuffle(a_.wasm_v128, a_.wasm_v128, 0, 0, 2, 2);
22646     #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_SHUFFLE_VECTOR_)
22647       r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, a_.f32, 0, 0, 2, 2);
22648     #else
22649       r_.f32[0] = a_.f32[0];
22650       r_.f32[1] = a_.f32[0];
22651       r_.f32[2] = a_.f32[2];
22652       r_.f32[3] = a_.f32[2];
22653     #endif
22654 
22655     return simde__m128_from_private(r_);
22656   #endif
22657 }
22658 #if defined(SIMDE_X86_SSE3_ENABLE_NATIVE_ALIASES)
22659 #  define _mm_moveldup_ps(a) simde_mm_moveldup_ps(a)
22660 #endif
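
/* Quick reference for the three *dup helpers above (illustrative):
 * with a = {x0, x1, x2, x3},
 *   simde_mm_moveldup_ps(a) -> {x0, x0, x2, x2}   (duplicate even lanes)
 *   simde_mm_movehdup_ps(a) -> {x1, x1, x3, x3}   (duplicate odd lanes)
 * and simde_mm_movedup_pd(a) broadcasts the low double to both lanes. */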
22661 
22662 SIMDE_END_DECLS_
22663 
22664 HEDLEY_DIAGNOSTIC_POP
22665 
22666 #endif /* !defined(SIMDE_X86_SSE3_H) */
22667 /* :: End ../simde/simde/x86/sse3.h :: */
22668 
22669 HEDLEY_DIAGNOSTIC_PUSH
22670 SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
22671 SIMDE_BEGIN_DECLS_
22672 
22673 SIMDE_FUNCTION_ATTRIBUTES
22674 simde__m128i
22675 simde_mm_abs_epi8 (simde__m128i a) {
22676   #if defined(SIMDE_X86_SSSE3_NATIVE)
22677     return _mm_abs_epi8(a);
22678   #elif defined(SIMDE_X86_SSE2_NATIVE)
22679     return _mm_min_epu8(a, _mm_sub_epi8(_mm_setzero_si128(), a));
22680   #else
22681     simde__m128i_private
22682       r_,
22683       a_ = simde__m128i_to_private(a);
22684 
22685     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
22686       r_.neon_i8 = vabsq_s8(a_.neon_i8);
22687     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
22688       r_.altivec_i8 = vec_abs(a_.altivec_i8);
22689     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
22690       r_.wasm_v128 = wasm_i8x16_abs(a_.wasm_v128);
22691     #else
22692       SIMDE_VECTORIZE
22693       for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
22694         r_.u8[i] = HEDLEY_STATIC_CAST(uint8_t, (a_.i8[i] < 0) ? (- a_.i8[i]) : a_.i8[i]);
22695       }
22696     #endif
22697 
22698     return simde__m128i_from_private(r_);
22699   #endif
22700 }
22701 #if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES)
22702 #  define _mm_abs_epi8(a) simde_mm_abs_epi8(a)
22703 #endif
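
/* Rationale for the SSE2 fallback above: for 8-bit lanes,
 * min_epu8(x, 0 - x) yields |x| because whichever of x and -x is
 * non-negative also compares smaller as an unsigned byte; INT8_MIN maps
 * to itself, matching _mm_abs_epi8's behaviour of returning -128. */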
22704 
22705 SIMDE_FUNCTION_ATTRIBUTES
22706 simde__m128i
22707 simde_mm_abs_epi16 (simde__m128i a) {
22708   #if defined(SIMDE_X86_SSSE3_NATIVE)
22709     return _mm_abs_epi16(a);
22710   #elif defined(SIMDE_X86_SSE2_NATIVE)
22711     return _mm_max_epi16(a, _mm_sub_epi16(_mm_setzero_si128(), a));
22712   #else
22713     simde__m128i_private
22714       r_,
22715       a_ = simde__m128i_to_private(a);
22716 
22717     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
22718       r_.neon_i16 = vabsq_s16(a_.neon_i16);
22719     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
22720       r_.altivec_i16 = vec_abs(a_.altivec_i16);
22721     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
22722       r_.wasm_v128 = wasm_i16x8_abs(a_.wasm_v128);
22723     #else
22724       SIMDE_VECTORIZE
22725       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
22726         r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, (a_.i16[i] < 0) ? (- a_.i16[i]) : a_.i16[i]);
22727       }
22728     #endif
22729 
22730     return simde__m128i_from_private(r_);
22731   #endif
22732 }
22733 #if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES)
22734 #  define _mm_abs_epi16(a) simde_mm_abs_epi16(a)
22735 #endif
22736 
22737 SIMDE_FUNCTION_ATTRIBUTES
22738 simde__m128i
22739 simde_mm_abs_epi32 (simde__m128i a) {
22740   #if defined(SIMDE_X86_SSSE3_NATIVE)
22741     return _mm_abs_epi32(a);
22742   #elif defined(SIMDE_X86_SSE2_NATIVE)
22743     const __m128i m = _mm_cmpgt_epi32(_mm_setzero_si128(), a);
22744     return _mm_sub_epi32(_mm_xor_si128(a, m), m);
22745   #else
22746     simde__m128i_private
22747       r_,
22748       a_ = simde__m128i_to_private(a);
22749 
22750     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
22751       r_.neon_i32 = vabsq_s32(a_.neon_i32);
22752     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
22753       r_.altivec_i32 = vec_abs(a_.altivec_i32);
22754     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
22755       r_.wasm_v128 = wasm_i32x4_abs(a_.wasm_v128);
22756     #else
22757       SIMDE_VECTORIZE
22758       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
22759         #if defined(_MSC_VER)
22760           HEDLEY_DIAGNOSTIC_PUSH
22761           #pragma warning(disable:4146)
22762         #endif
22763         r_.u32[i] = (a_.i32[i] < 0) ? (- HEDLEY_STATIC_CAST(uint32_t, a_.i32[i])) : HEDLEY_STATIC_CAST(uint32_t, a_.i32[i]);
22764         #if defined(_MSC_VER)
22765           HEDLEY_DIAGNOSTIC_POP
22766         #endif
22767       }
22768     #endif
22769 
22770     return simde__m128i_from_private(r_);
22771   #endif
22772 }
22773 #if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES)
22774 #  define _mm_abs_epi32(a) simde_mm_abs_epi32(a)
22775 #endif
22776 
22777 SIMDE_FUNCTION_ATTRIBUTES
22778 simde__m64
22779 simde_mm_abs_pi8 (simde__m64 a) {
22780   #if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
22781     return _mm_abs_pi8(a);
22782   #else
22783     simde__m64_private
22784       r_,
22785       a_ = simde__m64_to_private(a);
22786 
22787     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
22788       r_.neon_i8 = vabs_s8(a_.neon_i8);
22789     #else
22790       SIMDE_VECTORIZE
22791       for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
22792         r_.u8[i] = HEDLEY_STATIC_CAST(uint8_t, (a_.i8[i] < 0) ? (- a_.i8[i]) : a_.i8[i]);
22793       }
22794     #endif
22795 
22796     return simde__m64_from_private(r_);
22797   #endif
22798 }
22799 #if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES)
22800 #  define _mm_abs_pi8(a) simde_mm_abs_pi8(a)
22801 #endif
22802 
22803 SIMDE_FUNCTION_ATTRIBUTES
22804 simde__m64
22805 simde_mm_abs_pi16 (simde__m64 a) {
22806   #if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
22807     return _mm_abs_pi16(a);
22808   #else
22809     simde__m64_private
22810       r_,
22811       a_ = simde__m64_to_private(a);
22812 
22813     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
22814       r_.neon_i16 = vabs_s16(a_.neon_i16);
22815     #else
22816       SIMDE_VECTORIZE
22817       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
22818         r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, (a_.i16[i] < 0) ? (- a_.i16[i]) : a_.i16[i]);
22819       }
22820     #endif
22821 
22822     return simde__m64_from_private(r_);
22823   #endif
22824 }
22825 #if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES)
22826 #  define _mm_abs_pi16(a) simde_mm_abs_pi16(a)
22827 #endif
22828 
22829 SIMDE_FUNCTION_ATTRIBUTES
22830 simde__m64
22831 simde_mm_abs_pi32 (simde__m64 a) {
22832   #if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
22833     return _mm_abs_pi32(a);
22834   #else
22835     simde__m64_private
22836       r_,
22837       a_ = simde__m64_to_private(a);
22838 
22839     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
22840       r_.neon_i32 = vabs_s32(a_.neon_i32);
22841     #else
22842       SIMDE_VECTORIZE
22843       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
22844         r_.u32[i] = HEDLEY_STATIC_CAST(uint32_t, (a_.i32[i] < 0) ? (- a_.i32[i]) : a_.i32[i]);
22845       }
22846     #endif
22847 
22848     return simde__m64_from_private(r_);
22849   #endif
22850 }
22851 #if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES)
22852 #  define _mm_abs_pi32(a) simde_mm_abs_pi32(a)
22853 #endif
22854 
22855 SIMDE_FUNCTION_ATTRIBUTES
22856 simde__m128i
22857 simde_mm_alignr_epi8 (simde__m128i a, simde__m128i b, int count)
22858     SIMDE_REQUIRE_CONSTANT_RANGE(count, 0, 255) {
22859   simde__m128i_private
22860     r_,
22861     a_ = simde__m128i_to_private(a),
22862     b_ = simde__m128i_to_private(b);
22863 
22864   if (HEDLEY_UNLIKELY(count > 31))
22865     return simde_mm_setzero_si128();
22866 
22867   for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
22868     const int srcpos = count + HEDLEY_STATIC_CAST(int, i);
22869     if (srcpos > 31) {
22870       r_.i8[i] = 0;
22871     } else if (srcpos > 15) {
22872       r_.i8[i] = a_.i8[(srcpos) & 15];
22873     } else {
22874       r_.i8[i] = b_.i8[srcpos];
22875     }
22876   }
22877 
22878   return simde__m128i_from_private(r_);
22879 }
22880 #if defined(SIMDE_X86_SSSE3_NATIVE)
22881   #define simde_mm_alignr_epi8(a, b, count) _mm_alignr_epi8(a, b, count)
22882 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
22883   #define simde_mm_alignr_epi8(a, b, count) \
22884     ( \
22885       ((count) > 31) \
22886         ? simde__m128i_from_neon_i8(vdupq_n_s8(0)) \
22887         : ( \
22888           ((count) > 15) \
22889             ? (simde__m128i_from_neon_i8(vextq_s8(simde__m128i_to_neon_i8(a), vdupq_n_s8(0), (count) & 15))) \
22890             : (simde__m128i_from_neon_i8(vextq_s8(simde__m128i_to_neon_i8(b), simde__m128i_to_neon_i8(a), ((count) & 15))))))
22891 #endif
22892 #if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES)
22893   #define _mm_alignr_epi8(a, b, count) simde_mm_alignr_epi8(a, b, count)
22894 #endif
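
/* A minimal usage sketch for simde_mm_alignr_epi8; the function name
 * below is hypothetical and the block is kept out of the build. */
#if 0
static simde__m128i
example_alignr_window (simde__m128i hi, simde__m128i lo) {
  /* Treats hi:lo as a 32-byte buffer (lo in bytes 0..15, hi in 16..31),
   * shifts it right by 4 bytes, and returns the low 16 bytes, i.e.
   * bytes 4..19 of the concatenation. */
  return simde_mm_alignr_epi8(hi, lo, 4);
}
#endif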
22895 
22896 SIMDE_FUNCTION_ATTRIBUTES
22897 simde__m64
22898 simde_mm_alignr_pi8 (simde__m64 a, simde__m64 b, const int count)
22899     SIMDE_REQUIRE_CONSTANT(count) {
22900   simde__m64_private
22901     r_,
22902     a_ = simde__m64_to_private(a),
22903     b_ = simde__m64_to_private(b);
22904 
22905   if (HEDLEY_UNLIKELY(count > 15))
22906     return simde_mm_setzero_si64();
22907 
22908   for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
22909     const int srcpos = count + HEDLEY_STATIC_CAST(int, i);
22910     if (srcpos > 15) {
22911       r_.i8[i] = 0;
22912     } else if (srcpos > 7) {
22913       r_.i8[i] = a_.i8[(srcpos) & 7];
22914     } else {
22915       r_.i8[i] = b_.i8[srcpos];
22916     }
22917   }
22918 
22919   return simde__m64_from_private(r_);
22920 }
22921 #if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
22922 #  define simde_mm_alignr_pi8(a, b, count) _mm_alignr_pi8(a, b, count)
22923 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
22924   #define simde_mm_alignr_pi8(a, b, count) \
22925     ( \
22926       ((count) > 15) \
22927         ? simde__m64_from_neon_i8(vdup_n_s8(0)) \
22928         : ( \
22929           ((count) > 7) \
22930             ? (simde__m64_from_neon_i8(vext_s8(simde__m64_to_neon_i8(a), vdup_n_s8(0), (count) & 7))) \
22931             : (simde__m64_from_neon_i8(vext_s8(simde__m64_to_neon_i8(b), simde__m64_to_neon_i8(a), ((count) & 7))))))
22932 #endif
22933 #if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES)
22934 #  define _mm_alignr_pi8(a, b, count) simde_mm_alignr_pi8(a, b, count)
22935 #endif
22936 
22937 SIMDE_FUNCTION_ATTRIBUTES
22938 simde__m128i
22939 simde_mm_shuffle_epi8 (simde__m128i a, simde__m128i b) {
22940   #if defined(SIMDE_X86_SSSE3_NATIVE)
22941     return _mm_shuffle_epi8(a, b);
22942   #else
22943     simde__m128i_private
22944       r_,
22945       a_ = simde__m128i_to_private(a),
22946       b_ = simde__m128i_to_private(b);
22947 
22948     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
22949       r_.neon_i8 = vqtbl1q_s8(a_.neon_i8, vandq_u8(b_.neon_u8, vdupq_n_u8(0x8F)));
22950     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
22951       /* Mask out the bits we're not interested in.  vtbl will result in 0
22952        * for any values outside of [0, 15], so if the high bit is set it
22953        * will return 0, just like in SSSE3. */
22954       b_.neon_i8 = vandq_s8(b_.neon_i8, vdupq_n_s8(HEDLEY_STATIC_CAST(int8_t, (1 << 7) | 15)));
22955 
22956       /* Convert a from an int8x16_t to an int8x8x2_t */
22957       int8x8x2_t i;
22958       i.val[0] = vget_low_s8(a_.neon_i8);
22959       i.val[1] = vget_high_s8(a_.neon_i8);
22960 
22961       /* Table lookups */
22962       int8x8_t l = vtbl2_s8(i, vget_low_s8(b_.neon_i8));
22963       int8x8_t h = vtbl2_s8(i, vget_high_s8(b_.neon_i8));
22964 
22965       r_.neon_i8 = vcombine_s8(l, h);
22966     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
22967       /* This is a bit ugly because of the casts and the awful type
22968        * macros (SIMDE_POWER_ALTIVEC_VECTOR), but it's really just
22969        * vec_sel(vec_perm(a, a, b), 0, vec_cmplt(b, 0)) */
22970       SIMDE_POWER_ALTIVEC_VECTOR(signed char) z = { 0, };
22971       SIMDE_POWER_ALTIVEC_VECTOR(signed char) msb_mask = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed char), vec_cmplt(b_.altivec_i8, z));
22972       SIMDE_POWER_ALTIVEC_VECTOR(signed char) c = vec_perm(a_.altivec_i8, a_.altivec_i8, HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), b_.altivec_i8));
22973       r_.altivec_i8 = vec_sel(c, z, HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), msb_mask));
22974     #else
22975       for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
22976         r_.i8[i] = a_.i8[b_.i8[i] & 15] & (~(b_.i8[i]) >> 7);
22977       }
22978     #endif
22979 
22980     return simde__m128i_from_private(r_);
22981 #endif
22982 }
22983 #if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES)
22984 #  define _mm_shuffle_epi8(a, b) simde_mm_shuffle_epi8(a, b)
22985 #endif
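
/* A minimal sketch of using simde_mm_shuffle_epi8 as a per-byte table
 * lookup, here reversing the 16 bytes of a vector; the helper name is
 * hypothetical and the block is kept out of the build.  Each selector
 * byte indexes into the first operand via its low 4 bits, and a selector
 * with its high bit set produces 0x00 instead. */
#if 0
static simde__m128i
example_reverse_bytes (simde__m128i v) {
  /* Selector bytes 15 down to 0 (set_epi8 lists arguments from the
   * highest byte to the lowest). */
  const simde__m128i rev = simde_mm_set_epi8(
    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
  return simde_mm_shuffle_epi8(v, rev);
}
#endif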
22986 
22987 SIMDE_FUNCTION_ATTRIBUTES
22988 simde__m64
22989 simde_mm_shuffle_pi8 (simde__m64 a, simde__m64 b) {
22990   #if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
22991     return _mm_shuffle_pi8(a, b);
22992   #else
22993     simde__m64_private
22994       r_,
22995       a_ = simde__m64_to_private(a),
22996       b_ = simde__m64_to_private(b);
22997 
22998     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
22999       b_.neon_i8 = vand_s8(b_.neon_i8, vdup_n_s8(HEDLEY_STATIC_CAST(int8_t, (1 << 7) | 7)));
23000       r_.neon_i8 = vtbl1_s8(a_.neon_i8, b_.neon_i8);
23001     #else
23002       for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
23003         r_.i8[i] = a_.i8[b_.i8[i] & 7] & (~(b_.i8[i]) >> 7);
23004       }
23005     #endif
23006 
23007     return simde__m64_from_private(r_);
23008   #endif
23009 }
23010 #if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES)
23011 #  define _mm_shuffle_pi8(a, b) simde_mm_shuffle_pi8(a, b)
23012 #endif
23013 
23014 SIMDE_FUNCTION_ATTRIBUTES
23015 simde__m128i
23016 simde_mm_hadd_epi16 (simde__m128i a, simde__m128i b) {
23017   #if defined(SIMDE_X86_SSSE3_NATIVE)
23018     return _mm_hadd_epi16(a, b);
23019   #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
23020     return simde__m128i_from_neon_i16(vpaddq_s16(simde__m128i_to_neon_i16(a), simde__m128i_to_neon_i16(b)));
23021   #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
23022     int16x8x2_t t = vuzpq_s16(simde__m128i_to_neon_i16(a), simde__m128i_to_neon_i16(b));
23023     return simde__m128i_from_neon_i16(vaddq_s16(t.val[0], t.val[1]));
23024   #else
23025     return simde_mm_add_epi16(simde_x_mm_deinterleaveeven_epi16(a, b), simde_x_mm_deinterleaveodd_epi16(a, b));
23026   #endif
23027 }
23028 #if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES)
23029 #  define _mm_hadd_epi16(a, b) simde_mm_hadd_epi16(a, b)
23030 #endif
23031 
23032 SIMDE_FUNCTION_ATTRIBUTES
23033 simde__m128i
23034 simde_mm_hadd_epi32 (simde__m128i a, simde__m128i b) {
23035   #if defined(SIMDE_X86_SSSE3_NATIVE)
23036     return _mm_hadd_epi32(a, b);
23037   #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
23038     return simde__m128i_from_neon_i32(vpaddq_s32(simde__m128i_to_neon_i32(a), simde__m128i_to_neon_i32(b)));
23039   #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
23040     int32x4x2_t t = vuzpq_s32(simde__m128i_to_neon_i32(a), simde__m128i_to_neon_i32(b));
23041     return simde__m128i_from_neon_i32(vaddq_s32(t.val[0], t.val[1]));
23042   #else
23043     return simde_mm_add_epi32(simde_x_mm_deinterleaveeven_epi32(a, b), simde_x_mm_deinterleaveodd_epi32(a, b));
23044   #endif
23045 }
23046 #if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES)
23047 #  define _mm_hadd_epi32(a, b) simde_mm_hadd_epi32(a, b)
23048 #endif
23049 
23050 SIMDE_FUNCTION_ATTRIBUTES
23051 simde__m64
23052 simde_mm_hadd_pi16 (simde__m64 a, simde__m64 b) {
23053   #if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
23054     return _mm_hadd_pi16(a, b);
23055   #else
23056     simde__m64_private
23057       r_,
23058       a_ = simde__m64_to_private(a),
23059       b_ = simde__m64_to_private(b);
23060 
23061     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
23062       r_.neon_i16 = vpadd_s16(a_.neon_i16, b_.neon_i16);
23063     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
23064       int16x4x2_t t = vuzp_s16(a_.neon_i16, b_.neon_i16);
23065       r_.neon_i16 = vadd_s16(t.val[0], t.val[1]);
23066     #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_SHUFFLE_VECTOR_)
23067       r_.i16 =
23068         SIMDE_SHUFFLE_VECTOR_(16, 8, a_.i16, b_.i16, 0, 2, 4, 6) +
23069         SIMDE_SHUFFLE_VECTOR_(16, 8, a_.i16, b_.i16, 1, 3, 5, 7);
23070     #else
23071       r_.i16[0] = a_.i16[0] + a_.i16[1];
23072       r_.i16[1] = a_.i16[2] + a_.i16[3];
23073       r_.i16[2] = b_.i16[0] + b_.i16[1];
23074       r_.i16[3] = b_.i16[2] + b_.i16[3];
23075     #endif
23076 
23077     return simde__m64_from_private(r_);
23078   #endif
23079 }
23080 #if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES)
23081 #  define _mm_hadd_pi16(a, b) simde_mm_hadd_pi16(a, b)
23082 #endif
23083 
23084 SIMDE_FUNCTION_ATTRIBUTES
23085 simde__m64
23086 simde_mm_hadd_pi32 (simde__m64 a, simde__m64 b) {
23087   #if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
23088     return _mm_hadd_pi32(a, b);
23089   #else
23090     simde__m64_private
23091       r_,
23092       a_ = simde__m64_to_private(a),
23093       b_ = simde__m64_to_private(b);
23094 
23095     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
23096       r_.neon_i32 = vpadd_s32(a_.neon_i32, b_.neon_i32);
23097     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
23098       int32x2x2_t t = vuzp_s32(a_.neon_i32, b_.neon_i32);
23099       r_.neon_i32 = vadd_s32(t.val[0], t.val[1]);
23100     #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_SHUFFLE_VECTOR_)
23101       r_.i32 =
23102         SIMDE_SHUFFLE_VECTOR_(32, 8, a_.i32, b_.i32, 0, 2) +
23103         SIMDE_SHUFFLE_VECTOR_(32, 8, a_.i32, b_.i32, 1, 3);
23104     #else
23105       r_.i32[0] = a_.i32[0] + a_.i32[1];
23106       r_.i32[1] = b_.i32[0] + b_.i32[1];
23107     #endif
23108 
23109     return simde__m64_from_private(r_);
23110   #endif
23111 }
23112 #if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES)
23113 #  define _mm_hadd_pi32(a, b) simde_mm_hadd_pi32(a, b)
23114 #endif
23115 
23116 SIMDE_FUNCTION_ATTRIBUTES
23117 simde__m128i
23118 simde_mm_hadds_epi16 (simde__m128i a, simde__m128i b) {
23119   #if defined(SIMDE_X86_SSSE3_NATIVE)
23120     return _mm_hadds_epi16(a, b);
23121   #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
23122     int16x8x2_t t = vuzpq_s16(simde__m128i_to_neon_i16(a), simde__m128i_to_neon_i16(b));
23123     return simde__m128i_from_neon_i16(vqaddq_s16(t.val[0], t.val[1]));
23124   #else
23125     return simde_mm_adds_epi16(simde_x_mm_deinterleaveeven_epi16(a, b), simde_x_mm_deinterleaveodd_epi16(a, b));
23126   #endif
23127 }
23128 #if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES)
23129 #  define _mm_hadds_epi16(a, b) simde_mm_hadds_epi16(a, b)
23130 #endif
23131 
23132 SIMDE_FUNCTION_ATTRIBUTES
23133 simde__m64
23134 simde_mm_hadds_pi16 (simde__m64 a, simde__m64 b) {
23135   #if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
23136     return _mm_hadds_pi16(a, b);
23137   #else
23138     simde__m64_private
23139       r_,
23140       a_ = simde__m64_to_private(a),
23141       b_ = simde__m64_to_private(b);
23142 
23143     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
23144       int16x4x2_t t = vuzp_s16(a_.neon_i16, b_.neon_i16);
23145       r_.neon_i16 = vqadd_s16(t.val[0], t.val[1]);
23146     #else
23147       for (size_t i = 0 ; i < ((sizeof(r_.i16) / sizeof(r_.i16[0])) / 2) ; i++) {
23148         int32_t ta = HEDLEY_STATIC_CAST(int32_t, a_.i16[i * 2]) + HEDLEY_STATIC_CAST(int32_t, a_.i16[(i * 2) + 1]);
23149         r_.i16[  i  ] = HEDLEY_LIKELY(ta > INT16_MIN) ? (HEDLEY_LIKELY(ta < INT16_MAX) ? HEDLEY_STATIC_CAST(int16_t, ta) : INT16_MAX) : INT16_MIN;
23150         int32_t tb = HEDLEY_STATIC_CAST(int32_t, b_.i16[i * 2]) + HEDLEY_STATIC_CAST(int32_t, b_.i16[(i * 2) + 1]);
23151         r_.i16[i + 2] = HEDLEY_LIKELY(tb > INT16_MIN) ? (HEDLEY_LIKELY(tb < INT16_MAX) ? HEDLEY_STATIC_CAST(int16_t, tb) : INT16_MAX) : INT16_MIN;
23152       }
23153     #endif
23154 
23155     return simde__m64_from_private(r_);
23156   #endif
23157 }
23158 #if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES)
23159 #  define _mm_hadds_pi16(a, b) simde_mm_hadds_pi16(a, b)
23160 #endif
23161 
23162 SIMDE_FUNCTION_ATTRIBUTES
23163 simde__m128i
23164 simde_mm_hsub_epi16 (simde__m128i a, simde__m128i b) {
23165   #if defined(SIMDE_X86_SSSE3_NATIVE)
23166     return _mm_hsub_epi16(a, b);
23167   #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
23168     int16x8x2_t t = vuzpq_s16(simde__m128i_to_neon_i16(a), simde__m128i_to_neon_i16(b));
23169     return simde__m128i_from_neon_i16(vsubq_s16(t.val[0], t.val[1]));
23170   #else
23171     return simde_mm_sub_epi16(simde_x_mm_deinterleaveeven_epi16(a, b), simde_x_mm_deinterleaveodd_epi16(a, b));
23172   #endif
23173 }
23174 #if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES)
23175 #  define _mm_hsub_epi16(a, b) simde_mm_hsub_epi16(a, b)
23176 #endif
23177 
23178 SIMDE_FUNCTION_ATTRIBUTES
23179 simde__m128i
23180 simde_mm_hsub_epi32 (simde__m128i a, simde__m128i b) {
23181   #if defined(SIMDE_X86_SSSE3_NATIVE)
23182     return _mm_hsub_epi32(a, b);
23183   #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
23184     int32x4x2_t t = vuzpq_s32(simde__m128i_to_neon_i32(a), simde__m128i_to_neon_i32(b));
23185     return simde__m128i_from_neon_i32(vsubq_s32(t.val[0], t.val[1]));
23186   #else
23187     return simde_mm_sub_epi32(simde_x_mm_deinterleaveeven_epi32(a, b), simde_x_mm_deinterleaveodd_epi32(a, b));
23188   #endif
23189 }
23190 #if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES)
23191 #  define _mm_hsub_epi32(a, b) simde_mm_hsub_epi32(a, b)
23192 #endif
23193 
23194 SIMDE_FUNCTION_ATTRIBUTES
23195 simde__m64
23196 simde_mm_hsub_pi16 (simde__m64 a, simde__m64 b) {
23197   #if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
23198     return _mm_hsub_pi16(a, b);
23199   #else
23200     simde__m64_private
23201       r_,
23202       a_ = simde__m64_to_private(a),
23203       b_ = simde__m64_to_private(b);
23204 
23205     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
23206       int16x4x2_t t = vuzp_s16(a_.neon_i16, b_.neon_i16);
23207       r_.neon_i16 = vsub_s16(t.val[0], t.val[1]);
23208     #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_SHUFFLE_VECTOR_)
23209       r_.i16 =
23210         SIMDE_SHUFFLE_VECTOR_(16, 8, a_.i16, b_.i16, 0, 2, 4, 6) -
23211         SIMDE_SHUFFLE_VECTOR_(16, 8, a_.i16, b_.i16, 1, 3, 5, 7);
23212     #else
23213       r_.i16[0] = a_.i16[0] - a_.i16[1];
23214       r_.i16[1] = a_.i16[2] - a_.i16[3];
23215       r_.i16[2] = b_.i16[0] - b_.i16[1];
23216       r_.i16[3] = b_.i16[2] - b_.i16[3];
23217     #endif
23218 
23219     return simde__m64_from_private(r_);
23220   #endif
23221 }
23222 #if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES)
23223 #  define _mm_hsub_pi16(a, b) simde_mm_hsub_pi16(a, b)
23224 #endif
23225 
23226 SIMDE_FUNCTION_ATTRIBUTES
23227 simde__m64
23228 simde_mm_hsub_pi32 (simde__m64 a, simde__m64 b) {
23229   #if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
23230     return _mm_hsub_pi32(a, b);
23231   #else
23232     simde__m64_private
23233       r_,
23234       a_ = simde__m64_to_private(a),
23235       b_ = simde__m64_to_private(b);
23236 
23237     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
23238       int32x2x2_t t = vuzp_s32(a_.neon_i32, b_.neon_i32);
23239       r_.neon_i32 = vsub_s32(t.val[0], t.val[1]);
23240     #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_SHUFFLE_VECTOR_)
23241       r_.i32 =
23242         SIMDE_SHUFFLE_VECTOR_(32, 8, a_.i32, b_.i32, 0, 2) -
23243         SIMDE_SHUFFLE_VECTOR_(32, 8, a_.i32, b_.i32, 1, 3);
23244     #else
23245       r_.i32[0] = a_.i32[0] - a_.i32[1];
23246       r_.i32[1] = b_.i32[0] - b_.i32[1];
23247     #endif
23248 
23249     return simde__m64_from_private(r_);
23250   #endif
23251 }
23252 #if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES)
23253 #  define _mm_hsub_pi32(a, b) simde_mm_hsub_pi32(a, b)
23254 #endif
23255 
23256 SIMDE_FUNCTION_ATTRIBUTES
23257 simde__m128i
23258 simde_mm_hsubs_epi16 (simde__m128i a, simde__m128i b) {
23259   #if defined(SIMDE_X86_SSSE3_NATIVE)
23260     return _mm_hsubs_epi16(a, b);
23261   #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
23262     int16x8x2_t t = vuzpq_s16(simde__m128i_to_neon_i16(a), simde__m128i_to_neon_i16(b));
23263     return simde__m128i_from_neon_i16(vqsubq_s16(t.val[0], t.val[1]));
23264   #else
23265     return simde_mm_subs_epi16(simde_x_mm_deinterleaveeven_epi16(a, b), simde_x_mm_deinterleaveodd_epi16(a, b));
23266   #endif
23267 }
23268 #if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES)
23269 #  define _mm_hsubs_epi16(a, b) simde_mm_hsubs_epi16(a, b)
23270 #endif
23271 
23272 SIMDE_FUNCTION_ATTRIBUTES
23273 simde__m64
23274 simde_mm_hsubs_pi16 (simde__m64 a, simde__m64 b) {
23275   #if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
23276     return _mm_hsubs_pi16(a, b);
23277   #else
23278     simde__m64_private
23279       r_,
23280       a_ = simde__m64_to_private(a),
23281       b_ = simde__m64_to_private(b);
23282 
23283     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
23284       int16x4x2_t t = vuzp_s16(a_.neon_i16, b_.neon_i16);
23285       r_.neon_i16 = vqsub_s16(t.val[0], t.val[1]);
23286     #else
23287       for (size_t i = 0 ; i < ((sizeof(r_.i16) / sizeof(r_.i16[0])) / 2) ; i++) {
23288         r_.i16[  i  ] = simde_math_subs_i16(a_.i16[i * 2], a_.i16[(i * 2) + 1]);
23289         r_.i16[i + 2] = simde_math_subs_i16(b_.i16[i * 2], b_.i16[(i * 2) + 1]);
23290       }
23291     #endif
23292 
23293     return simde__m64_from_private(r_);
23294   #endif
23295 }
23296 #if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES)
23297 #  define _mm_hsubs_pi16(a, b) simde_mm_hsubs_pi16(a, b)
23298 #endif
23299 
23300 SIMDE_FUNCTION_ATTRIBUTES
23301 simde__m128i
23302 simde_mm_maddubs_epi16 (simde__m128i a, simde__m128i b) {
23303   #if defined(SIMDE_X86_SSSE3_NATIVE)
23304     return _mm_maddubs_epi16(a, b);
23305   #else
23306     simde__m128i_private
23307       r_,
23308       a_ = simde__m128i_to_private(a),
23309       b_ = simde__m128i_to_private(b);
23310 
23311     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
23312       /* Zero extend a */
23313       int16x8_t a_odd = vreinterpretq_s16_u16(vshrq_n_u16(a_.neon_u16, 8));
23314       int16x8_t a_even = vreinterpretq_s16_u16(vbicq_u16(a_.neon_u16, vdupq_n_u16(0xff00)));
23315 
23316       /* Sign extend by shifting left then shifting right. */
23317       int16x8_t b_even = vshrq_n_s16(vshlq_n_s16(b_.neon_i16, 8), 8);
23318       int16x8_t b_odd = vshrq_n_s16(b_.neon_i16, 8);
23319 
23320       /* multiply */
23321       int16x8_t prod1 = vmulq_s16(a_even, b_even);
23322       int16x8_t prod2 = vmulq_s16(a_odd, b_odd);
23323 
23324       /* saturated add */
23325       r_.neon_i16 = vqaddq_s16(prod1, prod2);
23326     #else
23327       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
23328         const int idx = HEDLEY_STATIC_CAST(int, i) << 1;
23329         int32_t ts =
23330           (HEDLEY_STATIC_CAST(int16_t, a_.u8[  idx  ]) * HEDLEY_STATIC_CAST(int16_t, b_.i8[  idx  ])) +
23331           (HEDLEY_STATIC_CAST(int16_t, a_.u8[idx + 1]) * HEDLEY_STATIC_CAST(int16_t, b_.i8[idx + 1]));
23332         r_.i16[i] = (ts > INT16_MIN) ? ((ts < INT16_MAX) ? HEDLEY_STATIC_CAST(int16_t, ts) : INT16_MAX) : INT16_MIN;
23333       }
23334     #endif
23335 
23336     return simde__m128i_from_private(r_);
23337   #endif
23338 }
23339 #if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES)
23340 #  define _mm_maddubs_epi16(a, b) simde_mm_maddubs_epi16(a, b)
23341 #endif
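
/* Per-lane formula implemented by the portable path above: for output
 * lane i,
 *   r[i] = saturate16(u8(a[2i]) * s8(b[2i]) + u8(a[2i+1]) * s8(b[2i+1]))
 * i.e. unsigned bytes from a times signed bytes from b, summed in pairs
 * with signed 16-bit saturation. */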
23342 
23343 SIMDE_FUNCTION_ATTRIBUTES
23344 simde__m64
23345 simde_mm_maddubs_pi16 (simde__m64 a, simde__m64 b) {
23346   #if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
23347     return _mm_maddubs_pi16(a, b);
23348   #else
23349     simde__m64_private
23350       r_,
23351       a_ = simde__m64_to_private(a),
23352       b_ = simde__m64_to_private(b);
23353 
23354     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
23355       int16x8_t ai = vreinterpretq_s16_u16(vmovl_u8(a_.neon_u8));
23356       int16x8_t bi = vmovl_s8(b_.neon_i8);
23357       int16x8_t p = vmulq_s16(ai, bi);
23358       int16x4_t l = vget_low_s16(p);
23359       int16x4_t h = vget_high_s16(p);
23360       r_.neon_i16 = vqadd_s16(vuzp1_s16(l, h), vuzp2_s16(l, h));
23361     #else
23362       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
23363         const int idx = HEDLEY_STATIC_CAST(int, i) << 1;
23364         int32_t ts =
23365           (HEDLEY_STATIC_CAST(int16_t, a_.u8[  idx  ]) * HEDLEY_STATIC_CAST(int16_t, b_.i8[  idx  ])) +
23366           (HEDLEY_STATIC_CAST(int16_t, a_.u8[idx + 1]) * HEDLEY_STATIC_CAST(int16_t, b_.i8[idx + 1]));
23367         r_.i16[i] = (ts > INT16_MIN) ? ((ts < INT16_MAX) ? HEDLEY_STATIC_CAST(int16_t, ts) : INT16_MAX) : INT16_MIN;
23368       }
23369     #endif
23370 
23371     return simde__m64_from_private(r_);
23372   #endif
23373 }
23374 #if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES)
23375 #  define _mm_maddubs_pi16(a, b) simde_mm_maddubs_pi16(a, b)
23376 #endif
23377 
23378 SIMDE_FUNCTION_ATTRIBUTES
23379 simde__m128i
23380 simde_mm_mulhrs_epi16 (simde__m128i a, simde__m128i b) {
23381   #if defined(SIMDE_X86_SSSE3_NATIVE)
23382     return _mm_mulhrs_epi16(a, b);
23383   #else
23384     simde__m128i_private
23385       r_,
23386       a_ = simde__m128i_to_private(a),
23387       b_ = simde__m128i_to_private(b);
23388 
23389     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
23390       /* Multiply */
23391       int32x4_t mul_lo = vmull_s16(vget_low_s16(a_.neon_i16),
23392                                   vget_low_s16(b_.neon_i16));
23393       int32x4_t mul_hi = vmull_s16(vget_high_s16(a_.neon_i16),
23394                                   vget_high_s16(b_.neon_i16));
23395 
23396       /* Rounding narrowing shift right
23397        * narrow = (int16_t)((mul + 16384) >> 15); */
23398       int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15);
23399       int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15);
23400 
23401       /* Join together */
23402       r_.neon_i16 = vcombine_s16(narrow_lo, narrow_hi);
23403     #else
23404       SIMDE_VECTORIZE
23405       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
23406         r_.i16[i] = HEDLEY_STATIC_CAST(int16_t, (((HEDLEY_STATIC_CAST(int32_t, a_.i16[i]) * HEDLEY_STATIC_CAST(int32_t, b_.i16[i])) + 0x4000) >> 15));
23407       }
23408     #endif
23409 
23410     return simde__m128i_from_private(r_);
23411   #endif
23412 }
23413 #if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES)
23414 #  define _mm_mulhrs_epi16(a, b) simde_mm_mulhrs_epi16(a, b)
23415 #endif
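
/* Worked example of the rounding in simde_mm_mulhrs_epi16:
 *   r[i] = (int16_t)(((int32_t)a[i] * b[i] + 0x4000) >> 15)
 * so a[i] = 0x4000 (0.5 in Q15) and b[i] = 0x2000 (0.25 in Q15) give
 * ((0x4000 * 0x2000) + 0x4000) >> 15 = 0x1000 (0.125 in Q15). */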
23416 
23417 SIMDE_FUNCTION_ATTRIBUTES
23418 simde__m64
23419 simde_mm_mulhrs_pi16 (simde__m64 a, simde__m64 b) {
23420   #if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
23421     return _mm_mulhrs_pi16(a, b);
23422   #else
23423     simde__m64_private
23424       r_,
23425       a_ = simde__m64_to_private(a),
23426       b_ = simde__m64_to_private(b);
23427 
23428     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
23429       /* Multiply */
23430       int32x4_t mul = vmull_s16(a_.neon_i16, b_.neon_i16);
23431 
23432       /* Rounding narrowing shift right
23433        * narrow = (int16_t)((mul + 16384) >> 15); */
23434       int16x4_t narrow = vrshrn_n_s32(mul, 15);
23435 
23436       /* Join together */
23437       r_.neon_i16 = narrow;
23438     #else
23439       SIMDE_VECTORIZE
23440       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
23441         r_.i16[i] = HEDLEY_STATIC_CAST(int16_t, (((HEDLEY_STATIC_CAST(int32_t, a_.i16[i]) * HEDLEY_STATIC_CAST(int32_t, b_.i16[i])) + 0x4000) >> 15));
23442       }
23443     #endif
23444 
23445     return simde__m64_from_private(r_);
23446   #endif
23447 }
23448 #if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES)
23449 #  define _mm_mulhrs_pi16(a, b) simde_mm_mulhrs_pi16(a, b)
23450 #endif
23451 
23452 SIMDE_FUNCTION_ATTRIBUTES
23453 simde__m128i
23454 simde_mm_sign_epi8 (simde__m128i a, simde__m128i b) {
23455   #if defined(SIMDE_X86_SSSE3_NATIVE)
23456     return _mm_sign_epi8(a, b);
23457   #else
23458     simde__m128i_private
23459       r_,
23460       a_ = simde__m128i_to_private(a),
23461       b_ = simde__m128i_to_private(b);
23462 
23463     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
23464       uint8x16_t aneg_mask = vreinterpretq_u8_s8(vshrq_n_s8(b_.neon_i8, 7));
23465       uint8x16_t bnz_mask;
23466       #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
23467         bnz_mask = vceqzq_s8(b_.neon_i8);
23468       #else
23469         bnz_mask = vceqq_s8(b_.neon_i8, vdupq_n_s8(0));
23470       #endif
23471       bnz_mask = vmvnq_u8(bnz_mask);
23472 
23473       r_.neon_i8 = vbslq_s8(aneg_mask, vnegq_s8(a_.neon_i8), vandq_s8(a_.neon_i8, vreinterpretq_s8_u8(bnz_mask)));
23474     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
23475       simde__m128i mask = wasm_i8x16_shr(b_.wasm_v128, 7);
23476       simde__m128i zeromask = simde_mm_cmpeq_epi8(b_.wasm_v128, simde_mm_setzero_si128());
23477       r_.wasm_v128 = simde_mm_andnot_si128(zeromask, simde_mm_xor_si128(simde_mm_add_epi8(a_.wasm_v128, mask), mask));
23478     #else
23479       SIMDE_VECTORIZE
23480       for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
23481         r_.i8[i] = (b_.i8[i] < 0) ? (- a_.i8[i]) : ((b_.i8[i] != 0) ? (a_.i8[i]) : INT8_C(0));
23482       }
23483     #endif
23484 
23485     return simde__m128i_from_private(r_);
23486   #endif
23487 }
23488 #if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES)
23489 #  define _mm_sign_epi8(a, b) simde_mm_sign_epi8(a, b)
23490 #endif
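
/* Semantics sketch for the sign family: each output lane is
 *   b < 0 ? -a : (b == 0 ? 0 : a)
 * e.g. simde_mm_sign_epi8 with an a-lane of 5 and a b-lane of -1 yields
 * -5, and any zero lane in b zeroes the corresponding output lane. */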
23491 
23492 SIMDE_FUNCTION_ATTRIBUTES
23493 simde__m128i
23494 simde_mm_sign_epi16 (simde__m128i a, simde__m128i b) {
23495   #if defined(SIMDE_X86_SSSE3_NATIVE)
23496     return _mm_sign_epi16(a, b);
23497   #else
23498     simde__m128i_private
23499       r_,
23500       a_ = simde__m128i_to_private(a),
23501       b_ = simde__m128i_to_private(b);
23502 
23503     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
23504       uint16x8_t aneg_mask = vreinterpretq_u16_s16(vshrq_n_s16(b_.neon_i16, 15));
23505       uint16x8_t bnz_mask;
23506       #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
23507         bnz_mask = vceqzq_s16(b_.neon_i16);
23508       #else
23509         bnz_mask = vceqq_s16(b_.neon_i16, vdupq_n_s16(0));
23510       #endif
23511       bnz_mask = vmvnq_u16(bnz_mask);
23512 
23513       r_.neon_i16 = vbslq_s16(aneg_mask, vnegq_s16(a_.neon_i16), vandq_s16(a_.neon_i16, vreinterpretq_s16_u16(bnz_mask)));
23514     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
23515       simde__m128i mask = simde_mm_srai_epi16(b_.wasm_v128, 15);
23516       simde__m128i zeromask = simde_mm_cmpeq_epi16(b_.wasm_v128, simde_mm_setzero_si128());
23517       r_.wasm_v128 = simde_mm_andnot_si128(zeromask, simde_mm_xor_si128(simde_mm_add_epi16(a_.wasm_v128, mask), mask));
23518     #else
23519       SIMDE_VECTORIZE
23520       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
23521         r_.i16[i] = (b_.i16[i] < 0) ? (- a_.i16[i]) : ((b_.i16[i] != 0) ? (a_.i16[i]) : INT16_C(0));
23522       }
23523     #endif
23524 
23525     return simde__m128i_from_private(r_);
23526   #endif
23527 }
23528 #if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES)
23529 #  define _mm_sign_epi16(a, b) simde_mm_sign_epi16(a, b)
23530 #endif
23531 
23532 SIMDE_FUNCTION_ATTRIBUTES
23533 simde__m128i
23534 simde_mm_sign_epi32 (simde__m128i a, simde__m128i b) {
23535   #if defined(SIMDE_X86_SSSE3_NATIVE)
23536     return _mm_sign_epi32(a, b);
23537   #else
23538     simde__m128i_private
23539       r_,
23540       a_ = simde__m128i_to_private(a),
23541       b_ = simde__m128i_to_private(b);
23542 
23543     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
23544       uint32x4_t aneg_mask = vreinterpretq_u32_s32(vshrq_n_s32(b_.neon_i32, 31));
23545       uint32x4_t bnz_mask;
23546       #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
23547         bnz_mask = vceqzq_s32(b_.neon_i32);
23548       #else
23549         bnz_mask = vceqq_s32(b_.neon_i32, vdupq_n_s32(0));
23550       #endif
23551       bnz_mask = vmvnq_u32(bnz_mask);
23552 
23553       r_.neon_i32 = vbslq_s32(aneg_mask, vnegq_s32(a_.neon_i32), vandq_s32(a_.neon_i32, vreinterpretq_s32_u32(bnz_mask)));
23554     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
23555       simde__m128i mask = simde_mm_srai_epi32(b_.wasm_v128, 31);
23556       simde__m128i zeromask = simde_mm_cmpeq_epi32(b_.wasm_v128, simde_mm_setzero_si128());
23557       r_.wasm_v128 = simde_mm_andnot_si128(zeromask, simde_mm_xor_si128(simde_mm_add_epi32(a_.wasm_v128, mask), mask));
23558     #else
23559       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
23560         r_.i32[i] = (b_.i32[i] < 0) ? (- a_.i32[i]) : ((b_.i32[i] != 0) ? (a_.i32[i]) : INT32_C(0));
23561       }
23562     #endif
23563 
23564     return simde__m128i_from_private(r_);
23565   #endif
23566 }
23567 #if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES)
23568 #  define _mm_sign_epi32(a, b) simde_mm_sign_epi32(a, b)
23569 #endif
23570 
23571 SIMDE_FUNCTION_ATTRIBUTES
23572 simde__m64
23573 simde_mm_sign_pi8 (simde__m64 a, simde__m64 b) {
23574   #if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
23575     return _mm_sign_pi8(a, b);
23576   #else
23577     simde__m64_private
23578       r_,
23579       a_ = simde__m64_to_private(a),
23580       b_ = simde__m64_to_private(b);
23581 
23582     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
23583       uint8x8_t aneg_mask = vreinterpret_u8_s8(vshr_n_s8(b_.neon_i8, 7));
23584       uint8x8_t bnz_mask;
23585       #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
23586         bnz_mask = vceqz_s8(b_.neon_i8);
23587       #else
23588         bnz_mask = vceq_s8(b_.neon_i8, vdup_n_s8(0));
23589       #endif
23590       bnz_mask = vmvn_u8(bnz_mask);
23591 
23592       r_.neon_i8 = vbsl_s8(aneg_mask, vneg_s8(a_.neon_i8), vand_s8(a_.neon_i8, vreinterpret_s8_u8(bnz_mask)));
23593     #else
23594       SIMDE_VECTORIZE
23595       for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
23596         r_.i8[i] = (b_.i8[i] < 0) ? (- a_.i8[i]) : ((b_.i8[i] != 0) ? (a_.i8[i]) : INT8_C(0));
23597       }
23598     #endif
23599 
23600     return simde__m64_from_private(r_);
23601   #endif
23602 }
23603 #if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES)
23604 #  define _mm_sign_pi8(a, b) simde_mm_sign_pi8(a, b)
23605 #endif
23606 
23607 SIMDE_FUNCTION_ATTRIBUTES
23608 simde__m64
23609 simde_mm_sign_pi16 (simde__m64 a, simde__m64 b) {
23610   #if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
23611     return _mm_sign_pi16(a, b);
23612   #else
23613     simde__m64_private
23614       r_,
23615       a_ = simde__m64_to_private(a),
23616       b_ = simde__m64_to_private(b);
23617 
23618     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
23619       uint16x4_t aneg_mask = vreinterpret_u16_s16(vshr_n_s16(b_.neon_i16, 15));
23620       uint16x4_t bnz_mask;
23621       #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
23622         bnz_mask = vceqz_s16(b_.neon_i16);
23623       #else
23624         bnz_mask = vceq_s16(b_.neon_i16, vdup_n_s16(0));
23625       #endif
23626       bnz_mask = vmvn_u16(bnz_mask);
23627 
23628       r_.neon_i16 = vbsl_s16(aneg_mask, vneg_s16(a_.neon_i16), vand_s16(a_.neon_i16, vreinterpret_s16_u16(bnz_mask)));
23629     #else
23630       SIMDE_VECTORIZE
23631       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
23632         r_.i16[i] = (b_.i16[i] < 0) ? (- a_.i16[i]) : ((b_.i16[i] > 0) ? (a_.i16[i]) : INT16_C(0));
23633       }
23634     #endif
23635 
23636     return simde__m64_from_private(r_);
23637   #endif
23638 }
23639 #if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES)
23640 #  define _mm_sign_pi16(a, b) simde_mm_sign_pi16(a, b)
23641 #endif
23642 
23643 SIMDE_FUNCTION_ATTRIBUTES
23644 simde__m64
23645 simde_mm_sign_pi32 (simde__m64 a, simde__m64 b) {
23646   #if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
23647     return _mm_sign_pi32(a, b);
23648   #else
23649     simde__m64_private
23650       r_,
23651       a_ = simde__m64_to_private(a),
23652       b_ = simde__m64_to_private(b);
23653 
23654     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
23655       uint32x2_t aneg_mask = vreinterpret_u32_s32(vshr_n_s32(b_.neon_i32, 31));
23656       uint32x2_t bnz_mask;
23657       #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
23658         bnz_mask = vceqz_s32(b_.neon_i32);
23659       #else
23660         bnz_mask = vceq_s32(b_.neon_i32, vdup_n_s32(0));
23661       #endif
23662       bnz_mask = vmvn_u32(bnz_mask);
23663 
23664       r_.neon_i32 = vbsl_s32(aneg_mask, vneg_s32(a_.neon_i32), vand_s32(a_.neon_i32, vreinterpret_s32_u32(bnz_mask)));
23665     #else
23666       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
23667         r_.i32[i] = (b_.i32[i] < 0) ? (- a_.i32[i]) : ((b_.i32[i] > 0) ? (a_.i32[i]) : INT32_C(0));
23668       }
23669     #endif
23670 
23671     return simde__m64_from_private(r_);
23672   #endif
23673 }
23674 #if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES)
23675 #  define _mm_sign_pi32(a, b) simde_mm_sign_pi32(a, b)
23676 #endif
23677 
23678 SIMDE_END_DECLS_
23679 
23680 HEDLEY_DIAGNOSTIC_POP
23681 
23682 #endif /* !defined(SIMDE_X86_SSSE3_H) */
23683 /* :: End ../simde/simde/x86/ssse3.h :: */
23684 
23685 HEDLEY_DIAGNOSTIC_PUSH
23686 SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
23687 SIMDE_BEGIN_DECLS_
23688 
23689 #if !defined(SIMDE_X86_SSE4_1_NATIVE) && defined(SIMDE_ENABLE_NATIVE_ALIASES)
23690 #  define SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES
23691 #endif
23692 
23693 SIMDE_FUNCTION_ATTRIBUTES
23694 simde__m128i
23695 simde_mm_blend_epi16 (simde__m128i a, simde__m128i b, const int imm8)
23696     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255)  {
23697   simde__m128i_private
23698     r_,
23699     a_ = simde__m128i_to_private(a),
23700     b_ = simde__m128i_to_private(b);
23701 
23702   SIMDE_VECTORIZE
23703   for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
23704     r_.u16[i] = ((imm8 >> i) & 1) ? b_.u16[i] : a_.u16[i];
23705   }
23706 
23707   return simde__m128i_from_private(r_);
23708 }
23709 #if defined(SIMDE_X86_SSE4_1_NATIVE)
23710 #  define simde_mm_blend_epi16(a, b, imm8) _mm_blend_epi16(a, b, imm8)
23711 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
23712 #  define simde_mm_blend_epi16(a, b, imm8) \
23713      (__extension__ ({ \
23714            const uint16_t _mask[8] = {               \
23715                ((imm8) & (1 << 0)) ? 0xFFFF : 0x0000, \
23716                ((imm8) & (1 << 1)) ? 0xFFFF : 0x0000, \
23717                ((imm8) & (1 << 2)) ? 0xFFFF : 0x0000, \
23718                ((imm8) & (1 << 3)) ? 0xFFFF : 0x0000, \
23719                ((imm8) & (1 << 4)) ? 0xFFFF : 0x0000, \
23720                ((imm8) & (1 << 5)) ? 0xFFFF : 0x0000, \
23721                ((imm8) & (1 << 6)) ? 0xFFFF : 0x0000, \
23722                ((imm8) & (1 << 7)) ? 0xFFFF : 0x0000  \
23723            };                                        \
23724            uint16x8_t _mask_vec = vld1q_u16(_mask);  \
23725            simde__m128i_from_neon_u16(vbslq_u16(_mask_vec, simde__m128i_to_neon_u16(b), simde__m128i_to_neon_u16(a))); \
23726        }))
23727 #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
23728 #  define simde_mm_blend_epi16(a, b, imm8)      \
23729      (__extension__ ({ \
23730            const SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) _mask = {      \
23731                ((imm8) & (1 << 0)) ? 0xFFFF : 0x0000, \
23732                ((imm8) & (1 << 1)) ? 0xFFFF : 0x0000, \
23733                ((imm8) & (1 << 2)) ? 0xFFFF : 0x0000, \
23734                ((imm8) & (1 << 3)) ? 0xFFFF : 0x0000, \
23735                ((imm8) & (1 << 4)) ? 0xFFFF : 0x0000, \
23736                ((imm8) & (1 << 5)) ? 0xFFFF : 0x0000, \
23737                ((imm8) & (1 << 6)) ? 0xFFFF : 0x0000, \
23738                ((imm8) & (1 << 7)) ? 0xFFFF : 0x0000  \
23739            };                                         \
23740            simde__m128i_from_altivec_u16(vec_sel(simde__m128i_to_altivec_u16(a), simde__m128i_to_altivec_u16(b), _mask)); \
23741        }))
23742 #endif
23743 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
23744   #undef _mm_blend_epi16
23745   #define _mm_blend_epi16(a, b, imm8) simde_mm_blend_epi16(a, b, imm8)
23746 #endif
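/* Illustrative use of the per-lane immediate select above: bit i of imm8
 * chooses b for 16-bit lane i, otherwise a.  For any two vectors a and b,
 * imm8 = 0xA5 (0b10100101) takes lanes 0, 2, 5 and 7 from b and the rest
 * from a:
 *
 *   simde__m128i r = simde_mm_blend_epi16(a, b, 0xA5);
 */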
23747 
23748 SIMDE_FUNCTION_ATTRIBUTES
23749 simde__m128d
23750 simde_mm_blend_pd (simde__m128d a, simde__m128d b, const int imm8)
23751     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 3)  {
23752   simde__m128d_private
23753     r_,
23754     a_ = simde__m128d_to_private(a),
23755     b_ = simde__m128d_to_private(b);
23756 
23757   SIMDE_VECTORIZE
23758   for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
23759     r_.f64[i] = ((imm8 >> i) & 1) ? b_.f64[i] : a_.f64[i];
23760   }
23761   return simde__m128d_from_private(r_);
23762 }
23763 #if defined(SIMDE_X86_SSE4_1_NATIVE)
23764 #  define simde_mm_blend_pd(a, b, imm8) _mm_blend_pd(a, b, imm8)
23765 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
23766 #  define simde_mm_blend_pd(a, b, imm8) \
23767      (__extension__ ({ \
23768            const uint64_t _mask[2] = {               \
23769                ((imm8) & (1 << 0)) ? UINT64_MAX : 0, \
23770                ((imm8) & (1 << 1)) ? UINT64_MAX : 0  \
23771            };                                        \
23772            uint64x2_t _mask_vec = vld1q_u64(_mask);  \
23773            simde__m128d_from_neon_u64(vbslq_u64(_mask_vec, simde__m128d_to_neon_u64(b), simde__m128d_to_neon_u64(a))); \
23774        }))
23775 #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
23776 #  define simde_mm_blend_pd(a, b, imm8)         \
23777      (__extension__ ({ \
23778            const SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long) _mask = { \
23779                ((imm8) & (1 << 0)) ? UINT64_MAX : 0, \
23780                ((imm8) & (1 << 1)) ? UINT64_MAX : 0  \
23781            };                                        \
23782            simde__m128d_from_altivec_f64(vec_sel(simde__m128d_to_altivec_f64(a), simde__m128d_to_altivec_f64(b), _mask)); \
23783        }))
23784 #endif
23785 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
23786   #undef _mm_blend_pd
23787   #define _mm_blend_pd(a, b, imm8) simde_mm_blend_pd(a, b, imm8)
23788 #endif
23789 
23790 SIMDE_FUNCTION_ATTRIBUTES
23791 simde__m128
23792 simde_mm_blend_ps (simde__m128 a, simde__m128 b, const int imm8)
23793     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 15)  {
23794   simde__m128_private
23795     r_,
23796     a_ = simde__m128_to_private(a),
23797     b_ = simde__m128_to_private(b);
23798 
23799   SIMDE_VECTORIZE
23800   for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
23801     r_.f32[i] = ((imm8 >> i) & 1) ? b_.f32[i] : a_.f32[i];
23802   }
23803   return simde__m128_from_private(r_);
23804 }
23805 #if defined(SIMDE_X86_SSE4_1_NATIVE)
23806 #  define simde_mm_blend_ps(a, b, imm8) _mm_blend_ps(a, b, imm8)
23807 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
23808 #  define simde_mm_blend_ps(a, b, imm8) \
23809      (__extension__ ({ \
23810            const uint32_t _mask[4] = {               \
23811                ((imm8) & (1 << 0)) ? UINT32_MAX : 0, \
23812                ((imm8) & (1 << 1)) ? UINT32_MAX : 0, \
23813                ((imm8) & (1 << 2)) ? UINT32_MAX : 0, \
23814                ((imm8) & (1 << 3)) ? UINT32_MAX : 0  \
23815            };                                        \
23816            uint32x4_t _mask_vec = vld1q_u32(_mask);  \
23817            simde__m128_from_neon_f32(vbslq_f32(_mask_vec, simde__m128_to_neon_f32(b), simde__m128_to_neon_f32(a))); \
23818        }))
23819 #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
23820 #  define simde_mm_blend_ps(a, b, imm8) \
23821      (__extension__ ({ \
23822            const SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) _mask = {       \
23823                ((imm8) & (1 << 0)) ? UINT32_MAX : 0, \
23824                ((imm8) & (1 << 1)) ? UINT32_MAX : 0, \
23825                ((imm8) & (1 << 2)) ? UINT32_MAX : 0, \
23826                ((imm8) & (1 << 3)) ? UINT32_MAX : 0  \
23827            };                                        \
23828            simde__m128_from_altivec_f32(vec_sel(simde__m128_to_altivec_f32(a), simde__m128_to_altivec_f32(b), _mask)); \
23829        }))
23830 #endif
23831 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
23832   #undef _mm_blend_ps
23833   #define _mm_blend_ps(a, b, imm8) simde_mm_blend_ps(a, b, imm8)
23834 #endif
23835 
23836 SIMDE_FUNCTION_ATTRIBUTES
23837 simde__m128i
23838 simde_mm_blendv_epi8 (simde__m128i a, simde__m128i b, simde__m128i mask) {
23839   #if defined(SIMDE_X86_SSE4_1_NATIVE)
23840     return _mm_blendv_epi8(a, b, mask);
23841   #else
23842     simde__m128i_private
23843       r_,
23844       a_ = simde__m128i_to_private(a),
23845       b_ = simde__m128i_to_private(b),
23846       mask_ = simde__m128i_to_private(mask);
23847 
23848     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
23849       /* Use a signed shift right to create a mask with the sign bit */
23850       mask_.neon_i8 = vshrq_n_s8(mask_.neon_i8, 7);
23851       r_.neon_i8 = vbslq_s8(mask_.neon_u8, b_.neon_i8, a_.neon_i8);
23852     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
23853       v128_t m = wasm_i8x16_shr(mask_.wasm_v128, 7);
23854       r_.wasm_v128 = wasm_v128_or(wasm_v128_and(b_.wasm_v128, m), wasm_v128_andnot(a_.wasm_v128, m));
23855     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
23856       r_.altivec_i8 = vec_sel(a_.altivec_i8, b_.altivec_i8, vec_cmplt(mask_.altivec_i8, vec_splat_s8(0)));
23857     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
23858       /* https://software.intel.com/en-us/forums/intel-c-compiler/topic/850087 */
23859       #if defined(HEDLEY_INTEL_VERSION_CHECK)
23860         __typeof__(mask_.i8) z = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
23861         mask_.i8 = HEDLEY_STATIC_CAST(__typeof__(mask_.i8), mask_.i8 < z);
23862       #else
23863         mask_.i8 >>= (CHAR_BIT * sizeof(mask_.i8[0])) - 1;
23864       #endif
23865 
23866       r_.i8 = (mask_.i8 & b_.i8) | (~mask_.i8 & a_.i8);
23867     #else
23868       SIMDE_VECTORIZE
23869       for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
23870         int8_t m = mask_.i8[i] >> 7;
23871         r_.i8[i] = (m & b_.i8[i]) | (~m & a_.i8[i]);
23872       }
23873     #endif
23874 
23875     return simde__m128i_from_private(r_);
23876   #endif
23877 }
23878 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
23879   #undef _mm_blendv_epi8
23880   #define _mm_blendv_epi8(a, b, mask) simde_mm_blendv_epi8(a, b, mask)
23881 #endif
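/* Note on the technique used above: only the most significant bit of each
 * mask byte participates in the select, exactly as with the native
 * PBLENDVB.  An arithmetic shift right by 7 broadcasts that bit across the
 * byte (0x80 -> 0xFF, 0x7F -> 0x00), after which a plain bitwise select
 * ((m & b) | (~m & a)) reproduces the blend.  Illustrative call:
 *
 *   simde__m128i r = simde_mm_blendv_epi8(a, b, simde_mm_cmpgt_epi8(b, a));
 *   // r holds the per-byte signed maximum of a and b, since the mask byte
 *   // is 0xFF exactly where b > a.
 */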
23882 
23883 SIMDE_FUNCTION_ATTRIBUTES
23884 simde__m128i
23885 simde_x_mm_blendv_epi16 (simde__m128i a, simde__m128i b, simde__m128i mask) {
23886   #if defined(SIMDE_X86_SSE2_NATIVE)
23887     mask = simde_mm_srai_epi16(mask, 15);
23888     return simde_mm_or_si128(simde_mm_and_si128(mask, b), simde_mm_andnot_si128(mask, a));
23889   #else
23890     simde__m128i_private
23891       r_,
23892       a_ = simde__m128i_to_private(a),
23893       b_ = simde__m128i_to_private(b),
23894       mask_ = simde__m128i_to_private(mask);
23895 
23896     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
23897       mask_ = simde__m128i_to_private(simde_mm_cmplt_epi16(mask, simde_mm_setzero_si128()));
23898       r_.neon_i16 = vbslq_s16(mask_.neon_u16, b_.neon_i16, a_.neon_i16);
23899     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
23900       r_.altivec_i16 = vec_sel(a_.altivec_i16, b_.altivec_i16, vec_cmplt(mask_.altivec_i16, vec_splat_s16(0)));
23901     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
23902       #if defined(HEDLEY_INTEL_VERSION_CHECK)
23903         __typeof__(mask_.i16) z = { 0, 0, 0, 0, 0, 0, 0, 0 };
23904         mask_.i16 = mask_.i16 < z;
23905       #else
23906         mask_.i16 >>= (CHAR_BIT * sizeof(mask_.i16[0])) - 1;
23907       #endif
23908 
23909       r_.i16 = (mask_.i16 & b_.i16) | (~mask_.i16 & a_.i16);
23910     #else
23911       SIMDE_VECTORIZE
23912       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
23913         int16_t m = mask_.i16[i] >> 15;
23914         r_.i16[i] = (m & b_.i16[i]) | (~m & a_.i16[i]);
23915       }
23916     #endif
23917 
23918     return simde__m128i_from_private(r_);
23919   #endif
23920 }
23921 
23922 SIMDE_FUNCTION_ATTRIBUTES
23923 simde__m128i
23924 simde_x_mm_blendv_epi32 (simde__m128i a, simde__m128i b, simde__m128i mask) {
23925   #if defined(SIMDE_X86_SSE4_1_NATIVE)
23926     return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _mm_castsi128_ps(mask)));
23927   #else
23928     simde__m128i_private
23929       r_,
23930       a_ = simde__m128i_to_private(a),
23931       b_ = simde__m128i_to_private(b),
23932       mask_ = simde__m128i_to_private(mask);
23933 
23934     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
23935       mask_ = simde__m128i_to_private(simde_mm_cmplt_epi32(mask, simde_mm_setzero_si128()));
23936       r_.neon_i32 = vbslq_s32(mask_.neon_u32, b_.neon_i32, a_.neon_i32);
23937     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
23938       v128_t m = wasm_i32x4_shr(mask_.wasm_v128, 31);
23939       r_.wasm_v128 = wasm_v128_or(wasm_v128_and(b_.wasm_v128, m), wasm_v128_andnot(a_.wasm_v128, m));
23940     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
23941       r_.altivec_i32 = vec_sel(a_.altivec_i32, b_.altivec_i32, vec_cmplt(mask_.altivec_i32, vec_splat_s32(0)));
23942     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
23943       #if defined(HEDLEY_INTEL_VERSION_CHECK)
23944         __typeof__(mask_.i32) z = { 0, 0, 0, 0 };
23945         mask_.i32 = HEDLEY_STATIC_CAST(__typeof__(mask_.i32), mask_.i32 < z);
23946       #else
23947         mask_.i32 >>= (CHAR_BIT * sizeof(mask_.i32[0])) - 1;
23948       #endif
23949 
23950       r_.i32 = (mask_.i32 & b_.i32) | (~mask_.i32 & a_.i32);
23951     #else
23952       SIMDE_VECTORIZE
23953       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
23954         int32_t m = mask_.i32[i] >> 31;
23955         r_.i32[i] = (m & b_.i32[i]) | (~m & a_.i32[i]);
23956       }
23957     #endif
23958 
23959     return simde__m128i_from_private(r_);
23960   #endif
23961 }
23962 
23963 SIMDE_FUNCTION_ATTRIBUTES
23964 simde__m128i
23965 simde_x_mm_blendv_epi64 (simde__m128i a, simde__m128i b, simde__m128i mask) {
23966   #if defined(SIMDE_X86_SSE4_1_NATIVE)
23967     return _mm_castpd_si128(_mm_blendv_pd(_mm_castsi128_pd(a), _mm_castsi128_pd(b), _mm_castsi128_pd(mask)));
23968   #else
23969     simde__m128i_private
23970       r_,
23971       a_ = simde__m128i_to_private(a),
23972       b_ = simde__m128i_to_private(b),
23973       mask_ = simde__m128i_to_private(mask);
23974 
23975     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
23976       mask_.neon_u64 = vcltq_s64(mask_.neon_i64, vdupq_n_s64(UINT64_C(0)));
23977       r_.neon_i64 = vbslq_s64(mask_.neon_u64, b_.neon_i64, a_.neon_i64);
23978     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
23979       v128_t m = wasm_i64x2_shr(mask_.wasm_v128, 63);
23980       r_.wasm_v128 = wasm_v128_or(wasm_v128_and(b_.wasm_v128, m), wasm_v128_andnot(a_.wasm_v128, m));
23981     #elif (defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) && !defined(SIMDE_BUG_CLANG_46770)) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
23982       r_.altivec_i64 = vec_sel(a_.altivec_i64, b_.altivec_i64, vec_cmplt(mask_.altivec_i64, vec_splats(HEDLEY_STATIC_CAST(signed long long, 0))));
23983     #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
23984       SIMDE_POWER_ALTIVEC_VECTOR(signed long long) selector = vec_sra(mask_.altivec_i64, vec_splats(HEDLEY_STATIC_CAST(unsigned long long, 63)));
23985       r_.altivec_i32 = vec_sel(a_.altivec_i32, b_.altivec_i32, HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned int), selector));
23986     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
23987       #if defined(HEDLEY_INTEL_VERSION_CHECK)
23988         __typeof__(mask_.i64) z = { 0, 0 };
23989         mask_.i64 = HEDLEY_STATIC_CAST(__typeof__(mask_.i64), mask_.i64 < z);
23990       #else
23991         mask_.i64 >>= (CHAR_BIT * sizeof(mask_.i64[0])) - 1;
23992       #endif
23993 
23994       r_.i64 = (mask_.i64 & b_.i64) | (~mask_.i64 & a_.i64);
23995     #else
23996       SIMDE_VECTORIZE
23997       for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
23998         int64_t m = mask_.i64[i] >> 63;
23999         r_.i64[i] = (m & b_.i64[i]) | (~m & a_.i64[i]);
24000       }
24001     #endif
24002 
24003     return simde__m128i_from_private(r_);
24004   #endif
24005 }
24006 
24007 SIMDE_FUNCTION_ATTRIBUTES
24008 simde__m128d
24009 simde_mm_blendv_pd (simde__m128d a, simde__m128d b, simde__m128d mask) {
24010   #if defined(SIMDE_X86_SSE4_1_NATIVE)
24011     return _mm_blendv_pd(a, b, mask);
24012   #else
24013     return simde_mm_castsi128_pd(simde_x_mm_blendv_epi64(simde_mm_castpd_si128(a), simde_mm_castpd_si128(b), simde_mm_castpd_si128(mask)));
24014   #endif
24015 }
24016 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
24017   #undef _mm_blendv_pd
24018   #define _mm_blendv_pd(a, b, mask) simde_mm_blendv_pd(a, b, mask)
24019 #endif
24020 
24021 SIMDE_FUNCTION_ATTRIBUTES
24022 simde__m128
24023 simde_mm_blendv_ps (simde__m128 a, simde__m128 b, simde__m128 mask) {
24024   #if defined(SIMDE_X86_SSE4_1_NATIVE)
24025     return _mm_blendv_ps(a, b, mask);
24026   #else
24027     return simde_mm_castsi128_ps(simde_x_mm_blendv_epi32(simde_mm_castps_si128(a), simde_mm_castps_si128(b), simde_mm_castps_si128(mask)));
24028   #endif
24029 }
24030 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
24031   #undef _mm_blendv_ps
24032   #define _mm_blendv_ps(a, b, mask) simde_mm_blendv_ps(a, b, mask)
24033 #endif
24034 
24035 SIMDE_FUNCTION_ATTRIBUTES
24036 simde__m128d
24037 simde_mm_round_pd (simde__m128d a, int rounding)
24038     SIMDE_REQUIRE_CONSTANT_RANGE(rounding, 0, 15) {
24039   simde__m128d_private
24040     r_,
24041     a_ = simde__m128d_to_private(a);
24042 
24043   /* For architectures which lack a current direction SIMD instruction. */
24044   #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
24045     if ((rounding & 7) == SIMDE_MM_FROUND_CUR_DIRECTION)
24046       rounding = HEDLEY_STATIC_CAST(int, SIMDE_MM_GET_ROUNDING_MODE()) << 13;
24047   #endif
24048 
24049   switch (rounding & ~SIMDE_MM_FROUND_NO_EXC) {
24050     case SIMDE_MM_FROUND_CUR_DIRECTION:
24051       #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
24052         r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_round(a_.altivec_f64));
24053       #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
24054         r_.neon_f64 = vrndiq_f64(a_.neon_f64);
24055       #elif defined(simde_math_nearbyint)
24056         SIMDE_VECTORIZE
24057         for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
24058           r_.f64[i] = simde_math_nearbyint(a_.f64[i]);
24059         }
24060       #else
24061         HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd());
24062       #endif
24063       break;
24064 
24065     case SIMDE_MM_FROUND_TO_NEAREST_INT:
24066       #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
24067         r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_round(a_.altivec_f64));
24068       #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
24069         r_.neon_f64 = vrndaq_f64(a_.neon_f64);
24070       #elif defined(simde_math_roundeven)
24071         SIMDE_VECTORIZE
24072         for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
24073           r_.f64[i] = simde_math_roundeven(a_.f64[i]);
24074         }
24075       #else
24076         HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd());
24077       #endif
24078       break;
24079 
24080     case SIMDE_MM_FROUND_TO_NEG_INF:
24081       #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
24082         r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_floor(a_.altivec_f64));
24083       #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
24084         r_.neon_f64 = vrndmq_f64(a_.neon_f64);
24085       #else
24086         SIMDE_VECTORIZE
24087         for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
24088           r_.f64[i] = simde_math_floor(a_.f64[i]);
24089         }
24090       #endif
24091       break;
24092 
24093     case SIMDE_MM_FROUND_TO_POS_INF:
24094       #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
24095         r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_ceil(a_.altivec_f64));
24096       #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
24097         r_.neon_f64 = vrndpq_f64(a_.neon_f64);
24098       #elif defined(simde_math_ceil)
24099         SIMDE_VECTORIZE
24100         for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
24101           r_.f64[i] = simde_math_ceil(a_.f64[i]);
24102         }
24103       #else
24104         HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd());
24105       #endif
24106       break;
24107 
24108     case SIMDE_MM_FROUND_TO_ZERO:
24109       #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
24110         r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_trunc(a_.altivec_f64));
24111       #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
24112         r_.neon_f64 = vrndq_f64(a_.neon_f64);
24113       #else
24114         SIMDE_VECTORIZE
24115         for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
24116           r_.f64[i] = simde_math_trunc(a_.f64[i]);
24117         }
24118       #endif
24119       break;
24120 
24121     default:
24122       HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd());
24123   }
24124 
24125   return simde__m128d_from_private(r_);
24126 }
24127 #if defined(SIMDE_X86_SSE4_1_NATIVE)
24128   #define simde_mm_round_pd(a, rounding) _mm_round_pd(a, rounding)
24129 #endif
24130 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
24131   #undef _mm_round_pd
24132   #define _mm_round_pd(a, rounding) simde_mm_round_pd(a, rounding)
24133 #endif
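/* Quick reference for the rounding-control values handled above, using
 * 1.3 and -1.7 as illustrative inputs:
 *
 *   SIMDE_MM_FROUND_TO_NEAREST_INT :  1.0, -2.0
 *   SIMDE_MM_FROUND_TO_NEG_INF     :  1.0, -2.0
 *   SIMDE_MM_FROUND_TO_POS_INF     :  2.0, -1.0
 *   SIMDE_MM_FROUND_TO_ZERO        :  1.0, -1.0
 *   SIMDE_MM_FROUND_CUR_DIRECTION  : rounds according to the current
 *                                    floating-point environment.
 */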
24134 
24135 SIMDE_FUNCTION_ATTRIBUTES
24136 simde__m128d
24137 simde_mm_ceil_pd (simde__m128d a) {
24138   return simde_mm_round_pd(a, SIMDE_MM_FROUND_TO_POS_INF);
24139 }
24140 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
24141   #undef _mm_ceil_pd
24142   #define _mm_ceil_pd(a) simde_mm_ceil_pd(a)
24143 #endif
24144 
24145 SIMDE_FUNCTION_ATTRIBUTES
24146 simde__m128
24147 simde_mm_ceil_ps (simde__m128 a) {
24148   return simde_mm_round_ps(a, SIMDE_MM_FROUND_TO_POS_INF);
24149 }
24150 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
24151   #undef _mm_ceil_ps
24152   #define _mm_ceil_ps(a) simde_mm_ceil_ps(a)
24153 #endif
24154 
24155 SIMDE_FUNCTION_ATTRIBUTES
24156 simde__m128d
24157 simde_mm_ceil_sd (simde__m128d a, simde__m128d b) {
24158   #if defined(SIMDE_X86_SSE4_1_NATIVE)
24159     return _mm_ceil_sd(a, b);
24160   #else
24161     simde__m128d_private
24162       r_,
24163       a_ = simde__m128d_to_private(a),
24164       b_ = simde__m128d_to_private(b);
24165 
24166     #if defined(simde_math_ceil)
24167       r_ = simde__m128d_to_private(simde_mm_set_pd(a_.f64[1], simde_math_ceil(b_.f64[0])));
24168     #else
24169       HEDLEY_UNREACHABLE();
24170     #endif
24171 
24172     return simde__m128d_from_private(r_);
24173   #endif
24174 }
24175 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
24176   #undef _mm_ceil_sd
24177   #define _mm_ceil_sd(a, b) simde_mm_ceil_sd(a, b)
24178 #endif
24179 
24180 SIMDE_FUNCTION_ATTRIBUTES
24181 simde__m128
24182 simde_mm_ceil_ss (simde__m128 a, simde__m128 b) {
24183   #if defined(SIMDE_X86_SSE4_1_NATIVE)
24184     return _mm_ceil_ss(a, b);
24185   #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
24186     return simde_mm_move_ss(a, simde_mm_ceil_ps(b));
24187   #else
24188     simde__m128_private
24189       r_,
24190       a_ = simde__m128_to_private(a),
24191       b_ = simde__m128_to_private(b);
24192 
24193     #if defined(simde_math_ceilf)
24194       r_ = simde__m128_to_private(simde_mm_set_ps(a_.f32[3], a_.f32[2], a_.f32[1], simde_math_ceilf(b_.f32[0])));
24195     #else
24196       HEDLEY_UNREACHABLE();
24197     #endif
24198 
24199     return simde__m128_from_private(r_);
24200   #endif
24201 }
24202 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
24203   #undef _mm_ceil_ss
24204   #define _mm_ceil_ss(a, b) simde_mm_ceil_ss(a, b)
24205 #endif
24206 
24207 SIMDE_FUNCTION_ATTRIBUTES
24208 simde__m128i
24209 simde_mm_cmpeq_epi64 (simde__m128i a, simde__m128i b) {
24210   #if defined(SIMDE_X86_SSE4_1_NATIVE)
24211     return _mm_cmpeq_epi64(a, b);
24212   #else
24213     simde__m128i_private
24214       r_,
24215       a_ = simde__m128i_to_private(a),
24216       b_ = simde__m128i_to_private(b);
24217 
24218     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
24219       r_.neon_u64 = vceqq_u64(a_.neon_u64, b_.neon_u64);
24220     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
24221       /* (a == b) -> (a_lo == b_lo) && (a_hi == b_hi) */
24222       uint32x4_t cmp = vceqq_u32(a_.neon_u32, b_.neon_u32);
24223       uint32x4_t swapped = vrev64q_u32(cmp);
24224       r_.neon_u32 = vandq_u32(cmp, swapped);
24225     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
24226       r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), a_.i64 == b_.i64);
24227     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
24228       r_.altivec_i64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed long long), vec_cmpeq(a_.altivec_i64, b_.altivec_i64));
24229     #else
24230       SIMDE_VECTORIZE
24231       for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) {
24232         r_.u64[i] = (a_.u64[i] == b_.u64[i]) ? ~UINT64_C(0) : UINT64_C(0);
24233       }
24234     #endif
24235 
24236     return simde__m128i_from_private(r_);
24237   #endif
24238 }
24239 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
24240   #undef _mm_cmpeq_epi64
24241   #define _mm_cmpeq_epi64(a, b) simde_mm_cmpeq_epi64(a, b)
24242 #endif
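/* The ARMv7 NEON path above synthesizes the 64-bit equality from 32-bit
 * compares: vceqq_u32 compares the low and high halves of each 64-bit lane
 * independently, vrev64q_u32 swaps the two halves within each lane, and the
 * final AND leaves a lane all-ones only when both halves matched.  E.g. for
 * 0x00000001FFFFFFFF vs 0x00000002FFFFFFFF the low halves match but the
 * high halves do not, so the AND of the half-results is zero ("not equal"),
 * as required. */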
24243 
24244 SIMDE_FUNCTION_ATTRIBUTES
24245 simde__m128i
24246 simde_mm_cvtepi8_epi16 (simde__m128i a) {
24247   #if defined(SIMDE_X86_SSE4_1_NATIVE)
24248     return _mm_cvtepi8_epi16(a);
24249   #elif defined(SIMDE_X86_SSE2_NATIVE)
24250     return _mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8);
24251   #else
24252     simde__m128i_private
24253       r_,
24254       a_ = simde__m128i_to_private(a);
24255 
24256     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
24257       int8x16_t s8x16 = a_.neon_i8;                   /* xxxx xxxx xxxx DCBA */
24258       int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */
24259       r_.neon_i16 = s16x8;
24260     #elif defined(SIMDE_SHUFFLE_VECTOR_) && defined(SIMDE_VECTOR_SCALAR) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
24261       r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), SIMDE_SHUFFLE_VECTOR_(8, 16, a_.i8, a_.i8,
24262           -1,  0, -1,  1, -1,  2,  -1,  3,
24263           -1,  4, -1,  5, -1,  6,  -1,  7));
24264       r_.i16 >>= 8;
24265     #elif defined(SIMDE_CONVERT_VECTOR_)
24266       SIMDE_CONVERT_VECTOR_(r_.i16, a_.m64_private[0].i8);
24267     #else
24268       SIMDE_VECTORIZE
24269       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
24270         r_.i16[i] = a_.i8[i];
24271       }
24272     #endif
24273 
24274     return simde__m128i_from_private(r_);
24275   #endif
24276 }
24277 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
24278   #undef _mm_cvtepi8_epi16
24279   #define _mm_cvtepi8_epi16(a) simde_mm_cvtepi8_epi16(a)
24280 #endif
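/* The SSE2 fallback above sign-extends without a dedicated instruction:
 * _mm_unpacklo_epi8(a, a) duplicates each low byte into both halves of a
 * 16-bit lane, and the arithmetic shift right by 8 then replicates the sign
 * bit into the upper byte.  Illustrative lane: the byte 0xF0 (-16) becomes
 * 0xF0F0, which _mm_srai_epi16(..., 8) turns into 0xFFF0, i.e. -16 as an
 * int16_t. */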
24281 
24282 SIMDE_FUNCTION_ATTRIBUTES
24283 simde__m128i
24284 simde_mm_cvtepi8_epi32 (simde__m128i a) {
24285   #if defined(SIMDE_X86_SSE4_1_NATIVE)
24286     return _mm_cvtepi8_epi32(a);
24287   #elif defined(SIMDE_X86_SSE2_NATIVE)
24288     __m128i tmp = _mm_unpacklo_epi8(a, a);
24289     tmp = _mm_unpacklo_epi16(tmp, tmp);
24290     return _mm_srai_epi32(tmp, 24);
24291   #else
24292     simde__m128i_private
24293       r_,
24294       a_ = simde__m128i_to_private(a);
24295 
24296     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
24297       int8x16_t s8x16 = a_.neon_i8;                     /* xxxx xxxx xxxx DCBA */
24298       int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16));   /* 0x0x 0x0x 0D0C 0B0A */
24299       int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000D 000C 000B 000A */
24300       r_.neon_i32 = s32x4;
24301     #elif defined(SIMDE_SHUFFLE_VECTOR_) && defined(SIMDE_VECTOR_SCALAR) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
24302       r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), SIMDE_SHUFFLE_VECTOR_(8, 16, a_.i8, a_.i8,
24303           -1, -1, -1,  0, -1, -1,  -1,  1,
24304           -1, -1, -1,  2, -1, -1,  -1,  3));
24305       r_.i32 >>= 24;
24306     #else
24307       SIMDE_VECTORIZE
24308       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
24309         r_.i32[i] = a_.i8[i];
24310       }
24311     #endif
24312 
24313     return simde__m128i_from_private(r_);
24314   #endif
24315 }
24316 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
24317   #undef _mm_cvtepi8_epi32
24318   #define _mm_cvtepi8_epi32(a) simde_mm_cvtepi8_epi32(a)
24319 #endif
24320 
24321 SIMDE_FUNCTION_ATTRIBUTES
24322 simde__m128i
24323 simde_mm_cvtepi8_epi64 (simde__m128i a) {
24324   #if defined(SIMDE_X86_SSE4_1_NATIVE)
24325     return _mm_cvtepi8_epi64(a);
24326   #else
24327     simde__m128i_private
24328       r_,
24329       a_ = simde__m128i_to_private(a);
24330 
24331     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
24332       int8x16_t s8x16 = a_.neon_i8;                     /* xxxx xxxx xxxx xxBA */
24333       int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16));   /* 0x0x 0x0x 0x0x 0B0A */
24334       int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
24335       int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
24336       r_.neon_i64 = s64x2;
24337     #elif (!defined(SIMDE_ARCH_X86) && !defined(SIMDE_ARCH_AMD64)) && defined(SIMDE_SHUFFLE_VECTOR_) && defined(SIMDE_VECTOR_SCALAR) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
24338       /* Disabled on x86 due to lack of a 64-bit arithmetic shift
24339        * until AVX-512 (at which point we would be using the native
24340        * _mm_cvtepi8_epi64 anyways). */
24341       r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), SIMDE_SHUFFLE_VECTOR_(8, 16, a_.i8, a_.i8,
24342           -1, -1, -1, -1, -1, -1,  -1,  0,
24343           -1, -1, -1, -1, -1, -1,  -1,  1));
24344       r_.i64 >>= 56;
24345     #else
24346       SIMDE_VECTORIZE
24347       for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
24348         r_.i64[i] = a_.i8[i];
24349       }
24350     #endif
24351 
24352     return simde__m128i_from_private(r_);
24353   #endif
24354 }
24355 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
24356   #undef _mm_cvtepi8_epi64
24357   #define _mm_cvtepi8_epi64(a) simde_mm_cvtepi8_epi64(a)
24358 #endif
24359 
24360 SIMDE_FUNCTION_ATTRIBUTES
24361 simde__m128i
24362 simde_mm_cvtepu8_epi16 (simde__m128i a) {
24363   #if defined(SIMDE_X86_SSE4_1_NATIVE)
24364     return _mm_cvtepu8_epi16(a);
24365   #elif defined(SIMDE_X86_SSE2_NATIVE)
24366     return _mm_unpacklo_epi8(a, _mm_setzero_si128());
24367   #else
24368     simde__m128i_private
24369       r_,
24370       a_ = simde__m128i_to_private(a);
24371 
24372     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
24373       uint8x16_t u8x16 = a_.neon_u8;                   /* xxxx xxxx xxxx DCBA */
24374       uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */
24375       r_.neon_u16 = u16x8;
24376     #elif defined(SIMDE_SHUFFLE_VECTOR_) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
24377       __typeof__(r_.i8) z = { 0, };
24378       r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), SIMDE_SHUFFLE_VECTOR_(8, 16, a_.i8, z,
24379           0, 16, 1, 17, 2, 18, 3, 19,
24380           4, 20, 5, 21, 6, 22, 7, 23));
24381     #elif defined(SIMDE_CONVERT_VECTOR_) && !defined(SIMDE_BUG_CLANG_45541) && (!defined(SIMDE_ARCH_POWER) || !defined(__clang__))
24382       SIMDE_CONVERT_VECTOR_(r_.i16, a_.m64_private[0].u8);
24383     #else
24384       SIMDE_VECTORIZE
24385       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
24386         r_.i16[i] = a_.u8[i];
24387       }
24388     #endif
24389 
24390     return simde__m128i_from_private(r_);
24391   #endif
24392 }
24393 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
24394   #undef _mm_cvtepu8_epi16
24395   #define _mm_cvtepu8_epi16(a) simde_mm_cvtepu8_epi16(a)
24396 #endif
24397 
24398 SIMDE_FUNCTION_ATTRIBUTES
24399 simde__m128i
24400 simde_mm_cvtepu8_epi32 (simde__m128i a) {
24401   #if defined(SIMDE_X86_SSE4_1_NATIVE)
24402     return _mm_cvtepu8_epi32(a);
24403   #elif defined(SIMDE_X86_SSSE3_NATIVE)
24404     __m128i s = _mm_set_epi8(
24405         0x80, 0x80, 0x80, 0x03, 0x80, 0x80, 0x80, 0x02,
24406         0x80, 0x80, 0x80, 0x01, 0x80, 0x80, 0x80, 0x00);
24407     return _mm_shuffle_epi8(a, s);
24408   #elif defined(SIMDE_X86_SSE2_NATIVE)
24409     __m128i z = _mm_setzero_si128();
24410     return _mm_unpacklo_epi16(_mm_unpacklo_epi8(a, z), z);
24411   #else
24412     simde__m128i_private
24413       r_,
24414       a_ = simde__m128i_to_private(a);
24415 
24416     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
24417       uint8x16_t u8x16 = a_.neon_u8;                     /* xxxx xxxx xxxx DCBA */
24418       uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16));   /* 0x0x 0x0x 0D0C 0B0A */
24419       uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000D 000C 000B 000A */
24420       r_.neon_u32 = u32x4;
24421     #elif defined(SIMDE_SHUFFLE_VECTOR_) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
24422       __typeof__(r_.i8) z = { 0, };
24423       r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), SIMDE_SHUFFLE_VECTOR_(8, 16, a_.i8, z,
24424           0, 17, 18, 19, 1, 21, 22, 23,
24425           2, 25, 26, 27, 3, 29, 30, 31));
24426     #else
24427       SIMDE_VECTORIZE
24428       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
24429         r_.i32[i] = a_.u8[i];
24430       }
24431     #endif
24432 
24433     return simde__m128i_from_private(r_);
24434   #endif
24435 }
24436 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
24437   #undef _mm_cvtepu8_epi32
24438   #define _mm_cvtepu8_epi32(a) simde_mm_cvtepu8_epi32(a)
24439 #endif
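/* The SSSE3 fallback above relies on a PSHUFB property: any shuffle-control
 * byte with its high bit set (0x80 here) writes zero to that destination
 * byte.  The control vector therefore places source bytes 0..3 in the low
 * byte of each 32-bit lane and zero everywhere else, which is exactly a
 * zero extension.  The plain SSE2 fallback reaches the same result with two
 * interleaves against a zero register. */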
24440 
24441 SIMDE_FUNCTION_ATTRIBUTES
24442 simde__m128i
24443 simde_mm_cvtepu8_epi64 (simde__m128i a) {
24444   #if defined(SIMDE_X86_SSE4_1_NATIVE)
24445     return _mm_cvtepu8_epi64(a);
24446   #elif defined(SIMDE_X86_SSSE3_NATIVE)
24447     __m128i s = _mm_set_epi8(
24448         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x01,
24449         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x00);
24450     return _mm_shuffle_epi8(a, s);
24451   #elif defined(SIMDE_X86_SSE2_NATIVE)
24452     __m128i z = _mm_setzero_si128();
24453     return _mm_unpacklo_epi32(_mm_unpacklo_epi16(_mm_unpacklo_epi8(a, z), z), z);
24454   #else
24455     simde__m128i_private
24456       r_,
24457       a_ = simde__m128i_to_private(a);
24458 
24459     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
24460       uint8x16_t u8x16 = a_.neon_u8;                     /* xxxx xxxx xxxx xxBA */
24461       uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16));   /* 0x0x 0x0x 0x0x 0B0A */
24462       uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
24463       uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
24464       r_.neon_u64 = u64x2;
24465     #elif defined(SIMDE_SHUFFLE_VECTOR_) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
24466       __typeof__(r_.i8) z = { 0, };
24467       r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), SIMDE_SHUFFLE_VECTOR_(8, 16, a_.i8, z,
24468           0, 17, 18, 19, 20, 21, 22, 23,
24469           1, 25, 26, 27, 28, 29, 30, 31));
24470     #else
24471       SIMDE_VECTORIZE
24472       for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
24473         r_.i64[i] = a_.u8[i];
24474       }
24475     #endif
24476 
24477     return simde__m128i_from_private(r_);
24478   #endif
24479 }
24480 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
24481   #undef _mm_cvtepu8_epi64
24482   #define _mm_cvtepu8_epi64(a) simde_mm_cvtepu8_epi64(a)
24483 #endif
24484 
24485 SIMDE_FUNCTION_ATTRIBUTES
24486 simde__m128i
24487 simde_mm_cvtepi16_epi32 (simde__m128i a) {
24488   #if defined(SIMDE_X86_SSE4_1_NATIVE)
24489     return _mm_cvtepi16_epi32(a);
24490   #elif defined(SIMDE_X86_SSE2_NATIVE)
24491     return _mm_srai_epi32(_mm_unpacklo_epi16(a, a), 16);
24492   #else
24493     simde__m128i_private
24494       r_,
24495       a_ = simde__m128i_to_private(a);
24496 
24497     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
24498       r_.neon_i32 = vmovl_s16(vget_low_s16(a_.neon_i16));
24499     #elif !defined(SIMDE_ARCH_X86) && defined(SIMDE_SHUFFLE_VECTOR_) && defined(SIMDE_VECTOR_SCALAR) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
24500       r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), SIMDE_SHUFFLE_VECTOR_(16, 16, a_.i16, a_.i16, 8, 0, 10, 1, 12, 2, 14, 3));
24501       r_.i32 >>= 16;
24502     #else
24503       SIMDE_VECTORIZE
24504       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
24505         r_.i32[i] = a_.i16[i];
24506       }
24507     #endif
24508 
24509     return simde__m128i_from_private(r_);
24510   #endif
24511 }
24512 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
24513   #undef _mm_cvtepi16_epi32
24514   #define _mm_cvtepi16_epi32(a) simde_mm_cvtepi16_epi32(a)
24515 #endif
24516 
24517 SIMDE_FUNCTION_ATTRIBUTES
24518 simde__m128i
24519 simde_mm_cvtepu16_epi32 (simde__m128i a) {
24520   #if defined(SIMDE_X86_SSE4_1_NATIVE)
24521     return _mm_cvtepu16_epi32(a);
24522   #elif defined(SIMDE_X86_SSE2_NATIVE)
24523     return _mm_unpacklo_epi16(a, _mm_setzero_si128());
24524   #else
24525     simde__m128i_private
24526       r_,
24527       a_ = simde__m128i_to_private(a);
24528 
24529     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
24530       r_.neon_u32 = vmovl_u16(vget_low_u16(a_.neon_u16));
24531     #elif defined(SIMDE_SHUFFLE_VECTOR_) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
24532       __typeof__(r_.u16) z = { 0, };
24533       r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), SIMDE_SHUFFLE_VECTOR_(16, 16, a_.u16, z,
24534           0, 9, 1, 11, 2, 13, 3, 15));
24535     #elif defined(SIMDE_CONVERT_VECTOR_) && !defined(SIMDE_BUG_CLANG_45541) && (!defined(SIMDE_ARCH_POWER) || !defined(__clang__))
24536       SIMDE_CONVERT_VECTOR_(r_.i32, a_.m64_private[0].u16);
24537     #else
24538       SIMDE_VECTORIZE
24539       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
24540         r_.i32[i] = a_.u16[i];
24541       }
24542     #endif
24543 
24544     return simde__m128i_from_private(r_);
24545   #endif
24546 }
24547 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
24548   #undef _mm_cvtepu16_epi32
24549   #define _mm_cvtepu16_epi32(a) simde_mm_cvtepu16_epi32(a)
24550 #endif
24551 
24552 SIMDE_FUNCTION_ATTRIBUTES
24553 simde__m128i
24554 simde_mm_cvtepu16_epi64 (simde__m128i a) {
24555   #if defined(SIMDE_X86_SSE4_1_NATIVE)
24556     return _mm_cvtepu16_epi64(a);
24557   #elif defined(SIMDE_X86_SSE2_NATIVE)
24558     __m128i z = _mm_setzero_si128();
24559     return _mm_unpacklo_epi32(_mm_unpacklo_epi16(a, z), z);
24560   #else
24561     simde__m128i_private
24562       r_,
24563       a_ = simde__m128i_to_private(a);
24564 
24565     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
24566       uint16x8_t u16x8 = a_.neon_u16;                    /* xxxx xxxx xxxx 0B0A */
24567       uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
24568       uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
24569       r_.neon_u64 = u64x2;
24570     #elif defined(SIMDE_SHUFFLE_VECTOR_) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
24571       __typeof__(r_.u16) z = { 0, };
24572       r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), SIMDE_SHUFFLE_VECTOR_(16, 16, a_.u16, z,
24573           0,  9, 10, 11,
24574           1, 13, 14, 15));
24575     #else
24576       SIMDE_VECTORIZE
24577       for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
24578         r_.i64[i] = a_.u16[i];
24579       }
24580     #endif
24581 
24582     return simde__m128i_from_private(r_);
24583   #endif
24584 }
24585 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
24586   #undef _mm_cvtepu16_epi64
24587   #define _mm_cvtepu16_epi64(a) simde_mm_cvtepu16_epi64(a)
24588 #endif
24589 
24590 SIMDE_FUNCTION_ATTRIBUTES
24591 simde__m128i
24592 simde_mm_cvtepi16_epi64 (simde__m128i a) {
24593   #if defined(SIMDE_X86_SSE4_1_NATIVE)
24594     return _mm_cvtepi16_epi64(a);
24595   #else
24596     simde__m128i_private
24597       r_,
24598       a_ = simde__m128i_to_private(a);
24599 
24600     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
24601       int16x8_t s16x8 = a_.neon_i16;                    /* xxxx xxxx xxxx 0B0A */
24602       int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
24603       int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
24604       r_.neon_i64 = s64x2;
24605     #elif (!defined(SIMDE_ARCH_X86) && !defined(SIMDE_ARCH_AMD64)) && defined(SIMDE_SHUFFLE_VECTOR_) && defined(SIMDE_VECTOR_SCALAR) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
24606       r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), SIMDE_SHUFFLE_VECTOR_(16, 16, a_.i16, a_.i16,
24607            8,  9, 10, 0,
24608           12, 13, 14, 1));
24609       r_.i64 >>= 48;
24610     #else
24611       SIMDE_VECTORIZE
24612       for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
24613         r_.i64[i] = a_.i16[i];
24614       }
24615     #endif
24616 
24617     return simde__m128i_from_private(r_);
24618   #endif
24619 }
24620 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
24621   #undef _mm_cvtepi16_epi64
24622   #define _mm_cvtepi16_epi64(a) simde_mm_cvtepi16_epi64(a)
24623 #endif
24624 
24625 SIMDE_FUNCTION_ATTRIBUTES
24626 simde__m128i
24627 simde_mm_cvtepi32_epi64 (simde__m128i a) {
24628   #if defined(SIMDE_X86_SSE4_1_NATIVE)
24629     return _mm_cvtepi32_epi64(a);
24630   #elif defined(SIMDE_X86_SSE2_NATIVE)
24631     __m128i tmp = _mm_shuffle_epi32(a, 0x50);
24632     tmp = _mm_srai_epi32(tmp, 31);
24633     tmp = _mm_shuffle_epi32(tmp, 0xed);
24634     return _mm_unpacklo_epi32(a, tmp);
24635   #else
24636     simde__m128i_private
24637       r_,
24638       a_ = simde__m128i_to_private(a);
24639 
24640     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
24641       r_.neon_i64 = vmovl_s32(vget_low_s32(a_.neon_i32));
24642     #elif !defined(SIMDE_ARCH_X86) && defined(SIMDE_SHUFFLE_VECTOR_) && defined(SIMDE_VECTOR_SCALAR) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
24643       r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), SIMDE_SHUFFLE_VECTOR_(32, 16, a_.i32, a_.i32, -1, 0, -1, 1));
24644       r_.i64 >>= 32;
24645     #elif defined(SIMDE_CONVERT_VECTOR_)
24646       SIMDE_CONVERT_VECTOR_(r_.i64, a_.m64_private[0].i32);
24647     #else
24648       SIMDE_VECTORIZE
24649       for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
24650         r_.i64[i] = a_.i32[i];
24651       }
24652     #endif
24653 
24654     return simde__m128i_from_private(r_);
24655   #endif
24656 }
24657 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
24658   #undef _mm_cvtepi32_epi64
24659   #define _mm_cvtepi32_epi64(a) simde_mm_cvtepi32_epi64(a)
24660 #endif
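/* Sketch of the SSE2 fallback above: _mm_shuffle_epi32(a, 0x50) arranges
 * the two low elements as {a0, a0, a1, a1}, _mm_srai_epi32(..., 31) turns
 * each copy into its sign mask, and _mm_shuffle_epi32(..., 0xed) followed
 * by _mm_unpacklo_epi32 interleaves value and sign words so that each
 * 64-bit lane ends up as a_i in the low word and sign(a_i) in the high
 * word, i.e. the sign-extended 64-bit value. */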
24661 
24662 SIMDE_FUNCTION_ATTRIBUTES
24663 simde__m128i
24664 simde_mm_cvtepu32_epi64 (simde__m128i a) {
24665   #if defined(SIMDE_X86_SSE4_1_NATIVE)
24666     return _mm_cvtepu32_epi64(a);
24667   #elif defined(SIMDE_X86_SSE2_NATIVE)
24668     return _mm_unpacklo_epi32(a, _mm_setzero_si128());
24669   #else
24670     simde__m128i_private
24671       r_,
24672       a_ = simde__m128i_to_private(a);
24673 
24674     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
24675       r_.neon_u64 = vmovl_u32(vget_low_u32(a_.neon_u32));
24676     #elif defined(SIMDE_VECTOR_SCALAR) && defined(SIMDE_SHUFFLE_VECTOR_) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
24677       __typeof__(r_.u32) z = { 0, };
24678       r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), SIMDE_SHUFFLE_VECTOR_(32, 16, a_.u32, z, 0, 4, 1, 6));
24679     #elif defined(SIMDE_CONVERT_VECTOR_)
24680       SIMDE_CONVERT_VECTOR_(r_.i64, a_.m64_private[0].u32);
24681     #else
24682       SIMDE_VECTORIZE
24683       for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
24684         r_.i64[i] = a_.u32[i];
24685       }
24686     #endif
24687 
24688     return simde__m128i_from_private(r_);
24689   #endif
24690 }
24691 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
24692   #undef _mm_cvtepu32_epi64
24693   #define _mm_cvtepu32_epi64(a) simde_mm_cvtepu32_epi64(a)
24694 #endif
24695 
24696 SIMDE_FUNCTION_ATTRIBUTES
24697 simde__m128d
24698 simde_mm_dp_pd (simde__m128d a, simde__m128d b, const int imm8)
24699     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255)  {
24700   simde__m128d_private
24701     r_,
24702     a_ = simde__m128d_to_private(a),
24703     b_ = simde__m128d_to_private(b);
24704 
24705   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
24706     r_.neon_f64 = vmulq_f64(a_.neon_f64, b_.neon_f64);
24707 
24708     switch (imm8) {
24709       case 0xff:
24710         r_.neon_f64 = vaddq_f64(r_.neon_f64, vextq_f64(r_.neon_f64, r_.neon_f64, 1));
24711         break;
24712       case 0x13:
24713         r_.neon_f64 = vdupq_lane_f64(vget_low_f64(r_.neon_f64), 0);
24714         break;
24715       default:
24716         { /* imm8 is a compile-time constant, so this all becomes just a load */
24717           uint64_t mask_data[] = {
24718             (imm8 & (1 << 4)) ? ~UINT64_C(0) : UINT64_C(0),
24719             (imm8 & (1 << 5)) ? ~UINT64_C(0) : UINT64_C(0),
24720           };
24721           r_.neon_f64 = vreinterpretq_f64_u64(vandq_u64(vld1q_u64(mask_data), vreinterpretq_u64_f64(r_.neon_f64)));
24722         }
24723 
24724         r_.neon_f64 = vdupq_n_f64(vaddvq_f64(r_.neon_f64));
24725 
24726         {
24727           uint64_t mask_data[] = {
24728             (imm8 & 1) ? ~UINT64_C(0) : UINT64_C(0),
24729             (imm8 & 2) ? ~UINT64_C(0) : UINT64_C(0)
24730           };
24731           r_.neon_f64 = vreinterpretq_f64_u64(vandq_u64(vld1q_u64(mask_data), vreinterpretq_u64_f64(r_.neon_f64)));
24732         }
24733         break;
24734     }
24735   #else
24736     simde_float64 sum = SIMDE_FLOAT64_C(0.0);
24737 
24738     SIMDE_VECTORIZE_REDUCTION(+:sum)
24739     for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
24740       sum += ((imm8 >> (i + 4)) & 1) ? (a_.f64[i] * b_.f64[i]) : 0.0;
24741     }
24742 
24743     SIMDE_VECTORIZE
24744     for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
24745       r_.f64[i] = ((imm8 >> i) & 1) ? sum : 0.0;
24746     }
24747   #endif
24748 
24749   return simde__m128d_from_private(r_);
24750 }
24751 #if defined(SIMDE_X86_SSE4_1_NATIVE)
24752 #  define simde_mm_dp_pd(a, b, imm8) _mm_dp_pd(a, b, imm8)
24753 #endif
24754 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
24755   #undef _mm_dp_pd
24756   #define _mm_dp_pd(a, b, imm8) simde_mm_dp_pd(a, b, imm8)
24757 #endif
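/* How the imm8 of the dot product above is consumed: bits 4 and 5 select
 * which of the two lane products enter the sum, bits 0 and 1 select which
 * result lanes receive that sum (the others are set to 0.0).  Illustrative
 * call, assuming x and y are simde__m128d values:
 *
 *   simde__m128d d = simde_mm_dp_pd(x, y, 0x31);
 *   // sum = x[0]*y[0] + x[1]*y[1]; d = { sum, 0.0 }
 */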
24758 
24759 SIMDE_FUNCTION_ATTRIBUTES
24760 simde__m128
24761 simde_mm_dp_ps (simde__m128 a, simde__m128 b, const int imm8)
24762     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255)  {
24763   simde__m128_private
24764     r_,
24765     a_ = simde__m128_to_private(a),
24766     b_ = simde__m128_to_private(b);
24767 
24768   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
24769     r_.neon_f32 = vmulq_f32(a_.neon_f32, b_.neon_f32);
24770 
24771     switch (imm8) {
24772       case 0xff:
24773         r_.neon_f32 = vdupq_n_f32(vaddvq_f32(r_.neon_f32));
24774         break;
24775       case 0x7f:
24776         r_.neon_f32 = vsetq_lane_f32(0, r_.neon_f32, 3);
24777         r_.neon_f32 = vdupq_n_f32(vaddvq_f32(r_.neon_f32));
24778         break;
24779       default:
24780         {
24781           {
24782             uint32_t mask_data[] = {
24783               (imm8 & (1 << 4)) ? ~UINT32_C(0) : UINT32_C(0),
24784               (imm8 & (1 << 5)) ? ~UINT32_C(0) : UINT32_C(0),
24785               (imm8 & (1 << 6)) ? ~UINT32_C(0) : UINT32_C(0),
24786               (imm8 & (1 << 7)) ? ~UINT32_C(0) : UINT32_C(0)
24787             };
24788             r_.neon_f32 = vreinterpretq_f32_u32(vandq_u32(vld1q_u32(mask_data), vreinterpretq_u32_f32(r_.neon_f32)));
24789           }
24790 
24791           r_.neon_f32 = vdupq_n_f32(vaddvq_f32(r_.neon_f32));
24792 
24793           {
24794             uint32_t mask_data[] = {
24795               (imm8 & 1) ? ~UINT32_C(0) : UINT32_C(0),
24796               (imm8 & 2) ? ~UINT32_C(0) : UINT32_C(0),
24797               (imm8 & 4) ? ~UINT32_C(0) : UINT32_C(0),
24798               (imm8 & 8) ? ~UINT32_C(0) : UINT32_C(0)
24799             };
24800             r_.neon_f32 = vreinterpretq_f32_u32(vandq_u32(vld1q_u32(mask_data), vreinterpretq_u32_f32(r_.neon_f32)));
24801           }
24802         }
24803         break;
24804     }
24805   #else
24806     simde_float32 sum = SIMDE_FLOAT32_C(0.0);
24807 
24808     SIMDE_VECTORIZE_REDUCTION(+:sum)
24809     for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
24810       sum += ((imm8 >> (i + 4)) & 1) ? (a_.f32[i] * b_.f32[i]) : SIMDE_FLOAT32_C(0.0);
24811     }
24812 
24813     SIMDE_VECTORIZE
24814     for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
24815       r_.f32[i] = ((imm8 >> i) & 1) ? sum : SIMDE_FLOAT32_C(0.0);
24816     }
24817   #endif
24818 
24819   return simde__m128_from_private(r_);
24820 }
24821 #if defined(SIMDE_X86_SSE4_1_NATIVE)
24822 #  define simde_mm_dp_ps(a, b, imm8) _mm_dp_ps(a, b, imm8)
24823 #endif
24824 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
24825   #undef _mm_dp_ps
24826   #define _mm_dp_ps(a, b, imm8) simde_mm_dp_ps(a, b, imm8)
24827 #endif
24828 
24829 #if defined(simde_mm_extract_epi8)
24830 #  undef simde_mm_extract_epi8
24831 #endif
24832 SIMDE_FUNCTION_ATTRIBUTES
24833 int8_t
24834 simde_mm_extract_epi8 (simde__m128i a, const int imm8)
24835     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 15)  {
24836   simde__m128i_private
24837     a_ = simde__m128i_to_private(a);
24838 
24839   #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
24840     #if defined(SIMDE_BUG_GCC_95227)
24841       (void) a_;
24842       (void) imm8;
24843     #endif
24844     return vec_extract(a_.altivec_i8, imm8);
24845   #else
24846     return a_.i8[imm8 & 15];
24847   #endif
24848 }
24849 #if defined(SIMDE_X86_SSE4_1_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_MM_EXTRACT_EPI8)
24850 #  define simde_mm_extract_epi8(a, imm8) HEDLEY_STATIC_CAST(int8_t, _mm_extract_epi8(a, imm8))
24851 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
24852 #  define simde_mm_extract_epi8(a, imm8) vgetq_lane_s8(simde__m128i_to_private(a).neon_i8, imm8)
24853 #endif
24854 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
24855   #undef _mm_extract_epi8
24856   #define _mm_extract_epi8(a, imm8) HEDLEY_STATIC_CAST(int, simde_mm_extract_epi8(a, imm8))
24857 #endif
24858 
24859 #if defined(simde_mm_extract_epi32)
24860 #  undef simde_mm_extract_epi32
24861 #endif
24862 SIMDE_FUNCTION_ATTRIBUTES
24863 int32_t
24864 simde_mm_extract_epi32 (simde__m128i a, const int imm8)
24865     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 3)  {
24866   simde__m128i_private
24867     a_ = simde__m128i_to_private(a);
24868 
24869   #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
24870     #if defined(SIMDE_BUG_GCC_95227)
24871       (void) a_;
24872       (void) imm8;
24873     #endif
24874     return vec_extract(a_.altivec_i32, imm8);
24875   #else
24876     return a_.i32[imm8 & 3];
24877   #endif
24878 }
24879 #if defined(SIMDE_X86_SSE4_1_NATIVE)
24880 #  define simde_mm_extract_epi32(a, imm8) _mm_extract_epi32(a, imm8)
24881 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
24882 #  define simde_mm_extract_epi32(a, imm8) vgetq_lane_s32(simde__m128i_to_private(a).neon_i32, imm8)
24883 #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
24884 #  define simde_mm_extract_epi32(a, imm8) HEDLEY_STATIC_CAST(int32_t, vec_extract(simde__m128i_to_private(a).altivec_i32, imm8))
24885 #endif
24886 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
24887   #undef _mm_extract_epi32
24888   #define _mm_extract_epi32(a, imm8) simde_mm_extract_epi32(a, imm8)
24889 #endif
24890 
24891 #if defined(simde_mm_extract_epi64)
24892 #  undef simde_mm_extract_epi64
24893 #endif
24894 SIMDE_FUNCTION_ATTRIBUTES
24895 int64_t
24896 simde_mm_extract_epi64 (simde__m128i a, const int imm8)
24897     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 1)  {
24898   simde__m128i_private
24899     a_ = simde__m128i_to_private(a);
24900 
24901   #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
24902     #if defined(SIMDE_BUG_GCC_95227)
24903       (void) a_;
24904       (void) imm8;
24905     #endif
24906     return vec_extract(a_.altivec_i64, imm8);
24907   #else
24908     return a_.i64[imm8 & 1];
24909   #endif
24910 }
24911 #if defined(SIMDE_X86_SSE4_1_NATIVE) && defined(SIMDE_ARCH_AMD64)
24912 #  define simde_mm_extract_epi64(a, imm8) _mm_extract_epi64(a, imm8)
24913 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
24914 #  define simde_mm_extract_epi64(a, imm8) vgetq_lane_s64(simde__m128i_to_private(a).neon_i64, imm8)
24915 #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
24916 #  define simde_mm_extract_epi64(a, imm8) HEDLEY_STATIC_CAST(int64_t, vec_extract(simde__m128i_to_private(a).altivec_i64, imm8))
24917 #endif
24918 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_AMD64))
24919   #undef _mm_extract_epi64
24920   #define _mm_extract_epi64(a, imm8) simde_mm_extract_epi64(a, imm8)
24921 #endif
24922 
24923 #if defined(simde_mm_extract_ps)
24924 #  undef simde_mm_extract_ps
24925 #endif
24926 SIMDE_FUNCTION_ATTRIBUTES
24927 int32_t
24928 simde_mm_extract_ps (simde__m128 a, const int imm8)
24929     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 3)  {
24930   simde__m128_private
24931     a_ = simde__m128_to_private(a);
24932 
24933   return a_.i32[imm8 & 3];
24934 }
24935 #if defined(SIMDE_X86_SSE4_1_NATIVE)
24936   #define simde_mm_extract_ps(a, imm8) _mm_extract_ps(a, imm8)
24937 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
24938   #define simde_mm_extract_ps(a, imm8) vgetq_lane_s32(simde__m128_to_private(a).neon_i32, imm8)
24939 #endif
24940 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
24941   #undef _mm_extract_ps
24942   #define _mm_extract_ps(a, imm8) simde_mm_extract_ps(a, imm8)
24943 #endif
24944 
24945 SIMDE_FUNCTION_ATTRIBUTES
24946 simde__m128d
24947 simde_mm_floor_pd (simde__m128d a) {
24948   return simde_mm_round_pd(a, SIMDE_MM_FROUND_TO_NEG_INF);
24949 }
24950 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
24951   #undef _mm_floor_pd
24952   #define _mm_floor_pd(a) simde_mm_floor_pd(a)
24953 #endif
24954 
24955 SIMDE_FUNCTION_ATTRIBUTES
24956 simde__m128
24957 simde_mm_floor_ps (simde__m128 a) {
24958   return simde_mm_round_ps(a, SIMDE_MM_FROUND_TO_NEG_INF);
24959 }
24960 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
24961   #undef _mm_floor_ps
24962   #define _mm_floor_ps(a) simde_mm_floor_ps(a)
24963 #endif
24964 
24965 SIMDE_FUNCTION_ATTRIBUTES
24966 simde__m128d
24967 simde_mm_floor_sd (simde__m128d a, simde__m128d b) {
24968   #if defined(SIMDE_X86_SSE4_1_NATIVE)
24969     return _mm_floor_sd(a, b);
24970   #else
24971     simde__m128d_private
24972       r_,
24973       a_ = simde__m128d_to_private(a),
24974       b_ = simde__m128d_to_private(b);
24975 
24976     #if defined(simde_math_floor)
24977       r_.f64[0] = simde_math_floor(b_.f64[0]);
24978       r_.f64[1] = a_.f64[1];
24979     #else
24980       HEDLEY_UNREACHABLE();
24981     #endif
24982 
24983     return simde__m128d_from_private(r_);
24984   #endif
24985 }
24986 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
24987   #undef _mm_floor_sd
24988   #define _mm_floor_sd(a, b) simde_mm_floor_sd(a, b)
24989 #endif
24990 
24991 SIMDE_FUNCTION_ATTRIBUTES
24992 simde__m128
24993 simde_mm_floor_ss (simde__m128 a, simde__m128 b) {
24994   #if defined(SIMDE_X86_SSE4_1_NATIVE)
24995     return _mm_floor_ss(a, b);
24996   #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
24997       return simde_mm_move_ss(a, simde_mm_floor_ps(b));
24998   #else
24999     simde__m128_private
25000       r_,
25001       a_ = simde__m128_to_private(a),
25002       b_ = simde__m128_to_private(b);
25003 
25004     #if defined(simde_math_floorf)
25005       r_.f32[0] = simde_math_floorf(b_.f32[0]);
25006       for (size_t i = 1 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
25007         r_.f32[i] = a_.f32[i];
25008       }
25009     #else
25010       HEDLEY_UNREACHABLE();
25011     #endif
25012 
25013     return simde__m128_from_private(r_);
25014   #endif
25015 }
25016 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
25017   #undef _mm_floor_ss
25018   #define _mm_floor_ss(a, b) simde_mm_floor_ss(a, b)
25019 #endif
25020 
25021 SIMDE_FUNCTION_ATTRIBUTES
25022 simde__m128i
25023 simde_mm_insert_epi8 (simde__m128i a, int i, const int imm8)
25024     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 15)  {
25025   simde__m128i_private
25026     r_ = simde__m128i_to_private(a);
25027 
25028   r_.i8[imm8] = HEDLEY_STATIC_CAST(int8_t, i);
25029 
25030   return simde__m128i_from_private(r_);
25031 }
25032 #if defined(SIMDE_X86_SSE4_1_NATIVE)
25033   /* clang-3.8 returns an incompatible type, so we need the cast.  MSVC
25034    * can't handle the cast ("error C2440: 'type cast': cannot convert
25035    * from '__m128i' to '__m128i'").  */
25036   #if defined(__clang__)
25037     #define simde_mm_insert_epi8(a, i, imm8) HEDLEY_STATIC_CAST(__m128i, _mm_insert_epi8(a, i, imm8))
25038   #else
25039     #define simde_mm_insert_epi8(a, i, imm8) _mm_insert_epi8(a, i, imm8)
25040   #endif
25041 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
25042 #  define simde_mm_insert_epi8(a, i, imm8) simde__m128i_from_neon_i8(vsetq_lane_s8(i, simde__m128i_to_private(a).neon_i8, imm8))
25043 #endif
25044 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
25045   #undef _mm_insert_epi8
25046   #define _mm_insert_epi8(a, i, imm8) simde_mm_insert_epi8(a, i, imm8)
25047 #endif
25048 
25049 SIMDE_FUNCTION_ATTRIBUTES
25050 simde__m128i
25051 simde_mm_insert_epi32 (simde__m128i a, int i, const int imm8)
25052     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 3)  {
25053   simde__m128i_private
25054     r_ = simde__m128i_to_private(a);
25055 
25056   r_.i32[imm8] = HEDLEY_STATIC_CAST(int32_t, i);
25057 
25058   return simde__m128i_from_private(r_);
25059 }
25060 #if defined(SIMDE_X86_SSE4_1_NATIVE)
25061   #if defined(__clang__)
25062     #define simde_mm_insert_epi32(a, i, imm8) HEDLEY_STATIC_CAST(__m128i, _mm_insert_epi32(a, i, imm8))
25063   #else
25064     #define simde_mm_insert_epi32(a, i, imm8) _mm_insert_epi32(a, i, imm8)
25065   #endif
25066 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
25067 #  define simde_mm_insert_epi32(a, i, imm8) simde__m128i_from_neon_i32(vsetq_lane_s32(i, simde__m128i_to_private(a).neon_i32, imm8))
25068 #endif
25069 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
25070   #undef _mm_insert_epi32
25071   #define _mm_insert_epi32(a, i, imm8) simde_mm_insert_epi32(a, i, imm8)
25072 #endif
25073 
25074 SIMDE_FUNCTION_ATTRIBUTES
25075 simde__m128i
25076 simde_mm_insert_epi64 (simde__m128i a, int64_t i, const int imm8)
25077     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 1)  {
25078   #if defined(SIMDE_BUG_GCC_94482)
25079     simde__m128i_private
25080       a_ = simde__m128i_to_private(a);
25081 
25082     switch(imm8) {
25083       case 0:
25084         return simde_mm_set_epi64x(a_.i64[1], i);
25085         break;
25086       case 1:
25087         return simde_mm_set_epi64x(i, a_.i64[0]);
25088         break;
25089       default:
25090         HEDLEY_UNREACHABLE();
25091         break;
25092     }
25093   #else
25094     simde__m128i_private
25095       r_ = simde__m128i_to_private(a);
25096 
25097     r_.i64[imm8] = i;
25098     return simde__m128i_from_private(r_);
25099   #endif
25100 }
25101 #if defined(SIMDE_X86_SSE4_1_NATIVE) && defined(SIMDE_ARCH_AMD64)
25102 #  define simde_mm_insert_epi64(a, i, imm8) _mm_insert_epi64(a, i, imm8)
25103 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
25104 #  define simde_mm_insert_epi64(a, i, imm8) simde__m128i_from_neon_i64(vsetq_lane_s64(i, simde__m128i_to_private(a).neon_i64, imm8))
25105 #endif
25106 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_AMD64))
25107   #undef _mm_insert_epi64
25108   #define _mm_insert_epi64(a, i, imm8) simde_mm_insert_epi64(a, i, imm8)
25109 #endif
25110 
25111 SIMDE_FUNCTION_ATTRIBUTES
25112 simde__m128
25113 simde_mm_insert_ps (simde__m128 a, simde__m128 b, const int imm8)
25114     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255)  {
25115   simde__m128_private
25116     r_,
25117     a_ = simde__m128_to_private(a),
25118     b_ = simde__m128_to_private(b);
25119 
25120   simde_float32 tmp = b_.f32[(imm8 >> 6) & 3]; /* imm8 bits 7:6 select the source element of b */
25121   a_.f32[(imm8 >> 4) & 3] = tmp;               /* imm8 bits 5:4 select the destination element of a */
25122 
25123   SIMDE_VECTORIZE
25124   for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
25125     r_.f32[i] = ((imm8 >> i) & 1) ? SIMDE_FLOAT32_C(0.0) : a_.f32[i];
25126   }
25127 
25128   return simde__m128_from_private(r_);
25129 }
25130 #if defined(SIMDE_X86_SSE4_1_NATIVE)
25131 #  define simde_mm_insert_ps(a, b, imm8) _mm_insert_ps(a, b, imm8)
25132 #endif
25133 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
25134   #undef _mm_insert_ps
25135   #define _mm_insert_ps(a, b, imm8) simde_mm_insert_ps(a, b, imm8)
25136 #endif
25137 
25138 SIMDE_FUNCTION_ATTRIBUTES
25139 simde__m128i
25140 simde_mm_max_epi8 (simde__m128i a, simde__m128i b) {
25141   #if defined(SIMDE_X86_SSE4_1_NATIVE) && !defined(__PGI)
25142     return _mm_max_epi8(a, b);
25143   #else
25144     simde__m128i_private
25145       r_,
25146       a_ = simde__m128i_to_private(a),
25147       b_ = simde__m128i_to_private(b);
25148 
25149     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
25150       r_.neon_i8 = vmaxq_s8(a_.neon_i8, b_.neon_i8);
25151     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
25152       r_.wasm_v128 = wasm_i8x16_max(a_.wasm_v128, b_.wasm_v128);
25153     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
25154       r_.altivec_i8 = vec_max(a_.altivec_i8, b_.altivec_i8);
25155     #else
25156       SIMDE_VECTORIZE
25157       for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
25158         r_.i8[i] = a_.i8[i] > b_.i8[i] ? a_.i8[i] : b_.i8[i];
25159       }
25160     #endif
25161 
25162     return simde__m128i_from_private(r_);
25163   #endif
25164 }
25165 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
25166   #undef _mm_max_epi8
25167   #define _mm_max_epi8(a, b) simde_mm_max_epi8(a, b)
25168 #endif
25169 
25170 SIMDE_FUNCTION_ATTRIBUTES
25171 simde__m128i
25172 simde_mm_max_epi32 (simde__m128i a, simde__m128i b) {
25173   #if defined(SIMDE_X86_SSE4_1_NATIVE) && !defined(__PGI)
25174     return _mm_max_epi32(a, b);
25175   #else
25176     simde__m128i_private
25177       r_,
25178       a_ = simde__m128i_to_private(a),
25179       b_ = simde__m128i_to_private(b);
25180 
25181     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
25182       r_.neon_i32 = vmaxq_s32(a_.neon_i32, b_.neon_i32);
25183     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
25184       r_.wasm_v128 = wasm_i32x4_max(a_.wasm_v128, b_.wasm_v128);
25185     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
25186       r_.altivec_i32 = vec_max(a_.altivec_i32, b_.altivec_i32);
25187     #else
25188       SIMDE_VECTORIZE
25189       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
25190         r_.i32[i] = a_.i32[i] > b_.i32[i] ? a_.i32[i] : b_.i32[i];
25191       }
25192     #endif
25193 
25194     return simde__m128i_from_private(r_);
25195   #endif
25196 }
25197 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
25198   #undef _mm_max_epi32
25199   #define _mm_max_epi32(a, b) simde_mm_max_epi32(a, b)
25200 #endif
25201 
25202 SIMDE_FUNCTION_ATTRIBUTES
25203 simde__m128i
25204 simde_mm_max_epu16 (simde__m128i a, simde__m128i b) {
25205   #if defined(SIMDE_X86_SSE4_1_NATIVE)
25206     return _mm_max_epu16(a, b);
25207   #else
25208     simde__m128i_private
25209       r_,
25210       a_ = simde__m128i_to_private(a),
25211       b_ = simde__m128i_to_private(b);
25212 
25213     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
25214       r_.neon_u16 = vmaxq_u16(a_.neon_u16, b_.neon_u16);
25215     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
25216       r_.wasm_v128 = wasm_u16x8_max(a_.wasm_v128, b_.wasm_v128);
25217     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
25218       r_.altivec_u16 = vec_max(a_.altivec_u16, b_.altivec_u16);
25219     #else
25220       SIMDE_VECTORIZE
25221       for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
25222         r_.u16[i] = a_.u16[i] > b_.u16[i] ? a_.u16[i] : b_.u16[i];
25223       }
25224     #endif
25225 
25226     return simde__m128i_from_private(r_);
25227   #endif
25228 }
25229 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
25230   #undef _mm_max_epu16
25231   #define _mm_max_epu16(a, b) simde_mm_max_epu16(a, b)
25232 #endif
25233 
25234 SIMDE_FUNCTION_ATTRIBUTES
25235 simde__m128i
25236 simde_mm_max_epu32 (simde__m128i a, simde__m128i b) {
25237   #if defined(SIMDE_X86_SSE4_1_NATIVE)
25238     return _mm_max_epu32(a, b);
25239   #else
25240     simde__m128i_private
25241       r_,
25242       a_ = simde__m128i_to_private(a),
25243       b_ = simde__m128i_to_private(b);
25244 
25245     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
25246       r_.neon_u32 = vmaxq_u32(a_.neon_u32, b_.neon_u32);
25247     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
25248       r_.wasm_v128 = wasm_u32x4_max(a_.wasm_v128, b_.wasm_v128);
25249     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
25250       r_.altivec_u32 = vec_max(a_.altivec_u32, b_.altivec_u32);
25251     #else
25252       SIMDE_VECTORIZE
25253       for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
25254         r_.u32[i] = a_.u32[i] > b_.u32[i] ? a_.u32[i] : b_.u32[i];
25255       }
25256     #endif
25257 
25258     return simde__m128i_from_private(r_);
25259   #endif
25260 }
25261 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
25262   #undef _mm_max_epu32
25263   #define _mm_max_epu32(a, b) simde_mm_max_epu32(a, b)
25264 #endif
25265 
25266 SIMDE_FUNCTION_ATTRIBUTES
25267 simde__m128i
25268 simde_mm_min_epi8 (simde__m128i a, simde__m128i b) {
25269   #if defined(SIMDE_X86_SSE4_1_NATIVE) && !defined(__PGI)
25270     return _mm_min_epi8(a, b);
25271   #else
25272     simde__m128i_private
25273       r_,
25274       a_ = simde__m128i_to_private(a),
25275       b_ = simde__m128i_to_private(b);
25276 
25277     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
25278       r_.neon_i8 = vminq_s8(a_.neon_i8, b_.neon_i8);
25279     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
25280       r_.wasm_v128 = wasm_i8x16_min(a_.wasm_v128, b_.wasm_v128);
25281     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
25282       r_.altivec_i8 = vec_min(a_.altivec_i8, b_.altivec_i8);
25283     #else
25284       SIMDE_VECTORIZE
25285       for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
25286         r_.i8[i] = a_.i8[i] < b_.i8[i] ? a_.i8[i] : b_.i8[i];
25287       }
25288     #endif
25289 
25290     return simde__m128i_from_private(r_);
25291   #endif
25292 }
25293 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
25294   #undef _mm_min_epi8
25295   #define _mm_min_epi8(a, b) simde_mm_min_epi8(a, b)
25296 #endif
25297 
25298 SIMDE_FUNCTION_ATTRIBUTES
25299 simde__m128i
25300 simde_mm_min_epi32 (simde__m128i a, simde__m128i b) {
25301   #if defined(SIMDE_X86_SSE4_1_NATIVE) && !defined(__PGI)
25302     return _mm_min_epi32(a, b);
25303   #else
25304     simde__m128i_private
25305       r_,
25306       a_ = simde__m128i_to_private(a),
25307       b_ = simde__m128i_to_private(b);
25308 
25309     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
25310       r_.neon_i32 = vminq_s32(a_.neon_i32, b_.neon_i32);
25311     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
25312       r_.wasm_v128 = wasm_i32x4_min(a_.wasm_v128, b_.wasm_v128);
25313     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
25314       r_.altivec_i32 = vec_min(a_.altivec_i32, b_.altivec_i32);
25315     #else
25316       SIMDE_VECTORIZE
25317       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
25318         r_.i32[i] = a_.i32[i] < b_.i32[i] ? a_.i32[i] : b_.i32[i];
25319       }
25320     #endif
25321 
25322     return simde__m128i_from_private(r_);
25323   #endif
25324 }
25325 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
25326   #undef _mm_min_epi32
25327   #define _mm_min_epi32(a, b) simde_mm_min_epi32(a, b)
25328 #endif
25329 
25330 SIMDE_FUNCTION_ATTRIBUTES
25331 simde__m128i
25332 simde_mm_min_epu16 (simde__m128i a, simde__m128i b) {
25333   #if defined(SIMDE_X86_SSE4_1_NATIVE)
25334     return _mm_min_epu16(a, b);
25335   #else
25336     simde__m128i_private
25337       r_,
25338       a_ = simde__m128i_to_private(a),
25339       b_ = simde__m128i_to_private(b);
25340 
25341     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
25342       r_.neon_u16 = vminq_u16(a_.neon_u16, b_.neon_u16);
25343     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
25344       r_.wasm_v128 = wasm_u16x8_min(a_.wasm_v128, b_.wasm_v128);
25345     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
25346       r_.altivec_u16 = vec_min(a_.altivec_u16, b_.altivec_u16);
25347     #else
25348       SIMDE_VECTORIZE
25349       for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
25350         r_.u16[i] = a_.u16[i] < b_.u16[i] ? a_.u16[i] : b_.u16[i];
25351       }
25352     #endif
25353 
25354     return simde__m128i_from_private(r_);
25355   #endif
25356 }
25357 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
25358   #undef _mm_min_epu16
25359   #define _mm_min_epu16(a, b) simde_mm_min_epu16(a, b)
25360 #endif
25361 
25362 SIMDE_FUNCTION_ATTRIBUTES
25363 simde__m128i
25364 simde_mm_min_epu32 (simde__m128i a, simde__m128i b) {
25365   #if defined(SIMDE_X86_SSE4_1_NATIVE)
25366     return _mm_min_epu32(a, b);
25367   #else
25368     simde__m128i_private
25369       r_,
25370       a_ = simde__m128i_to_private(a),
25371       b_ = simde__m128i_to_private(b);
25372 
25373     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
25374       r_.neon_u32 = vminq_u32(a_.neon_u32, b_.neon_u32);
25375     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
25376       r_.wasm_v128 = wasm_u32x4_min(a_.wasm_v128, b_.wasm_v128);
25377     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
25378       r_.altivec_u32 = vec_min(a_.altivec_u32, b_.altivec_u32);
25379     #else
25380       SIMDE_VECTORIZE
25381       for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
25382         r_.u32[i] = a_.u32[i] < b_.u32[i] ? a_.u32[i] : b_.u32[i];
25383       }
25384     #endif
25385 
25386     return simde__m128i_from_private(r_);
25387   #endif
25388 }
25389 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
25390   #undef _mm_min_epu32
25391   #define _mm_min_epu32(a, b) simde_mm_min_epu32(a, b)
25392 #endif
25393 
25394 SIMDE_FUNCTION_ATTRIBUTES
25395 simde__m128i
25396 simde_mm_minpos_epu16 (simde__m128i a) {
25397   #if defined(SIMDE_X86_SSE4_1_NATIVE)
25398     return _mm_minpos_epu16(a);
25399   #else
25400     simde__m128i_private
25401       r_ = simde__m128i_to_private(simde_mm_setzero_si128()),
25402       a_ = simde__m128i_to_private(a);
25403 
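    /* Portable scan: lane 0 receives the smallest unsigned 16-bit element of a, lane 1 the index
     * of its first occurrence; the remaining lanes stay zero. */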
25404     r_.u16[0] = UINT16_MAX;
25405     for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
25406       if (a_.u16[i] < r_.u16[0]) {
25407         r_.u16[0] = a_.u16[i];
25408         r_.u16[1] = HEDLEY_STATIC_CAST(uint16_t, i);
25409       }
25410     }
25411 
25412     return simde__m128i_from_private(r_);
25413   #endif
25414 }
25415 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
25416   #undef _mm_minpos_epu16
25417   #define _mm_minpos_epu16(a) simde_mm_minpos_epu16(a)
25418 #endif
25419 
25420 SIMDE_FUNCTION_ATTRIBUTES
25421 simde__m128i
25422 simde_mm_mpsadbw_epu8 (simde__m128i a, simde__m128i b, const int imm8)
25423     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255)  {
25424   simde__m128i_private
25425     r_,
25426     a_ = simde__m128i_to_private(a),
25427     b_ = simde__m128i_to_private(b);
25428 
25429   const int a_offset = imm8 & 4;
25430   const int b_offset = (imm8 & 3) << 2;
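  /* imm8 bit 2 picks a byte offset of 0 or 4 into a, and imm8 bits 1:0 pick which aligned 4-byte
   * block of b is used; each result lane is a sum of absolute differences over four byte pairs. */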
25431 
25432 #if defined(simde_math_abs)
25433   for (int i = 0 ; i < HEDLEY_STATIC_CAST(int, (sizeof(r_.u16) / sizeof(r_.u16[0]))) ; i++) {
25434     r_.u16[i] =
25435       HEDLEY_STATIC_CAST(uint16_t, simde_math_abs(HEDLEY_STATIC_CAST(int, a_.u8[a_offset + i + 0] - b_.u8[b_offset + 0]))) +
25436       HEDLEY_STATIC_CAST(uint16_t, simde_math_abs(HEDLEY_STATIC_CAST(int, a_.u8[a_offset + i + 1] - b_.u8[b_offset + 1]))) +
25437       HEDLEY_STATIC_CAST(uint16_t, simde_math_abs(HEDLEY_STATIC_CAST(int, a_.u8[a_offset + i + 2] - b_.u8[b_offset + 2]))) +
25438       HEDLEY_STATIC_CAST(uint16_t, simde_math_abs(HEDLEY_STATIC_CAST(int, a_.u8[a_offset + i + 3] - b_.u8[b_offset + 3])));
25439   }
25440 #else
25441   HEDLEY_UNREACHABLE();
25442 #endif
25443 
25444   return simde__m128i_from_private(r_);
25445 }
25446 #if defined(SIMDE_X86_SSE4_1_NATIVE)
25447 #  define simde_mm_mpsadbw_epu8(a, b, imm8) _mm_mpsadbw_epu8(a, b, imm8)
25448 #endif
25449 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
25450   #undef _mm_mpsadbw_epu8
25451   #define _mm_mpsadbw_epu8(a, b, imm8) simde_mm_mpsadbw_epu8(a, b, imm8)
25452 #endif
25453 
25454 SIMDE_FUNCTION_ATTRIBUTES
25455 simde__m128i
25456 simde_mm_mul_epi32 (simde__m128i a, simde__m128i b) {
25457   #if defined(SIMDE_X86_SSE4_1_NATIVE)
25458     return _mm_mul_epi32(a, b);
25459   #else
25460     simde__m128i_private
25461       r_,
25462       a_ = simde__m128i_to_private(a),
25463       b_ = simde__m128i_to_private(b);
25464 
25465     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
25466       // vmull_s32 upcasts instead of masking, so we downcast.
25467       int32x2_t a_lo = vmovn_s64(a_.neon_i64);
25468       int32x2_t b_lo = vmovn_s64(b_.neon_i64);
25469       r_.neon_i64 = vmull_s32(a_lo, b_lo);
25470     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
25471       r_.wasm_v128 = wasm_i64x2_make(
25472         wasm_i32x4_extract_lane(a_.wasm_v128, 0) * HEDLEY_STATIC_CAST(int64_t, wasm_i32x4_extract_lane(b_.wasm_v128, 0)),
25473         wasm_i32x4_extract_lane(a_.wasm_v128, 2) * HEDLEY_STATIC_CAST(int64_t, wasm_i32x4_extract_lane(b_.wasm_v128, 2)));
25474     #else
25475       SIMDE_VECTORIZE
25476       for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
25477         r_.i64[i] =
25478           HEDLEY_STATIC_CAST(int64_t, a_.i32[i * 2]) *
25479           HEDLEY_STATIC_CAST(int64_t, b_.i32[i * 2]);
25480       }
25481     #endif
25482 
25483     return simde__m128i_from_private(r_);
25484   #endif
25485 }
25486 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
25487   #undef _mm_mul_epi32
25488   #define _mm_mul_epi32(a, b) simde_mm_mul_epi32(a, b)
25489 #endif
25490 
25491 SIMDE_FUNCTION_ATTRIBUTES
25492 simde__m128i
25493 simde_mm_mullo_epi32 (simde__m128i a, simde__m128i b) {
25494   #if defined(SIMDE_X86_SSE4_1_NATIVE)
25495     return _mm_mullo_epi32(a, b);
25496   #else
25497     simde__m128i_private
25498       r_,
25499       a_ = simde__m128i_to_private(a),
25500       b_ = simde__m128i_to_private(b);
25501 
25502     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
25503       r_.neon_i32 = vmulq_s32(a_.neon_i32, b_.neon_i32);
25504     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
25505       (void) a_;
25506       (void) b_;
25507       r_.altivec_i32 = vec_mul(a_.altivec_i32, b_.altivec_i32);
25508     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
25509       r_.wasm_v128 = wasm_i32x4_mul(a_.wasm_v128, b_.wasm_v128);
25510     #else
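      /* Multiply in 64-bit arithmetic and keep only the low 32 bits so signed-integer
       * overflow (undefined behaviour) is avoided. */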
25511       SIMDE_VECTORIZE
25512       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
25513         r_.u32[i] = HEDLEY_STATIC_CAST(uint32_t, (HEDLEY_STATIC_CAST(uint64_t, (HEDLEY_STATIC_CAST(int64_t, a_.i32[i]) * HEDLEY_STATIC_CAST(int64_t, b_.i32[i]))) & 0xffffffff));
25514       }
25515     #endif
25516 
25517     return simde__m128i_from_private(r_);
25518   #endif
25519 }
25520 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
25521   #undef _mm_mullo_epi32
25522   #define _mm_mullo_epi32(a, b) simde_mm_mullo_epi32(a, b)
25523 #endif
25524 
25525 SIMDE_FUNCTION_ATTRIBUTES
25526 simde__m128i
25527 simde_x_mm_mullo_epu32 (simde__m128i a, simde__m128i b) {
25528   simde__m128i_private
25529     r_,
25530     a_ = simde__m128i_to_private(a),
25531     b_ = simde__m128i_to_private(b);
25532 
25533     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
25534       r_.neon_u32 = vmulq_u32(a_.neon_u32, b_.neon_u32);
25535     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
25536       r_.u32 = a_.u32 * b_.u32;
25537     #else
25538       SIMDE_VECTORIZE
25539       for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
25540         r_.u32[i] = a_.u32[i] * b_.u32[i];
25541       }
25542     #endif
25543 
25544   return simde__m128i_from_private(r_);
25545 }
25546 
25547 SIMDE_FUNCTION_ATTRIBUTES
25548 simde__m128i
25549 simde_mm_packus_epi32 (simde__m128i a, simde__m128i b) {
25550   #if defined(SIMDE_X86_SSE4_1_NATIVE)
25551     return _mm_packus_epi32(a, b);
25552   #else
25553     simde__m128i_private
25554       r_,
25555       a_ = simde__m128i_to_private(a),
25556       b_ = simde__m128i_to_private(b);
25557 
25558     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
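      /* Clamp negative inputs to zero with a signed max, then narrow with unsigned saturation
       * so values above 65535 are clamped as well. */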
25559       const int32x4_t z = vdupq_n_s32(0);
25560       r_.neon_u16 = vcombine_u16(
25561           vqmovn_u32(vreinterpretq_u32_s32(vmaxq_s32(z, a_.neon_i32))),
25562           vqmovn_u32(vreinterpretq_u32_s32(vmaxq_s32(z, b_.neon_i32))));
25563     #else
25564       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
25565         r_.u16[i + 0] = (a_.i32[i] < 0) ? UINT16_C(0) : ((a_.i32[i] > UINT16_MAX) ? (UINT16_MAX) : HEDLEY_STATIC_CAST(uint16_t, a_.i32[i]));
25566         r_.u16[i + 4] = (b_.i32[i] < 0) ? UINT16_C(0) : ((b_.i32[i] > UINT16_MAX) ? (UINT16_MAX) : HEDLEY_STATIC_CAST(uint16_t, b_.i32[i]));
25567       }
25568     #endif
25569 
25570     return simde__m128i_from_private(r_);
25571   #endif
25572 }
25573 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
25574   #undef _mm_packus_epi32
25575   #define _mm_packus_epi32(a, b) simde_mm_packus_epi32(a, b)
25576 #endif
25577 
25578 SIMDE_FUNCTION_ATTRIBUTES
25579 simde__m128d
25580 simde_mm_round_sd (simde__m128d a, simde__m128d b, int rounding)
25581     SIMDE_REQUIRE_CONSTANT_RANGE(rounding, 0, 15) {
25582   simde__m128d_private
25583     r_ = simde__m128d_to_private(a),
25584     b_ = simde__m128d_to_private(b);
25585 
25586   switch (rounding & ~SIMDE_MM_FROUND_NO_EXC) {
25587     #if defined(simde_math_nearbyint)
25588       case SIMDE_MM_FROUND_TO_NEAREST_INT:
25589       case SIMDE_MM_FROUND_CUR_DIRECTION:
25590         r_.f64[0] = simde_math_nearbyint(b_.f64[0]);
25591         break;
25592     #endif
25593 
25594     #if defined(simde_math_floor)
25595       case SIMDE_MM_FROUND_TO_NEG_INF:
25596         r_.f64[0] = simde_math_floor(b_.f64[0]);
25597         break;
25598     #endif
25599 
25600     #if defined(simde_math_ceil)
25601       case SIMDE_MM_FROUND_TO_POS_INF:
25602         r_.f64[0] = simde_math_ceil(b_.f64[0]);
25603         break;
25604     #endif
25605 
25606     #if defined(simde_math_trunc)
25607       case SIMDE_MM_FROUND_TO_ZERO:
25608         r_.f64[0] = simde_math_trunc(b_.f64[0]);
25609         break;
25610     #endif
25611 
25612     default:
25613       HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd());
25614   }
25615 
25616   return simde__m128d_from_private(r_);
25617 }
25618 #if defined(SIMDE_X86_SSE4_1_NATIVE)
25619 #  define simde_mm_round_sd(a, b, rounding) _mm_round_sd(a, b, rounding)
25620 #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128)
25621 #  define simde_mm_round_sd(a, b, rounding) simde_mm_move_sd(a, simde_mm_round_pd(b, rounding))
25622 #endif
25623 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
25624   #undef _mm_round_sd
25625   #define _mm_round_sd(a, b, rounding) simde_mm_round_sd(a, b, rounding)
25626 #endif
25627 
25628 SIMDE_FUNCTION_ATTRIBUTES
25629 simde__m128
25630 simde_mm_round_ss (simde__m128 a, simde__m128 b, int rounding)
25631     SIMDE_REQUIRE_CONSTANT_RANGE(rounding, 0, 15) {
25632   simde__m128_private
25633     r_ = simde__m128_to_private(a),
25634     b_ = simde__m128_to_private(b);
25635 
25636   switch (rounding & ~SIMDE_MM_FROUND_NO_EXC) {
25637     #if defined(simde_math_nearbyintf)
25638       case SIMDE_MM_FROUND_TO_NEAREST_INT:
25639       case SIMDE_MM_FROUND_CUR_DIRECTION:
25640         r_.f32[0] = simde_math_nearbyintf(b_.f32[0]);
25641         break;
25642     #endif
25643 
25644     #if defined(simde_math_floorf)
25645       case SIMDE_MM_FROUND_TO_NEG_INF:
25646         r_.f32[0] = simde_math_floorf(b_.f32[0]);
25647         break;
25648     #endif
25649 
25650     #if defined(simde_math_ceilf)
25651       case SIMDE_MM_FROUND_TO_POS_INF:
25652         r_.f32[0] = simde_math_ceilf(b_.f32[0]);
25653         break;
25654     #endif
25655 
25656     #if defined(simde_math_truncf)
25657       case SIMDE_MM_FROUND_TO_ZERO:
25658         r_.f32[0] = simde_math_truncf(b_.f32[0]);
25659         break;
25660     #endif
25661 
25662     default:
25663       HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_ps());
25664   }
25665 
25666   return simde__m128_from_private(r_);
25667 }
25668 #if defined(SIMDE_X86_SSE4_1_NATIVE)
25669 #  define simde_mm_round_ss(a, b, rounding) _mm_round_ss(a, b, rounding)
25670 #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128)
25671 #  define simde_mm_round_ss(a, b, rounding) simde_mm_move_ss(a, simde_mm_round_ps(b, rounding))
25672 #endif
25673 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
25674   #undef _mm_round_ss
25675   #define _mm_round_ss(a, b, rounding) simde_mm_round_ss(a, b, rounding)
25676 #endif
25677 
25678 SIMDE_FUNCTION_ATTRIBUTES
25679 simde__m128i
25680 simde_mm_stream_load_si128 (const simde__m128i* mem_addr) {
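  /* The non-temporal load hint exists only on x86; the NEON and portable fallbacks are ordinary aligned loads. */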
25681   #if defined(SIMDE_X86_SSE4_1_NATIVE)
25682     return _mm_stream_load_si128(HEDLEY_CONST_CAST(simde__m128i*, mem_addr));
25683   #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
25684     return vreinterpretq_s64_s32(vld1q_s32(HEDLEY_REINTERPRET_CAST(int32_t const*, mem_addr)));
25685   #else
25686     return *mem_addr;
25687   #endif
25688 }
25689 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
25690   #undef _mm_stream_load_si128
25691   #define _mm_stream_load_si128(mem_addr) simde_mm_stream_load_si128(mem_addr)
25692 #endif
25693 
25694 SIMDE_FUNCTION_ATTRIBUTES
25695 int
25696 simde_mm_test_all_ones (simde__m128i a) {
25697   #if defined(SIMDE_X86_SSE4_1_NATIVE)
25698     return _mm_test_all_ones(a);
25699   #else
25700     simde__m128i_private a_ = simde__m128i_to_private(a);
25701     int r;
25702 
25703     #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
25704       r = vec_all_eq(a_.altivec_i32, vec_splats(~0));
25705     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
25706       return r = ((vgetq_lane_s64(a_.neon_i64, 0) & vgetq_lane_s64(a_.neon_i64, 1)) == ~HEDLEY_STATIC_CAST(int64_t, 0));
25707     #else
25708       int_fast32_t r_ = ~HEDLEY_STATIC_CAST(int_fast32_t, 0);
25709 
25710       SIMDE_VECTORIZE_REDUCTION(&:r_)
25711       for (size_t i = 0 ; i < (sizeof(a_.i32f) / sizeof(a_.i32f[0])) ; i++) {
25712         r_ &= a_.i32f[i];
25713       }
25714 
25715       r = (r_ == ~HEDLEY_STATIC_CAST(int_fast32_t, 0));
25716     #endif
25717 
25718     return r;
25719   #endif
25720 }
25721 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
25722   #undef _mm_test_all_ones
25723   #define _mm_test_all_ones(a) simde_mm_test_all_ones(a)
25724 #endif
25725 
25726 SIMDE_FUNCTION_ATTRIBUTES
25727 int
25728 simde_mm_test_all_zeros (simde__m128i a, simde__m128i mask) {
25729   #if defined(SIMDE_X86_SSE4_1_NATIVE)
25730     return _mm_test_all_zeros(a, mask);
25731   #else
25732     simde__m128i_private tmp_ = simde__m128i_to_private(simde_mm_and_si128(a, mask));
25733     int r;
25734 
25735     #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
25736       r = vec_all_eq(tmp_.altivec_i32, vec_splats(0));
25737     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
25738       return !(vgetq_lane_s64(tmp_.neon_i64, 0) | vgetq_lane_s64(tmp_.neon_i64, 1));
25739     #else
25740       int_fast32_t r_ = HEDLEY_STATIC_CAST(int_fast32_t, 0);
25741 
25742       SIMDE_VECTORIZE_REDUCTION(|:r_)
25743       for (size_t i = 0 ; i < (sizeof(tmp_.i32f) / sizeof(tmp_.i32f[0])) ; i++) {
25744         r_ |= tmp_.i32f[i];
25745       }
25746 
25747       r = !r_;
25748     #endif
25749 
25750     return r;
25751   #endif
25752 }
25753 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
25754   #undef _mm_test_all_zeros
25755   #define _mm_test_all_zeros(a, mask) simde_mm_test_all_zeros(a, mask)
25756 #endif
25757 
25758 SIMDE_FUNCTION_ATTRIBUTES
25759 int
25760 simde_mm_test_mix_ones_zeros (simde__m128i a, simde__m128i mask) {
25761   #if defined(SIMDE_X86_SSE4_1_NATIVE)
25762     return _mm_test_mix_ones_zeros(a, mask);
25763   #else
25764     simde__m128i_private
25765       a_ = simde__m128i_to_private(a),
25766       mask_ = simde__m128i_to_private(mask);
25767 
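    /* Returns 1 when the bits selected by mask contain both a set bit and a cleared bit of a
     * (i.e. neither ZF nor CF would be set). */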
25768     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
25769       int64x2_t s640 = vandq_s64(a_.neon_i64, mask_.neon_i64);
25770       int64x2_t s641 = vandq_s64(vreinterpretq_s64_s32(vmvnq_s32(vreinterpretq_s32_s64(a_.neon_i64))), mask_.neon_i64);
25771       return (((vgetq_lane_s64(s640, 0) | vgetq_lane_s64(s640, 1)) & (vgetq_lane_s64(s641, 0) | vgetq_lane_s64(s641, 1)))!=0);
25772     #else
25773       for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++)
25774         if (((a_.u64[i] & mask_.u64[i]) != 0) && ((~a_.u64[i] & mask_.u64[i]) != 0))
25775           return 1;
25776 
25777       return 0;
25778     #endif
25779   #endif
25780 }
25781 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
25782   #undef _mm_test_mix_ones_zeros
25783   #define _mm_test_mix_ones_zeros(a, mask) simde_mm_test_mix_ones_zeros(a, mask)
25784 #endif
25785 
25786 SIMDE_FUNCTION_ATTRIBUTES
25787 int
25788 simde_mm_testc_si128 (simde__m128i a, simde__m128i b) {
25789   #if defined(SIMDE_X86_SSE4_1_NATIVE)
25790     return _mm_testc_si128(a, b);
25791   #else
25792     simde__m128i_private
25793       a_ = simde__m128i_to_private(a),
25794       b_ = simde__m128i_to_private(b);
25795 
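    /* CF semantics: returns 1 when every bit set in b is also set in a, i.e. (~a & b) == 0. */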
25796     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
25797       int64x2_t s64 = vandq_s64(~a_.neon_i64, b_.neon_i64);
25798       return !(vgetq_lane_s64(s64, 0) & vgetq_lane_s64(s64, 1));
25799     #else
25800       int_fast32_t r = 0;
25801 
25802       SIMDE_VECTORIZE_REDUCTION(|:r)
25803       for (size_t i = 0 ; i < (sizeof(a_.i32f) / sizeof(a_.i32f[0])) ; i++) {
25804         r |= ~a_.i32f[i] & b_.i32f[i];
25805       }
25806 
25807       return HEDLEY_STATIC_CAST(int, !r);
25808     #endif
25809   #endif
25810 }
25811 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
25812   #undef _mm_testc_si128
25813   #define _mm_testc_si128(a, b) simde_mm_testc_si128(a, b)
25814 #endif
25815 
25816 SIMDE_FUNCTION_ATTRIBUTES
25817 int
25818 simde_mm_testnzc_si128 (simde__m128i a, simde__m128i b) {
25819   #if defined(SIMDE_X86_SSE4_1_NATIVE)
25820     return _mm_testnzc_si128(a, b);
25821   #else
25822     simde__m128i_private
25823       a_ = simde__m128i_to_private(a),
25824       b_ = simde__m128i_to_private(b);
25825 
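    /* Returns 1 when a & b and ~a & b are both non-zero, i.e. neither ZF nor CF would be set. */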
25826     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
25827       int64x2_t s640 = vandq_s64(a_.neon_i64, b_.neon_i64);
25828       int64x2_t s641 = vandq_s64(~a_.neon_i64, b_.neon_i64);
25829       return (((vgetq_lane_s64(s640, 0) | vgetq_lane_s64(s640, 1)) & (vgetq_lane_s64(s641, 0) | vgetq_lane_s64(s641, 1)))!=0);
25830     #else
25831       for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) {
25832         if (((a_.u64[i] & b_.u64[i]) != 0) && ((~a_.u64[i] & b_.u64[i]) != 0))
25833           return 1;
25834       }
25835 
25836       return 0;
25837     #endif
25838   #endif
25839 }
25840 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
25841   #undef _mm_testnzc_si128
25842   #define _mm_testnzc_si128(a, b) simde_mm_testnzc_si128(a, b)
25843 #endif
25844 
25845 SIMDE_FUNCTION_ATTRIBUTES
25846 int
25847 simde_mm_testz_si128 (simde__m128i a, simde__m128i b) {
25848   #if defined(SIMDE_X86_SSE4_1_NATIVE)
25849     return _mm_testz_si128(a, b);
25850   #else
25851     simde__m128i_private
25852       a_ = simde__m128i_to_private(a),
25853       b_ = simde__m128i_to_private(b);
25854 
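    /* ZF semantics: returns 1 only when a & b is zero across all 128 bits. */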
25855     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
25856       int64x2_t s64 = vandq_s64(a_.neon_i64, b_.neon_i64);
25857       return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
25858     #else
25859       for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) {
25860         if ((a_.u64[i] & b_.u64[i]) != 0)
25861           return 0;
25862       }
25863     #endif
25864 
25865     return 1;
25866   #endif
25867 }
25868 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
25869   #undef _mm_testz_si128
25870   #define _mm_testz_si128(a, b) simde_mm_testz_si128(a, b)
25871 #endif
25872 
25873 SIMDE_END_DECLS_
25874 
25875 HEDLEY_DIAGNOSTIC_POP
25876 
25877 #endif /* !defined(SIMDE_X86_SSE4_1_H) */
25878 /* :: End ../simde/simde/x86/sse4.1.h :: */
25879 
25880 #if defined(__ARM_ACLE) || (defined(__GNUC__) && defined(__ARM_FEATURE_CRC32))
25881   #include <arm_acle.h>
25882 #endif
25883 
25884 HEDLEY_DIAGNOSTIC_PUSH
25885 SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
25886 SIMDE_BEGIN_DECLS_
25887 
25888 #if defined(SIMDE_X86_SSE4_2_NATIVE)
25889   #define SIMDE_SIDD_UBYTE_OPS _SIDD_UBYTE_OPS
25890   #define SIMDE_SIDD_UWORD_OPS _SIDD_UWORD_OPS
25891   #define SIMDE_SIDD_SBYTE_OPS _SIDD_SBYTE_OPS
25892   #define SIMDE_SIDD_SWORD_OPS _SIDD_SWORD_OPS
25893   #define SIMDE_SIDD_CMP_EQUAL_ANY _SIDD_CMP_EQUAL_ANY
25894   #define SIMDE_SIDD_CMP_RANGES _SIDD_CMP_RANGES
25895   #define SIMDE_SIDD_CMP_EQUAL_EACH _SIDD_CMP_EQUAL_EACH
25896   #define SIMDE_SIDD_CMP_EQUAL_ORDERED _SIDD_CMP_EQUAL_ORDERED
25897   #define SIMDE_SIDD_POSITIVE_POLARITY _SIDD_POSITIVE_POLARITY
25898   #define SIMDE_SIDD_NEGATIVE_POLARITY _SIDD_NEGATIVE_POLARITY
25899   #define SIMDE_SIDD_MASKED_POSITIVE_POLARITY _SIDD_MASKED_POSITIVE_POLARITY
25900   #define SIMDE_SIDD_MASKED_NEGATIVE_POLARITY _SIDD_MASKED_NEGATIVE_POLARITY
25901   #define SIMDE_SIDD_LEAST_SIGNIFICANT _SIDD_LEAST_SIGNIFICANT
25902   #define SIMDE_SIDD_MOST_SIGNIFICANT _SIDD_MOST_SIGNIFICANT
25903   #define SIMDE_SIDD_BIT_MASK _SIDD_BIT_MASK
25904   #define SIMDE_SIDD_UNIT_MASK _SIDD_UNIT_MASK
25905 #else
25906   #define SIMDE_SIDD_UBYTE_OPS 0x00
25907   #define SIMDE_SIDD_UWORD_OPS 0x01
25908   #define SIMDE_SIDD_SBYTE_OPS 0x02
25909   #define SIMDE_SIDD_SWORD_OPS 0x03
25910   #define SIMDE_SIDD_CMP_EQUAL_ANY 0x00
25911   #define SIMDE_SIDD_CMP_RANGES 0x04
25912   #define SIMDE_SIDD_CMP_EQUAL_EACH 0x08
25913   #define SIMDE_SIDD_CMP_EQUAL_ORDERED 0x0c
25914   #define SIMDE_SIDD_POSITIVE_POLARITY 0x00
25915   #define SIMDE_SIDD_NEGATIVE_POLARITY 0x10
25916   #define SIMDE_SIDD_MASKED_POSITIVE_POLARITY 0x20
25917   #define SIMDE_SIDD_MASKED_NEGATIVE_POLARITY 0x30
25918   #define SIMDE_SIDD_LEAST_SIGNIFICANT 0x00
25919   #define SIMDE_SIDD_MOST_SIGNIFICANT 0x40
25920   #define SIMDE_SIDD_BIT_MASK 0x00
25921   #define SIMDE_SIDD_UNIT_MASK 0x40
25922 #endif
25923 
25924 #if defined(SIMDE_X86_SSE4_2_ENABLE_NATIVE_ALIASES) && !defined(_SIDD_UBYTE_OPS)
25925   #define _SIDD_UBYTE_OPS SIMDE_SIDD_UBYTE_OPS
25926   #define _SIDD_UWORD_OPS SIMDE_SIDD_UWORD_OPS
25927   #define _SIDD_SBYTE_OPS SIMDE_SIDD_SBYTE_OPS
25928   #define _SIDD_SWORD_OPS SIMDE_SIDD_SWORD_OPS
25929   #define _SIDD_CMP_EQUAL_ANY SIMDE_SIDD_CMP_EQUAL_ANY
25930   #define _SIDD_CMP_RANGES SIMDE_SIDD_CMP_RANGES
25931   #define _SIDD_CMP_EQUAL_EACH SIMDE_SIDD_CMP_EQUAL_EACH
25932   #define _SIDD_CMP_EQUAL_ORDERED SIMDE_SIDD_CMP_EQUAL_ORDERED
25933   #define _SIDD_POSITIVE_POLARITY SIMDE_SIDD_POSITIVE_POLARITY
25934   #define _SIDD_NEGATIVE_POLARITY SIMDE_SIDD_NEGATIVE_POLARITY
25935   #define _SIDD_MASKED_POSITIVE_POLARITY SIMDE_SIDD_MASKED_POSITIVE_POLARITY
25936   #define _SIDD_MASKED_NEGATIVE_POLARITY SIMDE_SIDD_MASKED_NEGATIVE_POLARITY
25937   #define _SIDD_LEAST_SIGNIFICANT SIMDE_SIDD_LEAST_SIGNIFICANT
25938   #define _SIDD_MOST_SIGNIFICANT SIMDE_SIDD_MOST_SIGNIFICANT
25939   #define _SIDD_BIT_MASK SIMDE_SIDD_BIT_MASK
25940   #define _SIDD_UNIT_MASK SIMDE_SIDD_UNIT_MASK
25941 #endif
25942 
25943 SIMDE_FUNCTION_ATTRIBUTES
25944 int simde_mm_cmpestrs (simde__m128i a, int la, simde__m128i b, int lb, const int imm8)
25945     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
25946   #if !defined(HEDLEY_PGI_VERSION)
25947     /* https://www.pgroup.com/userforum/viewtopic.php?f=4&p=27590&sid=cf89f8bf30be801831fe4a2ff0a2fa6c */
25948     (void) a;
25949     (void) b;
25950   #endif
25951   (void) la;
25952   (void) lb;
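  /* SF flag: set when the explicit length la covers fewer elements than the register holds (16 bytes or 8 words). */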
25953   return la <= ((128 / ((imm8 & SIMDE_SIDD_UWORD_OPS) ? 16 : 8)) - 1);
25954 }
25955 #if defined(SIMDE_X86_SSE4_2_NATIVE)
25956   #define simde_mm_cmpestrs(a, la, b, lb, imm8) _mm_cmpestrs(a, la, b, lb, imm8)
25957 #endif
25958 #if defined(SIMDE_X86_SSE4_2_ENABLE_NATIVE_ALIASES)
25959   #undef _mm_cmpestrs
25960   #define _mm_cmpestrs(a, la, b, lb, imm8) simde_mm_cmpestrs(a, la, b, lb, imm8)
25961 #endif
25962 
25963 SIMDE_FUNCTION_ATTRIBUTES
25964 int simde_mm_cmpestrz (simde__m128i a, int la, simde__m128i b, int lb, const int imm8)
25965     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
25966   #if !defined(HEDLEY_PGI_VERSION)
25967     /* https://www.pgroup.com/userforum/viewtopic.php?f=4&p=27590&sid=cf89f8bf30be801831fe4a2ff0a2fa6c */
25968     (void) a;
25969     (void) b;
25970   #endif
25971   (void) la;
25972   (void) lb;
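  /* ZF flag: set when the explicit length lb covers fewer elements than the register holds (16 bytes or 8 words). */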
25973   return lb <= ((128 / ((imm8 & SIMDE_SIDD_UWORD_OPS) ? 16 : 8)) - 1);
25974 }
25975 #if defined(SIMDE_X86_SSE4_2_NATIVE)
25976   #define simde_mm_cmpestrz(a, la, b, lb, imm8) _mm_cmpestrz(a, la, b, lb, imm8)
25977 #endif
25978 #if defined(SIMDE_X86_SSE4_2_ENABLE_NATIVE_ALIASES)
25979   #undef _mm_cmpestrz
25980   #define _mm_cmpestrz(a, la, b, lb, imm8) simde_mm_cmpestrz(a, la, b, lb, imm8)
25981 #endif
25982 
25983 SIMDE_FUNCTION_ATTRIBUTES
25984 simde__m128i
25985 simde_mm_cmpgt_epi64 (simde__m128i a, simde__m128i b) {
25986   #if defined(SIMDE_X86_SSE4_2_NATIVE)
25987     return _mm_cmpgt_epi64(a, b);
25988   #elif defined(SIMDE_X86_SSE2_NATIVE)
25989     /* https://stackoverflow.com/a/65175746/501126 */
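    /* Signed 64-bit greater-than from SSE2 ops: the high dwords decide directly; where they are
     * equal, the borrow out of the low-dword subtraction b - a turns the high dword of the
     * difference into an all-ones or all-zeros mask. The final shuffle broadcasts each
     * high-dword result across its 64-bit lane. */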
25990     __m128i r = _mm_and_si128(_mm_cmpeq_epi32(a, b), _mm_sub_epi64(b, a));
25991     r = _mm_or_si128(r, _mm_cmpgt_epi32(a, b));
25992     return _mm_shuffle_epi32(r, _MM_SHUFFLE(3, 3, 1, 1));
25993   #else
25994     simde__m128i_private
25995       r_,
25996       a_ = simde__m128i_to_private(a),
25997       b_ = simde__m128i_to_private(b);
25998 
25999     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
26000       r_.neon_u64 = vcgtq_s64(a_.neon_i64, b_.neon_i64);
26001     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
26002       /* https://stackoverflow.com/a/65223269/501126 */
26003       r_.neon_i64 = vshrq_n_s64(vqsubq_s64(b_.neon_i64, a_.neon_i64), 63);
26004     #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
26005       r_.altivec_u64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long), vec_cmpgt(a_.altivec_i64, b_.altivec_i64));
26006     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
26007       r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), a_.i64 > b_.i64);
26008     #else
26009       SIMDE_VECTORIZE
26010       for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
26011         r_.i64[i] = (a_.i64[i] > b_.i64[i]) ? ~INT64_C(0) : INT64_C(0);
26012       }
26013     #endif
26014 
26015     return simde__m128i_from_private(r_);
26016   #endif
26017 }
26018 #if defined(SIMDE_X86_SSE4_2_ENABLE_NATIVE_ALIASES)
26019   #undef _mm_cmpgt_epi64
26020   #define _mm_cmpgt_epi64(a, b) simde_mm_cmpgt_epi64(a, b)
26021 #endif
26022 
26023 SIMDE_FUNCTION_ATTRIBUTES
26024 int
26025 simde_mm_cmpistrs_8_(simde__m128i a) {
26026   simde__m128i_private a_= simde__m128i_to_private(a);
26027   const int upper_bound = (128 / 8) - 1;
26028   int a_invalid = 0;
26029   SIMDE_VECTORIZE
26030   for (int i = 0 ; i <= upper_bound ; i++) {
26031     if(!a_.i8[i])
26032       a_invalid = 1;
26033   }
26034   return a_invalid;
26035 }
26036 
26037 SIMDE_FUNCTION_ATTRIBUTES
26038 int
26039 simde_mm_cmpistrs_16_(simde__m128i a) {
26040   simde__m128i_private a_= simde__m128i_to_private(a);
26041   const int upper_bound = (128 / 16) - 1;
26042   int a_invalid = 0;
26043   SIMDE_VECTORIZE
26044   for (int i = 0 ; i <= upper_bound ; i++) {
26045     if(!a_.i16[i])
26046       a_invalid = 1;
26047   }
26048   return a_invalid;
26049 }
26050 
26051 #if defined(SIMDE_X86_SSE4_2_NATIVE)
26052   #define simde_mm_cmpistrs(a, b, imm8) _mm_cmpistrs(a, b, imm8)
26053 #else
26054   #define simde_mm_cmpistrs(a, b, imm8) \
26055      (((imm8) & SIMDE_SIDD_UWORD_OPS) \
26056        ? simde_mm_cmpistrs_16_((a)) \
26057        : simde_mm_cmpistrs_8_((a)))
26058 #endif
26059 #if defined(SIMDE_X86_SSE4_2_ENABLE_NATIVE_ALIASES)
26060   #undef _mm_cmpistrs
26061   #define _mm_cmpistrs(a, b, imm8) simde_mm_cmpistrs(a, b, imm8)
26062 #endif
26063 
26064 SIMDE_FUNCTION_ATTRIBUTES
26065 int
26066 simde_mm_cmpistrz_8_(simde__m128i b) {
26067   simde__m128i_private b_= simde__m128i_to_private(b);
26068   const int upper_bound = (128 / 8) - 1;
26069   int b_invalid = 0;
26070   SIMDE_VECTORIZE
26071   for (int i = 0 ; i <= upper_bound ; i++) {
26072     if(!b_.i8[i])
26073       b_invalid = 1;
26074   }
26075   return b_invalid;
26076 }
26077 
26078 SIMDE_FUNCTION_ATTRIBUTES
26079 int
26080 simde_mm_cmpistrz_16_(simde__m128i b) {
26081   simde__m128i_private b_= simde__m128i_to_private(b);
26082   const int upper_bound = (128 / 16) - 1;
26083   int b_invalid = 0;
26084   SIMDE_VECTORIZE
26085   for (int i = 0 ; i <= upper_bound ; i++) {
26086     if(!b_.i16[i])
26087       b_invalid = 1;
26088   }
26089   return b_invalid;
26090 }
26091 
26092 #if defined(SIMDE_X86_SSE4_2_NATIVE)
26093   #define simde_mm_cmpistrz(a, b, imm8) _mm_cmpistrz(a, b, imm8)
26094 #else
26095   #define simde_mm_cmpistrz(a, b, imm8) \
26096      (((imm8) & SIMDE_SIDD_UWORD_OPS) \
26097        ? simde_mm_cmpistrz_16_((b)) \
26098        : simde_mm_cmpistrz_8_((b)))
26099 #endif
26100 #if defined(SIMDE_X86_SSE4_2_ENABLE_NATIVE_ALIASES)
26101   #undef _mm_cmpistrz
26102   #define _mm_cmpistrz(a, b, imm8) simde_mm_cmpistrz(a, b, imm8)
26103 #endif
26104 
26105 SIMDE_FUNCTION_ATTRIBUTES
26106 uint32_t
26107 simde_mm_crc32_u8(uint32_t prevcrc, uint8_t v) {
26108   #if defined(SIMDE_X86_SSE4_2_NATIVE)
26109     return _mm_crc32_u8(prevcrc, v);
26110   #else
26111     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(__ARM_FEATURE_CRC32)
26112       return __crc32cb(prevcrc, v);
26113     #else
26114       uint32_t crc = prevcrc;
26115       crc ^= v;
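      /* Bitwise CRC-32C (Castagnoli): 0x82f63b78 is the reflected polynomial, processed LSB first. */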
26116       for(int bit = 0 ; bit < 8 ; bit++) {
26117         if (crc & 1)
26118           crc = (crc >> 1) ^ UINT32_C(0x82f63b78);
26119         else
26120           crc = (crc >> 1);
26121       }
26122       return crc;
26123     #endif
26124   #endif
26125 }
26126 #if defined(SIMDE_X86_SSE4_2_ENABLE_NATIVE_ALIASES)
26127   #define _mm_crc32_u8(prevcrc, v) simde_mm_crc32_u8(prevcrc, v)
26128 #endif
26129 
26130 SIMDE_FUNCTION_ATTRIBUTES
26131 uint32_t
26132 simde_mm_crc32_u16(uint32_t prevcrc, uint16_t v) {
26133   #if defined(SIMDE_X86_SSE4_2_NATIVE)
26134     return _mm_crc32_u16(prevcrc, v);
26135   #else
26136     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(__ARM_FEATURE_CRC32)
26137       return __crc32ch(prevcrc, v);
26138     #else
26139       uint32_t crc = prevcrc;
26140       crc = simde_mm_crc32_u8(crc, v & 0xff);
26141       crc = simde_mm_crc32_u8(crc, (v >> 8) & 0xff);
26142       return crc;
26143     #endif
26144   #endif
26145 }
26146 #if defined(SIMDE_X86_SSE4_2_ENABLE_NATIVE_ALIASES)
26147   #define _mm_crc32_u16(prevcrc, v) simde_mm_crc32_u16(prevcrc, v)
26148 #endif
26149 
26150 SIMDE_FUNCTION_ATTRIBUTES
26151 uint32_t
26152 simde_mm_crc32_u32(uint32_t prevcrc, uint32_t v) {
26153   #if defined(SIMDE_X86_SSE4_2_NATIVE)
26154     return _mm_crc32_u32(prevcrc, v);
26155   #else
26156     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(__ARM_FEATURE_CRC32)
26157       return __crc32cw(prevcrc, v);
26158     #else
26159       uint32_t crc = prevcrc;
26160       crc = simde_mm_crc32_u16(crc, v & 0xffff);
26161       crc = simde_mm_crc32_u16(crc, (v >> 16) & 0xffff);
26162       return crc;
26163     #endif
26164   #endif
26165 }
26166 #if defined(SIMDE_X86_SSE4_2_ENABLE_NATIVE_ALIASES)
26167   #define _mm_crc32_u32(prevcrc, v) simde_mm_crc32_u32(prevcrc, v)
26168 #endif
26169 
26170 SIMDE_FUNCTION_ATTRIBUTES
26171 uint64_t
26172 simde_mm_crc32_u64(uint64_t prevcrc, uint64_t v) {
26173   #if defined(SIMDE_X86_SSE4_2_NATIVE) && defined(SIMDE_ARCH_AMD64)
26174     return _mm_crc32_u64(prevcrc, v);
26175   #else
26176     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(__ARM_FEATURE_CRC32)
26177       return __crc32cd(HEDLEY_STATIC_CAST(uint32_t, prevcrc), v);
26178     #else
26179       uint64_t crc = prevcrc;
26180       crc = simde_mm_crc32_u32(HEDLEY_STATIC_CAST(uint32_t, crc), v & 0xffffffff);
26181       crc = simde_mm_crc32_u32(HEDLEY_STATIC_CAST(uint32_t, crc), (v >> 32) & 0xffffffff);
26182       return crc;
26183     #endif
26184   #endif
26185 }
26186 #if defined(SIMDE_X86_SSE4_2_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_AMD64))
26187   #define _mm_crc32_u64(prevcrc, v) simde_mm_crc32_u64(prevcrc, v)
26188 #endif
26189 
26190 SIMDE_END_DECLS_
26191 
26192 HEDLEY_DIAGNOSTIC_POP
26193 
26194 #endif /* !defined(SIMDE_X86_SSE4_2_H) */
26195 /* :: End ../simde/simde/x86/sse4.2.h :: */
26196 
26197 HEDLEY_DIAGNOSTIC_PUSH
26198 SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
26199 SIMDE_BEGIN_DECLS_
26200 
26201 typedef union {
26202   #if defined(SIMDE_VECTOR_SUBSCRIPT)
26203     SIMDE_ALIGN_TO_32 int8_t          i8 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
26204     SIMDE_ALIGN_TO_32 int16_t        i16 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
26205     SIMDE_ALIGN_TO_32 int32_t        i32 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
26206     SIMDE_ALIGN_TO_32 int64_t        i64 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
26207     SIMDE_ALIGN_TO_32 uint8_t         u8 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
26208     SIMDE_ALIGN_TO_32 uint16_t       u16 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
26209     SIMDE_ALIGN_TO_32 uint32_t       u32 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
26210     SIMDE_ALIGN_TO_32 uint64_t       u64 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
26211     #if defined(SIMDE_HAVE_INT128_)
26212     SIMDE_ALIGN_TO_32 simde_int128  i128 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
26213     SIMDE_ALIGN_TO_32 simde_uint128 u128 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
26214     #endif
26215     SIMDE_ALIGN_TO_32 simde_float32  f32 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
26216     SIMDE_ALIGN_TO_32 simde_float64  f64 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
26217     SIMDE_ALIGN_TO_32 int_fast32_t  i32f SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
26218     SIMDE_ALIGN_TO_32 uint_fast32_t u32f SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
26219   #else
26220     SIMDE_ALIGN_TO_32 int8_t          i8[32];
26221     SIMDE_ALIGN_TO_32 int16_t        i16[16];
26222     SIMDE_ALIGN_TO_32 int32_t        i32[8];
26223     SIMDE_ALIGN_TO_32 int64_t        i64[4];
26224     SIMDE_ALIGN_TO_32 uint8_t         u8[32];
26225     SIMDE_ALIGN_TO_32 uint16_t       u16[16];
26226     SIMDE_ALIGN_TO_32 uint32_t       u32[8];
26227     SIMDE_ALIGN_TO_32 uint64_t       u64[4];
26228     SIMDE_ALIGN_TO_32 int_fast32_t  i32f[32 / sizeof(int_fast32_t)];
26229     SIMDE_ALIGN_TO_32 uint_fast32_t u32f[32 / sizeof(uint_fast32_t)];
26230     #if defined(SIMDE_HAVE_INT128_)
26231     SIMDE_ALIGN_TO_32 simde_int128  i128[2];
26232     SIMDE_ALIGN_TO_32 simde_uint128 u128[2];
26233     #endif
26234     SIMDE_ALIGN_TO_32 simde_float32  f32[8];
26235     SIMDE_ALIGN_TO_32 simde_float64  f64[4];
26236   #endif
26237 
26238     SIMDE_ALIGN_TO_32 simde__m128_private m128_private[2];
26239     SIMDE_ALIGN_TO_32 simde__m128         m128[2];
26240 
26241   #if defined(SIMDE_X86_AVX_NATIVE)
26242     SIMDE_ALIGN_TO_32 __m256         n;
26243   #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
26244     SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned char)      altivec_u8[2];
26245     SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned short)     altivec_u16[2];
26246     SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned int)       altivec_u32[2];
26247     SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed char)        altivec_i8[2];
26248     SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed short)       altivec_i16[2];
26249     SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed int)         altivec_i32[2];
26250     SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(float)              altivec_f32[2];
26251     #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
26252       SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long) altivec_u64[2];
26253       SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed long long)   altivec_i64[2];
26254       SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(double)             altivec_f64[2];
26255     #endif
26256   #endif
26257 } simde__m256_private;
26258 
26259 typedef union {
26260   #if defined(SIMDE_VECTOR_SUBSCRIPT)
26261     SIMDE_ALIGN_TO_32 int8_t          i8 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
26262     SIMDE_ALIGN_TO_32 int16_t        i16 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
26263     SIMDE_ALIGN_TO_32 int32_t        i32 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
26264     SIMDE_ALIGN_TO_32 int64_t        i64 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
26265     SIMDE_ALIGN_TO_32 uint8_t         u8 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
26266     SIMDE_ALIGN_TO_32 uint16_t       u16 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
26267     SIMDE_ALIGN_TO_32 uint32_t       u32 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
26268     SIMDE_ALIGN_TO_32 uint64_t       u64 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
26269     #if defined(SIMDE_HAVE_INT128_)
26270     SIMDE_ALIGN_TO_32 simde_int128  i128 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
26271     SIMDE_ALIGN_TO_32 simde_uint128 u128 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
26272     #endif
26273     SIMDE_ALIGN_TO_32 simde_float32  f32 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
26274     SIMDE_ALIGN_TO_32 simde_float64  f64 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
26275     SIMDE_ALIGN_TO_32 int_fast32_t  i32f SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
26276     SIMDE_ALIGN_TO_32 uint_fast32_t u32f SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
26277   #else
26278     SIMDE_ALIGN_TO_32 int8_t          i8[32];
26279     SIMDE_ALIGN_TO_32 int16_t        i16[16];
26280     SIMDE_ALIGN_TO_32 int32_t        i32[8];
26281     SIMDE_ALIGN_TO_32 int64_t        i64[4];
26282     SIMDE_ALIGN_TO_32 uint8_t         u8[32];
26283     SIMDE_ALIGN_TO_32 uint16_t       u16[16];
26284     SIMDE_ALIGN_TO_32 uint32_t       u32[8];
26285     SIMDE_ALIGN_TO_32 uint64_t       u64[4];
26286     #if defined(SIMDE_HAVE_INT128_)
26287     SIMDE_ALIGN_TO_32 simde_int128  i128[2];
26288     SIMDE_ALIGN_TO_32 simde_uint128 u128[2];
26289     #endif
26290     SIMDE_ALIGN_TO_32 simde_float32  f32[8];
26291     SIMDE_ALIGN_TO_32 simde_float64  f64[4];
26292     SIMDE_ALIGN_TO_32 int_fast32_t  i32f[32 / sizeof(int_fast32_t)];
26293     SIMDE_ALIGN_TO_32 uint_fast32_t u32f[32 / sizeof(uint_fast32_t)];
26294   #endif
26295 
26296     SIMDE_ALIGN_TO_32 simde__m128d_private m128d_private[2];
26297     SIMDE_ALIGN_TO_32 simde__m128d         m128d[2];
26298 
26299   #if defined(SIMDE_X86_AVX_NATIVE)
26300     SIMDE_ALIGN_TO_32 __m256d        n;
26301   #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
26302     SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned char)      altivec_u8[2];
26303     SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned short)     altivec_u16[2];
26304     SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned int)       altivec_u32[2];
26305     SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed char)        altivec_i8[2];
26306     SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed short)       altivec_i16[2];
26307     SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed int)         altivec_i32[2];
26308     SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(float)              altivec_f32[2];
26309     #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
26310       SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long) altivec_u64[2];
26311       SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed long long)   altivec_i64[2];
26312       SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(double)             altivec_f64[2];
26313     #endif
26314   #endif
26315 } simde__m256d_private;
26316 
26317 typedef union {
26318   #if defined(SIMDE_VECTOR_SUBSCRIPT)
26319     SIMDE_ALIGN_TO_32 int8_t          i8 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
26320     SIMDE_ALIGN_TO_32 int16_t        i16 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
26321     SIMDE_ALIGN_TO_32 int32_t        i32 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
26322     SIMDE_ALIGN_TO_32 int64_t        i64 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
26323     SIMDE_ALIGN_TO_32 uint8_t         u8 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
26324     SIMDE_ALIGN_TO_32 uint16_t       u16 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
26325     SIMDE_ALIGN_TO_32 uint32_t       u32 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
26326     SIMDE_ALIGN_TO_32 uint64_t       u64 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
26327     #if defined(SIMDE_HAVE_INT128_)
26328     SIMDE_ALIGN_TO_32 simde_int128  i128 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
26329     SIMDE_ALIGN_TO_32 simde_uint128 u128 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
26330     #endif
26331     SIMDE_ALIGN_TO_32 simde_float32  f32 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
26332     SIMDE_ALIGN_TO_32 simde_float64  f64 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
26333     SIMDE_ALIGN_TO_32 int_fast32_t  i32f SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
26334     SIMDE_ALIGN_TO_32 uint_fast32_t u32f SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
26335   #else
26336     SIMDE_ALIGN_TO_32 int8_t          i8[32];
26337     SIMDE_ALIGN_TO_32 int16_t        i16[16];
26338     SIMDE_ALIGN_TO_32 int32_t        i32[8];
26339     SIMDE_ALIGN_TO_32 int64_t        i64[4];
26340     SIMDE_ALIGN_TO_32 uint8_t         u8[32];
26341     SIMDE_ALIGN_TO_32 uint16_t       u16[16];
26342     SIMDE_ALIGN_TO_32 uint32_t       u32[8];
26343     SIMDE_ALIGN_TO_32 uint64_t       u64[4];
26344     SIMDE_ALIGN_TO_32 int_fast32_t  i32f[32 / sizeof(int_fast32_t)];
26345     SIMDE_ALIGN_TO_32 uint_fast32_t u32f[32 / sizeof(uint_fast32_t)];
26346     #if defined(SIMDE_HAVE_INT128_)
26347     SIMDE_ALIGN_TO_32 simde_int128  i128[2];
26348     SIMDE_ALIGN_TO_32 simde_uint128 u128[2];
26349     #endif
26350     SIMDE_ALIGN_TO_32 simde_float32  f32[8];
26351     SIMDE_ALIGN_TO_32 simde_float64  f64[4];
26352   #endif
26353 
26354     SIMDE_ALIGN_TO_32 simde__m128i_private m128i_private[2];
26355     SIMDE_ALIGN_TO_32 simde__m128i         m128i[2];
26356 
26357   #if defined(SIMDE_X86_AVX_NATIVE)
26358     SIMDE_ALIGN_TO_32 __m256i        n;
26359   #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
26360     SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned char)      altivec_u8[2];
26361     SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned short)     altivec_u16[2];
26362     SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned int)       altivec_u32[2];
26363     SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed char)        altivec_i8[2];
26364     SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed short)       altivec_i16[2];
26365     SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed int)         altivec_i32[2];
26366     SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(float)              altivec_f32[2];
26367     #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
26368       SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long) altivec_u64[2];
26369       SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed long long)   altivec_i64[2];
26370       SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(double)             altivec_f64[2];
26371     #endif
26372   #endif
26373 } simde__m256i_private;
26374 
26375 #if defined(SIMDE_X86_AVX_NATIVE)
26376   typedef __m256 simde__m256;
26377   typedef __m256i simde__m256i;
26378   typedef __m256d simde__m256d;
26379 #elif defined(SIMDE_VECTOR_SUBSCRIPT)
26380   typedef simde_float32 simde__m256  SIMDE_ALIGN_TO_32 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
26381   typedef int_fast32_t  simde__m256i SIMDE_ALIGN_TO_32 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
26382   typedef simde_float64 simde__m256d SIMDE_ALIGN_TO_32 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
26383 #else
26384   typedef simde__m256_private  simde__m256;
26385   typedef simde__m256i_private simde__m256i;
26386   typedef simde__m256d_private simde__m256d;
26387 #endif
26388 
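/* When native aliases are requested, expose the SIMDe types under the standard
 * __m256/__m256i/__m256d names: as typedefs when the system AVX header has not
 * already defined them, or via macro replacement when it has. */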
26389 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
26390   #if !defined(HEDLEY_INTEL_VERSION) && !defined(_AVXINTRIN_H_INCLUDED) && !defined(__AVXINTRIN_H) && !defined(_CMP_EQ_OQ)
26391     typedef simde__m256 __m256;
26392     typedef simde__m256i __m256i;
26393     typedef simde__m256d __m256d;
26394   #else
26395     #undef __m256
26396     #define __m256 simde__m256
26397     #undef __m256i
26398     #define __m256i simde__m256i
26399     #undef __m256d
26400     #define __m256d simde__m256d
26401   #endif
26402 #endif
26403 
26404 HEDLEY_STATIC_ASSERT(32 == sizeof(simde__m256), "simde__m256 size incorrect");
26405 HEDLEY_STATIC_ASSERT(32 == sizeof(simde__m256_private), "simde__m256_private size incorrect");
26406 HEDLEY_STATIC_ASSERT(32 == sizeof(simde__m256i), "simde__m256i size incorrect");
26407 HEDLEY_STATIC_ASSERT(32 == sizeof(simde__m256i_private), "simde__m256i_private size incorrect");
26408 HEDLEY_STATIC_ASSERT(32 == sizeof(simde__m256d), "simde__m256d size incorrect");
26409 HEDLEY_STATIC_ASSERT(32 == sizeof(simde__m256d_private), "simde__m256d_private size incorrect");
26410 #if defined(SIMDE_CHECK_ALIGNMENT) && defined(SIMDE_ALIGN_OF)
26411 HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m256) == 32, "simde__m256 is not 32-byte aligned");
26412 HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m256_private) == 32, "simde__m256_private is not 32-byte aligned");
26413 HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m256i) == 32, "simde__m256i is not 32-byte aligned");
26414 HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m256i_private) == 32, "simde__m256i_private is not 32-byte aligned");
26415 HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m256d) == 32, "simde__m256d is not 32-byte aligned");
26416 HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m256d_private) == 32, "simde__m256d_private is not 32-byte aligned");
26417 #endif
26418 
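/* Conversions between the public types and the private unions go through
 * simde_memcpy so the compiler treats them as value copies rather than
 * aliasing casts; for these small fixed sizes they are expected to compile
 * down to plain register moves. */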
26419 SIMDE_FUNCTION_ATTRIBUTES
26420 simde__m256
26421 simde__m256_from_private(simde__m256_private v) {
26422   simde__m256 r;
26423   simde_memcpy(&r, &v, sizeof(r));
26424   return r;
26425 }
26426 
26427 SIMDE_FUNCTION_ATTRIBUTES
26428 simde__m256_private
26429 simde__m256_to_private(simde__m256 v) {
26430   simde__m256_private r;
26431   simde_memcpy(&r, &v, sizeof(r));
26432   return r;
26433 }
26434 
26435 SIMDE_FUNCTION_ATTRIBUTES
26436 simde__m256i
26437 simde__m256i_from_private(simde__m256i_private v) {
26438   simde__m256i r;
26439   simde_memcpy(&r, &v, sizeof(r));
26440   return r;
26441 }
26442 
26443 SIMDE_FUNCTION_ATTRIBUTES
26444 simde__m256i_private
26445 simde__m256i_to_private(simde__m256i v) {
26446   simde__m256i_private r;
26447   simde_memcpy(&r, &v, sizeof(r));
26448   return r;
26449 }
26450 
26451 SIMDE_FUNCTION_ATTRIBUTES
26452 simde__m256d
26453 simde__m256d_from_private(simde__m256d_private v) {
26454   simde__m256d r;
26455   simde_memcpy(&r, &v, sizeof(r));
26456   return r;
26457 }
26458 
26459 SIMDE_FUNCTION_ATTRIBUTES
26460 simde__m256d_private
26461 simde__m256d_to_private(simde__m256d v) {
26462   simde__m256d_private r;
26463   simde_memcpy(&r, &v, sizeof(r));
26464   return r;
26465 }
26466 
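/* Comparison predicates for the _mm256_cmp_ps/_mm256_cmp_pd family.  The
 * suffix encodes NaN handling: O = ordered (false when either operand is NaN),
 * U = unordered (true when either operand is NaN), and S/Q = signaling/quiet
 * (whether a quiet NaN operand raises the invalid-operation exception). */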
26467 #define SIMDE_CMP_EQ_OQ     0
26468 #define SIMDE_CMP_LT_OS     1
26469 #define SIMDE_CMP_LE_OS     2
26470 #define SIMDE_CMP_UNORD_Q   3
26471 #define SIMDE_CMP_NEQ_UQ    4
26472 #define SIMDE_CMP_NLT_US    5
26473 #define SIMDE_CMP_NLE_US    6
26474 #define SIMDE_CMP_ORD_Q     7
26475 #define SIMDE_CMP_EQ_UQ     8
26476 #define SIMDE_CMP_NGE_US    9
26477 #define SIMDE_CMP_NGT_US   10
26478 #define SIMDE_CMP_FALSE_OQ 11
26479 #define SIMDE_CMP_NEQ_OQ   12
26480 #define SIMDE_CMP_GE_OS    13
26481 #define SIMDE_CMP_GT_OS    14
26482 #define SIMDE_CMP_TRUE_UQ  15
26483 #define SIMDE_CMP_EQ_OS    16
26484 #define SIMDE_CMP_LT_OQ    17
26485 #define SIMDE_CMP_LE_OQ    18
26486 #define SIMDE_CMP_UNORD_S  19
26487 #define SIMDE_CMP_NEQ_US   20
26488 #define SIMDE_CMP_NLT_UQ   21
26489 #define SIMDE_CMP_NLE_UQ   22
26490 #define SIMDE_CMP_ORD_S    23
26491 #define SIMDE_CMP_EQ_US    24
26492 #define SIMDE_CMP_NGE_UQ   25
26493 #define SIMDE_CMP_NGT_UQ   26
26494 #define SIMDE_CMP_FALSE_OS 27
26495 #define SIMDE_CMP_NEQ_OS   28
26496 #define SIMDE_CMP_GE_OQ    29
26497 #define SIMDE_CMP_GT_OQ    30
26498 #define SIMDE_CMP_TRUE_US  31
26499 
26500 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) && !defined(_CMP_EQ_OQ)
26501 #define _CMP_EQ_OQ SIMDE_CMP_EQ_OQ
26502 #define _CMP_LT_OS SIMDE_CMP_LT_OS
26503 #define _CMP_LE_OS SIMDE_CMP_LE_OS
26504 #define _CMP_UNORD_Q SIMDE_CMP_UNORD_Q
26505 #define _CMP_NEQ_UQ SIMDE_CMP_NEQ_UQ
26506 #define _CMP_NLT_US SIMDE_CMP_NLT_US
26507 #define _CMP_NLE_US SIMDE_CMP_NLE_US
26508 #define _CMP_ORD_Q SIMDE_CMP_ORD_Q
26509 #define _CMP_EQ_UQ SIMDE_CMP_EQ_UQ
26510 #define _CMP_NGE_US SIMDE_CMP_NGE_US
26511 #define _CMP_NGT_US SIMDE_CMP_NGT_US
26512 #define _CMP_FALSE_OQ SIMDE_CMP_FALSE_OQ
26513 #define _CMP_NEQ_OQ SIMDE_CMP_NEQ_OQ
26514 #define _CMP_GE_OS SIMDE_CMP_GE_OS
26515 #define _CMP_GT_OS SIMDE_CMP_GT_OS
26516 #define _CMP_TRUE_UQ SIMDE_CMP_TRUE_UQ
26517 #define _CMP_EQ_OS SIMDE_CMP_EQ_OS
26518 #define _CMP_LT_OQ SIMDE_CMP_LT_OQ
26519 #define _CMP_LE_OQ SIMDE_CMP_LE_OQ
26520 #define _CMP_UNORD_S SIMDE_CMP_UNORD_S
26521 #define _CMP_NEQ_US SIMDE_CMP_NEQ_US
26522 #define _CMP_NLT_UQ SIMDE_CMP_NLT_UQ
26523 #define _CMP_NLE_UQ SIMDE_CMP_NLE_UQ
26524 #define _CMP_ORD_S SIMDE_CMP_ORD_S
26525 #define _CMP_EQ_US SIMDE_CMP_EQ_US
26526 #define _CMP_NGE_UQ SIMDE_CMP_NGE_UQ
26527 #define _CMP_NGT_UQ SIMDE_CMP_NGT_UQ
26528 #define _CMP_FALSE_OS SIMDE_CMP_FALSE_OS
26529 #define _CMP_NEQ_OS SIMDE_CMP_NEQ_OS
26530 #define _CMP_GE_OQ SIMDE_CMP_GE_OQ
26531 #define _CMP_GT_OQ SIMDE_CMP_GT_OQ
26532 #define _CMP_TRUE_US SIMDE_CMP_TRUE_US
26533 #endif
26534 
26535 SIMDE_FUNCTION_ATTRIBUTES
26536 simde__m256d
26537 simde_mm256_castps_pd (simde__m256 a) {
26538   #if defined(SIMDE_X86_AVX_NATIVE)
26539     return _mm256_castps_pd(a);
26540   #else
26541     return *HEDLEY_REINTERPRET_CAST(simde__m256d*, &a);
26542   #endif
26543 }
26544 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
26545   #undef _mm256_castps_pd
26546   #define _mm256_castps_pd(a) simde_mm256_castps_pd(a)
26547 #endif
26548 
26549 SIMDE_FUNCTION_ATTRIBUTES
26550 simde__m256i
26551 simde_mm256_castps_si256 (simde__m256 a) {
26552   #if defined(SIMDE_X86_AVX_NATIVE)
26553     return _mm256_castps_si256(a);
26554   #else
26555     return *HEDLEY_REINTERPRET_CAST(simde__m256i*, &a);
26556   #endif
26557 }
26558 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
26559   #undef _mm256_castps_si256
26560   #define _mm256_castps_si256(a) simde_mm256_castps_si256(a)
26561 #endif
26562 
26563 SIMDE_FUNCTION_ATTRIBUTES
26564 simde__m256d
26565 simde_mm256_castsi256_pd (simde__m256i a) {
26566   #if defined(SIMDE_X86_AVX_NATIVE)
26567     return _mm256_castsi256_pd(a);
26568   #else
26569     return *HEDLEY_REINTERPRET_CAST(simde__m256d*, &a);
26570   #endif
26571 }
26572 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
26573   #undef _mm256_castsi256_pd
26574   #define _mm256_castsi256_pd(a) simde_mm256_castsi256_pd(a)
26575 #endif
26576 
26577 SIMDE_FUNCTION_ATTRIBUTES
26578 simde__m256
26579 simde_mm256_castsi256_ps (simde__m256i a) {
26580   #if defined(SIMDE_X86_AVX_NATIVE)
26581     return _mm256_castsi256_ps(a);
26582   #else
26583     return *HEDLEY_REINTERPRET_CAST(simde__m256*, &a);
26584   #endif
26585 }
26586 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
26587   #undef _mm256_castsi256_ps
26588   #define _mm256_castsi256_ps(a) simde_mm256_castsi256_ps(a)
26589 #endif
26590 
26591 SIMDE_FUNCTION_ATTRIBUTES
26592 simde__m256
26593 simde_mm256_castpd_ps (simde__m256d a) {
26594   #if defined(SIMDE_X86_AVX_NATIVE)
26595     return _mm256_castpd_ps(a);
26596   #else
26597     return *HEDLEY_REINTERPRET_CAST(simde__m256*, &a);
26598   #endif
26599 }
26600 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
26601   #undef _mm256_castpd_ps
26602   #define _mm256_castpd_ps(a) simde_mm256_castpd_ps(a)
26603 #endif
26604 
26605 SIMDE_FUNCTION_ATTRIBUTES
26606 simde__m256i
26607 simde_mm256_castpd_si256 (simde__m256d a) {
26608   #if defined(SIMDE_X86_AVX_NATIVE)
26609     return _mm256_castpd_si256(a);
26610   #else
26611     return *HEDLEY_REINTERPRET_CAST(simde__m256i*, &a);
26612   #endif
26613 }
26614 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
26615   #undef _mm256_castpd_si256
26616   #define _mm256_castpd_si256(a) simde_mm256_castpd_si256(a)
26617 #endif
26618 
26619 SIMDE_FUNCTION_ATTRIBUTES
26620 simde__m256i
26621 simde_mm256_setzero_si256 (void) {
26622   #if defined(SIMDE_X86_AVX_NATIVE)
26623     return _mm256_setzero_si256();
26624   #else
26625     simde__m256i_private r_;
26626 
26627     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
26628       r_.m128i[0] = simde_mm_setzero_si128();
26629       r_.m128i[1] = simde_mm_setzero_si128();
26630     #else
26631       SIMDE_VECTORIZE
26632       for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
26633         r_.i32f[i] = 0;
26634       }
26635     #endif
26636 
26637     return simde__m256i_from_private(r_);
26638   #endif
26639 }
26640 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
26641   #undef _mm256_setzero_si256
26642   #define _mm256_setzero_si256() simde_mm256_setzero_si256()
26643 #endif
26644 
26645 SIMDE_FUNCTION_ATTRIBUTES
26646 simde__m256
26647 simde_mm256_setzero_ps (void) {
26648   #if defined(SIMDE_X86_AVX_NATIVE)
26649     return _mm256_setzero_ps();
26650   #else
26651     return simde_mm256_castsi256_ps(simde_mm256_setzero_si256());
26652   #endif
26653 }
26654 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
26655   #undef _mm256_setzero_ps
26656   #define _mm256_setzero_ps() simde_mm256_setzero_ps()
26657 #endif
26658 
26659 SIMDE_FUNCTION_ATTRIBUTES
26660 simde__m256d
26661 simde_mm256_setzero_pd (void) {
26662   #if defined(SIMDE_X86_AVX_NATIVE)
26663     return _mm256_setzero_pd();
26664   #else
26665     return simde_mm256_castsi256_pd(simde_mm256_setzero_si256());
26666   #endif
26667 }
26668 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
26669   #undef _mm256_setzero_pd
26670   #define _mm256_setzero_pd() simde_mm256_setzero_pd()
26671 #endif
26672 
26673 SIMDE_FUNCTION_ATTRIBUTES
26674 simde__m256
26675 simde_x_mm256_not_ps(simde__m256 a) {
26676   simde__m256_private
26677     r_,
26678     a_ = simde__m256_to_private(a);
26679 
26680   #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
26681     r_.i32 = ~a_.i32;
26682   #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128)
26683     r_.m128[0] = simde_x_mm_not_ps(a_.m128[0]);
26684     r_.m128[1] = simde_x_mm_not_ps(a_.m128[1]);
26685   #else
26686     SIMDE_VECTORIZE
26687     for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
26688       r_.i32[i] = ~(a_.i32[i]);
26689     }
26690   #endif
26691 
26692   return simde__m256_from_private(r_);
26693 }
26694 
26695 SIMDE_FUNCTION_ATTRIBUTES
26696 simde__m256
26697 simde_x_mm256_select_ps(simde__m256 a, simde__m256 b, simde__m256 mask) {
26698   /* This function is for when you want to blend two vectors together, lane by lane,
26699    * according to a mask.  It is similar to _mm256_blendv_ps, except that
26700    * it is undefined whether the blend is based on the highest bit in
26701    * each lane (like blendv) or just bitwise operations.  This allows
26702    * us to implement the function efficiently everywhere.
26703    *
26704    * Basically, you promise that all the lanes in mask are either 0 or
26705    * ~0. */
26706   #if defined(SIMDE_X86_AVX_NATIVE)
26707     return _mm256_blendv_ps(a, b, mask);
26708   #else
26709     simde__m256_private
26710       r_,
26711       a_ = simde__m256_to_private(a),
26712       b_ = simde__m256_to_private(b),
26713       mask_ = simde__m256_to_private(mask);
26714 
26715     #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
26716       r_.i32 = a_.i32 ^ ((a_.i32 ^ b_.i32) & mask_.i32);
26717     #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128)
26718       r_.m128[0] = simde_x_mm_select_ps(a_.m128[0], b_.m128[0], mask_.m128[0]);
26719       r_.m128[1] = simde_x_mm_select_ps(a_.m128[1], b_.m128[1], mask_.m128[1]);
26720     #else
26721       SIMDE_VECTORIZE
26722       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
26723         r_.i32[i] = a_.i32[i] ^ ((a_.i32[i] ^ b_.i32[i]) & mask_.i32[i]);
26724       }
26725     #endif
26726 
26727     return simde__m256_from_private(r_);
26728   #endif
26729 }
26730 
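/* A minimal usage sketch (illustration only; simde_x_mm256_select_ps_demo_ is
 * not part of SIMDe): given a mask whose 32-bit lanes are each either all
 * zeros or all ones -- e.g. one produced later in this header by
 * simde_mm256_cmp_ps -- the select picks lanes from `b` where the mask is set
 * and from `a` elsewhere, i.e. the branch-free merge a ^ ((a ^ b) & mask).
 * The _pd variant below behaves the same way on 64-bit lanes. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m256
simde_x_mm256_select_ps_demo_ (simde__m256 a, simde__m256 b, simde__m256 all_or_nothing_mask) {
  /* Every lane of all_or_nothing_mask must be 0 or ~0; other values are undefined here. */
  return simde_x_mm256_select_ps(a, b, all_or_nothing_mask);
}
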
26731 SIMDE_FUNCTION_ATTRIBUTES
26732 simde__m256d
26733 simde_x_mm256_not_pd(simde__m256d a) {
26734   simde__m256d_private
26735     r_,
26736     a_ = simde__m256d_to_private(a);
26737 
26738   #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
26739     r_.i64 = ~a_.i64;
26740   #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128)
26741     r_.m128d[0] = simde_x_mm_not_pd(a_.m128d[0]);
26742     r_.m128d[1] = simde_x_mm_not_pd(a_.m128d[1]);
26743   #else
26744     SIMDE_VECTORIZE
26745     for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
26746       r_.i64[i] = ~(a_.i64[i]);
26747     }
26748   #endif
26749 
26750   return simde__m256d_from_private(r_);
26751 }
26752 
26753 SIMDE_FUNCTION_ATTRIBUTES
26754 simde__m256d
26755 simde_x_mm256_select_pd(simde__m256d a, simde__m256d b, simde__m256d mask) {
26756   /* This function is for when you want to blend two vectors together, lane by lane,
26757    * according to a mask.  It is similar to _mm256_blendv_pd, except that
26758    * it is undefined whether the blend is based on the highest bit in
26759    * each lane (like blendv) or just bitwise operations.  This allows
26760    * us to implement the function efficiently everywhere.
26761    *
26762    * Basically, you promise that all the lanes in mask are either 0 or
26763    * ~0. */
26764   #if defined(SIMDE_X86_AVX_NATIVE)
26765     return _mm256_blendv_pd(a, b, mask);
26766   #else
26767     simde__m256d_private
26768       r_,
26769       a_ = simde__m256d_to_private(a),
26770       b_ = simde__m256d_to_private(b),
26771       mask_ = simde__m256d_to_private(mask);
26772 
26773     #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
26774       r_.i64 = a_.i64 ^ ((a_.i64 ^ b_.i64) & mask_.i64);
26775     #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128)
26776       r_.m128d[0] = simde_x_mm_select_pd(a_.m128d[0], b_.m128d[0], mask_.m128d[0]);
26777       r_.m128d[1] = simde_x_mm_select_pd(a_.m128d[1], b_.m128d[1], mask_.m128d[1]);
26778     #else
26779       SIMDE_VECTORIZE
26780       for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
26781         r_.i64[i] = a_.i64[i] ^ ((a_.i64[i] ^ b_.i64[i]) & mask_.i64[i]);
26782       }
26783     #endif
26784 
26785     return simde__m256d_from_private(r_);
26786   #endif
26787 }
26788 
26789 SIMDE_FUNCTION_ATTRIBUTES
26790 simde__m256i
26791 simde_x_mm256_setone_si256 (void) {
26792   simde__m256i_private r_;
26793 
26794 #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
26795   __typeof__(r_.i32f) rv = { 0, };
26796   r_.i32f = ~rv;
26797 #elif defined(SIMDE_X86_AVX2_NATIVE)
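  /* Comparing a register with itself makes every lane compare equal, so cmpeq yields all ones. */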
26798   __m256i t = _mm256_setzero_si256();
26799   r_.n = _mm256_cmpeq_epi32(t, t);
26800 #else
26801   SIMDE_VECTORIZE
26802   for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
26803     r_.i32f[i] = ~HEDLEY_STATIC_CAST(int_fast32_t, 0);
26804   }
26805 #endif
26806 
26807   return simde__m256i_from_private(r_);
26808 }
26809 
26810 SIMDE_FUNCTION_ATTRIBUTES
26811 simde__m256
26812 simde_x_mm256_setone_ps (void) {
26813   return simde_mm256_castsi256_ps(simde_x_mm256_setone_si256());
26814 }
26815 
26816 SIMDE_FUNCTION_ATTRIBUTES
26817 simde__m256d
26818 simde_x_mm256_setone_pd (void) {
26819   return simde_mm256_castsi256_pd(simde_x_mm256_setone_si256());
26820 }
26821 
26822 SIMDE_FUNCTION_ATTRIBUTES
26823 simde__m256i
26824 simde_mm256_set_epi8 (int8_t e31, int8_t e30, int8_t e29, int8_t e28,
26825                       int8_t e27, int8_t e26, int8_t e25, int8_t e24,
26826                       int8_t e23, int8_t e22, int8_t e21, int8_t e20,
26827                       int8_t e19, int8_t e18, int8_t e17, int8_t e16,
26828                       int8_t e15, int8_t e14, int8_t e13, int8_t e12,
26829                       int8_t e11, int8_t e10, int8_t  e9, int8_t  e8,
26830                       int8_t  e7, int8_t  e6, int8_t  e5, int8_t  e4,
26831                       int8_t  e3, int8_t  e2, int8_t  e1, int8_t  e0) {
26832   #if defined(SIMDE_X86_AVX_NATIVE)
26833     return _mm256_set_epi8(e31, e30, e29, e28, e27, e26, e25, e24,
26834                            e23, e22, e21, e20, e19, e18, e17, e16,
26835                            e15, e14, e13, e12, e11, e10,  e9,  e8,
26836                             e7,  e6,  e5,  e4,  e3,  e2,  e1,  e0);
26837   #else
26838     simde__m256i_private r_;
26839 
26840     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
26841       r_.m128i[0] = simde_mm_set_epi8(
26842         e15, e14, e13, e12, e11, e10,  e9,  e8,
26843         e7,  e6,  e5,  e4,  e3,  e2,  e1,  e0);
26844       r_.m128i[1] = simde_mm_set_epi8(
26845         e31, e30, e29, e28, e27, e26, e25, e24,
26846         e23, e22, e21, e20, e19, e18, e17, e16);
26847     #else
26848       r_.i8[ 0] =  e0;
26849       r_.i8[ 1] =  e1;
26850       r_.i8[ 2] =  e2;
26851       r_.i8[ 3] =  e3;
26852       r_.i8[ 4] =  e4;
26853       r_.i8[ 5] =  e5;
26854       r_.i8[ 6] =  e6;
26855       r_.i8[ 7] =  e7;
26856       r_.i8[ 8] =  e8;
26857       r_.i8[ 9] =  e9;
26858       r_.i8[10] = e10;
26859       r_.i8[11] = e11;
26860       r_.i8[12] = e12;
26861       r_.i8[13] = e13;
26862       r_.i8[14] = e14;
26863       r_.i8[15] = e15;
26864       r_.i8[16] = e16;
26865       r_.i8[17] = e17;
26866       r_.i8[18] = e18;
26867       r_.i8[19] = e19;
26868       r_.i8[20] = e20;
26869       r_.i8[21] = e21;
26870       r_.i8[22] = e22;
26871       r_.i8[23] = e23;
26872       r_.i8[24] = e24;
26873       r_.i8[25] = e25;
26874       r_.i8[26] = e26;
26875       r_.i8[27] = e27;
26876       r_.i8[28] = e28;
26877       r_.i8[29] = e29;
26878       r_.i8[30] = e30;
26879       r_.i8[31] = e31;
26880     #endif
26881 
26882     return simde__m256i_from_private(r_);
26883   #endif
26884 }
26885 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
26886   #undef _mm256_set_epi8
26887   #define _mm256_set_epi8(e31, e30, e29, e28, e27, e26, e25, e24, e23, e22, e21, e20, e19, e18, e17, e16, e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0) \
26888   simde_mm256_set_epi8(e31, e30, e29, e28, e27, e26, e25, e24, e23, e22, e21, e20, e19, e18, e17, e16, e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0)
26889 #endif
26890 
26891 SIMDE_FUNCTION_ATTRIBUTES
26892 simde__m256i
26893 simde_mm256_set_epi16 (int16_t e15, int16_t e14, int16_t e13, int16_t e12,
26894                        int16_t e11, int16_t e10, int16_t  e9, int16_t  e8,
26895                        int16_t  e7, int16_t  e6, int16_t  e5, int16_t  e4,
26896                        int16_t  e3, int16_t  e2, int16_t  e1, int16_t  e0) {
26897   #if defined(SIMDE_X86_AVX_NATIVE)
26898     return _mm256_set_epi16(e15, e14, e13, e12, e11, e10,  e9,  e8,
26899                             e7,  e6,  e5,  e4,  e3,  e2,  e1,  e0);
26900   #else
26901     simde__m256i_private r_;
26902 
26903     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
26904       r_.m128i[0] = simde_mm_set_epi16( e7,  e6,  e5,  e4,  e3,  e2,  e1,  e0);
26905       r_.m128i[1] = simde_mm_set_epi16(e15, e14, e13, e12, e11, e10,  e9,  e8);
26906     #else
26907       r_.i16[ 0] =  e0;
26908       r_.i16[ 1] =  e1;
26909       r_.i16[ 2] =  e2;
26910       r_.i16[ 3] =  e3;
26911       r_.i16[ 4] =  e4;
26912       r_.i16[ 5] =  e5;
26913       r_.i16[ 6] =  e6;
26914       r_.i16[ 7] =  e7;
26915       r_.i16[ 8] =  e8;
26916       r_.i16[ 9] =  e9;
26917       r_.i16[10] = e10;
26918       r_.i16[11] = e11;
26919       r_.i16[12] = e12;
26920       r_.i16[13] = e13;
26921       r_.i16[14] = e14;
26922       r_.i16[15] = e15;
26923     #endif
26924 
26925     return simde__m256i_from_private(r_);
26926   #endif
26927 }
26928 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
26929   #undef _mm256_set_epi16
26930   #define _mm256_set_epi16(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0) \
26931   simde_mm256_set_epi16(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0)
26932 #endif
26933 
26934 SIMDE_FUNCTION_ATTRIBUTES
26935 simde__m256i
26936 simde_mm256_set_epi32 (int32_t e7, int32_t e6, int32_t e5, int32_t e4,
26937                        int32_t e3, int32_t e2, int32_t e1, int32_t e0) {
26938   #if defined(SIMDE_X86_AVX_NATIVE)
26939     return _mm256_set_epi32(e7, e6, e5, e4, e3, e2, e1, e0);
26940   #else
26941     simde__m256i_private r_;
26942 
26943     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
26944       r_.m128i[0] = simde_mm_set_epi32(e3, e2, e1, e0);
26945       r_.m128i[1] = simde_mm_set_epi32(e7, e6, e5, e4);
26946     #else
26947       r_.i32[ 0] =  e0;
26948       r_.i32[ 1] =  e1;
26949       r_.i32[ 2] =  e2;
26950       r_.i32[ 3] =  e3;
26951       r_.i32[ 4] =  e4;
26952       r_.i32[ 5] =  e5;
26953       r_.i32[ 6] =  e6;
26954       r_.i32[ 7] =  e7;
26955     #endif
26956 
26957     return simde__m256i_from_private(r_);
26958   #endif
26959 }
26960 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
26961   #undef _mm256_set_epi32
26962   #define _mm256_set_epi32(e7, e6, e5, e4, e3, e2, e1, e0) \
26963   simde_mm256_set_epi32(e7, e6, e5, e4, e3, e2, e1, e0)
26964 #endif
26965 
26966 SIMDE_FUNCTION_ATTRIBUTES
26967 simde__m256i
26968 simde_mm256_set_epi64x (int64_t  e3, int64_t  e2, int64_t  e1, int64_t  e0) {
26969   #if defined(SIMDE_X86_AVX_NATIVE)
26970     return _mm256_set_epi64x(e3, e2, e1, e0);
26971   #else
26972     simde__m256i_private r_;
26973 
26974     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
26975       r_.m128i[0] = simde_mm_set_epi64x(e1, e0);
26976       r_.m128i[1] = simde_mm_set_epi64x(e3, e2);
26977     #else
26978       r_.i64[0] = e0;
26979       r_.i64[1] = e1;
26980       r_.i64[2] = e2;
26981       r_.i64[3] = e3;
26982     #endif
26983 
26984     return simde__m256i_from_private(r_);
26985   #endif
26986 }
26987 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
26988   #undef _mm256_set_epi64x
26989   #define _mm256_set_epi64x(e3, e2, e1, e0) simde_mm256_set_epi64x(e3, e2, e1, e0)
26990 #endif
26991 
26992 SIMDE_FUNCTION_ATTRIBUTES
26993 simde__m256i
26994 simde_x_mm256_set_epu8 (uint8_t e31, uint8_t e30, uint8_t e29, uint8_t e28,
26995                         uint8_t e27, uint8_t e26, uint8_t e25, uint8_t e24,
26996                         uint8_t e23, uint8_t e22, uint8_t e21, uint8_t e20,
26997                         uint8_t e19, uint8_t e18, uint8_t e17, uint8_t e16,
26998                         uint8_t e15, uint8_t e14, uint8_t e13, uint8_t e12,
26999                         uint8_t e11, uint8_t e10, uint8_t  e9, uint8_t  e8,
27000                         uint8_t  e7, uint8_t  e6, uint8_t  e5, uint8_t  e4,
27001                         uint8_t  e3, uint8_t  e2, uint8_t  e1, uint8_t  e0) {
27002   simde__m256i_private r_;
27003 
27004   r_.u8[ 0] =  e0;
27005   r_.u8[ 1] =  e1;
27006   r_.u8[ 2] =  e2;
27007   r_.u8[ 3] =  e3;
27008   r_.u8[ 4] =  e4;
27009   r_.u8[ 5] =  e5;
27010   r_.u8[ 6] =  e6;
27011   r_.u8[ 7] =  e7;
27012   r_.u8[ 8] =  e8;
27013   r_.u8[ 9] =  e9;
27014   r_.u8[10] = e10;
27015   r_.u8[11] = e11;
27016   r_.u8[12] = e12;
27017   r_.u8[13] = e13;
27018   r_.u8[14] = e14;
27019   r_.u8[15] = e15;
27020   r_.u8[16] = e16;
27021   r_.u8[17] = e17;
27022   r_.u8[18] = e18;
27023   r_.u8[19] = e19;
27024   r_.u8[20] = e20;
27026   r_.u8[21] = e21;
27027   r_.u8[22] = e22;
27028   r_.u8[23] = e23;
27029   r_.u8[24] = e24;
27030   r_.u8[25] = e25;
27031   r_.u8[26] = e26;
27032   r_.u8[27] = e27;
27033   r_.u8[28] = e28;
27034   r_.u8[29] = e29;
27035   r_.u8[30] = e30;
27036   r_.u8[31] = e31;
27037 
27038   return simde__m256i_from_private(r_);
27039 }
27040 
27041 SIMDE_FUNCTION_ATTRIBUTES
27042 simde__m256i
27043 simde_x_mm256_set_epu16 (uint16_t e15, uint16_t e14, uint16_t e13, uint16_t e12,
27044                        uint16_t e11, uint16_t e10, uint16_t  e9, uint16_t  e8,
27045                        uint16_t  e7, uint16_t  e6, uint16_t  e5, uint16_t  e4,
27046                        uint16_t  e3, uint16_t  e2, uint16_t  e1, uint16_t  e0) {
27047   simde__m256i_private r_;
27048 
27049   r_.u16[ 0] =  e0;
27050   r_.u16[ 1] =  e1;
27051   r_.u16[ 2] =  e2;
27052   r_.u16[ 3] =  e3;
27053   r_.u16[ 4] =  e4;
27054   r_.u16[ 5] =  e5;
27055   r_.u16[ 6] =  e6;
27056   r_.u16[ 7] =  e7;
27057   r_.u16[ 8] =  e8;
27058   r_.u16[ 9] =  e9;
27059   r_.u16[10] = e10;
27060   r_.u16[11] = e11;
27061   r_.u16[12] = e12;
27062   r_.u16[13] = e13;
27063   r_.u16[14] = e14;
27064   r_.u16[15] = e15;
27065 
27066   return simde__m256i_from_private(r_);
27067 }
27068 
27069 SIMDE_FUNCTION_ATTRIBUTES
27070 simde__m256i
27071 simde_x_mm256_set_epu32 (uint32_t e7, uint32_t e6, uint32_t e5, uint32_t e4,
27072                          uint32_t e3, uint32_t e2, uint32_t e1, uint32_t e0) {
27073   #if defined(SIMDE_X86_AVX_NATIVE)
27074     return _mm256_set_epi32(HEDLEY_STATIC_CAST(int32_t, e7), HEDLEY_STATIC_CAST(int32_t, e6), HEDLEY_STATIC_CAST(int32_t, e5), HEDLEY_STATIC_CAST(int32_t, e4),
27075                             HEDLEY_STATIC_CAST(int32_t, e3), HEDLEY_STATIC_CAST(int32_t, e2), HEDLEY_STATIC_CAST(int32_t, e1), HEDLEY_STATIC_CAST(int32_t, e0));
27076   #else
27077     simde__m256i_private r_;
27078 
27079     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
27080       r_.m128i[0] = simde_mm_set_epi32(HEDLEY_STATIC_CAST(int32_t, e3), HEDLEY_STATIC_CAST(int32_t, e2), HEDLEY_STATIC_CAST(int32_t, e1), HEDLEY_STATIC_CAST(int32_t, e0));
27081       r_.m128i[1] = simde_mm_set_epi32(HEDLEY_STATIC_CAST(int32_t, e7), HEDLEY_STATIC_CAST(int32_t, e6), HEDLEY_STATIC_CAST(int32_t, e5), HEDLEY_STATIC_CAST(int32_t, e4));
27082     #else
27083       r_.u32[ 0] =  e0;
27084       r_.u32[ 1] =  e1;
27085       r_.u32[ 2] =  e2;
27086       r_.u32[ 3] =  e3;
27087       r_.u32[ 4] =  e4;
27088       r_.u32[ 5] =  e5;
27089       r_.u32[ 6] =  e6;
27090       r_.u32[ 7] =  e7;
27091     #endif
27092 
27093     return simde__m256i_from_private(r_);
27094   #endif
27095 }
27096 
27097 SIMDE_FUNCTION_ATTRIBUTES
27098 simde__m256i
27099 simde_x_mm256_set_epu64x (uint64_t  e3, uint64_t  e2, uint64_t  e1, uint64_t  e0) {
27100   simde__m256i_private r_;
27101 
27102   r_.u64[0] = e0;
27103   r_.u64[1] = e1;
27104   r_.u64[2] = e2;
27105   r_.u64[3] = e3;
27106 
27107   return simde__m256i_from_private(r_);
27108 }
27109 
27110 SIMDE_FUNCTION_ATTRIBUTES
27111 simde__m256
27112 simde_mm256_set_ps (simde_float32 e7, simde_float32 e6, simde_float32 e5, simde_float32 e4,
27113                     simde_float32 e3, simde_float32 e2, simde_float32 e1, simde_float32 e0) {
27114   #if defined(SIMDE_X86_AVX_NATIVE)
27115     return _mm256_set_ps(e7, e6, e5, e4, e3, e2, e1, e0);
27116   #else
27117     simde__m256_private r_;
27118 
27119     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
27120       r_.m128[0] = simde_mm_set_ps(e3, e2, e1, e0);
27121       r_.m128[1] = simde_mm_set_ps(e7, e6, e5, e4);
27122     #else
27123       r_.f32[0] = e0;
27124       r_.f32[1] = e1;
27125       r_.f32[2] = e2;
27126       r_.f32[3] = e3;
27127       r_.f32[4] = e4;
27128       r_.f32[5] = e5;
27129       r_.f32[6] = e6;
27130       r_.f32[7] = e7;
27131     #endif
27132 
27133     return simde__m256_from_private(r_);
27134   #endif
27135 }
27136 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
27137   #undef _mm256_set_ps
27138   #define _mm256_set_ps(e7, e6, e5, e4, e3, e2, e1, e0) \
27139   simde_mm256_set_ps(e7, e6, e5, e4, e3, e2, e1, e0)
27140 #endif
27141 
27142 SIMDE_FUNCTION_ATTRIBUTES
27143 simde__m256d
27144 simde_mm256_set_pd (simde_float64 e3, simde_float64 e2, simde_float64 e1, simde_float64 e0) {
27145   #if defined(SIMDE_X86_AVX_NATIVE)
27146     return _mm256_set_pd(e3, e2, e1, e0);
27147   #else
27148     simde__m256d_private r_;
27149 
27150     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
27151       r_.m128d[0] = simde_mm_set_pd(e1, e0);
27152       r_.m128d[1] = simde_mm_set_pd(e3, e2);
27153     #else
27154       r_.f64[0] = e0;
27155       r_.f64[1] = e1;
27156       r_.f64[2] = e2;
27157       r_.f64[3] = e3;
27158     #endif
27159 
27160     return simde__m256d_from_private(r_);
27161   #endif
27162 }
27163 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
27164   #undef _mm256_set_pd
27165   #define _mm256_set_pd(e3, e2, e1, e0) \
27166   simde_mm256_set_pd(e3, e2, e1, e0)
27167 #endif
27168 
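/* simde_mm256_set_m128{,d,i}(hi, lo) assemble a 256-bit value from two 128-bit
 * halves, with `lo` in bits 0..127 and `hi` in bits 128..255, mirroring the
 * _mm256_set_m128* intrinsics. */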
27169 SIMDE_FUNCTION_ATTRIBUTES
27170 simde__m256
27171 simde_mm256_set_m128 (simde__m128 e1, simde__m128 e0) {
27172   #if defined(SIMDE_X86_AVX_NATIVE)
27173     return _mm256_insertf128_ps(_mm256_castps128_ps256(e0), e1, 1);
27174   #else
27175     simde__m256_private r_;
27176     simde__m128_private
27177       e1_ = simde__m128_to_private(e1),
27178       e0_ = simde__m128_to_private(e0);
27179 
27180     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
27181       r_.m128_private[0] = e0_;
27182       r_.m128_private[1] = e1_;
27183     #elif defined(SIMDE_HAVE_INT128_)
27184       r_.i128[0] = e0_.i128[0];
27185       r_.i128[1] = e1_.i128[0];
27186     #else
27187       r_.i64[0] = e0_.i64[0];
27188       r_.i64[1] = e0_.i64[1];
27189       r_.i64[2] = e1_.i64[0];
27190       r_.i64[3] = e1_.i64[1];
27191     #endif
27192 
27193     return simde__m256_from_private(r_);
27194   #endif
27195 }
27196 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
27197   #undef _mm256_set_m128
27198   #define _mm256_set_m128(e1, e0) simde_mm256_set_m128(e1, e0)
27199 #endif
27200 
27201 SIMDE_FUNCTION_ATTRIBUTES
27202 simde__m256d
27203 simde_mm256_set_m128d (simde__m128d e1, simde__m128d e0) {
27204   #if defined(SIMDE_X86_AVX_NATIVE)
27205     return _mm256_insertf128_pd(_mm256_castpd128_pd256(e0), e1, 1);
27206   #else
27207     simde__m256d_private r_;
27208     simde__m128d_private
27209       e1_ = simde__m128d_to_private(e1),
27210       e0_ = simde__m128d_to_private(e0);
27211 
27212     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
27213       r_.m128d_private[0] = e0_;
27214       r_.m128d_private[1] = e1_;
27215     #else
27216       r_.i64[0] = e0_.i64[0];
27217       r_.i64[1] = e0_.i64[1];
27218       r_.i64[2] = e1_.i64[0];
27219       r_.i64[3] = e1_.i64[1];
27220     #endif
27221 
27222     return simde__m256d_from_private(r_);
27223   #endif
27224 }
27225 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
27226   #undef _mm256_set_m128d
27227   #define _mm256_set_m128d(e1, e0) simde_mm256_set_m128d(e1, e0)
27228 #endif
27229 
27230 SIMDE_FUNCTION_ATTRIBUTES
27231 simde__m256i
27232 simde_mm256_set_m128i (simde__m128i e1, simde__m128i e0) {
27233   #if defined(SIMDE_X86_AVX_NATIVE)
27234     return _mm256_insertf128_si256(_mm256_castsi128_si256(e0), e1, 1);
27235   #else
27236     simde__m256i_private r_;
27237     simde__m128i_private
27238       e1_ = simde__m128i_to_private(e1),
27239       e0_ = simde__m128i_to_private(e0);
27240 
27241     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
27242       r_.m128i_private[0] = e0_;
27243       r_.m128i_private[1] = e1_;
27244     #else
27245       r_.i64[0] = e0_.i64[0];
27246       r_.i64[1] = e0_.i64[1];
27247       r_.i64[2] = e1_.i64[0];
27248       r_.i64[3] = e1_.i64[1];
27249     #endif
27250 
27251     return simde__m256i_from_private(r_);
27252   #endif
27253 }
27254 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
27255   #undef _mm256_set_m128i
27256   #define _mm256_set_m128i(e1, e0) simde_mm256_set_m128i(e1, e0)
27257 #endif
27258 
27259 SIMDE_FUNCTION_ATTRIBUTES
27260 simde__m256i
27261 simde_mm256_set1_epi8 (int8_t a) {
27262   #if defined(SIMDE_X86_AVX_NATIVE)
27263     return _mm256_set1_epi8(a);
27264   #else
27265     simde__m256i_private r_;
27266 
27267     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
27268       r_.m128i[0] = simde_mm_set1_epi8(a);
27269       r_.m128i[1] = simde_mm_set1_epi8(a);
27270     #else
27271       SIMDE_VECTORIZE
27272       for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
27273         r_.i8[i] = a;
27274       }
27275     #endif
27276 
27277     return simde__m256i_from_private(r_);
27278   #endif
27279 }
27280 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
27281   #undef _mm256_set1_epi8
27282   #define _mm256_set1_epi8(a) simde_mm256_set1_epi8(a)
27283 #endif
27284 
27285 SIMDE_FUNCTION_ATTRIBUTES
27286 simde__m256i
27287 simde_mm256_set1_epi16 (int16_t a) {
27288   #if defined(SIMDE_X86_AVX_NATIVE)
27289     return _mm256_set1_epi16(a);
27290   #else
27291     simde__m256i_private r_;
27292 
27293     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
27294       r_.m128i[0] = simde_mm_set1_epi16(a);
27295       r_.m128i[1] = simde_mm_set1_epi16(a);
27296     #else
27297       SIMDE_VECTORIZE
27298       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
27299         r_.i16[i] = a;
27300       }
27301     #endif
27302 
27303     return simde__m256i_from_private(r_);
27304   #endif
27305 }
27306 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
27307   #undef _mm256_set1_epi16
27308   #define _mm256_set1_epi16(a) simde_mm256_set1_epi16(a)
27309 #endif
27310 
27311 SIMDE_FUNCTION_ATTRIBUTES
27312 simde__m256i
27313 simde_mm256_set1_epi32 (int32_t a) {
27314   #if defined(SIMDE_X86_AVX_NATIVE)
27315     return _mm256_set1_epi32(a);
27316   #else
27317     simde__m256i_private r_;
27318 
27319     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
27320       r_.m128i[0] = simde_mm_set1_epi32(a);
27321       r_.m128i[1] = simde_mm_set1_epi32(a);
27322     #else
27323       SIMDE_VECTORIZE
27324       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
27325         r_.i32[i] = a;
27326       }
27327     #endif
27328 
27329     return simde__m256i_from_private(r_);
27330   #endif
27331 }
27332 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
27333   #undef _mm256_set1_epi32
27334   #define _mm256_set1_epi32(a) simde_mm256_set1_epi32(a)
27335 #endif
27336 
27337 SIMDE_FUNCTION_ATTRIBUTES
27338 simde__m256i
27339 simde_mm256_set1_epi64x (int64_t a) {
27340   #if defined(SIMDE_X86_AVX_NATIVE)
27341     return _mm256_set1_epi64x(a);
27342   #else
27343     simde__m256i_private r_;
27344 
27345     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
27346       r_.m128i[0] = simde_mm_set1_epi64x(a);
27347       r_.m128i[1] = simde_mm_set1_epi64x(a);
27348     #else
27349       SIMDE_VECTORIZE
27350       for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
27351         r_.i64[i] = a;
27352       }
27353     #endif
27354 
27355     return simde__m256i_from_private(r_);
27356   #endif
27357 }
27358 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
27359   #undef _mm256_set1_epi64x
27360   #define _mm256_set1_epi64x(a) simde_mm256_set1_epi64x(a)
27361 #endif
27362 
27363 SIMDE_FUNCTION_ATTRIBUTES
27364 simde__m256
27365 simde_mm256_set1_ps (simde_float32 a) {
27366   #if defined(SIMDE_X86_AVX_NATIVE)
27367     return _mm256_set1_ps(a);
27368   #else
27369     simde__m256_private r_;
27370 
27371     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
27372       r_.m128[0] = simde_mm_set1_ps(a);
27373       r_.m128[1] = simde_mm_set1_ps(a);
27374     #else
27375       SIMDE_VECTORIZE
27376       for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
27377         r_.f32[i] = a;
27378       }
27379     #endif
27380 
27381     return simde__m256_from_private(r_);
27382   #endif
27383 }
27384 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
27385   #undef _mm256_set1_ps
27386   #define _mm256_set1_ps(a) simde_mm256_set1_ps(a)
27387 #endif
27388 
27389 SIMDE_FUNCTION_ATTRIBUTES
27390 simde__m256d
27391 simde_mm256_set1_pd (simde_float64 a) {
27392   #if defined(SIMDE_X86_AVX_NATIVE)
27393     return _mm256_set1_pd(a);
27394   #else
27395     simde__m256d_private r_;
27396 
27397     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
27398       r_.m128d[0] = simde_mm_set1_pd(a);
27399       r_.m128d[1] = simde_mm_set1_pd(a);
27400     #else
27401       SIMDE_VECTORIZE
27402       for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
27403         r_.f64[i] = a;
27404       }
27405     #endif
27406 
27407     return simde__m256d_from_private(r_);
27408   #endif
27409 }
27410 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
27411   #undef _mm256_set1_pd
27412   #define _mm256_set1_pd(a) simde_mm256_set1_pd(a)
27413 #endif
27414 
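/* The simde_x_*_deinterleave{even,odd}_* helpers below gather the even- or
 * odd-indexed lanes from each 128-bit half of `a` and `b`.  They are not part
 * of any Intel API; SIMDe uses them internally as building blocks for the
 * horizontal add/sub operations further on. */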
27415 SIMDE_FUNCTION_ATTRIBUTES
27416 simde__m256i
27417 simde_x_mm256_deinterleaveeven_epi16 (simde__m256i a, simde__m256i b) {
27418   simde__m256i_private
27419     r_,
27420     a_ = simde__m256i_to_private(a),
27421     b_ = simde__m256i_to_private(b);
27422 
27423   #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
27424     r_.m128i[0] = simde_x_mm_deinterleaveeven_epi16(a_.m128i[0], b_.m128i[0]);
27425     r_.m128i[1] = simde_x_mm_deinterleaveeven_epi16(a_.m128i[1], b_.m128i[1]);
27426   #elif defined(SIMDE_SHUFFLE_VECTOR_)
27427     r_.i16 = SIMDE_SHUFFLE_VECTOR_(16, 32, a_.i16, b_.i16, 0, 2, 4, 6, 16, 18, 20, 22, 8, 10, 12, 14, 24, 26, 28, 30);
27428   #else
27429     const size_t halfway_point = (sizeof(r_.i16) / sizeof(r_.i16[0])) / 2;
27430     const size_t quarter_point = (sizeof(r_.i16) / sizeof(r_.i16[0])) / 4;
27431     for (size_t i = 0 ; i < quarter_point ; i++) {
27432       r_.i16[i] = a_.i16[2 * i];
27433       r_.i16[i + quarter_point] = b_.i16[2 * i];
27434       r_.i16[halfway_point + i] = a_.i16[halfway_point + 2 * i];
27435       r_.i16[halfway_point + i + quarter_point] = b_.i16[halfway_point + 2 * i];
27436     }
27437   #endif
27438 
27439   return simde__m256i_from_private(r_);
27440 }
27441 
27442 SIMDE_FUNCTION_ATTRIBUTES
27443 simde__m256i
27444 simde_x_mm256_deinterleaveodd_epi16 (simde__m256i a, simde__m256i b) {
27445   simde__m256i_private
27446     r_,
27447     a_ = simde__m256i_to_private(a),
27448     b_ = simde__m256i_to_private(b);
27449 
27450   #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
27451     r_.m128i[0] = simde_x_mm_deinterleaveodd_epi16(a_.m128i[0], b_.m128i[0]);
27452     r_.m128i[1] = simde_x_mm_deinterleaveodd_epi16(a_.m128i[1], b_.m128i[1]);
27453   #elif defined(SIMDE_SHUFFLE_VECTOR_)
27454     r_.i16 = SIMDE_SHUFFLE_VECTOR_(16, 32, a_.i16, b_.i16, 1, 3, 5, 7, 17, 19, 21, 23, 9, 11, 13, 15, 25, 27, 29, 31);
27455   #else
27456     const size_t halfway_point = (sizeof(r_.i16) / sizeof(r_.i16[0])) / 2;
27457     const size_t quarter_point = (sizeof(r_.i16) / sizeof(r_.i16[0])) / 4;
27458     for (size_t i = 0 ; i < quarter_point ; i++) {
27459       r_.i16[i] = a_.i16[2 * i + 1];
27460       r_.i16[i + quarter_point] = b_.i16[2 * i + 1];
27461       r_.i16[halfway_point + i] = a_.i16[halfway_point + 2 * i + 1];
27462       r_.i16[halfway_point + i + quarter_point] = b_.i16[halfway_point + 2 * i + 1];
27463     }
27464   #endif
27465 
27466   return simde__m256i_from_private(r_);
27467 }
27468 
27469 SIMDE_FUNCTION_ATTRIBUTES
27470 simde__m256i
27471 simde_x_mm256_deinterleaveeven_epi32 (simde__m256i a, simde__m256i b) {
27472   simde__m256i_private
27473     r_,
27474     a_ = simde__m256i_to_private(a),
27475     b_ = simde__m256i_to_private(b);
27476 
27477   #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
27478     r_.m128i[0] = simde_x_mm_deinterleaveeven_epi32(a_.m128i[0], b_.m128i[0]);
27479     r_.m128i[1] = simde_x_mm_deinterleaveeven_epi32(a_.m128i[1], b_.m128i[1]);
27480   #elif defined(SIMDE_SHUFFLE_VECTOR_)
27481     r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 32, a_.i32, b_.i32, 0, 2, 8, 10, 4, 6, 12, 14);
27482   #else
27483     const size_t halfway_point = (sizeof(r_.i32) / sizeof(r_.i32[0])) / 2;
27484     const size_t quarter_point = (sizeof(r_.i32) / sizeof(r_.i32[0])) / 4;
27485     for (size_t i = 0 ; i < quarter_point ; i++) {
27486       r_.i32[i] = a_.i32[2 * i];
27487       r_.i32[i + quarter_point] = b_.i32[2 * i];
27488       r_.i32[halfway_point + i] = a_.i32[halfway_point + 2 * i];
27489       r_.i32[halfway_point + i + quarter_point] = b_.i32[halfway_point + 2 * i];
27490     }
27491   #endif
27492 
27493   return simde__m256i_from_private(r_);
27494 }
27495 
27496 SIMDE_FUNCTION_ATTRIBUTES
27497 simde__m256i
27498 simde_x_mm256_deinterleaveodd_epi32 (simde__m256i a, simde__m256i b) {
27499   simde__m256i_private
27500     r_,
27501     a_ = simde__m256i_to_private(a),
27502     b_ = simde__m256i_to_private(b);
27503 
27504   #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
27505     r_.m128i[0] = simde_x_mm_deinterleaveodd_epi32(a_.m128i[0], b_.m128i[0]);
27506     r_.m128i[1] = simde_x_mm_deinterleaveodd_epi32(a_.m128i[1], b_.m128i[1]);
27507   #elif defined(SIMDE_SHUFFLE_VECTOR_)
27508     r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 32, a_.i32, b_.i32, 1, 3, 9, 11, 5, 7, 13, 15);
27509   #else
27510     const size_t halfway_point = (sizeof(r_.i32) / sizeof(r_.i32[0])) / 2;
27511     const size_t quarter_point = (sizeof(r_.i32) / sizeof(r_.i32[0])) / 4;
27512     for (size_t i = 0 ; i < quarter_point ; i++) {
27513       r_.i32[i] = a_.i32[2 * i + 1];
27514       r_.i32[i + quarter_point] = b_.i32[2 * i + 1];
27515       r_.i32[halfway_point + i] = a_.i32[halfway_point + 2 * i + 1];
27516       r_.i32[halfway_point + i + quarter_point] = b_.i32[halfway_point + 2 * i + 1];
27517     }
27518   #endif
27519 
27520   return simde__m256i_from_private(r_);
27521 }
27522 
27523 SIMDE_FUNCTION_ATTRIBUTES
27524 simde__m256
27525 simde_x_mm256_deinterleaveeven_ps (simde__m256 a, simde__m256 b) {
27526   simde__m256_private
27527     r_,
27528     a_ = simde__m256_to_private(a),
27529     b_ = simde__m256_to_private(b);
27530 
27531   #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
27532     r_.m128[0] = simde_x_mm_deinterleaveeven_ps(a_.m128[0], b_.m128[0]);
27533     r_.m128[1] = simde_x_mm_deinterleaveeven_ps(a_.m128[1], b_.m128[1]);
27534   #elif defined(SIMDE_SHUFFLE_VECTOR_)
27535     r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 32, a_.f32, b_.f32, 0, 2, 8, 10, 4, 6, 12, 14);
27536   #else
27537     const size_t halfway_point = (sizeof(r_.f32) / sizeof(r_.f32[0])) / 2;
27538     const size_t quarter_point = (sizeof(r_.f32) / sizeof(r_.f32[0])) / 4;
27539     for (size_t i = 0 ; i < quarter_point ; i++) {
27540       r_.f32[i] = a_.f32[2 * i];
27541       r_.f32[i + quarter_point] = b_.f32[2 * i];
27542       r_.f32[halfway_point + i] = a_.f32[halfway_point + 2 * i];
27543       r_.f32[halfway_point + i + quarter_point] = b_.f32[halfway_point + 2 * i];
27544     }
27545   #endif
27546 
27547   return simde__m256_from_private(r_);
27548 }
27549 
27550 SIMDE_FUNCTION_ATTRIBUTES
27551 simde__m256
27552 simde_x_mm256_deinterleaveodd_ps (simde__m256 a, simde__m256 b) {
27553   simde__m256_private
27554     r_,
27555     a_ = simde__m256_to_private(a),
27556     b_ = simde__m256_to_private(b);
27557 
27558   #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
27559     r_.m128[0] = simde_x_mm_deinterleaveodd_ps(a_.m128[0], b_.m128[0]);
27560     r_.m128[1] = simde_x_mm_deinterleaveodd_ps(a_.m128[1], b_.m128[1]);
27561   #elif defined(SIMDE_SHUFFLE_VECTOR_)
27562     r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 32, a_.f32, b_.f32, 1, 3, 9, 11, 5, 7, 13, 15);
27563   #else
27564     const size_t halfway_point = (sizeof(r_.f32) / sizeof(r_.f32[0])) / 2;
27565     const size_t quarter_point = (sizeof(r_.f32) / sizeof(r_.f32[0])) / 4;
27566     for (size_t i = 0 ; i < quarter_point ; i++) {
27567       r_.f32[i] = a_.f32[2 * i + 1];
27568       r_.f32[i + quarter_point] = b_.f32[2 * i + 1];
27569       r_.f32[halfway_point + i] = a_.f32[halfway_point + 2 * i + 1];
27570       r_.f32[halfway_point + i + quarter_point] = b_.f32[halfway_point + 2 * i + 1];
27571     }
27572   #endif
27573 
27574   return simde__m256_from_private(r_);
27575 }
27576 
27577 SIMDE_FUNCTION_ATTRIBUTES
27578 simde__m256d
27579 simde_x_mm256_deinterleaveeven_pd (simde__m256d a, simde__m256d b) {
27580   simde__m256d_private
27581     r_,
27582     a_ = simde__m256d_to_private(a),
27583     b_ = simde__m256d_to_private(b);
27584 
27585   #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
27586     r_.m128d[0] = simde_x_mm_deinterleaveeven_pd(a_.m128d[0], b_.m128d[0]);
27587     r_.m128d[1] = simde_x_mm_deinterleaveeven_pd(a_.m128d[1], b_.m128d[1]);
27588   #elif defined(SIMDE_SHUFFLE_VECTOR_)
27589     r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 32, a_.f64, b_.f64, 0, 4, 2, 6);
27590   #else
27591     const size_t halfway_point = (sizeof(r_.f64) / sizeof(r_.f64[0])) / 2;
27592     const size_t quarter_point = (sizeof(r_.f64) / sizeof(r_.f64[0])) / 4;
27593     for (size_t i = 0 ; i < quarter_point ; i++) {
27594       r_.f64[i] = a_.f64[2 * i];
27595       r_.f64[i + quarter_point] = b_.f64[2 * i];
27596       r_.f64[halfway_point + i] = a_.f64[halfway_point + 2 * i];
27597       r_.f64[halfway_point + i + quarter_point] = b_.f64[halfway_point + 2 * i];
27598     }
27599   #endif
27600 
27601   return simde__m256d_from_private(r_);
27602 }
27603 
27604 SIMDE_FUNCTION_ATTRIBUTES
27605 simde__m256d
27606 simde_x_mm256_deinterleaveodd_pd (simde__m256d a, simde__m256d b) {
27607   simde__m256d_private
27608     r_,
27609     a_ = simde__m256d_to_private(a),
27610     b_ = simde__m256d_to_private(b);
27611 
27612   #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
27613     r_.m128d[0] = simde_x_mm_deinterleaveodd_pd(a_.m128d[0], b_.m128d[0]);
27614     r_.m128d[1] = simde_x_mm_deinterleaveodd_pd(a_.m128d[1], b_.m128d[1]);
27615   #elif defined(SIMDE_SHUFFLE_VECTOR_)
27616     r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 32, a_.f64, b_.f64, 1, 5, 3, 7);
27617   #else
27618     const size_t halfway_point = (sizeof(r_.f64) / sizeof(r_.f64[0])) / 2;
27619     const size_t quarter_point = (sizeof(r_.f64) / sizeof(r_.f64[0])) / 4;
27620     for (size_t i = 0 ; i < quarter_point ; i++) {
27621       r_.f64[i] = a_.f64[2 * i + 1];
27622       r_.f64[i + quarter_point] = b_.f64[2 * i + 1];
27623       r_.f64[halfway_point + i] = a_.f64[halfway_point + 2 * i + 1];
27624       r_.f64[halfway_point + i + quarter_point] = b_.f64[halfway_point + 2 * i + 1];
27625     }
27626   #endif
27627 
27628   return simde__m256d_from_private(r_);
27629 }
27630 
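/* Internal (x_-prefixed) helpers: lane-wise absolute value. These are SIMDe
 * extensions used by other emulated intrinsics, not part of the public AVX API. */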
27631 SIMDE_FUNCTION_ATTRIBUTES
27632 simde__m256
27633 simde_x_mm256_abs_ps(simde__m256 a) {
27634   simde__m256_private
27635     r_,
27636     a_ = simde__m256_to_private(a);
27637 
27638   SIMDE_VECTORIZE
27639   for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
27640     r_.f32[i] = simde_math_fabsf(a_.f32[i]);
27641   }
27642   return simde__m256_from_private(r_);
27643 }
27644 
27645 SIMDE_FUNCTION_ATTRIBUTES
27646 simde__m256d
27647 simde_x_mm256_abs_pd(simde__m256d a) {
27648   simde__m256d_private
27649     r_,
27650     a_ = simde__m256d_to_private(a);
27651 
27652   SIMDE_VECTORIZE
27653   for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
27654     r_.f64[i] = simde_math_fabs(a_.f64[i]);
27655   }
27656   return simde__m256d_from_private(r_);
27657 }
27658 
27659 SIMDE_FUNCTION_ATTRIBUTES
27660 simde__m256
27661 simde_mm256_add_ps (simde__m256 a, simde__m256 b) {
27662   #if defined(SIMDE_X86_AVX_NATIVE)
27663     return _mm256_add_ps(a, b);
27664   #else
27665     simde__m256_private
27666       r_,
27667       a_ = simde__m256_to_private(a),
27668       b_ = simde__m256_to_private(b);
27669 
27670     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
27671       r_.m128[0] = simde_mm_add_ps(a_.m128[0], b_.m128[0]);
27672       r_.m128[1] = simde_mm_add_ps(a_.m128[1], b_.m128[1]);
27673     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
27674       r_.f32 = a_.f32 + b_.f32;
27675     #else
27676       SIMDE_VECTORIZE
27677       for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
27678         r_.f32[i] = a_.f32[i] + b_.f32[i];
27679       }
27680     #endif
27681 
27682     return simde__m256_from_private(r_);
27683   #endif
27684 }
27685 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
27686   #undef _mm256_add_ps
27687   #define _mm256_add_ps(a, b) simde_mm256_add_ps(a, b)
27688 #endif
27689 
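/* _mm256_hadd_ps adds adjacent pairs within each 128-bit half. Deinterleaving
 * the even and odd lanes first turns the horizontal add into a plain vertical add. */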
27690 SIMDE_FUNCTION_ATTRIBUTES
27691 simde__m256
27692 simde_mm256_hadd_ps (simde__m256 a, simde__m256 b) {
27693   #if defined(SIMDE_X86_AVX_NATIVE)
27694     return _mm256_hadd_ps(a, b);
27695   #else
27696     return simde_mm256_add_ps(simde_x_mm256_deinterleaveeven_ps(a, b), simde_x_mm256_deinterleaveodd_ps(a, b));
27697   #endif
27698 }
27699 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
27700   #undef _mm256_hadd_ps
27701   #define _mm256_hadd_ps(a, b) simde_mm256_hadd_ps(a, b)
27702 #endif
27703 
27704 SIMDE_FUNCTION_ATTRIBUTES
27705 simde__m256d
27706 simde_mm256_add_pd (simde__m256d a, simde__m256d b) {
27707   #if defined(SIMDE_X86_AVX_NATIVE)
27708     return _mm256_add_pd(a, b);
27709   #else
27710     simde__m256d_private
27711       r_,
27712       a_ = simde__m256d_to_private(a),
27713       b_ = simde__m256d_to_private(b);
27714 
27715     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
27716       r_.m128d[0] = simde_mm_add_pd(a_.m128d[0], b_.m128d[0]);
27717       r_.m128d[1] = simde_mm_add_pd(a_.m128d[1], b_.m128d[1]);
27718     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
27719       r_.f64 = a_.f64 + b_.f64;
27720     #else
27721       SIMDE_VECTORIZE
27722       for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
27723         r_.f64[i] = a_.f64[i] + b_.f64[i];
27724       }
27725     #endif
27726 
27727     return simde__m256d_from_private(r_);
27728   #endif
27729 }
27730 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
27731   #undef _mm256_add_pd
27732   #define _mm256_add_pd(a, b) simde_mm256_add_pd(a, b)
27733 #endif
27734 
27735 SIMDE_FUNCTION_ATTRIBUTES
27736 simde__m256d
27737 simde_mm256_hadd_pd (simde__m256d a, simde__m256d b) {
27738   #if defined(SIMDE_X86_AVX_NATIVE)
27739     return _mm256_hadd_pd(a, b);
27740   #else
27741     return simde_mm256_add_pd(simde_x_mm256_deinterleaveeven_pd(a, b), simde_x_mm256_deinterleaveodd_pd(a, b));
27742   #endif
27743 }
27744 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
27745   #undef _mm256_hadd_pd
27746   #define _mm256_hadd_pd(a, b) simde_mm256_hadd_pd(a, b)
27747 #endif
27748 
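/* _mm256_addsub_ps/_pd subtract b from a in the even-indexed lanes and add in
 * the odd-indexed lanes, as in the scalar fallback loops below. */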
27749 SIMDE_FUNCTION_ATTRIBUTES
27750 simde__m256
27751 simde_mm256_addsub_ps (simde__m256 a, simde__m256 b) {
27752   #if defined(SIMDE_X86_AVX_NATIVE)
27753     return _mm256_addsub_ps(a, b);
27754   #else
27755     simde__m256_private
27756       r_,
27757       a_ = simde__m256_to_private(a),
27758       b_ = simde__m256_to_private(b);
27759 
27760     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
27761       r_.m128[0] = simde_mm_addsub_ps(a_.m128[0], b_.m128[0]);
27762       r_.m128[1] = simde_mm_addsub_ps(a_.m128[1], b_.m128[1]);
27763     #else
27764       SIMDE_VECTORIZE
27765       for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i += 2) {
27766         r_.f32[  i  ] = a_.f32[  i  ] - b_.f32[  i  ];
27767         r_.f32[i + 1] = a_.f32[i + 1] + b_.f32[i + 1];
27768       }
27769     #endif
27770 
27771     return simde__m256_from_private(r_);
27772   #endif
27773 }
27774 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
27775   #undef _mm256_addsub_ps
27776   #define _mm256_addsub_ps(a, b) simde_mm256_addsub_ps(a, b)
27777 #endif
27778 
27779 SIMDE_FUNCTION_ATTRIBUTES
27780 simde__m256d
27781 simde_mm256_addsub_pd (simde__m256d a, simde__m256d b) {
27782   #if defined(SIMDE_X86_AVX_NATIVE)
27783     return _mm256_addsub_pd(a, b);
27784   #else
27785     simde__m256d_private
27786       r_,
27787       a_ = simde__m256d_to_private(a),
27788       b_ = simde__m256d_to_private(b);
27789 
27790     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
27791       r_.m128d[0] = simde_mm_addsub_pd(a_.m128d[0], b_.m128d[0]);
27792       r_.m128d[1] = simde_mm_addsub_pd(a_.m128d[1], b_.m128d[1]);
27793     #else
27794       SIMDE_VECTORIZE
27795       for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i += 2) {
27796         r_.f64[  i  ] = a_.f64[  i  ] - b_.f64[  i  ];
27797         r_.f64[i + 1] = a_.f64[i + 1] + b_.f64[i + 1];
27798       }
27799     #endif
27800 
27801     return simde__m256d_from_private(r_);
27802   #endif
27803 }
27804 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
27805   #undef _mm256_addsub_pd
27806   #define _mm256_addsub_pd(a, b) simde_mm256_addsub_pd(a, b)
27807 #endif
27808 
27809 SIMDE_FUNCTION_ATTRIBUTES
27810 simde__m256
27811 simde_mm256_and_ps (simde__m256 a, simde__m256 b) {
27812   #if defined(SIMDE_X86_AVX_NATIVE)
27813     return _mm256_and_ps(a, b);
27814   #else
27815     simde__m256_private
27816       r_,
27817       a_ = simde__m256_to_private(a),
27818       b_ = simde__m256_to_private(b);
27819 
27820     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
27821       r_.m128[0] = simde_mm_and_ps(a_.m128[0], b_.m128[0]);
27822       r_.m128[1] = simde_mm_and_ps(a_.m128[1], b_.m128[1]);
27823     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
27824       r_.i32f = a_.i32f & b_.i32f;
27825     #else
27826       SIMDE_VECTORIZE
27827       for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
27828         r_.i32f[i] = a_.i32f[i] & b_.i32f[i];
27829       }
27830     #endif
27831 
27832     return simde__m256_from_private(r_);
27833   #endif
27834 }
27835 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
27836   #undef _mm256_and_ps
27837   #define _mm256_and_ps(a, b) simde_mm256_and_ps(a, b)
27838 #endif
27839 
27840 SIMDE_FUNCTION_ATTRIBUTES
27841 simde__m256d
27842 simde_mm256_and_pd (simde__m256d a, simde__m256d b) {
27843   #if defined(SIMDE_X86_AVX_NATIVE)
27844     return _mm256_and_pd(a, b);
27845   #else
27846     simde__m256d_private
27847       r_,
27848       a_ = simde__m256d_to_private(a),
27849       b_ = simde__m256d_to_private(b);
27850 
27851     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
27852       r_.m128d[0] = simde_mm_and_pd(a_.m128d[0], b_.m128d[0]);
27853       r_.m128d[1] = simde_mm_and_pd(a_.m128d[1], b_.m128d[1]);
27854     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
27855       r_.i32f = a_.i32f & b_.i32f;
27856     #else
27857       SIMDE_VECTORIZE
27858       for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
27859         r_.i32f[i] = a_.i32f[i] & b_.i32f[i];
27860       }
27861     #endif
27862 
27863     return simde__m256d_from_private(r_);
27864   #endif
27865 }
27866 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
27867   #undef _mm256_and_pd
27868   #define _mm256_and_pd(a, b) simde_mm256_and_pd(a, b)
27869 #endif
27870 
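/* Note the operand order: andnot computes (~a & b), not (a & ~b). */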
27871 SIMDE_FUNCTION_ATTRIBUTES
27872 simde__m256
27873 simde_mm256_andnot_ps (simde__m256 a, simde__m256 b) {
27874   #if defined(SIMDE_X86_AVX_NATIVE)
27875     return _mm256_andnot_ps(a, b);
27876   #else
27877     simde__m256_private
27878       r_,
27879       a_ = simde__m256_to_private(a),
27880       b_ = simde__m256_to_private(b);
27881 
27882     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
27883       r_.m128[0] = simde_mm_andnot_ps(a_.m128[0], b_.m128[0]);
27884       r_.m128[1] = simde_mm_andnot_ps(a_.m128[1], b_.m128[1]);
27885     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
27886       r_.i32f = ~a_.i32f & b_.i32f;
27887     #else
27888       SIMDE_VECTORIZE
27889       for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
27890         r_.i32f[i] = ~a_.i32f[i] & b_.i32f[i];
27891       }
27892     #endif
27893 
27894     return simde__m256_from_private(r_);
27895   #endif
27896 }
27897 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
27898   #undef _mm256_andnot_ps
27899   #define _mm256_andnot_ps(a, b) simde_mm256_andnot_ps(a, b)
27900 #endif
27901 
27902 SIMDE_FUNCTION_ATTRIBUTES
27903 simde__m256d
27904 simde_mm256_andnot_pd (simde__m256d a, simde__m256d b) {
27905   #if defined(SIMDE_X86_AVX_NATIVE)
27906     return _mm256_andnot_pd(a, b);
27907   #else
27908     simde__m256d_private
27909       r_,
27910       a_ = simde__m256d_to_private(a),
27911       b_ = simde__m256d_to_private(b);
27912 
27913     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
27914       r_.m128d[0] = simde_mm_andnot_pd(a_.m128d[0], b_.m128d[0]);
27915       r_.m128d[1] = simde_mm_andnot_pd(a_.m128d[1], b_.m128d[1]);
27916     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
27917       r_.i32f = ~a_.i32f & b_.i32f;
27918     #else
27919       SIMDE_VECTORIZE
27920       for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
27921         r_.i32f[i] = ~a_.i32f[i] & b_.i32f[i];
27922       }
27923     #endif
27924 
27925     return simde__m256d_from_private(r_);
27926   #endif
27927 }
27928 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
27929   #undef _mm256_andnot_pd
27930   #define _mm256_andnot_pd(a, b) simde_mm256_andnot_pd(a, b)
27931 #endif
27932 
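/* For the blend intrinsics, bit i of imm8 selects lane i from b when set and from a
 * when clear; the 128-bit fallback macro splits imm8 between the two halves. */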
27933 SIMDE_FUNCTION_ATTRIBUTES
27934 simde__m256
27935 simde_mm256_blend_ps (simde__m256 a, simde__m256 b, const int imm8)
27936     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
27937   simde__m256_private
27938     r_,
27939     a_ = simde__m256_to_private(a),
27940     b_ = simde__m256_to_private(b);
27941 
27942   SIMDE_VECTORIZE
27943   for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
27944     r_.f32[i] = ((imm8 >> i) & 1) ? b_.f32[i] : a_.f32[i];
27945   }
27946 
27947   return simde__m256_from_private(r_);
27948 }
27949 #if defined(SIMDE_X86_AVX_NATIVE)
27950 #  define simde_mm256_blend_ps(a, b, imm8) _mm256_blend_ps(a, b, imm8)
27951 #elif SIMDE_NATURAL_VECTOR_SIZE_LE(128)
27952 #  define simde_mm256_blend_ps(a, b, imm8) \
27953       simde_mm256_set_m128( \
27954           simde_mm_blend_ps(simde_mm256_extractf128_ps(a, 1), simde_mm256_extractf128_ps(b, 1), (imm8) >> 4), \
27955           simde_mm_blend_ps(simde_mm256_extractf128_ps(a, 0), simde_mm256_extractf128_ps(b, 0), (imm8) & 0x0F))
27956 #endif
27957 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
27958   #undef _mm256_blend_ps
27959   #define _mm256_blend_ps(a, b, imm8) simde_mm256_blend_ps(a, b, imm8)
27960 #endif
27961 
27962 SIMDE_FUNCTION_ATTRIBUTES
27963 simde__m256d
27964 simde_mm256_blend_pd (simde__m256d a, simde__m256d b, const int imm8)
27965     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 15) {
27966   simde__m256d_private
27967     r_,
27968     a_ = simde__m256d_to_private(a),
27969     b_ = simde__m256d_to_private(b);
27970 
27971   SIMDE_VECTORIZE
27972   for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
27973     r_.f64[i] = ((imm8 >> i) & 1) ? b_.f64[i] : a_.f64[i];
27974   }
27975   return simde__m256d_from_private(r_);
27976 }
27977 #if defined(SIMDE_X86_AVX_NATIVE)
27978 #  define simde_mm256_blend_pd(a, b, imm8) _mm256_blend_pd(a, b, imm8)
27979 #elif SIMDE_NATURAL_VECTOR_SIZE_LE(128)
27980 #  define simde_mm256_blend_pd(a, b, imm8) \
27981       simde_mm256_set_m128d( \
27982           simde_mm_blend_pd(simde_mm256_extractf128_pd(a, 1), simde_mm256_extractf128_pd(b, 1), (imm8) >> 2), \
27983           simde_mm_blend_pd(simde_mm256_extractf128_pd(a, 0), simde_mm256_extractf128_pd(b, 0), (imm8) & 3))
27984 #endif
27985 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
27986   #undef _mm256_blend_pd
27987   #define _mm256_blend_pd(a, b, imm8) simde_mm256_blend_pd(a, b, imm8)
27988 #endif
27989 
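/* blendv selects per lane based on the sign (top) bit of the corresponding mask
 * lane rather than an immediate. */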
27990 SIMDE_FUNCTION_ATTRIBUTES
27991 simde__m256
27992 simde_mm256_blendv_ps (simde__m256 a, simde__m256 b, simde__m256 mask) {
27993   #if defined(SIMDE_X86_AVX_NATIVE)
27994     return _mm256_blendv_ps(a, b, mask);
27995   #else
27996     simde__m256_private
27997       r_,
27998       a_ = simde__m256_to_private(a),
27999       b_ = simde__m256_to_private(b),
28000       mask_ = simde__m256_to_private(mask);
28001 
28002     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
28003       r_.m128[0] = simde_mm_blendv_ps(a_.m128[0], b_.m128[0], mask_.m128[0]);
28004       r_.m128[1] = simde_mm_blendv_ps(a_.m128[1], b_.m128[1], mask_.m128[1]);
28005     #else
28006       SIMDE_VECTORIZE
28007       for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
28008         r_.f32[i] = (mask_.u32[i] & (UINT32_C(1) << 31)) ? b_.f32[i] : a_.f32[i];
28009       }
28010     #endif
28011 
28012     return simde__m256_from_private(r_);
28013   #endif
28014 }
28015 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
28016   #undef _mm256_blendv_ps
28017   #define _mm256_blendv_ps(a, b, imm8) simde_mm256_blendv_ps(a, b, imm8)
28018 #endif
28019 
28020 SIMDE_FUNCTION_ATTRIBUTES
28021 simde__m256d
28022 simde_mm256_blendv_pd (simde__m256d a, simde__m256d b, simde__m256d mask) {
28023   #if defined(SIMDE_X86_AVX_NATIVE)
28024     return _mm256_blendv_pd(a, b, mask);
28025   #else
28026     simde__m256d_private
28027       r_,
28028       a_ = simde__m256d_to_private(a),
28029       b_ = simde__m256d_to_private(b),
28030       mask_ = simde__m256d_to_private(mask);
28031 
28032     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
28033       r_.m128d[0] = simde_mm_blendv_pd(a_.m128d[0], b_.m128d[0], mask_.m128d[0]);
28034       r_.m128d[1] = simde_mm_blendv_pd(a_.m128d[1], b_.m128d[1], mask_.m128d[1]);
28035     #else
28036       SIMDE_VECTORIZE
28037       for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) {
28038         r_.f64[i] = (mask_.u64[i] & (UINT64_C(1) << 63)) ? b_.f64[i] : a_.f64[i];
28039       }
28040     #endif
28041 
28042     return simde__m256d_from_private(r_);
28043   #endif
28044 }
28045 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
28046   #undef _mm256_blendv_pd
28047   #define _mm256_blendv_pd(a, b, imm8) simde_mm256_blendv_pd(a, b, imm8)
28048 #endif
28049 
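/* broadcast_pd/_ps load 128 bits from memory (an unaligned load is sufficient here)
 * and duplicate them into both halves of the 256-bit result. */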
28050 SIMDE_FUNCTION_ATTRIBUTES
28051 simde__m256d
28052 simde_mm256_broadcast_pd (simde__m128d const * mem_addr) {
28053   #if defined(SIMDE_X86_AVX_NATIVE)
28054     return _mm256_broadcast_pd(mem_addr);
28055   #else
28056     simde__m256d_private r_;
28057 
28058     simde__m128d tmp = simde_mm_loadu_pd(HEDLEY_REINTERPRET_CAST(simde_float64 const*, mem_addr));
28059     r_.m128d[0] = tmp;
28060     r_.m128d[1] = tmp;
28061 
28062     return simde__m256d_from_private(r_);
28063   #endif
28064 }
28065 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
28066   #undef _mm256_broadcast_pd
28067   #define _mm256_broadcast_pd(mem_addr) simde_mm256_broadcast_pd(mem_addr)
28068 #endif
28069 
28070 SIMDE_FUNCTION_ATTRIBUTES
28071 simde__m256
28072 simde_mm256_broadcast_ps (simde__m128 const * mem_addr) {
28073   #if defined(SIMDE_X86_AVX_NATIVE)
28074     return _mm256_broadcast_ps(mem_addr);
28075   #else
28076     simde__m256_private r_;
28077 
28078     simde__m128 tmp = simde_mm_loadu_ps(HEDLEY_REINTERPRET_CAST(simde_float32 const*, mem_addr));
28079     r_.m128[0] = tmp;
28080     r_.m128[1] = tmp;
28081 
28082     return simde__m256_from_private(r_);
28083   #endif
28084 }
28085 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
28086   #undef _mm256_broadcast_ps
28087   #define _mm256_broadcast_ps(mem_addr) simde_mm256_broadcast_ps(HEDLEY_REINTERPRET_CAST(simde__m128 const*, mem_addr))
28088 #endif
28089 
28090 SIMDE_FUNCTION_ATTRIBUTES
28091 simde__m256d
28092 simde_mm256_broadcast_sd (simde_float64 const * a) {
28093   #if defined(SIMDE_X86_AVX_NATIVE)
28094     return _mm256_broadcast_sd(a);
28095   #else
28096     return simde_mm256_set1_pd(*a);
28097   #endif
28098 }
28099 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
28100   #undef _mm256_broadcast_sd
28101   #define _mm256_broadcast_sd(mem_addr) simde_mm256_broadcast_sd(HEDLEY_REINTERPRET_CAST(double const*, mem_addr))
28102 #endif
28103 
28104 SIMDE_FUNCTION_ATTRIBUTES
28105 simde__m128
28106 simde_mm_broadcast_ss (simde_float32 const * a) {
28107   #if defined(SIMDE_X86_AVX_NATIVE)
28108     return _mm_broadcast_ss(a);
28109   #else
28110     return simde_mm_set1_ps(*a);
28111   #endif
28112 }
28113 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
28114   #undef _mm_broadcast_ss
28115   #define _mm_broadcast_ss(mem_addr) simde_mm_broadcast_ss(mem_addr)
28116 #endif
28117 
28118 SIMDE_FUNCTION_ATTRIBUTES
28119 simde__m256
28120 simde_mm256_broadcast_ss (simde_float32 const * a) {
28121   #if defined(SIMDE_X86_AVX_NATIVE)
28122     return _mm256_broadcast_ss(a);
28123   #else
28124     return simde_mm256_set1_ps(*a);
28125   #endif
28126 }
28127 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
28128   #undef _mm256_broadcast_ss
28129   #define _mm256_broadcast_ss(mem_addr) simde_mm256_broadcast_ss(mem_addr)
28130 #endif
28131 
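/* The cast*128*256 functions reinterpret a 128-bit vector as the low half of a
 * 256-bit vector; only the low half is written, so the upper half is undefined,
 * matching the native intrinsics. The *256*128 directions simply return the low half. */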
28132 SIMDE_FUNCTION_ATTRIBUTES
28133 simde__m256d
28134 simde_mm256_castpd128_pd256 (simde__m128d a) {
28135   #if defined(SIMDE_X86_AVX_NATIVE)
28136     return _mm256_castpd128_pd256(a);
28137   #else
28138     simde__m256d_private r_;
28139     simde__m128d_private a_ = simde__m128d_to_private(a);
28140 
28141     r_.m128d_private[0] = a_;
28142 
28143     return simde__m256d_from_private(r_);
28144   #endif
28145 }
28146 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
28147   #undef _mm256_castpd128_pd256
28148   #define _mm256_castpd128_pd256(a) simde_mm256_castpd128_pd256(a)
28149 #endif
28150 
28151 SIMDE_FUNCTION_ATTRIBUTES
28152 simde__m128d
28153 simde_mm256_castpd256_pd128 (simde__m256d a) {
28154   #if defined(SIMDE_X86_AVX_NATIVE)
28155     return _mm256_castpd256_pd128(a);
28156   #else
28157     simde__m256d_private a_ = simde__m256d_to_private(a);
28158     return a_.m128d[0];
28159   #endif
28160 }
28161 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
28162   #undef _mm256_castpd256_pd128
28163   #define _mm256_castpd256_pd128(a) simde_mm256_castpd256_pd128(a)
28164 #endif
28165 
28166 SIMDE_FUNCTION_ATTRIBUTES
28167 simde__m256
28168 simde_mm256_castps128_ps256 (simde__m128 a) {
28169   #if defined(SIMDE_X86_AVX_NATIVE)
28170     return _mm256_castps128_ps256(a);
28171   #else
28172     simde__m256_private r_;
28173     simde__m128_private a_ = simde__m128_to_private(a);
28174 
28175     r_.m128_private[0] = a_;
28176 
28177     return simde__m256_from_private(r_);
28178   #endif
28179 }
28180 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
28181   #undef _mm256_castps128_ps256
28182   #define _mm256_castps128_ps256(a) simde_mm256_castps128_ps256(a)
28183 #endif
28184 
28185 SIMDE_FUNCTION_ATTRIBUTES
28186 simde__m128
28187 simde_mm256_castps256_ps128 (simde__m256 a) {
28188   #if defined(SIMDE_X86_AVX_NATIVE)
28189     return _mm256_castps256_ps128(a);
28190   #else
28191     simde__m256_private a_ = simde__m256_to_private(a);
28192     return a_.m128[0];
28193   #endif
28194 }
28195 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
28196   #undef _mm256_castps256_ps128
28197   #define _mm256_castps256_ps128(a) simde_mm256_castps256_ps128(a)
28198 #endif
28199 
28200 SIMDE_FUNCTION_ATTRIBUTES
28201 simde__m256i
28202 simde_mm256_castsi128_si256 (simde__m128i a) {
28203   #if defined(SIMDE_X86_AVX_NATIVE)
28204     return _mm256_castsi128_si256(a);
28205   #else
28206     simde__m256i_private r_;
28207     simde__m128i_private a_ = simde__m128i_to_private(a);
28208 
28209     r_.m128i_private[0] = a_;
28210 
28211     return simde__m256i_from_private(r_);
28212   #endif
28213 }
28214 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
28215   #undef _mm256_castsi128_si256
28216   #define _mm256_castsi128_si256(a) simde_mm256_castsi128_si256(a)
28217 #endif
28218 
28219 SIMDE_FUNCTION_ATTRIBUTES
28220 simde__m128i
28221 simde_mm256_castsi256_si128 (simde__m256i a) {
28222   #if defined(SIMDE_X86_AVX_NATIVE)
28223     return _mm256_castsi256_si128(a);
28224   #else
28225     simde__m256i_private a_ = simde__m256i_to_private(a);
28226     return a_.m128i[0];
28227   #endif
28228 }
28229 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
28230   #undef _mm256_castsi256_si128
28231   #define _mm256_castsi256_si128(a) simde_mm256_castsi256_si128(a)
28232 #endif
28233 
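/* Rounding-mode dispatch: SIMDE_MM_FROUND_NO_EXC is masked off since floating-point
 * exceptions are not modeled, and each supported mode maps to the matching libm-style
 * rounding function when it is available. */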
28234 SIMDE_FUNCTION_ATTRIBUTES
28235 simde__m256
28236 simde_mm256_round_ps (simde__m256 a, const int rounding) {
28237   simde__m256_private
28238     r_,
28239     a_ = simde__m256_to_private(a);
28240 
28241   switch (rounding & ~SIMDE_MM_FROUND_NO_EXC) {
28242     #if defined(simde_math_nearbyintf)
28243       case SIMDE_MM_FROUND_CUR_DIRECTION:
28244         for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
28245           r_.f32[i] = simde_math_nearbyintf(a_.f32[i]);
28246         }
28247         break;
28248     #endif
28249 
28250     #if defined(simde_math_roundf)
28251       case SIMDE_MM_FROUND_TO_NEAREST_INT:
28252         for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
28253           r_.f32[i] = simde_math_roundf(a_.f32[i]);
28254         }
28255         break;
28256     #endif
28257 
28258     #if defined(simde_math_floorf)
28259       case SIMDE_MM_FROUND_TO_NEG_INF:
28260         for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
28261           r_.f32[i] = simde_math_floorf(a_.f32[i]);
28262         }
28263         break;
28264     #endif
28265 
28266     #if defined(simde_math_ceilf)
28267       case SIMDE_MM_FROUND_TO_POS_INF:
28268         for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
28269           r_.f32[i] = simde_math_ceilf(a_.f32[i]);
28270         }
28271         break;
28272     #endif
28273 
28274     #if defined(simde_math_truncf)
28275       case SIMDE_MM_FROUND_TO_ZERO:
28276         for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
28277           r_.f32[i] = simde_math_truncf(a_.f32[i]);
28278         }
28279         break;
28280     #endif
28281 
28282     default:
28283       HEDLEY_UNREACHABLE_RETURN(simde_mm256_undefined_ps());
28284   }
28285 
28286   return simde__m256_from_private(r_);
28287 }
28288 #if defined(SIMDE_X86_AVX_NATIVE)
28289 #  define simde_mm256_round_ps(a, rounding) _mm256_round_ps(a, rounding)
28290 #endif
28291 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
28292   #undef _mm256_round_ps
28293   #define _mm256_round_ps(a, rounding) simde_mm256_round_ps(a, rounding)
28294 #endif
28295 
28296 SIMDE_FUNCTION_ATTRIBUTES
28297 simde__m256d
28298 simde_mm256_round_pd (simde__m256d a, const int rounding) {
28299   simde__m256d_private
28300     r_,
28301     a_ = simde__m256d_to_private(a);
28302 
28303   switch (rounding & ~SIMDE_MM_FROUND_NO_EXC) {
28304     #if defined(simde_math_nearbyint)
28305       case SIMDE_MM_FROUND_CUR_DIRECTION:
28306         for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
28307           r_.f64[i] = simde_math_nearbyint(a_.f64[i]);
28308         }
28309         break;
28310     #endif
28311 
28312     #if defined(simde_math_round)
28313       case SIMDE_MM_FROUND_TO_NEAREST_INT:
28314         for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
28315           r_.f64[i] = simde_math_round(a_.f64[i]);
28316         }
28317         break;
28318     #endif
28319 
28320     #if defined(simde_math_floor)
28321       case SIMDE_MM_FROUND_TO_NEG_INF:
28322         for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
28323           r_.f64[i] = simde_math_floor(a_.f64[i]);
28324         }
28325         break;
28326     #endif
28327 
28328     #if defined(simde_math_ceil)
28329       case SIMDE_MM_FROUND_TO_POS_INF:
28330         for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
28331           r_.f64[i] = simde_math_ceil(a_.f64[i]);
28332         }
28333         break;
28334     #endif
28335 
28336     #if defined(simde_math_trunc)
28337       case SIMDE_MM_FROUND_TO_ZERO:
28338         for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
28339           r_.f64[i] = simde_math_trunc(a_.f64[i]);
28340         }
28341         break;
28342     #endif
28343 
28344     default:
28345       HEDLEY_UNREACHABLE_RETURN(simde_mm256_undefined_pd());
28346   }
28347 
28348   return simde__m256d_from_private(r_);
28349 }
28350 #if defined(SIMDE_X86_AVX_NATIVE)
28351 #  define simde_mm256_round_pd(a, rounding) _mm256_round_pd(a, rounding)
28352 #endif
28353 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
28354   #undef _mm256_round_pd
28355   #define _mm256_round_pd(a, rounding) simde_mm256_round_pd(a, rounding)
28356 #endif
28357 
28358 SIMDE_FUNCTION_ATTRIBUTES
28359 simde__m256d
28360 simde_mm256_ceil_pd (simde__m256d a) {
28361   return simde_mm256_round_pd(a, SIMDE_MM_FROUND_TO_POS_INF);
28362 }
28363 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
28364   #undef _mm256_ceil_pd
28365   #define _mm256_ceil_pd(a) simde_mm256_ceil_pd(a)
28366 #endif
28367 
28368 SIMDE_FUNCTION_ATTRIBUTES
28369 simde__m256
28370 simde_mm256_ceil_ps (simde__m256 a) {
28371   return simde_mm256_round_ps(a, SIMDE_MM_FROUND_TO_POS_INF);
28372 }
28373 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
28374   #undef _mm256_ceil_ps
28375   #define _mm256_ceil_ps(a) simde_mm256_ceil_ps(a)
28376 #endif
28377 
28378 HEDLEY_DIAGNOSTIC_PUSH
28379 SIMDE_DIAGNOSTIC_DISABLE_FLOAT_EQUAL
28380 
28381 /* This implementation does not support signaling NaNs (yet?) */
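/* The 32 AVX comparison predicates collapse onto a small set of SSE comparisons
 * (plus all-zeros / all-ones for the FALSE/TRUE predicates); ordered/unordered and
 * signaling/quiet variants of the same relation share a case below. */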
28382 SIMDE_FUNCTION_ATTRIBUTES
28383 simde__m128d
28384 simde_mm_cmp_pd (simde__m128d a, simde__m128d b, const int imm8)
28385     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 31) {
28386   switch (imm8) {
28387     case SIMDE_CMP_EQ_OQ:
28388     case SIMDE_CMP_EQ_UQ:
28389     case SIMDE_CMP_EQ_OS:
28390     case SIMDE_CMP_EQ_US:
28391       return simde_mm_cmpeq_pd(a, b);
28392       break;
28393     case SIMDE_CMP_LT_OS:
28394     case SIMDE_CMP_NGE_US:
28395     case SIMDE_CMP_LT_OQ:
28396     case SIMDE_CMP_NGE_UQ:
28397       return simde_mm_cmplt_pd(a, b);
28398       break;
28399     case SIMDE_CMP_LE_OS:
28400     case SIMDE_CMP_NGT_US:
28401     case SIMDE_CMP_LE_OQ:
28402     case SIMDE_CMP_NGT_UQ:
28403       return simde_mm_cmple_pd(a, b);
28404       break;
28405     case SIMDE_CMP_NEQ_UQ:
28406     case SIMDE_CMP_NEQ_OQ:
28407     case SIMDE_CMP_NEQ_US:
28408     case SIMDE_CMP_NEQ_OS:
28409       return simde_mm_cmpneq_pd(a, b);
28410       break;
28411     case SIMDE_CMP_NLT_US:
28412     case SIMDE_CMP_GE_OS:
28413     case SIMDE_CMP_NLT_UQ:
28414     case SIMDE_CMP_GE_OQ:
28415       return simde_mm_cmpge_pd(a, b);
28416       break;
28417     case SIMDE_CMP_NLE_US:
28418     case SIMDE_CMP_GT_OS:
28419     case SIMDE_CMP_NLE_UQ:
28420     case SIMDE_CMP_GT_OQ:
28421       return simde_mm_cmpgt_pd(a, b);
28422       break;
28423     case SIMDE_CMP_FALSE_OQ:
28424     case SIMDE_CMP_FALSE_OS:
28425       return simde_mm_setzero_pd();
28426       break;
28427     case SIMDE_CMP_TRUE_UQ:
28428     case SIMDE_CMP_TRUE_US:
28429       return simde_x_mm_setone_pd();
28430       break;
28431     case SIMDE_CMP_UNORD_Q:
28432     case SIMDE_CMP_UNORD_S:
28433       return simde_mm_cmpunord_pd(a, b);
28434       break;
28435     case SIMDE_CMP_ORD_Q:
28436     case SIMDE_CMP_ORD_S:
28437       return simde_mm_cmpord_pd(a, b);
28438       break;
28439   }
28440 
28441   HEDLEY_UNREACHABLE_RETURN(simde_mm_setzero_pd());
28442 }
28443 #if defined(SIMDE_X86_AVX_NATIVE) && (!defined(__clang__) || !defined(__AVX512F__))
28444 #  define simde_mm_cmp_pd(a, b, imm8) _mm_cmp_pd(a, b, imm8)
28445 #endif
28446 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
28447   #undef _mm_cmp_pd
28448   #define _mm_cmp_pd(a, b, imm8) simde_mm_cmp_pd(a, b, imm8)
28449 #endif
28450 
28451 SIMDE_FUNCTION_ATTRIBUTES
28452 simde__m128
28453 simde_mm_cmp_ps (simde__m128 a, simde__m128 b, const int imm8)
28454     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 31) {
28455   switch (imm8) {
28456     case SIMDE_CMP_EQ_OQ:
28457     case SIMDE_CMP_EQ_UQ:
28458     case SIMDE_CMP_EQ_OS:
28459     case SIMDE_CMP_EQ_US:
28460       return simde_mm_cmpeq_ps(a, b);
28461       break;
28462     case SIMDE_CMP_LT_OS:
28463     case SIMDE_CMP_NGE_US:
28464     case SIMDE_CMP_LT_OQ:
28465     case SIMDE_CMP_NGE_UQ:
28466       return simde_mm_cmplt_ps(a, b);
28467       break;
28468     case SIMDE_CMP_LE_OS:
28469     case SIMDE_CMP_NGT_US:
28470     case SIMDE_CMP_LE_OQ:
28471     case SIMDE_CMP_NGT_UQ:
28472       return simde_mm_cmple_ps(a, b);
28473       break;
28474     case SIMDE_CMP_NEQ_UQ:
28475     case SIMDE_CMP_NEQ_OQ:
28476     case SIMDE_CMP_NEQ_US:
28477     case SIMDE_CMP_NEQ_OS:
28478       return simde_mm_cmpneq_ps(a, b);
28479       break;
28480     case SIMDE_CMP_NLT_US:
28481     case SIMDE_CMP_GE_OS:
28482     case SIMDE_CMP_NLT_UQ:
28483     case SIMDE_CMP_GE_OQ:
28484       return simde_mm_cmpge_ps(a, b);
28485       break;
28486     case SIMDE_CMP_NLE_US:
28487     case SIMDE_CMP_GT_OS:
28488     case SIMDE_CMP_NLE_UQ:
28489     case SIMDE_CMP_GT_OQ:
28490       return simde_mm_cmpgt_ps(a, b);
28491       break;
28492     case SIMDE_CMP_FALSE_OQ:
28493     case SIMDE_CMP_FALSE_OS:
28494       return simde_mm_setzero_ps();
28495       break;
28496     case SIMDE_CMP_TRUE_UQ:
28497     case SIMDE_CMP_TRUE_US:
28498       return simde_x_mm_setone_ps();
28499       break;
28500     case SIMDE_CMP_UNORD_Q:
28501     case SIMDE_CMP_UNORD_S:
28502       return simde_mm_cmpunord_ps(a, b);
28503       break;
28504     case SIMDE_CMP_ORD_Q:
28505     case SIMDE_CMP_ORD_S:
28506       return simde_mm_cmpord_ps(a, b);
28507       break;
28508   }
28509 
28510   HEDLEY_UNREACHABLE_RETURN(simde_mm_setzero_ps());
28511 }
28512 /* Prior to 9.0 clang has problems with _mm{,256}_cmp_{ps,pd} for all four of the true/false
28513    comparisons, but only when AVX-512 is enabled.  __FILE_NAME__ was added in 9.0, so that's
28514    what we use to check for clang 9 since the version macros are unreliable. */
28515 #if defined(SIMDE_X86_AVX_NATIVE) && (!defined(__clang__) || !defined(__AVX512F__))
28516 #  define simde_mm_cmp_ps(a, b, imm8) _mm_cmp_ps(a, b, imm8)
28517 #endif
28518 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
28519   #undef _mm_cmp_ps
28520   #define _mm_cmp_ps(a, b, imm8) simde_mm_cmp_ps(a, b, imm8)
28521 #endif
28522 
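/* Scalar cmp variants: only element 0 is compared; the remaining elements of the
 * result are copied unchanged from a. */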
28523 SIMDE_FUNCTION_ATTRIBUTES
28524 simde__m128d
28525 simde_mm_cmp_sd (simde__m128d a, simde__m128d b, const int imm8)
28526     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 31) {
28527   simde__m128d_private
28528     r_,
28529     a_ = simde__m128d_to_private(a),
28530     b_ = simde__m128d_to_private(b);
28531 
28532   switch (imm8) {
28533     case SIMDE_CMP_EQ_OQ:
28534       r_.u64[0] = (a_.f64[0] == b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
28535       break;
28536     case SIMDE_CMP_LT_OS:
28537       r_.u64[0] = (a_.f64[0] < b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
28538       break;
28539     case SIMDE_CMP_LE_OS:
28540       r_.u64[0] = (a_.f64[0] <= b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
28541       break;
28542     case SIMDE_CMP_UNORD_Q:
28543 #if defined(simde_math_isnan)
28544       r_.u64[0] = (simde_math_isnan(a_.f64[0]) || simde_math_isnan(b_.f64[0])) ? ~UINT64_C(0) : UINT64_C(0);
28545 #else
28546   HEDLEY_UNREACHABLE();
28547 #endif
28548       break;
28549     case SIMDE_CMP_NEQ_UQ:
28550       r_.u64[0] = (a_.f64[0] != b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
28551       break;
28552     case SIMDE_CMP_NLT_US:
28553       r_.u64[0] = (a_.f64[0] >= b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
28554       break;
28555     case SIMDE_CMP_NLE_US:
28556       r_.u64[0] = (a_.f64[0] > b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
28557       break;
28558     case SIMDE_CMP_ORD_Q:
28559 #if defined(simde_math_isnan)
28560       r_.u64[0] = (!simde_math_isnan(a_.f64[0]) && !simde_math_isnan(b_.f64[0])) ? ~UINT64_C(0) : UINT64_C(0);
28561 #else
28562   HEDLEY_UNREACHABLE();
28563 #endif
28564       break;
28565     case SIMDE_CMP_EQ_UQ:
28566       r_.u64[0] = (a_.f64[0] == b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
28567       break;
28568     case SIMDE_CMP_NGE_US:
28569       r_.u64[0] = (a_.f64[0] < b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
28570       break;
28571     case SIMDE_CMP_NGT_US:
28572       r_.u64[0] = (a_.f64[0] <= b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
28573       break;
28574     case SIMDE_CMP_FALSE_OQ:
28575       r_.u64[0] = UINT64_C(0);
28576       break;
28577     case SIMDE_CMP_NEQ_OQ:
28578       r_.u64[0] = (a_.f64[0] != b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
28579       break;
28580     case SIMDE_CMP_GE_OS:
28581       r_.u64[0] = (a_.f64[0] >= b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
28582       break;
28583     case SIMDE_CMP_GT_OS:
28584       r_.u64[0] = (a_.f64[0] > b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
28585       break;
28586     case SIMDE_CMP_TRUE_UQ:
28587       r_.u64[0] = ~UINT64_C(0);
28588       break;
28589     case SIMDE_CMP_EQ_OS:
28590       r_.u64[0] = (a_.f64[0] == b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
28591       break;
28592     case SIMDE_CMP_LT_OQ:
28593       r_.u64[0] = (a_.f64[0] < b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
28594       break;
28595     case SIMDE_CMP_LE_OQ:
28596       r_.u64[0] = (a_.f64[0] <= b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
28597       break;
28598     case SIMDE_CMP_UNORD_S:
28599 #if defined(simde_math_isnan)
28600       r_.u64[0] = (simde_math_isnan(a_.f64[0]) || simde_math_isnan(b_.f64[0])) ? ~UINT64_C(0) : UINT64_C(0);
28601 #else
28602   HEDLEY_UNREACHABLE();
28603 #endif
28604       break;
28605     case SIMDE_CMP_NEQ_US:
28606       r_.u64[0] = (a_.f64[0] != b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
28607       break;
28608     case SIMDE_CMP_NLT_UQ:
28609       r_.u64[0] = (a_.f64[0] >= b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
28610       break;
28611     case SIMDE_CMP_NLE_UQ:
28612       r_.u64[0] = (a_.f64[0] > b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
28613       break;
28614     case SIMDE_CMP_ORD_S:
28615 #if defined(simde_math_isnan)
28616       r_.u64[0] = (simde_math_isnan(a_.f64[0]) || simde_math_isnan(b_.f64[0])) ? UINT64_C(0) : ~UINT64_C(0);
28617 #else
28618   HEDLEY_UNREACHABLE();
28619 #endif
28620       break;
28621     case SIMDE_CMP_EQ_US:
28622       r_.u64[0] = (a_.f64[0] == b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
28623       break;
28624     case SIMDE_CMP_NGE_UQ:
28625       r_.u64[0] = (a_.f64[0] < b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
28626       break;
28627     case SIMDE_CMP_NGT_UQ:
28628       r_.u64[0] = (a_.f64[0] <= b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
28629       break;
28630     case SIMDE_CMP_FALSE_OS:
28631       r_.u64[0] = UINT64_C(0);
28632       break;
28633     case SIMDE_CMP_NEQ_OS:
28634       r_.u64[0] = (a_.f64[0] != b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
28635       break;
28636     case SIMDE_CMP_GE_OQ:
28637       r_.u64[0] = (a_.f64[0] >= b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
28638       break;
28639     case SIMDE_CMP_GT_OQ:
28640       r_.u64[0] = (a_.f64[0] > b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
28641       break;
28642     case SIMDE_CMP_TRUE_US:
28643       r_.u64[0] = ~UINT64_C(0);
28644       break;
28645   }
28646   r_.u64[1] = a_.u64[1];
28647 
28648   return simde__m128d_from_private(r_);
28649 }
28650 #if defined(SIMDE_X86_AVX_NATIVE)
28651 #  define simde_mm_cmp_sd(a, b, imm8) _mm_cmp_sd(a, b, imm8)
28652 #endif
28653 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
28654   #undef _mm_cmp_sd
28655   #define _mm_cmp_sd(a, b, imm8) simde_mm_cmp_sd(a, b, imm8)
28656 #endif
28657 
28658 SIMDE_FUNCTION_ATTRIBUTES
28659 simde__m128
28660 simde_mm_cmp_ss (simde__m128 a, simde__m128 b, const int imm8)
28661     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 31) {
28662   simde__m128_private
28663     r_,
28664     a_ = simde__m128_to_private(a),
28665     b_ = simde__m128_to_private(b);
28666 
28667   switch (imm8) {
28668     case SIMDE_CMP_EQ_OQ:
28669       r_.u32[0] = (a_.f32[0] == b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0);
28670       break;
28671     case SIMDE_CMP_LT_OS:
28672       r_.u32[0] = (a_.f32[0] < b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0);
28673       break;
28674     case SIMDE_CMP_LE_OS:
28675       r_.u32[0] = (a_.f32[0] <= b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0);
28676       break;
28677     case SIMDE_CMP_UNORD_Q:
28678 #if defined(simde_math_isnanf)
28679       r_.u32[0] = (simde_math_isnanf(a_.f32[0]) || simde_math_isnanf(b_.f32[0])) ? ~UINT32_C(0) : UINT32_C(0);
28680 #else
28681       HEDLEY_UNREACHABLE();
28682 #endif
28683       break;
28684     case SIMDE_CMP_NEQ_UQ:
28685       r_.u32[0] = (a_.f32[0] != b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0);
28686       break;
28687     case SIMDE_CMP_NLT_US:
28688       r_.u32[0] = (a_.f32[0] >= b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0);
28689       break;
28690     case SIMDE_CMP_NLE_US:
28691       r_.u32[0] = (a_.f32[0] > b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0);
28692       break;
28693     case SIMDE_CMP_ORD_Q:
28694 #if defined(simde_math_isnanf)
28695       r_.u32[0] = (!simde_math_isnanf(a_.f32[0]) && !simde_math_isnanf(b_.f32[0])) ? ~UINT32_C(0) : UINT32_C(0);
28696 #else
28697       HEDLEY_UNREACHABLE();
28698 #endif
28699       break;
28700     case SIMDE_CMP_EQ_UQ:
28701       r_.u32[0] = (a_.f32[0] == b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0);
28702       break;
28703     case SIMDE_CMP_NGE_US:
28704       r_.u32[0] = (a_.f32[0] < b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0);
28705       break;
28706     case SIMDE_CMP_NGT_US:
28707       r_.u32[0] = (a_.f32[0] <= b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0);
28708       break;
28709     case SIMDE_CMP_FALSE_OQ:
28710       r_.u32[0] = UINT32_C(0);
28711       break;
28712     case SIMDE_CMP_NEQ_OQ:
28713       r_.u32[0] = (a_.f32[0] != b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0);
28714       break;
28715     case SIMDE_CMP_GE_OS:
28716       r_.u32[0] = (a_.f32[0] >= b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0);
28717       break;
28718     case SIMDE_CMP_GT_OS:
28719       r_.u32[0] = (a_.f32[0] > b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0);
28720       break;
28721     case SIMDE_CMP_TRUE_UQ:
28722       r_.u32[0] = ~UINT32_C(0);
28723       break;
28724     case SIMDE_CMP_EQ_OS:
28725       r_.u32[0] = (a_.f32[0] == b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0);
28726       break;
28727     case SIMDE_CMP_LT_OQ:
28728       r_.u32[0] = (a_.f32[0] < b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0);
28729       break;
28730     case SIMDE_CMP_LE_OQ:
28731       r_.u32[0] = (a_.f32[0] <= b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0);
28732       break;
28733     case SIMDE_CMP_UNORD_S:
28734 #if defined(simde_math_isnanf)
28735       r_.u32[0] = (simde_math_isnanf(a_.f32[0]) || simde_math_isnanf(b_.f32[0])) ? ~UINT32_C(0) : UINT32_C(0);
28736 #else
28737       HEDLEY_UNREACHABLE();
28738 #endif
28739       break;
28740     case SIMDE_CMP_NEQ_US:
28741       r_.u32[0] = (a_.f32[0] != b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0);
28742       break;
28743     case SIMDE_CMP_NLT_UQ:
28744       r_.u32[0] = (a_.f32[0] >= b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0);
28745       break;
28746     case SIMDE_CMP_NLE_UQ:
28747       r_.u32[0] = (a_.f32[0] > b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0);
28748       break;
28749     case SIMDE_CMP_ORD_S:
28750 #if defined(simde_math_isnanf)
28751       r_.u32[0] = (simde_math_isnanf(a_.f32[0]) || simde_math_isnanf(b_.f32[0])) ? UINT32_C(0) : ~UINT32_C(0);
28752 #else
28753       HEDLEY_UNREACHABLE();
28754 #endif
28755       break;
28756     case SIMDE_CMP_EQ_US:
28757       r_.u32[0] = (a_.f32[0] == b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0);
28758       break;
28759     case SIMDE_CMP_NGE_UQ:
28760       r_.u32[0] = (a_.f32[0] < b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0);
28761       break;
28762     case SIMDE_CMP_NGT_UQ:
28763       r_.u32[0] = (a_.f32[0] <= b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0);
28764       break;
28765     case SIMDE_CMP_FALSE_OS:
28766       r_.u32[0] = UINT32_C(0);
28767       break;
28768     case SIMDE_CMP_NEQ_OS:
28769       r_.u32[0] = (a_.f32[0] != b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0);
28770       break;
28771     case SIMDE_CMP_GE_OQ:
28772       r_.u32[0] = (a_.f32[0] >= b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0);
28773       break;
28774     case SIMDE_CMP_GT_OQ:
28775       r_.u32[0] = (a_.f32[0] > b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0);
28776       break;
28777     case SIMDE_CMP_TRUE_US:
28778       r_.u32[0] = ~UINT32_C(0);
28779       break;
28780   }
28781   r_.u32[1] = a_.u32[1];
28782   r_.u32[2] = a_.u32[2];
28783   r_.u32[3] = a_.u32[3];
28784 
28785   return simde__m128_from_private(r_);
28786 }
28787 #if defined(SIMDE_X86_AVX_NATIVE)
28788 #  define simde_mm_cmp_ss(a, b, imm8) _mm_cmp_ss(a, b, imm8)
28789 #endif
28790 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
28791   #undef _mm_cmp_ss
28792   #define _mm_cmp_ss(a, b, imm8) simde_mm_cmp_ss(a, b, imm8)
28793 #endif
28794 
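/* 256-bit cmp: when vector-extension subscripting is available the comparison is done
 * with whole-vector operators; otherwise a per-lane switch handles each predicate. */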
28795 SIMDE_FUNCTION_ATTRIBUTES
28796 simde__m256d
28797 simde_mm256_cmp_pd (simde__m256d a, simde__m256d b, const int imm8)
28798     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 31) {
28799   simde__m256d_private
28800     r_,
28801     a_ = simde__m256d_to_private(a),
28802     b_ = simde__m256d_to_private(b);
28803 
28804 
28805 #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
28806   switch (imm8) {
28807     case SIMDE_CMP_EQ_OQ:
28808       r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 == b_.f64));
28809       break;
28810     case SIMDE_CMP_LT_OS:
28811       r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 < b_.f64));
28812       break;
28813     case SIMDE_CMP_LE_OS:
28814       r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 <= b_.f64));
28815       break;
28816     case SIMDE_CMP_UNORD_Q:
28817 #if defined(simde_math_isnan)
28818       for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
28819         r_.u64[i] = (simde_math_isnan(a_.f64[i]) || simde_math_isnan(b_.f64[i])) ? ~UINT64_C(0) : UINT64_C(0);
28820       }
28821 #else
28822       HEDLEY_UNREACHABLE();
28823 #endif
28824       break;
28825     case SIMDE_CMP_NEQ_UQ:
28826       r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 != b_.f64));
28827       break;
28828     case SIMDE_CMP_NLT_US:
28829       r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 >= b_.f64));
28830       break;
28831     case SIMDE_CMP_NLE_US:
28832       r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 > b_.f64));
28833       break;
28834     case SIMDE_CMP_ORD_Q:
28835 #if defined(simde_math_isnan)
28836       for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
28837         r_.u64[i] = (!simde_math_isnan(a_.f64[i]) && !simde_math_isnan(b_.f64[i])) ? ~UINT64_C(0) : UINT64_C(0);
28838       }
28839 #else
28840       HEDLEY_UNREACHABLE();
28841 #endif
28842       break;
28843     case SIMDE_CMP_EQ_UQ:
28844       r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 == b_.f64));
28845       break;
28846     case SIMDE_CMP_NGE_US:
28847       r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 < b_.f64));
28848       break;
28849     case SIMDE_CMP_NGT_US:
28850       r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 <= b_.f64));
28851       break;
28852     case SIMDE_CMP_FALSE_OQ:
28853       r_ = simde__m256d_to_private(simde_mm256_setzero_pd());
28854       break;
28855     case SIMDE_CMP_NEQ_OQ:
28856       r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 != b_.f64));
28857       break;
28858     case SIMDE_CMP_GE_OS:
28859       r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 >= b_.f64));
28860       break;
28861     case SIMDE_CMP_GT_OS:
28862       r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 > b_.f64));
28863       break;
28864     case SIMDE_CMP_TRUE_UQ:
28865       r_ = simde__m256d_to_private(simde_x_mm256_setone_pd());
28866       break;
28867     case SIMDE_CMP_EQ_OS:
28868       r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 == b_.f64));
28869       break;
28870     case SIMDE_CMP_LT_OQ:
28871       r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 < b_.f64));
28872       break;
28873     case SIMDE_CMP_LE_OQ:
28874       r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 <= b_.f64));
28875       break;
28876     case SIMDE_CMP_UNORD_S:
28877 #if defined(simde_math_isnan)
28878       for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
28879         r_.u64[i] = (simde_math_isnan(a_.f64[i]) || simde_math_isnan(b_.f64[i])) ? ~UINT64_C(0) : UINT64_C(0);
28880       }
28881 #else
28882       HEDLEY_UNREACHABLE();
28883 #endif
28884       break;
28885      case SIMDE_CMP_NEQ_US:
28886       r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 != b_.f64));
28887       break;
28888     case SIMDE_CMP_NLT_UQ:
28889       r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 >= b_.f64));
28890       break;
28891     case SIMDE_CMP_NLE_UQ:
28892       r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 > b_.f64));
28893       break;
28894     case SIMDE_CMP_ORD_S:
28895 #if defined(simde_math_isnan)
28896       for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
28897         r_.u64[i] = (simde_math_isnan(a_.f64[i]) || simde_math_isnan(b_.f64[i])) ? UINT64_C(0) : ~UINT64_C(0);
28898       }
28899 #else
28900       HEDLEY_UNREACHABLE();
28901 #endif
28902       break;
28903     case SIMDE_CMP_EQ_US:
28904       r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 == b_.f64));
28905       break;
28906     case SIMDE_CMP_NGE_UQ:
28907       r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 < b_.f64));
28908       break;
28909     case SIMDE_CMP_NGT_UQ:
28910       r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 <= b_.f64));
28911       break;
28912     case SIMDE_CMP_FALSE_OS:
28913       r_ = simde__m256d_to_private(simde_mm256_setzero_pd());
28914       break;
28915     case SIMDE_CMP_NEQ_OS:
28916       r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 != b_.f64));
28917       break;
28918     case SIMDE_CMP_GE_OQ:
28919       r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 >= b_.f64));
28920       break;
28921     case SIMDE_CMP_GT_OQ:
28922       r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 > b_.f64));
28923       break;
28924     case SIMDE_CMP_TRUE_US:
28925       r_ = simde__m256d_to_private(simde_x_mm256_setone_pd());
28926       break;
28927     default:
28928       HEDLEY_UNREACHABLE();
28929       break;
28930   }
28931 #else
28932   SIMDE_VECTORIZE
28933   for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
28934     switch (imm8) {
28935       case SIMDE_CMP_EQ_OQ:
28936         r_.u64[i] = (a_.f64[i] == b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
28937         break;
28938       case SIMDE_CMP_LT_OS:
28939         r_.u64[i] = (a_.f64[i] < b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
28940         break;
28941       case SIMDE_CMP_LE_OS:
28942         r_.u64[i] = (a_.f64[i] <= b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
28943         break;
28944       case SIMDE_CMP_UNORD_Q:
28945         r_.u64[i] = (simde_math_isnan(a_.f64[i]) || simde_math_isnan(b_.f64[i])) ? ~UINT64_C(0) : UINT64_C(0);
28946         break;
28947       case SIMDE_CMP_NEQ_UQ:
28948         r_.u64[i] = (a_.f64[i] != b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
28949         break;
28950       case SIMDE_CMP_NLT_US:
28951         r_.u64[i] = (a_.f64[i] >= b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
28952         break;
28953       case SIMDE_CMP_NLE_US:
28954         r_.u64[i] = (a_.f64[i] > b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
28955         break;
28956       case SIMDE_CMP_ORD_Q:
28957 #if defined(simde_math_isnan)
28958         r_.u64[i] = (!simde_math_isnan(a_.f64[i]) && !simde_math_isnan(b_.f64[i])) ? ~UINT64_C(0) : UINT64_C(0);
28959 #else
28960         HEDLEY_UNREACHABLE();
28961 #endif
28962         break;
28963       case SIMDE_CMP_EQ_UQ:
28964         r_.u64[i] = (a_.f64[i] == b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
28965         break;
28966       case SIMDE_CMP_NGE_US:
28967         r_.u64[i] = (a_.f64[i] < b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
28968         break;
28969       case SIMDE_CMP_NGT_US:
28970         r_.u64[i] = (a_.f64[i] <= b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
28971         break;
28972       case SIMDE_CMP_FALSE_OQ:
28973         r_.u64[i] = UINT64_C(0);
28974         break;
28975       case SIMDE_CMP_NEQ_OQ:
28976         r_.u64[i] = (a_.f64[i] != b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
28977         break;
28978       case SIMDE_CMP_GE_OS:
28979         r_.u64[i] = (a_.f64[i] >= b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
28980         break;
28981       case SIMDE_CMP_GT_OS:
28982         r_.u64[i] = (a_.f64[i] > b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
28983         break;
28984       case SIMDE_CMP_TRUE_UQ:
28985         r_.u64[i] = ~UINT64_C(0);
28986         break;
28987       case SIMDE_CMP_EQ_OS:
28988         r_.u64[i] = (a_.f64[i] == b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
28989         break;
28990       case SIMDE_CMP_LT_OQ:
28991         r_.u64[i] = (a_.f64[i] < b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
28992         break;
28993       case SIMDE_CMP_LE_OQ:
28994         r_.u64[i] = (a_.f64[i] <= b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
28995         break;
28996       case SIMDE_CMP_UNORD_S:
28997 #if defined(simde_math_isnan)
28998         r_.u64[i] = (simde_math_isnan(a_.f64[i]) || simde_math_isnan(b_.f64[i])) ? ~UINT64_C(0) : UINT64_C(0);
28999 #else
29000         HEDLEY_UNREACHABLE();
29001 #endif
29002         break;
29003       case SIMDE_CMP_NEQ_US:
29004         r_.u64[i] = (a_.f64[i] != b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
29005         break;
29006       case SIMDE_CMP_NLT_UQ:
29007         r_.u64[i] = (a_.f64[i] >= b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
29008         break;
29009       case SIMDE_CMP_NLE_UQ:
29010         r_.u64[i] = (a_.f64[i] > b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
29011         break;
29012       case SIMDE_CMP_ORD_S:
29013 #if defined(simde_math_isnan)
29014         r_.u64[i] = (simde_math_isnan(a_.f64[i]) || simde_math_isnan(b_.f64[i])) ? UINT64_C(0) : ~UINT64_C(0);
29015 #else
29016         HEDLEY_UNREACHABLE();
29017 #endif
29018         break;
29019       case SIMDE_CMP_EQ_US:
29020         r_.u64[i] = (a_.f64[i] == b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
29021         break;
29022       case SIMDE_CMP_NGE_UQ:
29023         r_.u64[i] = (a_.f64[i] < b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
29024         break;
29025       case SIMDE_CMP_NGT_UQ:
29026         r_.u64[i] = (a_.f64[i] <= b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
29027         break;
29028       case SIMDE_CMP_FALSE_OS:
29029         r_.u64[i] = UINT64_C(0);
29030         break;
29031       case SIMDE_CMP_NEQ_OS:
29032         r_.u64[i] = (a_.f64[i] != b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
29033         break;
29034       case SIMDE_CMP_GE_OQ:
29035         r_.u64[i] = (a_.f64[i] >= b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
29036         break;
29037       case SIMDE_CMP_GT_OQ:
29038         r_.u64[i] = (a_.f64[i] > b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
29039         break;
29040       case SIMDE_CMP_TRUE_US:
29041         r_.u64[i] = ~UINT64_C(0);
29042         break;
29043       default:
29044         HEDLEY_UNREACHABLE();
29045         break;
29046     }
29047   }
29048 #endif
29049 
29050   return simde__m256d_from_private(r_);
29051 }
29052 #if defined(SIMDE_X86_AVX_NATIVE) && (!defined(__clang__) || !defined(__AVX512F__))
29053 #  define simde_mm256_cmp_pd(a, b, imm8) _mm256_cmp_pd(a, b, imm8)
29054 #endif
29055 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
29056   #undef _mm256_cmp_pd
29057   #define _mm256_cmp_pd(a, b, imm8) simde_mm256_cmp_pd(a, b, imm8)
29058 #endif
29059 
29060 SIMDE_FUNCTION_ATTRIBUTES
29061 simde__m256
29062 simde_mm256_cmp_ps (simde__m256 a, simde__m256 b, const int imm8)
29063     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 31) {
29064   simde__m256_private
29065     r_,
29066     a_ = simde__m256_to_private(a),
29067     b_ = simde__m256_to_private(b);
29068 
29069 
29070 #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
29071   switch (imm8) {
29072     case SIMDE_CMP_EQ_OQ:
29073       r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), (a_.f32 == b_.f32));
29074       break;
29075     case SIMDE_CMP_LT_OS:
29076       r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), (a_.f32 < b_.f32));
29077       break;
29078     case SIMDE_CMP_LE_OS:
29079       r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), (a_.f32 <= b_.f32));
29080       break;
29081     case SIMDE_CMP_UNORD_Q:
29082 #if defined(simde_math_isnanf)
29083       for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
29084         r_.u32[i] = (simde_math_isnanf(a_.f32[i]) || simde_math_isnanf(b_.f32[i])) ? ~UINT32_C(0) : UINT32_C(0);
29085       }
29086 #else
29087       HEDLEY_UNREACHABLE();
29088 #endif
29089       break;
29090     case SIMDE_CMP_NEQ_UQ:
29091       r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), (a_.f32 != b_.f32));
29092       break;
29093     case SIMDE_CMP_NLT_US:
29094       r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), (a_.f32 >= b_.f32));
29095       break;
29096     case SIMDE_CMP_NLE_US:
29097       r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), (a_.f32 > b_.f32));
29098       break;
29099     case SIMDE_CMP_ORD_Q:
29100 #if defined(simde_math_isnanf)
29101       for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
29102         r_.u32[i] = (!simde_math_isnanf(a_.f32[i]) && !simde_math_isnanf(b_.f32[i])) ? ~UINT32_C(0) : UINT32_C(0);
29103       }
29104 #else
29105       HEDLEY_UNREACHABLE();
29106 #endif
29107       break;
29108     case SIMDE_CMP_EQ_UQ:
29109       r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), (a_.f32 == b_.f32));
29110       break;
29111     case SIMDE_CMP_NGE_US:
29112       r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), (a_.f32 < b_.f32));
29113       break;
29114     case SIMDE_CMP_NGT_US:
29115       r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), (a_.f32 <= b_.f32));
29116       break;
29117     case SIMDE_CMP_FALSE_OQ:
29118       r_ = simde__m256_to_private(simde_mm256_setzero_ps());
29119       break;
29120     case SIMDE_CMP_NEQ_OQ:
29121       r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), (a_.f32 != b_.f32));
29122       break;
29123     case SIMDE_CMP_GE_OS:
29124       r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), (a_.f32 >= b_.f32));
29125       break;
29126     case SIMDE_CMP_GT_OS:
29127       r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), (a_.f32 > b_.f32));
29128       break;
29129     case SIMDE_CMP_TRUE_UQ:
29130       r_ = simde__m256_to_private(simde_x_mm256_setone_ps());
29131       break;
29132     case SIMDE_CMP_EQ_OS:
29133       r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), (a_.f32 == b_.f32));
29134       break;
29135     case SIMDE_CMP_LT_OQ:
29136       r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), (a_.f32 < b_.f32));
29137       break;
29138     case SIMDE_CMP_LE_OQ:
29139       r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), (a_.f32 <= b_.f32));
29140       break;
29141     case SIMDE_CMP_UNORD_S:
29142 #if defined(simde_math_isnanf)
29143       for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
29144         r_.u32[i] = (simde_math_isnanf(a_.f32[i]) || simde_math_isnanf(b_.f32[i])) ? ~UINT32_C(0) : UINT32_C(0);
29145       }
29146 #else
29147       HEDLEY_UNREACHABLE();
29148 #endif
29149       break;
29150     case SIMDE_CMP_NEQ_US:
29151       r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), (a_.f32 != b_.f32));
29152       break;
29153     case SIMDE_CMP_NLT_UQ:
29154       r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), (a_.f32 >= b_.f32));
29155       break;
29156     case SIMDE_CMP_NLE_UQ:
29157       r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), (a_.f32 > b_.f32));
29158       break;
29159     case SIMDE_CMP_ORD_S:
29160 #if defined(simde_math_isnanf)
29161       for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
29162         r_.u32[i] = (simde_math_isnanf(a_.f32[i]) || simde_math_isnanf(b_.f32[i])) ? UINT32_C(0) : ~UINT32_C(0);
29163       }
29164 #else
29165       HEDLEY_UNREACHABLE();
29166 #endif
29167       break;
29168     case SIMDE_CMP_EQ_US:
29169       r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), (a_.f32 == b_.f32));
29170       break;
29171     case SIMDE_CMP_NGE_UQ:
29172       r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), (a_.f32 < b_.f32));
29173       break;
29174     case SIMDE_CMP_NGT_UQ:
29175       r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), (a_.f32 <= b_.f32));
29176       break;
29177     case SIMDE_CMP_FALSE_OS:
29178       r_ = simde__m256_to_private(simde_mm256_setzero_ps());
29179       break;
29180     case SIMDE_CMP_NEQ_OS:
29181       r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), (a_.f32 != b_.f32));
29182       break;
29183     case SIMDE_CMP_GE_OQ:
29184       r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), (a_.f32 >= b_.f32));
29185       break;
29186     case SIMDE_CMP_GT_OQ:
29187       r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), (a_.f32 > b_.f32));
29188       break;
29189     case SIMDE_CMP_TRUE_US:
29190       r_ = simde__m256_to_private(simde_x_mm256_setone_ps());
29191       break;
29192     default:
29193       HEDLEY_UNREACHABLE();
29194       break;
29195   }
29196 #else
29197   SIMDE_VECTORIZE
29198   for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
29199     switch (imm8) {
29200       case SIMDE_CMP_EQ_OQ:
29201         r_.u32[i] = (a_.f32[i] == b_.f32[i]) ? ~UINT32_C(0) : UINT32_C(0);
29202         break;
29203       case SIMDE_CMP_LT_OS:
29204         r_.u32[i] = (a_.f32[i] < b_.f32[i]) ? ~UINT32_C(0) : UINT32_C(0);
29205         break;
29206       case SIMDE_CMP_LE_OS:
29207         r_.u32[i] = (a_.f32[i] <= b_.f32[i]) ? ~UINT32_C(0) : UINT32_C(0);
29208         break;
29209       case SIMDE_CMP_UNORD_Q:
29210 #if defined(simde_math_isnanf)
29211         r_.u32[i] = (simde_math_isnanf(a_.f32[i]) || simde_math_isnanf(b_.f32[i])) ? ~UINT32_C(0) : UINT32_C(0);
29212 #else
29213         HEDLEY_UNREACHABLE();
29214 #endif
29215         break;
29216       case SIMDE_CMP_NEQ_UQ:
29217         r_.u32[i] = (a_.f32[i] != b_.f32[i]) ? ~UINT32_C(0) : UINT32_C(0);
29218         break;
29219       case SIMDE_CMP_NLT_US:
29220         r_.u32[i] = (a_.f32[i] >= b_.f32[i]) ? ~UINT32_C(0) : UINT32_C(0);
29221         break;
29222       case SIMDE_CMP_NLE_US:
29223         r_.u32[i] = (a_.f32[i] > b_.f32[i]) ? ~UINT32_C(0) : UINT32_C(0);
29224         break;
29225       case SIMDE_CMP_ORD_Q:
29226 #if defined(simde_math_isnanf)
29227         r_.u32[i] = (!simde_math_isnanf(a_.f32[i]) && !simde_math_isnanf(b_.f32[i])) ? ~UINT32_C(0) : UINT32_C(0);
29228 #else
29229         HEDLEY_UNREACHABLE();
29230 #endif
29231         break;
29232       case SIMDE_CMP_EQ_UQ:
29233         r_.u32[i] = (a_.f32[i] == b_.f32[i]) ? ~UINT32_C(0) : UINT32_C(0);
29234         break;
29235       case SIMDE_CMP_NGE_US:
29236         r_.u32[i] = (a_.f32[i] < b_.f32[i]) ? ~UINT32_C(0) : UINT32_C(0);
29237         break;
29238       case SIMDE_CMP_NGT_US:
29239         r_.u32[i] = (a_.f32[i] <= b_.f32[i]) ? ~UINT32_C(0) : UINT32_C(0);
29240         break;
29241       case SIMDE_CMP_FALSE_OQ:
29242         r_.u32[i] = UINT32_C(0);
29243         break;
29244       case SIMDE_CMP_NEQ_OQ:
29245         r_.u32[i] = (a_.f32[i] != b_.f32[i]) ? ~UINT32_C(0) : UINT32_C(0);
29246         break;
29247       case SIMDE_CMP_GE_OS:
29248         r_.u32[i] = (a_.f32[i] >= b_.f32[i]) ? ~UINT32_C(0) : UINT32_C(0);
29249         break;
29250       case SIMDE_CMP_GT_OS:
29251         r_.u32[i] = (a_.f32[i] > b_.f32[i]) ? ~UINT32_C(0) : UINT32_C(0);
29252         break;
29253       case SIMDE_CMP_TRUE_UQ:
29254         r_.u32[i] = ~UINT32_C(0);
29255         break;
29256       case SIMDE_CMP_EQ_OS:
29257         r_.u32[i] = (a_.f32[i] == b_.f32[i]) ? ~UINT32_C(0) : UINT32_C(0);
29258         break;
29259       case SIMDE_CMP_LT_OQ:
29260         r_.u32[i] = (a_.f32[i] < b_.f32[i]) ? ~UINT32_C(0) : UINT32_C(0);
29261         break;
29262       case SIMDE_CMP_LE_OQ:
29263         r_.u32[i] = (a_.f32[i] <= b_.f32[i]) ? ~UINT32_C(0) : UINT32_C(0);
29264         break;
29265       case SIMDE_CMP_UNORD_S:
29266 #if defined(simde_math_isnanf)
29267         r_.u32[i] = (simde_math_isnanf(a_.f32[i]) || simde_math_isnanf(b_.f32[i])) ? ~UINT32_C(0) : UINT32_C(0);
29268 #else
29269         HEDLEY_UNREACHABLE();
29270 #endif
29271         break;
29272       case SIMDE_CMP_NEQ_US:
29273         r_.u32[i] = (a_.f32[i] != b_.f32[i]) ? ~UINT32_C(0) : UINT32_C(0);
29274         break;
29275       case SIMDE_CMP_NLT_UQ:
29276         r_.u32[i] = (a_.f32[i] >= b_.f32[i]) ? ~UINT32_C(0) : UINT32_C(0);
29277         break;
29278       case SIMDE_CMP_NLE_UQ:
29279         r_.u32[i] = (a_.f32[i] > b_.f32[i]) ? ~UINT32_C(0) : UINT32_C(0);
29280         break;
29281       case SIMDE_CMP_ORD_S:
29282 #if defined(simde_math_isnanf)
29283         r_.u32[i] = (simde_math_isnanf(a_.f32[i]) || simde_math_isnanf(b_.f32[i])) ? UINT32_C(0) : ~UINT32_C(0);
29284 #else
29285         HEDLEY_UNREACHABLE();
29286 #endif
29287         break;
29288       case SIMDE_CMP_EQ_US:
29289         r_.u32[i] = (a_.f32[i] == b_.f32[i]) ? ~UINT32_C(0) : UINT32_C(0);
29290         break;
29291       case SIMDE_CMP_NGE_UQ:
29292         r_.u32[i] = (a_.f32[i] < b_.f32[i]) ? ~UINT32_C(0) : UINT32_C(0);
29293         break;
29294       case SIMDE_CMP_NGT_UQ:
29295         r_.u32[i] = (a_.f32[i] <= b_.f32[i]) ? ~UINT32_C(0) : UINT32_C(0);
29296         break;
29297       case SIMDE_CMP_FALSE_OS:
29298         r_.u32[i] = UINT32_C(0);
29299         break;
29300       case SIMDE_CMP_NEQ_OS:
29301         r_.u32[i] = (a_.f32[i] != b_.f32[i]) ? ~UINT32_C(0) : UINT32_C(0);
29302         break;
29303       case SIMDE_CMP_GE_OQ:
29304         r_.u32[i] = (a_.f32[i] >= b_.f32[i]) ? ~UINT32_C(0) : UINT32_C(0);
29305         break;
29306       case SIMDE_CMP_GT_OQ:
29307         r_.u32[i] = (a_.f32[i] > b_.f32[i]) ? ~UINT32_C(0) : UINT32_C(0);
29308         break;
29309       case SIMDE_CMP_TRUE_US:
29310         r_.u32[i] = ~UINT32_C(0);
29311         break;
29312       default:
29313         HEDLEY_UNREACHABLE();
29314         break;
29315     }
29316   }
29317 #endif
29318 
29319   return simde__m256_from_private(r_);
29320 }
29321 #if defined(SIMDE_X86_AVX_NATIVE) && (!defined(__clang__) || !defined(__AVX512F__))
29322 #  define simde_mm256_cmp_ps(a, b, imm8) _mm256_cmp_ps(a, b, imm8)
29323 #endif
29324 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
29325   #undef _mm256_cmp_ps
29326   #define _mm256_cmp_ps(a, b, imm8) simde_mm256_cmp_ps(a, b, imm8)
29327 #endif
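/* Note on the comparison helpers above: each SIMDE_CMP_* value of imm8 selects
 * one ordered/unordered predicate, and every lane of the result is either all
 * ones or all zeros.  A minimal usage sketch (illustrative only; `x`, `y` and
 * `lt_mask` are hypothetical names):
 *
 *   simde__m256 lt_mask = simde_mm256_cmp_ps(x, y, SIMDE_CMP_LT_OQ);
 *   simde__m256 smaller = simde_mm256_and_ps(lt_mask, x);
 *
 * The lane masks can then be combined with and/andnot operations to select values.
 */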
29328 
29329 SIMDE_FUNCTION_ATTRIBUTES
29330 simde__m256
29331 simde_x_mm256_copysign_ps(simde__m256 dest, simde__m256 src) {
29332   simde__m256_private
29333     r_,
29334     dest_ = simde__m256_to_private(dest),
29335     src_ = simde__m256_to_private(src);
29336 
29337   #if defined(simde_math_copysignf)
29338     SIMDE_VECTORIZE
29339     for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
29340       r_.f32[i] = simde_math_copysignf(dest_.f32[i], src_.f32[i]);
29341     }
29342   #else
29343     simde__m256 sgnbit = simde_mm256_xor_ps(simde_mm256_set1_ps(SIMDE_FLOAT32_C(0.0)), simde_mm256_set1_ps(-SIMDE_FLOAT32_C(0.0)));
29344     return simde_mm256_xor_ps(simde_mm256_and_ps(sgnbit, src), simde_mm256_andnot_ps(sgnbit, dest));
29345   #endif
29346 
29347   return simde__m256_from_private(r_);
29348 }
29349 
29350 SIMDE_FUNCTION_ATTRIBUTES
29351 simde__m256d
29352 simde_x_mm256_copysign_pd(simde__m256d dest, simde__m256d src) {
29353   simde__m256d_private
29354     r_,
29355     dest_ = simde__m256d_to_private(dest),
29356     src_ = simde__m256d_to_private(src);
29357 
29358   #if defined(simde_math_copysign)
29359     SIMDE_VECTORIZE
29360     for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
29361       r_.f64[i] = simde_math_copysign(dest_.f64[i], src_.f64[i]);
29362     }
29363   #else
29364     simde__m256d sgnbit = simde_mm256_xor_pd(simde_mm256_set1_pd(SIMDE_FLOAT64_C(0.0)), simde_mm256_set1_pd(-SIMDE_FLOAT64_C(0.0)));
29365     return simde_mm256_xor_pd(simde_mm256_and_pd(sgnbit, src), simde_mm256_andnot_pd(sgnbit, dest));
29366   #endif
29367 
29368   return simde__m256d_from_private(r_);
29369 }
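/* The copysign fallbacks above rely on the fact that xor-ing +0.0 with -0.0
 * produces a vector whose only set bit per lane is the IEEE-754 sign bit; the
 * result is then assembled as (sign of src) | (magnitude of dest).  A scalar
 * sketch of the same idea (illustrative only, assuming 64-bit doubles):
 *
 *   uint64_t sign_mask = UINT64_C(1) << 63;
 *   uint64_t bits = (src_bits & sign_mask) | (dest_bits & ~sign_mask);
 */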
29370 
29371 HEDLEY_DIAGNOSTIC_POP /* -Wfloat-equal */
29372 
29373 SIMDE_FUNCTION_ATTRIBUTES
29374 simde__m256d
29375 simde_mm256_cvtepi32_pd (simde__m128i a) {
29376   #if defined(SIMDE_X86_AVX_NATIVE)
29377     return _mm256_cvtepi32_pd(a);
29378   #else
29379     simde__m256d_private r_;
29380     simde__m128i_private a_ = simde__m128i_to_private(a);
29381 
29382     SIMDE_VECTORIZE
29383     for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
29384       r_.f64[i] = HEDLEY_STATIC_CAST(simde_float64, a_.i32[i]);
29385     }
29386 
29387     return simde__m256d_from_private(r_);
29388   #endif
29389 }
29390 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
29391   #undef _mm256_cvtepi32_pd
29392   #define _mm256_cvtepi32_pd(a) simde_mm256_cvtepi32_pd(a)
29393 #endif
29394 
29395 SIMDE_FUNCTION_ATTRIBUTES
29396 simde__m256
29397 simde_mm256_cvtepi32_ps (simde__m256i a) {
29398   #if defined(SIMDE_X86_AVX_NATIVE)
29399     return _mm256_cvtepi32_ps(a);
29400   #else
29401     simde__m256_private r_;
29402     simde__m256i_private a_ = simde__m256i_to_private(a);
29403 
29404     SIMDE_VECTORIZE
29405     for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
29406       r_.f32[i] = HEDLEY_STATIC_CAST(simde_float32, a_.i32[i]);
29407     }
29408 
29409     return simde__m256_from_private(r_);
29410   #endif
29411 }
29412 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
29413   #undef _mm256_cvtepi32_ps
29414   #define _mm256_cvtepi32_ps(a) simde_mm256_cvtepi32_ps(a)
29415 #endif
29416 
29417 SIMDE_FUNCTION_ATTRIBUTES
29418 simde__m128i
29419 simde_mm256_cvtpd_epi32 (simde__m256d a) {
29420   #if defined(SIMDE_X86_AVX_NATIVE)
29421     return _mm256_cvtpd_epi32(a);
29422   #else
29423     simde__m128i_private r_;
29424     simde__m256d_private a_ = simde__m256d_to_private(a);
29425 
29426     #if defined(simde_math_nearbyint)
29427       SIMDE_VECTORIZE
29428       for (size_t i = 0 ; i < (sizeof(a_.f64) / sizeof(a_.f64[0])) ; i++) {
29429         r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, simde_math_nearbyint(a_.f64[i]));
29430       }
29431     #else
29432       HEDLEY_UNREACHABLE();
29433     #endif
29434 
29435     return simde__m128i_from_private(r_);
29436   #endif
29437 }
29438 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
29439   #undef _mm256_cvtpd_epi32
29440   #define _mm256_cvtpd_epi32(a) simde_mm256_cvtpd_epi32(a)
29441 #endif
29442 
29443 SIMDE_FUNCTION_ATTRIBUTES
29444 simde__m128
29445 simde_mm256_cvtpd_ps (simde__m256d a) {
29446   #if defined(SIMDE_X86_AVX_NATIVE)
29447     return _mm256_cvtpd_ps(a);
29448   #else
29449     simde__m128_private r_;
29450     simde__m256d_private a_ = simde__m256d_to_private(a);
29451 
29452     SIMDE_VECTORIZE
29453     for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
29454       r_.f32[i] = HEDLEY_STATIC_CAST(simde_float32, a_.f64[i]);
29455     }
29456 
29457     return simde__m128_from_private(r_);
29458   #endif
29459 }
29460 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
29461   #undef _mm256_cvtpd_ps
29462   #define _mm256_cvtpd_ps(a) simde_mm256_cvtpd_ps(a)
29463 #endif
29464 
29465 SIMDE_FUNCTION_ATTRIBUTES
29466 simde__m256i
29467 simde_mm256_cvtps_epi32 (simde__m256 a) {
29468   #if defined(SIMDE_X86_AVX_NATIVE)
29469     return _mm256_cvtps_epi32(a);
29470   #else
29471     simde__m256i_private r_;
29472     simde__m256_private a_ = simde__m256_to_private(a);
29473 
29474     #if defined(simde_math_nearbyintf)
29475       SIMDE_VECTORIZE
29476       for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) {
29477         r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, simde_math_nearbyintf(a_.f32[i]));
29478       }
29479     #else
29480       HEDLEY_UNREACHABLE();
29481     #endif
29482 
29483     return simde__m256i_from_private(r_);
29484   #endif
29485 }
29486 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
29487   #undef _mm256_cvtps_epi32
29488   #define _mm256_cvtps_epi32(a) simde_mm256_cvtps_epi32(a)
29489 #endif
29490 
29491 SIMDE_FUNCTION_ATTRIBUTES
29492 simde__m256d
29493 simde_mm256_cvtps_pd (simde__m128 a) {
29494   #if defined(SIMDE_X86_AVX_NATIVE)
29495     return _mm256_cvtps_pd(a);
29496   #else
29497     simde__m256d_private r_;
29498     simde__m128_private a_ = simde__m128_to_private(a);
29499 
29500     SIMDE_VECTORIZE
29501     for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) {
29502       r_.f64[i] = HEDLEY_STATIC_CAST(double, a_.f32[i]);
29503     }
29504 
29505     return simde__m256d_from_private(r_);
29506   #endif
29507 }
29508 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
29509   #undef _mm256_cvtps_pd
29510   #define _mm256_cvtps_pd(a) simde_mm256_cvtps_pd(a)
29511 #endif
29512 
29513 SIMDE_FUNCTION_ATTRIBUTES
29514 simde_float64
29515 simde_mm256_cvtsd_f64 (simde__m256d a) {
29516   #if defined(SIMDE_X86_AVX_NATIVE) && ( \
29517       SIMDE_DETECT_CLANG_VERSION_CHECK(3,9,0) || \
29518       HEDLEY_GCC_VERSION_CHECK(7,0,0) || \
29519       HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
29520       HEDLEY_MSVC_VERSION_CHECK(19,14,0))
29521     return _mm256_cvtsd_f64(a);
29522   #else
29523     simde__m256d_private a_ = simde__m256d_to_private(a);
29524     return a_.f64[0];
29525   #endif
29526 }
29527 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
29528   #undef _mm256_cvtsd_f64
29529   #define _mm256_cvtsd_f64(a) simde_mm256_cvtsd_f64(a)
29530 #endif
29531 
29532 SIMDE_FUNCTION_ATTRIBUTES
29533 int32_t
29534 simde_mm256_cvtsi256_si32 (simde__m256i a) {
29535   #if defined(SIMDE_X86_AVX_NATIVE) && ( \
29536       SIMDE_DETECT_CLANG_VERSION_CHECK(3,9,0) || \
29537       HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
29538       HEDLEY_MSVC_VERSION_CHECK(19,14,0))
29539     return _mm256_cvtsi256_si32(a);
29540   #else
29541     simde__m256i_private a_ = simde__m256i_to_private(a);
29542     return a_.i32[0];
29543   #endif
29544 }
29545 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
29546   #undef _mm256_cvtsi256_si32
29547   #define _mm256_cvtsi256_si32(a) simde_mm256_cvtsi256_si32(a)
29548 #endif
29549 
29550 SIMDE_FUNCTION_ATTRIBUTES
29551 simde_float32
29552 simde_mm256_cvtss_f32 (simde__m256 a) {
29553   #if defined(SIMDE_X86_AVX_NATIVE) && ( \
29554       SIMDE_DETECT_CLANG_VERSION_CHECK(3,9,0) || \
29555       HEDLEY_GCC_VERSION_CHECK(7,0,0) || \
29556       HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
29557       HEDLEY_MSVC_VERSION_CHECK(19,14,0))
29558     return _mm256_cvtss_f32(a);
29559   #else
29560     simde__m256_private a_ = simde__m256_to_private(a);
29561     return a_.f32[0];
29562   #endif
29563 }
29564 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
29565   #undef _mm256_cvtss_f32
29566   #define _mm256_cvtss_f32(a) simde_mm256_cvtss_f32(a)
29567 #endif
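/* simde_mm256_cvtsd_f64, simde_mm256_cvtsi256_si32 and simde_mm256_cvtss_f32
 * simply return element 0 of the vector; the native intrinsics are only used
 * on compilers known to provide them.  Illustrative sketch (hypothetical
 * variable names):
 *
 *   simde__m256 v = simde_mm256_set1_ps(SIMDE_FLOAT32_C(3.0));
 *   simde_float32 first = simde_mm256_cvtss_f32(v);   // 3.0f
 */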
29568 
29569 
29570 SIMDE_FUNCTION_ATTRIBUTES
29571 simde__m128i
29572 simde_mm256_cvttpd_epi32 (simde__m256d a) {
29573   #if defined(SIMDE_X86_AVX_NATIVE)
29574     return _mm256_cvttpd_epi32(a);
29575   #else
29576     simde__m128i_private r_;
29577     simde__m256d_private a_ = simde__m256d_to_private(a);
29578 
29579     #if defined(simde_math_trunc)
29580       SIMDE_VECTORIZE
29581       for (size_t i = 0 ; i < (sizeof(a_.f64) / sizeof(a_.f64[0])) ; i++) {
29582         r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, simde_math_trunc(a_.f64[i]));
29583       }
29584     #else
29585       HEDLEY_UNREACHABLE();
29586     #endif
29587 
29588     return simde__m128i_from_private(r_);
29589   #endif
29590 }
29591 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
29592   #undef _mm256_cvttpd_epi32
29593   #define _mm256_cvttpd_epi32(a) simde_mm256_cvttpd_epi32(a)
29594 #endif
29595 
29596 SIMDE_FUNCTION_ATTRIBUTES
29597 simde__m256i
29598 simde_mm256_cvttps_epi32 (simde__m256 a) {
29599   #if defined(SIMDE_X86_AVX_NATIVE)
29600     return _mm256_cvttps_epi32(a);
29601   #else
29602     simde__m256i_private r_;
29603     simde__m256_private a_ = simde__m256_to_private(a);
29604 
29605     #if defined(simde_math_truncf)
29606       SIMDE_VECTORIZE
29607       for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) {
29608         r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, simde_math_truncf(a_.f32[i]));
29609       }
29610     #else
29611       HEDLEY_UNREACHABLE();
29612     #endif
29613 
29614     return simde__m256i_from_private(r_);
29615   #endif
29616 }
29617 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
29618   #undef _mm256_cvttps_epi32
29619   #define _mm256_cvttps_epi32(a) simde_mm256_cvttps_epi32(a)
29620 #endif
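/* Conversion note: the cvtpd_epi32/cvtps_epi32 fallbacks round to the nearest
 * integer via simde_math_nearbyint*, while the "tt" variants truncate toward
 * zero via simde_math_trunc*.  Illustrative sketch (hypothetical names,
 * assuming the default round-to-nearest mode):
 *
 *   simde__m256 v = simde_mm256_set1_ps(SIMDE_FLOAT32_C(1.7));
 *   simde__m256i rounded   = simde_mm256_cvtps_epi32(v);   // every lane == 2
 *   simde__m256i truncated = simde_mm256_cvttps_epi32(v);  // every lane == 1
 */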
29621 
29622 SIMDE_FUNCTION_ATTRIBUTES
29623 simde__m256
29624 simde_mm256_div_ps (simde__m256 a, simde__m256 b) {
29625   #if defined(SIMDE_X86_AVX_NATIVE)
29626     return _mm256_div_ps(a, b);
29627   #else
29628     simde__m256_private
29629       r_,
29630       a_ = simde__m256_to_private(a),
29631       b_ = simde__m256_to_private(b);
29632 
29633     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
29634       r_.m128[0] = simde_mm_div_ps(a_.m128[0], b_.m128[0]);
29635       r_.m128[1] = simde_mm_div_ps(a_.m128[1], b_.m128[1]);
29636     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
29637       r_.f32 = a_.f32 / b_.f32;
29638     #else
29639       SIMDE_VECTORIZE
29640       for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
29641         r_.f32[i] = a_.f32[i] / b_.f32[i];
29642       }
29643     #endif
29644 
29645     return simde__m256_from_private(r_);
29646   #endif
29647 }
29648 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
29649   #undef _mm256_div_ps
29650   #define _mm256_div_ps(a, b) simde_mm256_div_ps(a, b)
29651 #endif
29652 
29653 SIMDE_FUNCTION_ATTRIBUTES
29654 simde__m256d
29655 simde_mm256_div_pd (simde__m256d a, simde__m256d b) {
29656   #if defined(SIMDE_X86_AVX_NATIVE)
29657     return _mm256_div_pd(a, b);
29658   #else
29659     simde__m256d_private
29660       r_,
29661       a_ = simde__m256d_to_private(a),
29662       b_ = simde__m256d_to_private(b);
29663 
29664     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
29665       r_.m128d[0] = simde_mm_div_pd(a_.m128d[0], b_.m128d[0]);
29666       r_.m128d[1] = simde_mm_div_pd(a_.m128d[1], b_.m128d[1]);
29667     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
29668       r_.f64 = a_.f64 / b_.f64;
29669     #else
29670       SIMDE_VECTORIZE
29671       for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
29672         r_.f64[i] = a_.f64[i] / b_.f64[i];
29673       }
29674     #endif
29675 
29676     return simde__m256d_from_private(r_);
29677   #endif
29678 }
29679 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
29680   #undef _mm256_div_pd
29681   #define _mm256_div_pd(a, b) simde_mm256_div_pd(a, b)
29682 #endif
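/* When SIMDE_NATURAL_VECTOR_SIZE_LE(128), the 256-bit arithmetic above is
 * carried out as two independent 128-bit operations on the m128/m128d halves
 * of the private union; otherwise GCC/Clang vector extensions or a scalar
 * loop are used.  The same three-tier pattern recurs throughout this header.
 */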
29683 
29684 SIMDE_FUNCTION_ATTRIBUTES
29685 simde__m128d
29686 simde_mm256_extractf128_pd (simde__m256d a, const int imm8)
29687     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 1) {
29688   simde__m256d_private a_ = simde__m256d_to_private(a);
29689   return a_.m128d[imm8];
29690 }
29691 #if defined(SIMDE_X86_AVX_NATIVE)
29692 #  define simde_mm256_extractf128_pd(a, imm8) _mm256_extractf128_pd(a, imm8)
29693 #endif
29694 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
29695   #undef _mm256_extractf128_pd
29696   #define _mm256_extractf128_pd(a, imm8) simde_mm256_extractf128_pd(a, imm8)
29697 #endif
29698 
29699 SIMDE_FUNCTION_ATTRIBUTES
29700 simde__m128
29701 simde_mm256_extractf128_ps (simde__m256 a, const int imm8)
29702     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 1) {
29703   simde__m256_private a_ = simde__m256_to_private(a);
29704   return a_.m128[imm8];
29705 }
29706 #if defined(SIMDE_X86_AVX_NATIVE)
29707 #  define simde_mm256_extractf128_ps(a, imm8) _mm256_extractf128_ps(a, imm8)
29708 #endif
29709 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
29710   #undef _mm256_extractf128_ps
29711   #define _mm256_extractf128_ps(a, imm8) simde_mm256_extractf128_ps(a, imm8)
29712 #endif
29713 
29714 SIMDE_FUNCTION_ATTRIBUTES
29715 simde__m128i
29716 simde_mm256_extractf128_si256 (simde__m256i a, const int imm8)
29717     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 1) {
29718   simde__m256i_private a_ = simde__m256i_to_private(a);
29719   return a_.m128i[imm8];
29720 }
29721 #if defined(SIMDE_X86_AVX_NATIVE)
29722 #  define simde_mm256_extractf128_si256(a, imm8) _mm256_extractf128_si256(a, imm8)
29723 #endif
29724 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
29725   #undef _mm256_extractf128_si256
29726   #define _mm256_extractf128_si256(a, imm8) simde_mm256_extractf128_si256(a, imm8)
29727 #endif
29728 
29729 SIMDE_FUNCTION_ATTRIBUTES
29730 simde__m256d
29731 simde_mm256_floor_pd (simde__m256d a) {
29732   return simde_mm256_round_pd(a, SIMDE_MM_FROUND_TO_NEG_INF);
29733 }
29734 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
29735   #undef _mm256_floor_pd
29736   #define _mm256_floor_pd(a) simde_mm256_floor_pd(a)
29737 #endif
29738 
29739 SIMDE_FUNCTION_ATTRIBUTES
29740 simde__m256
29741 simde_mm256_floor_ps (simde__m256 a) {
29742   return simde_mm256_round_ps(a, SIMDE_MM_FROUND_TO_NEG_INF);
29743 }
29744 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
29745   #undef _mm256_floor_ps
29746   #define _mm256_floor_ps(a) simde_mm256_floor_ps(a)
29747 #endif
29748 
29749 SIMDE_FUNCTION_ATTRIBUTES
29750 simde__m256i
29751 simde_mm256_insert_epi8 (simde__m256i a, int8_t i, const int index)
29752     SIMDE_REQUIRE_RANGE(index, 0, 31) {
29753   simde__m256i_private a_ = simde__m256i_to_private(a);
29754 
29755   a_.i8[index] = i;
29756 
29757   return simde__m256i_from_private(a_);
29758 }
29759 #if defined(SIMDE_X86_AVX_NATIVE)
29760   #define simde_mm256_insert_epi8(a, i, index) _mm256_insert_epi8(a, i, index)
29761 #endif
29762 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
29763   #undef _mm256_insert_epi8
29764   #define _mm256_insert_epi8(a, i, index) simde_mm256_insert_epi8(a, i, index)
29765 #endif
29766 
29767 SIMDE_FUNCTION_ATTRIBUTES
29768 simde__m256i
29769 simde_mm256_insert_epi16 (simde__m256i a, int16_t i, const int index)
29770     SIMDE_REQUIRE_RANGE(index, 0, 15)  {
29771   simde__m256i_private a_ = simde__m256i_to_private(a);
29772 
29773   a_.i16[index] = i;
29774 
29775   return simde__m256i_from_private(a_);
29776 }
29777 #if defined(SIMDE_X86_AVX_NATIVE)
29778   #define simde_mm256_insert_epi16(a, i, index) _mm256_insert_epi16(a, i, index)
29779 #endif
29780 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
29781   #undef _mm256_insert_epi16
29782   #define _mm256_insert_epi16(a, i, imm8) simde_mm256_insert_epi16(a, i, imm8)
29783 #endif
29784 
29785 SIMDE_FUNCTION_ATTRIBUTES
29786 simde__m256i
29787 simde_mm256_insert_epi32 (simde__m256i a, int32_t i, const int index)
29788     SIMDE_REQUIRE_RANGE(index, 0, 7)  {
29789   simde__m256i_private a_ = simde__m256i_to_private(a);
29790 
29791   a_.i32[index] = i;
29792 
29793   return simde__m256i_from_private(a_);
29794 }
29795 #if defined(SIMDE_X86_AVX_NATIVE)
29796   #define simde_mm256_insert_epi32(a, i, index) _mm256_insert_epi32(a, i, index)
29797 #endif
29798 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
29799   #undef _mm256_insert_epi32
29800   #define _mm256_insert_epi32(a, i, index) simde_mm256_insert_epi32(a, i, index)
29801 #endif
29802 
29803 SIMDE_FUNCTION_ATTRIBUTES
29804 simde__m256i
29805 simde_mm256_insert_epi64 (simde__m256i a, int64_t i, const int index)
29806     SIMDE_REQUIRE_RANGE(index, 0, 3)  {
29807   simde__m256i_private a_ = simde__m256i_to_private(a);
29808 
29809   a_.i64[index] = i;
29810 
29811   return simde__m256i_from_private(a_);
29812 }
29813 #if defined(SIMDE_X86_AVX_NATIVE) && defined(SIMDE_ARCH_AMD64) && \
29814     (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) && \
29815     SIMDE_DETECT_CLANG_VERSION_CHECK(3,7,0)
29816   #define simde_mm256_insert_epi64(a, i, index) _mm256_insert_epi64(a, i, index)
29817 #endif
29818 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_AMD64))
29819   #undef _mm256_insert_epi64
29820   #define _mm256_insert_epi64(a, i, index) simde_mm256_insert_epi64(a, i, index)
29821 #endif
29822 
29823 SIMDE_FUNCTION_ATTRIBUTES
29824 simde__m256d simde_mm256_insertf128_pd(simde__m256d a, simde__m128d b, int imm8)
29825     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 1) {
29826   simde__m256d_private a_ = simde__m256d_to_private(a);
29827   simde__m128d_private b_ = simde__m128d_to_private(b);
29828 
29829   a_.m128d_private[imm8] = b_;
29830 
29831   return simde__m256d_from_private(a_);
29832 }
29833 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
29834   #undef _mm256_insertf128_pd
29835   #define _mm256_insertf128_pd(a, b, imm8) simde_mm256_insertf128_pd(a, b, imm8)
29836 #endif
29837 
29838 SIMDE_FUNCTION_ATTRIBUTES
29839 simde__m256 simde_mm256_insertf128_ps(simde__m256 a, simde__m128 b, int imm8)
29840     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 1) {
29841   simde__m256_private a_ = simde__m256_to_private(a);
29842   simde__m128_private b_ = simde__m128_to_private(b);
29843 
29844   a_.m128_private[imm8] = b_;
29845 
29846   return simde__m256_from_private(a_);
29847 }
29848 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
29849   #undef _mm256_insertf128_ps
29850   #define _mm256_insertf128_ps(a, b, imm8) simde_mm256_insertf128_ps(a, b, imm8)
29851 #endif
29852 
29853 SIMDE_FUNCTION_ATTRIBUTES
29854 simde__m256i simde_mm256_insertf128_si256(simde__m256i a, simde__m128i b, int imm8)
29855     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 1) {
29856   simde__m256i_private a_ = simde__m256i_to_private(a);
29857   simde__m128i_private b_ = simde__m128i_to_private(b);
29858 
29859   a_.m128i_private[imm8] = b_;
29860 
29861   return simde__m256i_from_private(a_);
29862 }
29863 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
29864   #undef _mm256_insertf128_si256
29865   #define _mm256_insertf128_si256(a, b, imm8) simde_mm256_insertf128_si256(a, b, imm8)
29866 #endif
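/* The extractf128/insertf128 family simply reads or writes one 128-bit half of
 * the private union; imm8 selects the low (0) or high (1) lane.  Illustrative
 * round trip (hypothetical name `v`):
 *
 *   simde__m128d lo = simde_mm256_extractf128_pd(v, 0);
 *   v = simde_mm256_insertf128_pd(v, lo, 1);   // copy low lane into high lane
 */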
29867 
29868 #if defined(SIMDE_X86_AVX_NATIVE)
29869 #  define simde_mm256_dp_ps(a, b, imm8) _mm256_dp_ps(a, b, imm8)
29870 #else
29871 #  define simde_mm256_dp_ps(a, b, imm8) \
29872     simde_mm256_set_m128( \
29873       simde_mm_dp_ps(simde_mm256_extractf128_ps(a, 1), simde_mm256_extractf128_ps(b, 1), imm8), \
29874       simde_mm_dp_ps(simde_mm256_extractf128_ps(a, 0), simde_mm256_extractf128_ps(b, 0), imm8))
29875 #endif
29876 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
29877   #undef _mm256_dp_ps
29878   #define _mm256_dp_ps(a, b, imm8) simde_mm256_dp_ps(a, b, imm8)
29879 #endif
29880 
29881 SIMDE_FUNCTION_ATTRIBUTES
29882 int32_t
29883 simde_mm256_extract_epi32 (simde__m256i a, const int index)
29884     SIMDE_REQUIRE_RANGE(index, 0, 7) {
29885   simde__m256i_private a_ = simde__m256i_to_private(a);
29886   return a_.i32[index];
29887 }
29888 #if defined(SIMDE_X86_AVX_NATIVE)
29889   #define simde_mm256_extract_epi32(a, index) _mm256_extract_epi32(a, index)
29890 #endif
29891 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
29892   #undef _mm256_extract_epi32
29893   #define _mm256_extract_epi32(a, index) simde_mm256_extract_epi32(a, index)
29894 #endif
29895 
29896 SIMDE_FUNCTION_ATTRIBUTES
29897 int64_t
29898 simde_mm256_extract_epi64 (simde__m256i a, const int index)
29899     SIMDE_REQUIRE_RANGE(index, 0, 3) {
29900   simde__m256i_private a_ = simde__m256i_to_private(a);
29901   return a_.i64[index];
29902 }
29903 #if defined(SIMDE_X86_AVX_NATIVE) && defined(SIMDE_ARCH_AMD64)
29904   #if !defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)
29905     #define simde_mm256_extract_epi64(a, index) _mm256_extract_epi64(a, index)
29906   #endif
29907 #endif
29908 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_AMD64))
29909   #undef _mm256_extract_epi64
29910   #define _mm256_extract_epi64(a, index) simde_mm256_extract_epi64(a, index)
29911 #endif
29912 
29913 SIMDE_FUNCTION_ATTRIBUTES
29914 simde__m256i
29915 simde_mm256_lddqu_si256 (simde__m256i const * mem_addr) {
29916   #if defined(SIMDE_X86_AVX_NATIVE)
29917     return _mm256_loadu_si256(mem_addr);
29918   #else
29919     simde__m256i r;
29920     simde_memcpy(&r, SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m256i), sizeof(r));
29921     return r;
29922   #endif
29923 }
29924 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
29925   #undef _mm256_lddqu_si256
29926   #define _mm256_lddqu_si256(a) simde_mm256_lddqu_si256(a)
29927 #endif
29928 
29929 SIMDE_FUNCTION_ATTRIBUTES
29930 simde__m256d
29931 simde_mm256_load_pd (const double mem_addr[HEDLEY_ARRAY_PARAM(4)]) {
29932   #if defined(SIMDE_X86_AVX_NATIVE)
29933     return _mm256_load_pd(mem_addr);
29934   #else
29935     simde__m256d r;
29936     simde_memcpy(&r, SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m256d), sizeof(r));
29937     return r;
29938   #endif
29939 }
29940 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
29941   #undef _mm256_load_pd
29942   #define _mm256_load_pd(a) simde_mm256_load_pd(a)
29943 #endif
29944 
29945 SIMDE_FUNCTION_ATTRIBUTES
29946 simde__m256
29947 simde_mm256_load_ps (const float mem_addr[HEDLEY_ARRAY_PARAM(8)]) {
29948   #if defined(SIMDE_X86_AVX_NATIVE)
29949     return _mm256_load_ps(mem_addr);
29950   #else
29951     simde__m256 r;
29952     simde_memcpy(&r, SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m256), sizeof(r));
29953     return r;
29954   #endif
29955 }
29956 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
29957   #undef _mm256_load_ps
29958   #define _mm256_load_ps(a) simde_mm256_load_ps(a)
29959 #endif
29960 
29961 SIMDE_FUNCTION_ATTRIBUTES
29962 simde__m256i
29963 simde_mm256_load_si256 (simde__m256i const * mem_addr) {
29964   #if defined(SIMDE_X86_AVX_NATIVE)
29965     return _mm256_load_si256(mem_addr);
29966   #else
29967     simde__m256i r;
29968     simde_memcpy(&r, SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m256i), sizeof(r));
29969     return r;
29970   #endif
29971 }
29972 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
29973   #undef _mm256_load_si256
29974   #define _mm256_load_si256(a) simde_mm256_load_si256(a)
29975 #endif
29976 
29977 SIMDE_FUNCTION_ATTRIBUTES
29978 simde__m256d
29979 simde_mm256_loadu_pd (const double a[HEDLEY_ARRAY_PARAM(4)]) {
29980   #if defined(SIMDE_X86_AVX_NATIVE)
29981     return _mm256_loadu_pd(a);
29982   #else
29983     simde__m256d r;
29984     simde_memcpy(&r, a, sizeof(r));
29985     return r;
29986   #endif
29987 }
29988 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
29989   #undef _mm256_loadu_pd
29990   #define _mm256_loadu_pd(a) simde_mm256_loadu_pd(a)
29991 #endif
29992 
29993 SIMDE_FUNCTION_ATTRIBUTES
29994 simde__m256
29995 simde_mm256_loadu_ps (const float a[HEDLEY_ARRAY_PARAM(8)]) {
29996   #if defined(SIMDE_X86_AVX_NATIVE)
29997     return _mm256_loadu_ps(a);
29998   #else
29999     simde__m256 r;
30000     simde_memcpy(&r, a, sizeof(r));
30001     return r;
30002   #endif
30003 }
30004 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
30005   #undef _mm256_loadu_ps
30006   #define _mm256_loadu_ps(a) simde_mm256_loadu_ps(a)
30007 #endif
30008 
30009 SIMDE_FUNCTION_ATTRIBUTES
30010 simde__m256i
30011 simde_mm256_loadu_epi8(void const * mem_addr) {
30012   #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) && !defined(SIMDE_BUG_GCC_95483) && !defined(SIMDE_BUG_CLANG_REV_344862)
30013     return _mm256_loadu_epi8(mem_addr);
30014   #elif defined(SIMDE_X86_AVX_NATIVE)
30015     return _mm256_loadu_si256(SIMDE_ALIGN_CAST(__m256i const *, mem_addr));
30016   #else
30017     simde__m256i r;
30018     simde_memcpy(&r, mem_addr, sizeof(r));
30019     return r;
30020   #endif
30021 }
30022 #define simde_x_mm256_loadu_epi8(mem_addr) simde_mm256_loadu_epi8(mem_addr)
30023 #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && (defined(SIMDE_BUG_GCC_95483) || defined(SIMDE_BUG_CLANG_REV_344862)))
30024   #undef _mm256_loadu_epi8
30025   #define _mm256_loadu_epi8(a) simde_mm256_loadu_epi8(a)
30026 #endif
30027 
30028 SIMDE_FUNCTION_ATTRIBUTES
30029 simde__m256i
30030 simde_mm256_loadu_epi16(void const * mem_addr) {
30031   #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) && !defined(SIMDE_BUG_GCC_95483) && !defined(SIMDE_BUG_CLANG_REV_344862)
30032     return _mm256_loadu_epi16(mem_addr);
30033   #elif defined(SIMDE_X86_AVX_NATIVE)
30034     return _mm256_loadu_si256(SIMDE_ALIGN_CAST(__m256i const *, mem_addr));
30035   #else
30036     simde__m256i r;
30037     simde_memcpy(&r, mem_addr, sizeof(r));
30038     return r;
30039   #endif
30040 }
30041 #define simde_x_mm256_loadu_epi16(mem_addr) simde_mm256_loadu_epi16(mem_addr)
30042 #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && (defined(SIMDE_BUG_GCC_95483) || defined(SIMDE_BUG_CLANG_REV_344862)))
30043   #undef _mm256_loadu_epi16
30044   #define _mm256_loadu_epi16(a) simde_mm256_loadu_epi16(a)
30045 #endif
30046 
30047 SIMDE_FUNCTION_ATTRIBUTES
30048 simde__m256i
30049 simde_mm256_loadu_epi32(void const * mem_addr) {
30050   #if defined(SIMDE_X86_AVX512VL_NATIVE) && !defined(SIMDE_BUG_GCC_95483) && !defined(SIMDE_BUG_CLANG_REV_344862)
30051     return _mm256_loadu_epi32(mem_addr);
30052   #elif defined(SIMDE_X86_AVX_NATIVE)
30053     return _mm256_loadu_si256(SIMDE_ALIGN_CAST(__m256i const *, mem_addr));
30054   #else
30055     simde__m256i r;
30056     simde_memcpy(&r, mem_addr, sizeof(r));
30057     return r;
30058   #endif
30059 }
30060 #define simde_x_mm256_loadu_epi32(mem_addr) simde_mm256_loadu_epi32(mem_addr)
30061 #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && (defined(SIMDE_BUG_GCC_95483) || defined(SIMDE_BUG_CLANG_REV_344862)))
30062   #undef _mm256_loadu_epi32
30063   #define _mm256_loadu_epi32(a) simde_mm256_loadu_epi32(a)
30064 #endif
30065 
30066 SIMDE_FUNCTION_ATTRIBUTES
30067 simde__m256i
30068 simde_mm256_loadu_epi64(void const * mem_addr) {
30069   #if defined(SIMDE_X86_AVX512VL_NATIVE) && !defined(SIMDE_BUG_GCC_95483) && !defined(SIMDE_BUG_CLANG_REV_344862)
30070     return _mm256_loadu_epi64(mem_addr);
30071   #elif defined(SIMDE_X86_AVX_NATIVE)
30072     return _mm256_loadu_si256(SIMDE_ALIGN_CAST(__m256i const *, mem_addr));
30073   #else
30074     simde__m256i r;
30075     simde_memcpy(&r, mem_addr, sizeof(r));
30076     return r;
30077   #endif
30078 }
30079 #define simde_x_mm256_loadu_epi64(mem_addr) simde_mm256_loadu_epi64(mem_addr)
30080 #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && (defined(SIMDE_BUG_GCC_95483) || defined(SIMDE_BUG_CLANG_REV_344862)))
30081   #undef _mm256_loadu_epi64
30082   #define _mm256_loadu_epi64(a) simde_mm256_loadu_epi64(a)
30083 #endif
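/* Without AVX-512VL/BW, the loadu_epi8/16/32/64 functions above all reduce to
 * an unaligned 256-bit load; the element width only matters to the native
 * AVX-512 forms.  The portable fallback is a plain simde_memcpy into the
 * result, which is well defined for any source alignment.
 */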
30084 
30085 SIMDE_FUNCTION_ATTRIBUTES
30086 simde__m256i
30087 simde_mm256_loadu_si256 (void const * mem_addr) {
30088   #if defined(SIMDE_X86_AVX_NATIVE)
30089     return _mm256_loadu_si256(SIMDE_ALIGN_CAST(const __m256i*, mem_addr));
30090   #else
30091     simde__m256i r;
30092     simde_memcpy(&r, mem_addr, sizeof(r));
30093     return r;
30094   #endif
30095 }
30096 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
30097   #undef _mm256_loadu_si256
30098   #define _mm256_loadu_si256(mem_addr) simde_mm256_loadu_si256(mem_addr)
30099 #endif
30100 
30101 SIMDE_FUNCTION_ATTRIBUTES
30102 simde__m256
30103 simde_mm256_loadu2_m128 (const float hiaddr[HEDLEY_ARRAY_PARAM(4)], const float loaddr[HEDLEY_ARRAY_PARAM(4)]) {
30104   #if defined(SIMDE_X86_AVX_NATIVE) && !defined(SIMDE_BUG_GCC_91341) && !defined(SIMDE_BUG_MCST_LCC_MISSING_AVX_LOAD_STORE_M128_FUNCS)
30105     return _mm256_loadu2_m128(hiaddr, loaddr);
30106   #else
30107     return
30108       simde_mm256_insertf128_ps(simde_mm256_castps128_ps256(simde_mm_loadu_ps(loaddr)),
30109               simde_mm_loadu_ps(hiaddr), 1);
30110   #endif
30111 }
30112 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
30113   #undef _mm256_loadu2_m128
30114   #define _mm256_loadu2_m128(hiaddr, loaddr) simde_mm256_loadu2_m128(hiaddr, loaddr)
30115 #endif
30116 
30117 SIMDE_FUNCTION_ATTRIBUTES
30118 simde__m256d
30119 simde_mm256_loadu2_m128d (const double hiaddr[HEDLEY_ARRAY_PARAM(2)], const double loaddr[HEDLEY_ARRAY_PARAM(2)]) {
30120   #if defined(SIMDE_X86_AVX_NATIVE) && !defined(SIMDE_BUG_GCC_91341) && !defined(SIMDE_BUG_MCST_LCC_MISSING_AVX_LOAD_STORE_M128_FUNCS)
30121     return _mm256_loadu2_m128d(hiaddr, loaddr);
30122   #else
30123     return
30124       simde_mm256_insertf128_pd(simde_mm256_castpd128_pd256(simde_mm_loadu_pd(loaddr)),
30125               simde_mm_loadu_pd(hiaddr), 1);
30126   #endif
30127 }
30128 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
30129   #undef _mm256_loadu2_m128d
30130   #define _mm256_loadu2_m128d(hiaddr, loaddr) simde_mm256_loadu2_m128d(hiaddr, loaddr)
30131 #endif
30132 
30133 SIMDE_FUNCTION_ATTRIBUTES
30134 simde__m256i
30135 simde_mm256_loadu2_m128i (const simde__m128i* hiaddr, const simde__m128i* loaddr) {
30136   #if defined(SIMDE_X86_AVX_NATIVE) && !defined(SIMDE_BUG_GCC_91341) && !defined(SIMDE_BUG_MCST_LCC_MISSING_AVX_LOAD_STORE_M128_FUNCS)
30137     return _mm256_loadu2_m128i(hiaddr, loaddr);
30138   #else
30139     return
30140       simde_mm256_insertf128_si256(simde_mm256_castsi128_si256(simde_mm_loadu_si128(loaddr)),
30141           simde_mm_loadu_si128(hiaddr), 1);
30142   #endif
30143 }
30144 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
30145   #undef _mm256_loadu2_m128i
30146   #define _mm256_loadu2_m128i(hiaddr, loaddr) simde_mm256_loadu2_m128i(hiaddr, loaddr)
30147 #endif
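/* The loadu2_m128* helpers build a 256-bit value from two independent 128-bit
 * memory locations: the low half comes from loaddr and the high half from
 * hiaddr.  Illustrative sketch (hypothetical arrays lo[4] and hi[4]):
 *
 *   simde__m256 v = simde_mm256_loadu2_m128(hi, lo);
 *   // elements 0..3 of v come from lo, elements 4..7 from hi
 */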
30148 
30149 SIMDE_FUNCTION_ATTRIBUTES
30150 simde__m128d
30151 simde_mm_maskload_pd (const simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(4)], simde__m128i mask) {
30152   #if defined(SIMDE_X86_AVX_NATIVE)
30153     return _mm_maskload_pd(mem_addr, mask);
30154   #else
30155     simde__m128d_private
30156       mem_ = simde__m128d_to_private(simde_mm_loadu_pd(mem_addr)),
30157       r_;
30158     simde__m128i_private mask_ = simde__m128i_to_private(mask);
30159 
30160     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
30161       r_.neon_i64 = vandq_s64(mem_.neon_i64, vshrq_n_s64(mask_.neon_i64, 63));
30162     #else
30163       SIMDE_VECTORIZE
30164       for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
30165         r_.i64[i] = mem_.i64[i] & (mask_.i64[i] >> 63);
30166       }
30167     #endif
30168 
30169     return simde__m128d_from_private(r_);
30170   #endif
30171 }
30172 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
30173   #undef _mm_maskload_pd
30174   #define _mm_maskload_pd(mem_addr, mask) simde_mm_maskload_pd(HEDLEY_REINTERPRET_CAST(double const*, mem_addr), mask)
30175 #endif
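/* The maskload fallbacks use an arithmetic right shift of the mask element
 * (mask >> 63 or mask >> 31) to produce either all ones or all zeros per
 * lane, then AND it with the loaded data, so elements whose mask MSB is clear
 * come back as zero.  Scalar sketch of the idea (illustrative only):
 *
 *   int64_t lane_mask = mask_element >> 63;   // 0 or -1 (all ones)
 *   int64_t result    = loaded_bits & lane_mask;
 */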
30176 
30177 SIMDE_FUNCTION_ATTRIBUTES
30178 simde__m256d
30179 simde_mm256_maskload_pd (const simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(4)], simde__m256i mask) {
30180   #if defined(SIMDE_X86_AVX_NATIVE)
30181     return _mm256_maskload_pd(mem_addr, mask);
30182   #else
30183     simde__m256d_private r_;
30184     simde__m256i_private mask_ = simde__m256i_to_private(mask);
30185 
30186     r_ = simde__m256d_to_private(simde_mm256_loadu_pd(mem_addr));
30187     SIMDE_VECTORIZE
30188     for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
30189       r_.i64[i] &= mask_.i64[i] >> 63;
30190     }
30191 
30192     return simde__m256d_from_private(r_);
30193   #endif
30194 }
30195 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
30196   #undef _mm256_maskload_pd
30197   #define _mm256_maskload_pd(mem_addr, mask) simde_mm256_maskload_pd(HEDLEY_REINTERPRET_CAST(double const*, mem_addr), mask)
30198 #endif
30199 
30200 SIMDE_FUNCTION_ATTRIBUTES
30201 simde__m128
30202 simde_mm_maskload_ps (const simde_float32 mem_addr[HEDLEY_ARRAY_PARAM(4)], simde__m128i mask) {
30203   #if defined(SIMDE_X86_AVX_NATIVE)
30204     return _mm_maskload_ps(mem_addr, mask);
30205   #else
30206     simde__m128_private
30207       mem_ = simde__m128_to_private(simde_mm_loadu_ps(mem_addr)),
30208       r_;
30209     simde__m128i_private mask_ = simde__m128i_to_private(mask);
30210 
30211     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
30212       r_.neon_i32 = vandq_s32(mem_.neon_i32, vshrq_n_s32(mask_.neon_i32, 31));
30213     #else
30214       SIMDE_VECTORIZE
30215       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
30216         r_.i32[i] = mem_.i32[i] & (mask_.i32[i] >> 31);
30217       }
30218     #endif
30219 
30220     return simde__m128_from_private(r_);
30221   #endif
30222 }
30223 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
30224   #undef _mm_maskload_ps
30225   #define _mm_maskload_ps(mem_addr, mask) simde_mm_maskload_ps(HEDLEY_REINTERPRET_CAST(float const*, mem_addr), mask)
30226 #endif
30227 
30228 SIMDE_FUNCTION_ATTRIBUTES
30229 simde__m256
30230 simde_mm256_maskload_ps (const simde_float32 mem_addr[HEDLEY_ARRAY_PARAM(4)], simde__m256i mask) {
30231   #if defined(SIMDE_X86_AVX_NATIVE)
30232     return _mm256_maskload_ps(mem_addr, mask);
30233   #else
30234     simde__m256_private r_;
30235     simde__m256i_private mask_ = simde__m256i_to_private(mask);
30236 
30237     r_ = simde__m256_to_private(simde_mm256_loadu_ps(mem_addr));
30238     SIMDE_VECTORIZE
30239     for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
30240       r_.i32[i] &= mask_.i32[i] >> 31;
30241     }
30242 
30243     return simde__m256_from_private(r_);
30244   #endif
30245 }
30246 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
30247   #undef _mm256_maskload_ps
30248   #define _mm256_maskload_ps(mem_addr, mask) simde_mm256_maskload_ps(HEDLEY_REINTERPRET_CAST(float const*, mem_addr), mask)
30249 #endif
30250 
30251 SIMDE_FUNCTION_ATTRIBUTES
30252 void
30253 simde_mm_maskstore_pd (simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)], simde__m128i mask, simde__m128d a) {
30254   #if defined(SIMDE_X86_AVX_NATIVE)
30255     _mm_maskstore_pd(mem_addr, mask, a);
30256   #else
30257     simde__m128i_private mask_ = simde__m128i_to_private(mask);
30258     simde__m128d_private a_ = simde__m128d_to_private(a);
30259 
30260     SIMDE_VECTORIZE
30261     for (size_t i = 0 ; i < (sizeof(a_.f64) / sizeof(a_.f64[0])) ; i++) {
30262       if (mask_.u64[i] >> 63)
30263         mem_addr[i] = a_.f64[i];
30264     }
30265   #endif
30266 }
30267 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
30268   #undef _mm_maskstore_pd
30269   #define _mm_maskstore_pd(mem_addr, mask, a) simde_mm_maskstore_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), mask, a)
30270 #endif
30271 
30272 SIMDE_FUNCTION_ATTRIBUTES
30273 void
30274 simde_mm256_maskstore_pd (simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(4)], simde__m256i mask, simde__m256d a) {
30275   #if defined(SIMDE_X86_AVX_NATIVE)
30276     _mm256_maskstore_pd(mem_addr, mask, a);
30277   #else
30278     simde__m256i_private mask_ = simde__m256i_to_private(mask);
30279     simde__m256d_private a_ = simde__m256d_to_private(a);
30280 
30281     SIMDE_VECTORIZE
30282     for (size_t i = 0 ; i < (sizeof(a_.f64) / sizeof(a_.f64[0])) ; i++) {
30283       if (mask_.u64[i] & (UINT64_C(1) << 63))
30284         mem_addr[i] = a_.f64[i];
30285     }
30286   #endif
30287 }
30288 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
30289   #undef _mm256_maskstore_pd
30290   #define _mm256_maskstore_pd(mem_addr, mask, a) simde_mm256_maskstore_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), mask, a)
30291 #endif
30292 
30293 SIMDE_FUNCTION_ATTRIBUTES
30294 void
30295 simde_mm_maskstore_ps (simde_float32 mem_addr[HEDLEY_ARRAY_PARAM(4)], simde__m128i mask, simde__m128 a) {
30296   #if defined(SIMDE_X86_AVX_NATIVE)
30297     _mm_maskstore_ps(mem_addr, mask, a);
30298   #else
30299     simde__m128i_private mask_ = simde__m128i_to_private(mask);
30300     simde__m128_private a_ = simde__m128_to_private(a);
30301 
30302     SIMDE_VECTORIZE
30303     for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) {
30304       if (mask_.u32[i] & (UINT32_C(1) << 31))
30305         mem_addr[i] = a_.f32[i];
30306     }
30307   #endif
30308 }
30309 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
30310   #undef _mm_maskstore_ps
30311   #define _mm_maskstore_ps(mem_addr, mask, a) simde_mm_maskstore_ps(HEDLEY_REINTERPRET_CAST(float*, mem_addr), mask, a)
30312 #endif
30313 
30314 SIMDE_FUNCTION_ATTRIBUTES
30315 void
30316 simde_mm256_maskstore_ps (simde_float32 mem_addr[HEDLEY_ARRAY_PARAM(8)], simde__m256i mask, simde__m256 a) {
30317   #if defined(SIMDE_X86_AVX_NATIVE)
30318     _mm256_maskstore_ps(mem_addr, mask, a);
30319   #else
30320     simde__m256i_private mask_ = simde__m256i_to_private(mask);
30321     simde__m256_private a_ = simde__m256_to_private(a);
30322 
30323     SIMDE_VECTORIZE
30324     for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) {
30325       if (mask_.u32[i] & (UINT32_C(1) << 31))
30326         mem_addr[i] = a_.f32[i];
30327     }
30328   #endif
30329 }
30330 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
30331   #undef _mm256_maskstore_ps
30332   #define _mm256_maskstore_ps(mem_addr, mask, a) simde_mm256_maskstore_ps(HEDLEY_REINTERPRET_CAST(float*, mem_addr), mask, a)
30333 #endif
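/* The maskstore fallbacks write an element back to memory only when the most
 * significant bit of the corresponding mask element is set, leaving the other
 * memory locations untouched.  Illustrative sketch (hypothetical names `buf`
 * and `v`):
 *
 *   simde__m256i mask = simde_mm256_set1_epi32(INT32_MIN);  // MSB set: store every lane
 *   simde_mm256_maskstore_ps(buf, mask, v);
 */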
30334 
30335 SIMDE_FUNCTION_ATTRIBUTES
30336 simde__m256
30337 simde_mm256_min_ps (simde__m256 a, simde__m256 b) {
30338   #if defined(SIMDE_X86_AVX_NATIVE)
30339     return _mm256_min_ps(a, b);
30340   #else
30341     simde__m256_private
30342       r_,
30343       a_ = simde__m256_to_private(a),
30344       b_ = simde__m256_to_private(b);
30345 
30346     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
30347       r_.m128[0] = simde_mm_min_ps(a_.m128[0], b_.m128[0]);
30348       r_.m128[1] = simde_mm_min_ps(a_.m128[1], b_.m128[1]);
30349     #else
30350       SIMDE_VECTORIZE
30351       for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
30352         r_.f32[i] = (a_.f32[i] < b_.f32[i]) ? a_.f32[i] : b_.f32[i];
30353       }
30354     #endif
30355 
30356     return simde__m256_from_private(r_);
30357   #endif
30358 }
30359 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
30360   #undef _mm256_min_ps
30361   #define _mm256_min_ps(a, b) simde_mm256_min_ps(a, b)
30362 #endif
30363 
30364 SIMDE_FUNCTION_ATTRIBUTES
30365 simde__m256d
30366 simde_mm256_min_pd (simde__m256d a, simde__m256d b) {
30367   #if defined(SIMDE_X86_AVX_NATIVE)
30368     return _mm256_min_pd(a, b);
30369   #else
30370     simde__m256d_private
30371       r_,
30372       a_ = simde__m256d_to_private(a),
30373       b_ = simde__m256d_to_private(b);
30374 
30375     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
30376       r_.m128d[0] = simde_mm_min_pd(a_.m128d[0], b_.m128d[0]);
30377       r_.m128d[1] = simde_mm_min_pd(a_.m128d[1], b_.m128d[1]);
30378     #else
30379       SIMDE_VECTORIZE
30380       for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
30381         r_.f64[i] = (a_.f64[i] < b_.f64[i]) ? a_.f64[i] : b_.f64[i];
30382       }
30383     #endif
30384 
30385     return simde__m256d_from_private(r_);
30386   #endif
30387 }
30388 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
30389   #undef _mm256_min_pd
30390   #define _mm256_min_pd(a, b) simde_mm256_min_pd(a, b)
30391 #endif
30392 
30393 SIMDE_FUNCTION_ATTRIBUTES
30394 simde__m256
30395 simde_mm256_max_ps (simde__m256 a, simde__m256 b) {
30396   #if defined(SIMDE_X86_AVX_NATIVE)
30397     return _mm256_max_ps(a, b);
30398   #else
30399     simde__m256_private
30400       r_,
30401       a_ = simde__m256_to_private(a),
30402       b_ = simde__m256_to_private(b);
30403 
30404     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
30405       r_.m128[0] = simde_mm_max_ps(a_.m128[0], b_.m128[0]);
30406       r_.m128[1] = simde_mm_max_ps(a_.m128[1], b_.m128[1]);
30407     #else
30408       SIMDE_VECTORIZE
30409       for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
30410         r_.f32[i] = (a_.f32[i] > b_.f32[i]) ? a_.f32[i] : b_.f32[i];
30411       }
30412     #endif
30413 
30414     return simde__m256_from_private(r_);
30415   #endif
30416 }
30417 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
30418   #undef _mm256_max_ps
30419   #define _mm256_max_ps(a, b) simde_mm256_max_ps(a, b)
30420 #endif
30421 
30422 SIMDE_FUNCTION_ATTRIBUTES
30423 simde__m256d
30424 simde_mm256_max_pd (simde__m256d a, simde__m256d b) {
30425   #if defined(SIMDE_X86_AVX_NATIVE)
30426     return _mm256_max_pd(a, b);
30427   #else
30428     simde__m256d_private
30429       r_,
30430       a_ = simde__m256d_to_private(a),
30431       b_ = simde__m256d_to_private(b);
30432 
30433     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
30434       r_.m128d[0] = simde_mm_max_pd(a_.m128d[0], b_.m128d[0]);
30435       r_.m128d[1] = simde_mm_max_pd(a_.m128d[1], b_.m128d[1]);
30436     #else
30437       SIMDE_VECTORIZE
30438       for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
30439         r_.f64[i] = (a_.f64[i] > b_.f64[i]) ? a_.f64[i] : b_.f64[i];
30440       }
30441     #endif
30442 
30443     return simde__m256d_from_private(r_);
30444   #endif
30445 }
30446 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
30447   #undef _mm256_max_pd
30448   #define _mm256_max_pd(a, b) simde_mm256_max_pd(a, b)
30449 #endif
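/* NaN note: like the native x86 min/max instructions, the portable fallbacks
 * above return the second operand (b) whenever the comparison is false, which
 * includes the case where either input is NaN.  They are therefore not
 * symmetric in their arguments when NaNs are involved.
 */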
30450 
30451 SIMDE_FUNCTION_ATTRIBUTES
30452 simde__m256d
30453 simde_mm256_movedup_pd (simde__m256d a) {
30454   #if defined(SIMDE_X86_AVX_NATIVE)
30455     return _mm256_movedup_pd(a);
30456   #else
30457     simde__m256d_private
30458       r_,
30459       a_ = simde__m256d_to_private(a);
30460 
30461     #if defined(SIMDE_SHUFFLE_VECTOR_)
30462       r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 32, a_.f64, a_.f64, 0, 0, 2, 2);
30463     #else
30464       SIMDE_VECTORIZE
30465       for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i += 2) {
30466         r_.f64[i] = r_.f64[i + 1] = a_.f64[i];
30467       }
30468     #endif
30469 
30470     return simde__m256d_from_private(r_);
30471   #endif
30472 }
30473 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
30474   #undef _mm256_movedup_pd
30475   #define _mm256_movedup_pd(a) simde_mm256_movedup_pd(a)
30476 #endif
30477 
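/* Note: _mm256_movehdup_ps duplicates the odd-indexed element of each pair:
 * { a1, a1, a3, a3, a5, a5, a7, a7 }. */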
30478 SIMDE_FUNCTION_ATTRIBUTES
30479 simde__m256
30480 simde_mm256_movehdup_ps (simde__m256 a) {
30481   #if defined(SIMDE_X86_AVX_NATIVE)
30482     return _mm256_movehdup_ps(a);
30483   #else
30484     simde__m256_private
30485       r_,
30486       a_ = simde__m256_to_private(a);
30487 
30488     #if defined(SIMDE_SHUFFLE_VECTOR_)
30489       r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 32, a_.f32, a_.f32, 1, 1, 3, 3, 5, 5, 7, 7);
30490     #else
30491       SIMDE_VECTORIZE
30492       for (size_t i = 1 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i += 2) {
30493         r_.f32[i - 1] = r_.f32[i] = a_.f32[i];
30494       }
30495     #endif
30496 
30497     return simde__m256_from_private(r_);
30498   #endif
30499 }
30500 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
30501   #undef _mm256_movehdup_ps
30502   #define _mm256_movehdup_ps(a) simde_mm256_movehdup_ps(a)
30503 #endif
30504 
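/* Note: _mm256_moveldup_ps is the even-indexed counterpart:
 * { a0, a0, a2, a2, a4, a4, a6, a6 }. */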
30505 SIMDE_FUNCTION_ATTRIBUTES
30506 simde__m256
30507 simde_mm256_moveldup_ps (simde__m256 a) {
30508   #if defined(SIMDE_X86_AVX_NATIVE)
30509     return _mm256_moveldup_ps(a);
30510   #else
30511     simde__m256_private
30512       r_,
30513       a_ = simde__m256_to_private(a);
30514 
30515     #if defined(SIMDE_SHUFFLE_VECTOR_)
30516       r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 32, a_.f32, a_.f32, 0, 0, 2, 2, 4, 4, 6, 6);
30517     #else
30518       SIMDE_VECTORIZE
30519       for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i += 2) {
30520         r_.f32[i] = r_.f32[i + 1] = a_.f32[i];
30521       }
30522     #endif
30523 
30524     return simde__m256_from_private(r_);
30525   #endif
30526 }
30527 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
30528   #undef _mm256_moveldup_ps
30529   #define _mm256_moveldup_ps(a) simde_mm256_moveldup_ps(a)
30530 #endif
30531 
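/* The movemask fallbacks below gather the sign bit of every element into the
 * low bits of the result (bit i = sign of element i), matching the native
 * vmovmskps/vmovmskpd behaviour. */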
30532 SIMDE_FUNCTION_ATTRIBUTES
30533 int
30534 simde_mm256_movemask_ps (simde__m256 a) {
30535   #if defined(SIMDE_X86_AVX_NATIVE)
30536     return _mm256_movemask_ps(a);
30537   #else
30538     simde__m256_private a_ = simde__m256_to_private(a);
30539     int r = 0;
30540 
30541     SIMDE_VECTORIZE_REDUCTION(|:r)
30542     for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) {
30543       r |= (a_.u32[i] >> 31) << i;
30544     }
30545 
30546     return r;
30547   #endif
30548 }
30549 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
30550   #undef _mm256_movemask_ps
30551   #define _mm256_movemask_ps(a) simde_mm256_movemask_ps(a)
30552 #endif
30553 
30554 SIMDE_FUNCTION_ATTRIBUTES
30555 int
30556 simde_mm256_movemask_pd (simde__m256d a) {
30557   #if defined(SIMDE_X86_AVX_NATIVE)
30558     return _mm256_movemask_pd(a);
30559   #else
30560     simde__m256d_private a_ = simde__m256d_to_private(a);
30561     int r = 0;
30562 
30563     SIMDE_VECTORIZE_REDUCTION(|:r)
30564     for (size_t i = 0 ; i < (sizeof(a_.f64) / sizeof(a_.f64[0])) ; i++) {
30565       r |= (a_.u64[i] >> 63) << i;
30566     }
30567 
30568     return r;
30569   #endif
30570 }
30571 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
30572   #undef _mm256_movemask_pd
30573   #define _mm256_movemask_pd(a) simde_mm256_movemask_pd(a)
30574 #endif
30575 
30576 SIMDE_FUNCTION_ATTRIBUTES
30577 simde__m256
30578 simde_mm256_mul_ps (simde__m256 a, simde__m256 b) {
30579   #if defined(SIMDE_X86_AVX_NATIVE)
30580     return _mm256_mul_ps(a, b);
30581   #else
30582     simde__m256_private
30583       r_,
30584       a_ = simde__m256_to_private(a),
30585       b_ = simde__m256_to_private(b);
30586 
30587     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
30588       r_.m128[0] = simde_mm_mul_ps(a_.m128[0], b_.m128[0]);
30589       r_.m128[1] = simde_mm_mul_ps(a_.m128[1], b_.m128[1]);
30590     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
30591       r_.f32 = a_.f32 * b_.f32;
30592     #else
30593       SIMDE_VECTORIZE
30594       for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
30595         r_.f32[i] = a_.f32[i] * b_.f32[i];
30596       }
30597     #endif
30598 
30599     return simde__m256_from_private(r_);
30600   #endif
30601 }
30602 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
30603   #undef _mm256_mul_ps
30604   #define _mm256_mul_ps(a, b) simde_mm256_mul_ps(a, b)
30605 #endif
30606 
30607 SIMDE_FUNCTION_ATTRIBUTES
30608 simde__m256d
30609 simde_mm256_mul_pd (simde__m256d a, simde__m256d b) {
30610   #if defined(SIMDE_X86_AVX_NATIVE)
30611     return _mm256_mul_pd(a, b);
30612   #else
30613     simde__m256d_private
30614       r_,
30615       a_ = simde__m256d_to_private(a),
30616       b_ = simde__m256d_to_private(b);
30617 
30618     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
30619       r_.m128d[0] = simde_mm_mul_pd(a_.m128d[0], b_.m128d[0]);
30620       r_.m128d[1] = simde_mm_mul_pd(a_.m128d[1], b_.m128d[1]);
30621     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
30622       r_.f64 = a_.f64 * b_.f64;
30623     #else
30624       SIMDE_VECTORIZE
30625       for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
30626         r_.f64[i] = a_.f64[i] * b_.f64[i];
30627       }
30628     #endif
30629 
30630     return simde__m256d_from_private(r_);
30631   #endif
30632 }
30633 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
30634   #undef _mm256_mul_pd
30635   #define _mm256_mul_pd(a, b) simde_mm256_mul_pd(a, b)
30636 #endif
30637 
30638 SIMDE_FUNCTION_ATTRIBUTES
30639 simde__m256
30640 simde_mm256_or_ps (simde__m256 a, simde__m256 b) {
30641   #if defined(SIMDE_X86_AVX_NATIVE)
30642     return _mm256_or_ps(a, b);
30643   #else
30644     simde__m256_private
30645       r_,
30646       a_ = simde__m256_to_private(a),
30647       b_ = simde__m256_to_private(b);
30648 
30649     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
30650       r_.m128[0] = simde_mm_or_ps(a_.m128[0], b_.m128[0]);
30651       r_.m128[1] = simde_mm_or_ps(a_.m128[1], b_.m128[1]);
30652     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
30653       r_.i32f = a_.i32f | b_.i32f;
30654     #else
30655       SIMDE_VECTORIZE
30656       for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
30657         r_.u32[i] = a_.u32[i] | b_.u32[i];
30658       }
30659     #endif
30660 
30661     return simde__m256_from_private(r_);
30662   #endif
30663 }
30664 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
30665   #undef _mm256_or_ps
30666   #define _mm256_or_ps(a, b) simde_mm256_or_ps(a, b)
30667 #endif
30668 
30669 SIMDE_FUNCTION_ATTRIBUTES
30670 simde__m256d
30671 simde_mm256_or_pd (simde__m256d a, simde__m256d b) {
30672   #if defined(SIMDE_X86_AVX_NATIVE)
30673     return _mm256_or_pd(a, b);
30674   #else
30675     simde__m256d_private
30676       r_,
30677       a_ = simde__m256d_to_private(a),
30678       b_ = simde__m256d_to_private(b);
30679 
30680     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
30681       r_.m128d[0] = simde_mm_or_pd(a_.m128d[0], b_.m128d[0]);
30682       r_.m128d[1] = simde_mm_or_pd(a_.m128d[1], b_.m128d[1]);
30683     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
30684       r_.i32f = a_.i32f | b_.i32f;
30685     #else
30686       SIMDE_VECTORIZE
30687       for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) {
30688         r_.u64[i] = a_.u64[i] | b_.u64[i];
30689       }
30690     #endif
30691 
30692     return simde__m256d_from_private(r_);
30693   #endif
30694 }
30695 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
30696   #undef _mm256_or_pd
30697   #define _mm256_or_pd(a, b) simde_mm256_or_pd(a, b)
30698 #endif
30699 
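/* For the permute family, imm8 is split into 2-bit (ps) or 1-bit (pd)
 * selectors that pick elements within each 128-bit lane; the same selectors
 * are applied to the lower and the upper lane.  For example,
 * _mm256_permute_pd with imm8 == 0x5 swaps the two doubles inside each lane,
 * giving { a1, a0, a3, a2 }. */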
30700 SIMDE_FUNCTION_ATTRIBUTES
30701 simde__m256
30702 simde_mm256_permute_ps (simde__m256 a, const int imm8)
30703     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
30704   simde__m256_private
30705     r_,
30706     a_ = simde__m256_to_private(a);
30707 
30708   SIMDE_VECTORIZE
30709   for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
30710     r_.f32[i] = a_.m128_private[i >> 2].f32[(imm8 >> ((i << 1) & 7)) & 3];
30711   }
30712 
30713   return simde__m256_from_private(r_);
30714 }
30715 #if defined(SIMDE_X86_AVX_NATIVE)
30716 #  define simde_mm256_permute_ps(a, imm8) _mm256_permute_ps(a, imm8)
30717 #endif
30718 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
30719   #undef _mm256_permute_ps
30720   #define _mm256_permute_ps(a, imm8) simde_mm256_permute_ps(a, imm8)
30721 #endif
30722 
30723 SIMDE_FUNCTION_ATTRIBUTES
30724 simde__m256d
30725 simde_mm256_permute_pd (simde__m256d a, const int imm8)
30726     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 15) {
30727   simde__m256d_private
30728     r_,
30729     a_ = simde__m256d_to_private(a);
30730 
30731   SIMDE_VECTORIZE
30732   for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
30733     r_.f64[i] = a_.f64[((imm8 >> i) & 1) + (i & 2)];
30734   }
30735 
30736   return simde__m256d_from_private(r_);
30737 }
30738 #if defined(SIMDE_X86_AVX_NATIVE)
30739 #  define simde_mm256_permute_pd(a, imm8) _mm256_permute_pd(a, imm8)
30740 #endif
30741 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
30742   #undef _mm256_permute_pd
30743   #define _mm256_permute_pd(a, imm8) simde_mm256_permute_pd(a, imm8)
30744 #endif
30745 
30746 SIMDE_FUNCTION_ATTRIBUTES
30747 simde__m128
30748 simde_mm_permute_ps (simde__m128 a, const int imm8)
30749     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
30750   simde__m128_private
30751     r_,
30752     a_ = simde__m128_to_private(a);
30753 
30754   SIMDE_VECTORIZE
30755   for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
30756     r_.f32[i] = a_.f32[(imm8 >> ((i << 1) & 7)) & 3];
30757   }
30758 
30759   return simde__m128_from_private(r_);
30760 }
30761 #if defined(SIMDE_X86_AVX_NATIVE)
30762 #  define simde_mm_permute_ps(a, imm8) _mm_permute_ps(a, imm8)
30763 #endif
30764 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
30765   #undef _mm_permute_ps
30766   #define _mm_permute_ps(a, imm8) simde_mm_permute_ps(a, imm8)
30767 #endif
30768 
30769 
30770 SIMDE_FUNCTION_ATTRIBUTES
30771 simde__m128d
30772 simde_mm_permute_pd (simde__m128d a, const int imm8)
30773     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 3) {
30774   simde__m128d_private
30775     r_,
30776     a_ = simde__m128d_to_private(a);
30777 
30778   SIMDE_VECTORIZE
30779   for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
30780     r_.f64[i] = a_.f64[((imm8 >> i) & 1) + (i & 2)];
30781   }
30782 
30783   return simde__m128d_from_private(r_);
30784 }
30785 #if defined(SIMDE_X86_AVX_NATIVE)
30786 #  define simde_mm_permute_pd(a, imm8) _mm_permute_pd(a, imm8)
30787 #endif
30788 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
30789   #undef _mm_permute_pd
30790   #define _mm_permute_pd(a, imm8) simde_mm_permute_pd(a, imm8)
30791 #endif
30792 
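/* permutevar takes a per-element selector from the integer vector b: for
 * 32-bit lanes the low 2 bits of each b element choose the source element,
 * for 64-bit lanes bit 1 (not bit 0) of each b element is used, matching the
 * vpermilps/vpermilpd encoding. */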
30793 SIMDE_FUNCTION_ATTRIBUTES
30794 simde__m128
30795 simde_mm_permutevar_ps (simde__m128 a, simde__m128i b) {
30796   #if defined(SIMDE_X86_AVX_NATIVE)
30797     return _mm_permutevar_ps(a, b);
30798   #else
30799     simde__m128_private
30800       r_,
30801       a_ = simde__m128_to_private(a);
30802     simde__m128i_private b_ = simde__m128i_to_private(b);
30803 
30804     SIMDE_VECTORIZE
30805     for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
30806       r_.f32[i] = a_.f32[b_.i32[i] & 3];
30807     }
30808 
30809     return simde__m128_from_private(r_);
30810   #endif
30811 }
30812 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
30813   #undef _mm_permutevar_ps
30814   #define _mm_permutevar_ps(a, b) simde_mm_permutevar_ps(a, b)
30815 #endif
30816 
30817 SIMDE_FUNCTION_ATTRIBUTES
30818 simde__m128d
30819 simde_mm_permutevar_pd (simde__m128d a, simde__m128i b) {
30820   #if defined(SIMDE_X86_AVX_NATIVE)
30821     return _mm_permutevar_pd(a, b);
30822   #else
30823     simde__m128d_private
30824       r_,
30825       a_ = simde__m128d_to_private(a);
30826     simde__m128i_private b_ = simde__m128i_to_private(b);
30827 
30828     SIMDE_VECTORIZE
30829     for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
30830       r_.f64[i] = a_.f64[(b_.i64[i] & 2) >> 1];
30831     }
30832 
30833     return simde__m128d_from_private(r_);
30834   #endif
30835 }
30836 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
30837   #undef _mm_permutevar_pd
30838   #define _mm_permutevar_pd(a, b) simde_mm_permutevar_pd(a, b)
30839 #endif
30840 
30841 SIMDE_FUNCTION_ATTRIBUTES
30842 simde__m256
30843 simde_mm256_permutevar_ps (simde__m256 a, simde__m256i b) {
30844   #if defined(SIMDE_X86_AVX_NATIVE)
30845     return _mm256_permutevar_ps(a, b);
30846   #else
30847     simde__m256_private
30848       r_,
30849       a_ = simde__m256_to_private(a);
30850     simde__m256i_private b_ = simde__m256i_to_private(b);
30851 
30852     SIMDE_VECTORIZE
30853     for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
30854       r_.f32[i] = a_.f32[(b_.i32[i] & 3) + (i & 4)];
30855     }
30856 
30857     return simde__m256_from_private(r_);
30858   #endif
30859 }
30860 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
30861   #undef _mm256_permutevar_ps
30862   #define _mm256_permutevar_ps(a, b) simde_mm256_permutevar_ps(a, b)
30863 #endif
30864 
30865 SIMDE_FUNCTION_ATTRIBUTES
30866 simde__m256d
30867 simde_mm256_permutevar_pd (simde__m256d a, simde__m256i b) {
30868   #if defined(SIMDE_X86_AVX_NATIVE)
30869     return _mm256_permutevar_pd(a, b);
30870   #else
30871     simde__m256d_private
30872       r_,
30873       a_ = simde__m256d_to_private(a);
30874     simde__m256i_private b_ = simde__m256i_to_private(b);
30875 
30876     SIMDE_VECTORIZE
30877     for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
30878       r_.f64[i] = a_.f64[((b_.i64[i] & 2) >> 1) + (i & 2)];
30879     }
30880 
30881     return simde__m256d_from_private(r_);
30882   #endif
30883 }
30884 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
30885   #undef _mm256_permutevar_pd
30886   #define _mm256_permutevar_pd(a, b) simde_mm256_permutevar_pd(a, b)
30887 #endif
30888 
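/* vperm2f128: the low nibble of imm8 selects the source of the result's low
 * 128-bit lane (0/1 = lanes of a, 2/3 = lanes of b, bit 3 forces zero) and
 * the high nibble does the same for the high lane.  For example,
 * imm8 == 0x21 yields { a.hi, b.lo }. */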
30889 SIMDE_FUNCTION_ATTRIBUTES
30890 simde__m256
30891 simde_mm256_permute2f128_ps (simde__m256 a, simde__m256 b, const int imm8)
30892     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
30893   simde__m256_private
30894     r_,
30895     a_ = simde__m256_to_private(a),
30896     b_ = simde__m256_to_private(b);
30897 
30898   r_.m128_private[0] = (imm8 & 0x08) ? simde__m128_to_private(simde_mm_setzero_ps()) : ((imm8 & 0x02) ? b_.m128_private[(imm8     ) & 1] : a_.m128_private[(imm8     ) & 1]);
30899   r_.m128_private[1] = (imm8 & 0x80) ? simde__m128_to_private(simde_mm_setzero_ps()) : ((imm8 & 0x20) ? b_.m128_private[(imm8 >> 4) & 1] : a_.m128_private[(imm8 >> 4) & 1]);
30900 
30901   return simde__m256_from_private(r_);
30902 }
30903 #if defined(SIMDE_X86_AVX_NATIVE)
30904 #  define simde_mm256_permute2f128_ps(a, b, imm8) _mm256_permute2f128_ps(a, b, imm8)
30905 #endif
30906 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
30907   #undef _mm256_permute2f128_ps
30908   #define _mm256_permute2f128_ps(a, b, imm8) simde_mm256_permute2f128_ps(a, b, imm8)
30909 #endif
30910 
30911 SIMDE_FUNCTION_ATTRIBUTES
30912 simde__m256d
30913 simde_mm256_permute2f128_pd (simde__m256d a, simde__m256d b, const int imm8)
30914     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
30915   simde__m256d_private
30916     r_,
30917     a_ = simde__m256d_to_private(a),
30918     b_ = simde__m256d_to_private(b);
30919 
30920   r_.m128d_private[0] = (imm8 & 0x08) ? simde__m128d_to_private(simde_mm_setzero_pd()) : ((imm8 & 0x02) ? b_.m128d_private[(imm8     ) & 1] : a_.m128d_private[(imm8     ) & 1]);
30921   r_.m128d_private[1] = (imm8 & 0x80) ? simde__m128d_to_private(simde_mm_setzero_pd()) : ((imm8 & 0x20) ? b_.m128d_private[(imm8 >> 4) & 1] : a_.m128d_private[(imm8 >> 4) & 1]);
30922 
30923   return simde__m256d_from_private(r_);
30924 }
30925 #if defined(SIMDE_X86_AVX_NATIVE)
30926 #  define simde_mm256_permute2f128_pd(a, b, imm8) _mm256_permute2f128_pd(a, b, imm8)
30927 #endif
30928 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
30929   #undef _mm256_permute2f128_pd
30930   #define _mm256_permute2f128_pd(a, b, imm8) simde_mm256_permute2f128_pd(a, b, imm8)
30931 #endif
30932 
30933 SIMDE_FUNCTION_ATTRIBUTES
30934 simde__m256i
30935 simde_mm256_permute2f128_si256 (simde__m256i a, simde__m256i b, const int imm8)
30936     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
30937   simde__m256i_private
30938     r_,
30939     a_ = simde__m256i_to_private(a),
30940     b_ = simde__m256i_to_private(b);
30941 
30942   r_.m128i_private[0] = (imm8 & 0x08) ? simde__m128i_to_private(simde_mm_setzero_si128()) : ((imm8 & 0x02) ? b_.m128i_private[(imm8     ) & 1] : a_.m128i_private[(imm8     ) & 1]);
30943   r_.m128i_private[1] = (imm8 & 0x80) ? simde__m128i_to_private(simde_mm_setzero_si128()) : ((imm8 & 0x20) ? b_.m128i_private[(imm8 >> 4) & 1] : a_.m128i_private[(imm8 >> 4) & 1]);
30944 
30945   return simde__m256i_from_private(r_);
30946 }
30947 #if defined(SIMDE_X86_AVX_NATIVE)
30948 #  define simde_mm256_permute2f128_si256(a, b, imm8) _mm256_permute2f128_si256(a, b, imm8)
30949 #endif
30950 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
30951   #undef _mm256_permute2f128_si256
30952   #define _mm256_permute2f128_si256(a, b, imm8) simde_mm256_permute2f128_si256(a, b, imm8)
30953 #endif
30954 
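/* Note: hardware vrcpps returns only an ~12-bit approximation of 1/x, whereas
 * the portable loop below performs a full-precision division, so results may
 * differ slightly between native and emulated execution. */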
30955 SIMDE_FUNCTION_ATTRIBUTES
30956 simde__m256
30957 simde_mm256_rcp_ps (simde__m256 a) {
30958   #if defined(SIMDE_X86_AVX_NATIVE)
30959     return _mm256_rcp_ps(a);
30960   #else
30961     simde__m256_private
30962       r_,
30963       a_ = simde__m256_to_private(a);
30964 
30965     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
30966       r_.m128[0] = simde_mm_rcp_ps(a_.m128[0]);
30967       r_.m128[1] = simde_mm_rcp_ps(a_.m128[1]);
30968     #else
30969       SIMDE_VECTORIZE
30970       for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
30971         r_.f32[i] = SIMDE_FLOAT32_C(1.0) / a_.f32[i];
30972       }
30973     #endif
30974 
30975     return simde__m256_from_private(r_);
30976   #endif
30977 }
30978 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
30979   #undef _mm256_rcp_ps
30980   #define _mm256_rcp_ps(a) simde_mm256_rcp_ps(a)
30981 #endif
30982 
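/* Same caveat as rcp_ps: vrsqrtps is an approximation, while the fallback
 * computes 1/sqrt(x) at full float precision. */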
30983 SIMDE_FUNCTION_ATTRIBUTES
30984 simde__m256
30985 simde_mm256_rsqrt_ps (simde__m256 a) {
30986   #if defined(SIMDE_X86_AVX_NATIVE)
30987     return _mm256_rsqrt_ps(a);
30988   #else
30989     simde__m256_private
30990       r_,
30991       a_ = simde__m256_to_private(a);
30992 
30993     #if defined(simde_math_sqrtf)
30994       SIMDE_VECTORIZE
30995       for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
30996         r_.f32[i] = 1.0f / simde_math_sqrtf(a_.f32[i]);
30997       }
30998     #else
30999       HEDLEY_UNREACHABLE();
31000     #endif
31001 
31002     return simde__m256_from_private(r_);
31003   #endif
31004 }
31005 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
31006   #undef _mm256_rsqrt_ps
31007   #define _mm256_rsqrt_ps(a) simde_mm256_rsqrt_ps(a)
31008 #endif
31009 
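/* The setr_* helpers take their arguments in memory order (lowest element
 * first) and simply forward to the corresponding set_* function with the
 * argument list reversed. */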
31010 SIMDE_FUNCTION_ATTRIBUTES
31011 simde__m256i
31012 simde_mm256_setr_epi8 (
31013     int8_t e31, int8_t e30, int8_t e29, int8_t e28, int8_t e27, int8_t e26, int8_t e25, int8_t e24,
31014     int8_t e23, int8_t e22, int8_t e21, int8_t e20, int8_t e19, int8_t e18, int8_t e17, int8_t e16,
31015     int8_t e15, int8_t e14, int8_t e13, int8_t e12, int8_t e11, int8_t e10, int8_t  e9, int8_t  e8,
31016     int8_t  e7, int8_t  e6, int8_t  e5, int8_t  e4, int8_t  e3, int8_t  e2, int8_t  e1, int8_t  e0) {
31017   #if defined(SIMDE_X86_AVX_NATIVE)
31018     return _mm256_setr_epi8(
31019         e31, e30, e29, e28, e27, e26, e25, e24,
31020         e23, e22, e21, e20, e19, e18, e17, e16,
31021         e15, e14, e13, e12, e11, e10,  e9,  e8,
31022         e7,  e6,  e5,  e4,  e3,  e2,  e1,  e0);
31023   #else
31024     return simde_mm256_set_epi8(
31025         e0,  e1,  e2,  e3,  e4,  e5,  e6,  e7,
31026         e8,  e9, e10, e11, e12, e13, e14, e15,
31027         e16, e17, e18, e19, e20, e21, e22, e23,
31028         e24, e25, e26, e27, e28, e29, e30, e31);
31029   #endif
31030 }
31031 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
31032   #undef _mm256_setr_epi8
31033   #define _mm256_setr_epi8(e31, e30, e29, e28, e27, e26, e25, e24, e23, e22, e21, e20, e19, e18, e17, e16, e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0) \
31034     simde_mm256_setr_epi8(e31, e30, e29, e28, e27, e26, e25, e24, e23, e22, e21, e20, e19, e18, e17, e16, e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0)
31035 #endif
31036 
31037 SIMDE_FUNCTION_ATTRIBUTES
31038 simde__m256i
31039 simde_mm256_setr_epi16 (
31040     int16_t e15, int16_t e14, int16_t e13, int16_t e12, int16_t e11, int16_t e10, int16_t  e9, int16_t  e8,
31041     int16_t  e7, int16_t  e6, int16_t  e5, int16_t  e4, int16_t  e3, int16_t  e2, int16_t  e1, int16_t  e0) {
31042   #if defined(SIMDE_X86_AVX_NATIVE)
31043     return _mm256_setr_epi16(
31044         e15, e14, e13, e12, e11, e10,  e9,  e8,
31045         e7,  e6,  e5,  e4,  e3,  e2,  e1,  e0);
31046   #else
31047     return simde_mm256_set_epi16(
31048         e0,  e1,  e2,  e3,  e4,  e5,  e6,  e7,
31049         e8,  e9, e10, e11, e12, e13, e14, e15);
31050   #endif
31051 }
31052 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
31053   #undef _mm256_setr_epi16
31054   #define _mm256_setr_epi16(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0) \
31055     simde_mm256_setr_epi16(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0)
31056 #endif
31057 
31058 SIMDE_FUNCTION_ATTRIBUTES
31059 simde__m256i
31060 simde_mm256_setr_epi32 (
31061     int32_t  e7, int32_t  e6, int32_t  e5, int32_t  e4, int32_t  e3, int32_t  e2, int32_t  e1, int32_t  e0) {
31062   #if defined(SIMDE_X86_AVX_NATIVE)
31063     return _mm256_setr_epi32(e7, e6, e5, e4, e3, e2, e1, e0);
31064   #else
31065     return simde_mm256_set_epi32(e0, e1, e2, e3, e4, e5, e6, e7);
31066   #endif
31067 }
31068 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
31069   #undef _mm256_setr_epi32
31070   #define _mm256_setr_epi32(e7, e6, e5, e4, e3, e2, e1, e0) \
31071     simde_mm256_setr_epi32(e7, e6, e5, e4, e3, e2, e1, e0)
31072 #endif
31073 
31074 SIMDE_FUNCTION_ATTRIBUTES
31075 simde__m256i
31076 simde_mm256_setr_epi64x (int64_t  e3, int64_t  e2, int64_t  e1, int64_t  e0) {
31077   #if defined(SIMDE_X86_AVX_NATIVE)
31078     return _mm256_setr_epi64x(e3, e2, e1, e0);
31079   #else
31080     return simde_mm256_set_epi64x(e0, e1, e2, e3);
31081   #endif
31082 }
31083 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
31084   #undef _mm256_setr_epi64x
31085   #define _mm256_setr_epi64x(e3, e2, e1, e0) \
31086     simde_mm256_setr_epi64x(e3, e2, e1, e0)
31087 #endif
31088 
31089 SIMDE_FUNCTION_ATTRIBUTES
31090 simde__m256
31091 simde_mm256_setr_ps (
31092     simde_float32  e7, simde_float32  e6, simde_float32  e5, simde_float32  e4,
31093     simde_float32  e3, simde_float32  e2, simde_float32  e1, simde_float32  e0) {
31094   #if defined(SIMDE_X86_AVX_NATIVE)
31095     return _mm256_setr_ps(e7, e6, e5, e4, e3, e2, e1, e0);
31096   #else
31097     return simde_mm256_set_ps(e0, e1, e2, e3, e4, e5, e6, e7);
31098   #endif
31099 }
31100 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
31101   #undef _mm256_setr_ps
31102   #define _mm256_setr_ps(e7, e6, e5, e4, e3, e2, e1, e0) \
31103     simde_mm256_setr_ps(e7, e6, e5, e4, e3, e2, e1, e0)
31104 #endif
31105 
31106 SIMDE_FUNCTION_ATTRIBUTES
31107 simde__m256d
31108 simde_mm256_setr_pd (simde_float64  e3, simde_float64  e2, simde_float64  e1, simde_float64  e0) {
31109   #if defined(SIMDE_X86_AVX_NATIVE)
31110     return _mm256_setr_pd(e3, e2, e1, e0);
31111   #else
31112     return simde_mm256_set_pd(e0, e1, e2, e3);
31113   #endif
31114 }
31115 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
31116   #undef _mm256_setr_pd
31117   #define _mm256_setr_pd(e3, e2, e1, e0) \
31118     simde_mm256_setr_pd(e3, e2, e1, e0)
31119 #endif
31120 
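/* setr_m128{,d,i} place `lo` in the low 128 bits of the result and `hi` in
 * the high 128 bits; set_m128 takes the same operands in the opposite order. */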
31121 SIMDE_FUNCTION_ATTRIBUTES
31122 simde__m256
31123 simde_mm256_setr_m128 (simde__m128 lo, simde__m128 hi) {
31124   #if defined(SIMDE_X86_AVX_NATIVE) && \
31125       !defined(SIMDE_BUG_GCC_REV_247851) && \
31126       SIMDE_DETECT_CLANG_VERSION_CHECK(3,6,0)
31127     return _mm256_setr_m128(lo, hi);
31128   #else
31129     return simde_mm256_set_m128(hi, lo);
31130   #endif
31131 }
31132 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
31133   #undef _mm256_setr_m128
31134   #define _mm256_setr_m128(lo, hi) \
31135     simde_mm256_setr_m128(lo, hi)
31136 #endif
31137 
31138 SIMDE_FUNCTION_ATTRIBUTES
31139 simde__m256d
31140 simde_mm256_setr_m128d (simde__m128d lo, simde__m128d hi) {
31141   #if defined(SIMDE_X86_AVX_NATIVE) && \
31142       !defined(SIMDE_BUG_GCC_REV_247851) && \
31143       SIMDE_DETECT_CLANG_VERSION_CHECK(3,6,0)
31144     return _mm256_setr_m128d(lo, hi);
31145   #else
31146     return simde_mm256_set_m128d(hi, lo);
31147   #endif
31148 }
31149 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
31150   #undef _mm256_setr_m128d
31151   #define _mm256_setr_m128d(lo, hi) \
31152     simde_mm256_setr_m128d(lo, hi)
31153 #endif
31154 
31155 SIMDE_FUNCTION_ATTRIBUTES
31156 simde__m256i
31157 simde_mm256_setr_m128i (simde__m128i lo, simde__m128i hi) {
31158   #if defined(SIMDE_X86_AVX_NATIVE) && \
31159       !defined(SIMDE_BUG_GCC_REV_247851) && \
31160       SIMDE_DETECT_CLANG_VERSION_CHECK(3,6,0)
31161     return _mm256_setr_m128i(lo, hi);
31162   #else
31163     return simde_mm256_set_m128i(hi, lo);
31164   #endif
31165 }
31166 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
31167   #undef _mm256_setr_m128i
31168   #define _mm256_setr_m128i(lo, hi) \
31169     simde_mm256_setr_m128i(lo, hi)
31170 #endif
31171 
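/* _mm256_shuffle_ps applies the four 2-bit fields of imm8 within each 128-bit
 * lane: result elements 0-1 (and 4-5) are selected from a, elements 2-3 (and
 * 6-7) from b.  With imm8 == 0xE4 (_MM_SHUFFLE(3,2,1,0)) the result is
 * { a0, a1, b2, b3, a4, a5, b6, b7 }. */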
31172 SIMDE_FUNCTION_ATTRIBUTES
31173 simde__m256
31174 simde_mm256_shuffle_ps (simde__m256 a, simde__m256 b, const int imm8)
31175     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
31176   simde__m256_private
31177     r_,
31178     a_ = simde__m256_to_private(a),
31179     b_ = simde__m256_to_private(b);
31180 
31181   r_.f32[0] = a_.m128_private[0].f32[(imm8 >> 0) & 3];
31182   r_.f32[1] = a_.m128_private[0].f32[(imm8 >> 2) & 3];
31183   r_.f32[2] = b_.m128_private[0].f32[(imm8 >> 4) & 3];
31184   r_.f32[3] = b_.m128_private[0].f32[(imm8 >> 6) & 3];
31185   r_.f32[4] = a_.m128_private[1].f32[(imm8 >> 0) & 3];
31186   r_.f32[5] = a_.m128_private[1].f32[(imm8 >> 2) & 3];
31187   r_.f32[6] = b_.m128_private[1].f32[(imm8 >> 4) & 3];
31188   r_.f32[7] = b_.m128_private[1].f32[(imm8 >> 6) & 3];
31189 
31190   return simde__m256_from_private(r_);
31191 }
31192 #if defined(SIMDE_X86_AVX_NATIVE)
31193   #define simde_mm256_shuffle_ps(a, b, imm8) _mm256_shuffle_ps(a, b, imm8)
31194 #elif SIMDE_NATURAL_VECTOR_SIZE_LE(128)
31195   #define simde_mm256_shuffle_ps(a, b, imm8) \
31196       simde_mm256_set_m128( \
31197           simde_mm_shuffle_ps(simde_mm256_extractf128_ps(a, 1), simde_mm256_extractf128_ps(b, 1), (imm8)), \
31198           simde_mm_shuffle_ps(simde_mm256_extractf128_ps(a, 0), simde_mm256_extractf128_ps(b, 0), (imm8)))
31199 #elif defined(SIMDE_SHUFFLE_VECTOR_)
31200   #define simde_mm256_shuffle_ps(a, b, imm8) \
31201     SIMDE_SHUFFLE_VECTOR_(32, 32, a, b, \
31202       (((imm8) >> 0) & 3) + 0, \
31203       (((imm8) >> 2) & 3) + 0, \
31204       (((imm8) >> 4) & 3) + 8, \
31205       (((imm8) >> 6) & 3) + 8, \
31206       (((imm8) >> 0) & 3) + 4, \
31207       (((imm8) >> 2) & 3) + 4, \
31208       (((imm8) >> 4) & 3) + 12, \
31209       (((imm8) >> 6) & 3) + 12)
31210 #endif
31211 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
31212   #undef _mm256_shuffle_ps
31213   #define _mm256_shuffle_ps(a, b, imm8) simde_mm256_shuffle_ps(a, b, imm8)
31214 #endif
31215 
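/* _mm256_shuffle_pd: bits 0-3 of imm8 each select element 0 or 1 inside a
 * 128-bit lane; even result elements come from a, odd ones from b.  The low
 * lane uses imm8 bits 0-1 and the high lane bits 2-3. */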
31216 SIMDE_FUNCTION_ATTRIBUTES
31217 simde__m256d
31218 simde_mm256_shuffle_pd (simde__m256d a, simde__m256d b, const int imm8)
31219     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 15) {
31220   simde__m256d_private
31221     r_,
31222     a_ = simde__m256d_to_private(a),
31223     b_ = simde__m256d_to_private(b);
31224 
31225   r_.f64[0] = a_.f64[((imm8     ) & 1)    ];
31226   r_.f64[1] = b_.f64[((imm8 >> 1) & 1)    ];
31227   r_.f64[2] = a_.f64[((imm8 >> 2) & 1) | 2];
31228   r_.f64[3] = b_.f64[((imm8 >> 3) & 1) | 2];
31229 
31230   return simde__m256d_from_private(r_);
31231 }
31232 #if defined(SIMDE_X86_AVX_NATIVE)
31233   #define simde_mm256_shuffle_pd(a, b, imm8) _mm256_shuffle_pd(a, b, imm8)
31234 #elif SIMDE_NATURAL_VECTOR_SIZE_LE(128)
31235   #define simde_mm256_shuffle_pd(a, b, imm8) \
31236       simde_mm256_set_m128d( \
31237           simde_mm_shuffle_pd(simde_mm256_extractf128_pd(a, 1), simde_mm256_extractf128_pd(b, 1), (imm8 >> 2) & 3), \
31238           simde_mm_shuffle_pd(simde_mm256_extractf128_pd(a, 0), simde_mm256_extractf128_pd(b, 0), (imm8 >> 0) & 3))
31239 #elif defined(SIMDE_SHUFFLE_VECTOR_)
31240   #define simde_mm256_shuffle_pd(a, b, imm8) \
31241     SIMDE_SHUFFLE_VECTOR_(64, 32, a, b, \
31242       (((imm8) >> 0) & 1) + 0, \
31243       (((imm8) >> 1) & 1) + 4, \
31244       (((imm8) >> 2) & 1) + 2, \
31245       (((imm8) >> 3) & 1) + 6)
31246 #endif
31247 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
31248   #undef _mm256_shuffle_pd
31249   #define _mm256_shuffle_pd(a, b, imm8) simde_mm256_shuffle_pd(a, b, imm8)
31250 #endif
31251 
31252 SIMDE_FUNCTION_ATTRIBUTES
31253 simde__m256
31254 simde_mm256_sqrt_ps (simde__m256 a) {
31255   #if defined(SIMDE_X86_AVX_NATIVE)
31256     return _mm256_sqrt_ps(a);
31257   #else
31258     simde__m256_private
31259       r_,
31260       a_ = simde__m256_to_private(a);
31261 
31262     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
31263       r_.m128[0] = simde_mm_sqrt_ps(a_.m128[0]);
31264       r_.m128[1] = simde_mm_sqrt_ps(a_.m128[1]);
31265     #elif defined(simde_math_sqrtf)
31266       SIMDE_VECTORIZE
31267       for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
31268         r_.f32[i] = simde_math_sqrtf(a_.f32[i]);
31269       }
31270     #else
31271       HEDLEY_UNREACHABLE();
31272     #endif
31273 
31274     return simde__m256_from_private(r_);
31275   #endif
31276 }
31277 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
31278   #undef _mm256_sqrt_ps
31279   #define _mm256_sqrt_ps(a) simde_mm256_sqrt_ps(a)
31280 #endif
31281 
31282 SIMDE_FUNCTION_ATTRIBUTES
31283 simde__m256d
31284 simde_mm256_sqrt_pd (simde__m256d a) {
31285   #if defined(SIMDE_X86_AVX_NATIVE)
31286     return _mm256_sqrt_pd(a);
31287   #else
31288     simde__m256d_private
31289       r_,
31290       a_ = simde__m256d_to_private(a);
31291 
31292     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
31293       r_.m128d[0] = simde_mm_sqrt_pd(a_.m128d[0]);
31294       r_.m128d[1] = simde_mm_sqrt_pd(a_.m128d[1]);
31295     #elif defined(simde_math_sqrt)
31296       SIMDE_VECTORIZE
31297       for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
31298         r_.f64[i] = simde_math_sqrt(a_.f64[i]);
31299       }
31300     #else
31301       HEDLEY_UNREACHABLE();
31302     #endif
31303 
31304     return simde__m256d_from_private(r_);
31305   #endif
31306 }
31307 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
31308   #undef _mm256_sqrt_pd
31309   #define _mm256_sqrt_pd(a) simde_mm256_sqrt_pd(a)
31310 #endif
31311 
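/* store_* expects a 32-byte-aligned destination; the fallbacks use
 * SIMDE_ALIGN_ASSUME_LIKE so the compiler may still assume that alignment.
 * The storeu_* variants further below accept unaligned pointers. */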
31312 SIMDE_FUNCTION_ATTRIBUTES
31313 void
31314 simde_mm256_store_ps (simde_float32 mem_addr[8], simde__m256 a) {
31315   #if defined(SIMDE_X86_AVX_NATIVE)
31316     _mm256_store_ps(mem_addr, a);
31317   #else
31318     simde_memcpy(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m256), &a, sizeof(a));
31319   #endif
31320 }
31321 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
31322   #undef _mm256_store_ps
31323   #define _mm256_store_ps(mem_addr, a) simde_mm256_store_ps(HEDLEY_REINTERPRET_CAST(float*, mem_addr), a)
31324 #endif
31325 
31326 SIMDE_FUNCTION_ATTRIBUTES
31327 void
31328 simde_mm256_store_pd (simde_float64 mem_addr[4], simde__m256d a) {
31329   #if defined(SIMDE_X86_AVX_NATIVE)
31330     _mm256_store_pd(mem_addr, a);
31331   #else
31332     simde_memcpy(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m256d), &a, sizeof(a));
31333   #endif
31334 }
31335 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
31336   #undef _mm256_store_pd
31337   #define _mm256_store_pd(mem_addr, a) simde_mm256_store_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
31338 #endif
31339 
31340 SIMDE_FUNCTION_ATTRIBUTES
31341 void
31342 simde_mm256_store_si256 (simde__m256i* mem_addr, simde__m256i a) {
31343   #if defined(SIMDE_X86_AVX_NATIVE)
31344     _mm256_store_si256(mem_addr, a);
31345   #else
31346     simde_memcpy(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m256i), &a, sizeof(a));
31347   #endif
31348 }
31349 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
31350   #undef _mm256_store_si256
31351   #define _mm256_store_si256(mem_addr, a) simde_mm256_store_si256(mem_addr, a)
31352 #endif
31353 
31354 SIMDE_FUNCTION_ATTRIBUTES
31355 void
31356 simde_mm256_storeu_ps (simde_float32 mem_addr[8], simde__m256 a) {
31357   #if defined(SIMDE_X86_AVX_NATIVE)
31358     _mm256_storeu_ps(mem_addr, a);
31359   #else
31360     simde_memcpy(mem_addr, &a, sizeof(a));
31361   #endif
31362 }
31363 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
31364   #undef _mm256_storeu_ps
31365   #define _mm256_storeu_ps(mem_addr, a) simde_mm256_storeu_ps(HEDLEY_REINTERPRET_CAST(float*, mem_addr), a)
31366 #endif
31367 
31368 SIMDE_FUNCTION_ATTRIBUTES
31369 void
31370 simde_mm256_storeu_pd (simde_float64 mem_addr[4], simde__m256d a) {
31371   #if defined(SIMDE_X86_AVX_NATIVE)
31372     _mm256_storeu_pd(mem_addr, a);
31373   #else
31374     simde_memcpy(mem_addr, &a, sizeof(a));
31375   #endif
31376 }
31377 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
31378   #undef _mm256_storeu_pd
31379   #define _mm256_storeu_pd(mem_addr, a) simde_mm256_storeu_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
31380 #endif
31381 
31382 SIMDE_FUNCTION_ATTRIBUTES
31383 void
31384 simde_mm256_storeu_si256 (void* mem_addr, simde__m256i a) {
31385   #if defined(SIMDE_X86_AVX_NATIVE)
31386     _mm256_storeu_si256(SIMDE_ALIGN_CAST(__m256i*, mem_addr), a);
31387   #else
31388     simde_memcpy(mem_addr, &a, sizeof(a));
31389   #endif
31390 }
31391 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
31392   #undef _mm256_storeu_si256
31393   #define _mm256_storeu_si256(mem_addr, a) simde_mm256_storeu_si256(mem_addr, a)
31394 #endif
31395 
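/* storeu2_m128{,d,i} split a 256-bit value into two unaligned 128-bit stores:
 * the low half goes to lo_addr and the upper half to hi_addr. */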
31396 SIMDE_FUNCTION_ATTRIBUTES
31397 void
31398 simde_mm256_storeu2_m128 (simde_float32 hi_addr[4], simde_float32 lo_addr[4], simde__m256 a) {
31399   #if defined(SIMDE_X86_AVX_NATIVE) && !defined(SIMDE_BUG_GCC_91341) && !defined(SIMDE_BUG_MCST_LCC_MISSING_AVX_LOAD_STORE_M128_FUNCS)
31400     _mm256_storeu2_m128(hi_addr, lo_addr, a);
31401   #else
31402     simde_mm_storeu_ps(lo_addr, simde_mm256_castps256_ps128(a));
31403     simde_mm_storeu_ps(hi_addr, simde_mm256_extractf128_ps(a, 1));
31404   #endif
31405 }
31406 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
31407   #undef _mm256_storeu2_m128
31408   #define _mm256_storeu2_m128(hi_addr, lo_addr, a) simde_mm256_storeu2_m128(hi_addr, lo_addr, a)
31409 #endif
31410 
31411 SIMDE_FUNCTION_ATTRIBUTES
31412 void
31413 simde_mm256_storeu2_m128d (simde_float64 hi_addr[2], simde_float64 lo_addr[2], simde__m256d a) {
31414   #if defined(SIMDE_X86_AVX_NATIVE) && !defined(SIMDE_BUG_GCC_91341) && !defined(SIMDE_BUG_MCST_LCC_MISSING_AVX_LOAD_STORE_M128_FUNCS)
31415     _mm256_storeu2_m128d(hi_addr, lo_addr, a);
31416   #else
31417     simde_mm_storeu_pd(lo_addr, simde_mm256_castpd256_pd128(a));
31418     simde_mm_storeu_pd(hi_addr, simde_mm256_extractf128_pd(a, 1));
31419   #endif
31420 }
31421 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
31422   #undef _mm256_storeu2_m128d
31423   #define _mm256_storeu2_m128d(hi_addr, lo_addr, a) simde_mm256_storeu2_m128d(hi_addr, lo_addr, a)
31424 #endif
31425 
31426 SIMDE_FUNCTION_ATTRIBUTES
31427 void
31428 simde_mm256_storeu2_m128i (simde__m128i* hi_addr, simde__m128i* lo_addr, simde__m256i a) {
31429   #if defined(SIMDE_X86_AVX_NATIVE) && !defined(SIMDE_BUG_GCC_91341) && !defined(SIMDE_BUG_MCST_LCC_MISSING_AVX_LOAD_STORE_M128_FUNCS)
31430     _mm256_storeu2_m128i(hi_addr, lo_addr, a);
31431   #else
31432     simde_mm_storeu_si128(lo_addr, simde_mm256_castsi256_si128(a));
31433     simde_mm_storeu_si128(hi_addr, simde_mm256_extractf128_si256(a, 1));
31434   #endif
31435 }
31436 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
31437   #undef _mm256_storeu2_m128i
31438   #define _mm256_storeu2_m128i(hi_addr, lo_addr, a) simde_mm256_storeu2_m128i(hi_addr, lo_addr, a)
31439 #endif
31440 
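/* The stream_* fallbacks are plain aligned stores; the non-temporal
 * (cache-bypassing) hint of the native instructions is not emulated. */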
31441 SIMDE_FUNCTION_ATTRIBUTES
31442 void
31443 simde_mm256_stream_ps (simde_float32 mem_addr[8], simde__m256 a) {
31444   #if defined(SIMDE_X86_AVX_NATIVE)
31445     _mm256_stream_ps(mem_addr, a);
31446   #else
31447     simde_memcpy(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m256), &a, sizeof(a));
31448   #endif
31449 }
31450 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
31451   #undef _mm256_stream_ps
31452   #define _mm256_stream_ps(mem_addr, a) simde_mm256_stream_ps(HEDLEY_REINTERPRET_CAST(float*, mem_addr), a)
31453 #endif
31454 
31455 SIMDE_FUNCTION_ATTRIBUTES
31456 void
31457 simde_mm256_stream_pd (simde_float64 mem_addr[4], simde__m256d a) {
31458   #if defined(SIMDE_X86_AVX_NATIVE)
31459     _mm256_stream_pd(mem_addr, a);
31460   #else
31461     simde_memcpy(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m256d), &a, sizeof(a));
31462   #endif
31463 }
31464 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
31465   #undef _mm256_stream_pd
31466   #define _mm256_stream_pd(mem_addr, a) simde_mm256_stream_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
31467 #endif
31468 
31469 SIMDE_FUNCTION_ATTRIBUTES
31470 void
31471 simde_mm256_stream_si256 (simde__m256i* mem_addr, simde__m256i a) {
31472   #if defined(SIMDE_X86_AVX_NATIVE)
31473     _mm256_stream_si256(mem_addr, a);
31474   #else
31475     simde_memcpy(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m256i), &a, sizeof(a));
31476   #endif
31477 }
31478 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
31479   #undef _mm256_stream_si256
31480   #define _mm256_stream_si256(mem_addr, a) simde_mm256_stream_si256(mem_addr, a)
31481 #endif
31482 
31483 SIMDE_FUNCTION_ATTRIBUTES
31484 simde__m256
31485 simde_mm256_sub_ps (simde__m256 a, simde__m256 b) {
31486   #if defined(SIMDE_X86_AVX_NATIVE)
31487     return _mm256_sub_ps(a, b);
31488   #else
31489     simde__m256_private
31490       r_,
31491       a_ = simde__m256_to_private(a),
31492       b_ = simde__m256_to_private(b);
31493 
31494     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
31495       r_.m128[0] = simde_mm_sub_ps(a_.m128[0], b_.m128[0]);
31496       r_.m128[1] = simde_mm_sub_ps(a_.m128[1], b_.m128[1]);
31497     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
31498       r_.f32 = a_.f32 - b_.f32;
31499     #else
31500       SIMDE_VECTORIZE
31501       for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
31502         r_.f32[i] = a_.f32[i] - b_.f32[i];
31503       }
31504     #endif
31505 
31506     return simde__m256_from_private(r_);
31507   #endif
31508 }
31509 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
31510   #undef _mm256_sub_ps
31511   #define _mm256_sub_ps(a, b) simde_mm256_sub_ps(a, b)
31512 #endif
31513 
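/* hsub is implemented as (even-indexed elements) - (odd-indexed elements) of
 * the deinterleaved inputs, which matches the pairwise horizontal
 * subtraction performed by _mm256_hsub_ps/_mm256_hsub_pd. */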
31514 SIMDE_FUNCTION_ATTRIBUTES
31515 simde__m256
31516 simde_mm256_hsub_ps (simde__m256 a, simde__m256 b) {
31517   #if defined(SIMDE_X86_AVX_NATIVE)
31518     return _mm256_hsub_ps(a, b);
31519   #else
31520     return simde_mm256_sub_ps(simde_x_mm256_deinterleaveeven_ps(a, b), simde_x_mm256_deinterleaveodd_ps(a, b));
31521   #endif
31522 }
31523 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
31524   #undef _mm256_hsub_ps
31525   #define _mm256_hsub_ps(a, b) simde_mm256_hsub_ps(a, b)
31526 #endif
31527 
31528 SIMDE_FUNCTION_ATTRIBUTES
31529 simde__m256d
31530 simde_mm256_sub_pd (simde__m256d a, simde__m256d b) {
31531   #if defined(SIMDE_X86_AVX_NATIVE)
31532     return _mm256_sub_pd(a, b);
31533   #else
31534     simde__m256d_private
31535       r_,
31536       a_ = simde__m256d_to_private(a),
31537       b_ = simde__m256d_to_private(b);
31538 
31539     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
31540       r_.m128d[0] = simde_mm_sub_pd(a_.m128d[0], b_.m128d[0]);
31541       r_.m128d[1] = simde_mm_sub_pd(a_.m128d[1], b_.m128d[1]);
31542     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
31543       r_.f64 = a_.f64 - b_.f64;
31544     #else
31545       SIMDE_VECTORIZE
31546       for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
31547         r_.f64[i] = a_.f64[i] - b_.f64[i];
31548       }
31549     #endif
31550 
31551     return simde__m256d_from_private(r_);
31552   #endif
31553 }
31554 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
31555   #undef _mm256_sub_pd
31556   #define _mm256_sub_pd(a, b) simde_mm256_sub_pd(a, b)
31557 #endif
31558 
31559 SIMDE_FUNCTION_ATTRIBUTES
31560 simde__m256d
31561 simde_mm256_hsub_pd (simde__m256d a, simde__m256d b) {
31562   #if defined(SIMDE_X86_AVX_NATIVE)
31563     return _mm256_hsub_pd(a, b);
31564   #else
31565     return simde_mm256_sub_pd(simde_x_mm256_deinterleaveeven_pd(a, b), simde_x_mm256_deinterleaveodd_pd(a, b));
31566   #endif
31567 }
31568 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
31569   #undef _mm256_hsub_pd
31570   #define _mm256_hsub_pd(a, b) simde_mm256_hsub_pd(a, b)
31571 #endif
31572 
31573 #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
31574   HEDLEY_DIAGNOSTIC_PUSH
31575   SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_
31576 #endif
31577 
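/* The undefined_* fallbacks either leave the value deliberately
 * uninitialized (when the uninitialized-variable diagnostic can be
 * suppressed) or return a zeroed vector; callers must not rely on the
 * contents either way. */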
31578 SIMDE_FUNCTION_ATTRIBUTES
31579 simde__m256
31580 simde_mm256_undefined_ps (void) {
31581   simde__m256_private r_;
31582 
31583 #if \
31584     defined(SIMDE_X86_AVX_NATIVE) && \
31585     (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(5,0,0)) && \
31586     (!defined(__has_builtin) || HEDLEY_HAS_BUILTIN(__builtin_ia32_undef256))
31587   r_.n = _mm256_undefined_ps();
31588 #elif !defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
31589   r_ = simde__m256_to_private(simde_mm256_setzero_ps());
31590 #endif
31591 
31592   return simde__m256_from_private(r_);
31593 }
31594 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
31595   #undef _mm256_undefined_ps
31596   #define _mm256_undefined_ps() simde_mm256_undefined_ps()
31597 #endif
31598 
31599 SIMDE_FUNCTION_ATTRIBUTES
31600 simde__m256d
31601 simde_mm256_undefined_pd (void) {
31602   simde__m256d_private r_;
31603 
31604 #if \
31605     defined(SIMDE_X86_AVX_NATIVE) && \
31606     (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(5,0,0)) && \
31607     (!defined(__has_builtin) || HEDLEY_HAS_BUILTIN(__builtin_ia32_undef256))
31608   r_.n = _mm256_undefined_pd();
31609 #elif !defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
31610   r_ = simde__m256d_to_private(simde_mm256_setzero_pd());
31611 #endif
31612 
31613   return simde__m256d_from_private(r_);
31614 }
31615 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
31616   #undef _mm256_undefined_pd
31617   #define _mm256_undefined_pd() simde_mm256_undefined_pd()
31618 #endif
31619 
31620 SIMDE_FUNCTION_ATTRIBUTES
31621 simde__m256i
31622 simde_mm256_undefined_si256 (void) {
31623   simde__m256i_private r_;
31624 #if \
31625     defined(SIMDE_X86_AVX_NATIVE) && \
31626     (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(5,0,0)) && \
31627     (!defined(__has_builtin) || HEDLEY_HAS_BUILTIN(__builtin_ia32_undef256))
31628   r_.n = _mm256_undefined_si256();
31629 #elif !defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
31630   r_ = simde__m256i_to_private(simde_mm256_setzero_si256());
31631 #endif
31632 
31633   return simde__m256i_from_private(r_);
31634 }
31635 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
31636   #undef _mm256_undefined_si256
31637   #define _mm256_undefined_si256() simde_mm256_undefined_si256()
31638 #endif
31639 
31640 #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
31641   HEDLEY_DIAGNOSTIC_POP
31642 #endif
31643 
31644 SIMDE_FUNCTION_ATTRIBUTES
31645 simde__m256
31646 simde_mm256_xor_ps (simde__m256 a, simde__m256 b) {
31647   #if defined(SIMDE_X86_AVX_NATIVE)
31648     return _mm256_xor_ps(a, b);
31649   #else
31650     simde__m256_private
31651       r_,
31652       a_ = simde__m256_to_private(a),
31653       b_ = simde__m256_to_private(b);
31654 
31655     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
31656       r_.m128[0] = simde_mm_xor_ps(a_.m128[0], b_.m128[0]);
31657       r_.m128[1] = simde_mm_xor_ps(a_.m128[1], b_.m128[1]);
31658     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
31659       r_.i32f = a_.i32f ^ b_.i32f;
31660     #else
31661       SIMDE_VECTORIZE
31662       for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
31663         r_.u32[i] = a_.u32[i] ^ b_.u32[i];
31664       }
31665     #endif
31666 
31667     return simde__m256_from_private(r_);
31668   #endif
31669 }
31670 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
31671   #undef _mm256_xor_ps
31672   #define _mm256_xor_ps(a, b) simde_mm256_xor_ps(a, b)
31673 #endif
31674 
31675 SIMDE_FUNCTION_ATTRIBUTES
31676 simde__m256d
31677 simde_mm256_xor_pd (simde__m256d a, simde__m256d b) {
31678   #if defined(SIMDE_X86_AVX_NATIVE)
31679     return _mm256_xor_pd(a, b);
31680   #else
31681     simde__m256d_private
31682       r_,
31683       a_ = simde__m256d_to_private(a),
31684       b_ = simde__m256d_to_private(b);
31685 
31686     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
31687       r_.m128d[0] = simde_mm_xor_pd(a_.m128d[0], b_.m128d[0]);
31688       r_.m128d[1] = simde_mm_xor_pd(a_.m128d[1], b_.m128d[1]);
31689     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
31690       r_.i32f = a_.i32f ^ b_.i32f;
31691     #else
31692       SIMDE_VECTORIZE
31693       for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) {
31694         r_.u64[i] = a_.u64[i] ^ b_.u64[i];
31695       }
31696     #endif
31697 
31698     return simde__m256d_from_private(r_);
31699   #endif
31700 }
31701 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
31702   #undef _mm256_xor_pd
31703   #define _mm256_xor_pd(a, b) simde_mm256_xor_pd(a, b)
31704 #endif
31705 
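/* The simde_x_* functions below are SIMDe-internal helpers with no Intel
 * equivalent.  xorsign flips the sign of `dest` wherever `src` is negative
 * (the sign bits are XORed); negate simply returns -a. */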
31706 SIMDE_FUNCTION_ATTRIBUTES
31707 simde__m256
31708 simde_x_mm256_xorsign_ps(simde__m256 dest, simde__m256 src) {
31709   return simde_mm256_xor_ps(simde_mm256_and_ps(simde_mm256_set1_ps(-0.0f), src), dest);
31710 }
31711 
31712 SIMDE_FUNCTION_ATTRIBUTES
31713 simde__m256d
31714 simde_x_mm256_xorsign_pd(simde__m256d dest, simde__m256d src) {
31715   return simde_mm256_xor_pd(simde_mm256_and_pd(simde_mm256_set1_pd(-0.0), src), dest);
31716 }
31717 
31718 SIMDE_FUNCTION_ATTRIBUTES
31719 simde__m256
31720 simde_x_mm256_negate_ps(simde__m256 a) {
31721   #if defined(SIMDE_X86_AVX_NATIVE)
31722     return simde_mm256_xor_ps(a, _mm256_set1_ps(SIMDE_FLOAT32_C(-0.0)));
31723   #else
31724     simde__m256_private
31725       r_,
31726       a_ = simde__m256_to_private(a);
31727 
31728     #if defined(SIMDE_VECTOR_NEGATE)
31729       r_.f32 = -a_.f32;
31730     #else
31731       SIMDE_VECTORIZE
31732       for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
31733         r_.f32[i] = -a_.f32[i];
31734       }
31735     #endif
31736 
31737     return simde__m256_from_private(r_);
31738   #endif
31739 }
31740 
31741 SIMDE_FUNCTION_ATTRIBUTES
31742 simde__m256d
31743 simde_x_mm256_negate_pd(simde__m256d a) {
31744   #if defined(SIMDE_X86_AVX2_NATIVE)
31745     return simde_mm256_xor_pd(a, _mm256_set1_pd(SIMDE_FLOAT64_C(-0.0)));
31746   #else
31747     simde__m256d_private
31748       r_,
31749       a_ = simde__m256d_to_private(a);
31750 
31751     #if defined(SIMDE_VECTOR_NEGATE)
31752       r_.f64 = -a_.f64;
31753     #else
31754       SIMDE_VECTORIZE
31755       for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
31756         r_.f64[i] = -a_.f64[i];
31757       }
31758     #endif
31759 
31760     return simde__m256d_from_private(r_);
31761   #endif
31762 }
31763 
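/* The 256-bit unpack operations interleave within each 128-bit lane
 * independently, e.g. unpackhi_ps yields { a2, b2, a3, b3, a6, b6, a7, b7 }. */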
31764 SIMDE_FUNCTION_ATTRIBUTES
31765 simde__m256
31766 simde_mm256_unpackhi_ps (simde__m256 a, simde__m256 b) {
31767   #if defined(SIMDE_X86_AVX_NATIVE)
31768     return _mm256_unpackhi_ps(a, b);
31769   #else
31770     simde__m256_private
31771       r_,
31772       a_ = simde__m256_to_private(a),
31773       b_ = simde__m256_to_private(b);
31774 
31775     #if defined(SIMDE_SHUFFLE_VECTOR_)
31776       r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 32, a_.f32, b_.f32, 2, 10, 3, 11, 6, 14, 7, 15);
31777     #else
31778       r_.f32[0] = a_.f32[2];
31779       r_.f32[1] = b_.f32[2];
31780       r_.f32[2] = a_.f32[3];
31781       r_.f32[3] = b_.f32[3];
31782       r_.f32[4] = a_.f32[6];
31783       r_.f32[5] = b_.f32[6];
31784       r_.f32[6] = a_.f32[7];
31785       r_.f32[7] = b_.f32[7];
31786     #endif
31787 
31788     return simde__m256_from_private(r_);
31789   #endif
31790 }
31791 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
31792   #undef _mm256_unpackhi_ps
31793   #define _mm256_unpackhi_ps(a, b) simde_mm256_unpackhi_ps(a, b)
31794 #endif
31795 
31796 SIMDE_FUNCTION_ATTRIBUTES
31797 simde__m256d
31798 simde_mm256_unpackhi_pd (simde__m256d a, simde__m256d b) {
31799   #if defined(SIMDE_X86_AVX_NATIVE)
31800     return _mm256_unpackhi_pd(a, b);
31801   #else
31802     simde__m256d_private
31803       r_,
31804       a_ = simde__m256d_to_private(a),
31805       b_ = simde__m256d_to_private(b);
31806 
31807     #if defined(SIMDE_SHUFFLE_VECTOR_)
31808       r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 32, a_.f64, b_.f64, 1, 5, 3, 7);
31809     #else
31810       r_.f64[0] = a_.f64[1];
31811       r_.f64[1] = b_.f64[1];
31812       r_.f64[2] = a_.f64[3];
31813       r_.f64[3] = b_.f64[3];
31814     #endif
31815 
31816     return simde__m256d_from_private(r_);
31817   #endif
31818 }
31819 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
31820   #undef _mm256_unpackhi_pd
31821   #define _mm256_unpackhi_pd(a, b) simde_mm256_unpackhi_pd(a, b)
31822 #endif
31823 
31824 SIMDE_FUNCTION_ATTRIBUTES
31825 simde__m256
31826 simde_mm256_unpacklo_ps (simde__m256 a, simde__m256 b) {
31827   #if defined(SIMDE_X86_AVX_NATIVE)
31828     return _mm256_unpacklo_ps(a, b);
31829   #else
31830     simde__m256_private
31831       r_,
31832       a_ = simde__m256_to_private(a),
31833       b_ = simde__m256_to_private(b);
31834 
31835     #if defined(SIMDE_SHUFFLE_VECTOR_)
31836       r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 32, a_.f32, b_.f32, 0, 8, 1, 9, 4, 12, 5, 13);
31837     #else
31838       r_.f32[0] = a_.f32[0];
31839       r_.f32[1] = b_.f32[0];
31840       r_.f32[2] = a_.f32[1];
31841       r_.f32[3] = b_.f32[1];
31842       r_.f32[4] = a_.f32[4];
31843       r_.f32[5] = b_.f32[4];
31844       r_.f32[6] = a_.f32[5];
31845       r_.f32[7] = b_.f32[5];
31846     #endif
31847 
31848     return simde__m256_from_private(r_);
31849   #endif
31850 }
31851 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
31852   #undef _mm256_unpacklo_ps
31853   #define _mm256_unpacklo_ps(a, b) simde_mm256_unpacklo_ps(a, b)
31854 #endif
31855 
31856 SIMDE_FUNCTION_ATTRIBUTES
31857 simde__m256d
31858 simde_mm256_unpacklo_pd (simde__m256d a, simde__m256d b) {
31859   #if defined(SIMDE_X86_AVX_NATIVE)
31860     return _mm256_unpacklo_pd(a, b);
31861   #else
31862     simde__m256d_private
31863       r_,
31864       a_ = simde__m256d_to_private(a),
31865       b_ = simde__m256d_to_private(b);
31866 
31867     #if defined(SIMDE_SHUFFLE_VECTOR_)
31868       r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 32, a_.f64, b_.f64, 0, 4, 2, 6);
31869     #else
31870       r_.f64[0] = a_.f64[0];
31871       r_.f64[1] = b_.f64[0];
31872       r_.f64[2] = a_.f64[2];
31873       r_.f64[3] = b_.f64[2];
31874     #endif
31875 
31876     return simde__m256d_from_private(r_);
31877   #endif
31878 }
31879 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
31880   #undef _mm256_unpacklo_pd
31881   #define _mm256_unpacklo_pd(a, b) simde_mm256_unpacklo_pd(a, b)
31882 #endif
31883 
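/* zext*128*256 zero-extends a 128-bit vector: the input occupies the low half
 * of the result and the upper half is cleared. */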
31884 SIMDE_FUNCTION_ATTRIBUTES
31885 simde__m256
31886 simde_mm256_zextps128_ps256 (simde__m128 a) {
31887   #if defined(SIMDE_X86_AVX_NATIVE)
31888     return _mm256_insertf128_ps(_mm256_setzero_ps(), a, 0);
31889   #else
31890     simde__m256_private r_;
31891 
31892     r_.m128_private[0] = simde__m128_to_private(a);
31893     r_.m128_private[1] = simde__m128_to_private(simde_mm_setzero_ps());
31894 
31895     return simde__m256_from_private(r_);
31896   #endif
31897 }
31898 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
31899   #undef _mm256_zextps128_ps256
31900   #define _mm256_zextps128_ps256(a) simde_mm256_zextps128_ps256(a)
31901 #endif
31902 
31903 SIMDE_FUNCTION_ATTRIBUTES
31904 simde__m256d
31905 simde_mm256_zextpd128_pd256 (simde__m128d a) {
31906   #if defined(SIMDE_X86_AVX_NATIVE)
31907     return _mm256_insertf128_pd(_mm256_setzero_pd(), a, 0);
31908   #else
31909     simde__m256d_private r_;
31910 
31911     r_.m128d_private[0] = simde__m128d_to_private(a);
31912     r_.m128d_private[1] = simde__m128d_to_private(simde_mm_setzero_pd());
31913 
31914     return simde__m256d_from_private(r_);
31915   #endif
31916 }
31917 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
31918   #undef _mm256_zextpd128_pd256
31919   #define _mm256_zextpd128_pd256(a) simde_mm256_zextpd128_pd256(a)
31920 #endif
31921 
31922 SIMDE_FUNCTION_ATTRIBUTES
31923 simde__m256i
31924 simde_mm256_zextsi128_si256 (simde__m128i a) {
31925   #if defined(SIMDE_X86_AVX_NATIVE)
31926     return _mm256_insertf128_si256(_mm256_setzero_si256(), a, 0);
31927   #else
31928     simde__m256i_private r_;
31929 
31930     r_.m128i_private[0] = simde__m128i_to_private(a);
31931     r_.m128i_private[1] = simde__m128i_to_private(simde_mm_setzero_si128());
31932 
31933     return simde__m256i_from_private(r_);
31934   #endif
31935 }
31936 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
31937   #undef _mm256_zextsi128_si256
31938   #define _mm256_zextsi128_si256(a) simde_mm256_zextsi128_si256(a)
31939 #endif
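
/* Note (illustrative): unlike simde_mm256_castsi128_si256(), which leaves the
 * upper 128 bits undefined, the zext* functions guarantee that the upper lane
 * is zeroed, which the portable paths above do explicitly.  A minimal sketch:
 *
 *   simde__m256i widened = simde_mm256_zextsi128_si256(simde_mm_set1_epi32(1));
 *   // 32-bit lanes 0-3 hold 1, lanes 4-7 hold 0
 */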
31940 
31941 SIMDE_FUNCTION_ATTRIBUTES
31942 int
31943 simde_mm_testc_ps (simde__m128 a, simde__m128 b) {
31944   #if defined(SIMDE_X86_AVX_NATIVE)
31945     return _mm_testc_ps(a, b);
31946   #else
31947     simde__m128_private
31948       a_ = simde__m128_to_private(a),
31949       b_ = simde__m128_to_private(b);
31950 
31951     #if defined(SIMDE_WASM_SIMD128_NATIVE)
31952       v128_t m = wasm_u32x4_shr(wasm_v128_or(wasm_v128_not(b_.wasm_v128), a_.wasm_v128), 31);
31953       m = wasm_v128_and(m, simde_mm_movehl_ps(m, m));
31954       m = wasm_v128_and(m, simde_mm_shuffle_epi32(m, SIMDE_MM_SHUFFLE(3, 2, 0, 1)));
31955       return wasm_i32x4_extract_lane(m, 0);
31956     #else
31957       uint_fast32_t r = 0;
31958       SIMDE_VECTORIZE_REDUCTION(|:r)
31959       for (size_t i = 0 ; i < (sizeof(a_.u32) / sizeof(a_.u32[0])) ; i++) {
31960         r |= ~a_.u32[i] & b_.u32[i];
31961       }
31962 
31963       return HEDLEY_STATIC_CAST(int, ((~r >> 31) & 1));
31964     #endif
31965   #endif
31966 }
31967 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
31968   #undef _mm_testc_ps
31969   #define _mm_testc_ps(a, b) simde_mm_testc_ps(a, b)
31970 #endif
31971 
31972 SIMDE_FUNCTION_ATTRIBUTES
31973 int
31974 simde_mm_testc_pd (simde__m128d a, simde__m128d b) {
31975   #if defined(SIMDE_X86_AVX_NATIVE)
31976     return _mm_testc_pd(a, b);
31977   #else
31978     simde__m128d_private
31979       a_ = simde__m128d_to_private(a),
31980       b_ = simde__m128d_to_private(b);
31981 
31982     #if defined(SIMDE_WASM_SIMD128_NATIVE)
31983       v128_t m = wasm_u64x2_shr(wasm_v128_or(wasm_v128_not(b_.wasm_v128), a_.wasm_v128), 63);
31984       return HEDLEY_STATIC_CAST(int, wasm_i64x2_extract_lane(m, 0) & wasm_i64x2_extract_lane(m, 1));
31985     #else
31986       uint_fast64_t r = 0;
31987       SIMDE_VECTORIZE_REDUCTION(|:r)
31988       for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) {
31989         r |= ~a_.u64[i] & b_.u64[i];
31990       }
31991 
31992       return HEDLEY_STATIC_CAST(int, ((~r >> 63) & 1));
31993     #endif
31994   #endif
31995 }
31996 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
31997   #undef _mm_testc_pd
31998   #define _mm_testc_pd(a, b) simde_mm_testc_pd(a, b)
31999 #endif
32000 
32001 SIMDE_FUNCTION_ATTRIBUTES
32002 int
32003 simde_mm256_testc_ps (simde__m256 a, simde__m256 b) {
32004   #if defined(SIMDE_X86_AVX_NATIVE)
32005     return _mm256_testc_ps(a, b);
32006   #else
32007     uint_fast32_t r = 0;
32008     simde__m256_private
32009       a_ = simde__m256_to_private(a),
32010       b_ = simde__m256_to_private(b);
32011 
32012     SIMDE_VECTORIZE_REDUCTION(|:r)
32013     for (size_t i = 0 ; i < (sizeof(a_.u32) / sizeof(a_.u32[0])) ; i++) {
32014       r |= ~a_.u32[i] & b_.u32[i];
32015     }
32016 
32017     return HEDLEY_STATIC_CAST(int, ((~r >> 31) & 1));
32018   #endif
32019 }
32020 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
32021   #undef _mm256_testc_ps
32022   #define _mm256_testc_ps(a, b) simde_mm256_testc_ps(a, b)
32023 #endif
32024 
32025 SIMDE_FUNCTION_ATTRIBUTES
32026 int
32027 simde_mm256_testc_pd (simde__m256d a, simde__m256d b) {
32028   #if defined(SIMDE_X86_AVX_NATIVE)
32029     return _mm256_testc_pd(a, b);
32030   #else
32031     uint_fast64_t r = 0;
32032     simde__m256d_private
32033       a_ = simde__m256d_to_private(a),
32034       b_ = simde__m256d_to_private(b);
32035 
32036     SIMDE_VECTORIZE_REDUCTION(|:r)
32037     for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) {
32038       r |= ~a_.u64[i] & b_.u64[i];
32039     }
32040 
32041     return HEDLEY_STATIC_CAST(int, ((~r >> 63) & 1));
32042   #endif
32043 }
32044 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
32045   #undef _mm256_testc_pd
32046   #define _mm256_testc_pd(a, b) simde_mm256_testc_pd(a, b)
32047 #endif
32048 
32049 SIMDE_FUNCTION_ATTRIBUTES
32050 int
32051 simde_mm256_testc_si256 (simde__m256i a, simde__m256i b) {
32052   #if defined(SIMDE_X86_AVX_NATIVE)
32053     return _mm256_testc_si256(a, b);
32054   #else
32055     int_fast32_t r = 0;
32056     simde__m256i_private
32057       a_ = simde__m256i_to_private(a),
32058       b_ = simde__m256i_to_private(b);
32059 
32060     SIMDE_VECTORIZE_REDUCTION(|:r)
32061     for (size_t i = 0 ; i < (sizeof(a_.i32f) / sizeof(a_.i32f[0])) ; i++) {
32062       r |= ~a_.i32f[i] & b_.i32f[i];
32063     }
32064 
32065     return HEDLEY_STATIC_CAST(int, !r);
32066   #endif
32067 }
32068 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
32069   #undef _mm256_testc_si256
32070   #define _mm256_testc_si256(a, b) simde_mm256_testc_si256(a, b)
32071 #endif
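
/* Note (illustrative): the portable fallbacks above mirror the flag semantics
 * of VTESTPS/VTESTPD/VPTEST.  testc returns the carry flag, which is set when
 * (~a & b) contributes no bits: the _ps/_pd variants only examine the sign
 * bits (hence the final shift by 31 or 63), while the si256 variant examines
 * every bit (hence the plain `!r`). */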
32072 
32073 SIMDE_FUNCTION_ATTRIBUTES
32074 int
32075 simde_mm_testz_ps (simde__m128 a, simde__m128 b) {
32076   #if defined(SIMDE_X86_AVX_NATIVE)
32077     return _mm_testz_ps(a, b);
32078   #else
32079     simde__m128_private
32080       a_ = simde__m128_to_private(a),
32081       b_ = simde__m128_to_private(b);
32082 
32083     #if defined(SIMDE_WASM_SIMD128_NATIVE)
32084       v128_t m = wasm_u32x4_shr(wasm_v128_not(wasm_v128_and(a_.wasm_v128, b_.wasm_v128)), 31);
32085       m = wasm_v128_and(m, simde_mm_movehl_ps(m, m));
32086       m = wasm_v128_and(m, simde_mm_shuffle_epi32(m, SIMDE_MM_SHUFFLE(3, 2, 0, 1)));
32087       return wasm_i32x4_extract_lane(m, 0);
32088     #else
32089       uint_fast32_t r = 0;
32090       SIMDE_VECTORIZE_REDUCTION(|:r)
32091       for (size_t i = 0 ; i < (sizeof(a_.u32) / sizeof(a_.u32[0])) ; i++) {
32092         r |= a_.u32[i] & b_.u32[i];
32093       }
32094 
32095       return HEDLEY_STATIC_CAST(int, ((~r >> 31) & 1));
32096     #endif
32097   #endif
32098 }
32099 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
32100   #undef _mm_testz_ps
32101   #define _mm_testz_ps(a, b) simde_mm_testz_ps(a, b)
32102 #endif
32103 
32104 SIMDE_FUNCTION_ATTRIBUTES
32105 int
32106 simde_mm_testz_pd (simde__m128d a, simde__m128d b) {
32107   #if defined(SIMDE_X86_AVX_NATIVE)
32108     return _mm_testz_pd(a, b);
32109   #else
32110     simde__m128d_private
32111       a_ = simde__m128d_to_private(a),
32112       b_ = simde__m128d_to_private(b);
32113 
32114     #if defined(SIMDE_WASM_SIMD128_NATIVE)
32115       v128_t m = wasm_u64x2_shr(wasm_v128_not(wasm_v128_and(a_.wasm_v128, b_.wasm_v128)), 63);
32116       return HEDLEY_STATIC_CAST(int, wasm_i64x2_extract_lane(m, 0) & wasm_i64x2_extract_lane(m, 1));
32117     #else
32118       uint_fast64_t r = 0;
32119       SIMDE_VECTORIZE_REDUCTION(|:r)
32120       for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) {
32121         r |= a_.u64[i] & b_.u64[i];
32122       }
32123 
32124       return HEDLEY_STATIC_CAST(int, ((~r >> 63) & 1));
32125     #endif
32126   #endif
32127 }
32128 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
32129   #undef _mm_testz_pd
32130   #define _mm_testz_pd(a, b) simde_mm_testz_pd(a, b)
32131 #endif
32132 
32133 SIMDE_FUNCTION_ATTRIBUTES
32134 int
32135 simde_mm256_testz_ps (simde__m256 a, simde__m256 b) {
32136   #if defined(SIMDE_X86_AVX_NATIVE)
32137     return _mm256_testz_ps(a, b);
32138   #else
32139     uint_fast32_t r = 0;
32140     simde__m256_private
32141       a_ = simde__m256_to_private(a),
32142       b_ = simde__m256_to_private(b);
32143 
32144     SIMDE_VECTORIZE_REDUCTION(|:r)
32145     for (size_t i = 0 ; i < (sizeof(a_.u32) / sizeof(a_.u32[0])) ; i++) {
32146       r |= a_.u32[i] & b_.u32[i];
32147     }
32148 
32149     return HEDLEY_STATIC_CAST(int, ((~r >> 31) & 1));
32150   #endif
32151 }
32152 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
32153   #undef _mm256_testz_ps
32154   #define _mm256_testz_ps(a, b) simde_mm256_testz_ps(a, b)
32155 #endif
32156 
32157 SIMDE_FUNCTION_ATTRIBUTES
32158 int
32159 simde_mm256_testz_pd (simde__m256d a, simde__m256d b) {
32160   #if defined(SIMDE_X86_AVX_NATIVE)
32161     return _mm256_testz_pd(a, b);
32162   #else
32163     uint_fast64_t r = 0;
32164     simde__m256d_private
32165       a_ = simde__m256d_to_private(a),
32166       b_ = simde__m256d_to_private(b);
32167 
32168     SIMDE_VECTORIZE_REDUCTION(|:r)
32169     for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) {
32170       r |= a_.u64[i] & b_.u64[i];
32171     }
32172 
32173     return HEDLEY_STATIC_CAST(int, ((~r >> 63) & 1));
32174   #endif
32175 }
32176 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
32177   #undef _mm256_testz_pd
32178   #define _mm256_testz_pd(a, b) simde_mm256_testz_pd(a, b)
32179 #endif
32180 
32181 SIMDE_FUNCTION_ATTRIBUTES
32182 int
32183 simde_mm256_testz_si256 (simde__m256i a, simde__m256i b) {
32184   #if defined(SIMDE_X86_AVX_NATIVE)
32185     return _mm256_testz_si256(a, b);
32186   #else
32187     int_fast32_t r = 0;
32188     simde__m256i_private
32189       a_ = simde__m256i_to_private(a),
32190       b_ = simde__m256i_to_private(b);
32191 
32192     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
32193       r = simde_mm_testz_si128(a_.m128i[0], b_.m128i[0]) && simde_mm_testz_si128(a_.m128i[1], b_.m128i[1]);
32194     #else
32195       SIMDE_VECTORIZE_REDUCTION(|:r)
32196       for (size_t i = 0 ; i < (sizeof(a_.i32f) / sizeof(a_.i32f[0])) ; i++) {
32197         r |= a_.i32f[i] & b_.i32f[i];
32198       }
32199 
32200       r = !r;
32201     #endif
32202 
32203     return HEDLEY_STATIC_CAST(int, r);
32204   #endif
32205 }
32206 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
32207   #undef _mm256_testz_si256
32208   #define _mm256_testz_si256(a, b) simde_mm256_testz_si256(a, b)
32209 #endif
32210 
32211 SIMDE_FUNCTION_ATTRIBUTES
32212 int
32213 simde_mm_testnzc_ps (simde__m128 a, simde__m128 b) {
32214   #if defined(SIMDE_X86_AVX_NATIVE)
32215     return _mm_testnzc_ps(a, b);
32216   #else
32217     simde__m128_private
32218       a_ = simde__m128_to_private(a),
32219       b_ = simde__m128_to_private(b);
32220 
32221     #if defined(SIMDE_WASM_SIMD128_NATIVE)
32222       v128_t m = wasm_u32x4_shr(wasm_v128_and(a_.wasm_v128, b_.wasm_v128), 31);
32223       v128_t m2 = wasm_u32x4_shr(wasm_v128_andnot(b_.wasm_v128, a_.wasm_v128), 31);
32224       m  = wasm_v128_or(m,  simde_mm_movehl_ps(m, m));
32225       m2 = wasm_v128_or(m2, simde_mm_movehl_ps(m2, m2));
32226       m  = wasm_v128_or(m,  simde_mm_shuffle_epi32(m, SIMDE_MM_SHUFFLE(3, 2, 0, 1)));
32227       m2 = wasm_v128_or(m2, simde_mm_shuffle_epi32(m2, SIMDE_MM_SHUFFLE(3, 2, 0, 1)));
32228       return wasm_i32x4_extract_lane(m, 0) & wasm_i32x4_extract_lane(m2, 0);
32229     #else
32230       uint32_t rz = 0, rc = 0;
32231       for (size_t i = 0 ; i < (sizeof(a_.u32) / sizeof(a_.u32[0])) ; i++) {
32232         rc |= ~a_.u32[i] & b_.u32[i];
32233         rz |=  a_.u32[i] & b_.u32[i];
32234       }
32235 
32236       return
32237         (rc >> ((sizeof(rc) * CHAR_BIT) - 1)) &
32238         (rz >> ((sizeof(rz) * CHAR_BIT) - 1));
32239     #endif
32240   #endif
32241 }
32242 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
32243   #undef _mm_testnzc_ps
32244   #define _mm_testnzc_ps(a, b) simde_mm_testnzc_ps(a, b)
32245 #endif
32246 
32247 SIMDE_FUNCTION_ATTRIBUTES
32248 int
32249 simde_mm_testnzc_pd (simde__m128d a, simde__m128d b) {
32250   #if defined(SIMDE_X86_AVX_NATIVE)
32251     return _mm_testnzc_pd(a, b);
32252   #else
32253     simde__m128d_private
32254       a_ = simde__m128d_to_private(a),
32255       b_ = simde__m128d_to_private(b);
32256     #if defined(SIMDE_WASM_SIMD128_NATIVE)
32257       v128_t m = wasm_u64x2_shr(wasm_v128_and(a_.wasm_v128, b_.wasm_v128), 63);
32258       v128_t m2 = wasm_u64x2_shr(wasm_v128_andnot(b_.wasm_v128, a_.wasm_v128), 63);
32259       return HEDLEY_STATIC_CAST(int, (wasm_i64x2_extract_lane(m, 0)  | wasm_i64x2_extract_lane(m, 1))
32260         & (wasm_i64x2_extract_lane(m2, 0) | wasm_i64x2_extract_lane(m2, 1)));
32261     #else
32262       uint64_t rc = 0, rz = 0;
32263       for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) {
32264         rc |= ~a_.u64[i] & b_.u64[i];
32265         rz |=  a_.u64[i] & b_.u64[i];
32266       }
32267 
32268       return
32269         (rc >> ((sizeof(rc) * CHAR_BIT) - 1)) &
32270         (rz >> ((sizeof(rz) * CHAR_BIT) - 1));
32271     #endif
32272   #endif
32273 }
32274 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
32275   #undef _mm_testnzc_pd
32276   #define _mm_testnzc_pd(a, b) simde_mm_testnzc_pd(a, b)
32277 #endif
32278 
32279 SIMDE_FUNCTION_ATTRIBUTES
32280 int
32281 simde_mm256_testnzc_ps (simde__m256 a, simde__m256 b) {
32282   #if defined(SIMDE_X86_AVX_NATIVE)
32283     return _mm256_testnzc_ps(a, b);
32284   #else
32285     uint32_t rc = 0, rz = 0;
32286     simde__m256_private
32287       a_ = simde__m256_to_private(a),
32288       b_ = simde__m256_to_private(b);
32289 
32290     for (size_t i = 0 ; i < (sizeof(a_.u32) / sizeof(a_.u32[0])) ; i++) {
32291       rc |= ~a_.u32[i] & b_.u32[i];
32292       rz |=  a_.u32[i] & b_.u32[i];
32293     }
32294 
32295     return
32296       (rc >> ((sizeof(rc) * CHAR_BIT) - 1)) &
32297       (rz >> ((sizeof(rz) * CHAR_BIT) - 1));
32298   #endif
32299 }
32300 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
32301   #undef _mm256_testnzc_ps
32302   #define _mm256_testnzc_ps(a, b) simde_mm256_testnzc_ps(a, b)
32303 #endif
32304 
32305 SIMDE_FUNCTION_ATTRIBUTES
32306 int
32307 simde_mm256_testnzc_pd (simde__m256d a, simde__m256d b) {
32308   #if defined(SIMDE_X86_AVX_NATIVE)
32309     return _mm256_testnzc_pd(a, b);
32310   #else
32311     uint64_t rc = 0, rz = 0;
32312     simde__m256d_private
32313       a_ = simde__m256d_to_private(a),
32314       b_ = simde__m256d_to_private(b);
32315 
32316     for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) {
32317       rc |= ~a_.u64[i] & b_.u64[i];
32318       rz |=  a_.u64[i] & b_.u64[i];
32319     }
32320 
32321     return
32322       (rc >> ((sizeof(rc) * CHAR_BIT) - 1)) &
32323       (rz >> ((sizeof(rz) * CHAR_BIT) - 1));
32324   #endif
32325 }
32326 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
32327   #undef _mm256_testnzc_pd
32328   #define _mm256_testnzc_pd(a, b) simde_mm256_testnzc_pd(a, b)
32329 #endif
32330 
32331 SIMDE_FUNCTION_ATTRIBUTES
32332 int
32333 simde_mm256_testnzc_si256 (simde__m256i a, simde__m256i b) {
32334   #if defined(SIMDE_X86_AVX_NATIVE)
32335     return _mm256_testnzc_si256(a, b);
32336   #else
32337     int32_t rc = 0, rz = 0;
32338     simde__m256i_private
32339       a_ = simde__m256i_to_private(a),
32340       b_ = simde__m256i_to_private(b);
32341 
32342     for (size_t i = 0 ; i < (sizeof(a_.i32f) / sizeof(a_.i32f[0])) ; i++) {
32343       rc |= ~a_.i32f[i] & b_.i32f[i];
32344       rz |=  a_.i32f[i] & b_.i32f[i];
32345     }
32346 
32347     return !!(rc & rz);
32348   #endif
32349 }
32350 #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
32351   #undef _mm256_testnzc_si256
32352   #define _mm256_testnzc_si256(a, b) simde_mm256_testnzc_si256(a, b)
32353 #endif
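
/* Illustrative usage sketch (`data` is a hypothetical input vector):
 *
 *   simde__m256i mask = simde_mm256_set1_epi8(0x0F);
 *   int none = simde_mm256_testz_si256(data, mask);   // 1 if (data & mask) == 0
 *   int all  = simde_mm256_testc_si256(data, mask);   // 1 if (~data & mask) == 0
 *   int some = simde_mm256_testnzc_si256(data, mask); // 1 if neither holds
 */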
32354 
32355 SIMDE_END_DECLS_
32356 
32357 HEDLEY_DIAGNOSTIC_POP
32358 
32359 #endif /* !defined(SIMDE_X86_AVX_H) */
32360 /* :: End ../simde/simde/x86/avx.h :: */
32361 
32362 HEDLEY_DIAGNOSTIC_PUSH
32363 SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
32364 SIMDE_BEGIN_DECLS_
32365 
32366 SIMDE_FUNCTION_ATTRIBUTES
32367 simde__m256i
32368 simde_mm256_abs_epi8 (simde__m256i a) {
32369   #if defined(SIMDE_X86_AVX2_NATIVE)
32370     return _mm256_abs_epi8(a);
32371   #else
32372     simde__m256i_private
32373       r_,
32374       a_ = simde__m256i_to_private(a);
32375 
32376     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
32377       r_.m128i[0] = simde_mm_abs_epi8(a_.m128i[0]);
32378       r_.m128i[1] = simde_mm_abs_epi8(a_.m128i[1]);
32379     #else
32380       SIMDE_VECTORIZE
32381       for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
32382         r_.i8[i] = (a_.i8[i] < INT32_C(0)) ? -a_.i8[i] : a_.i8[i];
32383       }
32384     #endif
32385 
32386     return simde__m256i_from_private(r_);
32387   #endif
32388 }
32389 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
32390   #undef _mm256_abs_epi8
32391   #define _mm256_abs_epi8(a) simde_mm256_abs_epi8(a)
32392 #endif
32393 
32394 SIMDE_FUNCTION_ATTRIBUTES
32395 simde__m256i
32396 simde_mm256_abs_epi16 (simde__m256i a) {
32397   #if defined(SIMDE_X86_AVX2_NATIVE)
32398     return _mm256_abs_epi16(a);
32399   #else
32400     simde__m256i_private
32401       r_,
32402       a_ = simde__m256i_to_private(a);
32403 
32404     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
32405       r_.m128i[0] = simde_mm_abs_epi16(a_.m128i[0]);
32406       r_.m128i[1] = simde_mm_abs_epi16(a_.m128i[1]);
32407     #else
32408       SIMDE_VECTORIZE
32409       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
32410         r_.i16[i] = (a_.i16[i] < INT32_C(0)) ? -a_.i16[i] : a_.i16[i];
32411       }
32412     #endif
32413 
32414     return simde__m256i_from_private(r_);
32415   #endif
32416 }
32417 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
32418   #undef _mm256_abs_epi16
32419   #define _mm256_abs_epi16(a) simde_mm256_abs_epi16(a)
32420 #endif
32421 
32422 SIMDE_FUNCTION_ATTRIBUTES
32423 simde__m256i
32424 simde_mm256_abs_epi32(simde__m256i a) {
32425   #if defined(SIMDE_X86_AVX2_NATIVE)
32426     return _mm256_abs_epi32(a);
32427   #else
32428     simde__m256i_private
32429       r_,
32430       a_ = simde__m256i_to_private(a);
32431 
32432     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
32433       r_.m128i[0] = simde_mm_abs_epi32(a_.m128i[0]);
32434       r_.m128i[1] = simde_mm_abs_epi32(a_.m128i[1]);
32435     #else
32436       SIMDE_VECTORIZE
32437       for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
32438         r_.i32[i] = (a_.i32[i] < INT32_C(0)) ? -a_.i32[i] : a_.i32[i];
32439       }
32440     #endif
32441 
32442     return simde__m256i_from_private(r_);
32443   #endif
32444 }
32445 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
32446   #undef _mm256_abs_epi32
32447   #define _mm256_abs_epi32(a) simde_mm256_abs_epi32(a)
32448 #endif
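
/* Note (illustrative): as with the native PABSB/PABSW/PABSD instructions there
 * is no saturation, so the absolute value of the most negative element (e.g.
 * INT8_MIN for abs_epi8) wraps back to the same negative value. */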
32449 
32450 SIMDE_FUNCTION_ATTRIBUTES
32451 simde__m256i
32452 simde_mm256_add_epi8 (simde__m256i a, simde__m256i b) {
32453   #if defined(SIMDE_X86_AVX2_NATIVE)
32454     return _mm256_add_epi8(a, b);
32455   #else
32456     simde__m256i_private
32457       r_,
32458       a_ = simde__m256i_to_private(a),
32459       b_ = simde__m256i_to_private(b);
32460 
32461     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
32462       r_.m128i[0] = simde_mm_add_epi8(a_.m128i[0], b_.m128i[0]);
32463       r_.m128i[1] = simde_mm_add_epi8(a_.m128i[1], b_.m128i[1]);
32464     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
32465       r_.i8 = a_.i8 + b_.i8;
32466     #else
32467       SIMDE_VECTORIZE
32468       for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
32469         r_.i8[i] = a_.i8[i] + b_.i8[i];
32470       }
32471     #endif
32472 
32473     return simde__m256i_from_private(r_);
32474   #endif
32475 }
32476 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
32477   #undef _mm256_add_epi8
32478   #define _mm256_add_epi8(a, b) simde_mm256_add_epi8(a, b)
32479 #endif
32480 
32481 SIMDE_FUNCTION_ATTRIBUTES
32482 simde__m256i
32483 simde_mm256_add_epi16 (simde__m256i a, simde__m256i b) {
32484   #if defined(SIMDE_X86_AVX2_NATIVE)
32485     return _mm256_add_epi16(a, b);
32486   #else
32487     simde__m256i_private
32488       r_,
32489       a_ = simde__m256i_to_private(a),
32490       b_ = simde__m256i_to_private(b);
32491 
32492     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
32493       r_.m128i[0] = simde_mm_add_epi16(a_.m128i[0], b_.m128i[0]);
32494       r_.m128i[1] = simde_mm_add_epi16(a_.m128i[1], b_.m128i[1]);
32495     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
32496       r_.i16 = a_.i16 + b_.i16;
32497     #else
32498       SIMDE_VECTORIZE
32499       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
32500         r_.i16[i] = a_.i16[i] + b_.i16[i];
32501       }
32502     #endif
32503 
32504     return simde__m256i_from_private(r_);
32505   #endif
32506 }
32507 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
32508   #undef _mm256_add_epi16
32509   #define _mm256_add_epi16(a, b) simde_mm256_add_epi16(a, b)
32510 #endif
32511 
32512 SIMDE_FUNCTION_ATTRIBUTES
32513 simde__m256i
32514 simde_mm256_hadd_epi16 (simde__m256i a, simde__m256i b) {
32515   #if defined(SIMDE_X86_AVX2_NATIVE)
32516     return _mm256_hadd_epi16(a, b);
32517   #else
32518     return simde_mm256_add_epi16(simde_x_mm256_deinterleaveeven_epi16(a, b), simde_x_mm256_deinterleaveodd_epi16(a, b));
32519   #endif
32520 }
32521 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
32522   #undef _mm256_hadd_epi16
32523   #define _mm256_hadd_epi16(a, b) simde_mm256_hadd_epi16(a, b)
32524 #endif
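
/* Note (illustrative): the horizontal add is decomposed into a regular add of
 * the even- and odd-indexed elements of (a, b), matching the native per-lane
 * behaviour.  For the low 128-bit lane of hadd_epi16:
 *   r = { a0+a1, a2+a3, a4+a5, a6+a7, b0+b1, b2+b3, b4+b5, b6+b7 }
 * and likewise, with the upper halves of a and b, for the high lane. */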
32525 
32526 SIMDE_FUNCTION_ATTRIBUTES
32527 simde__m256i
32528 simde_mm256_add_epi32 (simde__m256i a, simde__m256i b) {
32529   #if defined(SIMDE_X86_AVX2_NATIVE)
32530     return _mm256_add_epi32(a, b);
32531   #else
32532     simde__m256i_private
32533       r_,
32534       a_ = simde__m256i_to_private(a),
32535       b_ = simde__m256i_to_private(b);
32536 
32537     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
32538       r_.m128i[0] = simde_mm_add_epi32(a_.m128i[0], b_.m128i[0]);
32539       r_.m128i[1] = simde_mm_add_epi32(a_.m128i[1], b_.m128i[1]);
32540     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
32541       r_.i32 = a_.i32 + b_.i32;
32542     #else
32543       SIMDE_VECTORIZE
32544       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
32545         r_.i32[i] = a_.i32[i] + b_.i32[i];
32546       }
32547     #endif
32548 
32549     return simde__m256i_from_private(r_);
32550   #endif
32551 }
32552 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
32553   #undef _mm256_add_epi32
32554   #define _mm256_add_epi32(a, b) simde_mm256_add_epi32(a, b)
32555 #endif
32556 
32557 SIMDE_FUNCTION_ATTRIBUTES
32558 simde__m256i
32559 simde_mm256_hadd_epi32 (simde__m256i a, simde__m256i b) {
32560   #if defined(SIMDE_X86_AVX2_NATIVE)
32561     return _mm256_hadd_epi32(a, b);
32562   #else
32563     return simde_mm256_add_epi32(simde_x_mm256_deinterleaveeven_epi32(a, b), simde_x_mm256_deinterleaveodd_epi32(a, b));
32564   #endif
32565 }
32566 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
32567   #undef _mm256_hadd_epi32
32568   #define _mm256_hadd_epi32(a, b) simde_mm256_hadd_epi32(a, b)
32569 #endif
32570 
32571 SIMDE_FUNCTION_ATTRIBUTES
32572 simde__m256i
32573 simde_mm256_add_epi64 (simde__m256i a, simde__m256i b) {
32574   #if defined(SIMDE_X86_AVX2_NATIVE)
32575     return _mm256_add_epi64(a, b);
32576   #else
32577     simde__m256i_private
32578       r_,
32579       a_ = simde__m256i_to_private(a),
32580       b_ = simde__m256i_to_private(b);
32581 
32582     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
32583       r_.m128i[0] = simde_mm_add_epi64(a_.m128i[0], b_.m128i[0]);
32584       r_.m128i[1] = simde_mm_add_epi64(a_.m128i[1], b_.m128i[1]);
32585     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_CLANG_BAD_VI64_OPS)
32586       r_.i64 = a_.i64 + b_.i64;
32587     #else
32588       SIMDE_VECTORIZE
32589       for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
32590         r_.i64[i] = a_.i64[i] + b_.i64[i];
32591       }
32592     #endif
32593 
32594     return simde__m256i_from_private(r_);
32595   #endif
32596 }
32597 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
32598   #undef _mm256_add_epi64
32599   #define _mm256_add_epi64(a, b) simde_mm256_add_epi64(a, b)
32600 #endif
32601 
32602 SIMDE_FUNCTION_ATTRIBUTES
32603 simde__m256i
32604 simde_mm256_alignr_epi8 (simde__m256i a, simde__m256i b, int count)
32605     SIMDE_REQUIRE_CONSTANT_RANGE(count, 0, 255) {
32606   simde__m256i_private
32607     r_,
32608     a_ = simde__m256i_to_private(a),
32609     b_ = simde__m256i_to_private(b);
32610 
32611   if (HEDLEY_UNLIKELY(count > 31))
32612     return simde_mm256_setzero_si256();
32613 
32614   for (size_t h = 0 ; h < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; h++) {
32615     SIMDE_VECTORIZE
32616     for (size_t i = 0 ; i < (sizeof(r_.m128i_private[h].i8) / sizeof(r_.m128i_private[h].i8[0])) ; i++) {
32617       const int srcpos = count + HEDLEY_STATIC_CAST(int, i);
32618       if (srcpos > 31) {
32619         r_.m128i_private[h].i8[i] = 0;
32620       } else if (srcpos > 15) {
32621         r_.m128i_private[h].i8[i] = a_.m128i_private[h].i8[(srcpos) & 15];
32622       } else {
32623         r_.m128i_private[h].i8[i] = b_.m128i_private[h].i8[srcpos];
32624       }
32625     }
32626   }
32627 
32628   return simde__m256i_from_private(r_);
32629 }
32630 #if defined(SIMDE_X86_AVX2_NATIVE)
32631 #  define simde_mm256_alignr_epi8(a, b, count) _mm256_alignr_epi8(a, b, count)
32632 #elif SIMDE_NATURAL_VECTOR_SIZE_LE(128)
32633 #  define simde_mm256_alignr_epi8(a, b, count) \
32634       simde_mm256_set_m128i( \
32635           simde_mm_alignr_epi8(simde_mm256_extracti128_si256(a, 1), simde_mm256_extracti128_si256(b, 1), (count)), \
32636           simde_mm_alignr_epi8(simde_mm256_extracti128_si256(a, 0), simde_mm256_extracti128_si256(b, 0), (count)))
32637 #endif
32638 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
32639   #undef _mm256_alignr_epi8
32640   #define _mm256_alignr_epi8(a, b, count) simde_mm256_alignr_epi8(a, b, (count))
32641 #endif
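
/* Note (illustrative): like the native instruction, this alignr operates on
 * each 128-bit lane independently, concatenating the corresponding lanes of a
 * and b into a 32-byte intermediate and shifting it right by `count` bytes;
 * counts of 32 or more always produce zero, which is why the portable path
 * short-circuits before the loop. */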
32642 
32643 SIMDE_FUNCTION_ATTRIBUTES
32644 simde__m256i
32645 simde_mm256_and_si256 (simde__m256i a, simde__m256i b) {
32646   #if defined(SIMDE_X86_AVX2_NATIVE)
32647     return _mm256_and_si256(a, b);
32648   #else
32649     simde__m256i_private
32650       r_,
32651       a_ = simde__m256i_to_private(a),
32652       b_ = simde__m256i_to_private(b);
32653 
32654     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
32655       r_.m128i[0] = simde_mm_and_si128(a_.m128i[0], b_.m128i[0]);
32656       r_.m128i[1] = simde_mm_and_si128(a_.m128i[1], b_.m128i[1]);
32657     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
32658       r_.i32f = a_.i32f & b_.i32f;
32659     #else
32660       SIMDE_VECTORIZE
32661       for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
32662         r_.i64[i] = a_.i64[i] & b_.i64[i];
32663       }
32664     #endif
32665 
32666     return simde__m256i_from_private(r_);
32667   #endif
32668 }
32669 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
32670   #undef _mm256_and_si256
32671   #define _mm256_and_si256(a, b) simde_mm256_and_si256(a, b)
32672 #endif
32673 
32674 SIMDE_FUNCTION_ATTRIBUTES
32675 simde__m256i
32676 simde_mm256_andnot_si256 (simde__m256i a, simde__m256i b) {
32677   #if defined(SIMDE_X86_AVX2_NATIVE)
32678     return _mm256_andnot_si256(a, b);
32679   #else
32680     simde__m256i_private
32681       r_,
32682       a_ = simde__m256i_to_private(a),
32683       b_ = simde__m256i_to_private(b);
32684 
32685     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
32686       r_.m128i[0] = simde_mm_andnot_si128(a_.m128i[0], b_.m128i[0]);
32687       r_.m128i[1] = simde_mm_andnot_si128(a_.m128i[1], b_.m128i[1]);
32688     #else
32689       SIMDE_VECTORIZE
32690       for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
32691         r_.i32f[i] = ~(a_.i32f[i]) & b_.i32f[i];
32692       }
32693     #endif
32694 
32695     return simde__m256i_from_private(r_);
32696   #endif
32697 }
32698 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
32699   #undef _mm256_andnot_si256
32700   #define _mm256_andnot_si256(a, b) simde_mm256_andnot_si256(a, b)
32701 #endif
32702 
32703 SIMDE_FUNCTION_ATTRIBUTES
32704 simde__m256i
32705 simde_mm256_adds_epi8 (simde__m256i a, simde__m256i b) {
32706   #if defined(SIMDE_X86_AVX2_NATIVE)
32707     return _mm256_adds_epi8(a, b);
32708   #else
32709     simde__m256i_private
32710       r_,
32711       a_ = simde__m256i_to_private(a),
32712       b_ = simde__m256i_to_private(b);
32713 
32714     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
32715       r_.m128i[0] = simde_mm_adds_epi8(a_.m128i[0], b_.m128i[0]);
32716       r_.m128i[1] = simde_mm_adds_epi8(a_.m128i[1], b_.m128i[1]);
32717     #else
32718       SIMDE_VECTORIZE
32719       for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
32720         r_.i8[i] = simde_math_adds_i8(a_.i8[i], b_.i8[i]);
32721       }
32722     #endif
32723 
32724     return simde__m256i_from_private(r_);
32725   #endif
32726 }
32727 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
32728   #undef _mm256_adds_epi8
32729   #define _mm256_adds_epi8(a, b) simde_mm256_adds_epi8(a, b)
32730 #endif
32731 
32732 SIMDE_FUNCTION_ATTRIBUTES
32733 simde__m256i
32734 simde_mm256_adds_epi16(simde__m256i a, simde__m256i b) {
32735   #if defined(SIMDE_X86_AVX2_NATIVE)
32736     return _mm256_adds_epi16(a, b);
32737   #else
32738     simde__m256i_private
32739       r_,
32740       a_ = simde__m256i_to_private(a),
32741       b_ = simde__m256i_to_private(b);
32742 
32743     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
32744       r_.m128i[0] = simde_mm_adds_epi16(a_.m128i[0], b_.m128i[0]);
32745       r_.m128i[1] = simde_mm_adds_epi16(a_.m128i[1], b_.m128i[1]);
32746     #else
32747       SIMDE_VECTORIZE
32748       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
32749         r_.i16[i] = simde_math_adds_i16(a_.i16[i], b_.i16[i]);
32750       }
32751     #endif
32752 
32753     return simde__m256i_from_private(r_);
32754   #endif
32755 }
32756 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
32757   #undef _mm256_adds_epi16
32758   #define _mm256_adds_epi16(a, b) simde_mm256_adds_epi16(a, b)
32759 #endif
32760 
32761 SIMDE_FUNCTION_ATTRIBUTES
32762 simde__m256i
32763 simde_mm256_hadds_epi16 (simde__m256i a, simde__m256i b) {
32764   #if defined(SIMDE_X86_AVX2_NATIVE)
32765     return _mm256_hadds_epi16(a, b);
32766   #else
32767     return simde_mm256_adds_epi16(simde_x_mm256_deinterleaveeven_epi16(a, b), simde_x_mm256_deinterleaveodd_epi16(a, b));
32768   #endif
32769 }
32770 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
32771   #undef _mm256_hadds_epi16
32772   #define _mm256_hadds_epi16(a, b) simde_mm256_hadds_epi16(a, b)
32773 #endif
32774 
32775 SIMDE_FUNCTION_ATTRIBUTES
32776 simde__m256i
32777 simde_mm256_adds_epu8 (simde__m256i a, simde__m256i b) {
32778   #if defined(SIMDE_X86_AVX2_NATIVE)
32779     return _mm256_adds_epu8(a, b);
32780   #else
32781     simde__m256i_private
32782       r_,
32783       a_ = simde__m256i_to_private(a),
32784       b_ = simde__m256i_to_private(b);
32785 
32786     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
32787       r_.m128i[0] = simde_mm_adds_epu8(a_.m128i[0], b_.m128i[0]);
32788       r_.m128i[1] = simde_mm_adds_epu8(a_.m128i[1], b_.m128i[1]);
32789     #else
32790       SIMDE_VECTORIZE
32791       for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
32792         r_.u8[i] = simde_math_adds_u8(a_.u8[i], b_.u8[i]);
32793       }
32794     #endif
32795 
32796     return simde__m256i_from_private(r_);
32797   #endif
32798 }
32799 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
32800   #undef _mm256_adds_epu8
32801   #define _mm256_adds_epu8(a, b) simde_mm256_adds_epu8(a, b)
32802 #endif
32803 
32804 SIMDE_FUNCTION_ATTRIBUTES
32805 simde__m256i
32806 simde_mm256_adds_epu16(simde__m256i a, simde__m256i b) {
32807   #if defined(SIMDE_X86_AVX2_NATIVE)
32808     return _mm256_adds_epu16(a, b);
32809   #else
32810     simde__m256i_private
32811       r_,
32812       a_ = simde__m256i_to_private(a),
32813       b_ = simde__m256i_to_private(b);
32814 
32815     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
32816       r_.m128i[0] = simde_mm_adds_epu16(a_.m128i[0], b_.m128i[0]);
32817       r_.m128i[1] = simde_mm_adds_epu16(a_.m128i[1], b_.m128i[1]);
32818     #else
32819       SIMDE_VECTORIZE
32820       for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
32821         r_.u16[i] = simde_math_adds_u16(a_.u16[i], b_.u16[i]);
32822       }
32823     #endif
32824 
32825     return simde__m256i_from_private(r_);
32826   #endif
32827 }
32828 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
32829   #undef _mm256_adds_epu16
32830   #define _mm256_adds_epu16(a, b) simde_mm256_adds_epu16(a, b)
32831 #endif
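
/* Note (illustrative): the adds_* family saturates instead of wrapping, e.g.
 * adds_epu8(200, 100) yields 255 per element and adds_epi8(100, 100) yields
 * 127, whereas the plain add_* functions above wrap around. */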
32832 
32833 SIMDE_FUNCTION_ATTRIBUTES
32834 simde__m256i
32835 simde_mm256_avg_epu8 (simde__m256i a, simde__m256i b) {
32836   #if defined(SIMDE_X86_AVX2_NATIVE)
32837     return _mm256_avg_epu8(a, b);
32838   #else
32839     simde__m256i_private
32840       r_,
32841       a_ = simde__m256i_to_private(a),
32842       b_ = simde__m256i_to_private(b);
32843 
32844     SIMDE_VECTORIZE
32845     for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
32846       r_.u8[i] = (a_.u8[i] + b_.u8[i] + 1) >> 1;
32847     }
32848 
32849     return simde__m256i_from_private(r_);
32850   #endif
32851 }
32852 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
32853   #undef _mm256_avg_epu8
32854   #define _mm256_avg_epu8(a, b) simde_mm256_avg_epu8(a, b)
32855 #endif
32856 
32857 SIMDE_FUNCTION_ATTRIBUTES
32858 simde__m256i
32859 simde_mm256_avg_epu16 (simde__m256i a, simde__m256i b) {
32860   #if defined(SIMDE_X86_AVX2_NATIVE)
32861     return _mm256_avg_epu16(a, b);
32862   #else
32863     simde__m256i_private
32864       r_,
32865       a_ = simde__m256i_to_private(a),
32866       b_ = simde__m256i_to_private(b);
32867 
32868     SIMDE_VECTORIZE
32869     for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
32870       r_.u16[i] = (a_.u16[i] + b_.u16[i] + 1) >> 1;
32871     }
32872 
32873     return simde__m256i_from_private(r_);
32874   #endif
32875 }
32876 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
32877   #undef _mm256_avg_epu16
32878   #define _mm256_avg_epu16(a, b) simde_mm256_avg_epu16(a, b)
32879 #endif
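
/* Note (illustrative): the averages round up, matching PAVGB/PAVGW; the "+ 1"
 * before the shift means, for example, that avg_epu8(1, 2) yields 2 per
 * element rather than 1. */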
32880 
32881 SIMDE_FUNCTION_ATTRIBUTES
32882 simde__m128i
32883 simde_mm_blend_epi32(simde__m128i a, simde__m128i b, const int imm8)
32884     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 15) {
32885   simde__m128i_private
32886     r_,
32887     a_ = simde__m128i_to_private(a),
32888     b_ = simde__m128i_to_private(b);
32889 
32890   SIMDE_VECTORIZE
32891   for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
32892     r_.i32[i] = ((imm8 >> i) & 1) ? b_.i32[i] : a_.i32[i];
32893   }
32894 
32895   return simde__m128i_from_private(r_);
32896 }
32897 #if defined(SIMDE_X86_AVX2_NATIVE)
32898 #  define simde_mm_blend_epi32(a, b, imm8) _mm_blend_epi32(a, b, imm8)
32899 #elif SIMDE_NATURAL_VECTOR_SIZE_LE(128)
32900 #  define simde_mm_blend_epi32(a, b, imm8) \
32901   simde_mm_castps_si128(simde_mm_blend_ps(simde_mm_castsi128_ps(a), simde_mm_castsi128_ps(b), (imm8)))
32902 #endif
32903 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
32904   #undef _mm_blend_epi32
32905   #define _mm_blend_epi32(a, b, imm8) simde_mm_blend_epi32(a, b, imm8)
32906 #endif
32907 
32908 SIMDE_FUNCTION_ATTRIBUTES
32909 simde__m256i
32910 simde_mm256_blend_epi16(simde__m256i a, simde__m256i b, const int imm8)
32911     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
32912   simde__m256i_private
32913     r_,
32914     a_ = simde__m256i_to_private(a),
32915     b_ = simde__m256i_to_private(b);
32916 
32917   SIMDE_VECTORIZE
32918   for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
32919     r_.i16[i] = ((imm8 >> (i % 8)) & 1) ? b_.i16[i] : a_.i16[i];
32920   }
32921 
32922   return simde__m256i_from_private(r_);
32923 }
32924 #if defined(SIMDE_X86_AVX2_NATIVE) && defined(SIMDE_BUG_CLANG_REV_234560)
32925 #  define simde_mm256_blend_epi16(a, b, imm8) _mm256_castpd_si256(_mm256_blend_epi16(a, b, imm8))
32926 #elif defined(SIMDE_X86_AVX2_NATIVE)
32927 #  define simde_mm256_blend_epi16(a, b, imm8) _mm256_blend_epi16(a, b, imm8)
32928 #elif SIMDE_NATURAL_VECTOR_SIZE_LE(128)
32929 #  define simde_mm256_blend_epi16(a, b, imm8) \
32930       simde_mm256_set_m128i( \
32931           simde_mm_blend_epi16(simde_mm256_extracti128_si256(a, 1), simde_mm256_extracti128_si256(b, 1), (imm8)), \
32932           simde_mm_blend_epi16(simde_mm256_extracti128_si256(a, 0), simde_mm256_extracti128_si256(b, 0), (imm8)))
32933 #endif
32934 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
32935   #undef _mm256_blend_epi16
32936   #define _mm256_blend_epi16(a, b, imm8) simde_mm256_blend_epi16(a, b, imm8)
32937 #endif
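
/* Note (illustrative): _mm256_blend_epi16 only takes an 8-bit immediate, so
 * the same control byte selects elements in both 128-bit lanes; the `i % 8`
 * above mirrors that.  _mm256_blend_epi32 (below) has one control bit per
 * element, so no wrap-around is needed there. */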
32938 
32939 
32940 SIMDE_FUNCTION_ATTRIBUTES
32941 simde__m256i
32942 simde_mm256_blend_epi32(simde__m256i a, simde__m256i b, const int imm8)
32943     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
32944   simde__m256i_private
32945     r_,
32946     a_ = simde__m256i_to_private(a),
32947     b_ = simde__m256i_to_private(b);
32948 
32949   SIMDE_VECTORIZE
32950   for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
32951     r_.i32[i] = ((imm8 >> i) & 1) ? b_.i32[i] : a_.i32[i];
32952   }
32953 
32954   return simde__m256i_from_private(r_);
32955 }
32956 #if defined(SIMDE_X86_AVX2_NATIVE)
32957 #  define simde_mm256_blend_epi32(a, b, imm8) _mm256_blend_epi32(a, b, imm8)
32958 #elif SIMDE_NATURAL_VECTOR_SIZE_LE(128)
32959 #  define simde_mm256_blend_epi32(a, b, imm8) \
32960       simde_mm256_set_m128i( \
32961           simde_mm_blend_epi32(simde_mm256_extracti128_si256(a, 1), simde_mm256_extracti128_si256(b, 1), (imm8) >> 4), \
32962           simde_mm_blend_epi32(simde_mm256_extracti128_si256(a, 0), simde_mm256_extracti128_si256(b, 0), (imm8) & 0x0F))
32963 #endif
32964 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
32965   #undef _mm256_blend_epi32
32966   #define _mm256_blend_epi32(a, b, imm8) simde_mm256_blend_epi32(a, b, imm8)
32967 #endif
32968 
32969 
32970 SIMDE_FUNCTION_ATTRIBUTES
32971 simde__m256i
32972 simde_mm256_blendv_epi8(simde__m256i a, simde__m256i b, simde__m256i mask) {
32973   #if defined(SIMDE_X86_AVX2_NATIVE)
32974     return _mm256_blendv_epi8(a, b, mask);
32975   #else
32976     simde__m256i_private
32977       r_,
32978       a_ = simde__m256i_to_private(a),
32979       b_ = simde__m256i_to_private(b),
32980       mask_ = simde__m256i_to_private(mask);
32981 
32982     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
32983       r_.m128i[0] = simde_mm_blendv_epi8(a_.m128i[0], b_.m128i[0], mask_.m128i[0]);
32984       r_.m128i[1] = simde_mm_blendv_epi8(a_.m128i[1], b_.m128i[1], mask_.m128i[1]);
32985     #else
32986       SIMDE_VECTORIZE
32987       for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
32988         if (mask_.u8[i] & 0x80) {
32989           r_.u8[i] = b_.u8[i];
32990         } else {
32991           r_.u8[i] = a_.u8[i];
32992         }
32993       }
32994     #endif
32995 
32996     return simde__m256i_from_private(r_);
32997   #endif
32998 }
32999 #if defined(SIMDE_X86_AVX2_NATIVE)
33000 #  define simde_mm256_blendv_epi8(a, b, imm8)  _mm256_blendv_epi8(a, b, imm8)
33001 #endif
33002 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
33003   #undef _mm256_blendv_epi8
33004   #define _mm256_blendv_epi8(a, b, mask) simde_mm256_blendv_epi8(a, b, mask)
33005 #endif
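
/* Note (illustrative): only the most significant bit of each mask byte is
 * consulted, so masks produced by the cmpeq/cmpgt comparisons (all-ones or
 * all-zeros per element) can be passed to blendv_epi8 directly. */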
33006 
33007 SIMDE_FUNCTION_ATTRIBUTES
33008 simde__m128i
33009 simde_mm_broadcastb_epi8 (simde__m128i a) {
33010   #if defined(SIMDE_X86_AVX2_NATIVE)
33011     return _mm_broadcastb_epi8(a);
33012   #else
33013     simde__m128i_private r_;
33014     simde__m128i_private a_= simde__m128i_to_private(a);
33015 
33016     SIMDE_VECTORIZE
33017     for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
33018       r_.i8[i] = a_.i8[0];
33019     }
33020 
33021     return simde__m128i_from_private(r_);
33022   #endif
33023 }
33024 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
33025   #undef _mm_broadcastb_epi8
33026   #define _mm_broadcastb_epi8(a) simde_mm_broadcastb_epi8(a)
33027 #endif
33028 
33029 SIMDE_FUNCTION_ATTRIBUTES
33030 simde__m256i
33031 simde_mm256_broadcastb_epi8 (simde__m128i a) {
33032   #if defined(SIMDE_X86_AVX2_NATIVE)
33033     return _mm256_broadcastb_epi8(a);
33034   #else
33035     simde__m256i_private r_;
33036     simde__m128i_private a_= simde__m128i_to_private(a);
33037 
33038     SIMDE_VECTORIZE
33039     for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
33040       r_.i8[i] = a_.i8[0];
33041     }
33042 
33043     return simde__m256i_from_private(r_);
33044   #endif
33045 }
33046 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
33047   #undef _mm256_broadcastb_epi8
33048   #define _mm256_broadcastb_epi8(a) simde_mm256_broadcastb_epi8(a)
33049 #endif
33050 
33051 SIMDE_FUNCTION_ATTRIBUTES
33052 simde__m128i
33053 simde_mm_broadcastw_epi16 (simde__m128i a) {
33054   #if defined(SIMDE_X86_AVX2_NATIVE)
33055     return _mm_broadcastw_epi16(a);
33056   #else
33057     simde__m128i_private r_;
33058     simde__m128i_private a_= simde__m128i_to_private(a);
33059 
33060     SIMDE_VECTORIZE
33061     for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
33062       r_.i16[i] = a_.i16[0];
33063     }
33064 
33065     return simde__m128i_from_private(r_);
33066   #endif
33067 }
33068 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
33069   #undef _mm_broadcastw_epi16
33070   #define _mm_broadcastw_epi16(a) simde_mm_broadcastw_epi16(a)
33071 #endif
33072 
33073 SIMDE_FUNCTION_ATTRIBUTES
33074 simde__m256i
33075 simde_mm256_broadcastw_epi16 (simde__m128i a) {
33076   #if defined(SIMDE_X86_AVX2_NATIVE)
33077     return _mm256_broadcastw_epi16(a);
33078   #else
33079     simde__m256i_private r_;
33080     simde__m128i_private a_= simde__m128i_to_private(a);
33081 
33082     SIMDE_VECTORIZE
33083     for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
33084       r_.i16[i] = a_.i16[0];
33085     }
33086 
33087     return simde__m256i_from_private(r_);
33088   #endif
33089 }
33090 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
33091   #undef _mm256_broadcastw_epi16
33092   #define _mm256_broadcastw_epi16(a) simde_mm256_broadcastw_epi16(a)
33093 #endif
33094 
33095 SIMDE_FUNCTION_ATTRIBUTES
33096 simde__m128i
33097 simde_mm_broadcastd_epi32 (simde__m128i a) {
33098   #if defined(SIMDE_X86_AVX2_NATIVE)
33099     return _mm_broadcastd_epi32(a);
33100   #else
33101     simde__m128i_private r_;
33102     simde__m128i_private a_= simde__m128i_to_private(a);
33103 
33104     SIMDE_VECTORIZE
33105     for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
33106       r_.i32[i] = a_.i32[0];
33107     }
33108 
33109     return simde__m128i_from_private(r_);
33110   #endif
33111 }
33112 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
33113   #undef _mm_broadcastd_epi32
33114   #define _mm_broadcastd_epi32(a) simde_mm_broadcastd_epi32(a)
33115 #endif
33116 
33117 SIMDE_FUNCTION_ATTRIBUTES
33118 simde__m256i
33119 simde_mm256_broadcastd_epi32 (simde__m128i a) {
33120   #if defined(SIMDE_X86_AVX2_NATIVE)
33121     return _mm256_broadcastd_epi32(a);
33122   #else
33123     simde__m256i_private r_;
33124     simde__m128i_private a_= simde__m128i_to_private(a);
33125 
33126     SIMDE_VECTORIZE
33127     for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
33128       r_.i32[i] = a_.i32[0];
33129     }
33130 
33131     return simde__m256i_from_private(r_);
33132   #endif
33133 }
33134 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
33135   #undef _mm256_broadcastd_epi32
33136   #define _mm256_broadcastd_epi32(a) simde_mm256_broadcastd_epi32(a)
33137 #endif
33138 
33139 SIMDE_FUNCTION_ATTRIBUTES
33140 simde__m128i
33141 simde_mm_broadcastq_epi64 (simde__m128i a) {
33142   #if defined(SIMDE_X86_AVX2_NATIVE)
33143     return _mm_broadcastq_epi64(a);
33144   #else
33145     simde__m128i_private r_;
33146     simde__m128i_private a_= simde__m128i_to_private(a);
33147 
33148     SIMDE_VECTORIZE
33149     for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
33150       r_.i64[i] = a_.i64[0];
33151     }
33152 
33153     return simde__m128i_from_private(r_);
33154   #endif
33155 }
33156 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
33157   #undef _mm_broadcastq_epi64
33158   #define _mm_broadcastq_epi64(a) simde_mm_broadcastq_epi64(a)
33159 #endif
33160 
33161 SIMDE_FUNCTION_ATTRIBUTES
33162 simde__m256i
33163 simde_mm256_broadcastq_epi64 (simde__m128i a) {
33164   #if defined(SIMDE_X86_AVX2_NATIVE)
33165     return _mm256_broadcastq_epi64(a);
33166   #else
33167     simde__m256i_private r_;
33168     simde__m128i_private a_= simde__m128i_to_private(a);
33169 
33170     SIMDE_VECTORIZE
33171     for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
33172       r_.i64[i] = a_.i64[0];
33173     }
33174 
33175     return simde__m256i_from_private(r_);
33176   #endif
33177 }
33178 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
33179   #undef _mm256_broadcastq_epi64
33180   #define _mm256_broadcastq_epi64(a) simde_mm256_broadcastq_epi64(a)
33181 #endif
33182 
33183 SIMDE_FUNCTION_ATTRIBUTES
33184 simde__m128
33185 simde_mm_broadcastss_ps (simde__m128 a) {
33186   #if defined(SIMDE_X86_AVX2_NATIVE)
33187     return _mm_broadcastss_ps(a);
33188   #else
33189     simde__m128_private r_;
33190     simde__m128_private a_= simde__m128_to_private(a);
33191 
33192     SIMDE_VECTORIZE
33193     for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
33194       r_.f32[i] = a_.f32[0];
33195     }
33196 
33197     return simde__m128_from_private(r_);
33198   #endif
33199 }
33200 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
33201   #undef _mm_broadcastss_ps
33202   #define _mm_broadcastss_ps(a) simde_mm_broadcastss_ps(a)
33203 #endif
33204 
33205 SIMDE_FUNCTION_ATTRIBUTES
33206 simde__m256
33207 simde_mm256_broadcastss_ps (simde__m128 a) {
33208   #if defined(SIMDE_X86_AVX2_NATIVE)
33209     return _mm256_broadcastss_ps(a);
33210   #else
33211     simde__m256_private r_;
33212     simde__m128_private a_= simde__m128_to_private(a);
33213 
33214     SIMDE_VECTORIZE
33215     for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
33216       r_.f32[i] = a_.f32[0];
33217     }
33218 
33219     return simde__m256_from_private(r_);
33220   #endif
33221 }
33222 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
33223   #undef _mm256_broadcastss_ps
33224   #define _mm256_broadcastss_ps(a) simde_mm256_broadcastss_ps(a)
33225 #endif
33226 
33227 SIMDE_FUNCTION_ATTRIBUTES
33228 simde__m128d
33229 simde_mm_broadcastsd_pd (simde__m128d a) {
33230   return simde_mm_movedup_pd(a);
33231 }
33232 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
33233   #undef _mm_broadcastsd_pd
33234   #define _mm_broadcastsd_pd(a) simde_mm_broadcastsd_pd(a)
33235 #endif
33236 
33237 SIMDE_FUNCTION_ATTRIBUTES
33238 simde__m256d
33239 simde_mm256_broadcastsd_pd (simde__m128d a) {
33240   #if defined(SIMDE_X86_AVX2_NATIVE)
33241     return _mm256_broadcastsd_pd(a);
33242   #else
33243     simde__m256d_private r_;
33244     simde__m128d_private a_= simde__m128d_to_private(a);
33245 
33246     SIMDE_VECTORIZE
33247     for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
33248       r_.f64[i] = a_.f64[0];
33249     }
33250 
33251     return simde__m256d_from_private(r_);
33252   #endif
33253 }
33254 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
33255   #undef _mm256_broadcastsd_pd
33256   #define _mm256_broadcastsd_pd(a) simde_mm256_broadcastsd_pd(a)
33257 #endif
33258 
33259 SIMDE_FUNCTION_ATTRIBUTES
33260 simde__m256i
33261 simde_mm256_broadcastsi128_si256 (simde__m128i a) {
33262   #if defined(SIMDE_X86_AVX2_NATIVE) && \
33263       (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(4,8,0))
33264     return _mm256_broadcastsi128_si256(a);
33265   #else
33266     simde__m256i_private r_;
33267     simde__m128i_private a_ = simde__m128i_to_private(a);
33268 
33269     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
33270       r_.m128i_private[0] = a_;
33271       r_.m128i_private[1] = a_;
33272     #else
33273       r_.i64[0] = a_.i64[0];
33274       r_.i64[1] = a_.i64[1];
33275       r_.i64[2] = a_.i64[0];
33276       r_.i64[3] = a_.i64[1];
33277     #endif
33278 
33279     return simde__m256i_from_private(r_);
33280   #endif
33281 }
33282 #define simde_mm_broadcastsi128_si256(a) simde_mm256_broadcastsi128_si256(a)
33283 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
33284   #undef _mm256_broadcastsi128_si256
33285   #define _mm256_broadcastsi128_si256(a) simde_mm256_broadcastsi128_si256(a)
33286   #undef _mm_broadcastsi128_si256
33287   #define _mm_broadcastsi128_si256(a) simde_mm256_broadcastsi128_si256(a)
33288 #endif
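
/* Illustrative usage sketch (the constant pattern here is hypothetical):
 *
 *   simde__m128i pattern = simde_mm_set1_epi8(0x2A);
 *   simde__m256i both    = simde_mm256_broadcastsi128_si256(pattern);
 *   // both 128-bit lanes of `both` now hold `pattern`
 */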
33289 
33290 SIMDE_FUNCTION_ATTRIBUTES
33291 simde__m256i
33292 simde_mm256_bslli_epi128 (simde__m256i a, const int imm8)
33293     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255)  {
33294     simde__m256i_private
33295       r_,
33296       a_ = simde__m256i_to_private(a);
33297     const int ssize = HEDLEY_STATIC_CAST(int, (sizeof(r_.i8) / sizeof(r_.i8[0])));
33298 
33299     SIMDE_VECTORIZE
33300     for (int i = 0 ; i < ssize ; i++) {
33301       const int e = i - imm8;
33302       if (i >= (ssize/2)) {
33303         if (e >= (ssize/2) && e < ssize)
33304           r_.i8[i] = a_.i8[e];
33305         else
33306           r_.i8[i] = 0;
33307       }
33308       else {
33309         if (e >= 0 && e < (ssize/2))
33310           r_.i8[i] = a_.i8[e];
33311         else
33312           r_.i8[i] = 0;
33313       }
33314     }
33315 
33316   return simde__m256i_from_private(r_);
33317 }
33318 #if defined(SIMDE_X86_AVX2_NATIVE) && \
33319     (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(4,8,0)) && \
33320     SIMDE_DETECT_CLANG_VERSION_CHECK(3,7,0)
33321   #define simde_mm256_bslli_epi128(a, imm8) _mm256_bslli_epi128(a, imm8)
33322 #endif
33323 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
33324   #undef _mm256_bslli_epi128
33325   #define _mm256_bslli_epi128(a, imm8) simde_mm256_bslli_epi128(a, imm8)
33326 #endif
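
/* Note (illustrative): like the 128-bit PSLLDQ, this byte shift works within
 * each 128-bit lane independently; bytes never cross the lane boundary, which
 * is what the ssize/2 bounds checks above enforce.  The bsrli variant below is
 * the mirror image, shifting bytes toward lower indices. */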
33327 
33328 SIMDE_FUNCTION_ATTRIBUTES
33329 simde__m256i
33330 simde_mm256_bsrli_epi128 (simde__m256i a, const int imm8)
33331     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255)  {
33332     simde__m256i_private
33333       r_,
33334       a_ = simde__m256i_to_private(a);
33335     const int ssize = HEDLEY_STATIC_CAST(int, (sizeof(r_.i8) / sizeof(r_.i8[0])));
33336 
33337     SIMDE_VECTORIZE
33338     for (int i = 0 ; i < ssize ; i++) {
33339       const int e = i + imm8;
33340       if (i < (ssize/2)) {
33341         if (e >= 0 && e < (ssize/2))
33342           r_.i8[i] = a_.i8[e];
33343         else
33344           r_.i8[i] = 0;
33345       }
33346       else {
33347         if (e >= (ssize/2) && e < ssize)
33348           r_.i8[i] = a_.i8[e];
33349         else
33350           r_.i8[i] = 0;
33351       }
33352     }
33353 
33354   return simde__m256i_from_private(r_);
33355 }
33356 #if defined(SIMDE_X86_AVX2_NATIVE) && \
33357     (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(4,8,0)) && \
33358     SIMDE_DETECT_CLANG_VERSION_CHECK(3,7,0)
33359   #define simde_mm256_bsrli_epi128(a, imm8) _mm256_bsrli_epi128(a, imm8)
33360 #endif
33361 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
33362   #undef _mm256_bsrli_epi128
33363   #define _mm256_bsrli_epi128(a, imm8) simde_mm256_bsrli_epi128(a, imm8)
33364 #endif
33365 
33366 SIMDE_FUNCTION_ATTRIBUTES
33367 simde__m256i
33368 simde_mm256_cmpeq_epi8 (simde__m256i a, simde__m256i b) {
33369   #if defined(SIMDE_X86_AVX2_NATIVE)
33370     return _mm256_cmpeq_epi8(a, b);
33371   #else
33372     simde__m256i_private
33373       r_,
33374       a_ = simde__m256i_to_private(a),
33375       b_ = simde__m256i_to_private(b);
33376 
33377     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
33378       r_.m128i[0] = simde_mm_cmpeq_epi8(a_.m128i[0], b_.m128i[0]);
33379       r_.m128i[1] = simde_mm_cmpeq_epi8(a_.m128i[1], b_.m128i[1]);
33380     #else
33381       SIMDE_VECTORIZE
33382       for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
33383         r_.i8[i] = (a_.i8[i] == b_.i8[i]) ? ~INT8_C(0) : INT8_C(0);
33384       }
33385     #endif
33386 
33387     return simde__m256i_from_private(r_);
33388   #endif
33389 }
33390 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
33391   #undef _mm256_cmpeq_epi8
33392   #define _mm256_cmpeq_epi8(a, b) simde_mm256_cmpeq_epi8(a, b)
33393 #endif
33394 
33395 SIMDE_FUNCTION_ATTRIBUTES
33396 simde__m256i
33397 simde_mm256_cmpeq_epi16 (simde__m256i a, simde__m256i b) {
33398   #if defined(SIMDE_X86_AVX2_NATIVE)
33399     return _mm256_cmpeq_epi16(a, b);
33400   #else
33401     simde__m256i_private
33402       r_,
33403       a_ = simde__m256i_to_private(a),
33404       b_ = simde__m256i_to_private(b);
33405 
33406     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
33407       r_.m128i[0] = simde_mm_cmpeq_epi16(a_.m128i[0], b_.m128i[0]);
33408       r_.m128i[1] = simde_mm_cmpeq_epi16(a_.m128i[1], b_.m128i[1]);
33409     #else
33410       SIMDE_VECTORIZE
33411       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
33412         r_.i16[i] = (a_.i16[i] == b_.i16[i]) ? ~INT16_C(0) : INT16_C(0);
33413       }
33414     #endif
33415 
33416     return simde__m256i_from_private(r_);
33417   #endif
33418 }
33419 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
33420   #undef _mm256_cmpeq_epi16
33421   #define _mm256_cmpeq_epi16(a, b) simde_mm256_cmpeq_epi16(a, b)
33422 #endif
33423 
33424 SIMDE_FUNCTION_ATTRIBUTES
33425 simde__m256i
33426 simde_mm256_cmpeq_epi32 (simde__m256i a, simde__m256i b) {
33427   #if defined(SIMDE_X86_AVX2_NATIVE)
33428     return _mm256_cmpeq_epi32(a, b);
33429   #else
33430     simde__m256i_private
33431       r_,
33432       a_ = simde__m256i_to_private(a),
33433       b_ = simde__m256i_to_private(b);
33434 
33435     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
33436       r_.m128i[0] = simde_mm_cmpeq_epi32(a_.m128i[0], b_.m128i[0]);
33437       r_.m128i[1] = simde_mm_cmpeq_epi32(a_.m128i[1], b_.m128i[1]);
33438     #else
33439       SIMDE_VECTORIZE
33440       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
33441         r_.i32[i] = (a_.i32[i] == b_.i32[i]) ? ~INT32_C(0) : INT32_C(0);
33442       }
33443     #endif
33444 
33445     return simde__m256i_from_private(r_);
33446   #endif
33447 }
33448 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
33449   #undef _mm256_cmpeq_epi32
33450   #define _mm256_cmpeq_epi32(a, b) simde_mm256_cmpeq_epi32(a, b)
33451 #endif
33452 
33453 SIMDE_FUNCTION_ATTRIBUTES
33454 simde__m256i
33455 simde_mm256_cmpeq_epi64 (simde__m256i a, simde__m256i b) {
33456   #if defined(SIMDE_X86_AVX2_NATIVE)
33457     return _mm256_cmpeq_epi64(a, b);
33458   #else
33459     simde__m256i_private
33460       r_,
33461       a_ = simde__m256i_to_private(a),
33462       b_ = simde__m256i_to_private(b);
33463 
33464     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
33465       r_.m128i[0] = simde_mm_cmpeq_epi64(a_.m128i[0], b_.m128i[0]);
33466       r_.m128i[1] = simde_mm_cmpeq_epi64(a_.m128i[1], b_.m128i[1]);
33467     #else
33468       SIMDE_VECTORIZE
33469       for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
33470         r_.i64[i] = (a_.i64[i] == b_.i64[i]) ? ~INT64_C(0) : INT64_C(0);
33471       }
33472     #endif
33473 
33474     return simde__m256i_from_private(r_);
33475   #endif
33476 }
33477 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
33478   #undef _mm256_cmpeq_epi64
33479   #define _mm256_cmpeq_epi64(a, b) simde_mm256_cmpeq_epi64(a, b)
33480 #endif
33481 
33482 SIMDE_FUNCTION_ATTRIBUTES
33483 simde__m256i
33484 simde_mm256_cmpgt_epi8 (simde__m256i a, simde__m256i b) {
33485   #if defined(SIMDE_X86_AVX2_NATIVE)
33486     return _mm256_cmpgt_epi8(a, b);
33487   #else
33488     simde__m256i_private
33489       r_,
33490       a_ = simde__m256i_to_private(a),
33491       b_ = simde__m256i_to_private(b);
33492 
33493     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
33494       r_.m128i[0] = simde_mm_cmpgt_epi8(a_.m128i[0], b_.m128i[0]);
33495       r_.m128i[1] = simde_mm_cmpgt_epi8(a_.m128i[1], b_.m128i[1]);
33496     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
33497       r_.i8 = HEDLEY_STATIC_CAST(__typeof__(r_.i8), a_.i8 > b_.i8);
33498     #else
33499       SIMDE_VECTORIZE
33500       for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
33501         r_.i8[i] = (a_.i8[i] > b_.i8[i]) ? ~INT8_C(0) : INT8_C(0);
33502       }
33503     #endif
33504 
33505     return simde__m256i_from_private(r_);
33506   #endif
33507 }
33508 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
33509   #undef _mm256_cmpgt_epi8
33510   #define _mm256_cmpgt_epi8(a, b) simde_mm256_cmpgt_epi8(a, b)
33511 #endif
33512 
33513 SIMDE_FUNCTION_ATTRIBUTES
33514 simde__m256i
33515 simde_mm256_cmpgt_epi16 (simde__m256i a, simde__m256i b) {
33516   #if defined(SIMDE_X86_AVX2_NATIVE)
33517     return _mm256_cmpgt_epi16(a, b);
33518   #else
33519     simde__m256i_private
33520       r_,
33521       a_ = simde__m256i_to_private(a),
33522       b_ = simde__m256i_to_private(b);
33523 
33524     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
33525       r_.m128i[0] = simde_mm_cmpgt_epi16(a_.m128i[0], b_.m128i[0]);
33526       r_.m128i[1] = simde_mm_cmpgt_epi16(a_.m128i[1], b_.m128i[1]);
33527     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
33528       r_.i16 = a_.i16 > b_.i16;
33529     #else
33530       SIMDE_VECTORIZE
33531       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
33532         r_.i16[i] = (a_.i16[i] > b_.i16[i]) ? ~INT16_C(0) : INT16_C(0);
33533       }
33534     #endif
33535 
33536     return simde__m256i_from_private(r_);
33537   #endif
33538 }
33539 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
33540   #undef _mm256_cmpgt_epi16
33541   #define _mm256_cmpgt_epi16(a, b) simde_mm256_cmpgt_epi16(a, b)
33542 #endif
33543 
33544 SIMDE_FUNCTION_ATTRIBUTES
33545 simde__m256i
33546 simde_mm256_cmpgt_epi32 (simde__m256i a, simde__m256i b) {
33547   #if defined(SIMDE_X86_AVX2_NATIVE)
33548     return _mm256_cmpgt_epi32(a, b);
33549   #else
33550     simde__m256i_private
33551       r_,
33552       a_ = simde__m256i_to_private(a),
33553       b_ = simde__m256i_to_private(b);
33554 
33555     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
33556       r_.m128i[0] = simde_mm_cmpgt_epi32(a_.m128i[0], b_.m128i[0]);
33557       r_.m128i[1] = simde_mm_cmpgt_epi32(a_.m128i[1], b_.m128i[1]);
33558     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
33559       r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), a_.i32 > b_.i32);
33560     #else
33561       SIMDE_VECTORIZE
33562       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
33563         r_.i32[i] = (a_.i32[i] > b_.i32[i]) ? ~INT32_C(0) : INT32_C(0);
33564       }
33565     #endif
33566 
33567     return simde__m256i_from_private(r_);
33568   #endif
33569 }
33570 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
33571   #undef _mm256_cmpgt_epi32
33572   #define _mm256_cmpgt_epi32(a, b) simde_mm256_cmpgt_epi32(a, b)
33573 #endif
33574 
33575 SIMDE_FUNCTION_ATTRIBUTES
33576 simde__m256i
33577 simde_mm256_cmpgt_epi64 (simde__m256i a, simde__m256i b) {
33578   #if defined(SIMDE_X86_AVX2_NATIVE)
33579     return _mm256_cmpgt_epi64(a, b);
33580   #else
33581     simde__m256i_private
33582       r_,
33583       a_ = simde__m256i_to_private(a),
33584       b_ = simde__m256i_to_private(b);
33585 
33586     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
33587       r_.m128i[0] = simde_mm_cmpgt_epi64(a_.m128i[0], b_.m128i[0]);
33588       r_.m128i[1] = simde_mm_cmpgt_epi64(a_.m128i[1], b_.m128i[1]);
33589     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
33590       r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), a_.i64 > b_.i64);
33591     #else
33592       SIMDE_VECTORIZE
33593       for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
33594         r_.i64[i] = (a_.i64[i] > b_.i64[i]) ? ~INT64_C(0) : INT64_C(0);
33595       }
33596     #endif
33597 
33598     return simde__m256i_from_private(r_);
33599   #endif
33600 }
33601 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
33602   #undef _mm256_cmpgt_epi64
33603   #define _mm256_cmpgt_epi64(a, b) simde_mm256_cmpgt_epi64(a, b)
33604 #endif
33605 
33606 SIMDE_FUNCTION_ATTRIBUTES
33607 simde__m256i
33608 simde_mm256_cvtepi8_epi16 (simde__m128i a) {
33609   #if defined(SIMDE_X86_AVX2_NATIVE)
33610     return _mm256_cvtepi8_epi16(a);
33611   #else
33612     simde__m256i_private r_;
33613     simde__m128i_private a_ = simde__m128i_to_private(a);
33614 
33615     #if defined(SIMDE_CONVERT_VECTOR_)
33616       SIMDE_CONVERT_VECTOR_(r_.i16, a_.i8);
33617     #else
33618       SIMDE_VECTORIZE
33619       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
33620         r_.i16[i] = a_.i8[i];
33621       }
33622     #endif
33623 
33624     return simde__m256i_from_private(r_);
33625   #endif
33626 }
33627 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
33628   #undef _mm256_cvtepi8_epi16
33629   #define _mm256_cvtepi8_epi16(a) simde_mm256_cvtepi8_epi16(a)
33630 #endif
33631 
33632 SIMDE_FUNCTION_ATTRIBUTES
33633 simde__m256i
33634 simde_mm256_cvtepi8_epi32 (simde__m128i a) {
33635   #if defined(SIMDE_X86_AVX2_NATIVE)
33636     return _mm256_cvtepi8_epi32(a);
33637   #else
33638     simde__m256i_private r_;
33639     simde__m128i_private a_ = simde__m128i_to_private(a);
33640 
33641     #if defined(SIMDE_CONVERT_VECTOR_)
33642       SIMDE_CONVERT_VECTOR_(r_.i32, a_.m64_private[0].i8);
33643     #else
33644       SIMDE_VECTORIZE
33645       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
33646         r_.i32[i] = a_.i8[i];
33647       }
33648     #endif
33649 
33650     return simde__m256i_from_private(r_);
33651   #endif
33652 }
33653 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
33654   #undef _mm256_cvtepi8_epi32
33655   #define _mm256_cvtepi8_epi32(a) simde_mm256_cvtepi8_epi32(a)
33656 #endif
33657 
33658 SIMDE_FUNCTION_ATTRIBUTES
33659 simde__m256i
33660 simde_mm256_cvtepi8_epi64 (simde__m128i a) {
33661   #if defined(SIMDE_X86_AVX2_NATIVE)
33662     return _mm256_cvtepi8_epi64(a);
33663   #else
33664     simde__m256i_private r_;
33665     simde__m128i_private a_ = simde__m128i_to_private(a);
33666 
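    /* Only the four lowest bytes of `a` are consumed: each is sign-extended
     * to one of the four 64-bit result elements. */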
33667     SIMDE_VECTORIZE
33668     for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
33669       r_.i64[i] = a_.i8[i];
33670     }
33671 
33672     return simde__m256i_from_private(r_);
33673   #endif
33674 }
33675 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
33676   #undef _mm256_cvtepi8_epi64
33677   #define _mm256_cvtepi8_epi64(a) simde_mm256_cvtepi8_epi64(a)
33678 #endif
33679 
33680 SIMDE_FUNCTION_ATTRIBUTES
33681 simde__m256i
33682 simde_mm256_cvtepi16_epi32 (simde__m128i a) {
33683   #if defined(SIMDE_X86_AVX2_NATIVE)
33684     return _mm256_cvtepi16_epi32(a);
33685   #else
33686     simde__m256i_private r_;
33687     simde__m128i_private a_ = simde__m128i_to_private(a);
33688 
33689     #if defined(SIMDE_CONVERT_VECTOR_)
33690       SIMDE_CONVERT_VECTOR_(r_.i32, a_.i16);
33691     #else
33692       SIMDE_VECTORIZE
33693       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
33694         r_.i32[i] = a_.i16[i];
33695       }
33696     #endif
33697 
33698     return simde__m256i_from_private(r_);
33699   #endif
33700 }
33701 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
33702   #undef _mm256_cvtepi16_epi32
33703   #define _mm256_cvtepi16_epi32(a) simde_mm256_cvtepi16_epi32(a)
33704 #endif
33705 
33706 SIMDE_FUNCTION_ATTRIBUTES
33707 simde__m256i
33708 simde_mm256_cvtepi16_epi64 (simde__m128i a) {
33709   #if defined(SIMDE_X86_AVX2_NATIVE)
33710     return _mm256_cvtepi16_epi64(a);
33711   #else
33712     simde__m256i_private r_;
33713     simde__m128i_private a_ = simde__m128i_to_private(a);
33714 
33715     #if defined(SIMDE_CONVERT_VECTOR_)
33716       SIMDE_CONVERT_VECTOR_(r_.i64, a_.m64_private[0].i16);
33717     #else
33718       SIMDE_VECTORIZE
33719       for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
33720         r_.i64[i] = a_.i16[i];
33721       }
33722     #endif
33723 
33724     return simde__m256i_from_private(r_);
33725   #endif
33726 }
33727 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
33728   #undef _mm256_cvtepi16_epi64
33729   #define _mm256_cvtepi16_epi64(a) simde_mm256_cvtepi16_epi64(a)
33730 #endif
33731 
33732 SIMDE_FUNCTION_ATTRIBUTES
33733 simde__m256i
33734 simde_mm256_cvtepi32_epi64 (simde__m128i a) {
33735   #if defined(SIMDE_X86_AVX2_NATIVE)
33736     return _mm256_cvtepi32_epi64(a);
33737   #else
33738     simde__m256i_private r_;
33739     simde__m128i_private a_ = simde__m128i_to_private(a);
33740 
33741     #if defined(SIMDE_CONVERT_VECTOR_)
33742       SIMDE_CONVERT_VECTOR_(r_.i64, a_.i32);
33743     #else
33744       SIMDE_VECTORIZE
33745       for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
33746         r_.i64[i] = a_.i32[i];
33747       }
33748     #endif
33749 
33750     return simde__m256i_from_private(r_);
33751   #endif
33752 }
33753 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
33754   #undef _mm256_cvtepi32_epi64
33755   #define _mm256_cvtepi32_epi64(a) simde_mm256_cvtepi32_epi64(a)
33756 #endif
33757 
33758 SIMDE_FUNCTION_ATTRIBUTES
33759 simde__m256i
33760 simde_mm256_cvtepu8_epi16 (simde__m128i a) {
33761   #if defined(SIMDE_X86_AVX2_NATIVE)
33762     return _mm256_cvtepu8_epi16(a);
33763   #else
33764     simde__m256i_private r_;
33765     simde__m128i_private a_ = simde__m128i_to_private(a);
33766 
33767     #if defined(SIMDE_CONVERT_VECTOR_)
33768       SIMDE_CONVERT_VECTOR_(r_.i16, a_.u8);
33769     #else
33770       SIMDE_VECTORIZE
33771       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
33772         r_.i16[i] = a_.u8[i];
33773       }
33774     #endif
33775 
33776     return simde__m256i_from_private(r_);
33777   #endif
33778 }
33779 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
33780   #undef _mm256_cvtepu8_epi16
33781   #define _mm256_cvtepu8_epi16(a) simde_mm256_cvtepu8_epi16(a)
33782 #endif
33783 
33784 SIMDE_FUNCTION_ATTRIBUTES
33785 simde__m256i
33786 simde_mm256_cvtepu8_epi32 (simde__m128i a) {
33787   #if defined(SIMDE_X86_AVX2_NATIVE)
33788     return _mm256_cvtepu8_epi32(a);
33789   #else
33790     simde__m256i_private r_;
33791     simde__m128i_private a_ = simde__m128i_to_private(a);
33792 
33793     #if defined(SIMDE_CONVERT_VECTOR_)
33794       SIMDE_CONVERT_VECTOR_(r_.i32, a_.m64_private[0].u8);
33795     #else
33796       SIMDE_VECTORIZE
33797       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
33798         r_.i32[i] = a_.u8[i];
33799       }
33800     #endif
33801 
33802     return simde__m256i_from_private(r_);
33803   #endif
33804 }
33805 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
33806   #undef _mm256_cvtepu8_epi32
33807   #define _mm256_cvtepu8_epi32(a) simde_mm256_cvtepu8_epi32(a)
33808 #endif
33809 
33810 SIMDE_FUNCTION_ATTRIBUTES
33811 simde__m256i
33812 simde_mm256_cvtepu8_epi64 (simde__m128i a) {
33813   #if defined(SIMDE_X86_AVX2_NATIVE)
33814     return _mm256_cvtepu8_epi64(a);
33815   #else
33816     simde__m256i_private r_;
33817     simde__m128i_private a_ = simde__m128i_to_private(a);
33818 
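    /* Only the four lowest bytes of `a` are consumed: each is zero-extended
     * to one of the four 64-bit result elements. */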
33819     SIMDE_VECTORIZE
33820     for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
33821       r_.i64[i] = a_.u8[i];
33822     }
33823 
33824     return simde__m256i_from_private(r_);
33825   #endif
33826 }
33827 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
33828   #undef _mm256_cvtepu8_epi64
33829   #define _mm256_cvtepu8_epi64(a) simde_mm256_cvtepu8_epi64(a)
33830 #endif
33831 
33832 SIMDE_FUNCTION_ATTRIBUTES
33833 simde__m256i
33834 simde_mm256_cvtepu16_epi32 (simde__m128i a) {
33835   #if defined(SIMDE_X86_AVX2_NATIVE)
33836     return _mm256_cvtepu16_epi32(a);
33837   #else
33838     simde__m256i_private r_;
33839     simde__m128i_private a_ = simde__m128i_to_private(a);
33840 
33841     #if defined(SIMDE_CONVERT_VECTOR_)
33842       SIMDE_CONVERT_VECTOR_(r_.i32, a_.u16);
33843     #else
33844       SIMDE_VECTORIZE
33845       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
33846         r_.i32[i] = a_.u16[i];
33847       }
33848     #endif
33849 
33850     return simde__m256i_from_private(r_);
33851   #endif
33852 }
33853 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
33854   #undef _mm256_cvtepu16_epi32
33855   #define _mm256_cvtepu16_epi32(a) simde_mm256_cvtepu16_epi32(a)
33856 #endif
33857 
33858 SIMDE_FUNCTION_ATTRIBUTES
33859 simde__m256i
33860 simde_mm256_cvtepu16_epi64 (simde__m128i a) {
33861   #if defined(SIMDE_X86_AVX2_NATIVE)
33862     return _mm256_cvtepu16_epi64(a);
33863   #else
33864     simde__m256i_private r_;
33865     simde__m128i_private a_ = simde__m128i_to_private(a);
33866 
33867     #if defined(SIMDE_CONVERT_VECTOR_)
33868       SIMDE_CONVERT_VECTOR_(r_.i64, a_.m64_private[0].u16);
33869     #else
33870       SIMDE_VECTORIZE
33871       for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
33872         r_.i64[i] = a_.u16[i];
33873       }
33874     #endif
33875 
33876     return simde__m256i_from_private(r_);
33877   #endif
33878 }
33879 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
33880   #undef _mm256_cvtepu16_epi64
33881   #define _mm256_cvtepu16_epi64(a) simde_mm256_cvtepu16_epi64(a)
33882 #endif
33883 
33884 SIMDE_FUNCTION_ATTRIBUTES
33885 simde__m256i
33886 simde_mm256_cvtepu32_epi64 (simde__m128i a) {
33887   #if defined(SIMDE_X86_AVX2_NATIVE)
33888     return _mm256_cvtepu32_epi64(a);
33889   #else
33890     simde__m256i_private r_;
33891     simde__m128i_private a_ = simde__m128i_to_private(a);
33892 
33893     #if defined(SIMDE_CONVERT_VECTOR_)
33894       SIMDE_CONVERT_VECTOR_(r_.i64, a_.u32);
33895     #else
33896       SIMDE_VECTORIZE
33897       for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
33898         r_.i64[i] = a_.u32[i];
33899       }
33900     #endif
33901 
33902     return simde__m256i_from_private(r_);
33903   #endif
33904 }
33905 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
33906   #undef _mm256_cvtepu32_epi64
33907   #define _mm256_cvtepu32_epi64(a) simde_mm256_cvtepu32_epi64(a)
33908 #endif
33909 
33910 SIMDE_FUNCTION_ATTRIBUTES
33911 int
33912 simde_mm256_extract_epi8 (simde__m256i a, const int index)
33913     SIMDE_REQUIRE_RANGE(index, 0, 31) {
33914   simde__m256i_private a_ = simde__m256i_to_private(a);
33915   return a_.i8[index];
33916 }
33917 #if defined(SIMDE_X86_AVX2_NATIVE)
33918   #define simde_mm256_extract_epi8(a, index) _mm256_extract_epi8(a, index)
33919 #endif
33920 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
33921   #undef _mm256_extract_epi8
33922   #define _mm256_extract_epi8(a, index) simde_mm256_extract_epi8(a, index)
33923 #endif
33924 
33925 SIMDE_FUNCTION_ATTRIBUTES
33926 int
33927 simde_mm256_extract_epi16 (simde__m256i a, const int index)
33928     SIMDE_REQUIRE_RANGE(index, 0, 15)  {
33929   simde__m256i_private a_ = simde__m256i_to_private(a);
33930   return a_.i16[index];
33931 }
33932 #if defined(SIMDE_X86_AVX2_NATIVE)
33933   #define simde_mm256_extract_epi16(a, index) _mm256_extract_epi16(a, index)
33934 #endif
33935 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
33936   #undef _mm256_extract_epi16
33937   #define _mm256_extract_epi16(a, index) simde_mm256_extract_epi16(a, index)
33938 #endif
33939 
33940 SIMDE_FUNCTION_ATTRIBUTES
33941 simde__m128i
33942 simde_mm256_extracti128_si256 (simde__m256i a, const int imm8)
33943     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 1) {
33944   simde__m256i_private a_ = simde__m256i_to_private(a);
33945   return a_.m128i[imm8];
33946 }
33947 #if defined(SIMDE_X86_AVX2_NATIVE)
33948   #define simde_mm256_extracti128_si256(a, imm8) _mm256_extracti128_si256(a, imm8)
33949 #endif
33950 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
33951   #undef _mm256_extracti128_si256
33952   #define _mm256_extracti128_si256(a, imm8) simde_mm256_extracti128_si256(a, imm8)
33953 #endif
33954 
33955 SIMDE_FUNCTION_ATTRIBUTES
33956 simde__m128i
33957 simde_mm_i32gather_epi32(const int32_t* base_addr, simde__m128i vindex, const int32_t scale)
33958     SIMDE_REQUIRE_CONSTANT(scale)
33959     HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
33960   simde__m128i_private
33961     vindex_ = simde__m128i_to_private(vindex),
33962     r_;
33963   const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
33964 
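  /* Each 32-bit index is multiplied by `scale` (a byte count) to form the load
   * offset; simde_memcpy is used so unaligned and aliased addresses are handled
   * safely. */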
33965   SIMDE_VECTORIZE
33966   for (size_t i = 0 ; i < (sizeof(vindex_.i32) / sizeof(vindex_.i32[0])) ; i++) {
33967     const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i32[i]) * HEDLEY_STATIC_CAST(size_t , scale));
33968     int32_t dst;
33969     simde_memcpy(&dst, src, sizeof(dst));
33970     r_.i32[i] = dst;
33971   }
33972 
33973   return simde__m128i_from_private(r_);
33974 }
33975 #if defined(SIMDE_X86_AVX2_NATIVE)
33976   #define simde_mm_i32gather_epi32(base_addr, vindex, scale) _mm_i32gather_epi32(SIMDE_CHECKED_REINTERPRET_CAST(int const*, int32_t const*, base_addr), vindex, scale)
33977 #endif
33978 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
33979   #undef _mm_i32gather_epi32
33980   #define _mm_i32gather_epi32(base_addr, vindex, scale) simde_mm_i32gather_epi32(SIMDE_CHECKED_REINTERPRET_CAST(int32_t const*, int const*, base_addr), vindex, scale)
33981 #endif
33982 
33983 SIMDE_FUNCTION_ATTRIBUTES
33984 simde__m128i
33985 simde_mm_mask_i32gather_epi32(simde__m128i src, const int32_t* base_addr, simde__m128i vindex, simde__m128i mask, const int32_t scale)
33986     SIMDE_REQUIRE_CONSTANT(scale)
33987     HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
33988   simde__m128i_private
33989     vindex_ = simde__m128i_to_private(vindex),
33990     src_ = simde__m128i_to_private(src),
33991     mask_ = simde__m128i_to_private(mask),
33992     r_;
33993   const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
33994 
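  /* The sign bit (bit 31) of each mask element selects between the gathered
   * value and the corresponding element of `src`. */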
33995   SIMDE_VECTORIZE
33996   for (size_t i = 0 ; i < (sizeof(vindex_.i32) / sizeof(vindex_.i32[0])) ; i++) {
33997     if ((mask_.i32[i] >> 31) & 1) {
33998       const uint8_t* src1 = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i32[i]) * HEDLEY_STATIC_CAST(size_t , scale));
33999       int32_t dst;
34000       simde_memcpy(&dst, src1, sizeof(dst));
34001       r_.i32[i] = dst;
34002     }
34003     else {
34004       r_.i32[i] = src_.i32[i];
34005     }
34006   }
34007 
34008   return simde__m128i_from_private(r_);
34009 }
34010 #if defined(SIMDE_X86_AVX2_NATIVE)
34011   #define simde_mm_mask_i32gather_epi32(src, base_addr, vindex, mask, scale) _mm_mask_i32gather_epi32(src, SIMDE_CHECKED_REINTERPRET_CAST(int const*, int32_t const*, base_addr), vindex, mask, scale)
34012 #endif
34013 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
34014   #undef _mm_mask_i32gather_epi32
34015   #define _mm_mask_i32gather_epi32(src, base_addr, vindex, mask, scale) simde_mm_mask_i32gather_epi32(src, SIMDE_CHECKED_REINTERPRET_CAST(int32_t const*, int const*, base_addr), vindex, mask, scale)
34016 #endif
34017 
34018 SIMDE_FUNCTION_ATTRIBUTES
34019 simde__m256i
34020 simde_mm256_i32gather_epi32(const int32_t* base_addr, simde__m256i vindex, const int32_t scale)
34021     SIMDE_REQUIRE_CONSTANT(scale)
34022     HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
34023   simde__m256i_private
34024     vindex_ = simde__m256i_to_private(vindex),
34025     r_;
34026   const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
34027 
34028   SIMDE_VECTORIZE
34029   for (size_t i = 0 ; i < (sizeof(vindex_.i32) / sizeof(vindex_.i32[0])) ; i++) {
34030     const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i32[i]) * HEDLEY_STATIC_CAST(size_t , scale));
34031     int32_t dst;
34032     simde_memcpy(&dst, src, sizeof(dst));
34033     r_.i32[i] = dst;
34034   }
34035 
34036   return simde__m256i_from_private(r_);
34037 }
34038 #if defined(SIMDE_X86_AVX2_NATIVE)
34039   #define simde_mm256_i32gather_epi32(base_addr, vindex, scale) _mm256_i32gather_epi32(SIMDE_CHECKED_REINTERPRET_CAST(int const*, int32_t const*, base_addr), vindex, scale)
34040 #endif
34041 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
34042   #undef _mm256_i32gather_epi32
34043   #define _mm256_i32gather_epi32(base_addr, vindex, scale) simde_mm256_i32gather_epi32(SIMDE_CHECKED_REINTERPRET_CAST(int32_t const*, int const*, base_addr), vindex, scale)
34044 #endif
34045 
34046 SIMDE_FUNCTION_ATTRIBUTES
34047 simde__m256i
34048 simde_mm256_mask_i32gather_epi32(simde__m256i src, const int32_t* base_addr, simde__m256i vindex, simde__m256i mask, const int32_t scale)
34049     SIMDE_REQUIRE_CONSTANT(scale)
34050     HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
34051   simde__m256i_private
34052     vindex_ = simde__m256i_to_private(vindex),
34053     src_ = simde__m256i_to_private(src),
34054     mask_ = simde__m256i_to_private(mask),
34055     r_;
34056   const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
34057 
34058   SIMDE_VECTORIZE
34059   for (size_t i = 0 ; i < (sizeof(vindex_.i32) / sizeof(vindex_.i32[0])) ; i++) {
34060     if ((mask_.i32[i] >> 31) & 1) {
34061       const uint8_t* src1 = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i32[i]) * HEDLEY_STATIC_CAST(size_t , scale));
34062       int32_t dst;
34063       simde_memcpy(&dst, src1, sizeof(dst));
34064       r_.i32[i] = dst;
34065     }
34066     else {
34067       r_.i32[i] = src_.i32[i];
34068     }
34069   }
34070 
34071   return simde__m256i_from_private(r_);
34072 }
34073 #if defined(SIMDE_X86_AVX2_NATIVE)
34074   #define simde_mm256_mask_i32gather_epi32(src, base_addr, vindex, mask, scale) _mm256_mask_i32gather_epi32(src, SIMDE_CHECKED_REINTERPRET_CAST(int const*, int32_t const*, base_addr), vindex, mask, scale)
34075 #endif
34076 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
34077   #undef _mm256_mask_i32gather_epi32
34078   #define _mm256_mask_i32gather_epi32(src, base_addr, vindex, mask, scale) simde_mm256_mask_i32gather_epi32(src, SIMDE_CHECKED_REINTERPRET_CAST(int32_t const*, int const*, base_addr), vindex, mask, scale)
34079 #endif
34080 
34081 SIMDE_FUNCTION_ATTRIBUTES
34082 simde__m128i
34083 simde_mm_i64gather_epi32(const int32_t* base_addr, simde__m128i vindex, const int32_t scale)
34084     SIMDE_REQUIRE_CONSTANT(scale)
34085     HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
34086   simde__m128i_private
34087     vindex_ = simde__m128i_to_private(vindex),
34088     r_ = simde__m128i_to_private(simde_mm_setzero_si128());
34089   const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
34090 
34091   SIMDE_VECTORIZE
34092   for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) {
34093     const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale));
34094     int32_t dst;
34095     simde_memcpy(&dst, src, sizeof(dst));
34096     r_.i32[i] = dst;
34097   }
34098 
34099   return simde__m128i_from_private(r_);
34100 }
34101 #if defined(SIMDE_X86_AVX2_NATIVE)
34102   #define simde_mm_i64gather_epi32(base_addr, vindex, scale) _mm_i64gather_epi32(SIMDE_CHECKED_REINTERPRET_CAST(int const*, int32_t const*, base_addr), vindex, scale)
34103 #endif
34104 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
34105   #undef _mm_i64gather_epi32
34106   #define _mm_i64gather_epi32(base_addr, vindex, scale) simde_mm_i64gather_epi32(SIMDE_CHECKED_REINTERPRET_CAST(int32_t const*, int const*, base_addr), vindex, scale)
34107 #endif
34108 
34109 SIMDE_FUNCTION_ATTRIBUTES
34110 simde__m128i
34111 simde_mm_mask_i64gather_epi32(simde__m128i src, const int32_t* base_addr, simde__m128i vindex, simde__m128i mask, const int32_t scale)
34112     SIMDE_REQUIRE_CONSTANT(scale)
34113     HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
34114   simde__m128i_private
34115     vindex_ = simde__m128i_to_private(vindex),
34116     src_ = simde__m128i_to_private(src),
34117     mask_ = simde__m128i_to_private(mask),
34118     r_ = simde__m128i_to_private(simde_mm_setzero_si128());
34119   const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
34120 
34121   SIMDE_VECTORIZE
34122   for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) {
34123     if ((mask_.i32[i] >> 31) & 1) {
34124       const uint8_t* src1 = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale));
34125       int32_t dst;
34126       simde_memcpy(&dst, src1, sizeof(dst));
34127       r_.i32[i] = dst;
34128     }
34129     else {
34130       r_.i32[i] = src_.i32[i];
34131     }
34132   }
34133 
34134   return simde__m128i_from_private(r_);
34135 }
34136 #if defined(SIMDE_X86_AVX2_NATIVE)
34137   #define simde_mm_mask_i64gather_epi32(src, base_addr, vindex, mask, scale) _mm_mask_i64gather_epi32(src, SIMDE_CHECKED_REINTERPRET_CAST(int const*, int32_t const*, base_addr), vindex, mask, scale)
34138 #endif
34139 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
34140   #undef _mm_mask_i64gather_epi32
34141   #define _mm_mask_i64gather_epi32(src, base_addr, vindex, mask, scale) simde_mm_mask_i64gather_epi32(src, SIMDE_CHECKED_REINTERPRET_CAST(int32_t const*, int const*, base_addr), vindex, mask, scale)
34142 #endif
34143 
34144 SIMDE_FUNCTION_ATTRIBUTES
34145 simde__m128i
34146 simde_mm256_i64gather_epi32(const int32_t* base_addr, simde__m256i vindex, const int32_t scale)
34147     SIMDE_REQUIRE_CONSTANT(scale)
34148     HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
34149   simde__m256i_private
34150     vindex_ = simde__m256i_to_private(vindex);
34151   simde__m128i_private
34152     r_ = simde__m128i_to_private(simde_mm_setzero_si128());
34153   const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
34154 
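  /* Four 64-bit indices produce four 32-bit results, which is why this
   * 256-bit-index gather returns a 128-bit vector. */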
34155   SIMDE_VECTORIZE
34156   for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) {
34157     const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale));
34158     int32_t dst;
34159     simde_memcpy(&dst, src, sizeof(dst));
34160     r_.i32[i] = dst;
34161   }
34162 
34163   return simde__m128i_from_private(r_);
34164 }
34165 #if defined(SIMDE_X86_AVX2_NATIVE)
34166   #define simde_mm256_i64gather_epi32(base_addr, vindex, scale) _mm256_i64gather_epi32(SIMDE_CHECKED_REINTERPRET_CAST(int const*, int32_t const*, base_addr), vindex, scale)
34167 #endif
34168 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
34169   #undef _mm256_i64gather_epi32
34170   #define _mm256_i64gather_epi32(base_addr, vindex, scale) simde_mm256_i64gather_epi32(SIMDE_CHECKED_REINTERPRET_CAST(int32_t const*, int const*, base_addr), vindex, scale)
34171 #endif
34172 
34173 SIMDE_FUNCTION_ATTRIBUTES
34174 simde__m128i
34175 simde_mm256_mask_i64gather_epi32(simde__m128i src, const int32_t* base_addr, simde__m256i vindex, simde__m128i mask, const int32_t scale)
34176     SIMDE_REQUIRE_CONSTANT(scale)
34177     HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
34178   simde__m256i_private
34179     vindex_ = simde__m256i_to_private(vindex);
34180   simde__m128i_private
34181     src_ = simde__m128i_to_private(src),
34182     mask_ = simde__m128i_to_private(mask),
34183     r_ = simde__m128i_to_private(simde_mm_setzero_si128());
34184   const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
34185 
34186   SIMDE_VECTORIZE
34187   for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) {
34188     if ((mask_.i32[i] >> 31) & 1) {
34189       const uint8_t* src1 = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale));
34190       int32_t dst;
34191       simde_memcpy(&dst, src1, sizeof(dst));
34192       r_.i32[i] = dst;
34193     }
34194     else {
34195       r_.i32[i] = src_.i32[i];
34196     }
34197   }
34198 
34199   return simde__m128i_from_private(r_);
34200 }
34201 #if defined(SIMDE_X86_AVX2_NATIVE)
34202   #define simde_mm256_mask_i64gather_epi32(src, base_addr, vindex, mask, scale) _mm256_mask_i64gather_epi32(src, SIMDE_CHECKED_REINTERPRET_CAST(int const*, int32_t const*, base_addr), vindex, mask, scale)
34203 #endif
34204 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
34205   #undef _mm256_mask_i64gather_epi32
34206   #define _mm256_mask_i64gather_epi32(src, base_addr, vindex, mask, scale) simde_mm256_mask_i64gather_epi32(src, SIMDE_CHECKED_REINTERPRET_CAST(int32_t const*, int const*, base_addr), vindex, mask, scale)
34207 #endif
34208 
34209 SIMDE_FUNCTION_ATTRIBUTES
34210 simde__m128i
34211 simde_mm_i32gather_epi64(const int64_t* base_addr, simde__m128i vindex, const int32_t scale)
34212     SIMDE_REQUIRE_CONSTANT(scale)
34213     HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
34214   simde__m128i_private
34215     vindex_ = simde__m128i_to_private(vindex),
34216     r_;
34217   const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
34218 
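  /* Only the two lowest 32-bit indices of `vindex` are consumed, since the
   * 128-bit result holds just two 64-bit elements. */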
34219   SIMDE_VECTORIZE
34220   for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
34221     const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i32[i]) * HEDLEY_STATIC_CAST(size_t , scale));
34222     int64_t dst;
34223     simde_memcpy(&dst, src, sizeof(dst));
34224     r_.i64[i] = dst;
34225   }
34226 
34227   return simde__m128i_from_private(r_);
34228 }
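/* The cast below matches the base-pointer type expected by the native
 * intrinsic's prototype, which differs between compiler versions. */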
34229 #if defined(SIMDE_X86_AVX2_NATIVE)
34230   #if SIMDE_DETECT_CLANG_VERSION_CHECK(3,8,0)
34231     #define simde_mm_i32gather_epi64(base_addr, vindex, scale) _mm_i32gather_epi64(HEDLEY_REINTERPRET_CAST(int64_t const*, base_addr), vindex, scale)
34232   #else
34233     #define simde_mm_i32gather_epi64(base_addr, vindex, scale) _mm_i32gather_epi64(HEDLEY_REINTERPRET_CAST(long long const*, base_addr), vindex, scale)
34234   #endif
34235 #endif
34236 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
34237   #undef _mm_i32gather_epi64
34238   #define _mm_i32gather_epi64(base_addr, vindex, scale) simde_mm_i32gather_epi64(HEDLEY_REINTERPRET_CAST(int64_t const*, base_addr), vindex, scale)
34239 #endif
34240 
34241 SIMDE_FUNCTION_ATTRIBUTES
34242 simde__m128i
34243 simde_mm_mask_i32gather_epi64(simde__m128i src, const int64_t* base_addr, simde__m128i vindex, simde__m128i mask, const int32_t scale)
34244     SIMDE_REQUIRE_CONSTANT(scale)
34245     HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
34246   simde__m128i_private
34247     vindex_ = simde__m128i_to_private(vindex),
34248     src_ = simde__m128i_to_private(src),
34249     mask_ = simde__m128i_to_private(mask),
34250     r_;
34251   const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
34252 
34253   SIMDE_VECTORIZE
34254   for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
34255     if ((mask_.i64[i] >> 63) & 1) {
34256       const uint8_t* src1 = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i32[i]) * HEDLEY_STATIC_CAST(size_t , scale));
34257       int64_t dst;
34258       simde_memcpy(&dst, src1, sizeof(dst));
34259       r_.i64[i] = dst;
34260     }
34261     else {
34262       r_.i64[i] = src_.i64[i];
34263     }
34264   }
34265 
34266   return simde__m128i_from_private(r_);
34267 }
34268 #if defined(SIMDE_X86_AVX2_NATIVE)
34269   #if SIMDE_DETECT_CLANG_VERSION_CHECK(3,8,0)
34270     #define simde_mm_mask_i32gather_epi64(src, base_addr, vindex, mask, scale) _mm_mask_i32gather_epi64(src, HEDLEY_REINTERPRET_CAST(int64_t const*, base_addr), vindex, mask, scale)
34271   #else
34272     #define simde_mm_mask_i32gather_epi64(src, base_addr, vindex, mask, scale) _mm_mask_i32gather_epi64(src, HEDLEY_REINTERPRET_CAST(long long const*, base_addr), vindex, mask, scale)
34273   #endif
34274 #endif
34275 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
34276   #undef _mm_mask_i32gather_epi64
34277   #define _mm_mask_i32gather_epi64(src, base_addr, vindex, mask, scale) simde_mm_mask_i32gather_epi64(src, HEDLEY_REINTERPRET_CAST(int64_t const*, base_addr), vindex, mask, scale)
34278 #endif
34279 
34280 SIMDE_FUNCTION_ATTRIBUTES
34281 simde__m256i
34282 simde_mm256_i32gather_epi64(const int64_t* base_addr, simde__m128i vindex, const int32_t scale)
34283     SIMDE_REQUIRE_CONSTANT(scale)
34284     HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
34285   simde__m128i_private
34286     vindex_ = simde__m128i_to_private(vindex);
34287   simde__m256i_private
34288     r_;
34289   const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
34290 
34291   SIMDE_VECTORIZE
34292   for (size_t i = 0 ; i < (sizeof(vindex_.i32) / sizeof(vindex_.i32[0])) ; i++) {
34293     const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i32[i]) * HEDLEY_STATIC_CAST(size_t , scale));
34294     int64_t dst;
34295     simde_memcpy(&dst, src, sizeof(dst));
34296     r_.i64[i] = dst;
34297   }
34298 
34299   return simde__m256i_from_private(r_);
34300 }
34301 #if defined(SIMDE_X86_AVX2_NATIVE)
34302   #if SIMDE_DETECT_CLANG_VERSION_CHECK(3,8,0)
34303     #define simde_mm256_i32gather_epi64(base_addr, vindex, scale) _mm256_i32gather_epi64(HEDLEY_REINTERPRET_CAST(int64_t const*, base_addr), vindex, scale)
34304   #else
34305     #define simde_mm256_i32gather_epi64(base_addr, vindex, scale) _mm256_i32gather_epi64(HEDLEY_REINTERPRET_CAST(long long const*, base_addr), vindex, scale)
34306   #endif
34307 #endif
34308 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
34309   #undef _mm256_i32gather_epi64
34310   #define _mm256_i32gather_epi64(base_addr, vindex, scale) simde_mm256_i32gather_epi64(HEDLEY_REINTERPRET_CAST(int64_t const*, base_addr), vindex, scale)
34311 #endif
34312 
34313 SIMDE_FUNCTION_ATTRIBUTES
34314 simde__m256i
34315 simde_mm256_mask_i32gather_epi64(simde__m256i src, const int64_t* base_addr, simde__m128i vindex, simde__m256i mask, const int32_t scale)
34316     SIMDE_REQUIRE_CONSTANT(scale)
34317     HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
34318   simde__m256i_private
34319     src_ = simde__m256i_to_private(src),
34320     mask_ = simde__m256i_to_private(mask),
34321     r_;
34322   simde__m128i_private
34323     vindex_ = simde__m128i_to_private(vindex);
34324   const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
34325 
34326   SIMDE_VECTORIZE
34327   for (size_t i = 0 ; i < (sizeof(vindex_.i32) / sizeof(vindex_.i32[0])) ; i++) {
34328     if ((mask_.i64[i] >> 63) & 1) {
34329       const uint8_t* src1 = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i32[i]) * HEDLEY_STATIC_CAST(size_t , scale));
34330       int64_t dst;
34331       simde_memcpy(&dst, src1, sizeof(dst));
34332       r_.i64[i] = dst;
34333     }
34334     else {
34335       r_.i64[i] = src_.i64[i];
34336     }
34337   }
34338 
34339   return simde__m256i_from_private(r_);
34340 }
34341 #if defined(SIMDE_X86_AVX2_NATIVE)
34342   #if SIMDE_DETECT_CLANG_VERSION_CHECK(3,8,0)
34343     #define simde_mm256_mask_i32gather_epi64(src, base_addr, vindex, mask, scale) _mm256_mask_i32gather_epi64(src, HEDLEY_REINTERPRET_CAST(int64_t const*, base_addr), vindex, mask, scale)
34344   #else
34345     #define simde_mm256_mask_i32gather_epi64(src, base_addr, vindex, mask, scale) _mm256_mask_i32gather_epi64(src, HEDLEY_REINTERPRET_CAST(long long const*, base_addr), vindex, mask, scale)
34346   #endif
34347 #endif
34348 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
34349   #undef _mm256_mask_i32gather_epi64
34350   #define _mm256_mask_i32gather_epi64(src, base_addr, vindex, mask, scale) simde_mm256_mask_i32gather_epi64(src, HEDLEY_REINTERPRET_CAST(int64_t const*, base_addr), vindex, mask, scale)
34351 #endif
34352 
34353 SIMDE_FUNCTION_ATTRIBUTES
34354 simde__m128i
34355 simde_mm_i64gather_epi64(const int64_t* base_addr, simde__m128i vindex, const int32_t scale)
34356     SIMDE_REQUIRE_CONSTANT(scale)
34357     HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
34358   simde__m128i_private
34359     vindex_ = simde__m128i_to_private(vindex),
34360     r_ = simde__m128i_to_private(simde_mm_setzero_si128());
34361   const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
34362 
34363   SIMDE_VECTORIZE
34364   for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) {
34365     const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale));
34366     int64_t dst;
34367     simde_memcpy(&dst, src, sizeof(dst));
34368     r_.i64[i] = dst;
34369   }
34370 
34371   return simde__m128i_from_private(r_);
34372 }
34373 #if defined(SIMDE_X86_AVX2_NATIVE)
34374   #if SIMDE_DETECT_CLANG_VERSION_CHECK(3,8,0)
34375     #define simde_mm_i64gather_epi64(base_addr, vindex, scale) _mm_i64gather_epi64(HEDLEY_REINTERPRET_CAST(int64_t const*, base_addr), vindex, scale)
34376   #else
34377     #define simde_mm_i64gather_epi64(base_addr, vindex, scale) _mm_i64gather_epi64(HEDLEY_REINTERPRET_CAST(long long const*, base_addr), vindex, scale)
34378   #endif
34379 #endif
34380 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
34381   #undef _mm_i64gather_epi64
34382   #define _mm_i64gather_epi64(base_addr, vindex, scale) simde_mm_i64gather_epi64(HEDLEY_REINTERPRET_CAST(int64_t const*, base_addr), vindex, scale)
34383 #endif
34384 
34385 SIMDE_FUNCTION_ATTRIBUTES
34386 simde__m128i
34387 simde_mm_mask_i64gather_epi64(simde__m128i src, const int64_t* base_addr, simde__m128i vindex, simde__m128i mask, const int32_t scale)
34388     SIMDE_REQUIRE_CONSTANT(scale)
34389     HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
34390   simde__m128i_private
34391     vindex_ = simde__m128i_to_private(vindex),
34392     src_ = simde__m128i_to_private(src),
34393     mask_ = simde__m128i_to_private(mask),
34394     r_ = simde__m128i_to_private(simde_mm_setzero_si128());
34395   const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
34396 
34397   SIMDE_VECTORIZE
34398   for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) {
34399     if ((mask_.i64[i] >> 63) & 1) {
34400       const uint8_t* src1 = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale));
34401       int64_t dst;
34402       simde_memcpy(&dst, src1, sizeof(dst));
34403       r_.i64[i] = dst;
34404     }
34405     else {
34406       r_.i64[i] = src_.i64[i];
34407     }
34408   }
34409 
34410   return simde__m128i_from_private(r_);
34411 }
34412 #if defined(SIMDE_X86_AVX2_NATIVE)
34413   #if SIMDE_DETECT_CLANG_VERSION_CHECK(3,8,0)
34414     #define simde_mm_mask_i64gather_epi64(src, base_addr, vindex, mask, scale) _mm_mask_i64gather_epi64(src, HEDLEY_REINTERPRET_CAST(int64_t const*, base_addr), vindex, mask, scale)
34415   #else
34416     #define simde_mm_mask_i64gather_epi64(src, base_addr, vindex, mask, scale) _mm_mask_i64gather_epi64(src, HEDLEY_REINTERPRET_CAST(long long const*, base_addr), vindex, mask, scale)
34417   #endif
34418 #endif
34419 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
34420   #undef _mm_mask_i64gather_epi64
34421   #define _mm_mask_i64gather_epi64(src, base_addr, vindex, mask, scale) simde_mm_mask_i64gather_epi64(src, HEDLEY_REINTERPRET_CAST(int64_t const*, base_addr), vindex, mask, scale)
34422 #endif
34423 
34424 SIMDE_FUNCTION_ATTRIBUTES
34425 simde__m256i
34426 simde_mm256_i64gather_epi64(const int64_t* base_addr, simde__m256i vindex, const int32_t scale)
34427     SIMDE_REQUIRE_CONSTANT(scale)
34428     HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
34429   simde__m256i_private
34430     vindex_ = simde__m256i_to_private(vindex),
34431     r_ = simde__m256i_to_private(simde_mm256_setzero_si256());
34432   const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
34433 
34434   SIMDE_VECTORIZE
34435   for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) {
34436     const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale));
34437     int64_t dst;
34438     simde_memcpy(&dst, src, sizeof(dst));
34439     r_.i64[i] = dst;
34440   }
34441 
34442   return simde__m256i_from_private(r_);
34443 }
34444 #if defined(SIMDE_X86_AVX2_NATIVE)
34445   #if SIMDE_DETECT_CLANG_VERSION_CHECK(3,8,0)
34446     #define simde_mm256_i64gather_epi64(base_addr, vindex, scale) _mm256_i64gather_epi64(HEDLEY_REINTERPRET_CAST(int64_t const*, base_addr), vindex, scale)
34447   #else
34448     #define simde_mm256_i64gather_epi64(base_addr, vindex, scale) _mm256_i64gather_epi64(HEDLEY_REINTERPRET_CAST(long long const*, base_addr), vindex, scale)
34449   #endif
34450 #endif
34451 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
34452   #undef _mm256_i64gather_epi64
34453   #define _mm256_i64gather_epi64(base_addr, vindex, scale) simde_mm256_i64gather_epi64(HEDLEY_REINTERPRET_CAST(int64_t const*, base_addr), vindex, scale)
34454 #endif
34455 
34456 SIMDE_FUNCTION_ATTRIBUTES
34457 simde__m256i
34458 simde_mm256_mask_i64gather_epi64(simde__m256i src, const int64_t* base_addr, simde__m256i vindex, simde__m256i mask, const int32_t scale)
34459     SIMDE_REQUIRE_CONSTANT(scale)
34460     HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
34461   simde__m256i_private
34462     vindex_ = simde__m256i_to_private(vindex),
34463     src_ = simde__m256i_to_private(src),
34464     mask_ = simde__m256i_to_private(mask),
34465     r_ = simde__m256i_to_private(simde_mm256_setzero_si256());
34466   const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
34467 
34468   SIMDE_VECTORIZE
34469   for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) {
34470     if ((mask_.i64[i] >> 63) & 1) {
34471       const uint8_t* src1 = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale));
34472       int64_t dst;
34473       simde_memcpy(&dst, src1, sizeof(dst));
34474       r_.i64[i] = dst;
34475     }
34476     else {
34477       r_.i64[i] = src_.i64[i];
34478     }
34479   }
34480 
34481   return simde__m256i_from_private(r_);
34482 }
34483 #if defined(SIMDE_X86_AVX2_NATIVE)
34484   #if SIMDE_DETECT_CLANG_VERSION_CHECK(3,8,0)
34485     #define simde_mm256_mask_i64gather_epi64(src, base_addr, vindex, mask, scale) _mm256_mask_i64gather_epi64(src, HEDLEY_REINTERPRET_CAST(int64_t const*, base_addr), vindex, mask, scale)
34486   #else
34487     #define simde_mm256_mask_i64gather_epi64(src, base_addr, vindex, mask, scale) _mm256_mask_i64gather_epi64(src, HEDLEY_REINTERPRET_CAST(long long const*, base_addr), vindex, mask, scale)
34488   #endif
34489 #endif
34490 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
34491   #undef _mm256_mask_i64gather_epi64
34492   #define _mm256_mask_i64gather_epi64(src, base_addr, vindex, mask, scale) simde_mm256_mask_i64gather_epi64(src, HEDLEY_REINTERPRET_CAST(int64_t const*, base_addr), vindex, mask, scale)
34493 #endif
34494 
34495 SIMDE_FUNCTION_ATTRIBUTES
34496 simde__m128
34497 simde_mm_i32gather_ps(const simde_float32* base_addr, simde__m128i vindex, const int32_t scale)
34498     SIMDE_REQUIRE_CONSTANT(scale)
34499     HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
34500   simde__m128i_private
34501     vindex_ = simde__m128i_to_private(vindex);
34502   simde__m128_private
34503     r_;
34504   const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
34505 
34506   SIMDE_VECTORIZE
34507   for (size_t i = 0 ; i < (sizeof(vindex_.i32) / sizeof(vindex_.i32[0])) ; i++) {
34508     const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i32[i]) * HEDLEY_STATIC_CAST(size_t , scale));
34509     simde_float32 dst;
34510     simde_memcpy(&dst, src, sizeof(dst));
34511     r_.f32[i] = dst;
34512   }
34513 
34514   return simde__m128_from_private(r_);
34515 }
34516 #if defined(SIMDE_X86_AVX2_NATIVE)
34517   #define simde_mm_i32gather_ps(base_addr, vindex, scale) _mm_i32gather_ps(SIMDE_CHECKED_REINTERPRET_CAST(float const*, simde_float32 const*, base_addr), vindex, scale)
34518 #endif
34519 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
34520   #undef _mm_i32gather_ps
34521   #define _mm_i32gather_ps(base_addr, vindex, scale) simde_mm_i32gather_ps(SIMDE_CHECKED_REINTERPRET_CAST(simde_float32 const*, float const*, base_addr), vindex, scale)
34522 #endif
34523 
34524 SIMDE_FUNCTION_ATTRIBUTES
34525 simde__m128
34526 simde_mm_mask_i32gather_ps(simde__m128 src, const simde_float32* base_addr, simde__m128i vindex, simde__m128 mask, const int32_t scale)
34527     SIMDE_REQUIRE_CONSTANT(scale)
34528     HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
34529   simde__m128i_private
34530     vindex_ = simde__m128i_to_private(vindex);
34531   simde__m128_private
34532     src_ = simde__m128_to_private(src),
34533     mask_ = simde__m128_to_private(mask),
34534     r_;
34535   const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
34536 
34537   SIMDE_VECTORIZE
34538   for (size_t i = 0 ; i < (sizeof(vindex_.i32) / sizeof(vindex_.i32[0])) ; i++) {
34539     if ((mask_.i32[i] >> 31) & 1) {
34540       const uint8_t* src1 = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i32[i]) * HEDLEY_STATIC_CAST(size_t , scale));
34541       simde_float32 dst;
34542       simde_memcpy(&dst, src1, sizeof(dst));
34543       r_.f32[i] = dst;
34544     }
34545     else {
34546       r_.f32[i] = src_.f32[i];
34547     }
34548   }
34549 
34550   return simde__m128_from_private(r_);
34551 }
34552 #if defined(SIMDE_X86_AVX2_NATIVE)
34553   #define simde_mm_mask_i32gather_ps(src, base_addr, vindex, mask, scale) _mm_mask_i32gather_ps(src, SIMDE_CHECKED_REINTERPRET_CAST(float const*, simde_float32 const*, base_addr), vindex, mask, scale)
34554 #endif
34555 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
34556   #undef _mm_mask_i32gather_ps
34557   #define _mm_mask_i32gather_ps(src, base_addr, vindex, mask, scale) simde_mm_mask_i32gather_ps(src, SIMDE_CHECKED_REINTERPRET_CAST(simde_float32 const*, float const*, base_addr), vindex, mask, scale)
34558 #endif
34559 
34560 SIMDE_FUNCTION_ATTRIBUTES
34561 simde__m256
34562 simde_mm256_i32gather_ps(const simde_float32* base_addr, simde__m256i vindex, const int32_t scale)
34563     SIMDE_REQUIRE_CONSTANT(scale)
34564     HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
34565   simde__m256i_private
34566     vindex_ = simde__m256i_to_private(vindex);
34567   simde__m256_private
34568     r_;
34569   const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
34570 
34571   SIMDE_VECTORIZE
34572   for (size_t i = 0 ; i < (sizeof(vindex_.i32) / sizeof(vindex_.i32[0])) ; i++) {
34573     const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i32[i]) * HEDLEY_STATIC_CAST(size_t , scale));
34574     simde_float32 dst;
34575     simde_memcpy(&dst, src, sizeof(dst));
34576     r_.f32[i] = dst;
34577   }
34578 
34579   return simde__m256_from_private(r_);
34580 }
34581 #if defined(SIMDE_X86_AVX2_NATIVE)
34582   #define simde_mm256_i32gather_ps(base_addr, vindex, scale) _mm256_i32gather_ps(SIMDE_CHECKED_REINTERPRET_CAST(float const*, simde_float32 const*, base_addr), vindex, scale)
34583 #endif
34584 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
34585   #undef _mm256_i32gather_ps
34586   #define _mm256_i32gather_ps(base_addr, vindex, scale) simde_mm256_i32gather_ps(SIMDE_CHECKED_REINTERPRET_CAST(simde_float32 const*, float const*, base_addr), vindex, scale)
34587 #endif
34588 
34589 SIMDE_FUNCTION_ATTRIBUTES
34590 simde__m256
34591 simde_mm256_mask_i32gather_ps(simde__m256 src, const simde_float32* base_addr, simde__m256i vindex, simde__m256 mask, const int32_t scale)
34592     SIMDE_REQUIRE_CONSTANT(scale)
34593     HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
34594   simde__m256i_private
34595     vindex_ = simde__m256i_to_private(vindex);
34596   simde__m256_private
34597     src_ = simde__m256_to_private(src),
34598     mask_ = simde__m256_to_private(mask),
34599     r_;
34600   const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
34601 
34602   SIMDE_VECTORIZE
34603   for (size_t i = 0 ; i < (sizeof(vindex_.i32) / sizeof(vindex_.i32[0])) ; i++) {
34604     if ((mask_.i32[i] >> 31) & 1) {
34605       const uint8_t* src1 = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i32[i]) * HEDLEY_STATIC_CAST(size_t , scale));
34606       simde_float32 dst;
34607       simde_memcpy(&dst, src1, sizeof(dst));
34608       r_.f32[i] = dst;
34609     }
34610     else {
34611       r_.f32[i] = src_.f32[i];
34612     }
34613   }
34614 
34615   return simde__m256_from_private(r_);
34616 }
34617 #if defined(SIMDE_X86_AVX2_NATIVE)
34618   #define simde_mm256_mask_i32gather_ps(src, base_addr, vindex, mask, scale) _mm256_mask_i32gather_ps(src, SIMDE_CHECKED_REINTERPRET_CAST(float const*, simde_float32 const*, base_addr), vindex, mask, scale)
34619 #endif
34620 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
34621   #undef _mm256_mask_i32gather_ps
34622   #define _mm256_mask_i32gather_ps(src, base_addr, vindex, mask, scale) simde_mm256_mask_i32gather_ps(src, SIMDE_CHECKED_REINTERPRET_CAST(simde_float32 const*, float const*, base_addr), vindex, mask, scale)
34623 #endif
34624 
34625 SIMDE_FUNCTION_ATTRIBUTES
34626 simde__m128
34627 simde_mm_i64gather_ps(const simde_float32* base_addr, simde__m128i vindex, const int32_t scale)
34628     SIMDE_REQUIRE_CONSTANT(scale)
34629     HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
34630   simde__m128i_private
34631     vindex_ = simde__m128i_to_private(vindex);
34632   simde__m128_private
34633     r_ = simde__m128_to_private(simde_mm_setzero_ps());
34634   const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
34635 
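  /* Only two 64-bit indices are available, so just the two low float lanes
   * are gathered; the remaining lanes keep the zero initialization. */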
34636   SIMDE_VECTORIZE
34637   for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) {
34638     const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale));
34639     simde_float32 dst;
34640     simde_memcpy(&dst, src, sizeof(dst));
34641     r_.f32[i] = dst;
34642   }
34643 
34644   return simde__m128_from_private(r_);
34645 }
34646 #if defined(SIMDE_X86_AVX2_NATIVE)
34647   #define simde_mm_i64gather_ps(base_addr, vindex, scale) _mm_i64gather_ps(SIMDE_CHECKED_REINTERPRET_CAST(float const*, simde_float32 const*, base_addr), vindex, scale)
34648 #endif
34649 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
34650   #undef _mm_i64gather_ps
34651   #define _mm_i64gather_ps(base_addr, vindex, scale) simde_mm_i64gather_ps(SIMDE_CHECKED_REINTERPRET_CAST(simde_float32 const*, float const*, base_addr), vindex, scale)
34652 #endif
34653 
34654 SIMDE_FUNCTION_ATTRIBUTES
34655 simde__m128
34656 simde_mm_mask_i64gather_ps(simde__m128 src, const simde_float32* base_addr, simde__m128i vindex, simde__m128 mask, const int32_t scale)
34657     SIMDE_REQUIRE_CONSTANT(scale)
34658     HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
34659   simde__m128i_private
34660     vindex_ = simde__m128i_to_private(vindex);
34661   simde__m128_private
34662     src_ = simde__m128_to_private(src),
34663     mask_ = simde__m128_to_private(mask),
34664     r_ = simde__m128_to_private(simde_mm_setzero_ps());
34665   const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
34666 
34667   SIMDE_VECTORIZE
34668   for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) {
34669     if ((mask_.i32[i] >> 31) & 1) {
34670       const uint8_t* src1 = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale));
34671       simde_float32 dst;
34672       simde_memcpy(&dst, src1, sizeof(dst));
34673       r_.f32[i] = dst;
34674     }
34675     else {
34676       r_.f32[i] = src_.f32[i];
34677     }
34678   }
34679 
34680   return simde__m128_from_private(r_);
34681 }
34682 #if defined(SIMDE_X86_AVX2_NATIVE)
34683   #define simde_mm_mask_i64gather_ps(src, base_addr, vindex, mask, scale) _mm_mask_i64gather_ps(src, SIMDE_CHECKED_REINTERPRET_CAST(float const*, simde_float32 const*, base_addr), vindex, mask, scale)
34684 #endif
34685 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
34686   #undef _mm_mask_i64gather_ps
34687   #define _mm_mask_i64gather_ps(src, base_addr, vindex, mask, scale) simde_mm_mask_i64gather_ps(src, SIMDE_CHECKED_REINTERPRET_CAST(simde_float32 const*, float const*, base_addr), vindex, mask, scale)
34688 #endif
34689 
34690 SIMDE_FUNCTION_ATTRIBUTES
34691 simde__m128
34692 simde_mm256_i64gather_ps(const simde_float32* base_addr, simde__m256i vindex, const int32_t scale)
34693     SIMDE_REQUIRE_CONSTANT(scale)
34694     HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
34695   simde__m256i_private
34696     vindex_ = simde__m256i_to_private(vindex);
34697   simde__m128_private
34698     r_ = simde__m128_to_private(simde_mm_setzero_ps());
34699   const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
34700 
34701   SIMDE_VECTORIZE
34702   for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) {
34703     const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale));
34704     simde_float32 dst;
34705     simde_memcpy(&dst, src, sizeof(dst));
34706     r_.f32[i] = dst;
34707   }
34708 
34709   return simde__m128_from_private(r_);
34710 }
34711 #if defined(SIMDE_X86_AVX2_NATIVE)
34712   #define simde_mm256_i64gather_ps(base_addr, vindex, scale) _mm256_i64gather_ps(SIMDE_CHECKED_REINTERPRET_CAST(float const*, simde_float32 const*, base_addr), vindex, scale)
34713 #endif
34714 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
34715   #undef _mm256_i64gather_ps
34716   #define _mm256_i64gather_ps(base_addr, vindex, scale) simde_mm256_i64gather_ps(SIMDE_CHECKED_REINTERPRET_CAST(simde_float32 const*, float const*, base_addr), vindex, scale)
34717 #endif
34718 
34719 SIMDE_FUNCTION_ATTRIBUTES
34720 simde__m128
34721 simde_mm256_mask_i64gather_ps(simde__m128 src, const simde_float32* base_addr, simde__m256i vindex, simde__m128 mask, const int32_t scale)
34722     SIMDE_REQUIRE_CONSTANT(scale)
34723     HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
34724   simde__m256i_private
34725     vindex_ = simde__m256i_to_private(vindex);
34726   simde__m128_private
34727     src_ = simde__m128_to_private(src),
34728     mask_ = simde__m128_to_private(mask),
34729     r_ = simde__m128_to_private(simde_mm_setzero_ps());
34730   const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
34731 
34732   SIMDE_VECTORIZE
34733   for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) {
34734     if ((mask_.i32[i] >> 31) & 1) {
34735       const uint8_t* src1 = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale));
34736       simde_float32 dst;
34737       simde_memcpy(&dst, src1, sizeof(dst));
34738       r_.f32[i] = dst;
34739     }
34740     else {
34741       r_.f32[i] = src_.f32[i];
34742     }
34743   }
34744 
34745   return simde__m128_from_private(r_);
34746 }
34747 #if defined(SIMDE_X86_AVX2_NATIVE)
34748   #define simde_mm256_mask_i64gather_ps(src, base_addr, vindex, mask, scale) _mm256_mask_i64gather_ps(src, SIMDE_CHECKED_REINTERPRET_CAST(float const*, simde_float32 const*, base_addr), vindex, mask, scale)
34749 #endif
34750 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
34751   #undef _mm256_mask_i64gather_ps
34752   #define _mm256_mask_i64gather_ps(src, base_addr, vindex, mask, scale) simde_mm256_mask_i64gather_ps(src, SIMDE_CHECKED_REINTERPRET_CAST(simde_float32 const*, float const*, base_addr), vindex, mask, scale)
34753 #endif
34754 
34755 SIMDE_FUNCTION_ATTRIBUTES
34756 simde__m128d
34757 simde_mm_i32gather_pd(const simde_float64* base_addr, simde__m128i vindex, const int32_t scale)
34758     SIMDE_REQUIRE_CONSTANT(scale)
34759     HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
34760   simde__m128i_private
34761     vindex_ = simde__m128i_to_private(vindex);
34762   simde__m128d_private
34763     r_;
34764   const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
34765 
34766   SIMDE_VECTORIZE
34767   for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
34768     const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i32[i]) * HEDLEY_STATIC_CAST(size_t , scale));
34769     simde_float64 dst;
34770     simde_memcpy(&dst, src, sizeof(dst));
34771     r_.f64[i] = dst;
34772   }
34773 
34774   return simde__m128d_from_private(r_);
34775 }
34776 #if defined(SIMDE_X86_AVX2_NATIVE)
34777   #define simde_mm_i32gather_pd(base_addr, vindex, scale) _mm_i32gather_pd(HEDLEY_REINTERPRET_CAST(simde_float64 const*, base_addr), vindex, scale)
34778 #endif
34779 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
34780   #undef _mm_i32gather_pd
34781   #define _mm_i32gather_pd(base_addr, vindex, scale) simde_mm_i32gather_pd(HEDLEY_REINTERPRET_CAST(simde_float64 const*, base_addr), vindex, scale)
34782 #endif
34783 
34784 SIMDE_FUNCTION_ATTRIBUTES
34785 simde__m128d
34786 simde_mm_mask_i32gather_pd(simde__m128d src, const simde_float64* base_addr, simde__m128i vindex, simde__m128d mask, const int32_t scale)
34787     SIMDE_REQUIRE_CONSTANT(scale)
34788     HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
34789   simde__m128i_private
34790     vindex_ = simde__m128i_to_private(vindex);
34791   simde__m128d_private
34792     src_ = simde__m128d_to_private(src),
34793     mask_ = simde__m128d_to_private(mask),
34794     r_;
34795   const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
34796 
34797   SIMDE_VECTORIZE
34798   for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
34799     if ((mask_.i64[i] >> 63) & 1) {
34800       const uint8_t* src1 = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i32[i]) * HEDLEY_STATIC_CAST(size_t , scale));
34801       simde_float64 dst;
34802       simde_memcpy(&dst, src1, sizeof(dst));
34803       r_.f64[i] = dst;
34804     }
34805     else {
34806       r_.f64[i] = src_.f64[i];
34807     }
34808   }
34809 
34810   return simde__m128d_from_private(r_);
34811 }
34812 #if defined(SIMDE_X86_AVX2_NATIVE)
34813   #define simde_mm_mask_i32gather_pd(src, base_addr, vindex, mask, scale) _mm_mask_i32gather_pd(src, HEDLEY_REINTERPRET_CAST(simde_float64 const*, base_addr), vindex, mask, scale)
34814 #endif
34815 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
34816   #undef _mm_mask_i32gather_pd
34817   #define _mm_mask_i32gather_pd(src, base_addr, vindex, mask, scale) simde_mm_mask_i32gather_pd(src, HEDLEY_REINTERPRET_CAST(simde_float64 const*, base_addr), vindex, mask, scale)
34818 #endif
34819 
34820 SIMDE_FUNCTION_ATTRIBUTES
34821 simde__m256d
34822 simde_mm256_i32gather_pd(const simde_float64* base_addr, simde__m128i vindex, const int32_t scale)
34823     SIMDE_REQUIRE_CONSTANT(scale)
34824     HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
34825   simde__m128i_private
34826     vindex_ = simde__m128i_to_private(vindex);
34827   simde__m256d_private
34828     r_;
34829   const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
34830 
34831   SIMDE_VECTORIZE
34832   for (size_t i = 0 ; i < (sizeof(vindex_.i32) / sizeof(vindex_.i32[0])) ; i++) {
34833     const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i32[i]) * HEDLEY_STATIC_CAST(size_t , scale));
34834     simde_float64 dst;
34835     simde_memcpy(&dst, src, sizeof(dst));
34836     r_.f64[i] = dst;
34837   }
34838 
34839   return simde__m256d_from_private(r_);
34840 }
34841 #if defined(SIMDE_X86_AVX2_NATIVE)
34842   #define simde_mm256_i32gather_pd(base_addr, vindex, scale) _mm256_i32gather_pd(HEDLEY_REINTERPRET_CAST(simde_float64 const*, base_addr), vindex, scale)
34843 #endif
34844 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
34845   #undef _mm256_i32gather_pd
34846   #define _mm256_i32gather_pd(base_addr, vindex, scale) simde_mm256_i32gather_pd(HEDLEY_REINTERPRET_CAST(simde_float64 const*, base_addr), vindex, scale)
34847 #endif
34848 
34849 SIMDE_FUNCTION_ATTRIBUTES
34850 simde__m256d
34851 simde_mm256_mask_i32gather_pd(simde__m256d src, const simde_float64* base_addr, simde__m128i vindex, simde__m256d mask, const int32_t scale)
34852     SIMDE_REQUIRE_CONSTANT(scale)
34853     HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
34854   simde__m256d_private
34855     src_ = simde__m256d_to_private(src),
34856     mask_ = simde__m256d_to_private(mask),
34857     r_;
34858   simde__m128i_private
34859     vindex_ = simde__m128i_to_private(vindex);
34860   const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
34861 
34862   SIMDE_VECTORIZE
34863   for (size_t i = 0 ; i < (sizeof(vindex_.i32) / sizeof(vindex_.i32[0])) ; i++) {
34864     if ((mask_.i64[i] >> 63) & 1) {
34865       const uint8_t* src1 = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i32[i]) * HEDLEY_STATIC_CAST(size_t , scale));
34866       simde_float64 dst;
34867       simde_memcpy(&dst, src1, sizeof(dst));
34868       r_.f64[i] = dst;
34869     }
34870     else {
34871       r_.f64[i] = src_.f64[i];
34872     }
34873   }
34874 
34875   return simde__m256d_from_private(r_);
34876 }
34877 #if defined(SIMDE_X86_AVX2_NATIVE)
34878   #define simde_mm256_mask_i32gather_pd(src, base_addr, vindex, mask, scale) _mm256_mask_i32gather_pd(src, HEDLEY_REINTERPRET_CAST(simde_float64 const*, base_addr), vindex, mask, scale)
34879 #endif
34880 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
34881   #undef _mm256_mask_i32gather_pd
34882   #define _mm256_mask_i32gather_pd(src, base_addr, vindex, mask, scale) simde_mm256_mask_i32gather_pd(src, HEDLEY_REINTERPRET_CAST(simde_float64 const*, base_addr), vindex, mask, scale)
34883 #endif
34884 
34885 SIMDE_FUNCTION_ATTRIBUTES
34886 simde__m128d
34887 simde_mm_i64gather_pd(const simde_float64* base_addr, simde__m128i vindex, const int32_t scale)
34888     SIMDE_REQUIRE_CONSTANT(scale)
34889     HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
34890   simde__m128i_private
34891     vindex_ = simde__m128i_to_private(vindex);
34892   simde__m128d_private
34893     r_ = simde__m128d_to_private(simde_mm_setzero_pd());
34894   const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
34895 
34896   SIMDE_VECTORIZE
34897   for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) {
34898     const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale));
34899     simde_float64 dst;
34900     simde_memcpy(&dst, src, sizeof(dst));
34901     r_.f64[i] = dst;
34902   }
34903 
34904   return simde__m128d_from_private(r_);
34905 }
34906 #if defined(SIMDE_X86_AVX2_NATIVE)
34907   #define simde_mm_i64gather_pd(base_addr, vindex, scale) _mm_i64gather_pd(HEDLEY_REINTERPRET_CAST(simde_float64 const*, base_addr), vindex, scale)
34908 #endif
34909 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
34910   #undef _mm_i64gather_pd
34911   #define _mm_i64gather_pd(base_addr, vindex, scale) simde_mm_i64gather_pd(HEDLEY_REINTERPRET_CAST(simde_float64 const*, base_addr), vindex, scale)
34912 #endif
34913 
34914 SIMDE_FUNCTION_ATTRIBUTES
34915 simde__m128d
34916 simde_mm_mask_i64gather_pd(simde__m128d src, const simde_float64* base_addr, simde__m128i vindex, simde__m128d mask, const int32_t scale)
34917     SIMDE_REQUIRE_CONSTANT(scale)
34918     HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
34919   simde__m128i_private
34920     vindex_ = simde__m128i_to_private(vindex);
34921   simde__m128d_private
34922     src_ = simde__m128d_to_private(src),
34923     mask_ = simde__m128d_to_private(mask),
34924     r_ = simde__m128d_to_private(simde_mm_setzero_pd());
34925   const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
34926 
34927   SIMDE_VECTORIZE
34928   for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) {
34929     if ((mask_.i64[i] >> 63) & 1) {
34930       const uint8_t* src1 = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale));
34931       simde_float64 dst;
34932       simde_memcpy(&dst, src1, sizeof(dst));
34933       r_.f64[i] = dst;
34934     }
34935     else {
34936       r_.f64[i] = src_.f64[i];
34937     }
34938   }
34939 
34940   return simde__m128d_from_private(r_);
34941 }
34942 #if defined(SIMDE_X86_AVX2_NATIVE)
34943   #define simde_mm_mask_i64gather_pd(src, base_addr, vindex, mask, scale) _mm_mask_i64gather_pd(src, HEDLEY_REINTERPRET_CAST(simde_float64 const*, base_addr), vindex, mask, scale)
34944 #endif
34945 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
34946   #undef _mm_mask_i64gather_pd
34947   #define _mm_mask_i64gather_pd(src, base_addr, vindex, mask, scale) simde_mm_mask_i64gather_pd(src, HEDLEY_REINTERPRET_CAST(simde_float64 const*, base_addr), vindex, mask, scale)
34948 #endif
34949 
34950 SIMDE_FUNCTION_ATTRIBUTES
34951 simde__m256d
34952 simde_mm256_i64gather_pd(const simde_float64* base_addr, simde__m256i vindex, const int32_t scale)
34953     SIMDE_REQUIRE_CONSTANT(scale)
34954     HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
34955   simde__m256i_private
34956     vindex_ = simde__m256i_to_private(vindex);
34957   simde__m256d_private
34958     r_ = simde__m256d_to_private(simde_mm256_setzero_pd());
34959   const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
34960 
34961   SIMDE_VECTORIZE
34962   for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) {
34963     const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale));
34964     simde_float64 dst;
34965     simde_memcpy(&dst, src, sizeof(dst));
34966     r_.f64[i] = dst;
34967   }
34968 
34969   return simde__m256d_from_private(r_);
34970 }
34971 #if defined(SIMDE_X86_AVX2_NATIVE)
34972   #define simde_mm256_i64gather_pd(base_addr, vindex, scale) _mm256_i64gather_pd(HEDLEY_REINTERPRET_CAST(simde_float64 const*, base_addr), vindex, scale)
34973 #endif
34974 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
34975   #undef _mm256_i64gather_pd
34976   #define _mm256_i64gather_pd(base_addr, vindex, scale) simde_mm256_i64gather_pd(HEDLEY_REINTERPRET_CAST(simde_float64 const*, base_addr), vindex, scale)
34977 #endif
34978 
34979 SIMDE_FUNCTION_ATTRIBUTES
34980 simde__m256d
34981 simde_mm256_mask_i64gather_pd(simde__m256d src, const simde_float64* base_addr, simde__m256i vindex, simde__m256d mask, const int32_t scale)
34982     SIMDE_REQUIRE_CONSTANT(scale)
34983     HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
34984   simde__m256i_private
34985     vindex_ = simde__m256i_to_private(vindex);
34986   simde__m256d_private
34987     src_ = simde__m256d_to_private(src),
34988     mask_ = simde__m256d_to_private(mask),
34989     r_ = simde__m256d_to_private(simde_mm256_setzero_pd());
34990   const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
34991 
34992   SIMDE_VECTORIZE
34993   for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) {
34994     if ((mask_.i64[i] >> 63) & 1) {
34995       const uint8_t* src1 = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale));
34996       simde_float64 dst;
34997       simde_memcpy(&dst, src1, sizeof(dst));
34998       r_.f64[i] = dst;
34999     }
35000     else {
35001       r_.f64[i] = src_.f64[i];
35002     }
35003   }
35004 
35005   return simde__m256d_from_private(r_);
35006 }
35007 #if defined(SIMDE_X86_AVX2_NATIVE)
35008   #define simde_mm256_mask_i64gather_pd(src, base_addr, vindex, mask, scale) _mm256_mask_i64gather_pd(src, HEDLEY_REINTERPRET_CAST(simde_float64 const*, base_addr), vindex, mask, scale)
35009 #endif
35010 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
35011   #undef _mm256_mask_i64gather_pd
35012   #define _mm256_mask_i64gather_pd(src, base_addr, vindex, mask, scale) simde_mm256_mask_i64gather_pd(src, HEDLEY_REINTERPRET_CAST(simde_float64 const*, base_addr), vindex, mask, scale)
35013 #endif
35014 
35015 SIMDE_FUNCTION_ATTRIBUTES
35016 simde__m256i
35017 simde_mm256_inserti128_si256(simde__m256i a, simde__m128i b, const int imm8)
35018     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 1) {
35019   simde__m256i_private a_ = simde__m256i_to_private(a);
35020   simde__m128i_private b_ = simde__m128i_to_private(b);
35021 
35022   a_.m128i_private[ imm8 & 1 ] = b_;
35023 
35024   return simde__m256i_from_private(a_);
35025 }
35026 #if defined(SIMDE_X86_AVX2_NATIVE)
35027   #define simde_mm256_inserti128_si256(a, b, imm8) _mm256_inserti128_si256(a, b, imm8)
35028 #endif
35029 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
35030   #undef _mm256_inserti128_si256
35031   #define _mm256_inserti128_si256(a, b, imm8) simde_mm256_inserti128_si256(a, b, imm8)
35032 #endif
35033 
35034 SIMDE_FUNCTION_ATTRIBUTES
35035 simde__m256i
35036 simde_mm256_madd_epi16 (simde__m256i a, simde__m256i b) {
35037   #if defined(SIMDE_X86_AVX2_NATIVE)
35038     return _mm256_madd_epi16(a, b);
35039   #else
35040     simde__m256i_private
35041       r_,
35042       a_ = simde__m256i_to_private(a),
35043       b_ = simde__m256i_to_private(b);
35044 
35045     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
35046       r_.m128i[0] = simde_mm_madd_epi16(a_.m128i[0], b_.m128i[0]);
35047       r_.m128i[1] = simde_mm_madd_epi16(a_.m128i[1], b_.m128i[1]);
35048     #else
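      /* Multiply adjacent pairs of signed 16-bit elements and sum each pair
       * into a 32-bit result. */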
35049       SIMDE_VECTORIZE
35050       for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.i16[0])) ; i += 2) {
35051         r_.i32[i / 2] = (a_.i16[i] * b_.i16[i]) + (a_.i16[i + 1] * b_.i16[i + 1]);
35052       }
35053     #endif
35054 
35055     return simde__m256i_from_private(r_);
35056   #endif
35057 }
35058 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
35059   #undef _mm256_madd_epi16
35060   #define _mm256_madd_epi16(a, b) simde_mm256_madd_epi16(a, b)
35061 #endif
35062 
35063 SIMDE_FUNCTION_ATTRIBUTES
35064 simde__m256i
35065 simde_mm256_maddubs_epi16 (simde__m256i a, simde__m256i b) {
35066   #if defined(SIMDE_X86_AVX2_NATIVE)
35067     return _mm256_maddubs_epi16(a, b);
35068   #else
35069     simde__m256i_private
35070       r_,
35071       a_ = simde__m256i_to_private(a),
35072       b_ = simde__m256i_to_private(b);
35073 
35074     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
35075       r_.m128i[0] = simde_mm_maddubs_epi16(a_.m128i[0], b_.m128i[0]);
35076       r_.m128i[1] = simde_mm_maddubs_epi16(a_.m128i[1], b_.m128i[1]);
35077     #else
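      /* Multiply unsigned bytes of `a' by signed bytes of `b', sum each
       * adjacent pair, and saturate the result to a signed 16-bit lane. */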
35078       SIMDE_VECTORIZE
35079       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
35080         const int idx = HEDLEY_STATIC_CAST(int, i) << 1;
35081         int32_t ts =
35082           (HEDLEY_STATIC_CAST(int16_t, a_.u8[  idx  ]) * HEDLEY_STATIC_CAST(int16_t, b_.i8[  idx  ])) +
35083           (HEDLEY_STATIC_CAST(int16_t, a_.u8[idx + 1]) * HEDLEY_STATIC_CAST(int16_t, b_.i8[idx + 1]));
35084         r_.i16[i] = (ts > INT16_MIN) ? ((ts < INT16_MAX) ? HEDLEY_STATIC_CAST(int16_t, ts) : INT16_MAX) : INT16_MIN;
35085       }
35086     #endif
35087 
35088     return simde__m256i_from_private(r_);
35089   #endif
35090 }
35091 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
35092   #undef _mm256_maddubs_epi16
35093   #define _mm256_maddubs_epi16(a, b) simde_mm256_maddubs_epi16(a, b)
35094 #endif
35095 
35096 SIMDE_FUNCTION_ATTRIBUTES
35097 simde__m128i
35098 simde_mm_maskload_epi32 (const int32_t mem_addr[HEDLEY_ARRAY_PARAM(4)], simde__m128i mask) {
35099   #if defined(SIMDE_X86_AVX2_NATIVE)
35100     return _mm_maskload_epi32(mem_addr, mask);
35101   #else
35102     simde__m128i_private
35103       mem_ = simde__m128i_to_private(simde_x_mm_loadu_epi32(mem_addr)),
35104       r_,
35105       mask_ = simde__m128i_to_private(mask);
35106 
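    /* An arithmetic right shift broadcasts each mask element's sign bit
     * across the lane; the AND then zeroes lanes whose mask bit is clear. */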
35107     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
35108       r_.neon_i32 = vandq_s32(mem_.neon_i32, vshrq_n_s32(mask_.neon_i32, 31));
35109     #else
35110       SIMDE_VECTORIZE
35111       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
35112         r_.i32[i] = mem_.i32[i] & (mask_.i32[i] >> 31);
35113       }
35114     #endif
35115 
35116     return simde__m128i_from_private(r_);
35117   #endif
35118 }
35119 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
35120   #undef _mm_maskload_epi32
35121   #define _mm_maskload_epi32(mem_addr, mask) simde_mm_maskload_epi32(HEDLEY_REINTERPRET_CAST(int32_t const*, mem_addr), mask)
35122 #endif
35123 
35124 SIMDE_FUNCTION_ATTRIBUTES
35125 simde__m256i
35126 simde_mm256_maskload_epi32 (const int32_t mem_addr[HEDLEY_ARRAY_PARAM(8)], simde__m256i mask) {
35127   #if defined(SIMDE_X86_AVX2_NATIVE)
35128     return _mm256_maskload_epi32(mem_addr, mask);
35129   #else
35130     simde__m256i_private
35131       mask_ = simde__m256i_to_private(mask),
35132       r_ = simde__m256i_to_private(simde_x_mm256_loadu_epi32(mem_addr));
35133 
35134     SIMDE_VECTORIZE
35135     for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
35136       r_.i32[i] &= mask_.i32[i] >> 31;
35137     }
35138 
35139     return simde__m256i_from_private(r_);
35140   #endif
35141 }
35142 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
35143   #undef _mm256_maskload_epi32
35144   #define _mm256_maskload_epi32(mem_addr, mask) simde_mm256_maskload_epi32(HEDLEY_REINTERPRET_CAST(int32_t const*, mem_addr), mask)
35145 #endif
35146 
35147 SIMDE_FUNCTION_ATTRIBUTES
35148 simde__m128i
35149 simde_mm_maskload_epi64 (const int64_t mem_addr[HEDLEY_ARRAY_PARAM(2)], simde__m128i mask) {
35150   #if defined(SIMDE_X86_AVX2_NATIVE)
35151     return _mm_maskload_epi64(HEDLEY_REINTERPRET_CAST(const long long *, mem_addr), mask);
35152   #else
35153     simde__m128i_private
35154       mem_ = simde__m128i_to_private(simde_x_mm_loadu_epi64((mem_addr))),
35155       r_,
35156       mask_ = simde__m128i_to_private(mask);
35157 
35158     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
35159       r_.neon_i64 = vandq_s64(mem_.neon_i64, vshrq_n_s64(mask_.neon_i64, 63));
35160     #else
35161       SIMDE_VECTORIZE
35162       for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
35163         r_.i64[i] = mem_.i64[i] & (mask_.i64[i] >> 63);
35164       }
35165     #endif
35166 
35167     return simde__m128i_from_private(r_);
35168   #endif
35169 }
35170 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
35171   #undef _mm_maskload_epi64
35172   #define _mm_maskload_epi64(mem_addr, mask) simde_mm_maskload_epi64(HEDLEY_REINTERPRET_CAST(int64_t const*, mem_addr), mask)
35173 #endif
35174 
35175 SIMDE_FUNCTION_ATTRIBUTES
35176 simde__m256i
35177 simde_mm256_maskload_epi64 (const int64_t mem_addr[HEDLEY_ARRAY_PARAM(4)], simde__m256i mask) {
35178   #if defined(SIMDE_X86_AVX2_NATIVE)
35179     return _mm256_maskload_epi64(HEDLEY_REINTERPRET_CAST(const long long *, mem_addr), mask);
35180   #else
35181     simde__m256i_private
35182       mask_ = simde__m256i_to_private(mask),
35183       r_ = simde__m256i_to_private(simde_x_mm256_loadu_epi64((mem_addr)));
35184 
35185     SIMDE_VECTORIZE
35186     for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
35187       r_.i64[i] &= mask_.i64[i] >> 63;
35188     }
35189 
35190     return simde__m256i_from_private(r_);
35191   #endif
35192 }
35193 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
35194   #undef _mm256_maskload_epi64
35195   #define _mm256_maskload_epi64(mem_addr, mask) simde_mm256_maskload_epi64(HEDLEY_REINTERPRET_CAST(int64_t const*, mem_addr), mask)
35196 #endif
35197 
35198 SIMDE_FUNCTION_ATTRIBUTES
35199 void
35200 simde_mm_maskstore_epi32 (int32_t mem_addr[HEDLEY_ARRAY_PARAM(4)], simde__m128i mask, simde__m128i a) {
35201   #if defined(SIMDE_X86_AVX2_NATIVE)
35202     _mm_maskstore_epi32(mem_addr, mask, a);
35203   #else
35204     simde__m128i_private mask_ = simde__m128i_to_private(mask);
35205     simde__m128i_private a_ = simde__m128i_to_private(a);
35206 
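    /* Store a lane back to memory only when the high bit of the
     * corresponding mask element is set; other lanes are left untouched. */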
35207     SIMDE_VECTORIZE
35208     for (size_t i = 0 ; i < (sizeof(a_.i32) / sizeof(a_.i32[0])) ; i++) {
35209       if (mask_.u32[i] & (UINT32_C(1) << 31))
35210         mem_addr[i] = a_.i32[i];
35211     }
35212   #endif
35213 }
35214 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
35215   #undef _mm_maskstore_epi32
35216   #define _mm_maskstore_epi32(mem_addr, mask, a) simde_mm_maskstore_epi32(HEDLEY_REINTERPRET_CAST(int32_t *, mem_addr), mask, a)
35217 #endif
35218 
35219 SIMDE_FUNCTION_ATTRIBUTES
35220 void
35221 simde_mm256_maskstore_epi32 (int32_t mem_addr[HEDLEY_ARRAY_PARAM(8)], simde__m256i mask, simde__m256i a) {
35222   #if defined(SIMDE_X86_AVX2_NATIVE)
35223     _mm256_maskstore_epi32(mem_addr, mask, a);
35224   #else
35225     simde__m256i_private mask_ = simde__m256i_to_private(mask);
35226     simde__m256i_private a_ = simde__m256i_to_private(a);
35227 
35228     SIMDE_VECTORIZE
35229     for (size_t i = 0 ; i < (sizeof(a_.i32) / sizeof(a_.i32[0])) ; i++) {
35230       if (mask_.u32[i] & (UINT32_C(1) << 31))
35231         mem_addr[i] = a_.i32[i];
35232     }
35233   #endif
35234 }
35235 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
35236   #undef _mm256_maskstore_epi32
35237   #define _mm256_maskstore_epi32(mem_addr, mask, a) simde_mm256_maskstore_epi32(HEDLEY_REINTERPRET_CAST(int32_t *, mem_addr), mask, a)
35238 #endif
35239 
35240 SIMDE_FUNCTION_ATTRIBUTES
35241 void
35242 simde_mm_maskstore_epi64 (int64_t mem_addr[HEDLEY_ARRAY_PARAM(2)], simde__m128i mask, simde__m128i a) {
35243   #if defined(SIMDE_X86_AVX2_NATIVE)
35244     _mm_maskstore_epi64(HEDLEY_REINTERPRET_CAST(long long *, mem_addr), mask, a);
35245   #else
35246     simde__m128i_private mask_ = simde__m128i_to_private(mask);
35247     simde__m128i_private a_ = simde__m128i_to_private(a);
35248 
35249     SIMDE_VECTORIZE
35250     for (size_t i = 0 ; i < (sizeof(a_.i64) / sizeof(a_.i64[0])) ; i++) {
35251       if (mask_.u64[i] >> 63)
35252         mem_addr[i] = a_.i64[i];
35253     }
35254   #endif
35255 }
35256 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
35257   #undef _mm_maskstore_epi64
35258   #define _mm_maskstore_epi64(mem_addr, mask, a) simde_mm_maskstore_epi64(HEDLEY_REINTERPRET_CAST(int64_t *, mem_addr), mask, a)
35259 #endif
35260 
35261 SIMDE_FUNCTION_ATTRIBUTES
35262 void
35263 simde_mm256_maskstore_epi64 (int64_t mem_addr[HEDLEY_ARRAY_PARAM(4)], simde__m256i mask, simde__m256i a) {
35264   #if defined(SIMDE_X86_AVX2_NATIVE)
35265     _mm256_maskstore_epi64(HEDLEY_REINTERPRET_CAST(long long *, mem_addr), mask, a);
35266   #else
35267     simde__m256i_private mask_ = simde__m256i_to_private(mask);
35268     simde__m256i_private a_ = simde__m256i_to_private(a);
35269 
35270     SIMDE_VECTORIZE
35271     for (size_t i = 0 ; i < (sizeof(a_.i64) / sizeof(a_.i64[0])) ; i++) {
35272       if (mask_.u64[i] & (UINT64_C(1) << 63))
35273         mem_addr[i] = a_.i64[i];
35274     }
35275   #endif
35276 }
35277 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
35278   #undef _mm256_maskstore_epi64
35279   #define _mm256_maskstore_epi64(mem_addr, mask, a) simde_mm256_maskstore_epi64(HEDLEY_REINTERPRET_CAST(int64_t *, mem_addr), mask, a)
35280 #endif
35281 
35282 SIMDE_FUNCTION_ATTRIBUTES
35283 simde__m256i
35284 simde_mm256_max_epi8 (simde__m256i a, simde__m256i b) {
35285   #if defined(SIMDE_X86_AVX2_NATIVE) && !defined(__PGI)
35286     return _mm256_max_epi8(a, b);
35287   #else
35288     simde__m256i_private
35289       r_,
35290       a_ = simde__m256i_to_private(a),
35291       b_ = simde__m256i_to_private(b);
35292 
35293     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
35294       r_.m128i[0] = simde_mm_max_epi8(a_.m128i[0], b_.m128i[0]);
35295       r_.m128i[1] = simde_mm_max_epi8(a_.m128i[1], b_.m128i[1]);
35296     #else
35297       SIMDE_VECTORIZE
35298       for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
35299         r_.i8[i] = a_.i8[i] > b_.i8[i] ? a_.i8[i] : b_.i8[i];
35300       }
35301     #endif
35302 
35303     return simde__m256i_from_private(r_);
35304   #endif
35305 }
35306 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
35307   #undef _mm256_max_epi8
35308   #define _mm256_max_epi8(a, b) simde_mm256_max_epi8(a, b)
35309 #endif
35310 
35311 SIMDE_FUNCTION_ATTRIBUTES
35312 simde__m256i
35313 simde_mm256_max_epu8 (simde__m256i a, simde__m256i b) {
35314   #if defined(SIMDE_X86_AVX2_NATIVE)
35315     return _mm256_max_epu8(a, b);
35316   #else
35317     simde__m256i_private
35318       r_,
35319       a_ = simde__m256i_to_private(a),
35320       b_ = simde__m256i_to_private(b);
35321 
35322     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
35323       r_.m128i[0] = simde_mm_max_epu8(a_.m128i[0], b_.m128i[0]);
35324       r_.m128i[1] = simde_mm_max_epu8(a_.m128i[1], b_.m128i[1]);
35325     #else
35326       SIMDE_VECTORIZE
35327       for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
35328         r_.u8[i] = (a_.u8[i] > b_.u8[i]) ? a_.u8[i] : b_.u8[i];
35329       }
35330     #endif
35331 
35332     return simde__m256i_from_private(r_);
35333   #endif
35334 }
35335 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
35336   #undef _mm256_max_epu8
35337   #define _mm256_max_epu8(a, b) simde_mm256_max_epu8(a, b)
35338 #endif
35339 
35340 SIMDE_FUNCTION_ATTRIBUTES
35341 simde__m256i
35342 simde_mm256_max_epu16 (simde__m256i a, simde__m256i b) {
35343   #if defined(SIMDE_X86_AVX2_NATIVE)
35344     return _mm256_max_epu16(a, b);
35345   #else
35346     simde__m256i_private
35347       r_,
35348       a_ = simde__m256i_to_private(a),
35349       b_ = simde__m256i_to_private(b);
35350 
35351     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
35352       r_.m128i[0] = simde_mm_max_epu16(a_.m128i[0], b_.m128i[0]);
35353       r_.m128i[1] = simde_mm_max_epu16(a_.m128i[1], b_.m128i[1]);
35354     #else
35355       SIMDE_VECTORIZE
35356       for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
35357         r_.u16[i] = (a_.u16[i] > b_.u16[i]) ? a_.u16[i] : b_.u16[i];
35358       }
35359     #endif
35360 
35361     return simde__m256i_from_private(r_);
35362   #endif
35363 }
35364 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
35365   #undef _mm256_max_epu16
35366   #define _mm256_max_epu16(a, b) simde_mm256_max_epu16(a, b)
35367 #endif
35368 
35369 SIMDE_FUNCTION_ATTRIBUTES
35370 simde__m256i
35371 simde_mm256_max_epu32 (simde__m256i a, simde__m256i b) {
35372   #if defined(SIMDE_X86_AVX2_NATIVE)
35373     return _mm256_max_epu32(a, b);
35374   #else
35375     simde__m256i_private
35376       r_,
35377       a_ = simde__m256i_to_private(a),
35378       b_ = simde__m256i_to_private(b);
35379 
35380     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
35381       r_.m128i[0] = simde_mm_max_epu32(a_.m128i[0], b_.m128i[0]);
35382       r_.m128i[1] = simde_mm_max_epu32(a_.m128i[1], b_.m128i[1]);
35383     #else
35384       SIMDE_VECTORIZE
35385       for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
35386         r_.u32[i] = (a_.u32[i] > b_.u32[i]) ? a_.u32[i] : b_.u32[i];
35387       }
35388     #endif
35389 
35390     return simde__m256i_from_private(r_);
35391   #endif
35392 }
35393 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
35394   #undef _mm256_max_epu32
35395   #define _mm256_max_epu32(a, b) simde_mm256_max_epu32(a, b)
35396 #endif
35397 
35398 SIMDE_FUNCTION_ATTRIBUTES
35399 simde__m256i
35400 simde_mm256_max_epi16 (simde__m256i a, simde__m256i b) {
35401   #if defined(SIMDE_X86_AVX2_NATIVE)
35402     return _mm256_max_epi16(a, b);
35403   #else
35404     simde__m256i_private
35405       r_,
35406       a_ = simde__m256i_to_private(a),
35407       b_ = simde__m256i_to_private(b);
35408 
35409     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
35410       r_.m128i[0] = simde_mm_max_epi16(a_.m128i[0], b_.m128i[0]);
35411       r_.m128i[1] = simde_mm_max_epi16(a_.m128i[1], b_.m128i[1]);
35412     #else
35413       SIMDE_VECTORIZE
35414       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
35415         r_.i16[i] = (a_.i16[i] > b_.i16[i]) ? a_.i16[i] : b_.i16[i];
35416       }
35417     #endif
35418 
35419     return simde__m256i_from_private(r_);
35420   #endif
35421 }
35422 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
35423   #undef _mm256_max_epi16
35424   #define _mm256_max_epi16(a, b) simde_mm256_max_epi16(a, b)
35425 #endif
35426 
35427 SIMDE_FUNCTION_ATTRIBUTES
35428 simde__m256i
35429 simde_mm256_max_epi32 (simde__m256i a, simde__m256i b) {
35430   #if defined(SIMDE_X86_AVX2_NATIVE)
35431     return _mm256_max_epi32(a, b);
35432   #else
35433     simde__m256i_private
35434       r_,
35435       a_ = simde__m256i_to_private(a),
35436       b_ = simde__m256i_to_private(b);
35437 
35438     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
35439       r_.m128i[0] = simde_mm_max_epi32(a_.m128i[0], b_.m128i[0]);
35440       r_.m128i[1] = simde_mm_max_epi32(a_.m128i[1], b_.m128i[1]);
35441     #else
35442       SIMDE_VECTORIZE
35443       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
35444         r_.i32[i] = a_.i32[i] > b_.i32[i] ? a_.i32[i] : b_.i32[i];
35445       }
35446     #endif
35447 
35448     return simde__m256i_from_private(r_);
35449   #endif
35450 }
35451 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
35452   #undef _mm256_max_epi32
35453   #define _mm256_max_epi32(a, b) simde_mm256_max_epi32(a, b)
35454 #endif
35455 
35456 SIMDE_FUNCTION_ATTRIBUTES
35457 simde__m256i
35458 simde_mm256_min_epi8 (simde__m256i a, simde__m256i b) {
35459   #if defined(SIMDE_X86_AVX2_NATIVE) && !defined(__PGI)
35460     return _mm256_min_epi8(a, b);
35461   #else
35462     simde__m256i_private
35463       r_,
35464       a_ = simde__m256i_to_private(a),
35465       b_ = simde__m256i_to_private(b);
35466 
35467     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
35468       r_.m128i[0] = simde_mm_min_epi8(a_.m128i[0], b_.m128i[0]);
35469       r_.m128i[1] = simde_mm_min_epi8(a_.m128i[1], b_.m128i[1]);
35470     #else
35471       SIMDE_VECTORIZE
35472       for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
35473         r_.i8[i] = a_.i8[i] < b_.i8[i] ? a_.i8[i] : b_.i8[i];
35474       }
35475     #endif
35476 
35477     return simde__m256i_from_private(r_);
35478   #endif
35479 }
35480 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
35481   #undef _mm256_min_epi8
35482   #define _mm256_min_epi8(a, b) simde_mm256_min_epi8(a, b)
35483 #endif
35484 
35485 SIMDE_FUNCTION_ATTRIBUTES
35486 simde__m256i
35487 simde_mm256_min_epi16 (simde__m256i a, simde__m256i b) {
35488   #if defined(SIMDE_X86_AVX2_NATIVE)
35489     return _mm256_min_epi16(a, b);
35490   #else
35491     simde__m256i_private
35492       r_,
35493       a_ = simde__m256i_to_private(a),
35494       b_ = simde__m256i_to_private(b);
35495 
35496     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
35497       r_.m128i[0] = simde_mm_min_epi16(a_.m128i[0], b_.m128i[0]);
35498       r_.m128i[1] = simde_mm_min_epi16(a_.m128i[1], b_.m128i[1]);
35499     #else
35500       SIMDE_VECTORIZE
35501       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
35502         r_.i16[i] = (a_.i16[i] < b_.i16[i]) ? a_.i16[i] : b_.i16[i];
35503       }
35504     #endif
35505 
35506     return simde__m256i_from_private(r_);
35507   #endif
35508 }
35509 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
35510   #undef _mm256_min_epi16
35511   #define _mm256_min_epi16(a, b) simde_mm256_min_epi16(a, b)
35512 #endif
35513 
35514 SIMDE_FUNCTION_ATTRIBUTES
35515 simde__m256i
35516 simde_mm256_min_epi32 (simde__m256i a, simde__m256i b) {
35517   #if defined(SIMDE_X86_AVX2_NATIVE)
35518     return _mm256_min_epi32(a, b);
35519   #else
35520     simde__m256i_private
35521       r_,
35522       a_ = simde__m256i_to_private(a),
35523       b_ = simde__m256i_to_private(b);
35524 
35525     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
35526       r_.m128i[0] = simde_mm_min_epi32(a_.m128i[0], b_.m128i[0]);
35527       r_.m128i[1] = simde_mm_min_epi32(a_.m128i[1], b_.m128i[1]);
35528     #else
35529       SIMDE_VECTORIZE
35530       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
35531         r_.i32[i] = a_.i32[i] < b_.i32[i] ? a_.i32[i] : b_.i32[i];
35532       }
35533     #endif
35534 
35535     return simde__m256i_from_private(r_);
35536   #endif
35537 }
35538 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
35539   #undef _mm256_min_epi32
35540   #define _mm256_min_epi32(a, b) simde_mm256_min_epi32(a, b)
35541 #endif
35542 
35543 SIMDE_FUNCTION_ATTRIBUTES
35544 simde__m256i
35545 simde_mm256_min_epu8 (simde__m256i a, simde__m256i b) {
35546   #if defined(SIMDE_X86_AVX2_NATIVE)
35547     return _mm256_min_epu8(a, b);
35548   #else
35549     simde__m256i_private
35550       r_,
35551       a_ = simde__m256i_to_private(a),
35552       b_ = simde__m256i_to_private(b);
35553 
35554     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
35555       r_.m128i[0] = simde_mm_min_epu8(a_.m128i[0], b_.m128i[0]);
35556       r_.m128i[1] = simde_mm_min_epu8(a_.m128i[1], b_.m128i[1]);
35557     #else
35558       SIMDE_VECTORIZE
35559       for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
35560         r_.u8[i] = (a_.u8[i] < b_.u8[i]) ? a_.u8[i] : b_.u8[i];
35561       }
35562     #endif
35563 
35564     return simde__m256i_from_private(r_);
35565   #endif
35566 }
35567 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
35568   #undef _mm256_min_epu8
35569   #define _mm256_min_epu8(a, b) simde_mm256_min_epu8(a, b)
35570 #endif
35571 
35572 SIMDE_FUNCTION_ATTRIBUTES
35573 simde__m256i
35574 simde_mm256_min_epu16 (simde__m256i a, simde__m256i b) {
35575   #if defined(SIMDE_X86_AVX2_NATIVE)
35576     return _mm256_min_epu16(a, b);
35577   #else
35578     simde__m256i_private
35579       r_,
35580       a_ = simde__m256i_to_private(a),
35581       b_ = simde__m256i_to_private(b);
35582 
35583     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
35584       r_.m128i[0] = simde_mm_min_epu16(a_.m128i[0], b_.m128i[0]);
35585       r_.m128i[1] = simde_mm_min_epu16(a_.m128i[1], b_.m128i[1]);
35586     #else
35587       SIMDE_VECTORIZE
35588       for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
35589         r_.u16[i] = (a_.u16[i] < b_.u16[i]) ? a_.u16[i] : b_.u16[i];
35590       }
35591     #endif
35592 
35593     return simde__m256i_from_private(r_);
35594   #endif
35595 }
35596 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
35597   #undef _mm256_min_epu16
35598   #define _mm256_min_epu16(a, b) simde_mm256_min_epu16(a, b)
35599 #endif
35600 
35601 SIMDE_FUNCTION_ATTRIBUTES
35602 simde__m256i
35603 simde_mm256_min_epu32 (simde__m256i a, simde__m256i b) {
35604   #if defined(SIMDE_X86_AVX2_NATIVE)
35605     return _mm256_min_epu32(a, b);
35606   #else
35607     simde__m256i_private
35608       r_,
35609       a_ = simde__m256i_to_private(a),
35610       b_ = simde__m256i_to_private(b);
35611 
35612     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
35613       r_.m128i[0] = simde_mm_min_epu32(a_.m128i[0], b_.m128i[0]);
35614       r_.m128i[1] = simde_mm_min_epu32(a_.m128i[1], b_.m128i[1]);
35615     #else
35616       SIMDE_VECTORIZE
35617       for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
35618         r_.u32[i] = (a_.u32[i] < b_.u32[i]) ? a_.u32[i] : b_.u32[i];
35619       }
35620     #endif
35621 
35622     return simde__m256i_from_private(r_);
35623   #endif
35624 }
35625 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
35626   #undef _mm256_min_epu32
35627   #define _mm256_min_epu32(a, b) simde_mm256_min_epu32(a, b)
35628 #endif
35629 
35630 SIMDE_FUNCTION_ATTRIBUTES
35631 int32_t
35632 simde_mm256_movemask_epi8 (simde__m256i a) {
35633   #if defined(SIMDE_X86_AVX2_NATIVE)
35634     return _mm256_movemask_epi8(a);
35635   #else
35636     simde__m256i_private a_ = simde__m256i_to_private(a);
35637     uint32_t r = 0;
35638 
35639     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
35640       for (size_t i = 0 ; i < (sizeof(a_.m128i) / sizeof(a_.m128i[0])) ; i++) {
35641         r |= HEDLEY_STATIC_CAST(uint32_t,simde_mm_movemask_epi8(a_.m128i[i])) << (16 * i);
35642       }
35643     #else
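      /* Collect the sign bit of each of the 32 bytes into bit i of the
       * 32-bit result. */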
35644       r = 0;
35645       SIMDE_VECTORIZE_REDUCTION(|:r)
35646       for (size_t i = 0 ; i < (sizeof(a_.u8) / sizeof(a_.u8[0])) ; i++) {
35647         r |= HEDLEY_STATIC_CAST(uint32_t, (a_.u8[31 - i] >> 7)) << (31 - i);
35648       }
35649     #endif
35650 
35651     return HEDLEY_STATIC_CAST(int32_t, r);
35652   #endif
35653 }
35654 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
35655   #undef _mm256_movemask_epi8
35656   #define _mm256_movemask_epi8(a) simde_mm256_movemask_epi8(a)
35657 #endif
35658 
35659 SIMDE_FUNCTION_ATTRIBUTES
35660 simde__m256i
35661 simde_mm256_mpsadbw_epu8 (simde__m256i a, simde__m256i b, const int imm8)
35662     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255)  {
35663   simde__m256i_private
35664     r_,
35665     a_ = simde__m256i_to_private(a),
35666     b_ = simde__m256i_to_private(b);
35667 
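  /* imm8 selects the source blocks per 128-bit lane: bit 2 picks the byte
   * offset in `a' and bits 1:0 pick the 4-byte block of `b' for the low
   * lane; bit 5 and bits 4:3 do the same for the high lane. */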
35668   const int a_offset1 = imm8 & 4;
35669   const int b_offset1 = (imm8 & 3) << 2;
35670   const int a_offset2 = (imm8 >> 3) & 4;
35671   const int b_offset2 = ((imm8 >> 3) & 3) << 2;
35672 
35673   #if defined(simde_math_abs)
35674     const int halfway_point = HEDLEY_STATIC_CAST(int, (sizeof(r_.u16) / sizeof(r_.u16[0])) ) / 2;
35675     for (int i = 0 ; i < halfway_point ; i++) {
35676       r_.u16[i] =
35677         HEDLEY_STATIC_CAST(uint16_t, simde_math_abs(HEDLEY_STATIC_CAST(int, a_.u8[a_offset1 + i + 0] - b_.u8[b_offset1 + 0]))) +
35678         HEDLEY_STATIC_CAST(uint16_t, simde_math_abs(HEDLEY_STATIC_CAST(int, a_.u8[a_offset1 + i + 1] - b_.u8[b_offset1 + 1]))) +
35679         HEDLEY_STATIC_CAST(uint16_t, simde_math_abs(HEDLEY_STATIC_CAST(int, a_.u8[a_offset1 + i + 2] - b_.u8[b_offset1 + 2]))) +
35680         HEDLEY_STATIC_CAST(uint16_t, simde_math_abs(HEDLEY_STATIC_CAST(int, a_.u8[a_offset1 + i + 3] - b_.u8[b_offset1 + 3])));
35681       r_.u16[halfway_point + i] =
35682         HEDLEY_STATIC_CAST(uint16_t, simde_math_abs(HEDLEY_STATIC_CAST(int, a_.u8[2 * halfway_point + a_offset2 + i + 0] - b_.u8[2 * halfway_point + b_offset2 + 0]))) +
35683         HEDLEY_STATIC_CAST(uint16_t, simde_math_abs(HEDLEY_STATIC_CAST(int, a_.u8[2 * halfway_point + a_offset2 + i + 1] - b_.u8[2 * halfway_point + b_offset2 + 1]))) +
35684         HEDLEY_STATIC_CAST(uint16_t, simde_math_abs(HEDLEY_STATIC_CAST(int, a_.u8[2 * halfway_point + a_offset2 + i + 2] - b_.u8[2 * halfway_point + b_offset2 + 2]))) +
35685         HEDLEY_STATIC_CAST(uint16_t, simde_math_abs(HEDLEY_STATIC_CAST(int, a_.u8[2 * halfway_point + a_offset2 + i + 3] - b_.u8[2 * halfway_point + b_offset2 + 3])));
35686     }
35687   #else
35688     HEDLEY_UNREACHABLE();
35689   #endif
35690 
35691   return simde__m256i_from_private(r_);
35692 }
35693 #if defined(SIMDE_X86_AVX2_NATIVE) && SIMDE_DETECT_CLANG_VERSION_CHECK(3,9,0)
35694   #define simde_mm256_mpsadbw_epu8(a, b, imm8) _mm256_mpsadbw_epu8(a, b, imm8)
35695 #elif SIMDE_NATURAL_VECTOR_SIZE_LE(128)
35696   #define simde_mm256_mpsadbw_epu8(a, b, imm8) \
35697      simde_mm256_set_m128i( \
35698        simde_mm_mpsadbw_epu8(simde_mm256_extracti128_si256(a, 1), simde_mm256_extracti128_si256(b, 1), (imm8 >> 3)), \
35699        simde_mm_mpsadbw_epu8(simde_mm256_extracti128_si256(a, 0), simde_mm256_extracti128_si256(b, 0), (imm8)))
35700 #endif
35701 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
35702   #undef _mm256_mpsadbw_epu8
35703   #define _mm256_mpsadbw_epu8(a, b, imm8) simde_mm256_mpsadbw_epu8(a, b, imm8)
35704 #endif
35705 
35706 SIMDE_FUNCTION_ATTRIBUTES
35707 simde__m256i
35708 simde_mm256_mul_epi32 (simde__m256i a, simde__m256i b) {
35709   #if defined(SIMDE_X86_AVX2_NATIVE)
35710     return _mm256_mul_epi32(a, b);
35711   #else
35712     simde__m256i_private
35713       r_,
35714       a_ = simde__m256i_to_private(a),
35715       b_ = simde__m256i_to_private(b);
35716 
35717     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
35718       r_.m128i[0] = simde_mm_mul_epi32(a_.m128i[0], b_.m128i[0]);
35719       r_.m128i[1] = simde_mm_mul_epi32(a_.m128i[1], b_.m128i[1]);
35720     #else
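      /* Multiply the even-indexed (low) 32-bit element of each 64-bit lane,
       * sign-extending the product to 64 bits. */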
35721       SIMDE_VECTORIZE
35722       for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
35723         r_.i64[i] =
35724           HEDLEY_STATIC_CAST(int64_t, a_.i32[i * 2]) *
35725           HEDLEY_STATIC_CAST(int64_t, b_.i32[i * 2]);
35726       }
35727     #endif
35728 
35729     return simde__m256i_from_private(r_);
35730   #endif
35731 }
35732 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
35733 #  define _mm256_mul_epi32(a, b) simde_mm256_mul_epi32(a, b)
35734 #endif
35735 
35736 SIMDE_FUNCTION_ATTRIBUTES
35737 simde__m256i
35738 simde_mm256_mul_epu32 (simde__m256i a, simde__m256i b) {
35739   #if defined(SIMDE_X86_AVX2_NATIVE)
35740     return _mm256_mul_epu32(a, b);
35741   #else
35742     simde__m256i_private
35743       r_,
35744       a_ = simde__m256i_to_private(a),
35745       b_ = simde__m256i_to_private(b);
35746 
35747     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
35748       r_.m128i[0] = simde_mm_mul_epu32(a_.m128i[0], b_.m128i[0]);
35749       r_.m128i[1] = simde_mm_mul_epu32(a_.m128i[1], b_.m128i[1]);
35750     #else
35751       SIMDE_VECTORIZE
35752       for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) {
35753         r_.u64[i] = HEDLEY_STATIC_CAST(uint64_t, a_.u32[i * 2]) * HEDLEY_STATIC_CAST(uint64_t, b_.u32[i * 2]);
35754       }
35755     #endif
35756 
35757     return simde__m256i_from_private(r_);
35758   #endif
35759 }
35760 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
35761 #  define _mm256_mul_epu32(a, b) simde_mm256_mul_epu32(a, b)
35762 #endif
35763 
35764 SIMDE_FUNCTION_ATTRIBUTES
35765 simde__m256i
35766 simde_mm256_mulhi_epi16 (simde__m256i a, simde__m256i b) {
35767   #if defined(SIMDE_X86_AVX2_NATIVE)
35768     return _mm256_mulhi_epi16(a, b);
35769   #else
35770     simde__m256i_private
35771       r_,
35772       a_ = simde__m256i_to_private(a),
35773       b_ = simde__m256i_to_private(b);
35774 
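    /* Keep the high 16 bits of the full 32-bit signed product. */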
35775     SIMDE_VECTORIZE
35776     for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
35777       r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, (HEDLEY_STATIC_CAST(uint32_t, HEDLEY_STATIC_CAST(int32_t, a_.i16[i]) * HEDLEY_STATIC_CAST(int32_t, b_.i16[i])) >> 16));
35778     }
35779 
35780     return simde__m256i_from_private(r_);
35781   #endif
35782 }
35783 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
35784 #  define _mm256_mulhi_epi16(a, b) simde_mm256_mulhi_epi16(a, b)
35785 #endif
35786 
35787 SIMDE_FUNCTION_ATTRIBUTES
35788 simde__m256i
35789 simde_mm256_mulhi_epu16 (simde__m256i a, simde__m256i b) {
35790   #if defined(SIMDE_X86_AVX2_NATIVE)
35791     return _mm256_mulhi_epu16(a, b);
35792   #else
35793     simde__m256i_private
35794       r_,
35795       a_ = simde__m256i_to_private(a),
35796       b_ = simde__m256i_to_private(b);
35797 
35798     SIMDE_VECTORIZE
35799     for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
35800       r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, HEDLEY_STATIC_CAST(uint32_t, a_.u16[i]) * HEDLEY_STATIC_CAST(uint32_t, b_.u16[i]) >> 16);
35801     }
35802 
35803     return simde__m256i_from_private(r_);
35804   #endif
35805 }
35806 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
35807 #  define _mm256_mulhi_epu16(a, b) simde_mm256_mulhi_epu16(a, b)
35808 #endif
35809 
35810 SIMDE_FUNCTION_ATTRIBUTES
35811 simde__m256i
35812 simde_mm256_mulhrs_epi16 (simde__m256i a, simde__m256i b) {
35813   #if defined(SIMDE_X86_AVX2_NATIVE)
35814     return _mm256_mulhrs_epi16(a, b);
35815   #else
35816     simde__m256i_private
35817       r_,
35818       a_ = simde__m256i_to_private(a),
35819       b_ = simde__m256i_to_private(b);
35820 
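    /* Fixed-point multiply with rounding: add 0x4000 (half of 1 << 15)
     * before the arithmetic shift right by 15. */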
35821     SIMDE_VECTORIZE
35822     for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
35823       r_.i16[i] = HEDLEY_STATIC_CAST(int16_t, (((HEDLEY_STATIC_CAST(int32_t, a_.i16[i]) * HEDLEY_STATIC_CAST(int32_t, b_.i16[i])) + 0x4000) >> 15));
35824     }
35825 
35826     return simde__m256i_from_private(r_);
35827   #endif
35828 }
35829 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
35830 #  define _mm256_mulhrs_epi16(a, b) simde_mm256_mulhrs_epi16(a, b)
35831 #endif
35832 
35833 SIMDE_FUNCTION_ATTRIBUTES
35834 simde__m256i
35835 simde_mm256_mullo_epi16 (simde__m256i a, simde__m256i b) {
35836   #if defined(SIMDE_X86_AVX2_NATIVE)
35837     return _mm256_mullo_epi16(a, b);
35838   #else
35839     simde__m256i_private
35840     a_ = simde__m256i_to_private(a),
35841     b_ = simde__m256i_to_private(b),
35842     r_;
35843 
35844     SIMDE_VECTORIZE
35845     for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
35846       r_.i16[i] = HEDLEY_STATIC_CAST(int16_t, a_.i16[i] * b_.i16[i]);
35847     }
35848 
35849     return simde__m256i_from_private(r_);
35850   #endif
35851 }
35852 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
35853   #undef _mm256_mullo_epi16
35854   #define _mm256_mullo_epi16(a, b) simde_mm256_mullo_epi16(a, b)
35855 #endif
35856 
35857 SIMDE_FUNCTION_ATTRIBUTES
35858 simde__m256i
35859 simde_mm256_mullo_epi32 (simde__m256i a, simde__m256i b) {
35860   #if defined(SIMDE_X86_AVX2_NATIVE)
35861     return _mm256_mullo_epi32(a, b);
35862   #else
35863     simde__m256i_private
35864     a_ = simde__m256i_to_private(a),
35865     b_ = simde__m256i_to_private(b),
35866     r_;
35867 
35868     SIMDE_VECTORIZE
35869     for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
35870       r_.i32[i] = HEDLEY_STATIC_CAST(int32_t, a_.i32[i] * b_.i32[i]);
35871     }
35872 
35873     return simde__m256i_from_private(r_);
35874   #endif
35875 }
35876 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
35877   #undef _mm256_mullo_epi32
35878   #define _mm256_mullo_epi32(a, b) simde_mm256_mullo_epi32(a, b)
35879 #endif
35880 
35881 SIMDE_FUNCTION_ATTRIBUTES
35882 simde__m256i
35883 simde_x_mm256_mullo_epu32 (simde__m256i a, simde__m256i b) {
35884   simde__m256i_private
35885     r_,
35886     a_ = simde__m256i_to_private(a),
35887     b_ = simde__m256i_to_private(b);
35888 
35889     #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
35890       r_.u32 = a_.u32 * b_.u32;
35891     #else
35892       SIMDE_VECTORIZE
35893       for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
35894         r_.u32[i] = a_.u32[i] * b_.u32[i];
35895       }
35896     #endif
35897 
35898   return simde__m256i_from_private(r_);
35899 }
35900 
35901 SIMDE_FUNCTION_ATTRIBUTES
35902 simde__m256i
35903 simde_mm256_or_si256 (simde__m256i a, simde__m256i b) {
35904   #if defined(SIMDE_X86_AVX2_NATIVE)
35905     return _mm256_or_si256(a, b);
35906   #else
35907     simde__m256i_private
35908       r_,
35909       a_ = simde__m256i_to_private(a),
35910       b_ = simde__m256i_to_private(b);
35911 
35912     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
35913       r_.m128i[0] = simde_mm_or_si128(a_.m128i[0], b_.m128i[0]);
35914       r_.m128i[1] = simde_mm_or_si128(a_.m128i[1], b_.m128i[1]);
35915     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
35916       r_.i32f = a_.i32f | b_.i32f;
35917     #else
35918       SIMDE_VECTORIZE
35919       for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
35920         r_.i32f[i] = a_.i32f[i] | b_.i32f[i];
35921       }
35922     #endif
35923 
35924     return simde__m256i_from_private(r_);
35925   #endif
35926 }
35927 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
35928   #undef _mm256_or_si256
35929   #define _mm256_or_si256(a, b) simde_mm256_or_si256(a, b)
35930 #endif
35931 
35932 SIMDE_FUNCTION_ATTRIBUTES
35933 simde__m256i
35934 simde_mm256_packs_epi16 (simde__m256i a, simde__m256i b) {
35935   #if defined(SIMDE_X86_AVX2_NATIVE)
35936     return _mm256_packs_epi16(a, b);
35937   #else
35938     simde__m256i_private
35939       r_,
35940       a_ = simde__m256i_to_private(a),
35941       b_ = simde__m256i_to_private(b);
35942 
35943     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
35944       r_.m128i[0] = simde_mm_packs_epi16(a_.m128i[0], b_.m128i[0]);
35945       r_.m128i[1] = simde_mm_packs_epi16(a_.m128i[1], b_.m128i[1]);
35946     #else
35947       const size_t halfway_point = (sizeof(r_.i8) / sizeof(r_.i8[0]))/2;
35948       const size_t quarter_point = (sizeof(r_.i8) / sizeof(r_.i8[0]))/4;
35949       SIMDE_VECTORIZE
35950       for (size_t i = 0 ; i < quarter_point ; i++) {
35951         r_.i8[i]     = (a_.i16[i] > INT8_MAX) ? INT8_MAX : ((a_.i16[i] < INT8_MIN) ? INT8_MIN : HEDLEY_STATIC_CAST(int8_t, a_.i16[i]));
35952         r_.i8[i + quarter_point] = (b_.i16[i] > INT8_MAX) ? INT8_MAX : ((b_.i16[i] < INT8_MIN) ? INT8_MIN : HEDLEY_STATIC_CAST(int8_t, b_.i16[i]));
35953         r_.i8[halfway_point + i]     = (a_.i16[quarter_point + i] > INT8_MAX) ? INT8_MAX : ((a_.i16[quarter_point + i] < INT8_MIN) ? INT8_MIN : HEDLEY_STATIC_CAST(int8_t, a_.i16[quarter_point + i]));
35954         r_.i8[halfway_point + i + quarter_point] = (b_.i16[quarter_point + i] > INT8_MAX) ? INT8_MAX : ((b_.i16[quarter_point + i] < INT8_MIN) ? INT8_MIN : HEDLEY_STATIC_CAST(int8_t, b_.i16[quarter_point + i]));
35955       }
35956     #endif
35957 
35958     return simde__m256i_from_private(r_);
35959   #endif
35960 }
35961 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
35962   #undef _mm256_packs_epi16
35963   #define _mm256_packs_epi16(a, b) simde_mm256_packs_epi16(a, b)
35964 #endif
35965 
35966 SIMDE_FUNCTION_ATTRIBUTES
35967 simde__m256i
35968 simde_mm256_packs_epi32 (simde__m256i a, simde__m256i b) {
35969   #if defined(SIMDE_X86_AVX2_NATIVE)
35970     return _mm256_packs_epi32(a, b);
35971   #else
35972     simde__m256i_private
35973       r_,
35974       v_[] = {
35975         simde__m256i_to_private(a),
35976         simde__m256i_to_private(b)
35977       };
35978 
35979     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
35980       r_.m128i[0] = simde_mm_packs_epi32(v_[0].m128i[0], v_[1].m128i[0]);
35981       r_.m128i[1] = simde_mm_packs_epi32(v_[0].m128i[1], v_[1].m128i[1]);
35982     #else
35983       SIMDE_VECTORIZE
35984       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
35985         const int32_t v = v_[(i >> 2) & 1].i32[(i & 11) - ((i & 8) >> 1)];
35986         r_.i16[i] = HEDLEY_STATIC_CAST(int16_t, (v > INT16_MAX) ? INT16_MAX : ((v < INT16_MIN) ? INT16_MIN : v));
35987       }
35988     #endif
35989 
35990     return simde__m256i_from_private(r_);
35991   #endif
35992 }
35993 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
35994   #undef _mm256_packs_epi32
35995   #define _mm256_packs_epi32(a, b) simde_mm256_packs_epi32(a, b)
35996 #endif
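
/* The scalar fallback above reproduces the lane-wise ordering of
 * vpackssdw: r = { sat(a[0..3]), sat(b[0..3]), sat(a[4..7]), sat(b[4..7]) }.
 * The index expression "(i & 11) - ((i & 8) >> 1)" encodes that layout;
 * for example, i = 12 selects v_[1].i32[4], i.e. b[4], the first source
 * element of the upper-lane half taken from b. */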
35997 
35998 SIMDE_FUNCTION_ATTRIBUTES
35999 simde__m256i
36000 simde_mm256_packus_epi16 (simde__m256i a, simde__m256i b) {
36001   #if defined(SIMDE_X86_AVX2_NATIVE)
36002     return _mm256_packus_epi16(a, b);
36003   #else
36004     simde__m256i_private
36005       r_,
36006       a_ = simde__m256i_to_private(a),
36007       b_ = simde__m256i_to_private(b);
36008 
36009     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
36010       r_.m128i[0] = simde_mm_packus_epi16(a_.m128i[0], b_.m128i[0]);
36011       r_.m128i[1] = simde_mm_packus_epi16(a_.m128i[1], b_.m128i[1]);
36012     #else
36013       const size_t halfway_point = (sizeof(r_.i8) / sizeof(r_.i8[0])) / 2;
36014       const size_t quarter_point = (sizeof(r_.i8) / sizeof(r_.i8[0])) / 4;
36015       SIMDE_VECTORIZE
36016       for (size_t i = 0 ; i < quarter_point ; i++) {
36017         r_.u8[i] = (a_.i16[i] > UINT8_MAX) ? UINT8_MAX : ((a_.i16[i] < 0) ? UINT8_C(0) : HEDLEY_STATIC_CAST(uint8_t, a_.i16[i]));
36018         r_.u8[i + quarter_point] = (b_.i16[i] > UINT8_MAX) ? UINT8_MAX : ((b_.i16[i] < 0) ? UINT8_C(0) : HEDLEY_STATIC_CAST(uint8_t, b_.i16[i]));
36019         r_.u8[halfway_point + i] = (a_.i16[quarter_point + i] > UINT8_MAX) ? UINT8_MAX : ((a_.i16[quarter_point + i] < 0) ? UINT8_C(0) : HEDLEY_STATIC_CAST(uint8_t, a_.i16[quarter_point + i]));
36020         r_.u8[halfway_point + i + quarter_point] = (b_.i16[quarter_point + i] > UINT8_MAX) ? UINT8_MAX : ((b_.i16[quarter_point + i] < 0) ? UINT8_C(0) : HEDLEY_STATIC_CAST(uint8_t, b_.i16[quarter_point + i]));
36021       }
36022     #endif
36023 
36024     return simde__m256i_from_private(r_);
36025   #endif
36026 }
36027 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
36028   #undef _mm256_packus_epi16
36029   #define _mm256_packus_epi16(a, b) simde_mm256_packus_epi16(a, b)
36030 #endif
36031 
36032 SIMDE_FUNCTION_ATTRIBUTES
36033 simde__m256i
36034 simde_mm256_packus_epi32 (simde__m256i a, simde__m256i b) {
36035   #if defined(SIMDE_X86_AVX2_NATIVE)
36036     return _mm256_packus_epi32(a, b);
36037   #else
36038     simde__m256i_private
36039       r_,
36040       a_ = simde__m256i_to_private(a),
36041       b_ = simde__m256i_to_private(b);
36042 
36043     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
36044       r_.m128i[0] = simde_mm_packus_epi32(a_.m128i[0], b_.m128i[0]);
36045       r_.m128i[1] = simde_mm_packus_epi32(a_.m128i[1], b_.m128i[1]);
36046     #else
36047       const size_t halfway_point = (sizeof(r_.i16) / sizeof(r_.i16[0])) / 2;
36048       const size_t quarter_point = (sizeof(r_.i16) / sizeof(r_.i16[0])) / 4;
36049       SIMDE_VECTORIZE
36050       for (size_t i = 0 ; i < quarter_point ; i++) {
36051         r_.u16[i] = (a_.i32[i] > UINT16_MAX) ? UINT16_MAX : ((a_.i32[i] < 0) ? UINT16_C(0) : HEDLEY_STATIC_CAST(uint16_t, a_.i32[i]));
36052         r_.u16[i + quarter_point] = (b_.i32[i] > UINT16_MAX) ? UINT16_MAX : ((b_.i32[i] < 0) ? UINT16_C(0) : HEDLEY_STATIC_CAST(uint16_t, b_.i32[i]));
36053         r_.u16[halfway_point + i]     = (a_.i32[quarter_point + i] > UINT16_MAX) ? UINT16_MAX : ((a_.i32[quarter_point + i] < 0) ? UINT16_C(0) : HEDLEY_STATIC_CAST(uint16_t, a_.i32[quarter_point + i]));
36054         r_.u16[halfway_point + i + quarter_point] = (b_.i32[quarter_point + i] > UINT16_MAX) ? UINT16_MAX : ((b_.i32[quarter_point + i] < 0) ? UINT16_C(0) : HEDLEY_STATIC_CAST(uint16_t, b_.i32[quarter_point + i]));
36055       }
36056     #endif
36057 
36058     return simde__m256i_from_private(r_);
36059   #endif
36060 }
36061 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
36062   #undef _mm256_packus_epi32
36063   #define _mm256_packus_epi32(a, b) simde_mm256_packus_epi32(a, b)
36064 #endif
36065 
36066 SIMDE_FUNCTION_ATTRIBUTES
36067 simde__m256i
36068 simde_mm256_permute2x128_si256 (simde__m256i a, simde__m256i b, const int imm8)
36069     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
36070   simde__m256i_private
36071     r_,
36072     a_ = simde__m256i_to_private(a),
36073     b_ = simde__m256i_to_private(b);
36074 
36075   r_.m128i_private[0] = (imm8 & 0x08) ? simde__m128i_to_private(simde_mm_setzero_si128()) : ((imm8 & 0x02) ? b_.m128i_private[(imm8     ) & 1] : a_.m128i_private[(imm8     ) & 1]);
36076   r_.m128i_private[1] = (imm8 & 0x80) ? simde__m128i_to_private(simde_mm_setzero_si128()) : ((imm8 & 0x20) ? b_.m128i_private[(imm8 >> 4) & 1] : a_.m128i_private[(imm8 >> 4) & 1]);
36077 
36078   return simde__m256i_from_private(r_);
36079 }
36080 #if defined(SIMDE_X86_AVX2_NATIVE)
36081 #  define simde_mm256_permute2x128_si256(a, b, imm8) _mm256_permute2x128_si256(a, b, imm8)
36082 #endif
36083 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
36084   #undef _mm256_permute2x128_si256
36085   #define _mm256_permute2x128_si256(a, b, imm8) simde_mm256_permute2x128_si256(a, b, imm8)
36086 #endif
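
/* imm8 is interpreted per 128-bit destination lane: the two-bit field in
 * bits [1:0] selects { a.lo, a.hi, b.lo, b.hi } for the low lane and bit 3
 * forces that lane to zero; bits [5:4] and bit 7 do the same for the high
 * lane.  For example, imm8 = 0x21 yields { a.hi, b.lo }, the usual
 * "swap and merge" pattern. */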
36087 
36088 SIMDE_FUNCTION_ATTRIBUTES
36089 simde__m256i
36090 simde_mm256_permute4x64_epi64 (simde__m256i a, const int imm8)
36091     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
36092   simde__m256i_private
36093     r_,
36094     a_ = simde__m256i_to_private(a);
36095 
36096   r_.i64[0] = (imm8 & 0x02) ? a_.i64[((imm8       ) & 1)+2] : a_.i64[(imm8       ) & 1];
36097   r_.i64[1] = (imm8 & 0x08) ? a_.i64[((imm8 >> 2  ) & 1)+2] : a_.i64[(imm8 >> 2  ) & 1];
36098   r_.i64[2] = (imm8 & 0x20) ? a_.i64[((imm8 >> 4  ) & 1)+2] : a_.i64[(imm8 >> 4  ) & 1];
36099   r_.i64[3] = (imm8 & 0x80) ? a_.i64[((imm8 >> 6  ) & 1)+2] : a_.i64[(imm8 >> 6  ) & 1];
36100 
36101   return simde__m256i_from_private(r_);
36102 }
36103 #if defined(SIMDE_X86_AVX2_NATIVE)
36104 #  define simde_mm256_permute4x64_epi64(a, imm8) _mm256_permute4x64_epi64(a, imm8)
36105 #endif
36106 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
36107   #undef _mm256_permute4x64_epi64
36108   #define _mm256_permute4x64_epi64(a, imm8) simde_mm256_permute4x64_epi64(a, imm8)
36109 #endif
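
/* Each destination qword is chosen by a two-bit field of imm8, so
 * imm8 = 0x1B (0b00011011) reverses the four 64-bit elements:
 * r = { a[3], a[2], a[1], a[0] }. */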
36110 
36111 SIMDE_FUNCTION_ATTRIBUTES
36112 simde__m256d
36113 simde_mm256_permute4x64_pd (simde__m256d a, const int imm8)
36114     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
36115   simde__m256d_private
36116     r_,
36117     a_ = simde__m256d_to_private(a);
36118 
36119   r_.f64[0] = (imm8 & 0x02) ? a_.f64[((imm8       ) & 1)+2] : a_.f64[(imm8       ) & 1];
36120   r_.f64[1] = (imm8 & 0x08) ? a_.f64[((imm8 >> 2  ) & 1)+2] : a_.f64[(imm8 >> 2  ) & 1];
36121   r_.f64[2] = (imm8 & 0x20) ? a_.f64[((imm8 >> 4  ) & 1)+2] : a_.f64[(imm8 >> 4  ) & 1];
36122   r_.f64[3] = (imm8 & 0x80) ? a_.f64[((imm8 >> 6  ) & 1)+2] : a_.f64[(imm8 >> 6  ) & 1];
36123 
36124   return simde__m256d_from_private(r_);
36125 }
36126 #if defined(SIMDE_X86_AVX2_NATIVE)
36127 #  define simde_mm256_permute4x64_pd(a, imm8) _mm256_permute4x64_pd(a, imm8)
36128 #endif
36129 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
36130   #undef _mm256_permute4x64_pd
36131   #define _mm256_permute4x64_pd(a, imm8) simde_mm256_permute4x64_pd(a, imm8)
36132 #endif
36133 
36134 SIMDE_FUNCTION_ATTRIBUTES
36135 simde__m256i
36136 simde_mm256_permutevar8x32_epi32 (simde__m256i a, simde__m256i idx) {
36137   #if defined(SIMDE_X86_AVX2_NATIVE)
36138     return _mm256_permutevar8x32_epi32(a, idx);
36139   #else
36140     simde__m256i_private
36141       r_,
36142       a_ = simde__m256i_to_private(a),
36143       idx_ = simde__m256i_to_private(idx);
36144 
36145     SIMDE_VECTORIZE
36146     for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
36147       r_.i32[i] = a_.i32[idx_.i32[i] & 7];
36148     }
36149 
36150     return simde__m256i_from_private(r_);
36151   #endif
36152 }
36153 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
36154   #undef _mm256_permutevar8x32_epi32
36155   #define _mm256_permutevar8x32_epi32(a, idx) simde_mm256_permutevar8x32_epi32(a, idx)
36156 #endif
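
/* Unlike the imm8-based permutes, the control here is a vector: each
 * 32-bit result element is a.i32[idx.i32[i] & 7], so the selection can
 * cross the 128-bit lane boundary. */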
36157 
36158 SIMDE_FUNCTION_ATTRIBUTES
36159 simde__m256
36160 simde_mm256_permutevar8x32_ps (simde__m256 a, simde__m256i idx) {
36161   #if defined(SIMDE_X86_AVX2_NATIVE)
36162     return _mm256_permutevar8x32_ps(a, idx);
36163   #else
36164     simde__m256_private
36165       r_,
36166       a_ = simde__m256_to_private(a);
36167     simde__m256i_private
36168       idx_ = simde__m256i_to_private(idx);
36169 
36170     SIMDE_VECTORIZE
36171     for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
36172       r_.f32[i] = a_.f32[idx_.i32[i] & 7];
36173     }
36174 
36175     return simde__m256_from_private(r_);
36176   #endif
36177 }
36178 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
36179   #undef _mm256_permutevar8x32_ps
36180   #define _mm256_permutevar8x32_ps(a, idx) simde_mm256_permutevar8x32_ps(a, idx)
36181 #endif
36182 
36183 SIMDE_FUNCTION_ATTRIBUTES
36184 simde__m256i
36185 simde_mm256_sad_epu8 (simde__m256i a, simde__m256i b) {
36186   #if defined(SIMDE_X86_AVX2_NATIVE)
36187     return _mm256_sad_epu8(a, b);
36188   #else
36189     simde__m256i_private
36190       r_,
36191       a_ = simde__m256i_to_private(a),
36192       b_ = simde__m256i_to_private(b);
36193 
36194     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
36195       r_.m128i[0] = simde_mm_sad_epu8(a_.m128i[0], b_.m128i[0]);
36196       r_.m128i[1] = simde_mm_sad_epu8(a_.m128i[1], b_.m128i[1]);
36197     #else
36198       for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
36199         uint16_t tmp = 0;
36200         SIMDE_VECTORIZE_REDUCTION(+:tmp)
36201         for (size_t j = 0 ; j < ((sizeof(r_.u8) / sizeof(r_.u8[0])) / 4) ; j++) {
36202           const size_t e = j + (i * 8);
36203           tmp += (a_.u8[e] > b_.u8[e]) ? (a_.u8[e] - b_.u8[e]) : (b_.u8[e] - a_.u8[e]);
36204         }
36205         r_.i64[i] = tmp;
36206       }
36207     #endif
36208 
36209     return simde__m256i_from_private(r_);
36210   #endif
36211 }
36212 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
36213   #undef _mm256_sad_epu8
36214   #define _mm256_sad_epu8(a, b) simde_mm256_sad_epu8(a, b)
36215 #endif
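
/* vpsadbw produces four 64-bit results, one per 8-byte group: each is the
 * sum of |a.u8[k] - b.u8[k]| over that group.  As a quick sanity check on
 * the fallback, a group of all 0xFF against all 0x00 sums to 8 * 255 = 2040,
 * which still fits the uint16_t accumulator used above. */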
36216 
36217 SIMDE_FUNCTION_ATTRIBUTES
36218 simde__m256i
36219 simde_mm256_shuffle_epi8 (simde__m256i a, simde__m256i b) {
36220   #if defined(SIMDE_X86_AVX2_NATIVE)
36221     return _mm256_shuffle_epi8(a, b);
36222   #else
36223     simde__m256i_private
36224       r_,
36225       a_ = simde__m256i_to_private(a),
36226       b_ = simde__m256i_to_private(b);
36227 
36228     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
36229       r_.m128i[0] = simde_mm_shuffle_epi8(a_.m128i[0], b_.m128i[0]);
36230       r_.m128i[1] = simde_mm_shuffle_epi8(a_.m128i[1], b_.m128i[1]);
36231     #else
36232       SIMDE_VECTORIZE
36233       for (size_t i = 0 ; i < ((sizeof(r_.u8) / sizeof(r_.u8[0])) / 2) ; i++) {
36234         r_.u8[  i   ] = (b_.u8[  i   ] & 0x80) ? 0 : a_.u8[(b_.u8[  i   ] & 0x0f)     ];
36235         r_.u8[i + 16] = (b_.u8[i + 16] & 0x80) ? 0 : a_.u8[(b_.u8[i + 16] & 0x0f) + 16];
36236       }
36237     #endif
36238 
36239     return simde__m256i_from_private(r_);
36240   #endif
36241 }
36242 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
36243   #undef _mm256_shuffle_epi8
36244   #define _mm256_shuffle_epi8(a, b) simde_mm256_shuffle_epi8(a, b)
36245 #endif
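
/* vpshufb shuffles bytes independently within each 128-bit lane: when the
 * control byte has its high bit set the result byte is zero, otherwise its
 * low four bits index a byte from the same lane of a.  That is why the
 * fallback adds 16 to both the source and destination indices for the
 * upper lane. */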
36246 
36247 SIMDE_FUNCTION_ATTRIBUTES
36248 simde__m256i
36249 simde_mm256_shuffle_epi32 (simde__m256i a, const int imm8)
36250     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
36251   simde__m256i_private
36252     r_,
36253     a_ = simde__m256i_to_private(a);
36254 
36255   for (size_t i = 0 ; i < ((sizeof(r_.i32) / sizeof(r_.i32[0])) / 2) ; i++) {
36256     r_.i32[i] = a_.i32[(imm8 >> (i * 2)) & 3];
36257   }
36258   for (size_t i = 0 ; i < ((sizeof(r_.i32) / sizeof(r_.i32[0])) / 2) ; i++) {
36259     r_.i32[i + 4] = a_.i32[((imm8 >> (i * 2)) & 3) + 4];
36260   }
36261 
36262   return simde__m256i_from_private(r_);
36263 }
36264 #if defined(SIMDE_X86_AVX2_NATIVE)
36265 #  define simde_mm256_shuffle_epi32(a, imm8) _mm256_shuffle_epi32(a, imm8)
36266 #elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) && !defined(__PGI)
36267 #  define simde_mm256_shuffle_epi32(a, imm8) \
36268      simde_mm256_set_m128i( \
36269        simde_mm_shuffle_epi32(simde_mm256_extracti128_si256(a, 1), (imm8)), \
36270        simde_mm_shuffle_epi32(simde_mm256_extracti128_si256(a, 0), (imm8)))
36271 #elif defined(SIMDE_SHUFFLE_VECTOR_)
36272 #  define simde_mm256_shuffle_epi32(a, imm8) (__extension__ ({ \
36273       const simde__m256i_private simde__tmp_a_ = simde__m256i_to_private(a); \
36274       simde__m256i_from_private((simde__m256i_private) { .i32 = \
36275           SIMDE_SHUFFLE_VECTOR_(32, 32, \
36276                                 (simde__tmp_a_).i32, \
36277                                 (simde__tmp_a_).i32, \
36278                                 ((imm8)     ) & 3, \
36279                                 ((imm8) >> 2) & 3, \
36280                                 ((imm8) >> 4) & 3, \
36281                                 ((imm8) >> 6) & 3, \
36282                                 (((imm8)     ) & 3) + 4, \
36283                                 (((imm8) >> 2) & 3) + 4, \
36284                                 (((imm8) >> 4) & 3) + 4, \
36285                                 (((imm8) >> 6) & 3) + 4) }); }))
36286 #endif
36287 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
36288   #undef _mm256_shuffle_epi32
36289   #define _mm256_shuffle_epi32(a, imm8) simde_mm256_shuffle_epi32(a, imm8)
36290 #endif
36291 
36292 #if defined(SIMDE_X86_AVX2_NATIVE)
36293 #  define simde_mm256_shufflehi_epi16(a, imm8) _mm256_shufflehi_epi16(a, imm8)
36294 #elif SIMDE_NATURAL_VECTOR_SIZE_LE(128)
36295 #  define simde_mm256_shufflehi_epi16(a, imm8) \
36296      simde_mm256_set_m128i( \
36297        simde_mm_shufflehi_epi16(simde_mm256_extracti128_si256(a, 1), (imm8)), \
36298        simde_mm_shufflehi_epi16(simde_mm256_extracti128_si256(a, 0), (imm8)))
36299 #elif defined(SIMDE_SHUFFLE_VECTOR_)
36300 #  define simde_mm256_shufflehi_epi16(a, imm8) (__extension__ ({ \
36301       const simde__m256i_private simde__tmp_a_ = simde__m256i_to_private(a); \
36302       simde__m256i_from_private((simde__m256i_private) { .i16 = \
36303         SIMDE_SHUFFLE_VECTOR_(16, 32, \
36304           (simde__tmp_a_).i16, \
36305           (simde__tmp_a_).i16, \
36306           0, 1, 2, 3, \
36307           (((imm8)     ) & 3) + 4, \
36308           (((imm8) >> 2) & 3) + 4, \
36309           (((imm8) >> 4) & 3) + 4, \
36310           (((imm8) >> 6) & 3) + 4, \
36311           8, 9, 10, 11, \
36312           ((((imm8)     ) & 3) + 8 + 4), \
36313           ((((imm8) >> 2) & 3) + 8 + 4), \
36314           ((((imm8) >> 4) & 3) + 8 + 4), \
36315           ((((imm8) >> 6) & 3) + 8 + 4) \
36316           ) }); }))
36317 #else
36318 #  define simde_mm256_shufflehi_epi16(a, imm8) \
36319      simde_mm256_set_m128i( \
36320        simde_mm_shufflehi_epi16(simde_mm256_extracti128_si256(a, 1), imm8), \
36321        simde_mm_shufflehi_epi16(simde_mm256_extracti128_si256(a, 0), imm8))
36322 #endif
36323 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
36324   #undef _mm256_shufflehi_epi16
36325   #define _mm256_shufflehi_epi16(a, imm8) simde_mm256_shufflehi_epi16(a, imm8)
36326 #endif
36327 
36328 #if defined(SIMDE_X86_AVX2_NATIVE)
36329 #  define simde_mm256_shufflelo_epi16(a, imm8) _mm256_shufflelo_epi16(a, imm8)
36330 #elif SIMDE_NATURAL_VECTOR_SIZE_LE(128)
36331 #  define simde_mm256_shufflelo_epi16(a, imm8) \
36332      simde_mm256_set_m128i( \
36333        simde_mm_shufflelo_epi16(simde_mm256_extracti128_si256(a, 1), (imm8)), \
36334        simde_mm_shufflelo_epi16(simde_mm256_extracti128_si256(a, 0), (imm8)))
36335 #elif defined(SIMDE_SHUFFLE_VECTOR_)
36336 #  define simde_mm256_shufflelo_epi16(a, imm8) (__extension__ ({ \
36337       const simde__m256i_private simde__tmp_a_ = simde__m256i_to_private(a); \
36338       simde__m256i_from_private((simde__m256i_private) { .i16 = \
36339         SIMDE_SHUFFLE_VECTOR_(16, 32, \
36340           (simde__tmp_a_).i16, \
36341           (simde__tmp_a_).i16, \
36342           (((imm8)     ) & 3), \
36343           (((imm8) >> 2) & 3), \
36344           (((imm8) >> 4) & 3), \
36345           (((imm8) >> 6) & 3), \
36346           4, 5, 6, 7, \
36347           ((((imm8)     ) & 3) + 8), \
36348           ((((imm8) >> 2) & 3) + 8), \
36349           ((((imm8) >> 4) & 3) + 8), \
36350           ((((imm8) >> 6) & 3) + 8), \
36351           12, 13, 14, 15) }); }))
36352 #else
36353 #  define simde_mm256_shufflelo_epi16(a, imm8) \
36354      simde_mm256_set_m128i( \
36355        simde_mm_shufflelo_epi16(simde_mm256_extracti128_si256(a, 1), imm8), \
36356        simde_mm_shufflelo_epi16(simde_mm256_extracti128_si256(a, 0), imm8))
36357 #endif
36358 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
36359   #undef _mm256_shufflelo_epi16
36360   #define _mm256_shufflelo_epi16(a, imm8) simde_mm256_shufflelo_epi16(a, imm8)
36361 #endif
36362 
36363 SIMDE_FUNCTION_ATTRIBUTES
36364 simde__m256i
36365 simde_mm256_sign_epi8 (simde__m256i a, simde__m256i b) {
36366   #if defined(SIMDE_X86_AVX2_NATIVE)
36367     return _mm256_sign_epi8(a, b);
36368   #else
36369     simde__m256i_private
36370       r_,
36371       a_ = simde__m256i_to_private(a),
36372       b_ = simde__m256i_to_private(b);
36373 
36374     SIMDE_VECTORIZE
36375     for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
36376       r_.i8[i] = (b_.i8[i] < INT32_C(0)) ? -a_.i8[i] : ((b_.i8[i] == INT32_C(0)) ? INT8_C(0) : a_.i8[i]); /* b == 0 must produce 0 (vpsignb) */
36377     }
36378 
36379     return simde__m256i_from_private(r_);
36380   #endif
36381 }
36382 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
36383   #undef _mm256_sign_epi8
36384   #define _mm256_sign_epi8(a, b) simde_mm256_sign_epi8(a, b)
36385 #endif
36386 
36387 SIMDE_FUNCTION_ATTRIBUTES
36388 simde__m256i
36389 simde_mm256_sign_epi16 (simde__m256i a, simde__m256i b) {
36390   #if defined(SIMDE_X86_AVX2_NATIVE)
36391     return _mm256_sign_epi16(a, b);
36392   #else
36393     simde__m256i_private
36394       r_,
36395       a_ = simde__m256i_to_private(a),
36396       b_ = simde__m256i_to_private(b);
36397 
36398     SIMDE_VECTORIZE
36399     for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
36400       r_.i16[i] = (b_.i16[i] < INT32_C(0)) ? -a_.i16[i] : ((b_.i16[i] == INT32_C(0)) ? INT16_C(0) : a_.i16[i]); /* b == 0 must produce 0 (vpsignw) */
36401     }
36402 
36403     return simde__m256i_from_private(r_);
36404   #endif
36405 }
36406 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
36407   #undef _mm256_sign_epi16
36408   #define _mm256_sign_epi16(a, b) simde_mm256_sign_epi16(a, b)
36409 #endif
36410 
36411 SIMDE_FUNCTION_ATTRIBUTES
36412 simde__m256i
36413 simde_mm256_sign_epi32(simde__m256i a, simde__m256i b) {
36414   #if defined(SIMDE_X86_AVX2_NATIVE)
36415     return _mm256_sign_epi32(a, b);
36416   #else
36417     simde__m256i_private
36418       r_,
36419       a_ = simde__m256i_to_private(a),
36420       b_ = simde__m256i_to_private(b);
36421 
36422     SIMDE_VECTORIZE
36423     for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
36424       r_.i32[i] = (b_.i32[i] < INT32_C(0)) ? -a_.i32[i] : ((b_.i32[i] == INT32_C(0)) ? INT32_C(0) : a_.i32[i]); /* b == 0 must produce 0 (vpsignd) */
36425     }
36426 
36427     return simde__m256i_from_private(r_);
36428   #endif
36429 }
36430 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
36431   #undef _mm256_sign_epi32
36432   #define _mm256_sign_epi32(a, b) simde_mm256_sign_epi32(a, b)
36433 #endif
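
/* The sign family follows vpsign(b/w/d): negate the element of a when the
 * corresponding element of b is negative, pass it through when b is
 * positive, and produce zero when b is zero -- e.g. sign(5, -1) = -5,
 * sign(5, 0) = 0, sign(5, 7) = 5. */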
36434 
36435 SIMDE_FUNCTION_ATTRIBUTES
36436 simde__m256i
36437 simde_mm256_sll_epi16 (simde__m256i a, simde__m128i count) {
36438   #if defined(SIMDE_X86_AVX2_NATIVE)
36439     return _mm256_sll_epi16(a, count);
36440   #else
36441     simde__m256i_private
36442       r_,
36443       a_ = simde__m256i_to_private(a);
36444 
36445     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
36446       r_.m128i[0] = simde_mm_sll_epi16(a_.m128i[0], count);
36447       r_.m128i[1] = simde_mm_sll_epi16(a_.m128i[1], count);
36448     #else
36449       simde__m128i_private
36450         count_ = simde__m128i_to_private(count);
36451 
36452       uint64_t shift = HEDLEY_STATIC_CAST(uint64_t, count_.i64[0]);
36453       if (shift > 15)
36454         return simde_mm256_setzero_si256();
36455 
36456       #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
36457         r_.i16 = a_.i16 << HEDLEY_STATIC_CAST(int16_t, shift);
36458       #else
36459         SIMDE_VECTORIZE
36460         for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
36461           r_.i16[i] = HEDLEY_STATIC_CAST(int16_t, a_.i16[i] << (shift));
36462         }
36463       #endif
36464     #endif
36465 
36466     return simde__m256i_from_private(r_);
36467   #endif
36468 }
36469 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
36470   #undef _mm256_sll_epi16
36471   #define _mm256_sll_epi16(a, count) simde_mm256_sll_epi16(a, count)
36472 #endif
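
/* For the sll/srl/sra forms the shift amount is the full low 64-bit
 * element of `count`, not just its low bits, so a count register holding
 * 16 (or anything larger) zeroes every 16-bit lane here. */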
36473 
36474 SIMDE_FUNCTION_ATTRIBUTES
36475 simde__m256i
36476 simde_mm256_sll_epi32 (simde__m256i a, simde__m128i count) {
36477   #if defined(SIMDE_X86_AVX2_NATIVE)
36478     return _mm256_sll_epi32(a, count);
36479   #else
36480     simde__m256i_private
36481       r_,
36482       a_ = simde__m256i_to_private(a);
36483 
36484     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
36485       r_.m128i[0] = simde_mm_sll_epi32(a_.m128i[0], count);
36486       r_.m128i[1] = simde_mm_sll_epi32(a_.m128i[1], count);
36487     #else
36488       simde__m128i_private
36489         count_ = simde__m128i_to_private(count);
36490 
36491       uint64_t shift = HEDLEY_STATIC_CAST(uint64_t, count_.i64[0]);
36492       if (shift > 31)
36493         return simde_mm256_setzero_si256();
36494 
36495       #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
36496         r_.i32 = a_.i32 << HEDLEY_STATIC_CAST(int32_t, shift);
36497       #else
36498         SIMDE_VECTORIZE
36499         for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
36500           r_.i32[i] = HEDLEY_STATIC_CAST(int32_t, a_.i32[i] << (shift));
36501         }
36502       #endif
36503     #endif
36504 
36505     return simde__m256i_from_private(r_);
36506   #endif
36507 }
36508 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
36509   #undef _mm256_sll_epi32
36510   #define _mm256_sll_epi32(a, count) simde_mm256_sll_epi32(a, count)
36511 #endif
36512 
36513 SIMDE_FUNCTION_ATTRIBUTES
36514 simde__m256i
36515 simde_mm256_sll_epi64 (simde__m256i a, simde__m128i count) {
36516   #if defined(SIMDE_X86_AVX2_NATIVE)
36517     return _mm256_sll_epi64(a, count);
36518   #else
36519     simde__m256i_private
36520       r_,
36521       a_ = simde__m256i_to_private(a);
36522 
36523     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
36524       r_.m128i[0] = simde_mm_sll_epi64(a_.m128i[0], count);
36525       r_.m128i[1] = simde_mm_sll_epi64(a_.m128i[1], count);
36526     #else
36527       simde__m128i_private
36528         count_ = simde__m128i_to_private(count);
36529 
36530       uint64_t shift = HEDLEY_STATIC_CAST(uint64_t, count_.i64[0]);
36531       if (shift > 63)
36532         return simde_mm256_setzero_si256();
36533 
36534       #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
36535         r_.i64 = a_.i64 << HEDLEY_STATIC_CAST(int64_t, shift);
36536       #else
36537         SIMDE_VECTORIZE
36538         for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
36539           r_.i64[i] = HEDLEY_STATIC_CAST(int64_t, a_.i64[i] << (shift));
36540         }
36541       #endif
36542     #endif
36543 
36544     return simde__m256i_from_private(r_);
36545   #endif
36546 }
36547 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
36548   #undef _mm256_sll_epi64
36549   #define _mm256_sll_epi64(a, count) simde_mm256_sll_epi64(a, count)
36550 #endif
36551 
36552 SIMDE_FUNCTION_ATTRIBUTES
36553 simde__m256i
36554 simde_mm256_slli_epi16 (simde__m256i a, const int imm8)
36555     SIMDE_REQUIRE_RANGE(imm8, 0, 255) {
36556   /* Note: There is no consistency in how compilers handle values outside of
36557      the expected range, hence the discrepancy between what we allow and what
36558      Intel specifies.  Some compilers will return 0, others seem to just mask
36559      off everything outside of the range. */
36560   simde__m256i_private
36561     r_,
36562     a_ = simde__m256i_to_private(a);
36563 
36564   #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
36565     SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) sv = vec_splats(HEDLEY_STATIC_CAST(unsigned short, imm8));
36566     for (size_t i = 0 ; i < (sizeof(a_.altivec_i16) / sizeof(a_.altivec_i16[0])) ; i++) {
36567       r_.altivec_i16[i] = vec_sl(a_.altivec_i16[i], sv);
36568     }
36569   #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
36570     r_.i16 = a_.i16 << HEDLEY_STATIC_CAST(int16_t, imm8);
36571   #else
36572     SIMDE_VECTORIZE
36573     for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
36574       r_.i16[i] = HEDLEY_STATIC_CAST(int16_t, a_.i16[i] << (imm8 & 0xff));
36575     }
36576   #endif
36577 
36578   return simde__m256i_from_private(r_);
36579 }
36580 #if defined(SIMDE_X86_AVX2_NATIVE)
36581 #  define simde_mm256_slli_epi16(a, imm8) _mm256_slli_epi16(a, imm8)
36582 #elif SIMDE_NATURAL_VECTOR_SIZE_LE(128)
36583 #  define simde_mm256_slli_epi16(a, imm8) \
36584      simde_mm256_set_m128i( \
36585          simde_mm_slli_epi16(simde_mm256_extracti128_si256(a, 1), (imm8)), \
36586          simde_mm_slli_epi16(simde_mm256_extracti128_si256(a, 0), (imm8)))
36587 #endif
36588 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
36589   #undef _mm256_slli_epi16
36590   #define _mm256_slli_epi16(a, imm8) simde_mm256_slli_epi16(a, imm8)
36591 #endif
36592 
36593 SIMDE_FUNCTION_ATTRIBUTES
36594 simde__m256i
36595 simde_mm256_slli_epi32 (simde__m256i a, const int imm8)
36596     SIMDE_REQUIRE_RANGE(imm8, 0, 255) {
36597   simde__m256i_private
36598     r_,
36599     a_ = simde__m256i_to_private(a);
36600 
36601   #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
36602     SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) sv = vec_splats(HEDLEY_STATIC_CAST(unsigned int, imm8));
36603     for (size_t i = 0 ; i < (sizeof(a_.altivec_i32) / sizeof(a_.altivec_i32[0])) ; i++) {
36604       r_.altivec_i32[i] = vec_sl(a_.altivec_i32[i], sv);
36605     }
36606   #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
36607     r_.i32 = a_.i32 << HEDLEY_STATIC_CAST(int32_t, imm8);
36608   #else
36609     SIMDE_VECTORIZE
36610     for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
36611       r_.i32[i] = a_.i32[i] << (imm8 & 0xff);
36612     }
36613   #endif
36614 
36615   return simde__m256i_from_private(r_);
36616 }
36617 #if defined(SIMDE_X86_AVX2_NATIVE)
36618 #  define simde_mm256_slli_epi32(a, imm8) _mm256_slli_epi32(a, imm8)
36619 #elif SIMDE_NATURAL_VECTOR_SIZE_LE(128)
36620 #  define simde_mm256_slli_epi32(a, imm8) \
36621      simde_mm256_set_m128i( \
36622          simde_mm_slli_epi32(simde_mm256_extracti128_si256(a, 1), (imm8)), \
36623          simde_mm_slli_epi32(simde_mm256_extracti128_si256(a, 0), (imm8)))
36624 #endif
36625 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
36626   #undef _mm256_slli_epi32
36627   #define _mm256_slli_epi32(a, imm8) simde_mm256_slli_epi32(a, imm8)
36628 #endif
36629 
36630 SIMDE_FUNCTION_ATTRIBUTES
36631 simde__m256i
36632 simde_mm256_slli_epi64 (simde__m256i a, const int imm8)
36633     SIMDE_REQUIRE_RANGE(imm8, 0, 255) {
36634   simde__m256i_private
36635     r_,
36636     a_ = simde__m256i_to_private(a);
36637 
36638 #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
36639   r_.i64 = a_.i64 << HEDLEY_STATIC_CAST(int64_t, imm8);
36640 #else
36641   SIMDE_VECTORIZE
36642   for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
36643     r_.i64[i] = a_.i64[i] << (imm8 & 0xff);
36644   }
36645 #endif
36646 
36647   return simde__m256i_from_private(r_);
36648 }
36649 #if defined(SIMDE_X86_AVX2_NATIVE)
36650 #  define simde_mm256_slli_epi64(a, imm8) _mm256_slli_epi64(a, imm8)
36651 #elif SIMDE_NATURAL_VECTOR_SIZE_LE(128)
36652 #  define simde_mm256_slli_epi64(a, imm8) \
36653      simde_mm256_set_m128i( \
36654          simde_mm_slli_epi64(simde_mm256_extracti128_si256(a, 1), (imm8)), \
36655          simde_mm_slli_epi64(simde_mm256_extracti128_si256(a, 0), (imm8)))
36656 #endif
36657 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
36658   #undef _mm256_slli_epi64
36659   #define _mm256_slli_epi64(a, imm8) simde_mm256_slli_epi64(a, imm8)
36660 #endif
36661 
36662 SIMDE_FUNCTION_ATTRIBUTES
36663 simde__m256i
36664 simde_mm256_slli_si256 (simde__m256i a, const int imm8)
36665     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
36666   simde__m256i_private
36667     r_,
36668     a_ = simde__m256i_to_private(a);
36669 
36670   for (size_t h = 0 ; h < (sizeof(r_.m128i_private) / sizeof(r_.m128i_private[0])) ; h++) {
36671     SIMDE_VECTORIZE
36672     for (size_t i = 0 ; i < (sizeof(r_.m128i_private[h].i8) / sizeof(r_.m128i_private[h].i8[0])) ; i++) {
36673       const int e = HEDLEY_STATIC_CAST(int, i) - imm8;
36674       r_.m128i_private[h].i8[i] = (e >= 0) ? a_.m128i_private[h].i8[e] : 0;
36675     }
36676   }
36677 
36678   return simde__m256i_from_private(r_);
36679 }
36680 #if defined(SIMDE_X86_AVX2_NATIVE)
36681 #  define simde_mm256_slli_si256(a, imm8) _mm256_slli_si256(a, imm8)
36682 #elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) && !defined(__PGI)
36683 #  define simde_mm256_slli_si256(a, imm8) \
36684      simde_mm256_set_m128i( \
36685          simde_mm_slli_si128(simde_mm256_extracti128_si256(a, 1), (imm8)), \
36686          simde_mm_slli_si128(simde_mm256_extracti128_si256(a, 0), (imm8)))
36687 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
36688 #  define simde_mm256_slli_si256(a, imm8) \
36689      simde_mm256_set_m128i( \
36690        simde_mm_bslli_si128(simde_mm256_extracti128_si256(a, 1), (imm8)), \
36691        simde_mm_bslli_si128(simde_mm256_extracti128_si256(a, 0), (imm8)))
36692 #endif
36693 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
36694   #undef _mm256_slli_si256
36695   #define _mm256_slli_si256(a, imm8) simde_mm256_slli_si256(a, imm8)
36696 #endif
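
/* slli_si256 shifts whole bytes, independently in each 128-bit lane:
 * destination byte i takes source byte i - imm8 and bytes shifted in from
 * the right are zero, so imm8 >= 16 clears both lanes. */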
36697 
36698 SIMDE_FUNCTION_ATTRIBUTES
36699 simde__m128i
36700 simde_mm_sllv_epi32 (simde__m128i a, simde__m128i b) {
36701   simde__m128i_private
36702     a_ = simde__m128i_to_private(a),
36703     b_ = simde__m128i_to_private(b),
36704     r_;
36705 
36706   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
36707     r_.neon_u32 = vshlq_u32(a_.neon_u32, vreinterpretq_s32_u32(b_.neon_u32));
36708     r_.neon_u32 = vandq_u32(r_.neon_u32, vcltq_u32(b_.neon_u32, vdupq_n_u32(32)));
36709   #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
36710     r_.u32 = HEDLEY_STATIC_CAST(__typeof__(r_.u32), (b_.u32 < 32) & (a_.u32 << b_.u32));
36711   #else
36712     SIMDE_VECTORIZE
36713     for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
36714       r_.u32[i] = (b_.u32[i] < 32) ? (a_.u32[i] << b_.u32[i]) : 0;
36715     }
36716   #endif
36717 
36718   return simde__m128i_from_private(r_);
36719 }
36720 #if defined(SIMDE_X86_AVX2_NATIVE)
36721   #define simde_mm_sllv_epi32(a, b) _mm_sllv_epi32(a, b)
36722 #endif
36723 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
36724   #undef _mm_sllv_epi32
36725   #define _mm_sllv_epi32(a, b) simde_mm_sllv_epi32(a, b)
36726 #endif
36727 
36728 SIMDE_FUNCTION_ATTRIBUTES
36729 simde__m256i
36730 simde_mm256_sllv_epi32 (simde__m256i a, simde__m256i b) {
36731   simde__m256i_private
36732     a_ = simde__m256i_to_private(a),
36733     b_ = simde__m256i_to_private(b),
36734     r_;
36735 
36736   #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
36737     r_.m128i[0] = simde_mm_sllv_epi32(a_.m128i[0], b_.m128i[0]);
36738     r_.m128i[1] = simde_mm_sllv_epi32(a_.m128i[1], b_.m128i[1]);
36739   #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
36740     r_.u32 = HEDLEY_STATIC_CAST(__typeof__(r_.u32), (b_.u32 < 32) & (a_.u32 << b_.u32));
36741   #else
36742     SIMDE_VECTORIZE
36743     for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
36744       r_.u32[i] = (b_.u32[i] < 32) ? (a_.u32[i] << b_.u32[i]) : 0;
36745     }
36746   #endif
36747 
36748   return simde__m256i_from_private(r_);
36749 }
36750 #if defined(SIMDE_X86_AVX2_NATIVE)
36751   #define simde_mm256_sllv_epi32(a, b) _mm256_sllv_epi32(a, b)
36752 #endif
36753 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
36754   #undef _mm256_sllv_epi32
36755   #define _mm256_sllv_epi32(a, b) simde_mm256_sllv_epi32(a, b)
36756 #endif
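
/* The variable-shift forms differ from sll/slli in that every element
 * carries its own count, and a count of 32 or more zeroes that element
 * only.  A rough usage sketch (values chosen purely for illustration):
 *
 *   simde__m256i x = simde_mm256_set1_epi32(1);
 *   simde__m256i counts = simde_mm256_set_epi32(40, 32, 31, 4, 3, 2, 1, 0);
 *   simde__m256i shifted = simde_mm256_sllv_epi32(x, counts);
 *   // lanes shifted by 32 and 40 become zero; the rest hold 1 << count.
 */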
36757 
36758 SIMDE_FUNCTION_ATTRIBUTES
36759 simde__m128i
36760 simde_mm_sllv_epi64 (simde__m128i a, simde__m128i b) {
36761   simde__m128i_private
36762     a_ = simde__m128i_to_private(a),
36763     b_ = simde__m128i_to_private(b),
36764     r_;
36765 
36766   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
36767     r_.neon_u64 = vshlq_u64(a_.neon_u64, vreinterpretq_s64_u64(b_.neon_u64));
36768     r_.neon_u64 = vandq_u64(r_.neon_u64, vcltq_u64(b_.neon_u64, vdupq_n_u64(64)));
36769   #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
36770     r_.u64 = HEDLEY_STATIC_CAST(__typeof__(r_.u64), (b_.u64 < 64) & (a_.u64 << b_.u64));
36771   #else
36772     SIMDE_VECTORIZE
36773     for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) {
36774       r_.u64[i] = (b_.u64[i] < 64) ? (a_.u64[i] << b_.u64[i]) : 0;
36775     }
36776   #endif
36777 
36778   return simde__m128i_from_private(r_);
36779 }
36780 #if defined(SIMDE_X86_AVX2_NATIVE)
36781   #define simde_mm_sllv_epi64(a, b) _mm_sllv_epi64(a, b)
36782 #endif
36783 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
36784   #undef _mm_sllv_epi64
36785   #define _mm_sllv_epi64(a, b) simde_mm_sllv_epi64(a, b)
36786 #endif
36787 
36788 SIMDE_FUNCTION_ATTRIBUTES
36789 simde__m256i
36790 simde_mm256_sllv_epi64 (simde__m256i a, simde__m256i b) {
36791   simde__m256i_private
36792     a_ = simde__m256i_to_private(a),
36793     b_ = simde__m256i_to_private(b),
36794     r_;
36795 
36796   #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
36797     r_.m128i[0] = simde_mm_sllv_epi64(a_.m128i[0], b_.m128i[0]);
36798     r_.m128i[1] = simde_mm_sllv_epi64(a_.m128i[1], b_.m128i[1]);
36799   #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
36800     r_.u64 = HEDLEY_STATIC_CAST(__typeof__(r_.u64), (b_.u64 < 64) & (a_.u64 << b_.u64));
36801   #else
36802     SIMDE_VECTORIZE
36803     for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) {
36804       r_.u64[i] = (b_.u64[i] < 64) ? (a_.u64[i] << b_.u64[i]) : 0;
36805     }
36806   #endif
36807 
36808   return simde__m256i_from_private(r_);
36809 }
36810 #if defined(SIMDE_X86_AVX2_NATIVE)
36811   #define simde_mm256_sllv_epi64(a, b) _mm256_sllv_epi64(a, b)
36812 #endif
36813 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
36814   #undef _mm256_sllv_epi64
36815   #define _mm256_sllv_epi64(a, b) simde_mm256_sllv_epi64(a, b)
36816 #endif
36817 
36818 SIMDE_FUNCTION_ATTRIBUTES
36819 simde__m256i
36820 simde_mm256_sra_epi16 (simde__m256i a, simde__m128i count) {
36821   #if defined(SIMDE_X86_AVX2_NATIVE)
36822     return _mm256_sra_epi16(a, count);
36823   #else
36824     simde__m256i_private
36825       r_,
36826       a_ = simde__m256i_to_private(a);
36827 
36828     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
36829       r_.m128i[0] = simde_mm_sra_epi16(a_.m128i[0], count);
36830       r_.m128i[1] = simde_mm_sra_epi16(a_.m128i[1], count);
36831     #else
36832       simde__m128i_private
36833         count_ = simde__m128i_to_private(count);
36834 
36835       uint64_t shift = HEDLEY_STATIC_CAST(uint64_t, count_.i64[0]);
36836 
36837       if (shift > 15) shift = 15;
36838 
36839       #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
36840         r_.i16 = a_.i16 >> HEDLEY_STATIC_CAST(int16_t, shift);
36841       #else
36842         SIMDE_VECTORIZE
36843         for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
36844           r_.i16[i] = a_.i16[i] >> shift;
36845         }
36846       #endif
36847     #endif
36848 
36849     return simde__m256i_from_private(r_);
36850   #endif
36851 }
36852 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
36853   #undef _mm256_sra_epi16
36854   #define _mm256_sra_epi16(a, count) simde_mm256_sra_epi16(a, count)
36855 #endif
36856 
36857 SIMDE_FUNCTION_ATTRIBUTES
36858 simde__m256i
36859 simde_mm256_sra_epi32 (simde__m256i a, simde__m128i count) {
36860   #if defined(SIMDE_X86_AVX2_NATIVE)
36861     return _mm256_sra_epi32(a, count);
36862   #else
36863     simde__m256i_private
36864       r_,
36865       a_ = simde__m256i_to_private(a);
36866 
36867     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
36868       r_.m128i[0] = simde_mm_sra_epi32(a_.m128i[0], count);
36869       r_.m128i[1] = simde_mm_sra_epi32(a_.m128i[1], count);
36870     #else
36871       simde__m128i_private
36872         count_ = simde__m128i_to_private(count);
36873       uint64_t shift = HEDLEY_STATIC_CAST(uint64_t, count_.i64[0]);
36874 
36875       if (shift > 31) shift = 31;
36876 
36877       #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
36878         r_.i32 = a_.i32 >> HEDLEY_STATIC_CAST(int16_t, shift);
36879       #else
36880         SIMDE_VECTORIZE
36881         for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
36882           r_.i32[i] = a_.i32[i] >> shift;
36883         }
36884       #endif
36885     #endif
36886 
36887     return simde__m256i_from_private(r_);
36888   #endif
36889 }
36890 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
36891   #undef _mm256_sra_epi32
36892   #define _mm256_sra_epi32(a, count) simde_mm256_sra_epi32(a, count)
36893 #endif
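
/* sra/srai are arithmetic shifts: counts above the element width are
 * clamped to width - 1 rather than zeroing, so every result bit becomes a
 * copy of the sign bit.  For example, an int16_t of -2 shifted by any
 * count >= 15 yields -1, while a non-negative input yields 0. */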
36894 
36895 SIMDE_FUNCTION_ATTRIBUTES
36896 simde__m256i
36897 simde_mm256_srai_epi16 (simde__m256i a, const int imm8)
36898     SIMDE_REQUIRE_RANGE(imm8, 0, 255) {
36899   simde__m256i_private
36900     r_,
36901     a_ = simde__m256i_to_private(a);
36902   unsigned int shift = HEDLEY_STATIC_CAST(unsigned int, imm8);
36903 
36904   if (shift > 15) shift = 15;
36905 
36906   #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
36907     r_.i16 = a_.i16 >> HEDLEY_STATIC_CAST(int16_t, shift);
36908   #else
36909     SIMDE_VECTORIZE
36910     for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
36911       r_.i16[i] = a_.i16[i] >> shift;
36912     }
36913   #endif
36914 
36915   return simde__m256i_from_private(r_);
36916 }
36917 #if defined(SIMDE_X86_AVX2_NATIVE)
36918 #  define simde_mm256_srai_epi16(a, imm8) _mm256_srai_epi16(a, imm8)
36919 #elif SIMDE_NATURAL_VECTOR_SIZE_LE(128)
36920 #  define simde_mm256_srai_epi16(a, imm8) \
36921      simde_mm256_set_m128i( \
36922          simde_mm_srai_epi16(simde_mm256_extracti128_si256(a, 1), (imm8)), \
36923          simde_mm_srai_epi16(simde_mm256_extracti128_si256(a, 0), (imm8)))
36924 #endif
36925 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
36926   #undef _mm256_srai_epi16
36927   #define _mm256_srai_epi16(a, imm8) simde_mm256_srai_epi16(a, imm8)
36928 #endif
36929 
36930 SIMDE_FUNCTION_ATTRIBUTES
36931 simde__m256i
36932 simde_mm256_srai_epi32 (simde__m256i a, const int imm8)
36933     SIMDE_REQUIRE_RANGE(imm8, 0, 255) {
36934   simde__m256i_private
36935     r_,
36936     a_ = simde__m256i_to_private(a);
36937   unsigned int shift = HEDLEY_STATIC_CAST(unsigned int, imm8);
36938 
36939   if (shift > 31) shift = 31;
36940 
36941   #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
36942     r_.i32 = a_.i32 >> HEDLEY_STATIC_CAST(int16_t, shift);
36943   #else
36944     SIMDE_VECTORIZE
36945     for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
36946       r_.i32[i] = a_.i32[i] >> shift;
36947     }
36948   #endif
36949 
36950   return simde__m256i_from_private(r_);
36951 }
36952 #if defined(SIMDE_X86_AVX2_NATIVE)
36953 #  define simde_mm256_srai_epi32(a, imm8) _mm256_srai_epi32(a, imm8)
36954 #elif SIMDE_NATURAL_VECTOR_SIZE_LE(128)
36955 #  define simde_mm256_srai_epi32(a, imm8) \
36956      simde_mm256_set_m128i( \
36957          simde_mm_srai_epi32(simde_mm256_extracti128_si256(a, 1), (imm8)), \
36958          simde_mm_srai_epi32(simde_mm256_extracti128_si256(a, 0), (imm8)))
36959 #endif
36960 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
36961   #undef _mm256_srai_epi32
36962   #define _mm256_srai_epi32(a, imm8) simde_mm256_srai_epi32(a, imm8)
36963 #endif
36964 
36965 SIMDE_FUNCTION_ATTRIBUTES
36966 simde__m128i
36967 simde_mm_srav_epi32 (simde__m128i a, simde__m128i count) {
36968   #if defined(SIMDE_X86_AVX2_NATIVE)
36969     return _mm_srav_epi32(a, count);
36970   #else
36971     simde__m128i_private
36972       r_,
36973       a_ = simde__m128i_to_private(a),
36974       count_ = simde__m128i_to_private(count);
36975 
36976     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
36977       int32x4_t cnt = vreinterpretq_s32_u32(vminq_u32(count_.neon_u32, vdupq_n_u32(31)));
36978       r_.neon_i32 = vshlq_s32(a_.neon_i32, vnegq_s32(cnt));
36979     #else
36980       SIMDE_VECTORIZE
36981       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
36982         uint32_t shift = HEDLEY_STATIC_CAST(uint32_t, count_.i32[i]);
36983         r_.i32[i] = a_.i32[i] >> HEDLEY_STATIC_CAST(int, shift > 31 ? 31 : shift);
36984       }
36985     #endif
36986 
36987     return simde__m128i_from_private(r_);
36988   #endif
36989 }
36990 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
36991   #undef _mm_srav_epi32
36992   #define _mm_srav_epi32(a, count) simde_mm_srav_epi32(a, count)
36993 #endif
36994 
36995 SIMDE_FUNCTION_ATTRIBUTES
36996 simde__m256i
36997 simde_mm256_srav_epi32 (simde__m256i a, simde__m256i count) {
36998   #if defined(SIMDE_X86_AVX2_NATIVE)
36999     return _mm256_srav_epi32(a, count);
37000   #else
37001     simde__m256i_private
37002       r_,
37003       a_ = simde__m256i_to_private(a),
37004       count_ = simde__m256i_to_private(count);
37005 
37006     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
37007       r_.m128i[0] = simde_mm_srav_epi32(a_.m128i[0], count_.m128i[0]);
37008       r_.m128i[1] = simde_mm_srav_epi32(a_.m128i[1], count_.m128i[1]);
37009     #else
37010       SIMDE_VECTORIZE
37011       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
37012         uint32_t shift = HEDLEY_STATIC_CAST(uint32_t, count_.i32[i]);
37013         if (shift > 31) shift = 31;
37014         r_.i32[i] = a_.i32[i] >> shift;
37015       }
37016     #endif
37017 
37018     return simde__m256i_from_private(r_);
37019   #endif
37020 }
37021 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
37022   #undef _mm256_srav_epi32
37023   #define _mm256_srav_epi32(a, count) simde_mm256_srav_epi32(a, count)
37024 #endif
37025 
37026 SIMDE_FUNCTION_ATTRIBUTES
37027 simde__m256i
37028 simde_mm256_srl_epi16 (simde__m256i a, simde__m128i count) {
37029   #if defined(SIMDE_X86_AVX2_NATIVE)
37030     return _mm256_srl_epi16(a, count);
37031   #else
37032     simde__m256i_private
37033       r_,
37034       a_ = simde__m256i_to_private(a);
37035 
37036     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
37037       r_.m128i[0] = simde_mm_srl_epi16(a_.m128i[0], count);
37038       r_.m128i[1] = simde_mm_srl_epi16(a_.m128i[1], count);
37039     #else
37040       simde__m128i_private
37041         count_ = simde__m128i_to_private(count);
37042 
37043       uint64_t shift = HEDLEY_STATIC_CAST(uint64_t , (count_.i64[0] > 16 ? 16 : count_.i64[0]));
37044 
37045       #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
37046         r_.u16 = a_.u16 >> SIMDE_CAST_VECTOR_SHIFT_COUNT(16, shift);
37047       #else
37048         SIMDE_VECTORIZE
37049         for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
37050           r_.u16[i] = a_.u16[i] >> (shift);
37051         }
37052       #endif
37053     #endif
37054 
37055     return simde__m256i_from_private(r_);
37056   #endif
37057 }
37058 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
37059   #undef _mm256_srl_epi16
37060   #define _mm256_srl_epi16(a, count) simde_mm256_srl_epi16(a, count)
37061 #endif
37062 
37063 SIMDE_FUNCTION_ATTRIBUTES
37064 simde__m256i
37065 simde_mm256_srl_epi32 (simde__m256i a, simde__m128i count) {
37066   #if defined(SIMDE_X86_AVX2_NATIVE)
37067     return _mm256_srl_epi32(a, count);
37068   #else
37069     simde__m256i_private
37070       r_,
37071       a_ = simde__m256i_to_private(a);
37072 
37073     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
37074       r_.m128i[0] = simde_mm_srl_epi32(a_.m128i[0], count);
37075       r_.m128i[1] = simde_mm_srl_epi32(a_.m128i[1], count);
37076     #else
37077       simde__m128i_private
37078         count_ = simde__m128i_to_private(count);
37079 
37080       uint64_t shift = HEDLEY_STATIC_CAST(uint64_t , (count_.i64[0] > 32 ? 32 : count_.i64[0]));
37081 
37082       #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
37083         r_.u32 = a_.u32 >> SIMDE_CAST_VECTOR_SHIFT_COUNT(32, shift);
37084       #else
37085         SIMDE_VECTORIZE
37086         for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
37087           r_.u32[i] = a_.u32[i] >> (shift);
37088         }
37089       #endif
37090     #endif
37091 
37092     return simde__m256i_from_private(r_);
37093   #endif
37094 }
37095 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
37096   #undef _mm256_srl_epi32
37097   #define _mm256_srl_epi32(a, count) simde_mm256_srl_epi32(a, count)
37098 #endif
37099 
37100 SIMDE_FUNCTION_ATTRIBUTES
37101 simde__m256i
37102 simde_mm256_srl_epi64 (simde__m256i a, simde__m128i count) {
37103   #if defined(SIMDE_X86_AVX2_NATIVE)
37104     return _mm256_srl_epi64(a, count);
37105   #else
37106     simde__m256i_private
37107       r_,
37108       a_ = simde__m256i_to_private(a);
37109 
37110     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
37111       r_.m128i[0] = simde_mm_srl_epi64(a_.m128i[0], count);
37112       r_.m128i[1] = simde_mm_srl_epi64(a_.m128i[1], count);
37113     #else
37114       simde__m128i_private
37115         count_ = simde__m128i_to_private(count);
37116 
37117       uint64_t shift = HEDLEY_STATIC_CAST(uint64_t , (count_.i64[0] > 64 ? 64 : count_.i64[0]));
37118 
37119       #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
37120         r_.u64 = a_.u64 >> SIMDE_CAST_VECTOR_SHIFT_COUNT(64, shift);
37121       #else
37122         SIMDE_VECTORIZE
37123         for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
37124           r_.u64[i] = a_.u64[i] >> (shift);
37125         }
37126       #endif
37127     #endif
37128 
37129     return simde__m256i_from_private(r_);
37130   #endif
37131 }
37132 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
37133   #undef _mm256_srl_epi64
37134   #define _mm256_srl_epi64(a, count) simde_mm256_srl_epi64(a, count)
37135 #endif
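
/* The logical srl forms clamp the 64-bit count to the element width, so
 * any count greater than or equal to the width is intended to produce
 * zero (e.g. srl_epi32 with a count of 40 clears every 32-bit lane), in
 * contrast to the saturating behaviour of the arithmetic shifts above. */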
37136 
37137 SIMDE_FUNCTION_ATTRIBUTES
37138 simde__m256i
37139 simde_mm256_srli_epi16 (simde__m256i a, const int imm8)
37140     SIMDE_REQUIRE_RANGE(imm8, 0, 255) {
37141   simde__m256i_private
37142     r_,
37143     a_ = simde__m256i_to_private(a);
37144 
37145   if (imm8 > 15)
37146     return simde_mm256_setzero_si256();
37147 
37148   #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
37149     SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) sv = vec_splats(HEDLEY_STATIC_CAST(unsigned short, imm8));
37150     for (size_t i = 0 ; i < (sizeof(a_.altivec_u16) / sizeof(a_.altivec_u16[0])) ; i++) {
37151       r_.altivec_u16[i] = vec_sr(a_.altivec_u16[i], sv);
37152     }
37153   #else
37154     if (HEDLEY_STATIC_CAST(unsigned int, imm8) > 15) {
37155       simde_memset(&r_, 0, sizeof(r_));
37156     } else {
37157       #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
37158         r_.u16 = a_.u16 >> SIMDE_CAST_VECTOR_SHIFT_COUNT(16, imm8);
37159       #else
37160         SIMDE_VECTORIZE
37161         for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
37162           r_.u16[i] = a_.u16[i] >> imm8;
37163         }
37164       #endif
37165     }
37166   #endif
37167 
37168   return simde__m256i_from_private(r_);
37169 }
37170 #if defined(SIMDE_X86_AVX2_NATIVE)
37171 #  define simde_mm256_srli_epi16(a, imm8) _mm256_srli_epi16(a, imm8)
37172 #elif SIMDE_NATURAL_VECTOR_SIZE_LE(128)
37173 #  define simde_mm256_srli_epi16(a, imm8) \
37174      simde_mm256_set_m128i( \
37175          simde_mm_srli_epi16(simde_mm256_extracti128_si256(a, 1), (imm8)), \
37176          simde_mm_srli_epi16(simde_mm256_extracti128_si256(a, 0), (imm8)))
37177 #endif
37178 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
37179   #undef _mm256_srli_epi16
37180   #define _mm256_srli_epi16(a, imm8) simde_mm256_srli_epi16(a, imm8)
37181 #endif
37182 
37183 SIMDE_FUNCTION_ATTRIBUTES
37184 simde__m256i
37185 simde_mm256_srli_epi32 (simde__m256i a, const int imm8)
37186     SIMDE_REQUIRE_RANGE(imm8, 0, 255) {
37187   simde__m256i_private
37188     r_,
37189     a_ = simde__m256i_to_private(a);
37190 
37191   #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
37192     SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) sv = vec_splats(HEDLEY_STATIC_CAST(unsigned int, imm8));
37193     for (size_t i = 0 ; i < (sizeof(a_.altivec_u32) / sizeof(a_.altivec_u32[0])) ; i++) {
37194       r_.altivec_u32[i] = vec_sr(a_.altivec_u32[i], sv);
37195     }
37196   #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
37197     r_.u32 = a_.u32 >> SIMDE_CAST_VECTOR_SHIFT_COUNT(16, imm8);
37198   #else
37199     SIMDE_VECTORIZE
37200     for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
37201       r_.u32[i] = a_.u32[i] >> imm8;
37202     }
37203   #endif
37204 
37205   return simde__m256i_from_private(r_);
37206 }
37207 #if defined(SIMDE_X86_AVX2_NATIVE)
37208 #  define simde_mm256_srli_epi32(a, imm8) _mm256_srli_epi32(a, imm8)
37209 #elif SIMDE_NATURAL_VECTOR_SIZE_LE(128)
37210 #  define simde_mm256_srli_epi32(a, imm8) \
37211      simde_mm256_set_m128i( \
37212          simde_mm_srli_epi32(simde_mm256_extracti128_si256(a, 1), (imm8)), \
37213          simde_mm_srli_epi32(simde_mm256_extracti128_si256(a, 0), (imm8)))
37214 #endif
37215 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
37216   #undef _mm256_srli_epi32
37217   #define _mm256_srli_epi32(a, imm8) simde_mm256_srli_epi32(a, imm8)
37218 #endif
37219 
37220 SIMDE_FUNCTION_ATTRIBUTES
37221 simde__m256i
37222 simde_mm256_srli_epi64 (simde__m256i a, const int imm8)
37223     SIMDE_REQUIRE_RANGE(imm8, 0, 255) {
37224   simde__m256i_private
37225     r_,
37226     a_ = simde__m256i_to_private(a);
37227 
37228 #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
37229   r_.u64 = a_.u64 >> SIMDE_CAST_VECTOR_SHIFT_COUNT(32, imm8);
37230 #else
37231   SIMDE_VECTORIZE
37232   for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) {
37233     r_.u64[i] = a_.u64[i] >> imm8;
37234   }
37235 #endif
37236 
37237   return simde__m256i_from_private(r_);
37238 }
37239 #if defined(SIMDE_X86_AVX2_NATIVE)
37240 #  define simde_mm256_srli_epi64(a, imm8) _mm256_srli_epi64(a, imm8)
37241 #elif SIMDE_NATURAL_VECTOR_SIZE_LE(128)
37242 #  define simde_mm256_srli_epi64(a, imm8) \
37243      simde_mm256_set_m128i( \
37244          simde_mm_srli_epi64(simde_mm256_extracti128_si256(a, 1), (imm8)), \
37245          simde_mm_srli_epi64(simde_mm256_extracti128_si256(a, 0), (imm8)))
37246 #endif
37247 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
37248   #undef _mm256_srli_epi64
37249   #define _mm256_srli_epi64(a, imm8) simde_mm256_srli_epi64(a, imm8)
37250 #endif
37251 
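/* Note: like the native instruction, this shifts each 128-bit lane of a right
 * by imm8 *bytes* independently; it is not a single 256-bit byte shift.  Bytes
 * shifted past the end of a lane become zero, which is what the (e < 16) guard
 * in the fallback loop implements. */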
37252 SIMDE_FUNCTION_ATTRIBUTES
37253 simde__m256i
37254 simde_mm256_srli_si256 (simde__m256i a, const int imm8)
37255     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
37256   simde__m256i_private
37257     r_,
37258     a_ = simde__m256i_to_private(a);
37259 
37260   for (size_t h = 0 ; h < (sizeof(r_.m128i_private) / sizeof(r_.m128i_private[0])) ; h++) {
37261     SIMDE_VECTORIZE
37262     for (size_t i = 0 ; i < (sizeof(r_.m128i_private[h].i8) / sizeof(r_.m128i_private[h].i8[0])) ; i++) {
37263       const int e = imm8 + HEDLEY_STATIC_CAST(int, i);
37264       r_.m128i_private[h].i8[i] = (e < 16) ? a_.m128i_private[h].i8[e] : 0;
37265     }
37266   }
37267 
37268   return simde__m256i_from_private(r_);
37269 }
37270 #if defined(SIMDE_X86_AVX2_NATIVE)
37271 #  define simde_mm256_srli_si256(a, imm8) _mm256_srli_si256(a, imm8)
37272 #elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) && !defined(__PGI)
37273 #  define simde_mm256_srli_si256(a, imm8) \
37274      simde_mm256_set_m128i( \
37275          simde_mm_srli_si128(simde_mm256_extracti128_si256(a, 1), (imm8)), \
37276          simde_mm_srli_si128(simde_mm256_extracti128_si256(a, 0), (imm8)))
37277 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
37278 #  define simde_mm256_srli_si256(a, imm8) \
37279      simde_mm256_set_m128i( \
37280        simde_mm_bsrli_si128(simde_mm256_extracti128_si256(a, 1), (imm8)), \
37281        simde_mm_bsrli_si128(simde_mm256_extracti128_si256(a, 0), (imm8)))
37282 #endif
37283 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
37284   #undef _mm256_srli_si256
37285   #define _mm256_srli_si256(a, imm8) simde_mm256_srli_si256(a, imm8)
37286 #endif
37287 
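/* Variable shift: each 32-bit element of a is shifted right by the count held
 * in the corresponding element of b.  Counts of 32 or more produce zero, hence
 * the (b_.u32 < 32) mask in the vector path. */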
37288 SIMDE_FUNCTION_ATTRIBUTES
37289 simde__m128i
37290 simde_mm_srlv_epi32 (simde__m128i a, simde__m128i b) {
37291   simde__m128i_private
37292     a_ = simde__m128i_to_private(a),
37293     b_ = simde__m128i_to_private(b),
37294     r_;
37295 
37296   #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
37297     r_.u32 = HEDLEY_STATIC_CAST(__typeof__(r_.u32), (b_.u32 < 32) & (a_.u32 >> b_.u32));
37298   #else
37299     SIMDE_VECTORIZE
37300     for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
37301       r_.u32[i] = (b_.u32[i] < 32) ? (a_.u32[i] >> b_.u32[i]) : 0;
37302     }
37303   #endif
37304 
37305   return simde__m128i_from_private(r_);
37306 }
37307 #if defined(SIMDE_X86_AVX2_NATIVE)
37308   #define simde_mm_srlv_epi32(a, b) _mm_srlv_epi32(a, b)
37309 #endif
37310 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
37311   #undef _mm_srlv_epi32
37312   #define _mm_srlv_epi32(a, b) simde_mm_srlv_epi32(a, b)
37313 #endif
37314 
37315 SIMDE_FUNCTION_ATTRIBUTES
37316 simde__m256i
37317 simde_mm256_srlv_epi32 (simde__m256i a, simde__m256i b) {
37318   simde__m256i_private
37319     a_ = simde__m256i_to_private(a),
37320     b_ = simde__m256i_to_private(b),
37321     r_;
37322 
37323   #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
37324     r_.u32 = HEDLEY_STATIC_CAST(__typeof__(r_.u32), (b_.u32 < 32) & (a_.u32 >> b_.u32));
37325   #else
37326     SIMDE_VECTORIZE
37327     for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
37328       r_.u32[i] = (b_.u32[i] < 32) ? (a_.u32[i] >> b_.u32[i]) : 0;
37329     }
37330   #endif
37331 
37332   return simde__m256i_from_private(r_);
37333 }
37334 #if defined(SIMDE_X86_AVX2_NATIVE)
37335   #define simde_mm256_srlv_epi32(a, b) _mm256_srlv_epi32(a, b)
37336 #endif
37337 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
37338   #undef _mm256_srlv_epi32
37339   #define _mm256_srlv_epi32(a, b) simde_mm256_srlv_epi32(a, b)
37340 #endif
37341 
37342 SIMDE_FUNCTION_ATTRIBUTES
37343 simde__m128i
37344 simde_mm_srlv_epi64 (simde__m128i a, simde__m128i b) {
37345   simde__m128i_private
37346     a_ = simde__m128i_to_private(a),
37347     b_ = simde__m128i_to_private(b),
37348     r_;
37349 
37350   #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
37351     r_.u64 = HEDLEY_STATIC_CAST(__typeof__(r_.u64), (b_.u64 < 64) & (a_.u64 >> b_.u64));
37352   #else
37353     SIMDE_VECTORIZE
37354     for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) {
37355       r_.u64[i] = (b_.u64[i] < 64) ? (a_.u64[i] >> b_.u64[i]) : 0;
37356     }
37357   #endif
37358 
37359   return simde__m128i_from_private(r_);
37360 }
37361 #if defined(SIMDE_X86_AVX2_NATIVE)
37362   #define simde_mm_srlv_epi64(a, b) _mm_srlv_epi64(a, b)
37363 #endif
37364 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
37365   #undef _mm_srlv_epi64
37366   #define _mm_srlv_epi64(a, b) simde_mm_srlv_epi64(a, b)
37367 #endif
37368 
37369 SIMDE_FUNCTION_ATTRIBUTES
37370 simde__m256i
37371 simde_mm256_srlv_epi64 (simde__m256i a, simde__m256i b) {
37372   simde__m256i_private
37373     a_ = simde__m256i_to_private(a),
37374     b_ = simde__m256i_to_private(b),
37375     r_;
37376 
37377   #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
37378     r_.u64 = HEDLEY_STATIC_CAST(__typeof__(r_.u64), (b_.u64 < 64) & (a_.u64 >> b_.u64));
37379   #else
37380     SIMDE_VECTORIZE
37381     for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) {
37382       r_.u64[i] = (b_.u64[i] < 64) ? (a_.u64[i] >> b_.u64[i]) : 0;
37383     }
37384   #endif
37385 
37386   return simde__m256i_from_private(r_);
37387 }
37388 #if defined(SIMDE_X86_AVX2_NATIVE)
37389   #define simde_mm256_srlv_epi64(a, b) _mm256_srlv_epi64(a, b)
37390 #endif
37391 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
37392   #undef _mm256_srlv_epi64
37393   #define _mm256_srlv_epi64(a, b) simde_mm256_srlv_epi64(a, b)
37394 #endif
37395 
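/* The non-temporal hint of the native stream load is purely a cache hint, so
 * the portable fallback is an ordinary aligned load done with simde_memcpy;
 * the loaded value is identical. */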
37396 SIMDE_FUNCTION_ATTRIBUTES
37397 simde__m256i
37398 simde_mm256_stream_load_si256 (const simde__m256i* mem_addr) {
37399   #if defined(SIMDE_X86_AVX2_NATIVE)
37400     return _mm256_stream_load_si256(HEDLEY_CONST_CAST(simde__m256i*, mem_addr));
37401   #else
37402     simde__m256i r;
37403     simde_memcpy(&r, SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m256i), sizeof(r));
37404     return r;
37405   #endif
37406 }
37407 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
37408 #  define _mm256_stream_load_si256(mem_addr) simde_mm256_stream_load_si256(mem_addr)
37409 #endif
37410 
37411 SIMDE_FUNCTION_ATTRIBUTES
37412 simde__m256i
37413 simde_mm256_sub_epi8 (simde__m256i a, simde__m256i b) {
37414   #if defined(SIMDE_X86_AVX2_NATIVE)
37415     return _mm256_sub_epi8(a, b);
37416   #else
37417     simde__m256i_private
37418       r_,
37419       a_ = simde__m256i_to_private(a),
37420       b_ = simde__m256i_to_private(b);
37421 
37422     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
37423       r_.m128i[0] = simde_mm_sub_epi8(a_.m128i[0], b_.m128i[0]);
37424       r_.m128i[1] = simde_mm_sub_epi8(a_.m128i[1], b_.m128i[1]);
37425     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
37426       r_.i8 = a_.i8 - b_.i8;
37427     #else
37428       SIMDE_VECTORIZE
37429       for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
37430         r_.i8[i] = a_.i8[i] - b_.i8[i];
37431       }
37432     #endif
37433 
37434     return simde__m256i_from_private(r_);
37435   #endif
37436 }
37437 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
37438   #undef _mm256_sub_epi8
37439   #define _mm256_sub_epi8(a, b) simde_mm256_sub_epi8(a, b)
37440 #endif
37441 
37442 SIMDE_FUNCTION_ATTRIBUTES
37443 simde__m256i
37444 simde_mm256_sub_epi16 (simde__m256i a, simde__m256i b) {
37445   #if defined(SIMDE_X86_AVX2_NATIVE)
37446     return _mm256_sub_epi16(a, b);
37447   #else
37448     simde__m256i_private
37449       r_,
37450       a_ = simde__m256i_to_private(a),
37451       b_ = simde__m256i_to_private(b);
37452 
37453     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
37454       r_.m128i[0] = simde_mm_sub_epi16(a_.m128i[0], b_.m128i[0]);
37455       r_.m128i[1] = simde_mm_sub_epi16(a_.m128i[1], b_.m128i[1]);
37456     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
37457       r_.i16 = a_.i16 - b_.i16;
37458     #else
37459       SIMDE_VECTORIZE
37460       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
37461         r_.i16[i] = a_.i16[i] - b_.i16[i];
37462       }
37463     #endif
37464 
37465     return simde__m256i_from_private(r_);
37466   #endif
37467 }
37468 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
37469   #undef _mm256_sub_epi16
37470   #define _mm256_sub_epi16(a, b) simde_mm256_sub_epi16(a, b)
37471 #endif
37472 
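/* Horizontal subtract expressed as (even-indexed elements) minus (odd-indexed
 * elements) after deinterleaving a and b, which matches the per-128-bit-lane
 * pairwise semantics of _mm256_hsub_epi16. */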
37473 SIMDE_FUNCTION_ATTRIBUTES
37474 simde__m256i
37475 simde_mm256_hsub_epi16 (simde__m256i a, simde__m256i b) {
37476   #if defined(SIMDE_X86_AVX2_NATIVE)
37477     return _mm256_hsub_epi16(a, b);
37478   #else
37479     return simde_mm256_sub_epi16(simde_x_mm256_deinterleaveeven_epi16(a, b), simde_x_mm256_deinterleaveodd_epi16(a, b));
37480   #endif
37481 }
37482 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
37483   #undef _mm256_hsub_epi16
37484   #define _mm256_hsub_epi16(a, b) simde_mm256_hsub_epi16(a, b)
37485 #endif
37486 
37487 SIMDE_FUNCTION_ATTRIBUTES
37488 simde__m256i
37489 simde_mm256_sub_epi32 (simde__m256i a, simde__m256i b) {
37490   #if defined(SIMDE_X86_AVX2_NATIVE)
37491     return _mm256_sub_epi32(a, b);
37492   #else
37493     simde__m256i_private
37494       r_,
37495       a_ = simde__m256i_to_private(a),
37496       b_ = simde__m256i_to_private(b);
37497 
37498     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
37499       r_.m128i[0] = simde_mm_sub_epi32(a_.m128i[0], b_.m128i[0]);
37500       r_.m128i[1] = simde_mm_sub_epi32(a_.m128i[1], b_.m128i[1]);
37501     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
37502       r_.i32 = a_.i32 - b_.i32;
37503     #else
37504       SIMDE_VECTORIZE
37505       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
37506         r_.i32[i] = a_.i32[i] - b_.i32[i];
37507       }
37508     #endif
37509 
37510     return simde__m256i_from_private(r_);
37511   #endif
37512 }
37513 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
37514   #undef _mm256_sub_epi32
37515   #define _mm256_sub_epi32(a, b) simde_mm256_sub_epi32(a, b)
37516 #endif
37517 
37518 SIMDE_FUNCTION_ATTRIBUTES
37519 simde__m256i
37520 simde_mm256_hsub_epi32 (simde__m256i a, simde__m256i b) {
37521   #if defined(SIMDE_X86_AVX2_NATIVE)
37522     return _mm256_hsub_epi32(a, b);
37523   #else
37524     return simde_mm256_sub_epi32(simde_x_mm256_deinterleaveeven_epi32(a, b), simde_x_mm256_deinterleaveodd_epi32(a, b));
37525   #endif
37526 }
37527 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
37528   #undef _mm256_hsub_epi32
37529   #define _mm256_hsub_epi32(a, b) simde_mm256_hsub_epi32(a, b)
37530 #endif
37531 
37532 SIMDE_FUNCTION_ATTRIBUTES
37533 simde__m256i
37534 simde_mm256_sub_epi64 (simde__m256i a, simde__m256i b) {
37535   #if defined(SIMDE_X86_AVX2_NATIVE)
37536     return _mm256_sub_epi64(a, b);
37537   #else
37538     simde__m256i_private
37539       r_,
37540       a_ = simde__m256i_to_private(a),
37541       b_ = simde__m256i_to_private(b);
37542 
37543     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
37544       r_.m128i[0] = simde_mm_sub_epi64(a_.m128i[0], b_.m128i[0]);
37545       r_.m128i[1] = simde_mm_sub_epi64(a_.m128i[1], b_.m128i[1]);
37546     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
37547       r_.i64 = a_.i64 - b_.i64;
37548     #else
37549       SIMDE_VECTORIZE
37550       for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
37551         r_.i64[i] = a_.i64[i] - b_.i64[i];
37552       }
37553     #endif
37554 
37555     return simde__m256i_from_private(r_);
37556   #endif
37557 }
37558 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
37559   #undef _mm256_sub_epi64
37560   #define _mm256_sub_epi64(a, b) simde_mm256_sub_epi64(a, b)
37561 #endif
37562 
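/* simde_x_* functions are SIMDe-specific helpers with no Intel counterpart;
 * this one performs wrapping subtraction on unsigned 32-bit elements. */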
37563 SIMDE_FUNCTION_ATTRIBUTES
37564 simde__m256i
37565 simde_x_mm256_sub_epu32 (simde__m256i a, simde__m256i b) {
37566   simde__m256i_private
37567     r_,
37568     a_ = simde__m256i_to_private(a),
37569     b_ = simde__m256i_to_private(b);
37570 
37571   #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
37572     r_.u32 = a_.u32 - b_.u32;
37573   #elif SIMDE_NATURAL_VECTOR_SIZE_LE(128)
37574     r_.m128i[0] = simde_x_mm_sub_epu32(a_.m128i[0], b_.m128i[0]);
37575     r_.m128i[1] = simde_x_mm_sub_epu32(a_.m128i[1], b_.m128i[1]);
37576   #else
37577     SIMDE_VECTORIZE
37578     for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
37579       r_.u32[i] = a_.u32[i] - b_.u32[i];
37580     }
37581   #endif
37582 
37583   return simde__m256i_from_private(r_);
37584 }
37585 
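/* Saturating subtraction: results are clamped to the int8_t range [-128, 127]
 * instead of wrapping around. */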
37586 SIMDE_FUNCTION_ATTRIBUTES
37587 simde__m256i
37588 simde_mm256_subs_epi8 (simde__m256i a, simde__m256i b) {
37589   #if defined(SIMDE_X86_AVX2_NATIVE)
37590     return _mm256_subs_epi8(a, b);
37591   #else
37592     simde__m256i_private
37593       r_,
37594       a_ = simde__m256i_to_private(a),
37595       b_ = simde__m256i_to_private(b);
37596 
37597     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
37598       r_.m128i[0] = simde_mm_subs_epi8(a_.m128i[0], b_.m128i[0]);
37599       r_.m128i[1] = simde_mm_subs_epi8(a_.m128i[1], b_.m128i[1]);
37600     #else
37601       SIMDE_VECTORIZE
37602       for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
37603         r_.i8[i] = simde_math_subs_i8(a_.i8[i], b_.i8[i]);
37604       }
37605     #endif
37606 
37607     return simde__m256i_from_private(r_);
37608   #endif
37609 }
37610 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
37611   #undef _mm256_subs_epi8
37612   #define _mm256_subs_epi8(a, b) simde_mm256_subs_epi8(a, b)
37613 #endif
37614 
37615 SIMDE_FUNCTION_ATTRIBUTES
37616 simde__m256i
37617 simde_mm256_subs_epi16(simde__m256i a, simde__m256i b) {
37618   #if defined(SIMDE_X86_AVX2_NATIVE)
37619     return _mm256_subs_epi16(a, b);
37620   #else
37621     simde__m256i_private
37622       r_,
37623       a_ = simde__m256i_to_private(a),
37624       b_ = simde__m256i_to_private(b);
37625 
37626     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
37627       r_.m128i[0] = simde_mm_subs_epi16(a_.m128i[0], b_.m128i[0]);
37628       r_.m128i[1] = simde_mm_subs_epi16(a_.m128i[1], b_.m128i[1]);
37629     #else
37630       SIMDE_VECTORIZE
37631       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
37632         r_.i16[i] = simde_math_subs_i16(a_.i16[i], b_.i16[i]);
37633       }
37634     #endif
37635 
37636     return simde__m256i_from_private(r_);
37637   #endif
37638 }
37639 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
37640   #undef _mm256_subs_epi16
37641   #define _mm256_subs_epi16(a, b) simde_mm256_subs_epi16(a, b)
37642 #endif
37643 
37644 SIMDE_FUNCTION_ATTRIBUTES
37645 simde__m256i
37646 simde_mm256_hsubs_epi16 (simde__m256i a, simde__m256i b) {
37647   #if defined(SIMDE_X86_AVX2_NATIVE)
37648     return _mm256_hsubs_epi16(a, b);
37649   #else
37650     return simde_mm256_subs_epi16(simde_x_mm256_deinterleaveeven_epi16(a, b), simde_x_mm256_deinterleaveodd_epi16(a, b));
37651   #endif
37652 }
37653 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
37654   #undef _mm256_hsubs_epi16
37655   #define _mm256_hsubs_epi16(a, b) simde_mm256_hsubs_epi16(a, b)
37656 #endif
37657 
37658 SIMDE_FUNCTION_ATTRIBUTES
37659 simde__m256i
37660 simde_mm256_subs_epu8 (simde__m256i a, simde__m256i b) {
37661   #if defined(SIMDE_X86_AVX2_NATIVE)
37662     return _mm256_subs_epu8(a, b);
37663   #else
37664     simde__m256i_private
37665       r_,
37666       a_ = simde__m256i_to_private(a),
37667       b_ = simde__m256i_to_private(b);
37668 
37669     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
37670       r_.m128i[0] = simde_mm_subs_epu8(a_.m128i[0], b_.m128i[0]);
37671       r_.m128i[1] = simde_mm_subs_epu8(a_.m128i[1], b_.m128i[1]);
37672     #else
37673       SIMDE_VECTORIZE
37674       for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
37675         r_.u8[i] = simde_math_subs_u8(a_.u8[i], b_.u8[i]);
37676       }
37677     #endif
37678 
37679     return simde__m256i_from_private(r_);
37680   #endif
37681 }
37682 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
37683   #undef _mm256_subs_epu8
37684   #define _mm256_subs_epu8(a, b) simde_mm256_subs_epu8(a, b)
37685 #endif
37686 
37687 SIMDE_FUNCTION_ATTRIBUTES
37688 simde__m256i
37689 simde_mm256_subs_epu16(simde__m256i a, simde__m256i b) {
37690   #if defined(SIMDE_X86_AVX2_NATIVE)
37691     return _mm256_subs_epu16(a, b);
37692   #else
37693     simde__m256i_private
37694       r_,
37695       a_ = simde__m256i_to_private(a),
37696       b_ = simde__m256i_to_private(b);
37697 
37698     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
37699       r_.m128i[0] = simde_mm_subs_epu16(a_.m128i[0], b_.m128i[0]);
37700       r_.m128i[1] = simde_mm_subs_epu16(a_.m128i[1], b_.m128i[1]);
37701     #else
37702       SIMDE_VECTORIZE
37703       for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
37704         r_.u16[i] = simde_math_subs_u16(a_.u16[i], b_.u16[i]);
37705       }
37706     #endif
37707 
37708     return simde__m256i_from_private(r_);
37709   #endif
37710 }
37711 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
37712   #undef _mm256_subs_epu16
37713   #define _mm256_subs_epu16(a, b) simde_mm256_subs_epu16(a, b)
37714 #endif
37715 
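/* SIMDe-internal helper: returns 1 only when every bit of a is set, computed
 * by AND-reducing the 32-bit lanes.  Illustrative use (not part of the
 * generated header):
 *   simde__m256i v = simde_mm256_set1_epi32(-1);
 *   int ones = simde_x_mm256_test_all_ones(v);   // ones == 1
 */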
37716 SIMDE_FUNCTION_ATTRIBUTES
37717 int
37718 simde_x_mm256_test_all_ones (simde__m256i a) {
37719   simde__m256i_private a_ = simde__m256i_to_private(a);
37720   int r;
37721   int_fast32_t r_ = ~HEDLEY_STATIC_CAST(int_fast32_t, 0);
37722 
37723   SIMDE_VECTORIZE_REDUCTION(&:r_)
37724   for (size_t i = 0 ; i < (sizeof(a_.i32f) / sizeof(a_.i32f[0])) ; i++) {
37725     r_ &= a_.i32f[i];
37726   }
37727 
37728   r = (r_ == ~HEDLEY_STATIC_CAST(int_fast32_t, 0));
37729 
37730   return r;
37731 }
37732 
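/* Interleaves the low eight bytes of each 128-bit lane of a with the
 * corresponding bytes of b; the two lanes are handled independently, as the
 * split shuffle index list below shows. */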
37733 SIMDE_FUNCTION_ATTRIBUTES
37734 simde__m256i
37735 simde_mm256_unpacklo_epi8 (simde__m256i a, simde__m256i b) {
37736   #if defined(SIMDE_X86_AVX2_NATIVE)
37737     return _mm256_unpacklo_epi8(a, b);
37738   #else
37739     simde__m256i_private
37740       r_,
37741       a_ = simde__m256i_to_private(a),
37742       b_ = simde__m256i_to_private(b);
37743 
37744     #if defined(SIMDE_SHUFFLE_VECTOR_)
37745       r_.i8 = SIMDE_SHUFFLE_VECTOR_(8, 32, a_.i8, b_.i8,
37746            0, 32,  1, 33,  2, 34,  3, 35,
37747            4, 36,  5, 37,  6, 38,  7, 39,
37748           16, 48, 17, 49, 18, 50, 19, 51,
37749           20, 52, 21, 53, 22, 54, 23, 55);
37750     #else
37751       r_.m128i[0] = simde_mm_unpacklo_epi8(a_.m128i[0], b_.m128i[0]);
37752       r_.m128i[1] = simde_mm_unpacklo_epi8(a_.m128i[1], b_.m128i[1]);
37753     #endif
37754 
37755     return simde__m256i_from_private(r_);
37756   #endif
37757 }
37758 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
37759   #undef _mm256_unpacklo_epi8
37760   #define _mm256_unpacklo_epi8(a, b) simde_mm256_unpacklo_epi8(a, b)
37761 #endif
37762 
37763 SIMDE_FUNCTION_ATTRIBUTES
37764 simde__m256i
37765 simde_mm256_unpacklo_epi16 (simde__m256i a, simde__m256i b) {
37766   #if defined(SIMDE_X86_AVX2_NATIVE)
37767     return _mm256_unpacklo_epi16(a, b);
37768   #else
37769     simde__m256i_private
37770       r_,
37771       a_ = simde__m256i_to_private(a),
37772       b_ = simde__m256i_to_private(b);
37773 
37774     #if defined(SIMDE_SHUFFLE_VECTOR_)
37775       r_.i16 = SIMDE_SHUFFLE_VECTOR_(16, 32, a_.i16, b_.i16,
37776         0, 16, 1, 17, 2, 18, 3, 19, 8, 24, 9, 25, 10, 26, 11, 27);
37777     #else
37778       r_.m128i[0] = simde_mm_unpacklo_epi16(a_.m128i[0], b_.m128i[0]);
37779       r_.m128i[1] = simde_mm_unpacklo_epi16(a_.m128i[1], b_.m128i[1]);
37780     #endif
37781 
37782     return simde__m256i_from_private(r_);
37783   #endif
37784 }
37785 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
37786   #undef _mm256_unpacklo_epi16
37787   #define _mm256_unpacklo_epi16(a, b) simde_mm256_unpacklo_epi16(a, b)
37788 #endif
37789 
37790 SIMDE_FUNCTION_ATTRIBUTES
37791 simde__m256i
37792 simde_mm256_unpacklo_epi32 (simde__m256i a, simde__m256i b) {
37793   #if defined(SIMDE_X86_AVX2_NATIVE)
37794     return _mm256_unpacklo_epi32(a, b);
37795   #else
37796     simde__m256i_private
37797       r_,
37798       a_ = simde__m256i_to_private(a),
37799       b_ = simde__m256i_to_private(b);
37800 
37801     #if defined(SIMDE_SHUFFLE_VECTOR_)
37802       r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 32, a_.i32, b_.i32,
37803                                     0, 8, 1, 9, 4, 12, 5, 13);
37804     #else
37805       r_.m128i[0] = simde_mm_unpacklo_epi32(a_.m128i[0], b_.m128i[0]);
37806       r_.m128i[1] = simde_mm_unpacklo_epi32(a_.m128i[1], b_.m128i[1]);
37807     #endif
37808 
37809     return simde__m256i_from_private(r_);
37810   #endif
37811 }
37812 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
37813   #undef _mm256_unpacklo_epi32
37814   #define _mm256_unpacklo_epi32(a, b) simde_mm256_unpacklo_epi32(a, b)
37815 #endif
37816 
37817 SIMDE_FUNCTION_ATTRIBUTES
37818 simde__m256i
37819 simde_mm256_unpacklo_epi64 (simde__m256i a, simde__m256i b) {
37820   #if defined(SIMDE_X86_AVX2_NATIVE)
37821     return _mm256_unpacklo_epi64(a, b);
37822   #else
37823     simde__m256i_private
37824       r_,
37825       a_ = simde__m256i_to_private(a),
37826       b_ = simde__m256i_to_private(b);
37827 
37828     #if defined(SIMDE_SHUFFLE_VECTOR_)
37829       r_.i64 = SIMDE_SHUFFLE_VECTOR_(64, 32, a_.i64, b_.i64, 0, 4, 2, 6);
37830     #else
37831       r_.m128i[0] = simde_mm_unpacklo_epi64(a_.m128i[0], b_.m128i[0]);
37832       r_.m128i[1] = simde_mm_unpacklo_epi64(a_.m128i[1], b_.m128i[1]);
37833     #endif
37834 
37835     return simde__m256i_from_private(r_);
37836   #endif
37837 }
37838 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
37839   #undef _mm256_unpacklo_epi64
37840   #define _mm256_unpacklo_epi64(a, b) simde_mm256_unpacklo_epi64(a, b)
37841 #endif
37842 
37843 SIMDE_FUNCTION_ATTRIBUTES
37844 simde__m256i
37845 simde_mm256_unpackhi_epi8 (simde__m256i a, simde__m256i b) {
37846   #if defined(SIMDE_X86_AVX2_NATIVE)
37847     return _mm256_unpackhi_epi8(a, b);
37848   #else
37849     simde__m256i_private
37850       r_,
37851       a_ = simde__m256i_to_private(a),
37852       b_ = simde__m256i_to_private(b);
37853 
37854     #if defined(SIMDE_SHUFFLE_VECTOR_)
37855       r_.i8 = SIMDE_SHUFFLE_VECTOR_(8, 32, a_.i8, b_.i8,
37856            8, 40,  9, 41, 10, 42, 11, 43,
37857           12, 44, 13, 45, 14, 46, 15, 47,
37858           24, 56, 25, 57, 26, 58, 27, 59,
37859           28, 60, 29, 61, 30, 62, 31, 63);
37860     #else
37861       r_.m128i[0] = simde_mm_unpackhi_epi8(a_.m128i[0], b_.m128i[0]);
37862       r_.m128i[1] = simde_mm_unpackhi_epi8(a_.m128i[1], b_.m128i[1]);
37863     #endif
37864 
37865     return simde__m256i_from_private(r_);
37866   #endif
37867 }
37868 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
37869   #undef _mm256_unpackhi_epi8
37870   #define _mm256_unpackhi_epi8(a, b) simde_mm256_unpackhi_epi8(a, b)
37871 #endif
37872 
37873 SIMDE_FUNCTION_ATTRIBUTES
37874 simde__m256i
37875 simde_mm256_unpackhi_epi16 (simde__m256i a, simde__m256i b) {
37876   #if defined(SIMDE_X86_AVX2_NATIVE)
37877     return _mm256_unpackhi_epi16(a, b);
37878   #else
37879     simde__m256i_private
37880       r_,
37881       a_ = simde__m256i_to_private(a),
37882       b_ = simde__m256i_to_private(b);
37883 
37884     #if defined(SIMDE_SHUFFLE_VECTOR_)
37885       r_.i16 = SIMDE_SHUFFLE_VECTOR_(16, 32, a_.i16, b_.i16,
37886          4, 20,  5, 21,  6, 22,  7, 23,
37887         12, 28, 13, 29, 14, 30, 15, 31);
37888     #else
37889       r_.m128i[0] = simde_mm_unpackhi_epi16(a_.m128i[0], b_.m128i[0]);
37890       r_.m128i[1] = simde_mm_unpackhi_epi16(a_.m128i[1], b_.m128i[1]);
37891     #endif
37892 
37893     return simde__m256i_from_private(r_);
37894   #endif
37895 }
37896 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
37897   #undef _mm256_unpackhi_epi16
37898   #define _mm256_unpackhi_epi16(a, b) simde_mm256_unpackhi_epi16(a, b)
37899 #endif
37900 
37901 SIMDE_FUNCTION_ATTRIBUTES
37902 simde__m256i
37903 simde_mm256_unpackhi_epi32 (simde__m256i a, simde__m256i b) {
37904   #if defined(SIMDE_X86_AVX2_NATIVE)
37905     return _mm256_unpackhi_epi32(a, b);
37906   #else
37907     simde__m256i_private
37908       r_,
37909       a_ = simde__m256i_to_private(a),
37910       b_ = simde__m256i_to_private(b);
37911 
37912     #if defined(SIMDE_SHUFFLE_VECTOR_)
37913       r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 32, a_.i32, b_.i32,
37914                                     2, 10, 3, 11, 6, 14, 7, 15);
37915     #else
37916       r_.m128i[0] = simde_mm_unpackhi_epi32(a_.m128i[0], b_.m128i[0]);
37917       r_.m128i[1] = simde_mm_unpackhi_epi32(a_.m128i[1], b_.m128i[1]);
37918     #endif
37919 
37920     return simde__m256i_from_private(r_);
37921   #endif
37922 }
37923 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
37924   #undef _mm256_unpackhi_epi32
37925   #define _mm256_unpackhi_epi32(a, b) simde_mm256_unpackhi_epi32(a, b)
37926 #endif
37927 
37928 SIMDE_FUNCTION_ATTRIBUTES
37929 simde__m256i
37930 simde_mm256_unpackhi_epi64 (simde__m256i a, simde__m256i b) {
37931   #if defined(SIMDE_X86_AVX2_NATIVE)
37932     return _mm256_unpackhi_epi64(a, b);
37933   #else
37934     simde__m256i_private
37935       r_,
37936       a_ = simde__m256i_to_private(a),
37937       b_ = simde__m256i_to_private(b);
37938 
37939     #if defined(SIMDE_SHUFFLE_VECTOR_)
37940       r_.i64 = SIMDE_SHUFFLE_VECTOR_(64, 32, a_.i64, b_.i64, 1, 5, 3, 7);
37941     #else
37942       r_.m128i[0] = simde_mm_unpackhi_epi64(a_.m128i[0], b_.m128i[0]);
37943       r_.m128i[1] = simde_mm_unpackhi_epi64(a_.m128i[1], b_.m128i[1]);
37944     #endif
37945 
37946     return simde__m256i_from_private(r_);
37947   #endif
37948 }
37949 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
37950   #undef _mm256_unpackhi_epi64
37951   #define _mm256_unpackhi_epi64(a, b) simde_mm256_unpackhi_epi64(a, b)
37952 #endif
37953 
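/* Bitwise XOR of the full 256-bit registers; the element width used by the
 * fallbacks (i32f or i64) does not affect the result. */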
37954 SIMDE_FUNCTION_ATTRIBUTES
37955 simde__m256i
37956 simde_mm256_xor_si256 (simde__m256i a, simde__m256i b) {
37957   #if defined(SIMDE_X86_AVX2_NATIVE)
37958     return _mm256_xor_si256(a, b);
37959   #else
37960     simde__m256i_private
37961       r_,
37962       a_ = simde__m256i_to_private(a),
37963       b_ = simde__m256i_to_private(b);
37964 
37965     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
37966       r_.m128i[0] = simde_mm_xor_si128(a_.m128i[0], b_.m128i[0]);
37967       r_.m128i[1] = simde_mm_xor_si128(a_.m128i[1], b_.m128i[1]);
37968     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
37969       r_.i32f = a_.i32f ^ b_.i32f;
37970     #else
37971       SIMDE_VECTORIZE
37972       for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
37973         r_.i64[i] = a_.i64[i] ^ b_.i64[i];
37974       }
37975     #endif
37976 
37977     return simde__m256i_from_private(r_);
37978   #endif
37979 }
37980 #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
37981   #undef _mm256_xor_si256
37982   #define _mm256_xor_si256(a, b) simde_mm256_xor_si256(a, b)
37983 #endif
37984 
37985 SIMDE_END_DECLS_
37986 
37987 HEDLEY_DIAGNOSTIC_POP
37988 
37989 #endif /* !defined(SIMDE_X86_AVX2_H) */
37990 /* :: End ../simde/simde/x86/avx2.h :: */
37991