#pragma once
#ifndef PSIMD_H
#define PSIMD_H

#if defined(__CUDA_ARCH__)
	/* CUDA compiler */
	#define PSIMD_INTRINSIC __forceinline__ __device__
#elif defined(__OPENCL_VERSION__)
	/* OpenCL compiler */
	#define PSIMD_INTRINSIC inline static
#elif defined(__INTEL_COMPILER)
	/* Intel compiler, even on Windows */
	#define PSIMD_INTRINSIC inline static __attribute__((__always_inline__))
#elif defined(__GNUC__)
	/* GCC-compatible compiler (gcc/clang/icc) */
	#define PSIMD_INTRINSIC inline static __attribute__((__always_inline__))
#elif defined(_MSC_VER)
	/* MSVC-compatible compiler (cl/icl/clang-cl) */
	#define PSIMD_INTRINSIC __forceinline static
#elif defined(__cplusplus)
	/* Generic C++ compiler */
	#define PSIMD_INTRINSIC inline static
#elif defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
	/* Generic C99 compiler */
	#define PSIMD_INTRINSIC inline static
#elif
	/* Generic C compiler */
	#define PSIMD_INTRINSIC static
#endif
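
/*
 * Illustrative sketch (not part of the API): PSIMD_INTRINSIC is meant for small
 * helpers defined in headers, forcing inlining where the compiler supports it.
 * The name scale_and_bias and its parameters are hypothetical; only
 * PSIMD_INTRINSIC and the psimd_* calls come from this header:
 *
 *     PSIMD_INTRINSIC psimd_f32 scale_and_bias(psimd_f32 v, float s, float b) {
 *         return psimd_add_f32(psimd_mul_f32(v, psimd_splat_f32(s)), psimd_splat_f32(b));
 *     }
 */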

#if defined(__GNUC__)
	#if defined(__ARM_NEON__) || defined(__ARM_NEON)
		#include <arm_neon.h>
	#endif

	#if defined(__SSE2__)
		#include <emmintrin.h>
	#endif

	#if defined(__SSE3__)
		#include <pmmintrin.h>
	#endif

	#if defined(__SSSE3__)
		#include <tmmintrin.h>
	#endif

	#if defined(__SSE4_1__)
		#include <smmintrin.h>
	#endif

	#if defined(__SSE4_2__)
		#include <nmmintrin.h>
	#endif

	#if defined(__AVX__)
		#include <immintrin.h>
	#endif
#elif defined(_MSC_VER)
	#include <intrin.h>
#endif

#if defined(__cplusplus)
	#define PSIMD_CXX_SYNTAX
#elif defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)
	#define PSIMD_C11_SYNTAX
#elif defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
	#define PSIMD_C99_SYNTAX
#else
	#define PSIMD_C89_SYNTAX
#endif

#if defined(__cplusplus) && (__cplusplus >= 201103L)
	#include <cstddef>
	#include <cstdint>
#elif !defined(__OPENCL_VERSION__)
	#include <stddef.h>
	#include <stdint.h>
#endif

#if defined(__GNUC__)
	#define PSIMD_HAVE_F64 0
	#define PSIMD_HAVE_F32 1
	#define PSIMD_HAVE_U8 1
	#define PSIMD_HAVE_S8 1
	#define PSIMD_HAVE_U16 1
	#define PSIMD_HAVE_S16 1
	#define PSIMD_HAVE_U32 1
	#define PSIMD_HAVE_S32 1
	#define PSIMD_HAVE_U64 0
	#define PSIMD_HAVE_S64 0

	typedef int8_t   psimd_s8  __attribute__((vector_size(16), aligned(1)));
	typedef uint8_t  psimd_u8  __attribute__((vector_size(16), aligned(1)));
	typedef int16_t  psimd_s16 __attribute__((vector_size(16), aligned(2)));
	typedef uint16_t psimd_u16 __attribute__((vector_size(16), aligned(2)));
	typedef int32_t  psimd_s32 __attribute__((vector_size(16), aligned(4)));
	typedef uint32_t psimd_u32 __attribute__((vector_size(16), aligned(4)));
	typedef float    psimd_f32 __attribute__((vector_size(16), aligned(4)));

	typedef struct {
		psimd_s8 lo;
		psimd_s8 hi;
	} psimd_s8x2;

	typedef struct {
		psimd_u8 lo;
		psimd_u8 hi;
	} psimd_u8x2;

	typedef struct {
		psimd_s16 lo;
		psimd_s16 hi;
	} psimd_s16x2;

	typedef struct {
		psimd_u16 lo;
		psimd_u16 hi;
	} psimd_u16x2;

	typedef struct {
		psimd_s32 lo;
		psimd_s32 hi;
	} psimd_s32x2;

	typedef struct {
		psimd_u32 lo;
		psimd_u32 hi;
	} psimd_u32x2;

	typedef struct {
		psimd_f32 lo;
		psimd_f32 hi;
	} psimd_f32x2;
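
	/*
	 * Illustrative sketch (not part of the API): with GCC/Clang vector
	 * extensions these 128-bit types support lane access and lanewise
	 * arithmetic directly, e.g.
	 *
	 *     psimd_f32 v = { 1.0f, 2.0f, 3.0f, 4.0f };
	 *     psimd_f32 w = v * v;     // lanewise multiply: { 1, 4, 9, 16 }
	 *     float last = w[3];       // lane access: 16.0f
	 */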

	/* Bit casts */
	PSIMD_INTRINSIC psimd_u32x2 psimd_cast_s32x2_u32x2(psimd_s32x2 v) {
		return (psimd_u32x2) { .lo = (psimd_u32) v.lo, .hi = (psimd_u32) v.hi };
	}

	PSIMD_INTRINSIC psimd_f32x2 psimd_cast_s32x2_f32x2(psimd_s32x2 v) {
		return (psimd_f32x2) { .lo = (psimd_f32) v.lo, .hi = (psimd_f32) v.hi };
	}

	PSIMD_INTRINSIC psimd_s32x2 psimd_cast_u32x2_s32x2(psimd_u32x2 v) {
		return (psimd_s32x2) { .lo = (psimd_s32) v.lo, .hi = (psimd_s32) v.hi };
	}

	PSIMD_INTRINSIC psimd_f32x2 psimd_cast_u32x2_f32x2(psimd_u32x2 v) {
		return (psimd_f32x2) { .lo = (psimd_f32) v.lo, .hi = (psimd_f32) v.hi };
	}

	PSIMD_INTRINSIC psimd_s32x2 psimd_cast_f32x2_s32x2(psimd_f32x2 v) {
		return (psimd_s32x2) { .lo = (psimd_s32) v.lo, .hi = (psimd_s32) v.hi };
	}

	PSIMD_INTRINSIC psimd_u32x2 psimd_cast_f32x2_u32x2(psimd_f32x2 v) {
		return (psimd_u32x2) { .lo = (psimd_u32) v.lo, .hi = (psimd_u32) v.hi };
	}
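
	/*
	 * Note (clarification, not from the original comments): casts between
	 * GCC/Clang vector types of the same size reinterpret the bits of the
	 * 128-bit register; they do not convert values. For value conversion see
	 * psimd_cvt_s32_f32 below. For example, after
	 *
	 *     psimd_s32x2 bits = psimd_cast_f32x2_s32x2(f);
	 *
	 * bits.lo[0] is 0x3F800000 when f.lo[0] is 1.0f (its IEEE 754 bit pattern).
	 */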

	/* Swap */
	PSIMD_INTRINSIC void psimd_swap_s8(psimd_s8 a[1], psimd_s8 b[1]) {
		const psimd_s8 new_a = *b;
		const psimd_s8 new_b = *a;
		*a = new_a;
		*b = new_b;
	}

	PSIMD_INTRINSIC void psimd_swap_u8(psimd_u8 a[1], psimd_u8 b[1]) {
		const psimd_u8 new_a = *b;
		const psimd_u8 new_b = *a;
		*a = new_a;
		*b = new_b;
	}

	PSIMD_INTRINSIC void psimd_swap_s16(psimd_s16 a[1], psimd_s16 b[1]) {
		const psimd_s16 new_a = *b;
		const psimd_s16 new_b = *a;
		*a = new_a;
		*b = new_b;
	}

	PSIMD_INTRINSIC void psimd_swap_u16(psimd_u16 a[1], psimd_u16 b[1]) {
		const psimd_u16 new_a = *b;
		const psimd_u16 new_b = *a;
		*a = new_a;
		*b = new_b;
	}

	PSIMD_INTRINSIC void psimd_swap_s32(psimd_s32 a[1], psimd_s32 b[1]) {
		const psimd_s32 new_a = *b;
		const psimd_s32 new_b = *a;
		*a = new_a;
		*b = new_b;
	}

	PSIMD_INTRINSIC void psimd_swap_u32(psimd_u32 a[1], psimd_u32 b[1]) {
		const psimd_u32 new_a = *b;
		const psimd_u32 new_b = *a;
		*a = new_a;
		*b = new_b;
	}

	PSIMD_INTRINSIC void psimd_swap_f32(psimd_f32 a[1], psimd_f32 b[1]) {
		const psimd_f32 new_a = *b;
		const psimd_f32 new_b = *a;
		*a = new_a;
		*b = new_b;
	}

	/* Zero-initialization */
	PSIMD_INTRINSIC psimd_s8 psimd_zero_s8(void) {
		return (psimd_s8) { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
	}

	PSIMD_INTRINSIC psimd_u8 psimd_zero_u8(void) {
		return (psimd_u8) { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
	}

	PSIMD_INTRINSIC psimd_s16 psimd_zero_s16(void) {
		return (psimd_s16) { 0, 0, 0, 0, 0, 0, 0, 0 };
	}

	PSIMD_INTRINSIC psimd_u16 psimd_zero_u16(void) {
		return (psimd_u16) { 0, 0, 0, 0, 0, 0, 0, 0 };
	}

	PSIMD_INTRINSIC psimd_s32 psimd_zero_s32(void) {
		return (psimd_s32) { 0, 0, 0, 0 };
	}

	PSIMD_INTRINSIC psimd_u32 psimd_zero_u32(void) {
		return (psimd_u32) { 0, 0, 0, 0 };
	}

	PSIMD_INTRINSIC psimd_f32 psimd_zero_f32(void) {
		return (psimd_f32) { 0.0f, 0.0f, 0.0f, 0.0f };
	}

	/* Initialization to the same constant */
	PSIMD_INTRINSIC psimd_s8 psimd_splat_s8(int8_t c) {
		return (psimd_s8) { c, c, c, c, c, c, c, c, c, c, c, c, c, c, c, c };
	}

	PSIMD_INTRINSIC psimd_u8 psimd_splat_u8(uint8_t c) {
		return (psimd_u8) { c, c, c, c, c, c, c, c, c, c, c, c, c, c, c, c };
	}

	PSIMD_INTRINSIC psimd_s16 psimd_splat_s16(int16_t c) {
		return (psimd_s16) { c, c, c, c, c, c, c, c };
	}

	PSIMD_INTRINSIC psimd_u16 psimd_splat_u16(uint16_t c) {
		return (psimd_u16) { c, c, c, c, c, c, c, c };
	}

	PSIMD_INTRINSIC psimd_s32 psimd_splat_s32(int32_t c) {
		return (psimd_s32) { c, c, c, c };
	}

	PSIMD_INTRINSIC psimd_u32 psimd_splat_u32(uint32_t c) {
		return (psimd_u32) { c, c, c, c };
	}

	PSIMD_INTRINSIC psimd_f32 psimd_splat_f32(float c) {
		return (psimd_f32) { c, c, c, c };
	}
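
	/*
	 * Illustrative sketch (not part of the API): splats are the usual way to
	 * apply a scalar to every lane. The helper name saxpy4 is hypothetical:
	 *
	 *     PSIMD_INTRINSIC psimd_f32 saxpy4(float a, psimd_f32 x, psimd_f32 y) {
	 *         return psimd_add_f32(psimd_mul_f32(psimd_splat_f32(a), x), y);
	 *     }
	 */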

	/* Load vector */
	PSIMD_INTRINSIC psimd_s8 psimd_load_s8(const void* address) {
		return *((const psimd_s8*) address);
	}

	PSIMD_INTRINSIC psimd_u8 psimd_load_u8(const void* address) {
		return *((const psimd_u8*) address);
	}

	PSIMD_INTRINSIC psimd_s16 psimd_load_s16(const void* address) {
		return *((const psimd_s16*) address);
	}

	PSIMD_INTRINSIC psimd_u16 psimd_load_u16(const void* address) {
		return *((const psimd_u16*) address);
	}

	PSIMD_INTRINSIC psimd_s32 psimd_load_s32(const void* address) {
		return *((const psimd_s32*) address);
	}

	PSIMD_INTRINSIC psimd_u32 psimd_load_u32(const void* address) {
		return *((const psimd_u32*) address);
	}

	PSIMD_INTRINSIC psimd_f32 psimd_load_f32(const void* address) {
		return *((const psimd_f32*) address);
	}

	PSIMD_INTRINSIC psimd_f32 psimd_load1_f32(const void* address) {
		return (psimd_f32) { *((const float*) address), 0.0f, 0.0f, 0.0f };
	}

	PSIMD_INTRINSIC psimd_f32 psimd_load2_f32(const void* address) {
		const float* address_f32 = (const float*) address;
		return (psimd_f32) { address_f32[0], address_f32[1], 0.0f, 0.0f };
	}

	PSIMD_INTRINSIC psimd_f32 psimd_load3_f32(const void* address) {
		const float* address_f32 = (const float*) address;
		return (psimd_f32) { address_f32[0], address_f32[1], address_f32[2], 0.0f };
	}

	PSIMD_INTRINSIC psimd_f32 psimd_load4_f32(const void* address) {
		return psimd_load_f32(address);
	}

	PSIMD_INTRINSIC psimd_f32 psimd_load_stride2_f32(const void* address) {
		const psimd_f32 v0x1x = psimd_load_f32(address);
		const psimd_f32 vx2x3 = psimd_load_f32((const float*) address + 3);
		#if defined(__clang__)
			return __builtin_shufflevector(v0x1x, vx2x3, 0, 2, 5, 7);
		#else
			return __builtin_shuffle(v0x1x, vx2x3, (psimd_s32) { 0, 2, 5, 7 });
		#endif
	}

	PSIMD_INTRINSIC psimd_f32 psimd_load1_stride2_f32(const void* address) {
		return psimd_load1_f32(address);
	}

	PSIMD_INTRINSIC psimd_f32 psimd_load2_stride2_f32(const void* address) {
		const float* address_f32 = (const float*) address;
		return (psimd_f32) { address_f32[0], address_f32[2], 0.0f, 0.0f };
	}

	PSIMD_INTRINSIC psimd_f32 psimd_load3_stride2_f32(const void* address) {
		const psimd_f32 v0x1x = psimd_load_f32(address);
		const psimd_f32 v2zzz = psimd_load1_f32((const float*) address + 4);
		#if defined(__clang__)
			return __builtin_shufflevector(v0x1x, v2zzz, 0, 2, 4, 6);
		#else
			return __builtin_shuffle(v0x1x, v2zzz, (psimd_s32) { 0, 2, 4, 6 });
		#endif
	}

	PSIMD_INTRINSIC psimd_f32 psimd_load4_stride2_f32(const void* address) {
		return psimd_load_stride2_f32(address);
	}

	PSIMD_INTRINSIC psimd_f32 psimd_load_stride_f32(const void* address, size_t stride) {
		const float* address0_f32 = (const float*) address;
		const float* address1_f32 = address0_f32 + stride;
		const float* address2_f32 = address1_f32 + stride;
		const float* address3_f32 = address2_f32 + stride;
		return (psimd_f32) { *address0_f32, *address1_f32, *address2_f32, *address3_f32 };
	}

	PSIMD_INTRINSIC psimd_f32 psimd_load1_stride_f32(const void* address, size_t stride) {
		return psimd_load1_f32(address);
	}

	PSIMD_INTRINSIC psimd_f32 psimd_load2_stride_f32(const void* address, size_t stride) {
		const float* address_f32 = (const float*) address;
		return (psimd_f32) { address_f32[0], address_f32[stride], 0.0f, 0.0f };
	}

	PSIMD_INTRINSIC psimd_f32 psimd_load3_stride_f32(const void* address, size_t stride) {
		const float* address0_f32 = (const float*) address;
		const float* address1_f32 = address0_f32 + stride;
		const float* address2_f32 = address1_f32 + stride;
		return (psimd_f32) { *address0_f32, *address1_f32, *address2_f32, 0.0f };
	}

	PSIMD_INTRINSIC psimd_f32 psimd_load4_stride_f32(const void* address, size_t stride) {
		return psimd_load_stride_f32(address, stride);
	}
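
	/*
	 * Illustrative sketch (not part of the API): the strided loads gather one
	 * column of a row-major matrix. The names matrix, row_stride, and j are
	 * hypothetical:
	 *
	 *     psimd_f32 column = psimd_load_stride_f32(matrix + j, row_stride);
	 *
	 * column is { matrix[j], matrix[row_stride + j], matrix[2 * row_stride + j],
	 * matrix[3 * row_stride + j] }; stride is counted in floats, not bytes.
	 */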

	/* Store vector */
	PSIMD_INTRINSIC void psimd_store_s8(void* address, psimd_s8 value) {
		*((psimd_s8*) address) = value;
	}

	PSIMD_INTRINSIC void psimd_store_u8(void* address, psimd_u8 value) {
		*((psimd_u8*) address) = value;
	}

	PSIMD_INTRINSIC void psimd_store_s16(void* address, psimd_s16 value) {
		*((psimd_s16*) address) = value;
	}

	PSIMD_INTRINSIC void psimd_store_u16(void* address, psimd_u16 value) {
		*((psimd_u16*) address) = value;
	}

	PSIMD_INTRINSIC void psimd_store_s32(void* address, psimd_s32 value) {
		*((psimd_s32*) address) = value;
	}

	PSIMD_INTRINSIC void psimd_store_u32(void* address, psimd_u32 value) {
		*((psimd_u32*) address) = value;
	}

	PSIMD_INTRINSIC void psimd_store_f32(void* address, psimd_f32 value) {
		*((psimd_f32*) address) = value;
	}

	PSIMD_INTRINSIC void psimd_store1_f32(void* address, psimd_f32 value) {
		*((float*) address) = value[0];
	}

	PSIMD_INTRINSIC void psimd_store2_f32(void* address, psimd_f32 value) {
		float* address_f32 = (float*) address;
		address_f32[0] = value[0];
		address_f32[1] = value[1];
	}

	PSIMD_INTRINSIC void psimd_store3_f32(void* address, psimd_f32 value) {
		float* address_f32 = (float*) address;
		address_f32[0] = value[0];
		address_f32[1] = value[1];
		address_f32[2] = value[2];
	}

	PSIMD_INTRINSIC void psimd_store4_f32(void* address, psimd_f32 value) {
		psimd_store_f32(address, value);
	}

	PSIMD_INTRINSIC void psimd_store_stride_f32(void* address, size_t stride, psimd_f32 value) {
		float* address0_f32 = (float*) address;
		float* address1_f32 = address0_f32 + stride;
		float* address2_f32 = address1_f32 + stride;
		float* address3_f32 = address2_f32 + stride;
		*address0_f32 = value[0];
		*address1_f32 = value[1];
		*address2_f32 = value[2];
		*address3_f32 = value[3];
	}

	PSIMD_INTRINSIC void psimd_store1_stride_f32(void* address, size_t stride, psimd_f32 value) {
		psimd_store1_f32(address, value);
	}

	PSIMD_INTRINSIC void psimd_store2_stride_f32(void* address, size_t stride, psimd_f32 value) {
		float* address_f32 = (float*) address;
		address_f32[0]      = value[0];
		address_f32[stride] = value[1];
	}

	PSIMD_INTRINSIC void psimd_store3_stride_f32(void* address, size_t stride, psimd_f32 value) {
		float* address0_f32 = (float*) address;
		float* address1_f32 = address0_f32 + stride;
		float* address2_f32 = address1_f32 + stride;
		*address0_f32 = value[0];
		*address1_f32 = value[1];
		*address2_f32 = value[2];
	}
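
	/*
	 * Illustrative sketch (not part of the API): psimd_store_stride_f32 is the
	 * scatter counterpart of the strided loads, e.g. writing one column of a
	 * row-major matrix (matrix, row_stride, and j are hypothetical names):
	 *
	 *     psimd_store_stride_f32(matrix + j, row_stride, v);
	 */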

	/* Vector addition */
	PSIMD_INTRINSIC psimd_s8 psimd_add_s8(psimd_s8 a, psimd_s8 b) {
		return a + b;
	}

	PSIMD_INTRINSIC psimd_u8 psimd_add_u8(psimd_u8 a, psimd_u8 b) {
		return a + b;
	}

	PSIMD_INTRINSIC psimd_s16 psimd_add_s16(psimd_s16 a, psimd_s16 b) {
		return a + b;
	}

	PSIMD_INTRINSIC psimd_u16 psimd_add_u16(psimd_u16 a, psimd_u16 b) {
		return a + b;
	}

	PSIMD_INTRINSIC psimd_s32 psimd_add_s32(psimd_s32 a, psimd_s32 b) {
		return a + b;
	}

	PSIMD_INTRINSIC psimd_u32 psimd_add_u32(psimd_u32 a, psimd_u32 b) {
		return a + b;
	}

	PSIMD_INTRINSIC psimd_f32 psimd_add_f32(psimd_f32 a, psimd_f32 b) {
		#if defined(__ARM_ARCH_7A__) && defined(__ARM_NEON__) && !defined(__FAST_MATH__)
			return (psimd_f32) vaddq_f32((float32x4_t) a, (float32x4_t) b);
		#else
			return a + b;
		#endif
	}

	/* Vector subtraction */
	PSIMD_INTRINSIC psimd_s8 psimd_sub_s8(psimd_s8 a, psimd_s8 b) {
		return a - b;
	}

	PSIMD_INTRINSIC psimd_u8 psimd_sub_u8(psimd_u8 a, psimd_u8 b) {
		return a - b;
	}

	PSIMD_INTRINSIC psimd_s16 psimd_sub_s16(psimd_s16 a, psimd_s16 b) {
		return a - b;
	}

	PSIMD_INTRINSIC psimd_u16 psimd_sub_u16(psimd_u16 a, psimd_u16 b) {
		return a - b;
	}

	PSIMD_INTRINSIC psimd_s32 psimd_sub_s32(psimd_s32 a, psimd_s32 b) {
		return a - b;
	}

	PSIMD_INTRINSIC psimd_u32 psimd_sub_u32(psimd_u32 a, psimd_u32 b) {
		return a - b;
	}

	PSIMD_INTRINSIC psimd_f32 psimd_sub_f32(psimd_f32 a, psimd_f32 b) {
		#if defined(__ARM_ARCH_7A__) && defined(__ARM_NEON__) && !defined(__FAST_MATH__)
			return (psimd_f32) vsubq_f32((float32x4_t) a, (float32x4_t) b);
		#else
			return a - b;
		#endif
	}

	/* Vector multiplication */
	PSIMD_INTRINSIC psimd_s8 psimd_mul_s8(psimd_s8 a, psimd_s8 b) {
		return a * b;
	}

	PSIMD_INTRINSIC psimd_u8 psimd_mul_u8(psimd_u8 a, psimd_u8 b) {
		return a * b;
	}

	PSIMD_INTRINSIC psimd_s16 psimd_mul_s16(psimd_s16 a, psimd_s16 b) {
		return a * b;
	}

	PSIMD_INTRINSIC psimd_u16 psimd_mul_u16(psimd_u16 a, psimd_u16 b) {
		return a * b;
	}

	PSIMD_INTRINSIC psimd_s32 psimd_mul_s32(psimd_s32 a, psimd_s32 b) {
		return a * b;
	}

	PSIMD_INTRINSIC psimd_u32 psimd_mul_u32(psimd_u32 a, psimd_u32 b) {
		return a * b;
	}

	PSIMD_INTRINSIC psimd_f32 psimd_mul_f32(psimd_f32 a, psimd_f32 b) {
		#if defined(__ARM_ARCH_7A__) && defined(__ARM_NEON__) && !defined(__FAST_MATH__)
			return (psimd_f32) vmulq_f32((float32x4_t) a, (float32x4_t) b);
		#else
			return a * b;
		#endif
	}

	/* Vector AND with mask */
	PSIMD_INTRINSIC psimd_f32 psimd_andmask_f32(psimd_s32 mask, psimd_f32 v) {
		return (psimd_f32) (mask & (psimd_s32) v);
	}
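
	/*
	 * Illustrative sketch (not part of the API): psimd_andmask_f32 keeps lanes
	 * where the mask is all-ones and zeroes the rest. Comparison operators on
	 * these vector types produce exactly such masks (0 or -1 per lane):
	 *
	 *     psimd_s32 keep = (a > psimd_zero_f32());   // -1 where a[i] > 0
	 *     psimd_f32 relu = psimd_andmask_f32(keep, a);
	 */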

	/* Vector blend */
	PSIMD_INTRINSIC psimd_s8 psimd_blend_s8(psimd_s8 mask, psimd_s8 a, psimd_s8 b) {
		#if defined(__ARM_NEON__) || defined(__ARM_NEON)
			return (psimd_s8) vbslq_s8((uint8x16_t) mask, (int8x16_t) a, (int8x16_t) b);
		#else
			return (mask & a) | (~mask & b);
		#endif
	}

	PSIMD_INTRINSIC psimd_u8 psimd_blend_u8(psimd_u8 mask, psimd_u8 a, psimd_u8 b) {
		#if defined(__ARM_NEON__) || defined(__ARM_NEON)
			return (psimd_u8) vbslq_u8((uint8x16_t) mask, (uint8x16_t) a, (uint8x16_t) b);
		#else
			return (mask & a) | (~mask & b);
		#endif
	}

	PSIMD_INTRINSIC psimd_s16 psimd_blend_s16(psimd_s16 mask, psimd_s16 a, psimd_s16 b) {
		#if defined(__ARM_NEON__) || defined(__ARM_NEON)
			return (psimd_s16) vbslq_s16((uint16x8_t) mask, (int16x8_t) a, (int16x8_t) b);
		#else
			return (mask & a) | (~mask & b);
		#endif
	}

	PSIMD_INTRINSIC psimd_u16 psimd_blend_u16(psimd_u16 mask, psimd_u16 a, psimd_u16 b) {
		#if defined(__ARM_NEON__) || defined(__ARM_NEON)
			return (psimd_u16) vbslq_u16((uint16x8_t) mask, (uint16x8_t) a, (uint16x8_t) b);
		#else
			return (mask & a) | (~mask & b);
		#endif
	}

	PSIMD_INTRINSIC psimd_s32 psimd_blend_s32(psimd_s32 mask, psimd_s32 a, psimd_s32 b) {
		#if defined(__ARM_NEON__) || defined(__ARM_NEON)
			return (psimd_s32) vbslq_s32((uint32x4_t) mask, (int32x4_t) a, (int32x4_t) b);
		#else
			return (mask & a) | (~mask & b);
		#endif
	}

	PSIMD_INTRINSIC psimd_u32 psimd_blend_u32(psimd_u32 mask, psimd_u32 a, psimd_u32 b) {
		#if defined(__ARM_NEON__) || defined(__ARM_NEON)
			return (psimd_u32) vbslq_u32((uint32x4_t) mask, (uint32x4_t) a, (uint32x4_t) b);
		#else
			return (mask & a) | (~mask & b);
		#endif
	}

	PSIMD_INTRINSIC psimd_f32 psimd_blend_f32(psimd_s32 mask, psimd_f32 a, psimd_f32 b) {
		#if defined(__ARM_NEON__) || defined(__ARM_NEON)
			return (psimd_f32) vbslq_f32((uint32x4_t) mask, (float32x4_t) a, (float32x4_t) b);
		#else
			return (psimd_f32) psimd_blend_s32(mask, (psimd_s32) a, (psimd_s32) b);
		#endif
	}
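
	/*
	 * Illustrative sketch (not part of the API): blend selects from a where the
	 * mask lanes are all-ones and from b elsewhere, which composes with vector
	 * comparisons into a branch-free select, e.g. clamping from above:
	 *
	 *     psimd_f32 clamped = psimd_blend_f32(v > limit, limit, v);
	 */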

	/* Vector blend on sign */
	PSIMD_INTRINSIC psimd_s8 psimd_signblend_s8(psimd_s8 x, psimd_s8 a, psimd_s8 b) {
		return psimd_blend_s8(x >> psimd_splat_s8(7), a, b);
	}

	PSIMD_INTRINSIC psimd_u8 psimd_signblend_u8(psimd_s8 x, psimd_u8 a, psimd_u8 b) {
		return psimd_blend_u8((psimd_u8) (x >> psimd_splat_s8(7)), a, b);
	}

	PSIMD_INTRINSIC psimd_s16 psimd_signblend_s16(psimd_s16 x, psimd_s16 a, psimd_s16 b) {
		return psimd_blend_s16(x >> psimd_splat_s16(15), a, b);
	}

	PSIMD_INTRINSIC psimd_u16 psimd_signblend_u16(psimd_s16 x, psimd_u16 a, psimd_u16 b) {
		return psimd_blend_u16((psimd_u16) (x >> psimd_splat_s16(15)), a, b);
	}

	PSIMD_INTRINSIC psimd_s32 psimd_signblend_s32(psimd_s32 x, psimd_s32 a, psimd_s32 b) {
		return psimd_blend_s32(x >> psimd_splat_s32(31), a, b);
	}

	PSIMD_INTRINSIC psimd_u32 psimd_signblend_u32(psimd_s32 x, psimd_u32 a, psimd_u32 b) {
		return psimd_blend_u32((psimd_u32) (x >> psimd_splat_s32(31)), a, b);
	}

	PSIMD_INTRINSIC psimd_f32 psimd_signblend_f32(psimd_f32 x, psimd_f32 a, psimd_f32 b) {
		const psimd_s32 mask = (psimd_s32) x >> psimd_splat_s32(31);
		return psimd_blend_f32(mask, a, b);
	}

	/* Vector absolute value */
	PSIMD_INTRINSIC psimd_f32 psimd_abs_f32(psimd_f32 v) {
		const psimd_s32 mask = (psimd_s32) psimd_splat_f32(-0.0f);
		return (psimd_f32) ((psimd_s32) v & ~mask);
	}

	/* Vector negation */
	PSIMD_INTRINSIC psimd_f32 psimd_neg_f32(psimd_f32 v) {
		const psimd_s32 mask = (psimd_s32) psimd_splat_f32(-0.0f);
		return (psimd_f32) ((psimd_s32) v ^ mask);
	}
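
	/*
	 * Note (clarification, not from the original comments): -0.0f has only the
	 * sign bit set (0x80000000), so clearing it (& ~mask) yields |v| and
	 * flipping it (^ mask) yields -v, without touching exponent or mantissa.
	 */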

	/* Vector maximum */
	PSIMD_INTRINSIC psimd_s8 psimd_max_s8(psimd_s8 a, psimd_s8 b) {
		#if defined(__ARM_NEON__) || defined(__ARM_NEON)
			return (psimd_s8) vmaxq_s8((int8x16_t) a, (int8x16_t) b);
		#else
			return psimd_blend_s8(a > b, a, b);
		#endif
	}

	PSIMD_INTRINSIC psimd_u8 psimd_max_u8(psimd_u8 a, psimd_u8 b) {
		#if defined(__ARM_NEON__) || defined(__ARM_NEON)
			return (psimd_u8) vmaxq_u8((uint8x16_t) a, (uint8x16_t) b);
		#else
			return psimd_blend_u8(a > b, a, b);
		#endif
	}

	PSIMD_INTRINSIC psimd_s16 psimd_max_s16(psimd_s16 a, psimd_s16 b) {
		#if defined(__ARM_NEON__) || defined(__ARM_NEON)
			return (psimd_s16) vmaxq_s16((int16x8_t) a, (int16x8_t) b);
		#else
			return psimd_blend_s16(a > b, a, b);
		#endif
	}

	PSIMD_INTRINSIC psimd_u16 psimd_max_u16(psimd_u16 a, psimd_u16 b) {
		#if defined(__ARM_NEON__) || defined(__ARM_NEON)
			return (psimd_u16) vmaxq_u16((uint16x8_t) a, (uint16x8_t) b);
		#else
			return psimd_blend_u16(a > b, a, b);
		#endif
	}

	PSIMD_INTRINSIC psimd_s32 psimd_max_s32(psimd_s32 a, psimd_s32 b) {
		#if defined(__ARM_NEON__) || defined(__ARM_NEON)
			return (psimd_s32) vmaxq_s32((int32x4_t) a, (int32x4_t) b);
		#else
			return psimd_blend_s32(a > b, a, b);
		#endif
	}

	PSIMD_INTRINSIC psimd_u32 psimd_max_u32(psimd_u32 a, psimd_u32 b) {
		#if defined(__ARM_NEON__) || defined(__ARM_NEON)
			return (psimd_u32) vmaxq_u32((uint32x4_t) a, (uint32x4_t) b);
		#else
			return psimd_blend_u32(a > b, a, b);
		#endif
	}

	PSIMD_INTRINSIC psimd_f32 psimd_max_f32(psimd_f32 a, psimd_f32 b) {
		#if defined(__ARM_NEON__) || defined(__ARM_NEON)
			return (psimd_f32) vmaxq_f32((float32x4_t) a, (float32x4_t) b);
		#else
			return psimd_blend_f32(a > b, a, b);
		#endif
	}

	/* Vector minimum */
	PSIMD_INTRINSIC psimd_s8 psimd_min_s8(psimd_s8 a, psimd_s8 b) {
		#if defined(__ARM_NEON__) || defined(__ARM_NEON)
			return (psimd_s8) vminq_s8((int8x16_t) a, (int8x16_t) b);
		#else
			return psimd_blend_s8(a < b, a, b);
		#endif
	}

	PSIMD_INTRINSIC psimd_u8 psimd_min_u8(psimd_u8 a, psimd_u8 b) {
		#if defined(__ARM_NEON__) || defined(__ARM_NEON)
			return (psimd_u8) vminq_u8((uint8x16_t) a, (uint8x16_t) b);
		#else
			return psimd_blend_u8(a < b, a, b);
		#endif
	}

	PSIMD_INTRINSIC psimd_s16 psimd_min_s16(psimd_s16 a, psimd_s16 b) {
		#if defined(__ARM_NEON__) || defined(__ARM_NEON)
			return (psimd_s16) vminq_s16((int16x8_t) a, (int16x8_t) b);
		#else
			return psimd_blend_s16(a < b, a, b);
		#endif
	}

	PSIMD_INTRINSIC psimd_u16 psimd_min_u16(psimd_u16 a, psimd_u16 b) {
		#if defined(__ARM_NEON__) || defined(__ARM_NEON)
			return (psimd_u16) vminq_u16((uint16x8_t) a, (uint16x8_t) b);
		#else
			return psimd_blend_u16(a < b, a, b);
		#endif
	}

	PSIMD_INTRINSIC psimd_s32 psimd_min_s32(psimd_s32 a, psimd_s32 b) {
		#if defined(__ARM_NEON__) || defined(__ARM_NEON)
			return (psimd_s32) vminq_s32((int32x4_t) a, (int32x4_t) b);
		#else
			return psimd_blend_s32(a < b, a, b);
		#endif
	}

	PSIMD_INTRINSIC psimd_u32 psimd_min_u32(psimd_u32 a, psimd_u32 b) {
		#if defined(__ARM_NEON__) || defined(__ARM_NEON)
			return (psimd_u32) vminq_u32((uint32x4_t) a, (uint32x4_t) b);
		#else
			return psimd_blend_u32(a < b, a, b);
		#endif
	}

	PSIMD_INTRINSIC psimd_f32 psimd_min_f32(psimd_f32 a, psimd_f32 b) {
		#if defined(__ARM_NEON__) || defined(__ARM_NEON)
			return (psimd_f32) vminq_f32((float32x4_t) a, (float32x4_t) b);
		#else
			return psimd_blend_f32(a < b, a, b);
		#endif
	}

	PSIMD_INTRINSIC psimd_f32 psimd_cvt_s32_f32(psimd_s32 v) {
		#if defined(__clang__)
			return __builtin_convertvector(v, psimd_f32);
		#elif defined(__ARM_NEON__) || defined(__ARM_NEON)
			return (psimd_f32) vcvtq_f32_s32((int32x4_t) v);
		#elif defined(__SSE2__)
			return (psimd_f32) _mm_cvtepi32_ps((__m128i) v);
		#else
			return (psimd_f32) { (float) v[0], (float) v[1], (float) v[2], (float) v[3] };
		#endif
	}
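
	/*
	 * Illustrative sketch (not part of the API): psimd_cvt_s32_f32 performs a
	 * value conversion, unlike the bit casts above:
	 *
	 *     psimd_s32 i = psimd_splat_s32(3);
	 *     psimd_f32 f = psimd_cvt_s32_f32(i);   // { 3.0f, 3.0f, 3.0f, 3.0f }
	 */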

	/* Broadcast vector element */
	#if defined(__clang__)
		PSIMD_INTRINSIC psimd_f32 psimd_splat0_f32(psimd_f32 v) {
			return __builtin_shufflevector(v, v, 0, 0, 0, 0);
		}

		PSIMD_INTRINSIC psimd_f32 psimd_splat1_f32(psimd_f32 v) {
			return __builtin_shufflevector(v, v, 1, 1, 1, 1);
		}

		PSIMD_INTRINSIC psimd_f32 psimd_splat2_f32(psimd_f32 v) {
			return __builtin_shufflevector(v, v, 2, 2, 2, 2);
		}

		PSIMD_INTRINSIC psimd_f32 psimd_splat3_f32(psimd_f32 v) {
			return __builtin_shufflevector(v, v, 3, 3, 3, 3);
		}
	#else
		PSIMD_INTRINSIC psimd_f32 psimd_splat0_f32(psimd_f32 v) {
			return __builtin_shuffle(v, (psimd_s32) { 0, 0, 0, 0 });
		}

		PSIMD_INTRINSIC psimd_f32 psimd_splat1_f32(psimd_f32 v) {
			return __builtin_shuffle(v, (psimd_s32) { 1, 1, 1, 1 });
		}

		PSIMD_INTRINSIC psimd_f32 psimd_splat2_f32(psimd_f32 v) {
			return __builtin_shuffle(v, (psimd_s32) { 2, 2, 2, 2 });
		}

		PSIMD_INTRINSIC psimd_f32 psimd_splat3_f32(psimd_f32 v) {
			return __builtin_shuffle(v, (psimd_s32) { 3, 3, 3, 3 });
		}
	#endif
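
	/*
	 * Illustrative sketch (not part of the API): per-lane splats are the
	 * building block of a 4x4 matrix-vector product with the matrix stored as
	 * four column vectors c0..c3 (hypothetical names):
	 *
	 *     PSIMD_INTRINSIC psimd_f32 mat4_mul_vec4(
	 *         psimd_f32 c0, psimd_f32 c1, psimd_f32 c2, psimd_f32 c3, psimd_f32 v)
	 *     {
	 *         psimd_f32 acc = psimd_mul_f32(c0, psimd_splat0_f32(v));
	 *         acc = psimd_add_f32(acc, psimd_mul_f32(c1, psimd_splat1_f32(v)));
	 *         acc = psimd_add_f32(acc, psimd_mul_f32(c2, psimd_splat2_f32(v)));
	 *         acc = psimd_add_f32(acc, psimd_mul_f32(c3, psimd_splat3_f32(v)));
	 *         return acc;
	 *     }
	 */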

	/* Reversal of vector elements */
	#if defined(__clang__)
		PSIMD_INTRINSIC psimd_s8 psimd_reverse_s8(psimd_s8 v) {
			return __builtin_shufflevector(v, v, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
		}

		PSIMD_INTRINSIC psimd_u8 psimd_reverse_u8(psimd_u8 v) {
			return __builtin_shufflevector(v, v, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
		}

		PSIMD_INTRINSIC psimd_s16 psimd_reverse_s16(psimd_s16 v) {
			return __builtin_shufflevector(v, v, 7, 6, 5, 4, 3, 2, 1, 0);
		}

		PSIMD_INTRINSIC psimd_u16 psimd_reverse_u16(psimd_u16 v) {
			return __builtin_shufflevector(v, v, 7, 6, 5, 4, 3, 2, 1, 0);
		}

		PSIMD_INTRINSIC psimd_s32 psimd_reverse_s32(psimd_s32 v) {
			return __builtin_shufflevector(v, v, 3, 2, 1, 0);
		}

		PSIMD_INTRINSIC psimd_u32 psimd_reverse_u32(psimd_u32 v) {
			return __builtin_shufflevector(v, v, 3, 2, 1, 0);
		}

		PSIMD_INTRINSIC psimd_f32 psimd_reverse_f32(psimd_f32 v) {
			return __builtin_shufflevector(v, v, 3, 2, 1, 0);
		}
	#else
		PSIMD_INTRINSIC psimd_s8 psimd_reverse_s8(psimd_s8 v) {
			return __builtin_shuffle(v, (psimd_s8) { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 });
		}

		PSIMD_INTRINSIC psimd_u8 psimd_reverse_u8(psimd_u8 v) {
			return __builtin_shuffle(v, (psimd_s8) { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 });
		}

		PSIMD_INTRINSIC psimd_s16 psimd_reverse_s16(psimd_s16 v) {
			return __builtin_shuffle(v, (psimd_s16) { 7, 6, 5, 4, 3, 2, 1, 0 });
		}

		PSIMD_INTRINSIC psimd_u16 psimd_reverse_u16(psimd_u16 v) {
			return __builtin_shuffle(v, (psimd_s16) { 7, 6, 5, 4, 3, 2, 1, 0 });
		}

		PSIMD_INTRINSIC psimd_s32 psimd_reverse_s32(psimd_s32 v) {
			return __builtin_shuffle(v, (psimd_s32) { 3, 2, 1, 0 });
		}

		PSIMD_INTRINSIC psimd_u32 psimd_reverse_u32(psimd_u32 v) {
			return __builtin_shuffle(v, (psimd_s32) { 3, 2, 1, 0 });
		}

		PSIMD_INTRINSIC psimd_f32 psimd_reverse_f32(psimd_f32 v) {
			return __builtin_shuffle(v, (psimd_s32) { 3, 2, 1, 0 });
		}
	#endif

	/* Interleaving of vector elements */
	#if defined(__clang__)
		PSIMD_INTRINSIC psimd_s16 psimd_interleave_lo_s16(psimd_s16 a, psimd_s16 b) {
			return __builtin_shufflevector(a, b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
		}

		PSIMD_INTRINSIC psimd_s16 psimd_interleave_hi_s16(psimd_s16 a, psimd_s16 b) {
			return __builtin_shufflevector(a, b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
		}

		PSIMD_INTRINSIC psimd_u16 psimd_interleave_lo_u16(psimd_u16 a, psimd_u16 b) {
			return __builtin_shufflevector(a, b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
		}

		PSIMD_INTRINSIC psimd_u16 psimd_interleave_hi_u16(psimd_u16 a, psimd_u16 b) {
			return __builtin_shufflevector(a, b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
		}

		PSIMD_INTRINSIC psimd_s32 psimd_interleave_lo_s32(psimd_s32 a, psimd_s32 b) {
			return __builtin_shufflevector(a, b, 0, 4+0, 1, 4+1);
		}

		PSIMD_INTRINSIC psimd_s32 psimd_interleave_hi_s32(psimd_s32 a, psimd_s32 b) {
			return __builtin_shufflevector(a, b, 2, 4+2, 3, 4+3);
		}

		PSIMD_INTRINSIC psimd_u32 psimd_interleave_lo_u32(psimd_u32 a, psimd_u32 b) {
			return __builtin_shufflevector(a, b, 0, 4+0, 1, 4+1);
		}

		PSIMD_INTRINSIC psimd_u32 psimd_interleave_hi_u32(psimd_u32 a, psimd_u32 b) {
			return __builtin_shufflevector(a, b, 2, 4+2, 3, 4+3);
		}

		PSIMD_INTRINSIC psimd_f32 psimd_interleave_lo_f32(psimd_f32 a, psimd_f32 b) {
			return __builtin_shufflevector(a, b, 0, 4+0, 1, 4+1);
		}

		PSIMD_INTRINSIC psimd_f32 psimd_interleave_hi_f32(psimd_f32 a, psimd_f32 b) {
			return __builtin_shufflevector(a, b, 2, 4+2, 3, 4+3);
		}
	#else
		PSIMD_INTRINSIC psimd_s16 psimd_interleave_lo_s16(psimd_s16 a, psimd_s16 b) {
			return __builtin_shuffle(a, b, (psimd_s16) { 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3 });
		}

		PSIMD_INTRINSIC psimd_s16 psimd_interleave_hi_s16(psimd_s16 a, psimd_s16 b) {
			return __builtin_shuffle(a, b, (psimd_s16) { 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7 });
		}

		PSIMD_INTRINSIC psimd_u16 psimd_interleave_lo_u16(psimd_u16 a, psimd_u16 b) {
			return __builtin_shuffle(a, b, (psimd_s16) { 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3 });
		}

		PSIMD_INTRINSIC psimd_u16 psimd_interleave_hi_u16(psimd_u16 a, psimd_u16 b) {
			return __builtin_shuffle(a, b, (psimd_s16) { 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7 });
		}

		PSIMD_INTRINSIC psimd_s32 psimd_interleave_lo_s32(psimd_s32 a, psimd_s32 b) {
			return __builtin_shuffle(a, b, (psimd_s32) { 0, 4+0, 1, 4+1 });
		}

		PSIMD_INTRINSIC psimd_s32 psimd_interleave_hi_s32(psimd_s32 a, psimd_s32 b) {
			return __builtin_shuffle(a, b, (psimd_s32) { 2, 4+2, 3, 4+3 });
		}

		PSIMD_INTRINSIC psimd_u32 psimd_interleave_lo_u32(psimd_u32 a, psimd_u32 b) {
			return __builtin_shuffle(a, b, (psimd_s32) { 0, 4+0, 1, 4+1 });
		}

		PSIMD_INTRINSIC psimd_u32 psimd_interleave_hi_u32(psimd_u32 a, psimd_u32 b) {
			return __builtin_shuffle(a, b, (psimd_s32) { 2, 4+2, 3, 4+3 });
		}

		PSIMD_INTRINSIC psimd_f32 psimd_interleave_lo_f32(psimd_f32 a, psimd_f32 b) {
			return __builtin_shuffle(a, b, (psimd_s32) { 0, 4+0, 1, 4+1 });
		}

		PSIMD_INTRINSIC psimd_f32 psimd_interleave_hi_f32(psimd_f32 a, psimd_f32 b) {
			return __builtin_shuffle(a, b, (psimd_s32) { 2, 4+2, 3, 4+3 });
		}
	#endif

	/* Concatenation of low/high vector elements */
	#if defined(__clang__)
		PSIMD_INTRINSIC psimd_s16 psimd_concat_lo_s16(psimd_s16 a, psimd_s16 b) {
			return __builtin_shufflevector(a, b, 0, 1, 2, 3, 8+0, 8+1, 8+2, 8+3);
		}

		PSIMD_INTRINSIC psimd_s16 psimd_concat_hi_s16(psimd_s16 a, psimd_s16 b) {
			return __builtin_shufflevector(a, b, 4, 5, 6, 7, 8+4, 8+5, 8+6, 8+7);
		}

		PSIMD_INTRINSIC psimd_u16 psimd_concat_lo_u16(psimd_u16 a, psimd_u16 b) {
			return __builtin_shufflevector(a, b, 0, 1, 2, 3, 8+0, 8+1, 8+2, 8+3);
		}

		PSIMD_INTRINSIC psimd_u16 psimd_concat_hi_u16(psimd_u16 a, psimd_u16 b) {
			return __builtin_shufflevector(a, b, 4, 5, 6, 7, 8+4, 8+5, 8+6, 8+7);
		}

		PSIMD_INTRINSIC psimd_s32 psimd_concat_lo_s32(psimd_s32 a, psimd_s32 b) {
			return __builtin_shufflevector(a, b, 0, 1, 4+0, 4+1);
		}

		PSIMD_INTRINSIC psimd_s32 psimd_concat_hi_s32(psimd_s32 a, psimd_s32 b) {
			return __builtin_shufflevector(a, b, 2, 3, 4+2, 4+3);
		}

		PSIMD_INTRINSIC psimd_u32 psimd_concat_lo_u32(psimd_u32 a, psimd_u32 b) {
			return __builtin_shufflevector(a, b, 0, 1, 4+0, 4+1);
		}

		PSIMD_INTRINSIC psimd_u32 psimd_concat_hi_u32(psimd_u32 a, psimd_u32 b) {
			return __builtin_shufflevector(a, b, 2, 3, 4+2, 4+3);
		}

		PSIMD_INTRINSIC psimd_f32 psimd_concat_lo_f32(psimd_f32 a, psimd_f32 b) {
			return __builtin_shufflevector(a, b, 0, 1, 4+0, 4+1);
		}

		PSIMD_INTRINSIC psimd_f32 psimd_concat_hi_f32(psimd_f32 a, psimd_f32 b) {
			return __builtin_shufflevector(a, b, 2, 3, 4+2, 4+3);
		}
	#else
		PSIMD_INTRINSIC psimd_s16 psimd_concat_lo_s16(psimd_s16 a, psimd_s16 b) {
			return __builtin_shuffle(a, b, (psimd_s16) { 0, 1, 2, 3, 8+0, 8+1, 8+2, 8+3 });
		}

		PSIMD_INTRINSIC psimd_s16 psimd_concat_hi_s16(psimd_s16 a, psimd_s16 b) {
			return __builtin_shuffle(a, b, (psimd_s16) { 4, 5, 6, 7, 8+4, 8+5, 8+6, 8+7 });
		}

		PSIMD_INTRINSIC psimd_u16 psimd_concat_lo_u16(psimd_u16 a, psimd_u16 b) {
			return __builtin_shuffle(a, b, (psimd_s16) { 0, 1, 2, 3, 8+0, 8+1, 8+2, 8+3 });
		}

		PSIMD_INTRINSIC psimd_u16 psimd_concat_hi_u16(psimd_u16 a, psimd_u16 b) {
			return __builtin_shuffle(a, b, (psimd_s16) { 4, 5, 6, 7, 8+4, 8+5, 8+6, 8+7 });
		}

		PSIMD_INTRINSIC psimd_s32 psimd_concat_lo_s32(psimd_s32 a, psimd_s32 b) {
			return __builtin_shuffle(a, b, (psimd_s32) { 0, 1, 4+0, 4+1 });
		}

		PSIMD_INTRINSIC psimd_s32 psimd_concat_hi_s32(psimd_s32 a, psimd_s32 b) {
			return __builtin_shuffle(a, b, (psimd_s32) { 2, 3, 4+2, 4+3 });
		}

		PSIMD_INTRINSIC psimd_u32 psimd_concat_lo_u32(psimd_u32 a, psimd_u32 b) {
			return __builtin_shuffle(a, b, (psimd_s32) { 0, 1, 4+0, 4+1 });
		}

		PSIMD_INTRINSIC psimd_u32 psimd_concat_hi_u32(psimd_u32 a, psimd_u32 b) {
			return __builtin_shuffle(a, b, (psimd_s32) { 2, 3, 4+2, 4+3 });
		}

		PSIMD_INTRINSIC psimd_f32 psimd_concat_lo_f32(psimd_f32 a, psimd_f32 b) {
			return __builtin_shuffle(a, b, (psimd_s32) { 0, 1, 4+0, 4+1 });
		}

		PSIMD_INTRINSIC psimd_f32 psimd_concat_hi_f32(psimd_f32 a, psimd_f32 b) {
			return __builtin_shuffle(a, b, (psimd_s32) { 2, 3, 4+2, 4+3 });
		}
	#endif
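
	/*
	 * Illustrative sketch (not part of the API): interleave and concat compose
	 * into a 4x4 transpose of rows r0..r3 into columns c0..c3 (hypothetical
	 * names):
	 *
	 *     psimd_f32 t0 = psimd_interleave_lo_f32(r0, r1);  // r0[0] r1[0] r0[1] r1[1]
	 *     psimd_f32 t1 = psimd_interleave_lo_f32(r2, r3);
	 *     psimd_f32 t2 = psimd_interleave_hi_f32(r0, r1);
	 *     psimd_f32 t3 = psimd_interleave_hi_f32(r2, r3);
	 *     psimd_f32 c0 = psimd_concat_lo_f32(t0, t1);      // r0[0] r1[0] r2[0] r3[0]
	 *     psimd_f32 c1 = psimd_concat_hi_f32(t0, t1);
	 *     psimd_f32 c2 = psimd_concat_lo_f32(t2, t3);
	 *     psimd_f32 c3 = psimd_concat_hi_f32(t2, t3);
	 */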

	/* Concatenation of even/odd vector elements */
	#if defined(__clang__)
		PSIMD_INTRINSIC psimd_s8 psimd_concat_even_s8(psimd_s8 a, psimd_s8 b) {
			return __builtin_shufflevector(a, b,
				0, 2, 4, 6, 8, 10, 12, 14, 16+0, 16+2, 16+4, 16+6, 16+8, 16+10, 16+12, 16+14);
		}

		PSIMD_INTRINSIC psimd_s8 psimd_concat_odd_s8(psimd_s8 a, psimd_s8 b) {
			return __builtin_shufflevector(a, b,
				1, 3, 5, 7, 9, 11, 13, 15, 16+1, 16+3, 16+5, 16+7, 16+9, 16+11, 16+13, 16+15);
		}

		PSIMD_INTRINSIC psimd_u8 psimd_concat_even_u8(psimd_u8 a, psimd_u8 b) {
			return __builtin_shufflevector(a, b,
				0, 2, 4, 6, 8, 10, 12, 14, 16+0, 16+2, 16+4, 16+6, 16+8, 16+10, 16+12, 16+14);
		}

		PSIMD_INTRINSIC psimd_u8 psimd_concat_odd_u8(psimd_u8 a, psimd_u8 b) {
			return __builtin_shufflevector(a, b,
				1, 3, 5, 7, 9, 11, 13, 15, 16+1, 16+3, 16+5, 16+7, 16+9, 16+11, 16+13, 16+15);
		}

		PSIMD_INTRINSIC psimd_s16 psimd_concat_even_s16(psimd_s16 a, psimd_s16 b) {
			return __builtin_shufflevector(a, b, 0, 2, 4, 6, 8+0, 8+2, 8+4, 8+6);
		}

		PSIMD_INTRINSIC psimd_s16 psimd_concat_odd_s16(psimd_s16 a, psimd_s16 b) {
			return __builtin_shufflevector(a, b, 1, 3, 5, 7, 8+1, 8+3, 8+5, 8+7);
		}

		PSIMD_INTRINSIC psimd_u16 psimd_concat_even_u16(psimd_u16 a, psimd_u16 b) {
			return __builtin_shufflevector(a, b, 0, 2, 4, 6, 8+0, 8+2, 8+4, 8+6);
		}

		PSIMD_INTRINSIC psimd_u16 psimd_concat_odd_u16(psimd_u16 a, psimd_u16 b) {
			return __builtin_shufflevector(a, b, 1, 3, 5, 7, 8+1, 8+3, 8+5, 8+7);
		}

		PSIMD_INTRINSIC psimd_s32 psimd_concat_even_s32(psimd_s32 a, psimd_s32 b) {
			return __builtin_shufflevector(a, b, 0, 2, 4+0, 4+2);
		}

		PSIMD_INTRINSIC psimd_s32 psimd_concat_odd_s32(psimd_s32 a, psimd_s32 b) {
			return __builtin_shufflevector(a, b, 1, 3, 4+1, 4+3);
		}

		PSIMD_INTRINSIC psimd_u32 psimd_concat_even_u32(psimd_u32 a, psimd_u32 b) {
			return __builtin_shufflevector(a, b, 0, 2, 4+0, 4+2);
		}

		PSIMD_INTRINSIC psimd_u32 psimd_concat_odd_u32(psimd_u32 a, psimd_u32 b) {
			return __builtin_shufflevector(a, b, 1, 3, 4+1, 4+3);
		}

		PSIMD_INTRINSIC psimd_f32 psimd_concat_even_f32(psimd_f32 a, psimd_f32 b) {
			return __builtin_shufflevector(a, b, 0, 2, 4+0, 4+2);
		}

		PSIMD_INTRINSIC psimd_f32 psimd_concat_odd_f32(psimd_f32 a, psimd_f32 b) {
			return __builtin_shufflevector(a, b, 1, 3, 4+1, 4+3);
		}
	#else
		PSIMD_INTRINSIC psimd_s8 psimd_concat_even_s8(psimd_s8 a, psimd_s8 b) {
			return __builtin_shuffle(a, b,
				(psimd_s8) { 0, 2, 4, 6, 8, 10, 12, 14, 16+0, 16+2, 16+4, 16+6, 16+8, 16+10, 16+12, 16+14 });
		}

		PSIMD_INTRINSIC psimd_s8 psimd_concat_odd_s8(psimd_s8 a, psimd_s8 b) {
			return __builtin_shuffle(a, b,
				(psimd_s8) { 1, 3, 5, 7, 9, 11, 13, 15, 16+1, 16+3, 16+5, 16+7, 16+9, 16+11, 16+13, 16+15 });
		}

		PSIMD_INTRINSIC psimd_u8 psimd_concat_even_u8(psimd_u8 a, psimd_u8 b) {
			return __builtin_shuffle(a, b,
				(psimd_s8) { 0, 2, 4, 6, 8, 10, 12, 14, 16+0, 16+2, 16+4, 16+6, 16+8, 16+10, 16+12, 16+14 });
		}

		PSIMD_INTRINSIC psimd_u8 psimd_concat_odd_u8(psimd_u8 a, psimd_u8 b) {
			return __builtin_shuffle(a, b,
				(psimd_s8) { 1, 3, 5, 7, 9, 11, 13, 15, 16+1, 16+3, 16+5, 16+7, 16+9, 16+11, 16+13, 16+15 });
		}

		PSIMD_INTRINSIC psimd_s16 psimd_concat_even_s16(psimd_s16 a, psimd_s16 b) {
			return __builtin_shuffle(a, b, (psimd_s16) { 0, 2, 4, 6, 8+0, 8+2, 8+4, 8+6 });
		}

		PSIMD_INTRINSIC psimd_s16 psimd_concat_odd_s16(psimd_s16 a, psimd_s16 b) {
			return __builtin_shuffle(a, b, (psimd_s16) { 1, 3, 5, 7, 8+1, 8+3, 8+5, 8+7 });
		}

		PSIMD_INTRINSIC psimd_u16 psimd_concat_even_u16(psimd_u16 a, psimd_u16 b) {
			return __builtin_shuffle(a, b, (psimd_s16) { 0, 2, 4, 6, 8+0, 8+2, 8+4, 8+6 });
		}

		PSIMD_INTRINSIC psimd_u16 psimd_concat_odd_u16(psimd_u16 a, psimd_u16 b) {
			return __builtin_shuffle(a, b, (psimd_s16) { 1, 3, 5, 7, 8+1, 8+3, 8+5, 8+7 });
		}

		PSIMD_INTRINSIC psimd_s32 psimd_concat_even_s32(psimd_s32 a, psimd_s32 b) {
			return __builtin_shuffle(a, b, (psimd_s32) { 0, 2, 4+0, 4+2 });
		}

		PSIMD_INTRINSIC psimd_s32 psimd_concat_odd_s32(psimd_s32 a, psimd_s32 b) {
			return __builtin_shuffle(a, b, (psimd_s32) { 1, 3, 4+1, 4+3 });
		}

		PSIMD_INTRINSIC psimd_u32 psimd_concat_even_u32(psimd_u32 a, psimd_u32 b) {
			return __builtin_shuffle(a, b, (psimd_s32) { 0, 2, 4+0, 4+2 });
		}

		PSIMD_INTRINSIC psimd_u32 psimd_concat_odd_u32(psimd_u32 a, psimd_u32 b) {
			return __builtin_shuffle(a, b, (psimd_s32) { 1, 3, 4+1, 4+3 });
		}

		PSIMD_INTRINSIC psimd_f32 psimd_concat_even_f32(psimd_f32 a, psimd_f32 b) {
			return __builtin_shuffle(a, b, (psimd_s32) { 0, 2, 4+0, 4+2 });
		}

		PSIMD_INTRINSIC psimd_f32 psimd_concat_odd_f32(psimd_f32 a, psimd_f32 b) {
			return __builtin_shuffle(a, b, (psimd_s32) { 1, 3, 4+1, 4+3 });
		}
	#endif
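
	/*
	 * Illustrative sketch (not part of the API): even/odd concatenation
	 * deinterleaves paired data, e.g. splitting interleaved complex numbers
	 * a = { re0, im0, re1, im1 } and b = { re2, im2, re3, im3 }:
	 *
	 *     psimd_f32 re = psimd_concat_even_f32(a, b);  // re0 re1 re2 re3
	 *     psimd_f32 im = psimd_concat_odd_f32(a, b);   // im0 im1 im2 im3
	 */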

	/* Vector reduce */
	#if defined(__clang__)
		PSIMD_INTRINSIC psimd_f32 psimd_allreduce_sum_f32(psimd_f32 v) {
			const psimd_f32 temp = v + __builtin_shufflevector(v, v, 2, 3, 0, 1);
			return temp + __builtin_shufflevector(temp, temp, 1, 0, 3, 2);
		}

		PSIMD_INTRINSIC psimd_f32 psimd_allreduce_max_f32(psimd_f32 v) {
			const psimd_f32 temp = psimd_max_f32(v, __builtin_shufflevector(v, v, 2, 3, 0, 1));
			return psimd_max_f32(temp, __builtin_shufflevector(temp, temp, 1, 0, 3, 2));
		}

		PSIMD_INTRINSIC psimd_f32 psimd_allreduce_min_f32(psimd_f32 v) {
			const psimd_f32 temp = psimd_min_f32(v, __builtin_shufflevector(v, v, 2, 3, 0, 1));
			return psimd_min_f32(temp, __builtin_shufflevector(temp, temp, 1, 0, 3, 2));
		}

		PSIMD_INTRINSIC float psimd_reduce_sum_f32(psimd_f32 v) {
			const psimd_f32 temp = v + __builtin_shufflevector(v, v, 2, 3, -1, -1);
			const psimd_f32 result = temp + __builtin_shufflevector(temp, temp, 1, -1, -1, -1);
			return result[0];
		}

		PSIMD_INTRINSIC float psimd_reduce_max_f32(psimd_f32 v) {
			const psimd_f32 temp = psimd_max_f32(v, __builtin_shufflevector(v, v, 2, 3, -1, -1));
			const psimd_f32 result = psimd_max_f32(temp, __builtin_shufflevector(temp, temp, 1, -1, -1, -1));
			return result[0];
		}

		PSIMD_INTRINSIC float psimd_reduce_min_f32(psimd_f32 v) {
			const psimd_f32 temp = psimd_min_f32(v, __builtin_shufflevector(v, v, 2, 3, -1, -1));
			const psimd_f32 result = psimd_min_f32(temp, __builtin_shufflevector(temp, temp, 1, -1, -1, -1));
			return result[0];
		}
	#else
		PSIMD_INTRINSIC psimd_f32 psimd_allreduce_sum_f32(psimd_f32 v) {
			const psimd_f32 temp = v + __builtin_shuffle(v, (psimd_s32) { 2, 3, 0, 1 });
			return temp + __builtin_shuffle(temp, (psimd_s32) { 1, 0, 3, 2 });
		}

		PSIMD_INTRINSIC psimd_f32 psimd_allreduce_max_f32(psimd_f32 v) {
			const psimd_f32 temp = psimd_max_f32(v, __builtin_shuffle(v, (psimd_s32) { 2, 3, 0, 1 }));
			return psimd_max_f32(temp, __builtin_shuffle(temp, (psimd_s32) { 1, 0, 3, 2 }));
		}

		PSIMD_INTRINSIC psimd_f32 psimd_allreduce_min_f32(psimd_f32 v) {
			const psimd_f32 temp = psimd_min_f32(v, __builtin_shuffle(v, (psimd_s32) { 2, 3, 0, 1 }));
			return psimd_min_f32(temp, __builtin_shuffle(temp, (psimd_s32) { 1, 0, 3, 2 }));
		}

		PSIMD_INTRINSIC float psimd_reduce_sum_f32(psimd_f32 v) {
			const psimd_f32 result = psimd_allreduce_sum_f32(v);
			return result[0];
		}

		PSIMD_INTRINSIC float psimd_reduce_max_f32(psimd_f32 v) {
			const psimd_f32 result = psimd_allreduce_max_f32(v);
			return result[0];
		}

		PSIMD_INTRINSIC float psimd_reduce_min_f32(psimd_f32 v) {
			const psimd_f32 result = psimd_allreduce_min_f32(v);
			return result[0];
		}
	#endif
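
	/*
	 * Illustrative sketch (not part of the API): the reductions finish a
	 * lanewise accumulation, e.g. a dot product over n floats (n assumed
	 * divisible by 4; the names dot4, x, y, and n are hypothetical):
	 *
	 *     PSIMD_INTRINSIC float dot4(const float* x, const float* y, size_t n) {
	 *         psimd_f32 acc = psimd_zero_f32();
	 *         for (size_t i = 0; i < n; i += 4) {
	 *             acc = psimd_add_f32(acc,
	 *                 psimd_mul_f32(psimd_load_f32(x + i), psimd_load_f32(y + i)));
	 *         }
	 *         return psimd_reduce_sum_f32(acc);
	 *     }
	 */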
#endif

#endif /* PSIMD_H */