1 // [Blend2D]
2 // 2D Vector Graphics Powered by a JIT Compiler.
3 //
4 // [License]
5 // Zlib - See LICENSE.md file in the package.
6
7 #ifndef BLEND2D_SIMD_X86_P_H
8 #define BLEND2D_SIMD_X86_P_H
9
10 #include "./support_p.h"
11 #include "./tables_p.h"
12
13 #if defined(_MSC_VER)
14 #include <intrin.h>
15 #endif
16
17 #if defined(BL_TARGET_OPT_SSE)
18 #include <xmmintrin.h>
19 #endif
20
21 #if defined(BL_TARGET_OPT_SSE2)
22 #include <emmintrin.h>
23 #endif
24
25 #if defined(BL_TARGET_OPT_SSE3) && !defined(_MSC_VER)
26 #include <pmmintrin.h>
27 #endif
28
29 #if defined(BL_TARGET_OPT_SSSE3)
30 #include <tmmintrin.h>
31 #endif
32
33 #if defined(BL_TARGET_OPT_SSE4_1)
34 #include <smmintrin.h>
35 #endif
36
37 #if defined(BL_TARGET_OPT_SSE4_2)
38 #include <nmmintrin.h>
39 #endif
40
41 #if defined(BL_TARGET_OPT_AVX) || defined(BL_TARGET_OPT_AVX2)
42 #include <immintrin.h>
43 #endif
44
45 #if defined(BL_TARGET_OPT_NEON)
46 #include <arm_neon.h>
47 #endif
48
49 //! \cond INTERNAL
50 //! \addtogroup blend2d_internal
51 //! \{
52
53 //! SIMD namespace contains helper functions to access SIMD intrinsics. The
54 //! names of these functions correspond to names of functions used by pipeline
55 //! generator (BLPipe).
56 namespace SIMD {
57
58 // ============================================================================
59 // [BLSIMD - Features]
60 // ============================================================================
61
// Width (in bits) of the SIMD registers usable for integer (I), single-precision
// (F), and double-precision (D) operations. AVX provides 256-bit FP registers,
// but 256-bit integer operations require AVX2, hence the 128/256 split in the
// AVX-only case. Zero means no SIMD of that kind is available.
#if defined(BL_TARGET_OPT_AVX2)
#define BL_TARGET_SIMD_I 256
#define BL_TARGET_SIMD_F 256
#define BL_TARGET_SIMD_D 256
#elif defined(BL_TARGET_OPT_AVX)
#define BL_TARGET_SIMD_I 128
#define BL_TARGET_SIMD_F 256
#define BL_TARGET_SIMD_D 256
#elif defined(BL_TARGET_OPT_SSE2)
#define BL_TARGET_SIMD_I 128
#define BL_TARGET_SIMD_F 128
#define BL_TARGET_SIMD_D 128
#else
#define BL_TARGET_SIMD_I 0
#define BL_TARGET_SIMD_F 0
#define BL_TARGET_SIMD_D 0
#endif
79
80 // ============================================================================
81 // [BLSIMD - Types]
82 // ============================================================================
83
#if defined(BL_TARGET_OPT_SSE2)
//! 128-bit integer vector (SSE2).
typedef __m128i I128;
//! 128-bit vector of 4 single-precision floats (SSE).
typedef __m128 F128;
//! 128-bit vector of 2 double-precision floats (SSE2).
typedef __m128d D128;
#endif

// 256-bit types (including integers) are accessible through AVX as AVX also
// includes conversion instructions between integer types and FP types.
#if defined(BL_TARGET_OPT_AVX)
//! 256-bit integer vector (AVX).
typedef __m256i I256;
//! 256-bit vector of 8 single-precision floats (AVX).
typedef __m256 F256;
//! 256-bit vector of 4 double-precision floats (AVX).
typedef __m256d D256;
#endif
97
98 // Must be in anonymous namespace.
99 namespace {
100
101 // ============================================================================
102 // [BLSIMD - Cast]
103 // ============================================================================
104
//! Reinterprets the data pointed to by `c` as a reference to SIMD type `Out`.
//! Used to view packed constants (e.g. `blCommonTable` entries) as vectors.
//! NOTE(review): relies on `reinterpret_cast` between unrelated object types;
//! assumes the referenced constant is suitably aligned for `Out` - confirm
//! that only alignment-guaranteed table constants are passed here.
template<typename Out, typename In>
BL_INLINE const Out& v_const_as(const In* c) noexcept {
  return *reinterpret_cast<const Out*>(c);
}
109
//! Bit-casts one SIMD register type to another without changing any bits.
//! The generic version covers identity casts; the specializations below map
//! to compiler cast intrinsics, which generate no instructions.
template<typename DstT, typename SrcT>
BL_INLINE DstT vcast(const SrcT& x) noexcept { return x; }

#if defined(BL_TARGET_OPT_SSE2)
// 128-bit casts between integer, float, and double register views.
template<> BL_INLINE F128 vcast(const I128& x) noexcept { return _mm_castsi128_ps(x); }
template<> BL_INLINE D128 vcast(const I128& x) noexcept { return _mm_castsi128_pd(x); }
template<> BL_INLINE I128 vcast(const F128& x) noexcept { return _mm_castps_si128(x); }
template<> BL_INLINE D128 vcast(const F128& x) noexcept { return _mm_castps_pd(x); }
template<> BL_INLINE I128 vcast(const D128& x) noexcept { return _mm_castpd_si128(x); }
template<> BL_INLINE F128 vcast(const D128& x) noexcept { return _mm_castpd_ps(x); }
#endif

#if defined(BL_TARGET_OPT_AVX)
// 256 <-> 128 casts. Narrowing keeps the low 128 bits; per the intrinsic
// documentation the upper 128 bits after a widening cast are undefined.
template<> BL_INLINE I128 vcast(const I256& x) noexcept { return _mm256_castsi256_si128(x); }
template<> BL_INLINE I256 vcast(const I128& x) noexcept { return _mm256_castsi128_si256(x); }

template<> BL_INLINE F128 vcast(const F256& x) noexcept { return _mm256_castps256_ps128(x); }
template<> BL_INLINE F256 vcast(const F128& x) noexcept { return _mm256_castps128_ps256(x); }

template<> BL_INLINE D128 vcast(const D256& x) noexcept { return _mm256_castpd256_pd128(x); }
template<> BL_INLINE D256 vcast(const D128& x) noexcept { return _mm256_castpd128_pd256(x); }

// 256-bit casts between integer, float, and double register views.
template<> BL_INLINE D256 vcast(const F256& x) noexcept { return _mm256_castps_pd(x); }
template<> BL_INLINE F256 vcast(const D256& x) noexcept { return _mm256_castpd_ps(x); }

template<> BL_INLINE F256 vcast(const I256& x) noexcept { return _mm256_castsi256_ps(x); }
template<> BL_INLINE I256 vcast(const F256& x) noexcept { return _mm256_castps_si256(x); }

template<> BL_INLINE D256 vcast(const I256& x) noexcept { return _mm256_castsi256_pd(x); }
template<> BL_INLINE I256 vcast(const D256& x) noexcept { return _mm256_castpd_si256(x); }
#endif
141
142 // ============================================================================
143 // [BLSIMD - I128]
144 // ============================================================================
145
#if defined(BL_TARGET_OPT_SSE2)
//! Returns an I128 with all bits cleared.
BL_INLINE I128 vzeroi128() noexcept { return _mm_setzero_si128(); }

//! Broadcasts the scalar `x` to all 8-bit lanes.
BL_INLINE I128 vseti128i8(int8_t x) noexcept { return _mm_set1_epi8(x); }
//! Broadcasts the scalar `x` to all 16-bit lanes.
BL_INLINE I128 vseti128i16(int16_t x) noexcept { return _mm_set1_epi16(x); }
//! Broadcasts the scalar `x` to all 32-bit lanes.
BL_INLINE I128 vseti128i32(int32_t x) noexcept { return _mm_set1_epi32(x); }

//! Sets 32-bit lanes to the repeated pattern [x1, x0, x1, x0] (high to low).
BL_INLINE I128 vseti128i32(int32_t x1, int32_t x0) noexcept { return _mm_set_epi32(x1, x0, x1, x0); }
//! Sets 32-bit lanes to [x3, x2, x1, x0] (high to low).
BL_INLINE I128 vseti128i32(int32_t x3, int32_t x2, int32_t x1, int32_t x0) noexcept { return _mm_set_epi32(x3, x2, x1, x0); }

//! Broadcasts the scalar `x` to both 64-bit lanes.
BL_INLINE I128 vseti128i64(int64_t x) noexcept {
#if BL_TARGET_ARCH_BITS >= 64
  return _mm_set1_epi64x(x);
#else
  // 32-bit compilers may lack _mm_set1_epi64x, so compose from 32-bit halves.
  return vseti128i32(int32_t(uint64_t(x) >> 32), int32_t(x & 0xFFFFFFFFu));
#endif
}

//! Sets 64-bit lanes to [x1, x0] (high to low), composed from 32-bit halves.
BL_INLINE I128 vseti128i64(int64_t x1, int64_t x0) noexcept {
  return vseti128i32(int32_t(uint64_t(x1) >> 32), int32_t(x1 & 0xFFFFFFFFu),
                     int32_t(uint64_t(x0) >> 32), int32_t(x0 & 0xFFFFFFFFu));
}

// Unsigned variants - forwarded to the signed implementations, the bit
// patterns are identical.
BL_INLINE I128 vseti128u8(uint8_t x) noexcept { return vseti128i8(int8_t(x)); }
BL_INLINE I128 vseti128u16(uint16_t x) noexcept { return vseti128i16(int16_t(x)); }
BL_INLINE I128 vseti128u32(uint32_t x) noexcept { return vseti128i32(int32_t(x)); }
BL_INLINE I128 vseti128u32(uint32_t x1, uint32_t x0) noexcept { return vseti128i32(int32_t(x1), int32_t(x0), int32_t(x1), int32_t(x0)); }
BL_INLINE I128 vseti128u32(uint32_t x3, uint32_t x2, uint32_t x1, uint32_t x0) noexcept { return vseti128i32(int32_t(x3), int32_t(x2), int32_t(x1), int32_t(x0)); }
BL_INLINE I128 vseti128u64(uint64_t x) noexcept { return vseti128i64(int64_t(x)); }
BL_INLINE I128 vseti128u64(uint64_t x1, uint64_t x0) noexcept { return vseti128i64(int64_t(x1), int64_t(x0)); }

//! Moves a 32-bit scalar into the low lane, zeroing the remaining lanes.
BL_INLINE I128 vcvti32i128(int32_t x) noexcept { return _mm_cvtsi32_si128(int(x)); }
BL_INLINE I128 vcvtu32i128(uint32_t x) noexcept { return _mm_cvtsi32_si128(int(x)); }

//! Extracts the low 32-bit lane as a scalar.
BL_INLINE int32_t vcvti128i32(const I128& x) noexcept { return int32_t(_mm_cvtsi128_si32(x)); }
BL_INLINE uint32_t vcvti128u32(const I128& x) noexcept { return uint32_t(_mm_cvtsi128_si32(x)); }
182
//! Moves a 64-bit scalar into the low lane, zeroing the high lane.
BL_INLINE I128 vcvti64i128(int64_t x) noexcept {
#if BL_TARGET_ARCH_BITS >= 64
  return _mm_cvtsi64_si128(x);
#else
  // 32-bit targets have no direct 64-bit GP<->XMM move; go through memory.
  return _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&x));
#endif
}

//! Extracts the low 64-bit lane as a scalar.
BL_INLINE int64_t vcvti128i64(const I128& x) noexcept {
#if BL_TARGET_ARCH_BITS >= 64
  return int64_t(_mm_cvtsi128_si64(x));
#else
  // 32-bit fallback - store the low 64 bits and read them back as a scalar.
  int64_t result;
  _mm_storel_epi64(reinterpret_cast<__m128i*>(&result), x);
  return result;
#endif
}

// Unsigned 64-bit variants - same bit patterns as the signed ones.
BL_INLINE I128 vcvtu64i128(uint64_t x) noexcept { return vcvti64i128(int64_t(x)); }
BL_INLINE uint64_t vcvti128u64(const I128& x) noexcept { return uint64_t(vcvti128i64(x)); }
203
//! Shuffles the low four 16-bit lanes; `A..D` select source lanes (A = highest).
template<uint8_t A, uint8_t B, uint8_t C, uint8_t D>
BL_INLINE I128 vswizli16(const I128& x) noexcept { return _mm_shufflelo_epi16(x, _MM_SHUFFLE(A, B, C, D)); }
//! Shuffles the high four 16-bit lanes; `A..D` select source lanes (A = highest).
template<uint8_t A, uint8_t B, uint8_t C, uint8_t D>
BL_INLINE I128 vswizhi16(const I128& x) noexcept { return _mm_shufflehi_epi16(x, _MM_SHUFFLE(A, B, C, D)); }

//! Applies the same 16-bit shuffle to both the low and the high half.
template<uint8_t A, uint8_t B, uint8_t C, uint8_t D>
BL_INLINE I128 vswizi16(const I128& x) noexcept { return vswizhi16<A, B, C, D>(vswizli16<A, B, C, D>(x)); }
//! Shuffles 32-bit lanes; `A..D` select source lanes (A = highest).
template<uint8_t A, uint8_t B, uint8_t C, uint8_t D>
BL_INLINE I128 vswizi32(const I128& x) noexcept { return _mm_shuffle_epi32(x, _MM_SHUFFLE(A, B, C, D)); }
//! Shuffles 64-bit lanes, implemented as a pairwise 32-bit shuffle.
template<int A, int B>
BL_INLINE I128 vswizi64(const I128& x) noexcept { return vswizi32<A*2 + 1, A*2, B*2 + 1, B*2>(x); }

#if defined(BL_TARGET_OPT_SSSE3)
//! Byte shuffle (PSHUFB) - each byte of `y` selects a byte of `x`.
BL_INLINE I128 vpshufb(const I128& x, const I128& y) noexcept { return _mm_shuffle_epi8(x, y); }

//! Concatenates `x:y` and extracts 16 bytes starting at byte `N_BYTES` (PALIGNR).
template<int N_BYTES>
BL_INLINE I128 vpalignr(const I128& x, const I128& y) noexcept { return _mm_alignr_epi8(x, y, N_BYTES); }
#endif

//! Swaps the two 64-bit lanes.
BL_INLINE I128 vswapi64(const I128& x) noexcept { return vswizi64<0, 1>(x); }
//! Duplicates the low 64-bit lane into both lanes.
BL_INLINE I128 vdupli64(const I128& x) noexcept { return vswizi64<0, 0>(x); }
//! Duplicates the high 64-bit lane into both lanes.
BL_INLINE I128 vduphi64(const I128& x) noexcept { return vswizi64<1, 1>(x); }
226
//! Zero-extends the low 8 bytes to eight 16-bit lanes.
BL_INLINE I128 vmovli64u8u16(const I128& x) noexcept {
#if defined(BL_TARGET_OPT_SSE4_1)
  return _mm_cvtepu8_epi16(x);
#else
  // SSE2 fallback - interleave with zero to widen.
  return _mm_unpacklo_epi8(x, _mm_setzero_si128());
#endif
}

//! Zero-extends the low four 16-bit lanes to four 32-bit lanes.
BL_INLINE I128 vmovli64u16u32(const I128& x) noexcept {
#if defined(BL_TARGET_OPT_SSE4_1)
  return _mm_cvtepu16_epi32(x);
#else
  return _mm_unpacklo_epi16(x, _mm_setzero_si128());
#endif
}

//! Zero-extends the low two 32-bit lanes to two 64-bit lanes.
BL_INLINE I128 vmovli64u32u64(const I128& x) noexcept {
#if defined(BL_TARGET_OPT_SSE4_1)
  return _mm_cvtepu32_epi64(x);
#else
  return _mm_unpacklo_epi32(x, _mm_setzero_si128());
#endif
}

// High-half widening counterparts - always via interleave with zero.
BL_INLINE I128 vmovhi64u8u16(const I128& x) noexcept { return _mm_unpackhi_epi8(x, _mm_setzero_si128()); }
BL_INLINE I128 vmovhi64u16u32(const I128& x) noexcept { return _mm_unpackhi_epi16(x, _mm_setzero_si128()); }
BL_INLINE I128 vmovhi64u32u64(const I128& x) noexcept { return _mm_unpackhi_epi32(x, _mm_setzero_si128()); }
254
//! Packs 16-bit lanes of `x` (low half) and `y` (high half) into 8-bit lanes with signed saturation.
BL_INLINE I128 vpacki16i8(const I128& x, const I128& y) noexcept { return _mm_packs_epi16(x, y); }
//! Packs 16-bit lanes into 8-bit lanes with unsigned saturation.
BL_INLINE I128 vpacki16u8(const I128& x, const I128& y) noexcept { return _mm_packus_epi16(x, y); }
//! Packs 32-bit lanes into 16-bit lanes with signed saturation.
BL_INLINE I128 vpacki32i16(const I128& x, const I128& y) noexcept { return _mm_packs_epi32(x, y); }

// Single-operand overloads - both halves of the result come from `x`.
BL_INLINE I128 vpacki16i8(const I128& x) noexcept { return vpacki16i8(x, x); }
BL_INLINE I128 vpacki16u8(const I128& x) noexcept { return vpacki16u8(x, x); }
BL_INLINE I128 vpacki32i16(const I128& x) noexcept { return vpacki32i16(x, x); }

//! Packs 32-bit lanes into 16-bit lanes with unsigned saturation.
BL_INLINE I128 vpacki32u16(const I128& x, const I128& y) noexcept {
#if defined(BL_TARGET_OPT_SSE4_1)
  return _mm_packus_epi32(x, y);
#else
  // SSE2 has no PACKUSDW. Sign-extend the low 16 bits of each lane (shift
  // left, then arithmetic shift right) so the signed pack produces the same
  // result for inputs whose values fit into 16 bits.
  I128 xShifted = _mm_srai_epi32(_mm_slli_epi32(x, 16), 16);
  I128 yShifted = _mm_srai_epi32(_mm_slli_epi32(y, 16), 16);
  return _mm_packs_epi32(xShifted, yShifted);
#endif
}

BL_INLINE I128 vpacki32u16(const I128& x) noexcept {
#if defined(BL_TARGET_OPT_SSE4_1)
  return vpacki32u16(x, x);
#else
  // Same SSE2 trick as above, applied to a single operand.
  I128 xShifted = _mm_srai_epi32(_mm_slli_epi32(x, 16), 16);
  return _mm_packs_epi32(xShifted, xShifted);
#endif
}

// 32-bit -> 8-bit packs implemented as two pack steps.
BL_INLINE I128 vpacki32i8(const I128& x) noexcept { return vpacki16i8(vpacki32i16(x)); }
BL_INLINE I128 vpacki32i8(const I128& x, const I128& y) noexcept { return vpacki16i8(vpacki32i16(x, y)); }
BL_INLINE I128 vpacki32i8(const I128& x, const I128& y, const I128& z, const I128& w) noexcept { return vpacki16i8(vpacki32i16(x, y), vpacki32i16(z, w)); }

BL_INLINE I128 vpacki32u8(const I128& x) noexcept { return vpacki16u8(vpacki32i16(x)); }
BL_INLINE I128 vpacki32u8(const I128& x, const I128& y) noexcept { return vpacki16u8(vpacki32i16(x, y)); }
BL_INLINE I128 vpacki32u8(const I128& x, const I128& y, const I128& z, const I128& w) noexcept { return vpacki16u8(vpacki32i16(x, y), vpacki32i16(z, w)); }
289
// These assume that HI bytes of all inputs are always zero, so the implementation
// can decide between packing with signed/unsigned saturation or vector swizzling.
BL_INLINE I128 vpackzzwb(const I128& x) noexcept { return vpacki16u8(x); }
BL_INLINE I128 vpackzzwb(const I128& x, const I128& y) noexcept { return vpacki16u8(x, y); }

//! Packs 32-bit lanes (whose high halves are zero) into 16-bit lanes.
BL_INLINE I128 vpackzzdw(const I128& x) noexcept {
#if defined(BL_TARGET_OPT_SSE4_1) || !defined(BL_TARGET_OPT_SSSE3)
  return vpacki32u16(x);
#else
  // SSSE3 without SSE4.1 - a single byte shuffle beats emulating PACKUSDW.
  return vpshufb(x, v_const_as<I128>(blCommonTable.i128_pshufb_u32_to_u16_lo));
#endif
}

BL_INLINE I128 vpackzzdw(const I128& x, const I128& y) noexcept {
#if defined(BL_TARGET_OPT_SSE4_1) || !defined(BL_TARGET_OPT_SSSE3)
  return vpacki32u16(x, y);
#else
  // Shuffle each operand's 16-bit values into its low half, then combine.
  I128 xLo = vpshufb(x, v_const_as<I128>(blCommonTable.i128_pshufb_u32_to_u16_lo));
  I128 yLo = vpshufb(y, v_const_as<I128>(blCommonTable.i128_pshufb_u32_to_u16_lo));
  return _mm_unpacklo_epi64(xLo, yLo);
#endif
}

//! Packs 32-bit lanes (whose high 24 bits are zero) into 8-bit lanes.
BL_INLINE I128 vpackzzdb(const I128& x) noexcept {
#if defined(BL_TARGET_OPT_SSSE3)
  return vpshufb(x, v_const_as<I128>(blCommonTable.i128_pshufb_u32_to_u8_lo));
#else
  return vpacki16u8(vpacki32i16(x));
#endif
}

BL_INLINE I128 vpackzzdb(const I128& x, const I128& y) noexcept { return vpacki16u8(vpacki32i16(x, y)); }
BL_INLINE I128 vpackzzdb(const I128& x, const I128& y, const I128& z, const I128& w) noexcept { return vpacki16u8(vpacki32i16(x, y), vpacki32i16(z, w)); }
323
// Interleaves (unpacks) lanes from the low or high halves of `x` and `y`.
BL_INLINE I128 vunpackli8(const I128& x, const I128& y) noexcept { return _mm_unpacklo_epi8(x, y); }
BL_INLINE I128 vunpackhi8(const I128& x, const I128& y) noexcept { return _mm_unpackhi_epi8(x, y); }

BL_INLINE I128 vunpackli16(const I128& x, const I128& y) noexcept { return _mm_unpacklo_epi16(x, y); }
BL_INLINE I128 vunpackhi16(const I128& x, const I128& y) noexcept { return _mm_unpackhi_epi16(x, y); }

BL_INLINE I128 vunpackli32(const I128& x, const I128& y) noexcept { return _mm_unpacklo_epi32(x, y); }
BL_INLINE I128 vunpackhi32(const I128& x, const I128& y) noexcept { return _mm_unpackhi_epi32(x, y); }

BL_INLINE I128 vunpackli64(const I128& x, const I128& y) noexcept { return _mm_unpacklo_epi64(x, y); }
BL_INLINE I128 vunpackhi64(const I128& x, const I128& y) noexcept { return _mm_unpackhi_epi64(x, y); }
335
// Bitwise operations. Note the two ANDNOT operand orders:
//   vandnot_a(x, y) computes ~x & y (native PANDN operand order),
//   vandnot_b(x, y) computes  x & ~y.
BL_INLINE I128 vor(const I128& x, const I128& y) noexcept { return _mm_or_si128(x, y); }
BL_INLINE I128 vxor(const I128& x, const I128& y) noexcept { return _mm_xor_si128(x, y); }
BL_INLINE I128 vand(const I128& x, const I128& y) noexcept { return _mm_and_si128(x, y); }
BL_INLINE I128 vandnot_a(const I128& x, const I128& y) noexcept { return _mm_andnot_si128(x, y); }
BL_INLINE I128 vandnot_b(const I128& x, const I128& y) noexcept { return _mm_andnot_si128(y, x); }
vblendmask(const I128 & x,const I128 & y,const I128 & mask)341 BL_INLINE I128 vblendmask(const I128& x, const I128& y, const I128& mask) noexcept { return vor(vandnot_a(mask, x), vand(y, mask)); }
342
//! Blend BITs or BYTEs, taking advantage of `pblendvb` (SSE4.1), if possible.
//!
//! NOTE: `pblendvb` honors only the MSB of each `mask` byte, while the SSE2
//! fallback uses every mask bit - callers must pass masks whose bytes are
//! either 0x00 or 0xFF to get identical results on both paths.
BL_INLINE I128 vblendx(const I128& x, const I128& y, const I128& mask) noexcept {
#if defined(BL_TARGET_OPT_SSE4_1)
  return _mm_blendv_epi8(x, y, mask);
#else
  return vblendmask(x, y, mask);
#endif
}
351
// Lane-wise wrapping addition.
BL_INLINE I128 vaddi8(const I128& x, const I128& y) noexcept { return _mm_add_epi8(x, y); }
BL_INLINE I128 vaddi16(const I128& x, const I128& y) noexcept { return _mm_add_epi16(x, y); }
BL_INLINE I128 vaddi32(const I128& x, const I128& y) noexcept { return _mm_add_epi32(x, y); }
BL_INLINE I128 vaddi64(const I128& x, const I128& y) noexcept { return _mm_add_epi64(x, y); }

// Lane-wise saturating addition (signed / unsigned).
BL_INLINE I128 vaddsi8(const I128& x, const I128& y) noexcept { return _mm_adds_epi8(x, y); }
BL_INLINE I128 vaddsu8(const I128& x, const I128& y) noexcept { return _mm_adds_epu8(x, y); }
BL_INLINE I128 vaddsi16(const I128& x, const I128& y) noexcept { return _mm_adds_epi16(x, y); }
BL_INLINE I128 vaddsu16(const I128& x, const I128& y) noexcept { return _mm_adds_epu16(x, y); }

// Lane-wise wrapping subtraction.
BL_INLINE I128 vsubi8(const I128& x, const I128& y) noexcept { return _mm_sub_epi8(x, y); }
BL_INLINE I128 vsubi16(const I128& x, const I128& y) noexcept { return _mm_sub_epi16(x, y); }
BL_INLINE I128 vsubi32(const I128& x, const I128& y) noexcept { return _mm_sub_epi32(x, y); }
BL_INLINE I128 vsubi64(const I128& x, const I128& y) noexcept { return _mm_sub_epi64(x, y); }

// Lane-wise saturating subtraction (signed / unsigned).
BL_INLINE I128 vsubsi8(const I128& x, const I128& y) noexcept { return _mm_subs_epi8(x, y); }
BL_INLINE I128 vsubsu8(const I128& x, const I128& y) noexcept { return _mm_subs_epu8(x, y); }
BL_INLINE I128 vsubsi16(const I128& x, const I128& y) noexcept { return _mm_subs_epi16(x, y); }
BL_INLINE I128 vsubsu16(const I128& x, const I128& y) noexcept { return _mm_subs_epu16(x, y); }

// 16-bit multiplication. The low 16 bits of the product are the same for
// signed and unsigned inputs, hence vmuli16/vmulu16 share one intrinsic.
BL_INLINE I128 vmuli16(const I128& x, const I128& y) noexcept { return _mm_mullo_epi16(x, y); }
BL_INLINE I128 vmulu16(const I128& x, const I128& y) noexcept { return _mm_mullo_epi16(x, y); }
// High 16 bits of the full 32-bit product (signed / unsigned).
BL_INLINE I128 vmulhi16(const I128& x, const I128& y) noexcept { return _mm_mulhi_epi16(x, y); }
BL_INLINE I128 vmulhu16(const I128& x, const I128& y) noexcept { return _mm_mulhi_epu16(x, y); }

#if defined(BL_TARGET_OPT_SSE4_1)
// 32-bit low multiplication (PMULLD, SSE4.1) - low half is sign-agnostic.
BL_INLINE I128 vmuli32(const I128& x, const I128& y) noexcept { return _mm_mullo_epi32(x, y); }
BL_INLINE I128 vmulu32(const I128& x, const I128& y) noexcept { return _mm_mullo_epi32(x, y); }
#endif

//! Multiplies signed 16-bit lanes and horizontally adds adjacent products
//! into 32-bit lanes (PMADDWD).
BL_INLINE I128 vmaddi16i32(const I128& x, const I128& y) noexcept { return _mm_madd_epi16(x, y); }
383
// Lane-wise shifts by a compile-time amount. A zero shift is special-cased so
// that it compiles to no instruction at all.
template<uint8_t N_BITS> BL_INLINE I128 vslli16(const I128& x) noexcept { return N_BITS ? _mm_slli_epi16(x, N_BITS) : x; }
template<uint8_t N_BITS> BL_INLINE I128 vslli32(const I128& x) noexcept { return N_BITS ? _mm_slli_epi32(x, N_BITS) : x; }
template<uint8_t N_BITS> BL_INLINE I128 vslli64(const I128& x) noexcept { return N_BITS ? _mm_slli_epi64(x, N_BITS) : x; }

// Logical (zero-filling) right shifts.
template<uint8_t N_BITS> BL_INLINE I128 vsrli16(const I128& x) noexcept { return N_BITS ? _mm_srli_epi16(x, N_BITS) : x; }
template<uint8_t N_BITS> BL_INLINE I128 vsrli32(const I128& x) noexcept { return N_BITS ? _mm_srli_epi32(x, N_BITS) : x; }
template<uint8_t N_BITS> BL_INLINE I128 vsrli64(const I128& x) noexcept { return N_BITS ? _mm_srli_epi64(x, N_BITS) : x; }

// Arithmetic (sign-filling) right shifts - only 16/32-bit variants exist here.
template<uint8_t N_BITS> BL_INLINE I128 vsrai16(const I128& x) noexcept { return N_BITS ? _mm_srai_epi16(x, N_BITS) : x; }
template<uint8_t N_BITS> BL_INLINE I128 vsrai32(const I128& x) noexcept { return N_BITS ? _mm_srai_epi32(x, N_BITS) : x; }

// Whole-register shifts by bytes (PSLLDQ / PSRLDQ).
template<uint8_t N_BYTES> BL_INLINE I128 vslli128b(const I128& x) noexcept { return N_BYTES ? _mm_slli_si128(x, N_BYTES) : x; }
template<uint8_t N_BYTES> BL_INLINE I128 vsrli128b(const I128& x) noexcept { return N_BYTES ? _mm_srli_si128(x, N_BYTES) : x; }
397
398 #if defined(BL_TARGET_OPT_SSE4_1)
vmini8(const I128 & x,const I128 & y)399 BL_INLINE I128 vmini8(const I128& x, const I128& y) noexcept { return _mm_min_epi8(x, y); }
vmaxi8(const I128 & x,const I128 & y)400 BL_INLINE I128 vmaxi8(const I128& x, const I128& y) noexcept { return _mm_max_epi8(x, y); }
401 #else
vmini8(const I128 & x,const I128 & y)402 BL_INLINE I128 vmini8(const I128& x, const I128& y) noexcept { return vblendmask(y, x, _mm_cmpgt_epi8(x, y)); }
vmaxi8(const I128 & x,const I128 & y)403 BL_INLINE I128 vmaxi8(const I128& x, const I128& y) noexcept { return vblendmask(x, y, _mm_cmpgt_epi8(x, y)); }
404 #endif
405
vminu8(const I128 & x,const I128 & y)406 BL_INLINE I128 vminu8(const I128& x, const I128& y) noexcept { return _mm_min_epu8(x, y); }
vmaxu8(const I128 & x,const I128 & y)407 BL_INLINE I128 vmaxu8(const I128& x, const I128& y) noexcept { return _mm_max_epu8(x, y); }
408
vmini16(const I128 & x,const I128 & y)409 BL_INLINE I128 vmini16(const I128& x, const I128& y) noexcept { return _mm_min_epi16(x, y); }
vmaxi16(const I128 & x,const I128 & y)410 BL_INLINE I128 vmaxi16(const I128& x, const I128& y) noexcept { return _mm_max_epi16(x, y); }
411
412 #if defined(BL_TARGET_OPT_SSE4_1)
vminu16(const I128 & x,const I128 & y)413 BL_INLINE I128 vminu16(const I128& x, const I128& y) noexcept { return _mm_min_epu16(x, y); }
vmaxu16(const I128 & x,const I128 & y)414 BL_INLINE I128 vmaxu16(const I128& x, const I128& y) noexcept { return _mm_max_epu16(x, y); }
415 #else
vminu16(const I128 & x,const I128 & y)416 BL_INLINE I128 vminu16(const I128& x, const I128& y) noexcept { return _mm_sub_epi16(x, _mm_subs_epu16(x, y)); }
vmaxu16(const I128 & x,const I128 & y)417 BL_INLINE I128 vmaxu16(const I128& x, const I128& y) noexcept { return _mm_add_epi16(x, _mm_subs_epu16(x, y)); }
418 #endif
419
420 #if defined(BL_TARGET_OPT_SSE4_1)
vmini32(const I128 & x,const I128 & y)421 BL_INLINE I128 vmini32(const I128& x, const I128& y) noexcept { return _mm_min_epi32(x, y); }
vmaxi32(const I128 & x,const I128 & y)422 BL_INLINE I128 vmaxi32(const I128& x, const I128& y) noexcept { return _mm_max_epi32(x, y); }
423 #else
vmini32(const I128 & x,const I128 & y)424 BL_INLINE I128 vmini32(const I128& x, const I128& y) noexcept { return vblendmask(y, x, _mm_cmpgt_epi32(x, y)); }
vmaxi32(const I128 & x,const I128 & y)425 BL_INLINE I128 vmaxi32(const I128& x, const I128& y) noexcept { return vblendmask(x, y, _mm_cmpgt_epi32(x, y)); }
426 #endif
427
// Lane-wise comparisons - result lanes are all-ones (true) or zero (false).
BL_INLINE I128 vcmpeqi8(const I128& x, const I128& y) noexcept { return _mm_cmpeq_epi8(x, y); }
BL_INLINE I128 vcmpgti8(const I128& x, const I128& y) noexcept { return _mm_cmpgt_epi8(x, y); }

BL_INLINE I128 vcmpeqi16(const I128& x, const I128& y) noexcept { return _mm_cmpeq_epi16(x, y); }
BL_INLINE I128 vcmpgti16(const I128& x, const I128& y) noexcept { return _mm_cmpgt_epi16(x, y); }

BL_INLINE I128 vcmpeqi32(const I128& x, const I128& y) noexcept { return _mm_cmpeq_epi32(x, y); }
BL_INLINE I128 vcmpgti32(const I128& x, const I128& y) noexcept { return _mm_cmpgt_epi32(x, y); }

#if defined(BL_TARGET_OPT_SSSE3)
// Lane-wise absolute value (PABSB/PABSW/PABSD).
BL_INLINE I128 vabsi8(const I128& x) noexcept { return _mm_abs_epi8(x); }
BL_INLINE I128 vabsi16(const I128& x) noexcept { return _mm_abs_epi16(x); }
BL_INLINE I128 vabsi32(const I128& x) noexcept { return _mm_abs_epi32(x); }
#else
// SSE2 fallbacks - the unsigned min (8-bit) and signed max (16-bit) of `x`
// and `-x` both yield |x|.
BL_INLINE I128 vabsi8(const I128& x) noexcept { return vminu8(vsubi8(vzeroi128(), x), x); }
BL_INLINE I128 vabsi16(const I128& x) noexcept { return vmaxi16(vsubi16(vzeroi128(), x), x); }
// abs(x) = (x ^ sign) - sign, where sign = x >> 31 (arithmetic shift).
BL_INLINE I128 vabsi32(const I128& x) noexcept { I128 y = vsrai32<31>(x); return vsubi32(vxor(x, y), y); }
#endif
446
//! Loads a 32-bit value into the low lane (rest zeroed); `p` may be unaligned
//! (goes through `BLMisalignedUInt`).
BL_INLINE I128 vloadi128_32(const void* p) noexcept { return _mm_cvtsi32_si128(int(*(BLMisalignedUInt<uint32_t, 1>::T*)(p))); }
//! Loads a 64-bit value into the low lane, zeroing the high lane.
BL_INLINE I128 vloadi128_64(const void* p) noexcept { return _mm_loadl_epi64(static_cast<const I128*>(p)); }
//! Loads 16 bytes from a 16-byte aligned address.
BL_INLINE I128 vloadi128a(const void* p) noexcept { return _mm_load_si128(static_cast<const I128*>(p)); }
//! Loads 16 bytes from a possibly unaligned address.
BL_INLINE I128 vloadi128u(const void* p) noexcept { return _mm_loadu_si128(static_cast<const I128*>(p)); }

#if defined(BL_TARGET_OPT_AVX2)
// Masked loads - lanes whose mask MSB is zero are not read and become zero.
BL_INLINE I128 vloadi128_mask32(const void* p, const I128& mask) noexcept { return _mm_maskload_epi32(static_cast<const int*>(p), mask); }
BL_INLINE I128 vloadi128_mask64(const void* p, const I128& mask) noexcept { return _mm_maskload_epi64(static_cast<const long long*>(p), mask); }
#endif

//! Replaces the low 64-bit half of `x` with 8 bytes loaded from `p`.
BL_INLINE I128 vloadi128_l64(const I128& x, const void* p) noexcept { return vcast<I128>(_mm_loadl_pd(vcast<D128>(x), static_cast<const double*>(p))); }
//! Replaces the high 64-bit half of `x` with 8 bytes loaded from `p`.
BL_INLINE I128 vloadi128_h64(const I128& x, const void* p) noexcept { return vcast<I128>(_mm_loadh_pd(vcast<D128>(x), static_cast<const double*>(p))); }

//! Stores the low 32 bits of `x` to `p`.
BL_INLINE void vstorei32(void* p, const I128& x) noexcept { static_cast<int*>(p)[0] = _mm_cvtsi128_si32(x); }
//! Stores the low 64 bits of `x` to `p`.
BL_INLINE void vstorei64(void* p, const I128& x) noexcept { _mm_storel_epi64(static_cast<I128*>(p), x); }
//! Stores 16 bytes to a 16-byte aligned address.
BL_INLINE void vstorei128a(void* p, const I128& x) noexcept { _mm_store_si128(static_cast<I128*>(p), x); }
//! Stores 16 bytes to a possibly unaligned address.
BL_INLINE void vstorei128u(void* p, const I128& x) noexcept { _mm_storeu_si128(static_cast<I128*>(p), x); }

//! Stores the low 64-bit half of `x` to `p`.
BL_INLINE void vstoreli64(void* p, const I128& x) noexcept { _mm_storel_epi64(static_cast<I128*>(p), x); }
//! Stores the high 64-bit half of `x` to `p`.
BL_INLINE void vstorehi64(void* p, const I128& x) noexcept { _mm_storeh_pd(static_cast<double*>(p), vcast<D128>(x)); }

#if defined(BL_TARGET_OPT_AVX2)
// Masked stores - only lanes whose mask MSB is set are written.
BL_INLINE void vstorei128_mask32(void* p, const I128& x, const I128& mask) noexcept { _mm_maskstore_epi32(static_cast<int*>(p), mask, x); }
BL_INLINE void vstorei128_mask64(void* p, const I128& x, const I128& mask) noexcept { _mm_maskstore_epi64(static_cast<long long*>(p), mask, x); }
#endif
472
#if defined(BL_TARGET_OPT_SSE4_1)
// Inserts a scalar value into lane `I`, leaving all other lanes intact.
template<uint32_t I>
BL_INLINE I128 vinsertu8(const I128& x, uint32_t y) noexcept { return _mm_insert_epi8(x, int8_t(y), I); }
template<uint32_t I>
BL_INLINE I128 vinsertu16(const I128& x, uint32_t y) noexcept { return _mm_insert_epi16(x, int16_t(y), I); }
template<uint32_t I>
BL_INLINE I128 vinsertu32(const I128& x, uint32_t y) noexcept { return _mm_insert_epi32(x, int(y), I); }

// Memory variants - the scalar is read from a possibly unaligned address.
template<uint32_t I>
BL_INLINE I128 vinsertm8(const I128& x, const void* p) noexcept { return _mm_insert_epi8(x, blMemReadU8(p), I); }
template<uint32_t I>
BL_INLINE I128 vinsertm16(const I128& x, const void* p) noexcept { return _mm_insert_epi16(x, blMemReadU16u(p), I); }
template<uint32_t I>
BL_INLINE I128 vinsertm32(const I128& x, const void* p) noexcept { return _mm_insert_epi32(x, blMemReadU32u(p), I); }
487
488 // Convenience function used by RGB24 fetchers.
489 template<uint32_t I>
vinsertm24(const I128 & x,const void * p)490 BL_INLINE I128 vinsertm24(const I128& x, const void* p) noexcept {
491 const uint8_t* p8 = static_cast<const uint8_t*>(p);
492 if ((I & 0x1) == 0)
493 return _mm_insert_epi8(_mm_insert_epi16(x, blMemReadU16u(p8), I / 2), blMemReadU8(p8 + 2), I + 2);
494 else
495 return _mm_insert_epi16(_mm_insert_epi8(x, blMemReadU8(p8), I), blMemReadU16u(p8 + 1), (I + 1) / 2);
496 }
497
// Extracts lane `I` as a zero-extended 32-bit scalar.
template<uint32_t I>
BL_INLINE uint32_t vextractu8(const I128& x) noexcept { return uint32_t(_mm_extract_epi8(x, I)); }
template<uint32_t I>
BL_INLINE uint32_t vextractu16(const I128& x) noexcept { return uint32_t(_mm_extract_epi16(x, I)); }
template<uint32_t I>
BL_INLINE uint32_t vextractu32(const I128& x) noexcept { return uint32_t(_mm_extract_epi32(x, I)); }
#endif
505
// Tests whether the sign-bit mask gathered from `x` equals the expected bit
// pattern - one mask bit per byte lane (16 bits total). Overloads exist for
// all three 128-bit register views; each just reinterprets the bits.
BL_INLINE bool vhasmaski8(const I128& x, int bits0_15) noexcept { return _mm_movemask_epi8(vcast<I128>(x)) == bits0_15; }
BL_INLINE bool vhasmaski8(const F128& x, int bits0_15) noexcept { return _mm_movemask_epi8(vcast<I128>(x)) == bits0_15; }
BL_INLINE bool vhasmaski8(const D128& x, int bits0_15) noexcept { return _mm_movemask_epi8(vcast<I128>(x)) == bits0_15; }

// Same test with one mask bit per 32-bit lane (4 bits) / 64-bit lane (2 bits).
BL_INLINE bool vhasmaski32(const I128& x, int bits0_3) noexcept { return _mm_movemask_ps(vcast<F128>(x)) == bits0_3; }
BL_INLINE bool vhasmaski64(const I128& x, int bits0_1) noexcept { return _mm_movemask_pd(vcast<D128>(x)) == bits0_1; }
512
vdiv255u16(const I128 & x)513 BL_INLINE I128 vdiv255u16(const I128& x) noexcept {
514 I128 y = vaddi16(x, v_const_as<I128>(blCommonTable.i128_0080008000800080));
515 return vmulhu16(y, v_const_as<I128>(blCommonTable.i128_0101010101010101));
516 }
517 #endif
518
519 // ============================================================================
520 // [BLSIMD - F128]
521 // ============================================================================
522
#if defined(BL_TARGET_OPT_SSE)
//! Returns an F128 vector with all elements zero.
BL_INLINE F128 vzerof128() noexcept { return _mm_setzero_ps(); }

//! Returns an F128 vector with all elements set to `x`.
BL_INLINE F128 vsetf128(float x) noexcept { return _mm_set1_ps(x); }
//! Returns an F128 vector initialized to [x3, x2, x1, x0] (x0 is the lowest element).
BL_INLINE F128 vsetf128(float x3, float x2, float x1, float x0) noexcept { return _mm_set_ps(x3, x2, x1, x0); }

//! Cast a scalar `float` to `F128` vector type.
BL_INLINE F128 vcvtf32f128(float x) noexcept {
#if defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER)
  // Works around a GCC code-generation issue with `_mm_set_ss()` by
  // reinterpreting `x` through an empty asm constraint instead.
  // See: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70708
  F128 reg;
  __asm__("" : "=x" (reg) : "0" (x));
  return reg;
#else
  return _mm_set_ss(x);
#endif
}
//! Extracts the lowest `float` element of `x`.
BL_INLINE float vcvtf128f32(const F128& x) noexcept { return _mm_cvtss_f32(x); }

// int32 <-> float conversions operating on the lowest element. The `vcvtt`
// variant truncates instead of using the current rounding mode.
BL_INLINE F128 vcvti32f128(int32_t x) noexcept { return _mm_cvtsi32_ss(vzerof128(), x); }
BL_INLINE int32_t vcvtf128i32(const F128& x) noexcept { return _mm_cvtss_si32(x); }
BL_INLINE int32_t vcvttf128i32(const F128& x) noexcept { return _mm_cvttss_si32(x); }

// 64-bit scalar conversions are only available on 64-bit targets.
#if BL_TARGET_ARCH_BITS >= 64
BL_INLINE F128 vcvti64f128(int64_t x) noexcept { return _mm_cvtsi64_ss(vzerof128(), x); }
BL_INLINE int64_t vcvtf128i64(const F128& x) noexcept { return _mm_cvtss_si64(x); }
BL_INLINE int64_t vcvttf128i64(const F128& x) noexcept { return _mm_cvttss_si64(x); }
#endif
551
// Shuffle: the template selectors pick source elements, `A` mapping to the
// highest destination element and `D` to the lowest (see `_MM_SHUFFLE`).
template<int A, int B, int C, int D>
BL_INLINE F128 vshuff32(const F128& x, const F128& y) noexcept { return _mm_shuffle_ps(x, y, _MM_SHUFFLE(A, B, C, D)); }

// Single-source swizzle. With SSE2 but no AVX the integer shuffle is used
// instead of `vshuff32(x, x)` - presumably to get non-destructive PSHUFD
// codegen; TODO(review) confirm the motivation.
template<int A, int B, int C, int D>
BL_INLINE F128 vswizf32(const F128& x) noexcept {
#if defined(BL_TARGET_OPT_SSE2) && !defined(BL_TARGET_OPT_AVX)
  return vcast<F128>(vswizi32<A, B, C, D>(vcast<I128>(x)));
#else
  return vshuff32<A, B, C, D>(x, x);
#endif
}

// 64-bit swizzle expressed as a pair-wise 32-bit swizzle.
template<int A, int B>
BL_INLINE F128 vswizf64(const F128& x) noexcept {
#if defined(BL_TARGET_OPT_SSE2) && !defined(BL_TARGET_OPT_AVX)
  return vcast<F128>(vswizi64<A, B>(vcast<I128>(x)));
#else
  return vswizf32<A*2 + 1, A*2, B*2 + 1, B*2>(x);
#endif
}

// Duplicate even (l) / odd (h) 32-bit elements.
BL_INLINE F128 vduplf32(const F128& x) noexcept { return vswizf32<2, 2, 0, 0>(x); }
BL_INLINE F128 vduphf32(const F128& x) noexcept { return vswizf32<3, 3, 1, 1>(x); }

// Swap / duplicate the 64-bit halves.
BL_INLINE F128 vswapf64(const F128& x) noexcept { return vswizf64<0, 1>(x); }
BL_INLINE F128 vduplf64(const F128& x) noexcept { return vswizf64<0, 0>(x); }
BL_INLINE F128 vduphf64(const F128& x) noexcept { return vswizf64<1, 1>(x); }
579
// Interleave the low / high 32-bit elements of `x` and `y`.
BL_INLINE F128 vunpacklf32(const F128& x, const F128& y) noexcept { return _mm_unpacklo_ps(x, y); }
BL_INLINE F128 vunpackhf32(const F128& x, const F128& y) noexcept { return _mm_unpackhi_ps(x, y); }

// Bitwise operations (F128 overloads).
BL_INLINE F128 vor(const F128& x, const F128& y) noexcept { return _mm_or_ps(x, y); }
BL_INLINE F128 vxor(const F128& x, const F128& y) noexcept { return _mm_xor_ps(x, y); }
BL_INLINE F128 vand(const F128& x, const F128& y) noexcept { return _mm_and_ps(x, y); }
//! Returns `~x & y` (the first operand is complemented).
BL_INLINE F128 vandnot_a(const F128& x, const F128& y) noexcept { return _mm_andnot_ps(x, y); }
//! Returns `x & ~y` (the second operand is complemented).
BL_INLINE F128 vandnot_b(const F128& x, const F128& y) noexcept { return _mm_andnot_ps(y, x); }
//! Bit-select: takes bits of `y` where `mask` bits are 1, bits of `x` elsewhere.
BL_INLINE F128 vblendmask(const F128& x, const F128& y, const F128& mask) noexcept { return vor(vandnot_a(mask, x), vand(y, mask)); }

// Arithmetic - `ss` variants operate on the lowest element only (upper
// elements pass through from `x`), `ps` variants operate on all elements.
BL_INLINE F128 vaddss(const F128& x, const F128& y) noexcept { return _mm_add_ss(x, y); }
BL_INLINE F128 vaddps(const F128& x, const F128& y) noexcept { return _mm_add_ps(x, y); }

BL_INLINE F128 vsubss(const F128& x, const F128& y) noexcept { return _mm_sub_ss(x, y); }
BL_INLINE F128 vsubps(const F128& x, const F128& y) noexcept { return _mm_sub_ps(x, y); }

BL_INLINE F128 vmulss(const F128& x, const F128& y) noexcept { return _mm_mul_ss(x, y); }
BL_INLINE F128 vmulps(const F128& x, const F128& y) noexcept { return _mm_mul_ps(x, y); }

BL_INLINE F128 vdivss(const F128& x, const F128& y) noexcept { return _mm_div_ss(x, y); }
BL_INLINE F128 vdivps(const F128& x, const F128& y) noexcept { return _mm_div_ps(x, y); }

BL_INLINE F128 vminss(const F128& x, const F128& y) noexcept { return _mm_min_ss(x, y); }
BL_INLINE F128 vminps(const F128& x, const F128& y) noexcept { return _mm_min_ps(x, y); }

BL_INLINE F128 vmaxss(const F128& x, const F128& y) noexcept { return _mm_max_ss(x, y); }
BL_INLINE F128 vmaxps(const F128& x, const F128& y) noexcept { return _mm_max_ps(x, y); }

// Comparisons - produce all-ones / all-zeros masks per element.
BL_INLINE F128 vcmpeqss(const F128& x, const F128& y) noexcept { return _mm_cmpeq_ss(x, y); }
BL_INLINE F128 vcmpeqps(const F128& x, const F128& y) noexcept { return _mm_cmpeq_ps(x, y); }

BL_INLINE F128 vcmpness(const F128& x, const F128& y) noexcept { return _mm_cmpneq_ss(x, y); }
BL_INLINE F128 vcmpneps(const F128& x, const F128& y) noexcept { return _mm_cmpneq_ps(x, y); }

BL_INLINE F128 vcmpgess(const F128& x, const F128& y) noexcept { return _mm_cmpge_ss(x, y); }
BL_INLINE F128 vcmpgeps(const F128& x, const F128& y) noexcept { return _mm_cmpge_ps(x, y); }

BL_INLINE F128 vcmpgtss(const F128& x, const F128& y) noexcept { return _mm_cmpgt_ss(x, y); }
BL_INLINE F128 vcmpgtps(const F128& x, const F128& y) noexcept { return _mm_cmpgt_ps(x, y); }

BL_INLINE F128 vcmpless(const F128& x, const F128& y) noexcept { return _mm_cmple_ss(x, y); }
BL_INLINE F128 vcmpleps(const F128& x, const F128& y) noexcept { return _mm_cmple_ps(x, y); }

BL_INLINE F128 vcmpltss(const F128& x, const F128& y) noexcept { return _mm_cmplt_ss(x, y); }
BL_INLINE F128 vcmpltps(const F128& x, const F128& y) noexcept { return _mm_cmplt_ps(x, y); }

BL_INLINE F128 vsqrtss(const F128& x) noexcept { return _mm_sqrt_ss(x); }
BL_INLINE F128 vsqrtps(const F128& x) noexcept { return _mm_sqrt_ps(x); }
628
// Partial loads - bring 32 / 64 bits into the low part of the register.
BL_INLINE F128 vloadf128_32(const void* p) noexcept { return _mm_load_ss(static_cast<const float*>(p)); }
BL_INLINE F128 vloadf128_64(const void* p) noexcept { return vcast<F128>(vloadi128_64(p)); }

// Full loads - `a` requires 16-byte alignment, `u` allows unaligned memory.
BL_INLINE F128 vloadf128a(const void* p) noexcept { return _mm_load_ps(static_cast<const float*>(p)); }
BL_INLINE F128 vloadf128u(const void* p) noexcept { return _mm_loadu_ps(static_cast<const float*>(p)); }

// Replace the low / high 64-bit half of `x` with data loaded from `p`.
BL_INLINE F128 vloadf128_l64(const F128& x, const void* p) noexcept { return _mm_loadl_pi(x, static_cast<const __m64*>(p)); }
BL_INLINE F128 vloadf128_h64(const F128& x, const void* p) noexcept { return _mm_loadh_pi(x, static_cast<const __m64*>(p)); }

#if defined(BL_TARGET_OPT_AVX)
//! Masked load (AVX): loads only 32-bit lanes whose `mask` sign bit is set.
BL_INLINE F128 vloadf128_mask32(const void* p, const F128& mask) noexcept { return _mm_maskload_ps(static_cast<const float*>(p), vcast<I128>(mask)); }
#endif

//! Loads 64 bits from `p` and duplicates them into both halves.
BL_INLINE F128 vbroadcastf128_64(const void* p) noexcept {
#if defined(BL_TARGET_OPT_SSE3)
  // SSE3 has a dedicated load+duplicate instruction (MOVDDUP).
  return vcast<F128>(_mm_loaddup_pd(static_cast<const double*>(p)));
#else
  return vduplf64(vloadf128_64(p));
#endif
}

// Stores. NOTE: `vstoref64` and `vstorelf64` are aliases - both store the
// low 64-bit half; `vstorehf64` stores the high half.
BL_INLINE void vstoref32(void* p, const F128& x) noexcept { _mm_store_ss(static_cast<float*>(p), x); }
BL_INLINE void vstoref64(void* p, const F128& x) noexcept { _mm_storel_pi(static_cast<__m64*>(p), x); }
BL_INLINE void vstorelf64(void* p, const F128& x) noexcept { _mm_storel_pi(static_cast<__m64*>(p), x); }
BL_INLINE void vstorehf64(void* p, const F128& x) noexcept { _mm_storeh_pi(static_cast<__m64*>(p), x); }
BL_INLINE void vstoref128a(void* p, const F128& x) noexcept { _mm_store_ps(static_cast<float*>(p), x); }
BL_INLINE void vstoref128u(void* p, const F128& x) noexcept { _mm_storeu_ps(static_cast<float*>(p), x); }

#if defined(BL_TARGET_OPT_AVX)
//! Masked store (AVX): writes only 32-bit lanes whose `mask` sign bit is set.
BL_INLINE void vstoref128_mask32(void* p, const F128& x, const F128& mask) noexcept { _mm_maskstore_ps(static_cast<float*>(p), vcast<I128>(mask), x); }
#endif

// Sign-bit mask tests (see `vhasmaski8` family for the convention).
BL_INLINE bool vhasmaskf32(const F128& x, int bits0_3) noexcept { return _mm_movemask_ps(vcast<F128>(x)) == bits0_3; }
BL_INLINE bool vhasmaskf64(const F128& x, int bits0_1) noexcept { return _mm_movemask_pd(vcast<D128>(x)) == bits0_1; }
663
664 // ============================================================================
665 // [BLSIMD - D128]
666 // ============================================================================
667
#if defined(BL_TARGET_OPT_SSE2)
//! Returns a D128 vector with both elements zero.
BL_INLINE D128 vzerod128() noexcept { return _mm_setzero_pd(); }

//! Returns a D128 vector with both elements set to `x`.
BL_INLINE D128 vsetd128(double x) noexcept { return _mm_set1_pd(x); }
//! Returns a D128 vector initialized to [x1, x0] (x0 is the lowest element).
BL_INLINE D128 vsetd128(double x1, double x0) noexcept { return _mm_set_pd(x1, x0); }

//! Cast a scalar `double` to `D128` vector type.
BL_INLINE D128 vcvtd64d128(double x) noexcept {
#if defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER)
  // Works around a GCC code-generation issue with `_mm_set_sd()` by
  // reinterpreting `x` through an empty asm constraint instead.
  // See: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70708
  D128 reg;
  __asm__("" : "=x" (reg) : "0" (x));
  return reg;
#else
  return _mm_set_sd(x);
#endif
}
//! Extracts the lowest `double` element of `x`.
BL_INLINE double vcvtd128d64(const D128& x) noexcept { return _mm_cvtsd_f64(x); }

// int32 <-> double conversions operating on the lowest element. The `vcvtt`
// variant truncates instead of using the current rounding mode.
BL_INLINE D128 vcvti32d128(int32_t x) noexcept { return _mm_cvtsi32_sd(vzerod128(), x); }
BL_INLINE int32_t vcvtd128i32(const D128& x) noexcept { return _mm_cvtsd_si32(x); }
BL_INLINE int32_t vcvttd128i32(const D128& x) noexcept { return _mm_cvttsd_si32(x); }

// 64-bit scalar conversions are only available on 64-bit targets.
#if BL_TARGET_ARCH_BITS >= 64
BL_INLINE D128 vcvti64d128(int64_t x) noexcept { return _mm_cvtsi64_sd(vzerod128(), x); }
BL_INLINE int64_t vcvtd128i64(const D128& x) noexcept { return _mm_cvtsd_si64(x); }
BL_INLINE int64_t vcvttd128i64(const D128& x) noexcept { return _mm_cvttsd_si64(x); }
#endif
696
// Packed conversions between float / double / int32 vectors. Narrowing
// conversions (pd->ps, pd->epi32) fill only the low half of the result.
BL_INLINE D128 vcvtf128d128(const F128& x) noexcept { return _mm_cvtps_pd(x); }
BL_INLINE F128 vcvtd128f128(const D128& x) noexcept { return _mm_cvtpd_ps(x); }

BL_INLINE F128 vcvti128f128(const I128& x) noexcept { return _mm_cvtepi32_ps(x); }
BL_INLINE D128 vcvti128d128(const I128& x) noexcept { return _mm_cvtepi32_pd(x); }

// Rounding mode converts, and `vcvtt` truncating converts, to int32.
BL_INLINE I128 vcvtf128i128(const F128& x) noexcept { return _mm_cvtps_epi32(x); }
BL_INLINE I128 vcvttf128i128(const F128& x) noexcept { return _mm_cvttps_epi32(x); }

BL_INLINE I128 vcvtd128i128(const D128& x) noexcept { return _mm_cvtpd_epi32(x); }
BL_INLINE I128 vcvttd128i128(const D128& x) noexcept { return _mm_cvttpd_epi32(x); }

// Shuffle: result low element is `x[B]`, high element is `y[A]`.
template<int A, int B>
BL_INLINE D128 vshufd64(const D128& x, const D128& y) noexcept { return _mm_shuffle_pd(x, y, (A << 1) | B); }

// Single-source swizzle. Without AVX the integer swizzle is used - presumably
// for better codegen; TODO(review) confirm the motivation.
template<int A, int B>
BL_INLINE D128 vswizd64(const D128& x) noexcept {
#if !defined(BL_TARGET_OPT_AVX)
  return vcast<D128>(vswizi64<A, B>(vcast<I128>(x)));
#else
  return vshufd64<A, B>(x, x);
#endif
}

// Swap / duplicate the two 64-bit elements.
BL_INLINE D128 vswapd64(const D128& x) noexcept { return vswizd64<0, 1>(x); }
BL_INLINE D128 vdupld64(const D128& x) noexcept { return vswizd64<0, 0>(x); }
BL_INLINE D128 vduphd64(const D128& x) noexcept { return vswizd64<1, 1>(x); }
724
// Interleave the low / high 64-bit elements of `x` and `y`.
BL_INLINE D128 vunpackld64(const D128& x, const D128& y) noexcept { return _mm_unpacklo_pd(x, y); }
BL_INLINE D128 vunpackhd64(const D128& x, const D128& y) noexcept { return _mm_unpackhi_pd(x, y); }

// Bitwise operations (D128 overloads).
BL_INLINE D128 vor(const D128& x, const D128& y) noexcept { return _mm_or_pd(x, y); }
BL_INLINE D128 vxor(const D128& x, const D128& y) noexcept { return _mm_xor_pd(x, y); }
BL_INLINE D128 vand(const D128& x, const D128& y) noexcept { return _mm_and_pd(x, y); }
//! Returns `~x & y` (the first operand is complemented).
BL_INLINE D128 vandnot_a(const D128& x, const D128& y) noexcept { return _mm_andnot_pd(x, y); }
//! Returns `x & ~y` (the second operand is complemented).
BL_INLINE D128 vandnot_b(const D128& x, const D128& y) noexcept { return _mm_andnot_pd(y, x); }
//! Bit-select: takes bits of `y` where `mask` bits are 1, bits of `x` elsewhere.
BL_INLINE D128 vblendmask(const D128& x, const D128& y, const D128& mask) noexcept { return vor(vandnot_a(mask, x), vand(y, mask)); }

// Arithmetic - `sd` variants operate on the lowest element only (upper
// element passes through from `x`), `pd` variants operate on both elements.
BL_INLINE D128 vaddsd(const D128& x, const D128& y) noexcept { return _mm_add_sd(x, y); }
BL_INLINE D128 vaddpd(const D128& x, const D128& y) noexcept { return _mm_add_pd(x, y); }

BL_INLINE D128 vsubsd(const D128& x, const D128& y) noexcept { return _mm_sub_sd(x, y); }
BL_INLINE D128 vsubpd(const D128& x, const D128& y) noexcept { return _mm_sub_pd(x, y); }

BL_INLINE D128 vmulsd(const D128& x, const D128& y) noexcept { return _mm_mul_sd(x, y); }
BL_INLINE D128 vmulpd(const D128& x, const D128& y) noexcept { return _mm_mul_pd(x, y); }

BL_INLINE D128 vdivsd(const D128& x, const D128& y) noexcept { return _mm_div_sd(x, y); }
BL_INLINE D128 vdivpd(const D128& x, const D128& y) noexcept { return _mm_div_pd(x, y); }

BL_INLINE D128 vminsd(const D128& x, const D128& y) noexcept { return _mm_min_sd(x, y); }
BL_INLINE D128 vminpd(const D128& x, const D128& y) noexcept { return _mm_min_pd(x, y); }

BL_INLINE D128 vmaxsd(const D128& x, const D128& y) noexcept { return _mm_max_sd(x, y); }
BL_INLINE D128 vmaxpd(const D128& x, const D128& y) noexcept { return _mm_max_pd(x, y); }

// Comparisons - produce all-ones / all-zeros masks per element.
BL_INLINE D128 vcmpeqsd(const D128& x, const D128& y) noexcept { return _mm_cmpeq_sd(x, y); }
BL_INLINE D128 vcmpeqpd(const D128& x, const D128& y) noexcept { return _mm_cmpeq_pd(x, y); }

BL_INLINE D128 vcmpnesd(const D128& x, const D128& y) noexcept { return _mm_cmpneq_sd(x, y); }
BL_INLINE D128 vcmpnepd(const D128& x, const D128& y) noexcept { return _mm_cmpneq_pd(x, y); }

BL_INLINE D128 vcmpgesd(const D128& x, const D128& y) noexcept { return _mm_cmpge_sd(x, y); }
BL_INLINE D128 vcmpgepd(const D128& x, const D128& y) noexcept { return _mm_cmpge_pd(x, y); }

BL_INLINE D128 vcmpgtsd(const D128& x, const D128& y) noexcept { return _mm_cmpgt_sd(x, y); }
BL_INLINE D128 vcmpgtpd(const D128& x, const D128& y) noexcept { return _mm_cmpgt_pd(x, y); }

BL_INLINE D128 vcmplesd(const D128& x, const D128& y) noexcept { return _mm_cmple_sd(x, y); }
BL_INLINE D128 vcmplepd(const D128& x, const D128& y) noexcept { return _mm_cmple_pd(x, y); }

BL_INLINE D128 vcmpltsd(const D128& x, const D128& y) noexcept { return _mm_cmplt_sd(x, y); }
BL_INLINE D128 vcmpltpd(const D128& x, const D128& y) noexcept { return _mm_cmplt_pd(x, y); }

BL_INLINE D128 vsqrtsd(const D128& x) noexcept { return _mm_sqrt_sd(x, x); }
BL_INLINE D128 vsqrtpd(const D128& x) noexcept { return _mm_sqrt_pd(x); }
773
// Loads - `_64` loads a single double into the low element, `a` requires
// 16-byte alignment, `u` allows unaligned memory.
BL_INLINE D128 vloadd128_64(const void* p) noexcept { return _mm_load_sd(static_cast<const double*>(p)); }
BL_INLINE D128 vloadd128a(const void* p) noexcept { return _mm_load_pd(static_cast<const double*>(p)); }
BL_INLINE D128 vloadd128u(const void* p) noexcept { return _mm_loadu_pd(static_cast<const double*>(p)); }

#if defined(BL_TARGET_OPT_AVX)
//! Masked load (AVX): loads only 64-bit lanes whose `mask` sign bit is set.
BL_INLINE D128 vloadd128_mask64(const void* p, const D128& mask) noexcept { return _mm_maskload_pd(static_cast<const double*>(p), vcast<I128>(mask)); }
#endif

// Replace the low / high element of `x` with a double loaded from `p`.
BL_INLINE D128 vloadd128_l64(const D128& x, const void* p) noexcept { return _mm_loadl_pd(x, static_cast<const double*>(p)); }
BL_INLINE D128 vloadd128_h64(const D128& x, const void* p) noexcept { return _mm_loadh_pd(x, static_cast<const double*>(p)); }

//! Loads a double from `p` and duplicates it into both elements.
BL_INLINE D128 vbroadcastd128_64(const void* p) noexcept {
#if defined(BL_TARGET_OPT_SSE3)
  // SSE3 has a dedicated load+duplicate instruction (MOVDDUP).
  return _mm_loaddup_pd(static_cast<const double*>(p));
#else
  return vdupld64(vloadd128_64(p));
#endif
}

// Stores. NOTE: `vstored64` and `vstoreld64` both store the low element;
// `vstorehd64` stores the high element.
BL_INLINE void vstored64(void* p, const D128& x) noexcept { _mm_store_sd(static_cast<double*>(p), x); }
BL_INLINE void vstoreld64(void* p, const D128& x) noexcept { _mm_storel_pd(static_cast<double*>(p), x); }
BL_INLINE void vstorehd64(void* p, const D128& x) noexcept { _mm_storeh_pd(static_cast<double*>(p), x); }
BL_INLINE void vstored128a(void* p, const D128& x) noexcept { _mm_store_pd(static_cast<double*>(p), x); }
BL_INLINE void vstored128u(void* p, const D128& x) noexcept { _mm_storeu_pd(static_cast<double*>(p), x); }

#if defined(BL_TARGET_OPT_AVX)
//! Masked store (AVX): writes only 64-bit lanes whose `mask` sign bit is set.
BL_INLINE void vstored128_mask64(void* p, const D128& x, const D128& mask) noexcept { _mm_maskstore_pd(static_cast<double*>(p), vcast<I128>(mask), x); }
#endif

// Sign-bit mask test (see `vhasmaski8` family for the convention).
BL_INLINE bool vhasmaskd64(const D128& x, int bits0_1) noexcept { return _mm_movemask_pd(vcast<D128>(x)) == bits0_1; }
#endif
805
806 // ============================================================================
807 // [BLSIMD::I256]
808 // ============================================================================
809
#if defined(BL_TARGET_OPT_AVX)
//! Returns an I256 vector with all elements zero.
BL_INLINE I256 vzeroi256() noexcept { return _mm256_setzero_si256(); }

//! Converts 8 int32 elements to 8 floats.
BL_INLINE F256 vcvti256f256(const I256& x) noexcept { return _mm256_cvtepi32_ps(x); }
//! Converts the 4 int32 elements of a 128-bit vector to 4 doubles.
BL_INLINE D256 vcvti128d256(const I128& x) noexcept { return _mm256_cvtepi32_pd(vcast<I128>(x)); }
//! Converts the low 4 int32 elements of `x` to 4 doubles (high half ignored).
BL_INLINE D256 vcvti256d256(const I256& x) noexcept { return _mm256_cvtepi32_pd(vcast<I128>(x)); }
#endif
817
#if defined(BL_TARGET_OPT_AVX2)
// Broadcast a scalar to all elements of an I256 vector.
BL_INLINE I256 vseti256i8(int8_t x) noexcept { return _mm256_set1_epi8(x); }
BL_INLINE I256 vseti256i16(int16_t x) noexcept { return _mm256_set1_epi16(x); }

BL_INLINE I256 vseti256i32(int32_t x) noexcept { return _mm256_set1_epi32(x); }
// Partial initializers repeat the given pattern across the whole vector;
// the 8-argument overload sets all elements (x0 is the lowest).
BL_INLINE I256 vseti256i32(int32_t x1, int32_t x0) noexcept { return _mm256_set_epi32(x1, x0, x1, x0, x1, x0, x1, x0); }
BL_INLINE I256 vseti256i32(int32_t x3, int32_t x2, int32_t x1, int32_t x0) noexcept { return _mm256_set_epi32(x3, x2, x1, x0, x3, x2, x1, x0); }
BL_INLINE I256 vseti256i32(int32_t x7, int32_t x6, int32_t x5, int32_t x4, int32_t x3, int32_t x2, int32_t x1, int32_t x0) noexcept { return _mm256_set_epi32(x7, x6, x5, x4, x3, x2, x1, x0); }

BL_INLINE I256 vseti256i64(int64_t x) noexcept {
#if BL_TARGET_ARCH_BITS >= 64
  return _mm256_set1_epi64x(x);
#else
  // 32-bit targets: compose the 64-bit value from its hi/lo dwords because
  // `_mm256_set1_epi64x` may not be available there.
  return vseti256i32(int32_t(uint64_t(x) >> 32), int32_t(x & 0xFFFFFFFFu));
#endif
}

BL_INLINE I256 vseti256i64(int64_t x1, int64_t x0) noexcept {
  return vseti256i32(int32_t(uint64_t(x1) >> 32), int32_t(x1 & 0xFFFFFFFFu),
                     int32_t(uint64_t(x0) >> 32), int32_t(x0 & 0xFFFFFFFFu),
                     int32_t(uint64_t(x1) >> 32), int32_t(x1 & 0xFFFFFFFFu),
                     int32_t(uint64_t(x0) >> 32), int32_t(x0 & 0xFFFFFFFFu));
}

BL_INLINE I256 vseti256i64(int64_t x3, int64_t x2, int64_t x1, int64_t x0) noexcept {
  return vseti256i32(int32_t(uint64_t(x3) >> 32), int32_t(x3 & 0xFFFFFFFFu),
                     int32_t(uint64_t(x2) >> 32), int32_t(x2 & 0xFFFFFFFFu),
                     int32_t(uint64_t(x1) >> 32), int32_t(x1 & 0xFFFFFFFFu),
                     int32_t(uint64_t(x0) >> 32), int32_t(x0 & 0xFFFFFFFFu));
}

// Unsigned variants - thin casts over the signed initializers.
BL_INLINE I256 vseti256u8(uint8_t x) noexcept { return vseti256i8(int8_t(x)); }
BL_INLINE I256 vseti256u16(uint16_t x) noexcept { return vseti256i16(int16_t(x)); }
BL_INLINE I256 vseti256u32(uint32_t x) noexcept { return vseti256i32(int32_t(x)); }
BL_INLINE I256 vseti256u64(uint64_t x) noexcept { return vseti256i64(int64_t(x)); }

BL_INLINE I256 vseti256u32(uint32_t x1, uint32_t x0) noexcept {
  return vseti256i32(int32_t(x1), int32_t(x0), int32_t(x1), int32_t(x0),
                     int32_t(x1), int32_t(x0), int32_t(x1), int32_t(x0));
}

BL_INLINE I256 vseti256u32(uint32_t x3, uint32_t x2, uint32_t x1, uint32_t x0) noexcept {
  return vseti256i32(int32_t(x3), int32_t(x2), int32_t(x1), int32_t(x0),
                     int32_t(x3), int32_t(x2), int32_t(x1), int32_t(x0));
}

BL_INLINE I256 vseti256u32(uint32_t x7, uint32_t x6, uint32_t x5, uint32_t x4, uint32_t x3, uint32_t x2, uint32_t x1, uint32_t x0) noexcept {
  return vseti256i32(int32_t(x7), int32_t(x6), int32_t(x5), int32_t(x4),
                     int32_t(x3), int32_t(x2), int32_t(x1), int32_t(x0));
}

BL_INLINE I256 vseti256u64(uint64_t x1, uint64_t x0) noexcept {
  return vseti256i64(int64_t(x1), int64_t(x0));
}

BL_INLINE I256 vseti256u64(uint64_t x3, uint64_t x2, uint64_t x1, uint64_t x0) noexcept {
  return vseti256i64(int64_t(x3), int64_t(x2), int64_t(x1), int64_t(x0));
}
876
// Scalar <-> I256 conversions - operate on the lowest element and delegate
// to the I128 counterparts, reinterpreting the register via `vcast`.
BL_INLINE I256 vcvti32i256(int32_t x) noexcept { return vcast<I256>(vcvti32i128(x)); }
BL_INLINE I256 vcvtu32i256(uint32_t x) noexcept { return vcast<I256>(vcvtu32i128(x)); }

BL_INLINE int32_t vcvti256i32(const I256& x) noexcept { return vcvti128i32(vcast<I128>(x)); }
BL_INLINE uint32_t vcvti256u32(const I256& x) noexcept { return vcvti128u32(vcast<I128>(x)); }

BL_INLINE I256 vcvti64i256(int64_t x) noexcept { return vcast<I256>(vcvti64i128(x)); }
BL_INLINE I256 vcvtu64i256(uint64_t x) noexcept { return vcast<I256>(vcvtu64i128(x)); }

BL_INLINE int64_t vcvti256i64(const I256& x) noexcept { return vcvti128i64(vcast<I128>(x)); }
BL_INLINE uint64_t vcvti256u64(const I256& x) noexcept { return vcvti128u64(vcast<I128>(x)); }
888
// Permutes 128-bit halves of `x`/`y`: selector `B` picks the low result half,
// `A` the high one (see `_mm256_permute2x128_si256` for selector encoding).
template<int A, int B>
BL_INLINE I256 vpermi128(const I256& x, const I256& y) noexcept { return _mm256_permute2x128_si256(x, y, ((A & 0xF) << 4) + (B & 0xF)); }
template<int A, int B>
BL_INLINE I256 vpermi128(const I256& x) noexcept { return vpermi128<A, B>(x, x); }

// Swizzles. NOTE: AVX2 shuffles operate on each 128-bit lane independently.
template<uint8_t A, uint8_t B, uint8_t C, uint8_t D>
BL_INLINE I256 vswizli16(const I256& x) noexcept { return _mm256_shufflelo_epi16(x, _MM_SHUFFLE(A, B, C, D)); }
template<uint8_t A, uint8_t B, uint8_t C, uint8_t D>
BL_INLINE I256 vswizhi16(const I256& x) noexcept { return _mm256_shufflehi_epi16(x, _MM_SHUFFLE(A, B, C, D)); }

template<uint8_t A, uint8_t B, uint8_t C, uint8_t D>
BL_INLINE I256 vswizi16(const I256& x) noexcept { return vswizhi16<A, B, C, D>(vswizli16<A, B, C, D>(x)); }
template<uint8_t A, uint8_t B, uint8_t C, uint8_t D>
BL_INLINE I256 vswizi32(const I256& x) noexcept { return _mm256_shuffle_epi32(x, _MM_SHUFFLE(A, B, C, D)); }
// 64-bit swizzle expressed as a pair-wise 32-bit swizzle.
template<int A, int B>
BL_INLINE I256 vswizi64(const I256& x) noexcept { return vswizi32<A*2 + 1, A*2, B*2 + 1, B*2>(x); }

//! Byte shuffle (VPSHUFB) - `y` provides per-byte indices, per 128-bit lane.
BL_INLINE I256 vpshufb(const I256& x, const I256& y) noexcept { return _mm256_shuffle_epi8(x, y); }

//! Byte-wise alignment shift (VPALIGNR), per 128-bit lane.
template<int N_BYTES>
BL_INLINE I256 vpalignr(const I256& x, const I256& y) noexcept { return _mm256_alignr_epi8(x, y, N_BYTES); }

// Broadcast the lowest 8/16/32/64-bit element of the low 128-bit part to the
// whole 256-bit vector. I256 overloads just ignore the source's high half.
BL_INLINE I256 vsplati8i256(const I128& x) noexcept { return _mm256_broadcastb_epi8(vcast<I128>(x)); }
BL_INLINE I256 vsplati8i256(const I256& x) noexcept { return _mm256_broadcastb_epi8(vcast<I128>(x)); }

BL_INLINE I256 vsplati16i256(const I128& x) noexcept { return _mm256_broadcastw_epi16(vcast<I128>(x)); }
BL_INLINE I256 vsplati16i256(const I256& x) noexcept { return _mm256_broadcastw_epi16(vcast<I128>(x)); }

BL_INLINE I256 vsplati32i256(const I128& x) noexcept { return _mm256_broadcastd_epi32(vcast<I128>(x)); }
BL_INLINE I256 vsplati32i256(const I256& x) noexcept { return _mm256_broadcastd_epi32(vcast<I128>(x)); }

BL_INLINE I256 vsplati64i256(const I128& x) noexcept { return _mm256_broadcastq_epi64(vcast<I128>(x)); }
BL_INLINE I256 vsplati64i256(const I256& x) noexcept { return _mm256_broadcastq_epi64(vcast<I128>(x)); }

// Swap / duplicate 64-bit elements (per lane) and 128-bit halves.
BL_INLINE I256 vswapi64(const I256& x) noexcept { return vswizi64<0, 1>(x); }
BL_INLINE I256 vdupli64(const I256& x) noexcept { return vswizi64<0, 0>(x); }
BL_INLINE I256 vduphi64(const I256& x) noexcept { return vswizi64<1, 1>(x); }

BL_INLINE I256 vswapi128(const I256& x) noexcept { return vpermi128<0, 1>(x); }
BL_INLINE I256 vdupli128(const I128& x) noexcept { return vpermi128<0, 0>(vcast<I256>(x)); }
BL_INLINE I256 vdupli128(const I256& x) noexcept { return vpermi128<0, 0>(x); }
BL_INLINE I256 vduphi128(const I256& x) noexcept { return vpermi128<1, 1>(x); }
931
//! Zero-extends the low sixteen u8 elements of `x` to sixteen u16 elements.
BL_INLINE I256 vmovli128u8u16(const I128& x) noexcept { return _mm256_cvtepu8_epi16(x); }
//! Zero-extends the low eight u8 elements of `x` to eight u32 elements.
BL_INLINE I256 vmovli128u8u32(const I128& x) noexcept { return _mm256_cvtepu8_epi32(x); }
//! Zero-extends the low four u8 elements of `x` to four u64 elements.
BL_INLINE I256 vmovli128u8u64(const I128& x) noexcept { return _mm256_cvtepu8_epi64(x); }
//! Zero-extends the low eight u16 elements of `x` to eight u32 elements.
BL_INLINE I256 vmovli128u16u32(const I128& x) noexcept { return _mm256_cvtepu16_epi32(x); }
//! Zero-extends the low four u16 elements of `x` to four u64 elements.
BL_INLINE I256 vmovli128u16u64(const I128& x) noexcept { return _mm256_cvtepu16_epi64(x); }
//! Zero-extends the low four u32 elements of `x` to four u64 elements.
BL_INLINE I256 vmovli128u32u64(const I128& x) noexcept { return _mm256_cvtepu32_epi64(x); }
938
// NOTE: AVX2 packs operate independently on each 128-bit lane, so the result
// interleaves lanes of `x` and `y` ([x.lo, y.lo, x.hi, y.hi] as 64-bit parts).

//! Packs i16 elements of `x` and `y` to i8 with signed saturation.
BL_INLINE I256 vpacki16i8(const I256& x, const I256& y) noexcept { return _mm256_packs_epi16(x, y); }
//! Packs i16 elements of `x` and `y` to u8 with unsigned saturation.
BL_INLINE I256 vpacki16u8(const I256& x, const I256& y) noexcept { return _mm256_packus_epi16(x, y); }
//! Packs i32 elements of `x` and `y` to i16 with signed saturation.
BL_INLINE I256 vpacki32i16(const I256& x, const I256& y) noexcept { return _mm256_packs_epi32(x, y); }
//! Packs i32 elements of `x` and `y` to u16 with unsigned saturation.
BL_INLINE I256 vpacki32u16(const I256& x, const I256& y) noexcept { return _mm256_packus_epi32(x, y); }

//! Packs i16 elements of `x` to i8 (both halves receive the same data).
BL_INLINE I256 vpacki16i8(const I256& x) noexcept { return vpacki16i8(x, x); }
//! Packs i16 elements of `x` to u8 (both halves receive the same data).
BL_INLINE I256 vpacki16u8(const I256& x) noexcept { return vpacki16u8(x, x); }
//! Packs i32 elements of `x` to i16 (both halves receive the same data).
BL_INLINE I256 vpacki32i16(const I256& x) noexcept { return vpacki32i16(x, x); }
//! Packs i32 elements of `x` to u16 (both halves receive the same data).
BL_INLINE I256 vpacki32u16(const I256& x) noexcept { return vpacki32u16(x, x); }
948
//! Packs i32 elements of `x` to i8 via an intermediate signed 32->16 pack.
BL_INLINE I256 vpacki32i8(const I256& x) noexcept { return vpacki16i8(vpacki32i16(x)); }
//! Packs i32 elements of `x` and `y` to i8 (two-stage saturating pack).
BL_INLINE I256 vpacki32i8(const I256& x, const I256& y) noexcept { return vpacki16i8(vpacki32i16(x, y)); }
//! Packs i32 elements of `x`, `y`, `z`, `w` to i8 (two-stage saturating pack).
BL_INLINE I256 vpacki32i8(const I256& x, const I256& y, const I256& z, const I256& w) noexcept { return vpacki16i8(vpacki32i16(x, y), vpacki32i16(z, w)); }

// NOTE(review): the u8 variants use the SIGNED 32->16 pack first; values above
// INT16_MAX would saturate to 0x7FFF instead of 0xFFFF, so these presumably
// assume inputs already fit the int16 range — same as the 128-bit versions.

//! Packs i32 elements of `x` to u8.
BL_INLINE I256 vpacki32u8(const I256& x) noexcept { return vpacki16u8(vpacki32i16(x)); }
//! Packs i32 elements of `x` and `y` to u8.
BL_INLINE I256 vpacki32u8(const I256& x, const I256& y) noexcept { return vpacki16u8(vpacki32i16(x, y)); }
//! Packs i32 elements of `x`, `y`, `z`, `w` to u8.
BL_INLINE I256 vpacki32u8(const I256& x, const I256& y, const I256& z, const I256& w) noexcept { return vpacki16u8(vpacki32i16(x, y), vpacki32i16(z, w)); }

//! Equivalent to the two-argument `vpacki32u8` (pipeline-generator naming).
BL_INLINE I256 vpackzzdb(const I256& x, const I256& y) noexcept { return vpacki16u8(vpacki32i16(x, y)); }
//! Equivalent to the four-argument `vpacki32u8` (pipeline-generator naming).
BL_INLINE I256 vpackzzdb(const I256& x, const I256& y, const I256& z, const I256& w) noexcept { return vpacki16u8(vpacki32i16(x, y), vpacki32i16(z, w)); }
959
// NOTE: AVX2 unpacks interleave within each 128-bit lane independently.

//! Interleaves the low 8-bit elements of `x` and `y` (per 128-bit lane).
BL_INLINE I256 vunpackli8(const I256& x, const I256& y) noexcept { return _mm256_unpacklo_epi8(x, y); }
//! Interleaves the high 8-bit elements of `x` and `y` (per 128-bit lane).
BL_INLINE I256 vunpackhi8(const I256& x, const I256& y) noexcept { return _mm256_unpackhi_epi8(x, y); }

//! Interleaves the low 16-bit elements of `x` and `y` (per 128-bit lane).
BL_INLINE I256 vunpackli16(const I256& x, const I256& y) noexcept { return _mm256_unpacklo_epi16(x, y); }
//! Interleaves the high 16-bit elements of `x` and `y` (per 128-bit lane).
BL_INLINE I256 vunpackhi16(const I256& x, const I256& y) noexcept { return _mm256_unpackhi_epi16(x, y); }

//! Interleaves the low 32-bit elements of `x` and `y` (per 128-bit lane).
BL_INLINE I256 vunpackli32(const I256& x, const I256& y) noexcept { return _mm256_unpacklo_epi32(x, y); }
//! Interleaves the high 32-bit elements of `x` and `y` (per 128-bit lane).
BL_INLINE I256 vunpackhi32(const I256& x, const I256& y) noexcept { return _mm256_unpackhi_epi32(x, y); }

//! Interleaves the low 64-bit elements of `x` and `y` (per 128-bit lane).
BL_INLINE I256 vunpackli64(const I256& x, const I256& y) noexcept { return _mm256_unpacklo_epi64(x, y); }
//! Interleaves the high 64-bit elements of `x` and `y` (per 128-bit lane).
BL_INLINE I256 vunpackhi64(const I256& x, const I256& y) noexcept { return _mm256_unpackhi_epi64(x, y); }

//! Combines the low 128-bit lanes of `x` and `y` into [x.lo, y.lo].
BL_INLINE I256 vunpackli128(const I256& x, const I256& y) noexcept { return vpermi128<2, 0>(x, y); }
//! Combines the high 128-bit lanes of `x` and `y` into [x.hi, y.hi].
BL_INLINE I256 vunpackhi128(const I256& x, const I256& y) noexcept { return vpermi128<3, 1>(x, y); }
974
//! Bitwise OR of `x` and `y`.
BL_INLINE I256 vor(const I256& x, const I256& y) noexcept { return _mm256_or_si256(x, y); }
//! Bitwise XOR of `x` and `y`.
BL_INLINE I256 vxor(const I256& x, const I256& y) noexcept { return _mm256_xor_si256(x, y); }
//! Bitwise AND of `x` and `y`.
BL_INLINE I256 vand(const I256& x, const I256& y) noexcept { return _mm256_and_si256(x, y); }
//! Bitwise AND-NOT, complementing the first operand: `~x & y`.
BL_INLINE I256 vandnot_a(const I256& x, const I256& y) noexcept { return _mm256_andnot_si256(x, y); }
//! Bitwise AND-NOT, complementing the second operand: `x & ~y`.
BL_INLINE I256 vandnot_b(const I256& x, const I256& y) noexcept { return _mm256_andnot_si256(y, x); }
980
vblendmask(const I256 & x,const I256 & y,const I256 & mask)981 BL_INLINE I256 vblendmask(const I256& x, const I256& y, const I256& mask) noexcept { return vor(vandnot_a(mask, x), vand(y, mask)); }
vblendx(const I256 & x,const I256 & y,const I256 & mask)982 BL_INLINE I256 vblendx(const I256& x, const I256& y, const I256& mask) noexcept { return _mm256_blendv_epi8(x, y, mask); }
983
//! Adds 8-bit elements of `x` and `y` (wrapping).
BL_INLINE I256 vaddi8(const I256& x, const I256& y) noexcept { return _mm256_add_epi8(x, y); }
//! Adds 16-bit elements of `x` and `y` (wrapping).
BL_INLINE I256 vaddi16(const I256& x, const I256& y) noexcept { return _mm256_add_epi16(x, y); }
//! Adds 32-bit elements of `x` and `y` (wrapping).
BL_INLINE I256 vaddi32(const I256& x, const I256& y) noexcept { return _mm256_add_epi32(x, y); }
//! Adds 64-bit elements of `x` and `y` (wrapping).
BL_INLINE I256 vaddi64(const I256& x, const I256& y) noexcept { return _mm256_add_epi64(x, y); }

//! Adds i8 elements of `x` and `y` with signed saturation.
BL_INLINE I256 vaddsi8(const I256& x, const I256& y) noexcept { return _mm256_adds_epi8(x, y); }
//! Adds u8 elements of `x` and `y` with unsigned saturation.
BL_INLINE I256 vaddsu8(const I256& x, const I256& y) noexcept { return _mm256_adds_epu8(x, y); }
//! Adds i16 elements of `x` and `y` with signed saturation.
BL_INLINE I256 vaddsi16(const I256& x, const I256& y) noexcept { return _mm256_adds_epi16(x, y); }
//! Adds u16 elements of `x` and `y` with unsigned saturation.
BL_INLINE I256 vaddsu16(const I256& x, const I256& y) noexcept { return _mm256_adds_epu16(x, y); }

//! Subtracts 8-bit elements `x - y` (wrapping).
BL_INLINE I256 vsubi8(const I256& x, const I256& y) noexcept { return _mm256_sub_epi8(x, y); }
//! Subtracts 16-bit elements `x - y` (wrapping).
BL_INLINE I256 vsubi16(const I256& x, const I256& y) noexcept { return _mm256_sub_epi16(x, y); }
//! Subtracts 32-bit elements `x - y` (wrapping).
BL_INLINE I256 vsubi32(const I256& x, const I256& y) noexcept { return _mm256_sub_epi32(x, y); }
//! Subtracts 64-bit elements `x - y` (wrapping).
BL_INLINE I256 vsubi64(const I256& x, const I256& y) noexcept { return _mm256_sub_epi64(x, y); }

//! Subtracts i8 elements `x - y` with signed saturation.
BL_INLINE I256 vsubsi8(const I256& x, const I256& y) noexcept { return _mm256_subs_epi8(x, y); }
//! Subtracts u8 elements `x - y` with unsigned saturation (floors at zero).
BL_INLINE I256 vsubsu8(const I256& x, const I256& y) noexcept { return _mm256_subs_epu8(x, y); }
//! Subtracts i16 elements `x - y` with signed saturation.
BL_INLINE I256 vsubsi16(const I256& x, const I256& y) noexcept { return _mm256_subs_epi16(x, y); }
//! Subtracts u16 elements `x - y` with unsigned saturation (floors at zero).
BL_INLINE I256 vsubsu16(const I256& x, const I256& y) noexcept { return _mm256_subs_epu16(x, y); }
1003
//! Multiplies i16 elements, keeping the low 16 bits of each product.
BL_INLINE I256 vmuli16(const I256& x, const I256& y) noexcept { return _mm256_mullo_epi16(x, y); }
//! Multiplies u16 elements, keeping the low 16 bits of each product.
//! (The low half of a product is identical for signed and unsigned inputs,
//! so the signed `mullo` intrinsic is intentionally reused here.)
BL_INLINE I256 vmulu16(const I256& x, const I256& y) noexcept { return _mm256_mullo_epi16(x, y); }
//! Multiplies i16 elements, keeping the high 16 bits of each signed product.
BL_INLINE I256 vmulhi16(const I256& x, const I256& y) noexcept { return _mm256_mulhi_epi16(x, y); }
//! Multiplies u16 elements, keeping the high 16 bits of each unsigned product.
BL_INLINE I256 vmulhu16(const I256& x, const I256& y) noexcept { return _mm256_mulhi_epu16(x, y); }

//! Multiplies i32 elements, keeping the low 32 bits of each product.
BL_INLINE I256 vmuli32(const I256& x, const I256& y) noexcept { return _mm256_mullo_epi32(x, y); }
//! Multiplies u32 elements, keeping the low 32 bits of each product
//! (signed `mullo` reused intentionally — low bits are sign-agnostic).
BL_INLINE I256 vmulu32(const I256& x, const I256& y) noexcept { return _mm256_mullo_epi32(x, y); }

//! Multiplies adjacent i16 pairs and adds each pair into an i32 element.
BL_INLINE I256 vmaddi16i32(const I256& x, const I256& y) noexcept { return _mm256_madd_epi16(x, y); }
1013
// NOTE: The `N ? intrinsic : x` guard makes a zero shift a no-op so that
// generic/templated callers can pass 0 without emitting a shift instruction;
// the branch folds away at compile time because N is a template constant.

//! Shifts each 16-bit element left by `N_BITS` (zeros shifted in).
template<uint8_t N_BITS> BL_INLINE I256 vslli16(const I256& x) noexcept { return N_BITS ? _mm256_slli_epi16(x, N_BITS) : x; }
//! Shifts each 32-bit element left by `N_BITS` (zeros shifted in).
template<uint8_t N_BITS> BL_INLINE I256 vslli32(const I256& x) noexcept { return N_BITS ? _mm256_slli_epi32(x, N_BITS) : x; }
//! Shifts each 64-bit element left by `N_BITS` (zeros shifted in).
template<uint8_t N_BITS> BL_INLINE I256 vslli64(const I256& x) noexcept { return N_BITS ? _mm256_slli_epi64(x, N_BITS) : x; }

//! Shifts each 16-bit element right by `N_BITS`, logical (zeros shifted in).
template<uint8_t N_BITS> BL_INLINE I256 vsrli16(const I256& x) noexcept { return N_BITS ? _mm256_srli_epi16(x, N_BITS) : x; }
//! Shifts each 32-bit element right by `N_BITS`, logical (zeros shifted in).
template<uint8_t N_BITS> BL_INLINE I256 vsrli32(const I256& x) noexcept { return N_BITS ? _mm256_srli_epi32(x, N_BITS) : x; }
//! Shifts each 64-bit element right by `N_BITS`, logical (zeros shifted in).
template<uint8_t N_BITS> BL_INLINE I256 vsrli64(const I256& x) noexcept { return N_BITS ? _mm256_srli_epi64(x, N_BITS) : x; }

//! Shifts each 16-bit element right by `N_BITS`, arithmetic (sign-extends).
template<uint8_t N_BITS> BL_INLINE I256 vsrai16(const I256& x) noexcept { return N_BITS ? _mm256_srai_epi16(x, N_BITS) : x; }
//! Shifts each 32-bit element right by `N_BITS`, arithmetic (sign-extends).
template<uint8_t N_BITS> BL_INLINE I256 vsrai32(const I256& x) noexcept { return N_BITS ? _mm256_srai_epi32(x, N_BITS) : x; }

//! Shifts bytes left by `N_BYTES` within EACH 128-bit lane independently
//! (AVX2 `vpslldq` semantics), hence the `128b` suffix — this is not a
//! full 256-bit byte shift.
template<uint8_t N_BYTES> BL_INLINE I256 vslli128b(const I256& x) noexcept { return N_BYTES ? _mm256_slli_si256(x, N_BYTES) : x; }
//! Shifts bytes right by `N_BYTES` within EACH 128-bit lane independently.
template<uint8_t N_BYTES> BL_INLINE I256 vsrli128b(const I256& x) noexcept { return N_BYTES ? _mm256_srli_si256(x, N_BYTES) : x; }
1027
//! Element-wise minimum of i8 elements.
BL_INLINE I256 vmini8(const I256& x, const I256& y) noexcept { return _mm256_min_epi8(x, y); }
//! Element-wise maximum of i8 elements.
BL_INLINE I256 vmaxi8(const I256& x, const I256& y) noexcept { return _mm256_max_epi8(x, y); }
//! Element-wise minimum of u8 elements.
BL_INLINE I256 vminu8(const I256& x, const I256& y) noexcept { return _mm256_min_epu8(x, y); }
//! Element-wise maximum of u8 elements.
BL_INLINE I256 vmaxu8(const I256& x, const I256& y) noexcept { return _mm256_max_epu8(x, y); }

//! Element-wise minimum of i16 elements.
BL_INLINE I256 vmini16(const I256& x, const I256& y) noexcept { return _mm256_min_epi16(x, y); }
//! Element-wise maximum of i16 elements.
BL_INLINE I256 vmaxi16(const I256& x, const I256& y) noexcept { return _mm256_max_epi16(x, y); }
//! Element-wise minimum of u16 elements.
BL_INLINE I256 vminu16(const I256& x, const I256& y) noexcept { return _mm256_min_epu16(x, y); }
//! Element-wise maximum of u16 elements.
BL_INLINE I256 vmaxu16(const I256& x, const I256& y) noexcept { return _mm256_max_epu16(x, y); }

//! Element-wise minimum of i32 elements.
BL_INLINE I256 vmini32(const I256& x, const I256& y) noexcept { return _mm256_min_epi32(x, y); }
//! Element-wise maximum of i32 elements.
BL_INLINE I256 vmaxi32(const I256& x, const I256& y) noexcept { return _mm256_max_epi32(x, y); }
vminu32(const I256 & x,const I256 & y)1040