// [Blend2D]
// 2D Vector Graphics Powered by a JIT Compiler.
//
// [License]
// Zlib - See LICENSE.md file in the package.

#ifndef BLEND2D_SIMD_X86_P_H
#define BLEND2D_SIMD_X86_P_H

#include "./support_p.h"
#include "./tables_p.h"

#if defined(_MSC_VER)
  #include <intrin.h>
#endif

#if defined(BL_TARGET_OPT_SSE)
  #include <xmmintrin.h>
#endif

#if defined(BL_TARGET_OPT_SSE2)
  #include <emmintrin.h>
#endif

#if defined(BL_TARGET_OPT_SSE3) && !defined(_MSC_VER)
  #include <pmmintrin.h>
#endif

#if defined(BL_TARGET_OPT_SSSE3)
  #include <tmmintrin.h>
#endif

#if defined(BL_TARGET_OPT_SSE4_1)
  #include <smmintrin.h>
#endif

#if defined(BL_TARGET_OPT_SSE4_2)
  #include <nmmintrin.h>
#endif

#if defined(BL_TARGET_OPT_AVX) || defined(BL_TARGET_OPT_AVX2)
  #include <immintrin.h>
#endif

#if defined(BL_TARGET_OPT_NEON)
  #include <arm_neon.h>
#endif

//! \cond INTERNAL
//! \addtogroup blend2d_internal
//! \{

//! The SIMD namespace contains helper functions that wrap SIMD intrinsics. The
//! names of these functions correspond to the names of functions used by the
//! pipeline generator (BLPipe).
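//!
//! A minimal usage sketch (illustrative only; it assumes an SSE2 build and
//! hypothetical `src`, `dst`, and `n` variables, where `src`/`dst` point to
//! arrays of `uint16_t` and `n` is a multiple of 8):
//!
//! \code
//! I128 f = vseti128u16(257);                // Broadcast a 16-bit constant.
//! for (size_t i = 0; i < n; i += 8) {
//!   I128 v = vloadi128u(src + i);           // Unaligned 128-bit load.
//!   vstorei128u(dst + i, vmulu16(v, f));    // Multiply u16 lanes and store.
//! }
//! \endcode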
namespace SIMD {

// ============================================================================
// [BLSIMD - Features]
// ============================================================================

#if defined(BL_TARGET_OPT_AVX2)
  #define BL_TARGET_SIMD_I 256
  #define BL_TARGET_SIMD_F 256
  #define BL_TARGET_SIMD_D 256
#elif defined(BL_TARGET_OPT_AVX)
  #define BL_TARGET_SIMD_I 128
  #define BL_TARGET_SIMD_F 256
  #define BL_TARGET_SIMD_D 256
#elif defined(BL_TARGET_OPT_SSE2)
  #define BL_TARGET_SIMD_I 128
  #define BL_TARGET_SIMD_F 128
  #define BL_TARGET_SIMD_D 128
#else
  #define BL_TARGET_SIMD_I 0
  #define BL_TARGET_SIMD_F 0
  #define BL_TARGET_SIMD_D 0
#endif
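
// The macros above describe the widest SIMD register width (in bits) available
// to integer, single-precision, and double-precision code, respectively; zero
// means no SIMD support of that kind. Code that provides multiple
// implementations would typically branch on them, e.g. (illustrative only):
//
//   #if BL_TARGET_SIMD_I >= 256
//   // ... 256-bit integer implementation ...
//   #endif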

// ============================================================================
// [BLSIMD - Types]
// ============================================================================

#if defined(BL_TARGET_OPT_SSE2)
typedef __m128i I128;
typedef __m128  F128;
typedef __m128d D128;
#endif

// 256-bit types (including integers) are accessible through AVX as AVX also
// includes conversion instructions between integer and FP types.
#if defined(BL_TARGET_OPT_AVX)
typedef __m256i I256;
typedef __m256  F256;
typedef __m256d D256;
#endif

// Must be in an anonymous namespace.
namespace {

// ============================================================================
// [BLSIMD - Cast]
// ============================================================================

template<typename Out, typename In>
BL_INLINE const Out& v_const_as(const In* c) noexcept {
  return *reinterpret_cast<const Out*>(c);
}

template<typename DstT, typename SrcT>
BL_INLINE DstT vcast(const SrcT& x) noexcept { return x; }

#if defined(BL_TARGET_OPT_SSE2)
template<> BL_INLINE F128 vcast(const I128& x) noexcept { return _mm_castsi128_ps(x); }
template<> BL_INLINE D128 vcast(const I128& x) noexcept { return _mm_castsi128_pd(x); }
template<> BL_INLINE I128 vcast(const F128& x) noexcept { return _mm_castps_si128(x); }
template<> BL_INLINE D128 vcast(const F128& x) noexcept { return _mm_castps_pd(x); }
template<> BL_INLINE I128 vcast(const D128& x) noexcept { return _mm_castpd_si128(x); }
template<> BL_INLINE F128 vcast(const D128& x) noexcept { return _mm_castpd_ps(x); }
#endif

#if defined(BL_TARGET_OPT_AVX)
template<> BL_INLINE I128 vcast(const I256& x) noexcept { return _mm256_castsi256_si128(x); }
template<> BL_INLINE I256 vcast(const I128& x) noexcept { return _mm256_castsi128_si256(x); }

template<> BL_INLINE F128 vcast(const F256& x) noexcept { return _mm256_castps256_ps128(x); }
template<> BL_INLINE F256 vcast(const F128& x) noexcept { return _mm256_castps128_ps256(x); }

template<> BL_INLINE D128 vcast(const D256& x) noexcept { return _mm256_castpd256_pd128(x); }
template<> BL_INLINE D256 vcast(const D128& x) noexcept { return _mm256_castpd128_pd256(x); }

template<> BL_INLINE D256 vcast(const F256& x) noexcept { return _mm256_castps_pd(x); }
template<> BL_INLINE F256 vcast(const D256& x) noexcept { return _mm256_castpd_ps(x); }

template<> BL_INLINE F256 vcast(const I256& x) noexcept { return _mm256_castsi256_ps(x); }
template<> BL_INLINE I256 vcast(const F256& x) noexcept { return _mm256_castps_si256(x); }

template<> BL_INLINE D256 vcast(const I256& x) noexcept { return _mm256_castsi256_pd(x); }
template<> BL_INLINE I256 vcast(const D256& x) noexcept { return _mm256_castpd_si256(x); }
#endif

// ============================================================================
// [BLSIMD - I128]
// ============================================================================

#if defined(BL_TARGET_OPT_SSE2)
BL_INLINE I128 vzeroi128() noexcept { return _mm_setzero_si128(); }

BL_INLINE I128 vseti128i8(int8_t x) noexcept { return _mm_set1_epi8(x); }
BL_INLINE I128 vseti128i16(int16_t x) noexcept { return _mm_set1_epi16(x); }
BL_INLINE I128 vseti128i32(int32_t x) noexcept { return _mm_set1_epi32(x); }

BL_INLINE I128 vseti128i32(int32_t x1, int32_t x0) noexcept { return _mm_set_epi32(x1, x0, x1, x0); }
BL_INLINE I128 vseti128i32(int32_t x3, int32_t x2, int32_t x1, int32_t x0) noexcept { return _mm_set_epi32(x3, x2, x1, x0); }

BL_INLINE I128 vseti128i64(int64_t x) noexcept {
#if BL_TARGET_ARCH_BITS >= 64
  return _mm_set1_epi64x(x);
#else
  return vseti128i32(int32_t(uint64_t(x) >> 32), int32_t(x & 0xFFFFFFFFu));
#endif
}

BL_INLINE I128 vseti128i64(int64_t x1, int64_t x0) noexcept {
  return vseti128i32(int32_t(uint64_t(x1) >> 32), int32_t(x1 & 0xFFFFFFFFu),
                     int32_t(uint64_t(x0) >> 32), int32_t(x0 & 0xFFFFFFFFu));
}

BL_INLINE I128 vseti128u8(uint8_t x) noexcept { return vseti128i8(int8_t(x)); }
BL_INLINE I128 vseti128u16(uint16_t x) noexcept { return vseti128i16(int16_t(x)); }
BL_INLINE I128 vseti128u32(uint32_t x) noexcept { return vseti128i32(int32_t(x)); }
BL_INLINE I128 vseti128u32(uint32_t x1, uint32_t x0) noexcept { return vseti128i32(int32_t(x1), int32_t(x0), int32_t(x1), int32_t(x0)); }
BL_INLINE I128 vseti128u32(uint32_t x3, uint32_t x2, uint32_t x1, uint32_t x0) noexcept { return vseti128i32(int32_t(x3), int32_t(x2), int32_t(x1), int32_t(x0)); }
BL_INLINE I128 vseti128u64(uint64_t x) noexcept { return vseti128i64(int64_t(x)); }
BL_INLINE I128 vseti128u64(uint64_t x1, uint64_t x0) noexcept { return vseti128i64(int64_t(x1), int64_t(x0)); }

BL_INLINE I128 vcvti32i128(int32_t x) noexcept { return _mm_cvtsi32_si128(int(x)); }
BL_INLINE I128 vcvtu32i128(uint32_t x) noexcept { return _mm_cvtsi32_si128(int(x)); }

BL_INLINE int32_t vcvti128i32(const I128& x) noexcept { return int32_t(_mm_cvtsi128_si32(x)); }
BL_INLINE uint32_t vcvti128u32(const I128& x) noexcept { return uint32_t(_mm_cvtsi128_si32(x)); }

BL_INLINE I128 vcvti64i128(int64_t x) noexcept {
#if BL_TARGET_ARCH_BITS >= 64
  return _mm_cvtsi64_si128(x);
#else
  return _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&x));
#endif
}

BL_INLINE int64_t vcvti128i64(const I128& x) noexcept {
#if BL_TARGET_ARCH_BITS >= 64
  return int64_t(_mm_cvtsi128_si64(x));
#else
  int64_t result;
  _mm_storel_epi64(reinterpret_cast<__m128i*>(&result), x);
  return result;
#endif
}

BL_INLINE I128 vcvtu64i128(uint64_t x) noexcept { return vcvti64i128(int64_t(x)); }
BL_INLINE uint64_t vcvti128u64(const I128& x) noexcept { return uint64_t(vcvti128i64(x)); }

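// The swizzle templates below follow the _MM_SHUFFLE() convention: the first
// template argument selects the highest destination lane and the last one the
// lowest. vswizi64 expresses a 64-bit lane swizzle as the equivalent 32-bit
// shuffle by expanding each 64-bit index into its pair of 32-bit indices.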
template<uint8_t A, uint8_t B, uint8_t C, uint8_t D>
BL_INLINE I128 vswizli16(const I128& x) noexcept { return _mm_shufflelo_epi16(x, _MM_SHUFFLE(A, B, C, D)); }
template<uint8_t A, uint8_t B, uint8_t C, uint8_t D>
BL_INLINE I128 vswizhi16(const I128& x) noexcept { return _mm_shufflehi_epi16(x, _MM_SHUFFLE(A, B, C, D)); }

template<uint8_t A, uint8_t B, uint8_t C, uint8_t D>
BL_INLINE I128 vswizi16(const I128& x) noexcept { return vswizhi16<A, B, C, D>(vswizli16<A, B, C, D>(x)); }
template<uint8_t A, uint8_t B, uint8_t C, uint8_t D>
BL_INLINE I128 vswizi32(const I128& x) noexcept { return _mm_shuffle_epi32(x, _MM_SHUFFLE(A, B, C, D)); }
template<int A, int B>
BL_INLINE I128 vswizi64(const I128& x) noexcept { return vswizi32<A*2 + 1, A*2, B*2 + 1, B*2>(x); }

#if defined(BL_TARGET_OPT_SSSE3)
BL_INLINE I128 vpshufb(const I128& x, const I128& y) noexcept { return _mm_shuffle_epi8(x, y); }

template<int N_BYTES>
BL_INLINE I128 vpalignr(const I128& x, const I128& y) noexcept { return _mm_alignr_epi8(x, y, N_BYTES); }
#endif

BL_INLINE I128 vswapi64(const I128& x) noexcept { return vswizi64<0, 1>(x); }
BL_INLINE I128 vdupli64(const I128& x) noexcept { return vswizi64<0, 0>(x); }
BL_INLINE I128 vduphi64(const I128& x) noexcept { return vswizi64<1, 1>(x); }

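// The vmovl*/vmovh* helpers zero-extend the low/high half of a vector to twice
// the element width. With SSE4.1 the low-half variants use PMOVZX* directly;
// otherwise (and always for the high-half variants) the elements are unpacked
// against a zero vector.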
BL_INLINE I128 vmovli64u8u16(const I128& x) noexcept {
#if defined(BL_TARGET_OPT_SSE4_1)
  return _mm_cvtepu8_epi16(x);
#else
  return _mm_unpacklo_epi8(x, _mm_setzero_si128());
#endif
}

BL_INLINE I128 vmovli64u16u32(const I128& x) noexcept {
#if defined(BL_TARGET_OPT_SSE4_1)
  return _mm_cvtepu16_epi32(x);
#else
  return _mm_unpacklo_epi16(x, _mm_setzero_si128());
#endif
}

BL_INLINE I128 vmovli64u32u64(const I128& x) noexcept {
#if defined(BL_TARGET_OPT_SSE4_1)
  return _mm_cvtepu32_epi64(x);
#else
  return _mm_unpacklo_epi32(x, _mm_setzero_si128());
#endif
}

BL_INLINE I128 vmovhi64u8u16(const I128& x) noexcept { return _mm_unpackhi_epi8(x, _mm_setzero_si128()); }
BL_INLINE I128 vmovhi64u16u32(const I128& x) noexcept { return _mm_unpackhi_epi16(x, _mm_setzero_si128()); }
BL_INLINE I128 vmovhi64u32u64(const I128& x) noexcept { return _mm_unpackhi_epi32(x, _mm_setzero_si128()); }

BL_INLINE I128 vpacki16i8(const I128& x, const I128& y) noexcept { return _mm_packs_epi16(x, y); }
BL_INLINE I128 vpacki16u8(const I128& x, const I128& y) noexcept { return _mm_packus_epi16(x, y); }
BL_INLINE I128 vpacki32i16(const I128& x, const I128& y) noexcept { return _mm_packs_epi32(x, y); }

BL_INLINE I128 vpacki16i8(const I128& x) noexcept { return vpacki16i8(x, x); }
BL_INLINE I128 vpacki16u8(const I128& x) noexcept { return vpacki16u8(x, x); }
BL_INLINE I128 vpacki32i16(const I128& x) noexcept { return vpacki32i16(x, x); }

BL_INLINE I128 vpacki32u16(const I128& x, const I128& y) noexcept {
#if defined(BL_TARGET_OPT_SSE4_1)
  return _mm_packus_epi32(x, y);
#else
  I128 xShifted = _mm_srai_epi32(_mm_slli_epi32(x, 16), 16);
  I128 yShifted = _mm_srai_epi32(_mm_slli_epi32(y, 16), 16);
  return _mm_packs_epi32(xShifted, yShifted);
#endif
}

BL_INLINE I128 vpacki32u16(const I128& x) noexcept {
#if defined(BL_TARGET_OPT_SSE4_1)
  return vpacki32u16(x, x);
#else
  I128 xShifted = _mm_srai_epi32(_mm_slli_epi32(x, 16), 16);
  return _mm_packs_epi32(xShifted, xShifted);
#endif
}

BL_INLINE I128 vpacki32i8(const I128& x) noexcept { return vpacki16i8(vpacki32i16(x)); }
BL_INLINE I128 vpacki32i8(const I128& x, const I128& y) noexcept { return vpacki16i8(vpacki32i16(x, y)); }
BL_INLINE I128 vpacki32i8(const I128& x, const I128& y, const I128& z, const I128& w) noexcept { return vpacki16i8(vpacki32i16(x, y), vpacki32i16(z, w)); }

BL_INLINE I128 vpacki32u8(const I128& x) noexcept { return vpacki16u8(vpacki32i16(x)); }
BL_INLINE I128 vpacki32u8(const I128& x, const I128& y) noexcept { return vpacki16u8(vpacki32i16(x, y)); }
BL_INLINE I128 vpacki32u8(const I128& x, const I128& y, const I128& z, const I128& w) noexcept { return vpacki16u8(vpacki32i16(x, y), vpacki32i16(z, w)); }

// These assume that the high bytes of all inputs are always zero, so the
// implementation can choose between packing with signed/unsigned saturation
// and vector swizzling.
BL_INLINE I128 vpackzzwb(const I128& x) noexcept { return vpacki16u8(x); }
BL_INLINE I128 vpackzzwb(const I128& x, const I128& y) noexcept { return vpacki16u8(x, y); }

BL_INLINE I128 vpackzzdw(const I128& x) noexcept {
#if defined(BL_TARGET_OPT_SSE4_1) || !defined(BL_TARGET_OPT_SSSE3)
  return vpacki32u16(x);
#else
  return vpshufb(x, v_const_as<I128>(blCommonTable.i128_pshufb_u32_to_u16_lo));
#endif
}

BL_INLINE I128 vpackzzdw(const I128& x, const I128& y) noexcept {
#if defined(BL_TARGET_OPT_SSE4_1) || !defined(BL_TARGET_OPT_SSSE3)
  return vpacki32u16(x, y);
#else
  I128 xLo = vpshufb(x, v_const_as<I128>(blCommonTable.i128_pshufb_u32_to_u16_lo));
  I128 yLo = vpshufb(y, v_const_as<I128>(blCommonTable.i128_pshufb_u32_to_u16_lo));
  return _mm_unpacklo_epi64(xLo, yLo);
#endif
}

BL_INLINE I128 vpackzzdb(const I128& x) noexcept {
#if defined(BL_TARGET_OPT_SSSE3)
  return vpshufb(x, v_const_as<I128>(blCommonTable.i128_pshufb_u32_to_u8_lo));
#else
  return vpacki16u8(vpacki32i16(x));
#endif
}

BL_INLINE I128 vpackzzdb(const I128& x, const I128& y) noexcept { return vpacki16u8(vpacki32i16(x, y)); }
BL_INLINE I128 vpackzzdb(const I128& x, const I128& y, const I128& z, const I128& w) noexcept { return vpacki16u8(vpacki32i16(x, y), vpacki32i16(z, w)); }

BL_INLINE I128 vunpackli8(const I128& x, const I128& y) noexcept { return _mm_unpacklo_epi8(x, y); }
BL_INLINE I128 vunpackhi8(const I128& x, const I128& y) noexcept { return _mm_unpackhi_epi8(x, y); }

BL_INLINE I128 vunpackli16(const I128& x, const I128& y) noexcept { return _mm_unpacklo_epi16(x, y); }
BL_INLINE I128 vunpackhi16(const I128& x, const I128& y) noexcept { return _mm_unpackhi_epi16(x, y); }

BL_INLINE I128 vunpackli32(const I128& x, const I128& y) noexcept { return _mm_unpacklo_epi32(x, y); }
BL_INLINE I128 vunpackhi32(const I128& x, const I128& y) noexcept { return _mm_unpackhi_epi32(x, y); }

BL_INLINE I128 vunpackli64(const I128& x, const I128& y) noexcept { return _mm_unpacklo_epi64(x, y); }
BL_INLINE I128 vunpackhi64(const I128& x, const I128& y) noexcept { return _mm_unpackhi_epi64(x, y); }

BL_INLINE I128 vor(const I128& x, const I128& y) noexcept { return _mm_or_si128(x, y); }
BL_INLINE I128 vxor(const I128& x, const I128& y) noexcept { return _mm_xor_si128(x, y); }
BL_INLINE I128 vand(const I128& x, const I128& y) noexcept { return _mm_and_si128(x, y); }
BL_INLINE I128 vandnot_a(const I128& x, const I128& y) noexcept { return _mm_andnot_si128(x, y); }
BL_INLINE I128 vandnot_b(const I128& x, const I128& y) noexcept { return _mm_andnot_si128(y, x); }
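//! Bit-select: returns `(x & ~mask) | (y & mask)`, i.e. bits of `y` where the
//! corresponding `mask` bits are 1 and bits of `x` where they are 0.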
BL_INLINE I128 vblendmask(const I128& x, const I128& y, const I128& mask) noexcept { return vor(vandnot_a(mask, x), vand(y, mask)); }

//! Blend BITs or BYTEs, taking advantage of `pblendvb` (SSE4.1), if possible.
BL_INLINE I128 vblendx(const I128& x, const I128& y, const I128& mask) noexcept {
#if defined(BL_TARGET_OPT_SSE4_1)
  return _mm_blendv_epi8(x, y, mask);
#else
  return vblendmask(x, y, mask);
#endif
}

BL_INLINE I128 vaddi8(const I128& x, const I128& y) noexcept { return _mm_add_epi8(x, y); }
BL_INLINE I128 vaddi16(const I128& x, const I128& y) noexcept { return _mm_add_epi16(x, y); }
BL_INLINE I128 vaddi32(const I128& x, const I128& y) noexcept { return _mm_add_epi32(x, y); }
BL_INLINE I128 vaddi64(const I128& x, const I128& y) noexcept { return _mm_add_epi64(x, y); }

BL_INLINE I128 vaddsi8(const I128& x, const I128& y) noexcept { return _mm_adds_epi8(x, y); }
BL_INLINE I128 vaddsu8(const I128& x, const I128& y) noexcept { return _mm_adds_epu8(x, y); }
BL_INLINE I128 vaddsi16(const I128& x, const I128& y) noexcept { return _mm_adds_epi16(x, y); }
BL_INLINE I128 vaddsu16(const I128& x, const I128& y) noexcept { return _mm_adds_epu16(x, y); }

BL_INLINE I128 vsubi8(const I128& x, const I128& y) noexcept { return _mm_sub_epi8(x, y); }
BL_INLINE I128 vsubi16(const I128& x, const I128& y) noexcept { return _mm_sub_epi16(x, y); }
BL_INLINE I128 vsubi32(const I128& x, const I128& y) noexcept { return _mm_sub_epi32(x, y); }
BL_INLINE I128 vsubi64(const I128& x, const I128& y) noexcept { return _mm_sub_epi64(x, y); }

BL_INLINE I128 vsubsi8(const I128& x, const I128& y) noexcept { return _mm_subs_epi8(x, y); }
BL_INLINE I128 vsubsu8(const I128& x, const I128& y) noexcept { return _mm_subs_epu8(x, y); }
BL_INLINE I128 vsubsi16(const I128& x, const I128& y) noexcept { return _mm_subs_epi16(x, y); }
BL_INLINE I128 vsubsu16(const I128& x, const I128& y) noexcept { return _mm_subs_epu16(x, y); }

BL_INLINE I128 vmuli16(const I128& x, const I128& y) noexcept { return _mm_mullo_epi16(x, y); }
BL_INLINE I128 vmulu16(const I128& x, const I128& y) noexcept { return _mm_mullo_epi16(x, y); }
BL_INLINE I128 vmulhi16(const I128& x, const I128& y) noexcept { return _mm_mulhi_epi16(x, y); }
BL_INLINE I128 vmulhu16(const I128& x, const I128& y) noexcept { return _mm_mulhi_epu16(x, y); }

#if defined(BL_TARGET_OPT_SSE4_1)
BL_INLINE I128 vmuli32(const I128& x, const I128& y) noexcept { return _mm_mullo_epi32(x, y); }
BL_INLINE I128 vmulu32(const I128& x, const I128& y) noexcept { return _mm_mullo_epi32(x, y); }
#endif

BL_INLINE I128 vmaddi16i32(const I128& x, const I128& y) noexcept { return _mm_madd_epi16(x, y); }

template<uint8_t N_BITS> BL_INLINE I128 vslli16(const I128& x) noexcept { return N_BITS ? _mm_slli_epi16(x, N_BITS) : x; }
template<uint8_t N_BITS> BL_INLINE I128 vslli32(const I128& x) noexcept { return N_BITS ? _mm_slli_epi32(x, N_BITS) : x; }
template<uint8_t N_BITS> BL_INLINE I128 vslli64(const I128& x) noexcept { return N_BITS ? _mm_slli_epi64(x, N_BITS) : x; }

template<uint8_t N_BITS> BL_INLINE I128 vsrli16(const I128& x) noexcept { return N_BITS ? _mm_srli_epi16(x, N_BITS) : x; }
template<uint8_t N_BITS> BL_INLINE I128 vsrli32(const I128& x) noexcept { return N_BITS ? _mm_srli_epi32(x, N_BITS) : x; }
template<uint8_t N_BITS> BL_INLINE I128 vsrli64(const I128& x) noexcept { return N_BITS ? _mm_srli_epi64(x, N_BITS) : x; }

template<uint8_t N_BITS> BL_INLINE I128 vsrai16(const I128& x) noexcept { return N_BITS ? _mm_srai_epi16(x, N_BITS) : x; }
template<uint8_t N_BITS> BL_INLINE I128 vsrai32(const I128& x) noexcept { return N_BITS ? _mm_srai_epi32(x, N_BITS) : x; }

template<uint8_t N_BYTES> BL_INLINE I128 vslli128b(const I128& x) noexcept { return N_BYTES ? _mm_slli_si128(x, N_BYTES) : x; }
template<uint8_t N_BYTES> BL_INLINE I128 vsrli128b(const I128& x) noexcept { return N_BYTES ? _mm_srli_si128(x, N_BYTES) : x; }

#if defined(BL_TARGET_OPT_SSE4_1)
BL_INLINE I128 vmini8(const I128& x, const I128& y) noexcept { return _mm_min_epi8(x, y); }
BL_INLINE I128 vmaxi8(const I128& x, const I128& y) noexcept { return _mm_max_epi8(x, y); }
#else
BL_INLINE I128 vmini8(const I128& x, const I128& y) noexcept { return vblendmask(x, y, _mm_cmpgt_epi8(x, y)); }
BL_INLINE I128 vmaxi8(const I128& x, const I128& y) noexcept { return vblendmask(y, x, _mm_cmpgt_epi8(x, y)); }
#endif

BL_INLINE I128 vminu8(const I128& x, const I128& y) noexcept { return _mm_min_epu8(x, y); }
BL_INLINE I128 vmaxu8(const I128& x, const I128& y) noexcept { return _mm_max_epu8(x, y); }

BL_INLINE I128 vmini16(const I128& x, const I128& y) noexcept { return _mm_min_epi16(x, y); }
BL_INLINE I128 vmaxi16(const I128& x, const I128& y) noexcept { return _mm_max_epi16(x, y); }

#if defined(BL_TARGET_OPT_SSE4_1)
BL_INLINE I128 vminu16(const I128& x, const I128& y) noexcept { return _mm_min_epu16(x, y); }
BL_INLINE I128 vmaxu16(const I128& x, const I128& y) noexcept { return _mm_max_epu16(x, y); }
#else
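// SSE2 fallback: `_mm_subs_epu16` clamps negative differences to zero, so
// minu16(x, y) == x - max(x - y, 0) and maxu16(x, y) == y + max(x - y, 0).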
BL_INLINE I128 vminu16(const I128& x, const I128& y) noexcept { return _mm_sub_epi16(x, _mm_subs_epu16(x, y)); }
BL_INLINE I128 vmaxu16(const I128& x, const I128& y) noexcept { return _mm_add_epi16(y, _mm_subs_epu16(x, y)); }
#endif

#if defined(BL_TARGET_OPT_SSE4_1)
BL_INLINE I128 vmini32(const I128& x, const I128& y) noexcept { return _mm_min_epi32(x, y); }
BL_INLINE I128 vmaxi32(const I128& x, const I128& y) noexcept { return _mm_max_epi32(x, y); }
#else
BL_INLINE I128 vmini32(const I128& x, const I128& y) noexcept { return vblendmask(x, y, _mm_cmpgt_epi32(x, y)); }
BL_INLINE I128 vmaxi32(const I128& x, const I128& y) noexcept { return vblendmask(y, x, _mm_cmpgt_epi32(x, y)); }
#endif

BL_INLINE I128 vcmpeqi8(const I128& x, const I128& y) noexcept { return _mm_cmpeq_epi8(x, y); }
BL_INLINE I128 vcmpgti8(const I128& x, const I128& y) noexcept { return _mm_cmpgt_epi8(x, y); }

BL_INLINE I128 vcmpeqi16(const I128& x, const I128& y) noexcept { return _mm_cmpeq_epi16(x, y); }
BL_INLINE I128 vcmpgti16(const I128& x, const I128& y) noexcept { return _mm_cmpgt_epi16(x, y); }

BL_INLINE I128 vcmpeqi32(const I128& x, const I128& y) noexcept { return _mm_cmpeq_epi32(x, y); }
BL_INLINE I128 vcmpgti32(const I128& x, const I128& y) noexcept { return _mm_cmpgt_epi32(x, y); }

#if defined(BL_TARGET_OPT_SSSE3)
BL_INLINE I128 vabsi8(const I128& x) noexcept { return _mm_abs_epi8(x); }
BL_INLINE I128 vabsi16(const I128& x) noexcept { return _mm_abs_epi16(x); }
BL_INLINE I128 vabsi32(const I128& x) noexcept { return _mm_abs_epi32(x); }
#else
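// SSE2 fallbacks: |x| == min_u8(-x, x) for int8 (the unsigned minimum of x and
// its negation), |x| == max_i16(-x, x) for int16, and for int32 the usual
// sign-mask identity |x| == (x ^ (x >> 31)) - (x >> 31).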
BL_INLINE I128 vabsi8(const I128& x) noexcept { return vminu8(vsubi8(vzeroi128(), x), x); }
BL_INLINE I128 vabsi16(const I128& x) noexcept { return vmaxi16(vsubi16(vzeroi128(), x), x); }
BL_INLINE I128 vabsi32(const I128& x) noexcept { I128 y = vsrai32<31>(x); return vsubi32(vxor(x, y), y); }
#endif

BL_INLINE I128 vloadi128_32(const void* p) noexcept { return _mm_cvtsi32_si128(int(*(BLMisalignedUInt<uint32_t, 1>::T*)(p))); }
BL_INLINE I128 vloadi128_64(const void* p) noexcept { return _mm_loadl_epi64(static_cast<const I128*>(p)); }
BL_INLINE I128 vloadi128a(const void* p) noexcept { return _mm_load_si128(static_cast<const I128*>(p)); }
BL_INLINE I128 vloadi128u(const void* p) noexcept { return _mm_loadu_si128(static_cast<const I128*>(p)); }

#if defined(BL_TARGET_OPT_AVX2)
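// The mask of maskload/maskstore uses the most significant bit of each 32-bit
// or 64-bit element to select which elements are loaded/stored; unselected
// elements are zeroed on load and the corresponding memory is left untouched
// on store.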
BL_INLINE I128 vloadi128_mask32(const void* p, const I128& mask) noexcept { return _mm_maskload_epi32(static_cast<const int*>(p), mask); }
BL_INLINE I128 vloadi128_mask64(const void* p, const I128& mask) noexcept { return _mm_maskload_epi64(static_cast<const long long*>(p), mask); }
#endif

BL_INLINE I128 vloadi128_l64(const I128& x, const void* p) noexcept { return vcast<I128>(_mm_loadl_pd(vcast<D128>(x), static_cast<const double*>(p))); }
BL_INLINE I128 vloadi128_h64(const I128& x, const void* p) noexcept { return vcast<I128>(_mm_loadh_pd(vcast<D128>(x), static_cast<const double*>(p))); }

BL_INLINE void vstorei32(void* p, const I128& x) noexcept { static_cast<int*>(p)[0] = _mm_cvtsi128_si32(x); }
BL_INLINE void vstorei64(void* p, const I128& x) noexcept { _mm_storel_epi64(static_cast<I128*>(p), x); }
BL_INLINE void vstorei128a(void* p, const I128& x) noexcept { _mm_store_si128(static_cast<I128*>(p), x); }
BL_INLINE void vstorei128u(void* p, const I128& x) noexcept { _mm_storeu_si128(static_cast<I128*>(p), x); }

BL_INLINE void vstoreli64(void* p, const I128& x) noexcept { _mm_storel_epi64(static_cast<I128*>(p), x); }
BL_INLINE void vstorehi64(void* p, const I128& x) noexcept { _mm_storeh_pd(static_cast<double*>(p), vcast<D128>(x)); }

#if defined(BL_TARGET_OPT_AVX2)
BL_INLINE void vstorei128_mask32(void* p, const I128& x, const I128& mask) noexcept { _mm_maskstore_epi32(static_cast<int*>(p), mask, x); }
BL_INLINE void vstorei128_mask64(void* p, const I128& x, const I128& mask) noexcept { _mm_maskstore_epi64(static_cast<long long*>(p), mask, x); }
#endif

#if defined(BL_TARGET_OPT_SSE4_1)
template<uint32_t I>
BL_INLINE I128 vinsertu8(const I128& x, uint32_t y) noexcept { return _mm_insert_epi8(x, int8_t(y), I); }
template<uint32_t I>
BL_INLINE I128 vinsertu16(const I128& x, uint32_t y) noexcept { return _mm_insert_epi16(x, int16_t(y), I); }
template<uint32_t I>
BL_INLINE I128 vinsertu32(const I128& x, uint32_t y) noexcept { return _mm_insert_epi32(x, int(y), I); }

template<uint32_t I>
BL_INLINE I128 vinsertm8(const I128& x, const void* p) noexcept { return _mm_insert_epi8(x, blMemReadU8(p), I); }
template<uint32_t I>
BL_INLINE I128 vinsertm16(const I128& x, const void* p) noexcept { return _mm_insert_epi16(x, blMemReadU16u(p), I); }
template<uint32_t I>
BL_INLINE I128 vinsertm32(const I128& x, const void* p) noexcept { return _mm_insert_epi32(x, blMemReadU32u(p), I); }

// Convenience function used by RGB24 fetchers.
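// `I` is the destination byte offset of the 3 inserted bytes. An even offset
// is handled as a 16-bit insert covering bytes [I, I+1] followed by a byte
// insert at I+2; an odd offset as a byte insert at I followed by a 16-bit
// insert covering bytes [I+1, I+2].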
template<uint32_t I>
BL_INLINE I128 vinsertm24(const I128& x, const void* p) noexcept {
  const uint8_t* p8 = static_cast<const uint8_t*>(p);
  if ((I & 0x1) == 0)
    return _mm_insert_epi8(_mm_insert_epi16(x, blMemReadU16u(p8), I / 2), blMemReadU8(p8 + 2), I + 2);
  else
    return _mm_insert_epi16(_mm_insert_epi8(x, blMemReadU8(p8), I), blMemReadU16u(p8 + 1), (I + 1) / 2);
}

template<uint32_t I>
BL_INLINE uint32_t vextractu8(const I128& x) noexcept { return uint32_t(_mm_extract_epi8(x, I)); }
template<uint32_t I>
BL_INLINE uint32_t vextractu16(const I128& x) noexcept { return uint32_t(_mm_extract_epi16(x, I)); }
template<uint32_t I>
BL_INLINE uint32_t vextractu32(const I128& x) noexcept { return uint32_t(_mm_extract_epi32(x, I)); }
#endif

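// The vhasmask* predicates compare the whole movemask bit pattern (one bit per
// byte / 32-bit / 64-bit lane) against the expected value, so they test for an
// exact mask rather than for any set bit.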
BL_INLINE bool vhasmaski8(const I128& x, int bits0_15) noexcept { return _mm_movemask_epi8(vcast<I128>(x)) == bits0_15; }
BL_INLINE bool vhasmaski8(const F128& x, int bits0_15) noexcept { return _mm_movemask_epi8(vcast<I128>(x)) == bits0_15; }
BL_INLINE bool vhasmaski8(const D128& x, int bits0_15) noexcept { return _mm_movemask_epi8(vcast<I128>(x)) == bits0_15; }

BL_INLINE bool vhasmaski32(const I128& x, int bits0_3) noexcept { return _mm_movemask_ps(vcast<F128>(x)) == bits0_3; }
BL_INLINE bool vhasmaski64(const I128& x, int bits0_1) noexcept { return _mm_movemask_pd(vcast<D128>(x)) == bits0_1; }

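//! Divides each unsigned 16-bit element by 255 with rounding to nearest, using
//! the identity `x / 255 == ((x + 128) * 257) >> 16`, exact for `x <= 255 * 255`.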
BL_INLINE I128 vdiv255u16(const I128& x) noexcept {
  I128 y = vaddi16(x, v_const_as<I128>(blCommonTable.i128_0080008000800080));
  return vmulhu16(y, v_const_as<I128>(blCommonTable.i128_0101010101010101));
}
#endif

// ============================================================================
// [BLSIMD - F128]
// ============================================================================

#if defined(BL_TARGET_OPT_SSE)
BL_INLINE F128 vzerof128() noexcept { return _mm_setzero_ps(); }

BL_INLINE F128 vsetf128(float x) noexcept { return _mm_set1_ps(x); }
BL_INLINE F128 vsetf128(float x3, float x2, float x1, float x0) noexcept { return _mm_set_ps(x3, x2, x1, x0); }

//! Cast a scalar `float` to `F128` vector type.
BL_INLINE F128 vcvtf32f128(float x) noexcept {
#if defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER)
  // See: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70708
  F128 reg;
  __asm__("" : "=x" (reg) : "0" (x));
  return reg;
#else
  return _mm_set_ss(x);
#endif
}
BL_INLINE float vcvtf128f32(const F128& x) noexcept { return _mm_cvtss_f32(x); }

BL_INLINE F128 vcvti32f128(int32_t x) noexcept { return _mm_cvtsi32_ss(vzerof128(), x); }
BL_INLINE int32_t vcvtf128i32(const F128& x) noexcept { return _mm_cvtss_si32(x); }
BL_INLINE int32_t vcvttf128i32(const F128& x) noexcept { return _mm_cvttss_si32(x); }

#if BL_TARGET_ARCH_BITS >= 64
BL_INLINE F128 vcvti64f128(int64_t x) noexcept { return _mm_cvtsi64_ss(vzerof128(), x); }
BL_INLINE int64_t vcvtf128i64(const F128& x) noexcept { return _mm_cvtss_si64(x); }
BL_INLINE int64_t vcvttf128i64(const F128& x) noexcept { return _mm_cvttss_si64(x); }
#endif

template<int A, int B, int C, int D>
BL_INLINE F128 vshuff32(const F128& x, const F128& y) noexcept { return _mm_shuffle_ps(x, y, _MM_SHUFFLE(A, B, C, D)); }

template<int A, int B, int C, int D>
BL_INLINE F128 vswizf32(const F128& x) noexcept {
#if defined(BL_TARGET_OPT_SSE2) && !defined(BL_TARGET_OPT_AVX)
  return vcast<F128>(vswizi32<A, B, C, D>(vcast<I128>(x)));
#else
  return vshuff32<A, B, C, D>(x, x);
#endif
}

template<int A, int B>
BL_INLINE F128 vswizf64(const F128& x) noexcept {
#if defined(BL_TARGET_OPT_SSE2) && !defined(BL_TARGET_OPT_AVX)
  return vcast<F128>(vswizi64<A, B>(vcast<I128>(x)));
#else
  return vswizf32<A*2 + 1, A*2, B*2 + 1, B*2>(x);
#endif
}

BL_INLINE F128 vduplf32(const F128& x) noexcept { return vswizf32<2, 2, 0, 0>(x); }
BL_INLINE F128 vduphf32(const F128& x) noexcept { return vswizf32<3, 3, 1, 1>(x); }

BL_INLINE F128 vswapf64(const F128& x) noexcept { return vswizf64<0, 1>(x); }
BL_INLINE F128 vduplf64(const F128& x) noexcept { return vswizf64<0, 0>(x); }
BL_INLINE F128 vduphf64(const F128& x) noexcept { return vswizf64<1, 1>(x); }

BL_INLINE F128 vunpacklf32(const F128& x, const F128& y) noexcept { return _mm_unpacklo_ps(x, y); }
BL_INLINE F128 vunpackhf32(const F128& x, const F128& y) noexcept { return _mm_unpackhi_ps(x, y); }

BL_INLINE F128 vor(const F128& x, const F128& y) noexcept { return _mm_or_ps(x, y); }
BL_INLINE F128 vxor(const F128& x, const F128& y) noexcept { return _mm_xor_ps(x, y); }
BL_INLINE F128 vand(const F128& x, const F128& y) noexcept { return _mm_and_ps(x, y); }
BL_INLINE F128 vandnot_a(const F128& x, const F128& y) noexcept { return _mm_andnot_ps(x, y); }
BL_INLINE F128 vandnot_b(const F128& x, const F128& y) noexcept { return _mm_andnot_ps(y, x); }
BL_INLINE F128 vblendmask(const F128& x, const F128& y, const F128& mask) noexcept { return vor(vandnot_a(mask, x), vand(y, mask)); }

BL_INLINE F128 vaddss(const F128& x, const F128& y) noexcept { return _mm_add_ss(x, y); }
BL_INLINE F128 vaddps(const F128& x, const F128& y) noexcept { return _mm_add_ps(x, y); }

BL_INLINE F128 vsubss(const F128& x, const F128& y) noexcept { return _mm_sub_ss(x, y); }
BL_INLINE F128 vsubps(const F128& x, const F128& y) noexcept { return _mm_sub_ps(x, y); }

BL_INLINE F128 vmulss(const F128& x, const F128& y) noexcept { return _mm_mul_ss(x, y); }
BL_INLINE F128 vmulps(const F128& x, const F128& y) noexcept { return _mm_mul_ps(x, y); }

BL_INLINE F128 vdivss(const F128& x, const F128& y) noexcept { return _mm_div_ss(x, y); }
BL_INLINE F128 vdivps(const F128& x, const F128& y) noexcept { return _mm_div_ps(x, y); }

BL_INLINE F128 vminss(const F128& x, const F128& y) noexcept { return _mm_min_ss(x, y); }
BL_INLINE F128 vminps(const F128& x, const F128& y) noexcept { return _mm_min_ps(x, y); }

BL_INLINE F128 vmaxss(const F128& x, const F128& y) noexcept { return _mm_max_ss(x, y); }
BL_INLINE F128 vmaxps(const F128& x, const F128& y) noexcept { return _mm_max_ps(x, y); }

BL_INLINE F128 vcmpeqss(const F128& x, const F128& y) noexcept { return _mm_cmpeq_ss(x, y); }
BL_INLINE F128 vcmpeqps(const F128& x, const F128& y) noexcept { return _mm_cmpeq_ps(x, y); }

BL_INLINE F128 vcmpness(const F128& x, const F128& y) noexcept { return _mm_cmpneq_ss(x, y); }
BL_INLINE F128 vcmpneps(const F128& x, const F128& y) noexcept { return _mm_cmpneq_ps(x, y); }

BL_INLINE F128 vcmpgess(const F128& x, const F128& y) noexcept { return _mm_cmpge_ss(x, y); }
BL_INLINE F128 vcmpgeps(const F128& x, const F128& y) noexcept { return _mm_cmpge_ps(x, y); }

BL_INLINE F128 vcmpgtss(const F128& x, const F128& y) noexcept { return _mm_cmpgt_ss(x, y); }
BL_INLINE F128 vcmpgtps(const F128& x, const F128& y) noexcept { return _mm_cmpgt_ps(x, y); }

BL_INLINE F128 vcmpless(const F128& x, const F128& y) noexcept { return _mm_cmple_ss(x, y); }
BL_INLINE F128 vcmpleps(const F128& x, const F128& y) noexcept { return _mm_cmple_ps(x, y); }

BL_INLINE F128 vcmpltss(const F128& x, const F128& y) noexcept { return _mm_cmplt_ss(x, y); }
BL_INLINE F128 vcmpltps(const F128& x, const F128& y) noexcept { return _mm_cmplt_ps(x, y); }

BL_INLINE F128 vsqrtss(const F128& x) noexcept { return _mm_sqrt_ss(x); }
BL_INLINE F128 vsqrtps(const F128& x) noexcept { return _mm_sqrt_ps(x); }

BL_INLINE F128 vloadf128_32(const void* p) noexcept { return _mm_load_ss(static_cast<const float*>(p)); }
BL_INLINE F128 vloadf128_64(const void* p) noexcept { return vcast<F128>(vloadi128_64(p)); }

BL_INLINE F128 vloadf128a(const void* p) noexcept { return _mm_load_ps(static_cast<const float*>(p)); }
BL_INLINE F128 vloadf128u(const void* p) noexcept { return _mm_loadu_ps(static_cast<const float*>(p)); }

BL_INLINE F128 vloadf128_l64(const F128& x, const void* p) noexcept { return _mm_loadl_pi(x, static_cast<const __m64*>(p)); }
BL_INLINE F128 vloadf128_h64(const F128& x, const void* p) noexcept { return _mm_loadh_pi(x, static_cast<const __m64*>(p)); }

#if defined(BL_TARGET_OPT_AVX)
BL_INLINE F128 vloadf128_mask32(const void* p, const F128& mask) noexcept { return _mm_maskload_ps(static_cast<const float*>(p), vcast<I128>(mask)); }
#endif

BL_INLINE F128 vbroadcastf128_64(const void* p) noexcept {
#if defined(BL_TARGET_OPT_SSE3)
  return vcast<F128>(_mm_loaddup_pd(static_cast<const double*>(p)));
#else
  return vduplf64(vloadf128_64(p));
#endif
}

BL_INLINE void vstoref32(void* p, const F128& x) noexcept { _mm_store_ss(static_cast<float*>(p), x); }
BL_INLINE void vstoref64(void* p, const F128& x) noexcept { _mm_storel_pi(static_cast<__m64*>(p), x); }
BL_INLINE void vstorelf64(void* p, const F128& x) noexcept { _mm_storel_pi(static_cast<__m64*>(p), x); }
BL_INLINE void vstorehf64(void* p, const F128& x) noexcept { _mm_storeh_pi(static_cast<__m64*>(p), x); }
BL_INLINE void vstoref128a(void* p, const F128& x) noexcept { _mm_store_ps(static_cast<float*>(p), x); }
BL_INLINE void vstoref128u(void* p, const F128& x) noexcept { _mm_storeu_ps(static_cast<float*>(p), x); }

#if defined(BL_TARGET_OPT_AVX)
BL_INLINE void vstoref128_mask32(void* p, const F128& x, const F128& mask) noexcept { _mm_maskstore_ps(static_cast<float*>(p), vcast<I128>(mask), x); }
#endif

BL_INLINE bool vhasmaskf32(const F128& x, int bits0_3) noexcept { return _mm_movemask_ps(vcast<F128>(x)) == bits0_3; }
BL_INLINE bool vhasmaskf64(const F128& x, int bits0_1) noexcept { return _mm_movemask_pd(vcast<D128>(x)) == bits0_1; }

// ============================================================================
// [BLSIMD - D128]
// ============================================================================

#if defined(BL_TARGET_OPT_SSE2)
BL_INLINE D128 vzerod128() noexcept { return _mm_setzero_pd(); }

BL_INLINE D128 vsetd128(double x) noexcept { return _mm_set1_pd(x); }
BL_INLINE D128 vsetd128(double x1, double x0) noexcept { return _mm_set_pd(x1, x0); }

//! Cast a scalar `double` to `D128` vector type.
BL_INLINE D128 vcvtd64d128(double x) noexcept {
#if defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER)
  // See: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70708
  D128 reg;
  __asm__("" : "=x" (reg) : "0" (x));
  return reg;
#else
  return _mm_set_sd(x);
#endif
}
BL_INLINE double vcvtd128d64(const D128& x) noexcept { return _mm_cvtsd_f64(x); }

BL_INLINE D128 vcvti32d128(int32_t x) noexcept { return _mm_cvtsi32_sd(vzerod128(), x); }
BL_INLINE int32_t vcvtd128i32(const D128& x) noexcept { return _mm_cvtsd_si32(x); }
BL_INLINE int32_t vcvttd128i32(const D128& x) noexcept { return _mm_cvttsd_si32(x); }

#if BL_TARGET_ARCH_BITS >= 64
BL_INLINE D128 vcvti64d128(int64_t x) noexcept { return _mm_cvtsi64_sd(vzerod128(), x); }
BL_INLINE int64_t vcvtd128i64(const D128& x) noexcept { return _mm_cvtsd_si64(x); }
BL_INLINE int64_t vcvttd128i64(const D128& x) noexcept { return _mm_cvttsd_si64(x); }
#endif

BL_INLINE D128 vcvtf128d128(const F128& x) noexcept { return _mm_cvtps_pd(x); }
BL_INLINE F128 vcvtd128f128(const D128& x) noexcept { return _mm_cvtpd_ps(x); }

BL_INLINE F128 vcvti128f128(const I128& x) noexcept { return _mm_cvtepi32_ps(x); }
BL_INLINE D128 vcvti128d128(const I128& x) noexcept { return _mm_cvtepi32_pd(x); }

BL_INLINE I128 vcvtf128i128(const F128& x) noexcept { return _mm_cvtps_epi32(x); }
BL_INLINE I128 vcvttf128i128(const F128& x) noexcept { return _mm_cvttps_epi32(x); }

BL_INLINE I128 vcvtd128i128(const D128& x) noexcept { return _mm_cvtpd_epi32(x); }
BL_INLINE I128 vcvttd128i128(const D128& x) noexcept { return _mm_cvttpd_epi32(x); }

template<int A, int B>
BL_INLINE D128 vshufd64(const D128& x, const D128& y) noexcept { return _mm_shuffle_pd(x, y, (A << 1) | B); }

template<int A, int B>
BL_INLINE D128 vswizd64(const D128& x) noexcept {
#if !defined(BL_TARGET_OPT_AVX)
  return vcast<D128>(vswizi64<A, B>(vcast<I128>(x)));
#else
  return vshufd64<A, B>(x, x);
#endif
}

BL_INLINE D128 vswapd64(const D128& x) noexcept { return vswizd64<0, 1>(x); }
BL_INLINE D128 vdupld64(const D128& x) noexcept { return vswizd64<0, 0>(x); }
BL_INLINE D128 vduphd64(const D128& x) noexcept { return vswizd64<1, 1>(x); }

BL_INLINE D128 vunpackld64(const D128& x, const D128& y) noexcept { return _mm_unpacklo_pd(x, y); }
BL_INLINE D128 vunpackhd64(const D128& x, const D128& y) noexcept { return _mm_unpackhi_pd(x, y); }

BL_INLINE D128 vor(const D128& x, const D128& y) noexcept { return _mm_or_pd(x, y); }
BL_INLINE D128 vxor(const D128& x, const D128& y) noexcept { return _mm_xor_pd(x, y); }
BL_INLINE D128 vand(const D128& x, const D128& y) noexcept { return _mm_and_pd(x, y); }
BL_INLINE D128 vandnot_a(const D128& x, const D128& y) noexcept { return _mm_andnot_pd(x, y); }
BL_INLINE D128 vandnot_b(const D128& x, const D128& y) noexcept { return _mm_andnot_pd(y, x); }
BL_INLINE D128 vblendmask(const D128& x, const D128& y, const D128& mask) noexcept { return vor(vandnot_a(mask, x), vand(y, mask)); }

BL_INLINE D128 vaddsd(const D128& x, const D128& y) noexcept { return _mm_add_sd(x, y); }
BL_INLINE D128 vaddpd(const D128& x, const D128& y) noexcept { return _mm_add_pd(x, y); }

BL_INLINE D128 vsubsd(const D128& x, const D128& y) noexcept { return _mm_sub_sd(x, y); }
BL_INLINE D128 vsubpd(const D128& x, const D128& y) noexcept { return _mm_sub_pd(x, y); }

BL_INLINE D128 vmulsd(const D128& x, const D128& y) noexcept { return _mm_mul_sd(x, y); }
BL_INLINE D128 vmulpd(const D128& x, const D128& y) noexcept { return _mm_mul_pd(x, y); }

BL_INLINE D128 vdivsd(const D128& x, const D128& y) noexcept { return _mm_div_sd(x, y); }
BL_INLINE D128 vdivpd(const D128& x, const D128& y) noexcept { return _mm_div_pd(x, y); }

BL_INLINE D128 vminsd(const D128& x, const D128& y) noexcept { return _mm_min_sd(x, y); }
BL_INLINE D128 vminpd(const D128& x, const D128& y) noexcept { return _mm_min_pd(x, y); }

BL_INLINE D128 vmaxsd(const D128& x, const D128& y) noexcept { return _mm_max_sd(x, y); }
BL_INLINE D128 vmaxpd(const D128& x, const D128& y) noexcept { return _mm_max_pd(x, y); }

BL_INLINE D128 vcmpeqsd(const D128& x, const D128& y) noexcept { return _mm_cmpeq_sd(x, y); }
BL_INLINE D128 vcmpeqpd(const D128& x, const D128& y) noexcept { return _mm_cmpeq_pd(x, y); }

BL_INLINE D128 vcmpnesd(const D128& x, const D128& y) noexcept { return _mm_cmpneq_sd(x, y); }
BL_INLINE D128 vcmpnepd(const D128& x, const D128& y) noexcept { return _mm_cmpneq_pd(x, y); }

BL_INLINE D128 vcmpgesd(const D128& x, const D128& y) noexcept { return _mm_cmpge_sd(x, y); }
BL_INLINE D128 vcmpgepd(const D128& x, const D128& y) noexcept { return _mm_cmpge_pd(x, y); }

BL_INLINE D128 vcmpgtsd(const D128& x, const D128& y) noexcept { return _mm_cmpgt_sd(x, y); }
BL_INLINE D128 vcmpgtpd(const D128& x, const D128& y) noexcept { return _mm_cmpgt_pd(x, y); }

BL_INLINE D128 vcmplesd(const D128& x, const D128& y) noexcept { return _mm_cmple_sd(x, y); }
BL_INLINE D128 vcmplepd(const D128& x, const D128& y) noexcept { return _mm_cmple_pd(x, y); }

BL_INLINE D128 vcmpltsd(const D128& x, const D128& y) noexcept { return _mm_cmplt_sd(x, y); }
BL_INLINE D128 vcmpltpd(const D128& x, const D128& y) noexcept { return _mm_cmplt_pd(x, y); }

BL_INLINE D128 vsqrtsd(const D128& x) noexcept { return _mm_sqrt_sd(x, x); }
BL_INLINE D128 vsqrtpd(const D128& x) noexcept { return _mm_sqrt_pd(x); }

BL_INLINE D128 vloadd128_64(const void* p) noexcept { return _mm_load_sd(static_cast<const double*>(p)); }
BL_INLINE D128 vloadd128a(const void* p) noexcept { return _mm_load_pd(static_cast<const double*>(p)); }
BL_INLINE D128 vloadd128u(const void* p) noexcept { return _mm_loadu_pd(static_cast<const double*>(p)); }

#if defined(BL_TARGET_OPT_AVX)
BL_INLINE D128 vloadd128_mask64(const void* p, const D128& mask) noexcept { return _mm_maskload_pd(static_cast<const double*>(p), vcast<I128>(mask)); }
#endif

BL_INLINE D128 vloadd128_l64(const D128& x, const void* p) noexcept { return _mm_loadl_pd(x, static_cast<const double*>(p)); }
BL_INLINE D128 vloadd128_h64(const D128& x, const void* p) noexcept { return _mm_loadh_pd(x, static_cast<const double*>(p)); }

BL_INLINE D128 vbroadcastd128_64(const void* p) noexcept {
#if defined(BL_TARGET_OPT_SSE3)
  return _mm_loaddup_pd(static_cast<const double*>(p));
#else
  return vdupld64(vloadd128_64(p));
#endif
}

BL_INLINE void vstored64(void* p, const D128& x) noexcept { _mm_store_sd(static_cast<double*>(p), x); }
BL_INLINE void vstoreld64(void* p, const D128& x) noexcept { _mm_storel_pd(static_cast<double*>(p), x); }
BL_INLINE void vstorehd64(void* p, const D128& x) noexcept { _mm_storeh_pd(static_cast<double*>(p), x); }
BL_INLINE void vstored128a(void* p, const D128& x) noexcept { _mm_store_pd(static_cast<double*>(p), x); }
BL_INLINE void vstored128u(void* p, const D128& x) noexcept { _mm_storeu_pd(static_cast<double*>(p), x); }

#if defined(BL_TARGET_OPT_AVX)
BL_INLINE void vstored128_mask64(void* p, const D128& x, const D128& mask) noexcept { _mm_maskstore_pd(static_cast<double*>(p), vcast<I128>(mask), x); }
801 #endif
802 
vhasmaskd64(const D128 & x,int bits0_1)803 BL_INLINE bool vhasmaskd64(const D128& x, int bits0_1) noexcept { return _mm_movemask_pd(vcast<D128>(x)) == bits0_1; }
804 #endif
805 
806 // ============================================================================
807 // [BLSIMD::I256]
808 // ============================================================================
809 
810 #if defined(BL_TARGET_OPT_AVX)
vzeroi256()811 BL_INLINE I256 vzeroi256() noexcept { return _mm256_setzero_si256(); }
812 
vcvti256f256(const I256 & x)813 BL_INLINE F256 vcvti256f256(const I256& x) noexcept { return _mm256_cvtepi32_ps(x); }
vcvti128d256(const I128 & x)814 BL_INLINE D256 vcvti128d256(const I128& x) noexcept { return _mm256_cvtepi32_pd(vcast<I128>(x)); }
vcvti256d256(const I256 & x)815 BL_INLINE D256 vcvti256d256(const I256& x) noexcept { return _mm256_cvtepi32_pd(vcast<I128>(x)); }
816 #endif
817 
818 #if defined(BL_TARGET_OPT_AVX2)
vseti256i8(int8_t x)819 BL_INLINE I256 vseti256i8(int8_t x) noexcept { return _mm256_set1_epi8(x); }
vseti256i16(int16_t x)820 BL_INLINE I256 vseti256i16(int16_t x) noexcept { return _mm256_set1_epi16(x); }
821 
vseti256i32(int32_t x)822 BL_INLINE I256 vseti256i32(int32_t x) noexcept { return _mm256_set1_epi32(x); }
vseti256i32(int32_t x1,int32_t x0)823 BL_INLINE I256 vseti256i32(int32_t x1, int32_t x0) noexcept { return _mm256_set_epi32(x1, x0, x1, x0, x1, x0, x1, x0); }
vseti256i32(int32_t x3,int32_t x2,int32_t x1,int32_t x0)824 BL_INLINE I256 vseti256i32(int32_t x3, int32_t x2, int32_t x1, int32_t x0) noexcept { return _mm256_set_epi32(x3, x2, x1, x0, x3, x2, x1, x0); }
vseti256i32(int32_t x7,int32_t x6,int32_t x5,int32_t x4,int32_t x3,int32_t x2,int32_t x1,int32_t x0)825 BL_INLINE I256 vseti256i32(int32_t x7, int32_t x6, int32_t x5, int32_t x4, int32_t x3, int32_t x2, int32_t x1, int32_t x0) noexcept { return _mm256_set_epi32(x7, x6, x5, x4, x3, x2, x1, x0); }
826 
BL_INLINE I256 vseti256i64(int64_t x) noexcept {
#if BL_TARGET_ARCH_BITS >= 64
  return _mm256_set1_epi64x(x);
#else
  return vseti256i32(int32_t(uint64_t(x) >> 32), int32_t(x & 0xFFFFFFFFu));
#endif
}

BL_INLINE I256 vseti256i64(int64_t x1, int64_t x0) noexcept {
  return vseti256i32(int32_t(uint64_t(x1) >> 32), int32_t(x1 & 0xFFFFFFFFu),
                     int32_t(uint64_t(x0) >> 32), int32_t(x0 & 0xFFFFFFFFu),
                     int32_t(uint64_t(x1) >> 32), int32_t(x1 & 0xFFFFFFFFu),
                     int32_t(uint64_t(x0) >> 32), int32_t(x0 & 0xFFFFFFFFu));
}

BL_INLINE I256 vseti256i64(int64_t x3, int64_t x2, int64_t x1, int64_t x0) noexcept {
  return vseti256i32(int32_t(uint64_t(x3) >> 32), int32_t(x3 & 0xFFFFFFFFu),
                     int32_t(uint64_t(x2) >> 32), int32_t(x2 & 0xFFFFFFFFu),
                     int32_t(uint64_t(x1) >> 32), int32_t(x1 & 0xFFFFFFFFu),
                     int32_t(uint64_t(x0) >> 32), int32_t(x0 & 0xFFFFFFFFu));
}

BL_INLINE I256 vseti256u8(uint8_t x) noexcept { return vseti256i8(int8_t(x)); }
BL_INLINE I256 vseti256u16(uint16_t x) noexcept { return vseti256i16(int16_t(x)); }
BL_INLINE I256 vseti256u32(uint32_t x) noexcept { return vseti256i32(int32_t(x)); }
BL_INLINE I256 vseti256u64(uint64_t x) noexcept { return vseti256i64(int64_t(x)); }

BL_INLINE I256 vseti256u32(uint32_t x1, uint32_t x0) noexcept {
  return vseti256i32(int32_t(x1), int32_t(x0), int32_t(x1), int32_t(x0),
                     int32_t(x1), int32_t(x0), int32_t(x1), int32_t(x0));
}

BL_INLINE I256 vseti256u32(uint32_t x3, uint32_t x2, uint32_t x1, uint32_t x0) noexcept {
  return vseti256i32(int32_t(x3), int32_t(x2), int32_t(x1), int32_t(x0),
                     int32_t(x3), int32_t(x2), int32_t(x1), int32_t(x0));
}

BL_INLINE I256 vseti256u32(uint32_t x7, uint32_t x6, uint32_t x5, uint32_t x4, uint32_t x3, uint32_t x2, uint32_t x1, uint32_t x0) noexcept {
  return vseti256i32(int32_t(x7), int32_t(x6), int32_t(x5), int32_t(x4),
                     int32_t(x3), int32_t(x2), int32_t(x1), int32_t(x0));
}

BL_INLINE I256 vseti256u64(uint64_t x1, uint64_t x0) noexcept {
  return vseti256i64(int64_t(x1), int64_t(x0));
}

BL_INLINE I256 vseti256u64(uint64_t x3, uint64_t x2, uint64_t x1, uint64_t x0) noexcept {
  return vseti256i64(int64_t(x3), int64_t(x2), int64_t(x1), int64_t(x0));
}

BL_INLINE I256 vcvti32i256(int32_t x) noexcept { return vcast<I256>(vcvti32i128(x)); }
BL_INLINE I256 vcvtu32i256(uint32_t x) noexcept { return vcast<I256>(vcvtu32i128(x)); }

BL_INLINE int32_t vcvti256i32(const I256& x) noexcept { return vcvti128i32(vcast<I128>(x)); }
BL_INLINE uint32_t vcvti256u32(const I256& x) noexcept { return vcvti128u32(vcast<I128>(x)); }

BL_INLINE I256 vcvti64i256(int64_t x) noexcept { return vcast<I256>(vcvti64i128(x)); }
BL_INLINE I256 vcvtu64i256(uint64_t x) noexcept { return vcast<I256>(vcvtu64i128(x)); }

BL_INLINE int64_t vcvti256i64(const I256& x) noexcept { return vcvti128i64(vcast<I128>(x)); }
BL_INLINE uint64_t vcvti256u64(const I256& x) noexcept { return vcvti128u64(vcast<I128>(x)); }

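// NOTE: In vpermi128<A, B>() the template arguments select the 128-bit lanes
// of the destination: `A` picks the high lane and `B` the low lane. Values
// 0-1 refer to the low/high lane of `x`, 2-3 to the low/high lane of `y`, and
// setting bit 3 (value 8) zeroes that lane, mirroring the VPERM2I128 immediate.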
template<int A, int B>
BL_INLINE I256 vpermi128(const I256& x, const I256& y) noexcept { return _mm256_permute2x128_si256(x, y, ((A & 0xF) << 4) + (B & 0xF)); }
template<int A, int B>
BL_INLINE I256 vpermi128(const I256& x) noexcept { return vpermi128<A, B>(x, x); }

template<uint8_t A, uint8_t B, uint8_t C, uint8_t D>
BL_INLINE I256 vswizli16(const I256& x) noexcept { return _mm256_shufflelo_epi16(x, _MM_SHUFFLE(A, B, C, D)); }
template<uint8_t A, uint8_t B, uint8_t C, uint8_t D>
BL_INLINE I256 vswizhi16(const I256& x) noexcept { return _mm256_shufflehi_epi16(x, _MM_SHUFFLE(A, B, C, D)); }

template<uint8_t A, uint8_t B, uint8_t C, uint8_t D>
BL_INLINE I256 vswizi16(const I256& x) noexcept { return vswizhi16<A, B, C, D>(vswizli16<A, B, C, D>(x)); }
template<uint8_t A, uint8_t B, uint8_t C, uint8_t D>
BL_INLINE I256 vswizi32(const I256& x) noexcept { return _mm256_shuffle_epi32(x, _MM_SHUFFLE(A, B, C, D)); }
template<int A, int B>
BL_INLINE I256 vswizi64(const I256& x) noexcept { return vswizi32<A*2 + 1, A*2, B*2 + 1, B*2>(x); }

BL_INLINE I256 vpshufb(const I256& x, const I256& y) noexcept { return _mm256_shuffle_epi8(x, y); }

template<int N_BYTES>
BL_INLINE I256 vpalignr(const I256& x, const I256& y) noexcept { return _mm256_alignr_epi8(x, y, N_BYTES); }

BL_INLINE I256 vsplati8i256(const I128& x) noexcept { return _mm256_broadcastb_epi8(vcast<I128>(x)); }
BL_INLINE I256 vsplati8i256(const I256& x) noexcept { return _mm256_broadcastb_epi8(vcast<I128>(x)); }

BL_INLINE I256 vsplati16i256(const I128& x) noexcept { return _mm256_broadcastw_epi16(vcast<I128>(x)); }
BL_INLINE I256 vsplati16i256(const I256& x) noexcept { return _mm256_broadcastw_epi16(vcast<I128>(x)); }

BL_INLINE I256 vsplati32i256(const I128& x) noexcept { return _mm256_broadcastd_epi32(vcast<I128>(x)); }
BL_INLINE I256 vsplati32i256(const I256& x) noexcept { return _mm256_broadcastd_epi32(vcast<I128>(x)); }

BL_INLINE I256 vsplati64i256(const I128& x) noexcept { return _mm256_broadcastq_epi64(vcast<I128>(x)); }
BL_INLINE I256 vsplati64i256(const I256& x) noexcept { return _mm256_broadcastq_epi64(vcast<I128>(x)); }

BL_INLINE I256 vswapi64(const I256& x) noexcept { return vswizi64<0, 1>(x); }
BL_INLINE I256 vdupli64(const I256& x) noexcept { return vswizi64<0, 0>(x); }
BL_INLINE I256 vduphi64(const I256& x) noexcept { return vswizi64<1, 1>(x); }

BL_INLINE I256 vswapi128(const I256& x) noexcept { return vpermi128<0, 1>(x); }
BL_INLINE I256 vdupli128(const I128& x) noexcept { return vpermi128<0, 0>(vcast<I256>(x)); }
BL_INLINE I256 vdupli128(const I256& x) noexcept { return vpermi128<0, 0>(x); }
BL_INLINE I256 vduphi128(const I256& x) noexcept { return vpermi128<1, 1>(x); }

BL_INLINE I256 vmovli128u8u16(const I128& x) noexcept { return _mm256_cvtepu8_epi16(x); }
BL_INLINE I256 vmovli128u8u32(const I128& x) noexcept { return _mm256_cvtepu8_epi32(x); }
BL_INLINE I256 vmovli128u8u64(const I128& x) noexcept { return _mm256_cvtepu8_epi64(x); }
BL_INLINE I256 vmovli128u16u32(const I128& x) noexcept { return _mm256_cvtepu16_epi32(x); }
BL_INLINE I256 vmovli128u16u64(const I128& x) noexcept { return _mm256_cvtepu16_epi64(x); }
BL_INLINE I256 vmovli128u32u64(const I128& x) noexcept { return _mm256_cvtepu32_epi64(x); }

BL_INLINE I256 vpacki16i8(const I256& x, const I256& y) noexcept { return _mm256_packs_epi16(x, y); }
BL_INLINE I256 vpacki16u8(const I256& x, const I256& y) noexcept { return _mm256_packus_epi16(x, y); }
BL_INLINE I256 vpacki32i16(const I256& x, const I256& y) noexcept { return _mm256_packs_epi32(x, y); }
BL_INLINE I256 vpacki32u16(const I256& x, const I256& y) noexcept { return _mm256_packus_epi32(x, y); }

BL_INLINE I256 vpacki16i8(const I256& x) noexcept { return vpacki16i8(x, x); }
BL_INLINE I256 vpacki16u8(const I256& x) noexcept { return vpacki16u8(x, x); }
BL_INLINE I256 vpacki32i16(const I256& x) noexcept { return vpacki32i16(x, x); }
BL_INLINE I256 vpacki32u16(const I256& x) noexcept { return vpacki32u16(x, x); }

BL_INLINE I256 vpacki32i8(const I256& x) noexcept { return vpacki16i8(vpacki32i16(x)); }
BL_INLINE I256 vpacki32i8(const I256& x, const I256& y) noexcept { return vpacki16i8(vpacki32i16(x, y)); }
BL_INLINE I256 vpacki32i8(const I256& x, const I256& y, const I256& z, const I256& w) noexcept { return vpacki16i8(vpacki32i16(x, y), vpacki32i16(z, w)); }

BL_INLINE I256 vpacki32u8(const I256& x) noexcept { return vpacki16u8(vpacki32i16(x)); }
BL_INLINE I256 vpacki32u8(const I256& x, const I256& y) noexcept { return vpacki16u8(vpacki32i16(x, y)); }
BL_INLINE I256 vpacki32u8(const I256& x, const I256& y, const I256& z, const I256& w) noexcept { return vpacki16u8(vpacki32i16(x, y), vpacki32i16(z, w)); }

BL_INLINE I256 vpackzzdb(const I256& x, const I256& y) noexcept { return vpacki16u8(vpacki32i16(x, y)); }
BL_INLINE I256 vpackzzdb(const I256& x, const I256& y, const I256& z, const I256& w) noexcept { return vpacki16u8(vpacki32i16(x, y), vpacki32i16(z, w)); }

BL_INLINE I256 vunpackli8(const I256& x, const I256& y) noexcept { return _mm256_unpacklo_epi8(x, y); }
BL_INLINE I256 vunpackhi8(const I256& x, const I256& y) noexcept { return _mm256_unpackhi_epi8(x, y); }

BL_INLINE I256 vunpackli16(const I256& x, const I256& y) noexcept { return _mm256_unpacklo_epi16(x, y); }
BL_INLINE I256 vunpackhi16(const I256& x, const I256& y) noexcept { return _mm256_unpackhi_epi16(x, y); }

BL_INLINE I256 vunpackli32(const I256& x, const I256& y) noexcept { return _mm256_unpacklo_epi32(x, y); }
BL_INLINE I256 vunpackhi32(const I256& x, const I256& y) noexcept { return _mm256_unpackhi_epi32(x, y); }

BL_INLINE I256 vunpackli64(const I256& x, const I256& y) noexcept { return _mm256_unpacklo_epi64(x, y); }
BL_INLINE I256 vunpackhi64(const I256& x, const I256& y) noexcept { return _mm256_unpackhi_epi64(x, y); }

BL_INLINE I256 vunpackli128(const I256& x, const I256& y) noexcept { return vpermi128<2, 0>(x, y); }
BL_INLINE I256 vunpackhi128(const I256& x, const I256& y) noexcept { return vpermi128<3, 1>(x, y); }

BL_INLINE I256 vor(const I256& x, const I256& y) noexcept { return _mm256_or_si256(x, y); }
BL_INLINE I256 vxor(const I256& x, const I256& y) noexcept { return _mm256_xor_si256(x, y); }
BL_INLINE I256 vand(const I256& x, const I256& y) noexcept { return _mm256_and_si256(x, y); }
BL_INLINE I256 vandnot_a(const I256& x, const I256& y) noexcept { return _mm256_andnot_si256(x, y); }
BL_INLINE I256 vandnot_b(const I256& x, const I256& y) noexcept { return _mm256_andnot_si256(y, x); }

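// NOTE: vandnot_a() computes `~x & y` while vandnot_b() computes `x & ~y`.
// vblendmask() is the classic bit-select: bits where `mask` is set come from
// `y`, the remaining bits come from `x`. vblendx() does the same per byte,
// but VPBLENDVB only honors the MSB of each mask byte.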
BL_INLINE I256 vblendmask(const I256& x, const I256& y, const I256& mask) noexcept { return vor(vandnot_a(mask, x), vand(y, mask)); }
BL_INLINE I256 vblendx(const I256& x, const I256& y, const I256& mask) noexcept { return _mm256_blendv_epi8(x, y, mask); }

BL_INLINE I256 vaddi8(const I256& x, const I256& y) noexcept { return _mm256_add_epi8(x, y); }
BL_INLINE I256 vaddi16(const I256& x, const I256& y) noexcept { return _mm256_add_epi16(x, y); }
BL_INLINE I256 vaddi32(const I256& x, const I256& y) noexcept { return _mm256_add_epi32(x, y); }
BL_INLINE I256 vaddi64(const I256& x, const I256& y) noexcept { return _mm256_add_epi64(x, y); }

BL_INLINE I256 vaddsi8(const I256& x, const I256& y) noexcept { return _mm256_adds_epi8(x, y); }
BL_INLINE I256 vaddsu8(const I256& x, const I256& y) noexcept { return _mm256_adds_epu8(x, y); }
BL_INLINE I256 vaddsi16(const I256& x, const I256& y) noexcept { return _mm256_adds_epi16(x, y); }
BL_INLINE I256 vaddsu16(const I256& x, const I256& y) noexcept { return _mm256_adds_epu16(x, y); }

BL_INLINE I256 vsubi8(const I256& x, const I256& y) noexcept { return _mm256_sub_epi8(x, y); }
BL_INLINE I256 vsubi16(const I256& x, const I256& y) noexcept { return _mm256_sub_epi16(x, y); }
BL_INLINE I256 vsubi32(const I256& x, const I256& y) noexcept { return _mm256_sub_epi32(x, y); }
BL_INLINE I256 vsubi64(const I256& x, const I256& y) noexcept { return _mm256_sub_epi64(x, y); }

BL_INLINE I256 vsubsi8(const I256& x, const I256& y) noexcept { return _mm256_subs_epi8(x, y); }
BL_INLINE I256 vsubsu8(const I256& x, const I256& y) noexcept { return _mm256_subs_epu8(x, y); }
BL_INLINE I256 vsubsi16(const I256& x, const I256& y) noexcept { return _mm256_subs_epi16(x, y); }
BL_INLINE I256 vsubsu16(const I256& x, const I256& y) noexcept { return _mm256_subs_epu16(x, y); }

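// NOTE: The low half of a 16-bit (or 32-bit) product is identical for signed
// and unsigned operands, so vmulu16()/vmulu32() can reuse the signed `mullo`
// intrinsics; only the `mulhi` variants need distinct instructions.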
BL_INLINE I256 vmuli16(const I256& x, const I256& y) noexcept { return _mm256_mullo_epi16(x, y); }
BL_INLINE I256 vmulu16(const I256& x, const I256& y) noexcept { return _mm256_mullo_epi16(x, y); }
BL_INLINE I256 vmulhi16(const I256& x, const I256& y) noexcept { return _mm256_mulhi_epi16(x, y); }
BL_INLINE I256 vmulhu16(const I256& x, const I256& y) noexcept { return _mm256_mulhi_epu16(x, y); }

BL_INLINE I256 vmuli32(const I256& x, const I256& y) noexcept { return _mm256_mullo_epi32(x, y); }
BL_INLINE I256 vmulu32(const I256& x, const I256& y) noexcept { return _mm256_mullo_epi32(x, y); }

BL_INLINE I256 vmaddi16i32(const I256& x, const I256& y) noexcept { return _mm256_madd_epi16(x, y); }

template<uint8_t N_BITS> BL_INLINE I256 vslli16(const I256& x) noexcept { return N_BITS ? _mm256_slli_epi16(x, N_BITS) : x; }
template<uint8_t N_BITS> BL_INLINE I256 vslli32(const I256& x) noexcept { return N_BITS ? _mm256_slli_epi32(x, N_BITS) : x; }
template<uint8_t N_BITS> BL_INLINE I256 vslli64(const I256& x) noexcept { return N_BITS ? _mm256_slli_epi64(x, N_BITS) : x; }

template<uint8_t N_BITS> BL_INLINE I256 vsrli16(const I256& x) noexcept { return N_BITS ? _mm256_srli_epi16(x, N_BITS) : x; }
template<uint8_t N_BITS> BL_INLINE I256 vsrli32(const I256& x) noexcept { return N_BITS ? _mm256_srli_epi32(x, N_BITS) : x; }
template<uint8_t N_BITS> BL_INLINE I256 vsrli64(const I256& x) noexcept { return N_BITS ? _mm256_srli_epi64(x, N_BITS) : x; }

template<uint8_t N_BITS> BL_INLINE I256 vsrai16(const I256& x) noexcept { return N_BITS ? _mm256_srai_epi16(x, N_BITS) : x; }
template<uint8_t N_BITS> BL_INLINE I256 vsrai32(const I256& x) noexcept { return N_BITS ? _mm256_srai_epi32(x, N_BITS) : x; }

template<uint8_t N_BYTES> BL_INLINE I256 vslli128b(const I256& x) noexcept { return N_BYTES ? _mm256_slli_si256(x, N_BYTES) : x; }
template<uint8_t N_BYTES> BL_INLINE I256 vsrli128b(const I256& x) noexcept { return N_BYTES ? _mm256_srli_si256(x, N_BYTES) : x; }

BL_INLINE I256 vmini8(const I256& x, const I256& y) noexcept { return _mm256_min_epi8(x, y); }
BL_INLINE I256 vmaxi8(const I256& x, const I256& y) noexcept { return _mm256_max_epi8(x, y); }
BL_INLINE I256 vminu8(const I256& x, const I256& y) noexcept { return _mm256_min_epu8(x, y); }
BL_INLINE I256 vmaxu8(const I256& x, const I256& y) noexcept { return _mm256_max_epu8(x, y); }

BL_INLINE I256 vmini16(const I256& x, const I256& y) noexcept { return _mm256_min_epi16(x, y); }
BL_INLINE I256 vmaxi16(const I256& x, const I256& y) noexcept { return _mm256_max_epi16(x, y); }
BL_INLINE I256 vminu16(const I256& x, const I256& y) noexcept { return _mm256_min_epu16(x, y); }
BL_INLINE I256 vmaxu16(const I256& x, const I256& y) noexcept { return _mm256_max_epu16(x, y); }

BL_INLINE I256 vmini32(const I256& x, const I256& y) noexcept { return _mm256_min_epi32(x, y); }
BL_INLINE I256 vmaxi32(const I256& x, const I256& y) noexcept { return _mm256_max_epi32(x, y); }
BL_INLINE I256 vminu32(const I256& x, const I256& y) noexcept { return _mm256_min_epu32(x, y); }
BL_INLINE I256 vmaxu32(const I256& x, const I256& y) noexcept { return _mm256_max_epu32(x, y); }

BL_INLINE I256 vcmpeqi8(const I256& x, const I256& y) noexcept { return _mm256_cmpeq_epi8(x, y); }
BL_INLINE I256 vcmpgti8(const I256& x, const I256& y) noexcept { return _mm256_cmpgt_epi8(x, y); }

BL_INLINE I256 vcmpeqi16(const I256& x, const I256& y) noexcept { return _mm256_cmpeq_epi16(x, y); }
BL_INLINE I256 vcmpgti16(const I256& x, const I256& y) noexcept { return _mm256_cmpgt_epi16(x, y); }

BL_INLINE I256 vcmpeqi32(const I256& x, const I256& y) noexcept { return _mm256_cmpeq_epi32(x, y); }
BL_INLINE I256 vcmpgti32(const I256& x, const I256& y) noexcept { return _mm256_cmpgt_epi32(x, y); }

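// NOTE: The masked loads/stores below follow VPMASKMOVD/VPMASKMOVQ semantics:
// only elements whose mask has the most significant bit set are transferred;
// the remaining elements are zeroed on load and left untouched in memory on
// store.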
BL_INLINE I256 vloadi256_32(const void* p) noexcept { return vcast<I256>(vloadi128_32(p)); }
BL_INLINE I256 vloadi256_64(const void* p) noexcept { return vcast<I256>(vloadi128_64(p)); }
BL_INLINE I256 vloadi256_128a(const void* p) noexcept { return vcast<I256>(vloadi128a(p)); }
BL_INLINE I256 vloadi256_128u(const void* p) noexcept { return vcast<I256>(vloadi128u(p)); }
BL_INLINE I256 vloadi256a(const void* p) noexcept { return _mm256_load_si256(static_cast<const I256*>(p)); }
BL_INLINE I256 vloadi256u(const void* p) noexcept { return _mm256_loadu_si256(static_cast<const I256*>(p)); }
BL_INLINE I256 vloadi256_mask32(const void* p, const I256& mask) noexcept { return _mm256_maskload_epi32(static_cast<const int*>(p), mask); }
BL_INLINE I256 vloadi256_mask64(const void* p, const I256& mask) noexcept { return _mm256_maskload_epi64(static_cast<const long long*>(p), mask); }

BL_INLINE I256 vloadi256_l64(const I256& x, const void* p) noexcept { return vcast<I256>(vloadi128_l64(vcast<I128>(x), p)); }
BL_INLINE I256 vloadi256_h64(const I256& x, const void* p) noexcept { return vcast<I256>(vloadi128_h64(vcast<I128>(x), p)); }

template<uint32_t I>
BL_INLINE I256 vinsertu8(const I256& x, uint32_t y) noexcept { return _mm256_insert_epi8(x, int8_t(y), I); }
template<uint32_t I>
BL_INLINE I256 vinsertu16(const I256& x, uint32_t y) noexcept { return _mm256_insert_epi16(x, int16_t(y), I); }
template<uint32_t I>
BL_INLINE I256 vinsertu32(const I256& x, uint32_t y) noexcept { return _mm256_insert_epi32(x, y, I); }

template<uint32_t I>
BL_INLINE I256 vinsertum8(const I256& x, const void* p) noexcept { return _mm256_insert_epi8(x, blMemReadU8(p), I); }
template<uint32_t I>
BL_INLINE I256 vinsertum16(const I256& x, const void* p) noexcept { return _mm256_insert_epi16(x, blMemReadU16u(p), I); }
template<uint32_t I>
BL_INLINE I256 vinsertum32(const I256& x, const void* p) noexcept { return _mm256_insert_epi32(x, blMemReadU32u(p), I); }

template<uint32_t I>
BL_INLINE uint32_t vextractu8(const I256& x) noexcept { return uint32_t(_mm256_extract_epi8(x, I)); }
template<uint32_t I>
BL_INLINE uint32_t vextractu16(const I256& x) noexcept { return uint32_t(_mm256_extract_epi16(x, I)); }
template<uint32_t I>
BL_INLINE uint32_t vextractu32(const I256& x) noexcept { return uint32_t(_mm256_extract_epi32(x, I)); }

BL_INLINE void vstorei32(void* p, const I256& x) noexcept { vstorei32(p, vcast<I128>(x)); }
BL_INLINE void vstorei64(void* p, const I256& x) noexcept { vstorei64(p, vcast<I128>(x)); }
BL_INLINE void vstorei128a(void* p, const I256& x) noexcept { vstorei128a(p, vcast<I128>(x)); }
BL_INLINE void vstorei128u(void* p, const I256& x) noexcept { vstorei128u(p, vcast<I128>(x)); }
BL_INLINE void vstorei256a(void* p, const I256& x) noexcept { _mm256_store_si256(static_cast<I256*>(p), x); }
BL_INLINE void vstorei256u(void* p, const I256& x) noexcept { _mm256_storeu_si256(static_cast<I256*>(p), x); }
BL_INLINE void vstorei256_mask32(void* p, const I256& x, const I256& mask) noexcept { _mm256_maskstore_epi32(static_cast<int*>(p), mask, x); }
BL_INLINE void vstorei256_mask64(void* p, const I256& x, const I256& mask) noexcept { _mm256_maskstore_epi64(static_cast<long long*>(p), mask, x); }

BL_INLINE void vstoreli64(void* p, const I256& x) noexcept { vstoreli64(p, vcast<I128>(x)); }
BL_INLINE void vstorehi64(void* p, const I256& x) noexcept { vstorehi64(p, vcast<I128>(x)); }

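// NOTE: The vhasmask*() helpers gather the sign bit of every lane via
// PMOVMSKB/MOVMSKPS/MOVMSKPD and compare the result with the expected bit
// pattern, which makes it easy to test the outcome of a vector comparison
// (e.g. "all lanes equal" or "no lane equal").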
BL_INLINE bool vhasmaski8(const I256& x, int bits0_31) noexcept { return _mm256_movemask_epi8(vcast<I256>(x)) == bits0_31; }
BL_INLINE bool vhasmaski8(const F256& x, int bits0_31) noexcept { return _mm256_movemask_epi8(vcast<I256>(x)) == bits0_31; }
BL_INLINE bool vhasmaski8(const D256& x, int bits0_31) noexcept { return _mm256_movemask_epi8(vcast<I256>(x)) == bits0_31; }

BL_INLINE bool vhasmaski32(const I256& x, int bits0_7) noexcept { return _mm256_movemask_ps(vcast<F256>(x)) == bits0_7; }
BL_INLINE bool vhasmaski64(const I256& x, int bits0_3) noexcept { return _mm256_movemask_pd(vcast<D256>(x)) == bits0_3; }

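// NOTE: vdiv255u16() performs a rounded division by 255 of each unsigned
// 16-bit lane, the usual step when working with 8-bit alpha products: after
// biasing by 0x0080, multiplying by 0x0101 sums the value with itself shifted
// left by 8, so the high 16 bits taken by vmulhu16() yield
// ((x + 128) + ((x + 128) >> 8)) >> 8 for the intended 8-bit x 8-bit range.
// Illustrative use (operand names are hypothetical):
//
//   I256 premultiplied = vdiv255u16(vmuli16(color_u16, alpha_u16));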
BL_INLINE I256 vdiv255u16(const I256& x) noexcept {
  I256 y = vaddi16(x, v_const_as<I256>(blCommonTable.i256_0080008000800080));
  return vmulhu16(y, v_const_as<I256>(blCommonTable.i256_0101010101010101));
}
#endif

// ============================================================================
// [BLSIMD::F256]
// ============================================================================

#if defined(BL_TARGET_OPT_AVX)
BL_INLINE F256 vzerof256() noexcept { return _mm256_setzero_ps(); }

BL_INLINE F256 vsetf256(float x) noexcept { return _mm256_set1_ps(x); }
BL_INLINE F256 vsetf256(float x1, float x0) noexcept { return _mm256_set_ps(x1, x0, x1, x0, x1, x0, x1, x0); }
BL_INLINE F256 vsetf256(float x3, float x2, float x1, float x0) noexcept { return _mm256_set_ps(x3, x2, x1, x0, x3, x2, x1, x0); }
BL_INLINE F256 vsetf256(float x7, float x6, float x5, float x4, float x3, float x2, float x1, float x0) noexcept { return _mm256_set_ps(x7, x6, x5, x4, x3, x2, x1, x0); }

BL_INLINE F256 vcvtf32f256(float x) noexcept { return vcast<F256>(vcvtf32f128(x)); }
BL_INLINE float vcvtf256f32(const F256& x) noexcept { return vcvtf128f32(vcast<F128>(x)); }

BL_INLINE F256 vcvti32f256(int32_t x) noexcept { return vcast<F256>(vcvti32f128(x)); }
BL_INLINE int32_t vcvtf256i32(const F256& x) noexcept { return vcvtf128i32(vcast<F128>(x)); }
BL_INLINE int32_t vcvttf256i32(const F256& x) noexcept { return vcvttf128i32(vcast<F128>(x)); }

#if BL_TARGET_ARCH_BITS >= 64
BL_INLINE F256 vcvti64f256(int64_t x) noexcept { return vcast<F256>(vcvti64f128(x)); }
BL_INLINE int64_t vcvtf256i64(const F256& x) noexcept { return vcvtf128i64(vcast<F128>(x)); }
BL_INLINE int64_t vcvttf256i64(const F256& x) noexcept { return vcvttf128i64(vcast<F128>(x)); }
#endif

BL_INLINE I256 vcvtf256i256(const F256& x) noexcept { return _mm256_cvtps_epi32(x); }
BL_INLINE I256 vcvttf256i256(const F256& x) noexcept { return _mm256_cvttps_epi32(x); }

BL_INLINE D256 vcvtf128d256(const F128& x) noexcept { return _mm256_cvtps_pd(vcast<F128>(x)); }
BL_INLINE D256 vcvtf256d256(const F256& x) noexcept { return _mm256_cvtps_pd(vcast<F128>(x)); }

template<int A, int B, int C, int D>
BL_INLINE F256 vshuff32(const F256& x, const F256& y) noexcept { return _mm256_shuffle_ps(x, y, _MM_SHUFFLE(A, B, C, D)); }
template<int A, int B, int C, int D>
BL_INLINE F256 vswizf32(const F256& x) noexcept { return vshuff32<A, B, C, D>(x, x); }

template<int A, int B>
BL_INLINE F256 vswizf64(const F256& x) noexcept { return vshuff32<A*2 + 1, A*2, B*2 + 1, B*2>(x, x); }

template<int A, int B>
BL_INLINE F256 vpermf128(const F256& x, const F256& y) noexcept { return _mm256_permute2f128_ps(x, y, ((A & 0xF) << 4) + (B & 0xF)); }
template<int A, int B>
BL_INLINE F256 vpermf128(const F256& x) noexcept { return vpermf128<A, B>(x, x); }

BL_INLINE F256 vduplf32(const F256& x) noexcept { return vswizf32<2, 2, 0, 0>(x); }
BL_INLINE F256 vduphf32(const F256& x) noexcept { return vswizf32<3, 3, 1, 1>(x); }

BL_INLINE F256 vswapf64(const F256& x) noexcept { return vswizf64<0, 1>(x); }
BL_INLINE F256 vduplf64(const F256& x) noexcept { return vswizf64<0, 0>(x); }
BL_INLINE F256 vduphf64(const F256& x) noexcept { return vswizf64<1, 1>(x); }

BL_INLINE F256 vswapf128(const F256& x) noexcept { return vpermf128<0, 1>(x); }
BL_INLINE F256 vduplf128(const F128& x) noexcept { return vpermf128<0, 0>(vcast<F256>(x)); }
BL_INLINE F256 vduplf128(const F256& x) noexcept { return vpermf128<0, 0>(x); }
BL_INLINE F256 vduphf128(const F256& x) noexcept { return vpermf128<1, 1>(x); }

BL_INLINE F256 vunpacklf32(const F256& x, const F256& y) noexcept { return _mm256_unpacklo_ps(x, y); }
BL_INLINE F256 vunpackhf32(const F256& x, const F256& y) noexcept { return _mm256_unpackhi_ps(x, y); }

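// NOTE: AVX2 provides a register-to-register broadcast (VBROADCASTSS ymm, xmm),
// while plain AVX only has the memory form, so the fallback builds the splat
// from a 4-lane swizzle followed by a 128-bit lane duplication.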
#if defined(BL_TARGET_OPT_AVX2)
BL_INLINE F256 vsplatf32f256(const F128& x) noexcept { return _mm256_broadcastss_ps(vcast<F128>(x)); }
BL_INLINE F256 vsplatf32f256(const F256& x) noexcept { return _mm256_broadcastss_ps(vcast<F128>(x)); }
#else
BL_INLINE F256 vsplatf32f256(const F128& x) noexcept { return vduplf128(vswizf32<0, 0, 0, 0>(vcast<F128>(x))); }
BL_INLINE F256 vsplatf32f256(const F256& x) noexcept { return vduplf128(vswizf32<0, 0, 0, 0>(vcast<F128>(x))); }
#endif

BL_INLINE F256 vor(const F256& x, const F256& y) noexcept { return _mm256_or_ps(x, y); }
BL_INLINE F256 vxor(const F256& x, const F256& y) noexcept { return _mm256_xor_ps(x, y); }
BL_INLINE F256 vand(const F256& x, const F256& y) noexcept { return _mm256_and_ps(x, y); }
BL_INLINE F256 vandnot_a(const F256& x, const F256& y) noexcept { return _mm256_andnot_ps(x, y); }
BL_INLINE F256 vandnot_b(const F256& x, const F256& y) noexcept { return _mm256_andnot_ps(y, x); }
BL_INLINE F256 vblendmask(const F256& x, const F256& y, const F256& mask) noexcept { return vor(vandnot_a(mask, x), vand(y, mask)); }

BL_INLINE F256 vaddss(const F256& x, const F256& y) noexcept { return vcast<F256>(vaddss(vcast<F128>(x), vcast<F128>(y))); }
BL_INLINE F256 vaddps(const F256& x, const F256& y) noexcept { return _mm256_add_ps(x, y); }

BL_INLINE F256 vsubss(const F256& x, const F256& y) noexcept { return vcast<F256>(vsubss(vcast<F128>(x), vcast<F128>(y))); }
BL_INLINE F256 vsubps(const F256& x, const F256& y) noexcept { return _mm256_sub_ps(x, y); }

BL_INLINE F256 vmulss(const F256& x, const F256& y) noexcept { return vcast<F256>(vmulss(vcast<F128>(x), vcast<F128>(y))); }
BL_INLINE F256 vmulps(const F256& x, const F256& y) noexcept { return _mm256_mul_ps(x, y); }

BL_INLINE F256 vdivss(const F256& x, const F256& y) noexcept { return vcast<F256>(vdivss(vcast<F128>(x), vcast<F128>(y))); }
BL_INLINE F256 vdivps(const F256& x, const F256& y) noexcept { return _mm256_div_ps(x, y); }

BL_INLINE F256 vminss(const F256& x, const F256& y) noexcept { return vcast<F256>(vminss(vcast<F128>(x), vcast<F128>(y))); }
BL_INLINE F256 vminps(const F256& x, const F256& y) noexcept { return _mm256_min_ps(x, y); }

BL_INLINE F256 vmaxss(const F256& x, const F256& y) noexcept { return vcast<F256>(vmaxss(vcast<F128>(x), vcast<F128>(y))); }
BL_INLINE F256 vmaxps(const F256& x, const F256& y) noexcept { return _mm256_max_ps(x, y); }

BL_INLINE F256 vcmpeqss(const F256& x, const F256& y) noexcept { return vcast<F256>(vcmpeqss(vcast<F128>(x), vcast<F128>(y))); }
BL_INLINE F256 vcmpeqps(const F256& x, const F256& y) noexcept { return _mm256_cmp_ps(x, y, _CMP_EQ_OQ); }

BL_INLINE F256 vcmpness(const F256& x, const F256& y) noexcept { return vcast<F256>(vcmpness(vcast<F128>(x), vcast<F128>(y))); }
BL_INLINE F256 vcmpneps(const F256& x, const F256& y) noexcept { return _mm256_cmp_ps(x, y, _CMP_NEQ_OQ); }

BL_INLINE F256 vcmpgess(const F256& x, const F256& y) noexcept { return vcast<F256>(vcmpgess(vcast<F128>(x), vcast<F128>(y))); }
BL_INLINE F256 vcmpgeps(const F256& x, const F256& y) noexcept { return _mm256_cmp_ps(x, y, _CMP_GE_OQ); }

BL_INLINE F256 vcmpgtss(const F256& x, const F256& y) noexcept { return vcast<F256>(vcmpgtss(vcast<F128>(x), vcast<F128>(y))); }
BL_INLINE F256 vcmpgtps(const F256& x, const F256& y) noexcept { return _mm256_cmp_ps(x, y, _CMP_GT_OQ); }

BL_INLINE F256 vcmpless(const F256& x, const F256& y) noexcept { return vcast<F256>(vcmpless(vcast<F128>(x), vcast<F128>(y))); }
BL_INLINE F256 vcmpleps(const F256& x, const F256& y) noexcept { return _mm256_cmp_ps(x, y, _CMP_LE_OQ); }

BL_INLINE F256 vcmpltss(const F256& x, const F256& y) noexcept { return vcast<F256>(vcmpltss(vcast<F128>(x), vcast<F128>(y))); }
BL_INLINE F256 vcmpltps(const F256& x, const F256& y) noexcept { return _mm256_cmp_ps(x, y, _CMP_LT_OQ); }

BL_INLINE F256 vsqrtss(const F256& x) noexcept { return vcast<F256>(vsqrtss(vcast<F128>(x))); }
BL_INLINE F256 vsqrtps(const F256& x) noexcept { return _mm256_sqrt_ps(x); }

BL_INLINE F256 vloadf256_32(const void* p) noexcept { return vcast<F256>(vloadf128_32(p)); }
BL_INLINE F256 vloadf256_64(const void* p) noexcept { return vcast<F256>(vloadf128_64(p)); }
BL_INLINE F256 vloadf256_128a(const void* p) noexcept { return vcast<F256>(vloadf128a(p)); }
BL_INLINE F256 vloadf256_128u(const void* p) noexcept { return vcast<F256>(vloadf128u(p)); }
BL_INLINE F256 vloadf256a(const void* p) noexcept { return _mm256_load_ps(static_cast<const float*>(p)); }
BL_INLINE F256 vloadf256u(const void* p) noexcept { return _mm256_loadu_ps(static_cast<const float*>(p)); }
BL_INLINE F256 vloadf256_mask32(const void* p, const F256& mask) noexcept { return _mm256_maskload_ps(static_cast<const float*>(p), vcast<I256>(mask)); }

BL_INLINE F256 vloadf256_l64(const F256& x, const void* p) noexcept { return vcast<F256>(vloadf128_l64(vcast<F128>(x), p)); }
BL_INLINE F256 vloadf256_h64(const F256& x, const void* p) noexcept { return vcast<F256>(vloadf128_h64(vcast<F128>(x), p)); }

BL_INLINE F128 vbroadcastf128_32(const void* p) noexcept { return vcast<F128>(_mm_broadcast_ss(static_cast<const float*>(p))); }
BL_INLINE F256 vbroadcastf256_32(const void* p) noexcept { return vcast<F256>(_mm256_broadcast_ss(static_cast<const float*>(p))); }
BL_INLINE F256 vbroadcastf256_64(const void* p) noexcept { return vcast<F256>(_mm256_broadcast_sd(static_cast<const double*>(p))); }
BL_INLINE F256 vbroadcastf256_128(const void* p) noexcept { return vcast<F256>(_mm256_broadcast_ps(static_cast<const __m128*>(p))); }

BL_INLINE void vstoref32(void* p, const F256& x) noexcept { vstoref32(p, vcast<F128>(x)); }
BL_INLINE void vstoref64(void* p, const F256& x) noexcept { vstoref64(p, vcast<F128>(x)); }
BL_INLINE void vstorelf64(void* p, const F256& x) noexcept { vstorelf64(p, vcast<F128>(x)); }
BL_INLINE void vstorehf64(void* p, const F256& x) noexcept { vstorehf64(p, vcast<F128>(x)); }
BL_INLINE void vstoref128a(void* p, const F256& x) noexcept { vstoref128a(p, vcast<F128>(x)); }
BL_INLINE void vstoref128u(void* p, const F256& x) noexcept { vstoref128u(p, vcast<F128>(x)); }
BL_INLINE void vstoref256a(void* p, const F256& x) noexcept { _mm256_store_ps(static_cast<float*>(p), x); }
BL_INLINE void vstoref256u(void* p, const F256& x) noexcept { _mm256_storeu_ps(static_cast<float*>(p), x); }
BL_INLINE void vstoref256_mask32(void* p, const F256& x, const F256& mask) noexcept { _mm256_maskstore_ps(static_cast<float*>(p), vcast<I256>(mask), x); }

BL_INLINE bool vhasmaskf32(const F256& x, int bits0_7) noexcept { return _mm256_movemask_ps(vcast<F256>(x)) == bits0_7; }
BL_INLINE bool vhasmaskf64(const F256& x, int bits0_3) noexcept { return _mm256_movemask_pd(vcast<D256>(x)) == bits0_3; }
#endif

// ============================================================================
// [BLSIMD::D256]
// ============================================================================

#if defined(BL_TARGET_OPT_AVX)
BL_INLINE D256 vzerod256() noexcept { return _mm256_setzero_pd(); }
BL_INLINE D256 vsetd256(double x) noexcept { return _mm256_set1_pd(x); }
BL_INLINE D256 vsetd256(double x1, double x0) noexcept { return _mm256_set_pd(x1, x0, x1, x0); }
BL_INLINE D256 vsetd256(double x3, double x2, double x1, double x0) noexcept { return _mm256_set_pd(x3, x2, x1, x0); }

BL_INLINE D256 vcvtd64d256(double x) noexcept { return vcast<D256>(vcvtd64d128(x)); }
BL_INLINE double vcvtd256d64(const D256& x) noexcept { return vcvtd128d64(vcast<D128>(x)); }

BL_INLINE D256 vcvti32d256(int32_t x) noexcept { return vcast<D256>(vcvti32d128(x)); }
BL_INLINE int32_t vcvtd256i32(const D256& x) noexcept { return vcvtd128i32(vcast<D128>(x)); }
BL_INLINE int32_t vcvttd256i32(const D256& x) noexcept { return vcvttd128i32(vcast<D128>(x)); }

#if BL_TARGET_ARCH_BITS >= 64
BL_INLINE D256 vcvti64d256(int64_t x) noexcept { return vcast<D256>(vcvti64d128(x)); }
BL_INLINE int64_t vcvtd256i64(const D256& x) noexcept { return vcvtd128i64(vcast<D128>(x)); }
BL_INLINE int64_t vcvttd256i64(const D256& x) noexcept { return vcvttd128i64(vcast<D128>(x)); }
#endif

BL_INLINE I128 vcvtd256i128(const D256& x) noexcept { return vcast<I128>(_mm256_cvtpd_epi32(x)); }
BL_INLINE I256 vcvtd256i256(const D256& x) noexcept { return vcast<I256>(_mm256_cvtpd_epi32(x)); }

BL_INLINE I128 vcvttd256i128(const D256& x) noexcept { return vcast<I128>(_mm256_cvttpd_epi32(x)); }
BL_INLINE I256 vcvttd256i256(const D256& x) noexcept { return vcast<I256>(_mm256_cvttpd_epi32(x)); }

BL_INLINE F128 vcvtd256f128(const D256& x) noexcept { return vcast<F128>(_mm256_cvtpd_ps(x)); }
BL_INLINE F256 vcvtd256f256(const D256& x) noexcept { return vcast<F256>(_mm256_cvtpd_ps(x)); }

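// NOTE: The immediate built by vshufd64<A, B>() replicates one selection per
// 128-bit half: bit B picks the low result double from `x` (0 = low element,
// 1 = high element) and bit A picks the high result double from `y`; for the
// single-operand vswizd64<A, B>() this yields { x[A], x[B] } in each 128-bit
// lane.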
template<int A, int B>
BL_INLINE D256 vshufd64(const D256& x, const D256& y) noexcept { return _mm256_shuffle_pd(x, y, (A << 3) | (B << 2) | (A << 1) | B); }
template<int A, int B>
BL_INLINE D256 vswizd64(const D256& x) noexcept { return vshufd64<A, B>(x, x); }

template<int A, int B>
BL_INLINE D256 vpermd128(const D256& x, const D256& y) noexcept { return _mm256_permute2f128_pd(x, y, ((A & 0xF) << 4) + (B & 0xF)); }
template<int A, int B>
BL_INLINE D256 vpermd128(const D256& x) noexcept { return vpermd128<A, B>(x, x); }

BL_INLINE D256 vswapd64(const D256& x) noexcept { return vswizd64<0, 1>(x); }
BL_INLINE D256 vdupld64(const D256& x) noexcept { return vswizd64<0, 0>(x); }
BL_INLINE D256 vduphd64(const D256& x) noexcept { return vswizd64<1, 1>(x); }

BL_INLINE D256 vswapd128(const D256& x) noexcept { return vpermd128<0, 1>(x); }
BL_INLINE D256 vdupld128(const D128& x) noexcept { return vpermd128<0, 0>(vcast<D256>(x)); }
BL_INLINE D256 vdupld128(const D256& x) noexcept { return vpermd128<0, 0>(x); }
BL_INLINE D256 vduphd128(const D256& x) noexcept { return vpermd128<1, 1>(x); }

BL_INLINE D256 vunpackld64(const D256& x, const D256& y) noexcept { return _mm256_unpacklo_pd(x, y); }
BL_INLINE D256 vunpackhd64(const D256& x, const D256& y) noexcept { return _mm256_unpackhi_pd(x, y); }

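// NOTE: As with vsplatf32f256(), only AVX2 offers VBROADCASTSD with a register
// source; the plain AVX path duplicates the low double within the 128-bit
// lane and then mirrors that lane into the upper half.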
#if defined(BL_TARGET_OPT_AVX2)
BL_INLINE D256 vsplatd64d256(const D128& x) noexcept { return _mm256_broadcastsd_pd(vcast<D128>(x)); }
BL_INLINE D256 vsplatd64d256(const D256& x) noexcept { return _mm256_broadcastsd_pd(vcast<D128>(x)); }
#else
BL_INLINE D256 vsplatd64d256(const D128& x) noexcept { return vdupld128(vswizd64<0, 0>(vcast<D128>(x))); }
BL_INLINE D256 vsplatd64d256(const D256& x) noexcept { return vdupld128(vswizd64<0, 0>(vcast<D128>(x))); }
#endif

BL_INLINE D256 vor(const D256& x, const D256& y) noexcept { return _mm256_or_pd(x, y); }
BL_INLINE D256 vxor(const D256& x, const D256& y) noexcept { return _mm256_xor_pd(x, y); }
BL_INLINE D256 vand(const D256& x, const D256& y) noexcept { return _mm256_and_pd(x, y); }
BL_INLINE D256 vandnot_a(const D256& x, const D256& y) noexcept { return _mm256_andnot_pd(x, y); }
BL_INLINE D256 vandnot_b(const D256& x, const D256& y) noexcept { return _mm256_andnot_pd(y, x); }
BL_INLINE D256 vblendmask(const D256& x, const D256& y, const D256& mask) noexcept { return vor(vandnot_a(mask, x), vand(y, mask)); }

BL_INLINE D256 vaddsd(const D256& x, const D256& y) noexcept { return vcast<D256>(vaddsd(vcast<D128>(x), vcast<D128>(y))); }
BL_INLINE D256 vaddpd(const D256& x, const D256& y) noexcept { return _mm256_add_pd(x, y); }

BL_INLINE D256 vsubsd(const D256& x, const D256& y) noexcept { return vcast<D256>(vsubsd(vcast<D128>(x), vcast<D128>(y))); }
BL_INLINE D256 vsubpd(const D256& x, const D256& y) noexcept { return _mm256_sub_pd(x, y); }

BL_INLINE D256 vmulsd(const D256& x, const D256& y) noexcept { return vcast<D256>(vmulsd(vcast<D128>(x), vcast<D128>(y))); }
BL_INLINE D256 vmulpd(const D256& x, const D256& y) noexcept { return _mm256_mul_pd(x, y); }

BL_INLINE D256 vdivsd(const D256& x, const D256& y) noexcept { return vcast<D256>(vdivsd(vcast<D128>(x), vcast<D128>(y))); }
BL_INLINE D256 vdivpd(const D256& x, const D256& y) noexcept { return _mm256_div_pd(x, y); }

BL_INLINE D256 vminsd(const D256& x, const D256& y) noexcept { return vcast<D256>(vminsd(vcast<D128>(x), vcast<D128>(y))); }
BL_INLINE D256 vminpd(const D256& x, const D256& y) noexcept { return _mm256_min_pd(x, y); }

BL_INLINE D256 vmaxsd(const D256& x, const D256& y) noexcept { return vcast<D256>(vmaxsd(vcast<D128>(x), vcast<D128>(y))); }
BL_INLINE D256 vmaxpd(const D256& x, const D256& y) noexcept { return _mm256_max_pd(x, y); }

BL_INLINE D256 vcmpeqsd(const D256& x, const D256& y) noexcept { return vcast<D256>(vcmpeqsd(vcast<D128>(x), vcast<D128>(y))); }
BL_INLINE D256 vcmpeqpd(const D256& x, const D256& y) noexcept { return _mm256_cmp_pd(x, y, _CMP_EQ_OQ); }

BL_INLINE D256 vcmpnesd(const D256& x, const D256& y) noexcept { return vcast<D256>(vcmpnesd(vcast<D128>(x), vcast<D128>(y))); }
BL_INLINE D256 vcmpnepd(const D256& x, const D256& y) noexcept { return _mm256_cmp_pd(x, y, _CMP_NEQ_OQ); }

BL_INLINE D256 vcmpgesd(const D256& x, const D256& y) noexcept { return vcast<D256>(vcmpgesd(vcast<D128>(x), vcast<D128>(y))); }
BL_INLINE D256 vcmpgepd(const D256& x, const D256& y) noexcept { return _mm256_cmp_pd(x, y, _CMP_GE_OQ); }

BL_INLINE D256 vcmpgtsd(const D256& x, const D256& y) noexcept { return vcast<D256>(vcmpgtsd(vcast<D128>(x), vcast<D128>(y))); }
BL_INLINE D256 vcmpgtpd(const D256& x, const D256& y) noexcept { return _mm256_cmp_pd(x, y, _CMP_GT_OQ); }

BL_INLINE D256 vcmplesd(const D256& x, const D256& y) noexcept { return vcast<D256>(vcmplesd(vcast<D128>(x), vcast<D128>(y))); }
BL_INLINE D256 vcmplepd(const D256& x, const D256& y) noexcept { return _mm256_cmp_pd(x, y, _CMP_LE_OQ); }

BL_INLINE D256 vcmpltsd(const D256& x, const D256& y) noexcept { return vcast<D256>(vcmpltsd(vcast<D128>(x), vcast<D128>(y))); }
BL_INLINE D256 vcmpltpd(const D256& x, const D256& y) noexcept { return _mm256_cmp_pd(x, y, _CMP_LT_OQ); }

BL_INLINE D256 vsqrtsd(const D256& x) noexcept { return vcast<D256>(vsqrtsd(vcast<D128>(x))); }
BL_INLINE D256 vsqrtpd(const D256& x) noexcept { return _mm256_sqrt_pd(x); }

BL_INLINE D256 vloadd256_64(const void* p) noexcept { return vcast<D256>(vloadd128_64(p)); }
BL_INLINE D256 vloadd256_128a(const void* p) noexcept { return vcast<D256>(vloadd128a(p)); }
BL_INLINE D256 vloadd256_128u(const void* p) noexcept { return vcast<D256>(vloadd128u(p)); }
BL_INLINE D256 vloadd256a(const void* p) noexcept { return _mm256_load_pd(static_cast<const double*>(p)); }
BL_INLINE D256 vloadd256u(const void* p) noexcept { return _mm256_loadu_pd(static_cast<const double*>(p)); }
BL_INLINE D256 vloadd256_mask64(const void* p, const D256& mask) noexcept { return _mm256_maskload_pd(static_cast<const double*>(p), vcast<I256>(mask)); }

BL_INLINE D256 vloadd256_l64(const D256& x, const void* p) noexcept { return vcast<D256>(vloadd128_l64(vcast<D128>(x), p)); }
BL_INLINE D256 vloadd256_h64(const D256& x, const void* p) noexcept { return vcast<D256>(vloadd128_h64(vcast<D128>(x), p)); }

BL_INLINE D256 vbroadcastd256_64(const void* p) noexcept { return _mm256_broadcast_sd(static_cast<const double*>(p)); }
BL_INLINE D256 vbroadcastd256_128(const void* p) noexcept { return _mm256_broadcast_pd(static_cast<const __m128d*>(p)); }

BL_INLINE void vstored64(void* p, const D256& x) noexcept { vstored64(p, vcast<D128>(x)); }
BL_INLINE void vstoreld64(void* p, const D256& x) noexcept { vstoreld64(p, vcast<D128>(x)); }
BL_INLINE void vstorehd64(void* p, const D256& x) noexcept { vstorehd64(p, vcast<D128>(x)); }
BL_INLINE void vstored128a(void* p, const D256& x) noexcept { vstored128a(p, vcast<D128>(x)); }
BL_INLINE void vstored128u(void* p, const D256& x) noexcept { vstored128u(p, vcast<D128>(x)); }
BL_INLINE void vstored256a(void* p, const D256& x) noexcept { _mm256_store_pd(static_cast<double*>(p), x); }
BL_INLINE void vstored256u(void* p, const D256& x) noexcept { _mm256_storeu_pd(static_cast<double*>(p), x); }
BL_INLINE void vstored256_mask64(void* p, const D256& x, const D256& mask) noexcept { _mm256_maskstore_pd(static_cast<double*>(p), vcast<I256>(mask), x); }

BL_INLINE bool vhasmaskd64(const D256& x, int bits0_3) noexcept { return _mm256_movemask_pd(vcast<D256>(x)) == bits0_3; }
#endif

#endif

} // {anonymous}
} // {SIMD}

//! \}
//! \endcond

#endif // BLEND2D_SIMD_X86_P_H