// Simd x86 specific implementations -*- C++ -*-

// Copyright (C) 2020-2021 Free Software Foundation, Inc.
//
// This file is part of the GNU ISO C++ Library.  This library is free
// software; you can redistribute it and/or modify it under the
// terms of the GNU General Public License as published by the
// Free Software Foundation; either version 3, or (at your option)
// any later version.

// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.

// Under Section 7 of GPL version 3, you are granted additional
// permissions described in the GCC Runtime Library Exception, version
// 3.1, as published by the Free Software Foundation.

// You should have received a copy of the GNU General Public License and
// a copy of the GCC Runtime Library Exception along with this program;
// see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
// <http://www.gnu.org/licenses/>.

#ifndef _GLIBCXX_EXPERIMENTAL_SIMD_X86_H_
#define _GLIBCXX_EXPERIMENTAL_SIMD_X86_H_

#if __cplusplus >= 201703L

#if !_GLIBCXX_SIMD_X86INTRIN
#error \
  "simd_x86.h may only be included when MMX or SSE on x86(_64) are available"
#endif

_GLIBCXX_SIMD_BEGIN_NAMESPACE

// __to_masktype {{{
// Given <T, N> return <__int_for_sizeof_t<T>, N>. For _SimdWrapper and
// __vector_type_t.
template <typename _Tp, size_t _Np>
  _GLIBCXX_SIMD_INTRINSIC constexpr _SimdWrapper<__int_for_sizeof_t<_Tp>, _Np>
  __to_masktype(_SimdWrapper<_Tp, _Np> __x)
  {
    return reinterpret_cast<__vector_type_t<__int_for_sizeof_t<_Tp>, _Np>>(
      __x._M_data);
  }

template <typename _TV,
          typename _TVT
          = enable_if_t<__is_vector_type_v<_TV>, _VectorTraits<_TV>>,
          typename _Up = __int_for_sizeof_t<typename _TVT::value_type>>
  _GLIBCXX_SIMD_INTRINSIC constexpr __vector_type_t<_Up, _TVT::_S_full_size>
  __to_masktype(_TV __x)
  { return reinterpret_cast<__vector_type_t<_Up, _TVT::_S_full_size>>(__x); }

// }}}
// __interleave128_lo {{{
template <typename _Ap, typename _Bp, typename _Tp = common_type_t<_Ap, _Bp>,
          typename _Trait = _VectorTraits<_Tp>>
  _GLIBCXX_SIMD_INTRINSIC constexpr _Tp
  __interleave128_lo(const _Ap& __av, const _Bp& __bv)
  {
    const _Tp __a(__av);
    const _Tp __b(__bv);
    if constexpr (sizeof(_Tp) == 16 && _Trait::_S_full_size == 2)
      return _Tp{__a[0], __b[0]};
    else if constexpr (sizeof(_Tp) == 16 && _Trait::_S_full_size == 4)
      return _Tp{__a[0], __b[0], __a[1], __b[1]};
    else if constexpr (sizeof(_Tp) == 16 && _Trait::_S_full_size == 8)
      return _Tp{__a[0], __b[0], __a[1], __b[1],
                 __a[2], __b[2], __a[3], __b[3]};
    else if constexpr (sizeof(_Tp) == 16 && _Trait::_S_full_size == 16)
      return _Tp{__a[0], __b[0], __a[1], __b[1], __a[2], __b[2],
                 __a[3], __b[3], __a[4], __b[4], __a[5], __b[5],
                 __a[6], __b[6], __a[7], __b[7]};
    else if constexpr (sizeof(_Tp) == 32 && _Trait::_S_full_size == 4)
      return _Tp{__a[0], __b[0], __a[2], __b[2]};
    else if constexpr (sizeof(_Tp) == 32 && _Trait::_S_full_size == 8)
      return _Tp{__a[0], __b[0], __a[1], __b[1],
                 __a[4], __b[4], __a[5], __b[5]};
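    // Note (descriptive, from the index patterns below): for vectors wider
    // than 16 bytes the interleave stays within each 128-bit lane, e.g. the
    // 32-byte, 16-element case takes elements 0..3 from the low lane and
    // 8..11 from the high lane of both inputs.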
    else if constexpr (sizeof(_Tp) == 32 && _Trait::_S_full_size == 16)
      return _Tp{__a[0], __b[0], __a[1], __b[1], __a[2], __b[2],
                 __a[3], __b[3], __a[8], __b[8], __a[9], __b[9],
                 __a[10], __b[10], __a[11], __b[11]};
    else if constexpr (sizeof(_Tp) == 32 && _Trait::_S_full_size == 32)
      return _Tp{__a[0], __b[0], __a[1], __b[1], __a[2], __b[2], __a[3],
                 __b[3], __a[4], __b[4], __a[5], __b[5], __a[6], __b[6],
                 __a[7], __b[7], __a[16], __b[16], __a[17], __b[17], __a[18],
                 __b[18], __a[19], __b[19], __a[20], __b[20], __a[21], __b[21],
                 __a[22], __b[22], __a[23], __b[23]};
    else if constexpr (sizeof(_Tp) == 64 && _Trait::_S_full_size == 8)
      return _Tp{__a[0], __b[0], __a[2], __b[2],
                 __a[4], __b[4], __a[6], __b[6]};
    else if constexpr (sizeof(_Tp) == 64 && _Trait::_S_full_size == 16)
      return _Tp{__a[0], __b[0], __a[1], __b[1], __a[4], __b[4],
                 __a[5], __b[5], __a[8], __b[8], __a[9], __b[9],
                 __a[12], __b[12], __a[13], __b[13]};
    else if constexpr (sizeof(_Tp) == 64 && _Trait::_S_full_size == 32)
      return _Tp{__a[0], __b[0], __a[1], __b[1], __a[2], __b[2], __a[3],
                 __b[3], __a[8], __b[8], __a[9], __b[9], __a[10], __b[10],
                 __a[11], __b[11], __a[16], __b[16], __a[17], __b[17], __a[18],
                 __b[18], __a[19], __b[19], __a[24], __b[24], __a[25], __b[25],
                 __a[26], __b[26], __a[27], __b[27]};
    else if constexpr (sizeof(_Tp) == 64 && _Trait::_S_full_size == 64)
      return _Tp{__a[0],  __b[0],  __a[1],  __b[1],  __a[2],  __b[2],  __a[3],
                 __b[3],  __a[4],  __b[4],  __a[5],  __b[5],  __a[6],  __b[6],
                 __a[7],  __b[7],  __a[16], __b[16], __a[17], __b[17], __a[18],
                 __b[18], __a[19], __b[19], __a[20], __b[20], __a[21], __b[21],
                 __a[22], __b[22], __a[23], __b[23], __a[32], __b[32], __a[33],
                 __b[33], __a[34], __b[34], __a[35], __b[35], __a[36], __b[36],
                 __a[37], __b[37], __a[38], __b[38], __a[39], __b[39], __a[48],
                 __b[48], __a[49], __b[49], __a[50], __b[50], __a[51], __b[51],
                 __a[52], __b[52], __a[53], __b[53], __a[54], __b[54], __a[55],
                 __b[55]};
    else
      __assert_unreachable<_Tp>();
  }

// }}}
// __is_zero{{{
template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
  _GLIBCXX_SIMD_INTRINSIC constexpr bool
  __is_zero(_Tp __a)
  {
    if (!__builtin_is_constant_evaluated())
      {
        if constexpr (__have_avx)
          {
            if constexpr (_TVT::template _S_is<float, 8>)
              return _mm256_testz_ps(__a, __a);
            else if constexpr (_TVT::template _S_is<double, 4>)
              return _mm256_testz_pd(__a, __a);
            else if constexpr (sizeof(_Tp) == 32)
              return _mm256_testz_si256(__to_intrin(__a), __to_intrin(__a));
            else if constexpr (_TVT::template _S_is<float>)
              return _mm_testz_ps(__to_intrin(__a), __to_intrin(__a));
            else if constexpr (_TVT::template _S_is<double, 2>)
              return _mm_testz_pd(__a, __a);
            else
              return _mm_testz_si128(__to_intrin(__a), __to_intrin(__a));
          }
        else if constexpr (__have_sse4_1)
          return _mm_testz_si128(__intrin_bitcast<__m128i>(__a),
                                 __intrin_bitcast<__m128i>(__a));
      }
    else if constexpr (sizeof(_Tp) <= 8)
      return reinterpret_cast<__int_for_sizeof_t<_Tp>>(__a) == 0;
    else
      {
        const auto __b = __vector_bitcast<_LLong>(__a);
        if constexpr (sizeof(__b) == 16)
          return (__b[0] | __b[1]) == 0;
        else if constexpr (sizeof(__b) == 32)
          return __is_zero(__lo128(__b) | __hi128(__b));
        else if constexpr (sizeof(__b) == 64)
          return __is_zero(__lo256(__b) | __hi256(__b));
        else
          __assert_unreachable<_Tp>();
      }
  }

// }}}
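// Illustration for __movemask below: the MOVMSK instructions collect the most
// significant bit of every element into an int, with bit 0 corresponding to
// element 0.  E.g. for a 4 x float compare result where elements 0 and 2 are
// true (all bits set):
//
//   __vector_type_t<float, 4> __k = ...; // per-element all-ones / all-zeros
//   const int __bits = __movemask(__k);  // here: 0b0101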
// __movemask{{{
template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
  _GLIBCXX_SIMD_INTRINSIC _GLIBCXX_CONST int
  __movemask(_Tp __a)
  {
    if constexpr (sizeof(_Tp) == 32)
      {
        if constexpr (_TVT::template _S_is<float>)
          return _mm256_movemask_ps(__to_intrin(__a));
        else if constexpr (_TVT::template _S_is<double>)
          return _mm256_movemask_pd(__to_intrin(__a));
        else
          return _mm256_movemask_epi8(__to_intrin(__a));
      }
    else if constexpr (_TVT::template _S_is<float>)
      return _mm_movemask_ps(__to_intrin(__a));
    else if constexpr (_TVT::template _S_is<double>)
      return _mm_movemask_pd(__to_intrin(__a));
    else
      return _mm_movemask_epi8(__to_intrin(__a));
  }

// }}}
// __testz{{{
template <typename _TI, typename _TVT = _VectorTraits<_TI>>
  _GLIBCXX_SIMD_INTRINSIC _GLIBCXX_CONST constexpr int
  __testz(_TI __a, _TI __b)
  {
    static_assert(is_same_v<_TI, __intrinsic_type_t<typename _TVT::value_type,
                                                    _TVT::_S_full_size>>);
    if (!__builtin_is_constant_evaluated())
      {
        if constexpr (sizeof(_TI) == 32)
          {
            if constexpr (_TVT::template _S_is<float>)
              return _mm256_testz_ps(__to_intrin(__a), __to_intrin(__b));
            else if constexpr (_TVT::template _S_is<double>)
              return _mm256_testz_pd(__to_intrin(__a), __to_intrin(__b));
            else
              return _mm256_testz_si256(__to_intrin(__a), __to_intrin(__b));
          }
        else if constexpr (_TVT::template _S_is<float> && __have_avx)
          return _mm_testz_ps(__to_intrin(__a), __to_intrin(__b));
        else if constexpr (_TVT::template _S_is<double> && __have_avx)
          return _mm_testz_pd(__to_intrin(__a), __to_intrin(__b));
        else if constexpr (__have_sse4_1)
          return _mm_testz_si128(__intrin_bitcast<__m128i>(__to_intrin(__a)),
                                 __intrin_bitcast<__m128i>(__to_intrin(__b)));
        else
          return __movemask(0 == __and(__a, __b)) != 0;
      }
    else
      return __is_zero(__and(__a, __b));
  }

// }}}
// __testc{{{
// requires SSE4.1 or above
template <typename _TI, typename _TVT = _VectorTraits<_TI>>
  _GLIBCXX_SIMD_INTRINSIC _GLIBCXX_CONST constexpr int
  __testc(_TI __a, _TI __b)
  {
    static_assert(is_same_v<_TI, __intrinsic_type_t<typename _TVT::value_type,
                                                    _TVT::_S_full_size>>);
    if (__builtin_is_constant_evaluated())
      return __is_zero(__andnot(__a, __b));

    if constexpr (sizeof(_TI) == 32)
      {
        if constexpr (_TVT::template _S_is<float>)
          return _mm256_testc_ps(__a, __b);
        else if constexpr (_TVT::template _S_is<double>)
          return _mm256_testc_pd(__a, __b);
        else
          return _mm256_testc_si256(__to_intrin(__a), __to_intrin(__b));
      }
    else if constexpr (_TVT::template _S_is<float> && __have_avx)
      return _mm_testc_ps(__to_intrin(__a), __to_intrin(__b));
    else if constexpr (_TVT::template _S_is<double> && __have_avx)
      return _mm_testc_pd(__to_intrin(__a), __to_intrin(__b));
    else
      {
        static_assert(is_same_v<_TI, _TI> && __have_sse4_1);
        return _mm_testc_si128(__intrin_bitcast<__m128i>(__to_intrin(__a)),
                               __intrin_bitcast<__m128i>(__to_intrin(__b)));
      }
  }

// }}}
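// The three PTEST-style helpers follow the hardware semantics (cf. the
// constant-evaluated fallbacks): __testz(__a, __b) reports whether
// (__a & __b) is all zeros, __testc(__a, __b) whether (~__a & __b) is all
// zeros, and __testnzc(__a, __b) whether neither of the two is all zeros.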
// __testnzc{{{
template <typename _TI, typename _TVT = _VectorTraits<_TI>>
  _GLIBCXX_SIMD_INTRINSIC _GLIBCXX_CONST constexpr int
  __testnzc(_TI __a, _TI __b)
  {
    static_assert(is_same_v<_TI, __intrinsic_type_t<typename _TVT::value_type,
                                                    _TVT::_S_full_size>>);
    if (!__builtin_is_constant_evaluated())
      {
        if constexpr (sizeof(_TI) == 32)
          {
            if constexpr (_TVT::template _S_is<float>)
              return _mm256_testnzc_ps(__a, __b);
            else if constexpr (_TVT::template _S_is<double>)
              return _mm256_testnzc_pd(__a, __b);
            else
              return _mm256_testnzc_si256(__to_intrin(__a), __to_intrin(__b));
          }
        else if constexpr (_TVT::template _S_is<float> && __have_avx)
          return _mm_testnzc_ps(__to_intrin(__a), __to_intrin(__b));
        else if constexpr (_TVT::template _S_is<double> && __have_avx)
          return _mm_testnzc_pd(__to_intrin(__a), __to_intrin(__b));
        else if constexpr (__have_sse4_1)
          return _mm_testnzc_si128(__intrin_bitcast<__m128i>(__to_intrin(__a)),
                                   __intrin_bitcast<__m128i>(__to_intrin(__b)));
        else
          return __movemask(0 == __and(__a, __b)) == 0
                 && __movemask(0 == __andnot(__a, __b)) == 0;
      }
    else
      return !(__is_zero(__and(__a, __b)) || __is_zero(__andnot(__a, __b)));
  }

// }}}
// __xzyw{{{
// shuffles the complete vector, swapping the inner two quarters. Often useful
// for AVX for fixing up a shuffle result.
template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
  _GLIBCXX_SIMD_INTRINSIC _Tp
  __xzyw(_Tp __a)
  {
    if constexpr (sizeof(_Tp) == 16)
      {
        const auto __x = __vector_bitcast<conditional_t<
          is_floating_point_v<typename _TVT::value_type>, float, int>>(__a);
        return reinterpret_cast<_Tp>(
          decltype(__x){__x[0], __x[2], __x[1], __x[3]});
      }
    else if constexpr (sizeof(_Tp) == 32)
      {
        const auto __x = __vector_bitcast<conditional_t<
          is_floating_point_v<typename _TVT::value_type>, double, _LLong>>(__a);
        return reinterpret_cast<_Tp>(
          decltype(__x){__x[0], __x[2], __x[1], __x[3]});
      }
    else if constexpr (sizeof(_Tp) == 64)
      {
        const auto __x = __vector_bitcast<conditional_t<
          is_floating_point_v<typename _TVT::value_type>, double, _LLong>>(__a);
        return reinterpret_cast<_Tp>(decltype(__x){__x[0], __x[1], __x[4],
                                                   __x[5], __x[2], __x[3],
                                                   __x[6], __x[7]});
      }
    else
      __assert_unreachable<_Tp>();
  }

// }}}
// __maskload_epi32{{{
template <typename _Tp>
  _GLIBCXX_SIMD_INTRINSIC auto
  __maskload_epi32(const int* __ptr, _Tp __k)
  {
    if constexpr (sizeof(__k) == 16)
      return _mm_maskload_epi32(__ptr, __k);
    else
      return _mm256_maskload_epi32(__ptr, __k);
  }

// }}}
// __maskload_epi64{{{
template <typename _Tp>
  _GLIBCXX_SIMD_INTRINSIC auto
  __maskload_epi64(const _LLong* __ptr, _Tp __k)
  {
    if constexpr (sizeof(__k) == 16)
      return _mm_maskload_epi64(__ptr, __k);
    else
      return _mm256_maskload_epi64(__ptr, __k);
  }

// }}}
// __maskload_ps{{{
template <typename _Tp>
  _GLIBCXX_SIMD_INTRINSIC auto
  __maskload_ps(const float* __ptr, _Tp __k)
  {
    if constexpr (sizeof(__k) == 16)
      return _mm_maskload_ps(__ptr, __k);
    else
      return _mm256_maskload_ps(__ptr, __k);
  }

// }}}
// __maskload_pd{{{
template <typename _Tp>
  _GLIBCXX_SIMD_INTRINSIC auto
  __maskload_pd(const double* __ptr, _Tp __k)
  {
    if constexpr (sizeof(__k) == 16)
      return _mm_maskload_pd(__ptr, __k);
    else
      return _mm256_maskload_pd(__ptr, __k);
  }

// }}}
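// Usage sketch for the __maskload_* wrappers above (illustrative only,
// assuming an AVX2 target; not code used by the implementation): load only
// the first three ints of an array, zeroing the remaining lane.
//
//   const int __data[4] = {1, 2, 3, 4};
//   const __vector_type_t<int, 4> __k = {-1, -1, -1, 0}; // MSB set => load
//   const auto __v = __maskload_epi32(__data, __to_intrin(__k));
//   // __v holds {1, 2, 3, 0}; masked-off lanes read as zero and the masked
//   // memory is never touched.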
#ifdef _GLIBCXX_SIMD_WORKAROUND_PR85048
#include "simd_x86_conversions.h"
#endif

// ISA & type detection {{{
template <typename _Tp, size_t _Np>
  constexpr bool
  __is_sse_ps()
  {
    return __have_sse && is_same_v<_Tp, float>
           && sizeof(__intrinsic_type_t<_Tp, _Np>) == 16;
  }

template <typename _Tp, size_t _Np>
  constexpr bool
  __is_sse_pd()
  {
    return __have_sse2 && is_same_v<_Tp, double>
           && sizeof(__intrinsic_type_t<_Tp, _Np>) == 16;
  }

template <typename _Tp, size_t _Np>
  constexpr bool
  __is_avx_ps()
  {
    return __have_avx && is_same_v<_Tp, float>
           && sizeof(__intrinsic_type_t<_Tp, _Np>) == 32;
  }

template <typename _Tp, size_t _Np>
  constexpr bool
  __is_avx_pd()
  {
    return __have_avx && is_same_v<_Tp, double>
           && sizeof(__intrinsic_type_t<_Tp, _Np>) == 32;
  }

template <typename _Tp, size_t _Np>
  constexpr bool
  __is_avx512_ps()
  {
    return __have_avx512f && is_same_v<_Tp, float>
           && sizeof(__intrinsic_type_t<_Tp, _Np>) == 64;
  }

template <typename _Tp, size_t _Np>
  constexpr bool
  __is_avx512_pd()
  {
    return __have_avx512f && is_same_v<_Tp, double>
           && sizeof(__intrinsic_type_t<_Tp, _Np>) == 64;
  }

// }}}
struct _MaskImplX86Mixin;

// _CommonImplX86 {{{
struct _CommonImplX86 : _CommonImplBuiltin
{
#ifdef _GLIBCXX_SIMD_WORKAROUND_PR85048
  // _S_converts_via_decomposition {{{
  template <typename _From, typename _To, size_t _ToSize>
    static constexpr bool
    _S_converts_via_decomposition()
    {
      if constexpr (is_integral_v<_From> && is_integral_v<_To>
                    && sizeof(_From) == 8 && _ToSize == 16)
        return (sizeof(_To) == 2 && !__have_ssse3)
               || (sizeof(_To) == 1 && !__have_avx512f);
      else if constexpr (is_floating_point_v<_From> && is_integral_v<_To>)
        return ((sizeof(_From) == 4 || sizeof(_From) == 8) && sizeof(_To) == 8
                && !__have_avx512dq)
               || (sizeof(_From) == 8 && sizeof(_To) == 4 && !__have_sse4_1
                   && _ToSize == 16);
      else if constexpr (is_integral_v<_From> && is_floating_point_v<_To>
                         && sizeof(_From) == 8 && !__have_avx512dq)
        return (sizeof(_To) == 4 && _ToSize == 16)
               || (sizeof(_To) == 8 && _ToSize < 64);
      else
        return false;
    }

  template <typename _From, typename _To, size_t _ToSize>
    static inline constexpr bool __converts_via_decomposition_v
      = _S_converts_via_decomposition<_From, _To, _ToSize>();

  // }}}
#endif
  // _S_store {{{
  using _CommonImplBuiltin::_S_store;

  template <typename _Tp, size_t _Np>
    _GLIBCXX_SIMD_INTRINSIC static void
    _S_store(_SimdWrapper<_Tp, _Np> __x, void* __addr)
    {
      constexpr size_t _Bytes = _Np * sizeof(_Tp);

      if constexpr ((_Bytes & (_Bytes - 1)) != 0 && __have_avx512bw_vl)
        {
          const auto __v = __to_intrin(__x);

          if constexpr (_Bytes & 1)
            {
              if constexpr (_Bytes < 16)
                _mm_mask_storeu_epi8(__addr, 0xffffu >> (16 - _Bytes),
                                     __intrin_bitcast<__m128i>(__v));
              else if constexpr (_Bytes < 32)
                _mm256_mask_storeu_epi8(__addr, 0xffffffffu >> (32 - _Bytes),
                                        __intrin_bitcast<__m256i>(__v));
              else
                _mm512_mask_storeu_epi8(__addr,
                                        0xffffffffffffffffull >> (64 - _Bytes),
                                        __intrin_bitcast<__m512i>(__v));
            }
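          // For the byte branch above, _Bytes == 5 yields the store mask
          // 0xffffu >> 11 == 0x1f, i.e. exactly the five low bytes are
          // written.  The branches below apply the same pattern at 2-, 4- and
          // 8-byte granularity for even sizes.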
          else if constexpr (_Bytes & 2)
            {
              if constexpr (_Bytes < 16)
                _mm_mask_storeu_epi16(__addr, 0xffu >> (8 - _Bytes / 2),
                                      __intrin_bitcast<__m128i>(__v));
              else if constexpr (_Bytes < 32)
                _mm256_mask_storeu_epi16(__addr, 0xffffu >> (16 - _Bytes / 2),
                                         __intrin_bitcast<__m256i>(__v));
              else
                _mm512_mask_storeu_epi16(__addr,
                                         0xffffffffull >> (32 - _Bytes / 2),
                                         __intrin_bitcast<__m512i>(__v));
            }
          else if constexpr (_Bytes & 4)
            {
              if constexpr (_Bytes < 16)
                _mm_mask_storeu_epi32(__addr, 0xfu >> (4 - _Bytes / 4),
                                      __intrin_bitcast<__m128i>(__v));
              else if constexpr (_Bytes < 32)
                _mm256_mask_storeu_epi32(__addr, 0xffu >> (8 - _Bytes / 4),
                                         __intrin_bitcast<__m256i>(__v));
              else
                _mm512_mask_storeu_epi32(__addr, 0xffffull >> (16 - _Bytes / 4),
                                         __intrin_bitcast<__m512i>(__v));
            }
          else
            {
              static_assert(
                _Bytes > 16,
                "_Bytes < 16 && (_Bytes & 7) == 0 && (_Bytes & (_Bytes "
                "- 1)) != 0 is impossible");
              if constexpr (_Bytes < 32)
                _mm256_mask_storeu_epi64(__addr, 0xfu >> (4 - _Bytes / 8),
                                         __intrin_bitcast<__m256i>(__v));
              else
                _mm512_mask_storeu_epi64(__addr, 0xffull >> (8 - _Bytes / 8),
                                         __intrin_bitcast<__m512i>(__v));
            }
        }
      else
        _CommonImplBuiltin::_S_store(__x, __addr);
    }

  // }}}
  // _S_store_bool_array(_BitMask) {{{
  template <size_t _Np, bool _Sanitized>
    _GLIBCXX_SIMD_INTRINSIC static constexpr void
    _S_store_bool_array(const _BitMask<_Np, _Sanitized> __x, bool* __mem)
    {
      if constexpr (__have_avx512bw_vl) // don't care for BW w/o VL
        _S_store<_Np>(1 & __vector_bitcast<_UChar, _Np>([=]() constexpr {
                        if constexpr (_Np <= 16)
                          return _mm_movm_epi8(__x._M_to_bits());
                        else if constexpr (_Np <= 32)
                          return _mm256_movm_epi8(__x._M_to_bits());
                        else if constexpr (_Np <= 64)
                          return _mm512_movm_epi8(__x._M_to_bits());
                        else
                          __assert_unreachable<_SizeConstant<_Np>>();
                      }()),
                      __mem);
      else if constexpr (__have_bmi2)
        {
          if constexpr (_Np <= 4)
            _S_store<_Np>(_pdep_u32(__x._M_to_bits(), 0x01010101U), __mem);
          else
            __execute_n_times<__div_roundup(_Np, sizeof(size_t))>(
              [&](auto __i) {
                constexpr size_t __offset = __i * sizeof(size_t);
                constexpr int __todo = std::min(sizeof(size_t), _Np - __offset);
                if constexpr (__todo == 1)
                  __mem[__offset] = __x[__offset];
                else
                  {
                    const auto __bools =
#ifdef __x86_64__
                      _pdep_u64(__x.template _M_extract<__offset>().to_ullong(),
                                0x0101010101010101ULL);
#else  // __x86_64__
                      _pdep_u32(
                        __x.template _M_extract<__offset>()._M_to_bits(),
                        0x01010101U);
#endif // __x86_64__
                    _S_store<__todo>(__bools, __mem + __offset);
                  }
              });
        }
      else if constexpr (__have_sse2 && _Np > 7)
        __execute_n_times<__div_roundup(_Np, 16)>([&](auto __i) {
          constexpr int __offset = __i * 16;
          constexpr int __todo = std::min(16, int(_Np) - __offset);
          const int __bits = __x.template _M_extract<__offset>()._M_to_bits();
          __vector_type16_t<_UChar> __bools;
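          // Expand the __todo mask bits into one 0/1 byte each: with AVX512F
          // via a masked move of 1s followed by two packing steps
          // (32-bit -> 16-bit -> 8-bit lanes), otherwise by broadcasting the
          // bits across the vector and testing each byte's bit position.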
          if constexpr (__have_avx512f)
            {
              auto __as32bits
                = _mm512_maskz_mov_epi32(__bits,
                                         __to_intrin(__vector_broadcast<16>(1)));
              auto __as16bits
                = __xzyw(_mm256_packs_epi32(__lo256(__as32bits),
                                            __todo > 8 ? __hi256(__as32bits)
                                                       : __m256i()));
              __bools = __vector_bitcast<_UChar>(
                _mm_packs_epi16(__lo128(__as16bits), __hi128(__as16bits)));
            }
          else
            {
              using _V = __vector_type_t<_UChar, 16>;
              auto __tmp = _mm_cvtsi32_si128(__bits);
              __tmp = _mm_unpacklo_epi8(__tmp, __tmp);
              __tmp = _mm_unpacklo_epi16(__tmp, __tmp);
              __tmp = _mm_unpacklo_epi32(__tmp, __tmp);
              _V __tmp2 = reinterpret_cast<_V>(__tmp);
              __tmp2 &= _V{1, 2, 4, 8, 16, 32, 64, 128,
                           1, 2, 4, 8, 16, 32, 64, 128}; // mask bit index
              __bools = (__tmp2 == 0) + 1; // 0xff -> 0x00 | 0x00 -> 0x01
            }
          _S_store<__todo>(__bools, __mem + __offset);
        });
      else
        _CommonImplBuiltin::_S_store_bool_array(__x, __mem);
    }

  // }}}
  // _S_blend_avx512 {{{
  // Returns: __k ? __b : __a
  // TODO: reverse __a and __b to match COND_EXPR
  // Requires: _TV to be a __vector_type_t matching valuetype for the bitmask
  // __k
  template <typename _Kp, typename _TV>
    _GLIBCXX_SIMD_INTRINSIC static _TV
    _S_blend_avx512(const _Kp __k, const _TV __a, const _TV __b) noexcept
    {
#ifdef __clang__
      // FIXME: this does a boolean choice, not a blend
      return __k ? __a : __b;
#else
      static_assert(__is_vector_type_v<_TV>);
      using _Tp = typename _VectorTraits<_TV>::value_type;
      static_assert(sizeof(_TV) >= 16);
      static_assert(sizeof(_Tp) <= 8);
      using _IntT
        = conditional_t<(sizeof(_Tp) > 2),
                        conditional_t<sizeof(_Tp) == 4, int, long long>,
                        conditional_t<sizeof(_Tp) == 1, char, short>>;
      [[maybe_unused]] const auto __aa = __vector_bitcast<_IntT>(__a);
      [[maybe_unused]] const auto __bb = __vector_bitcast<_IntT>(__b);
      if constexpr (sizeof(_TV) == 64)
        {
          if constexpr (sizeof(_Tp) == 1)
            return reinterpret_cast<_TV>(
              __builtin_ia32_blendmb_512_mask(__aa, __bb, __k));
          else if constexpr (sizeof(_Tp) == 2)
            return reinterpret_cast<_TV>(
              __builtin_ia32_blendmw_512_mask(__aa, __bb, __k));
          else if constexpr (sizeof(_Tp) == 4 && is_floating_point_v<_Tp>)
            return __builtin_ia32_blendmps_512_mask(__a, __b, __k);
          else if constexpr (sizeof(_Tp) == 4)
            return reinterpret_cast<_TV>(
              __builtin_ia32_blendmd_512_mask(__aa, __bb, __k));
          else if constexpr (sizeof(_Tp) == 8 && is_floating_point_v<_Tp>)
            return __builtin_ia32_blendmpd_512_mask(__a, __b, __k);
          else if constexpr (sizeof(_Tp) == 8)
            return reinterpret_cast<_TV>(
              __builtin_ia32_blendmq_512_mask(__aa, __bb, __k));
        }
      else if constexpr (sizeof(_TV) == 32)
        {
          if constexpr (sizeof(_Tp) == 1)
            return reinterpret_cast<_TV>(
              __builtin_ia32_blendmb_256_mask(__aa, __bb, __k));
          else if constexpr (sizeof(_Tp) == 2)
            return reinterpret_cast<_TV>(
              __builtin_ia32_blendmw_256_mask(__aa, __bb, __k));
          else if constexpr (sizeof(_Tp) == 4 && is_floating_point_v<_Tp>)
            return __builtin_ia32_blendmps_256_mask(__a, __b, __k);
          else if constexpr (sizeof(_Tp) == 4)
            return reinterpret_cast<_TV>(
              __builtin_ia32_blendmd_256_mask(__aa, __bb, __k));
          else if constexpr (sizeof(_Tp) == 8 && is_floating_point_v<_Tp>)
            return __builtin_ia32_blendmpd_256_mask(__a, __b, __k);
          else if constexpr (sizeof(_Tp) == 8)
            return reinterpret_cast<_TV>(
              __builtin_ia32_blendmq_256_mask(__aa, __bb, __k));
        }
      else if constexpr (sizeof(_TV) == 16)
        {
          if constexpr (sizeof(_Tp) == 1)
            return reinterpret_cast<_TV>(
              __builtin_ia32_blendmb_128_mask(__aa, __bb, __k));
          else if constexpr (sizeof(_Tp) == 2)
            return reinterpret_cast<_TV>(
              __builtin_ia32_blendmw_128_mask(__aa, __bb, __k));
          else if constexpr (sizeof(_Tp) == 4 && is_floating_point_v<_Tp>)
            return __builtin_ia32_blendmps_128_mask(__a, __b, __k);
          else if constexpr (sizeof(_Tp) == 4)
            return reinterpret_cast<_TV>(
              __builtin_ia32_blendmd_128_mask(__aa, __bb, __k));
          else if constexpr (sizeof(_Tp) == 8 && is_floating_point_v<_Tp>)
            return __builtin_ia32_blendmpd_128_mask(__a, __b, __k);
          else if constexpr (sizeof(_Tp) == 8)
            return reinterpret_cast<_TV>(
              __builtin_ia32_blendmq_128_mask(__aa, __bb, __k));
        }
#endif
    }

  // }}}
  // _S_blend_intrin {{{
  // Returns: __k ? __b : __a
  // TODO: reverse __a and __b to match COND_EXPR
  // Requires: _Tp to be an intrinsic type (integers blend per byte) and 16/32
  // Bytes wide
  template <typename _Tp>
    _GLIBCXX_SIMD_INTRINSIC static _Tp
    _S_blend_intrin(_Tp __k, _Tp __a, _Tp __b) noexcept
    {
      static_assert(is_same_v<decltype(__to_intrin(__a)), _Tp>);
      constexpr struct
      {
        _GLIBCXX_SIMD_INTRINSIC __m128
        operator()(__m128 __a, __m128 __b, __m128 __k) const noexcept
        { return __builtin_ia32_blendvps(__a, __b, __k); }

        _GLIBCXX_SIMD_INTRINSIC __m128d
        operator()(__m128d __a, __m128d __b, __m128d __k) const noexcept
        { return __builtin_ia32_blendvpd(__a, __b, __k); }

        _GLIBCXX_SIMD_INTRINSIC __m128i
        operator()(__m128i __a, __m128i __b, __m128i __k) const noexcept
        {
          return reinterpret_cast<__m128i>(
            __builtin_ia32_pblendvb128(reinterpret_cast<__v16qi>(__a),
                                       reinterpret_cast<__v16qi>(__b),
                                       reinterpret_cast<__v16qi>(__k)));
        }

        _GLIBCXX_SIMD_INTRINSIC __m256
        operator()(__m256 __a, __m256 __b, __m256 __k) const noexcept
        { return __builtin_ia32_blendvps256(__a, __b, __k); }

        _GLIBCXX_SIMD_INTRINSIC __m256d
        operator()(__m256d __a, __m256d __b, __m256d __k) const noexcept
        { return __builtin_ia32_blendvpd256(__a, __b, __k); }

        _GLIBCXX_SIMD_INTRINSIC __m256i
        operator()(__m256i __a, __m256i __b, __m256i __k) const noexcept
        {
          if constexpr (__have_avx2)
            return reinterpret_cast<__m256i>(
              __builtin_ia32_pblendvb256(reinterpret_cast<__v32qi>(__a),
                                         reinterpret_cast<__v32qi>(__b),
                                         reinterpret_cast<__v32qi>(__k)));
          else
            return reinterpret_cast<__m256i>(
              __builtin_ia32_blendvps256(reinterpret_cast<__v8sf>(__a),
                                         reinterpret_cast<__v8sf>(__b),
                                         reinterpret_cast<__v8sf>(__k)));
        }
      } __eval;
      return __eval(__a, __b, __k);
    }

  // }}}
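  // Illustration of the blend helpers' argument order (element-wise
  // __k ? __b : __a), with hypothetical 4 x int values (not code used by the
  // implementation):
  //
  //   __a = {10, 11, 12, 13}, __b = {20, 21, 22, 23}, bitmask __k = 0b0110
  //   _S_blend_avx512(__k, __a, __b) == {10, 21, 22, 13}
  //
  // i.e. a set mask bit selects the element from the second vector argument.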
  // _S_blend {{{
  // Returns: __k ? __at1 : __at0
  // TODO: reverse __at0 and __at1 to match COND_EXPR
  template <typename _Tp, size_t _Np>
    _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
    _S_blend(_SimdWrapper<bool, _Np> __k, _SimdWrapper<_Tp, _Np> __at0,
             _SimdWrapper<_Tp, _Np> __at1)
    {
      static_assert(is_same_v<_Tp, _Tp> && __have_avx512f);
      if (__k._M_is_constprop() && __at0._M_is_constprop()
          && __at1._M_is_constprop())
        return __generate_from_n_evaluations<_Np, __vector_type_t<_Tp, _Np>>(
          [&](auto __i) constexpr {
            return __k[__i] ? __at1[__i] : __at0[__i];
          });
      else if constexpr (sizeof(__at0) == 64
                         || (__have_avx512vl && sizeof(__at0) >= 16))
        return _S_blend_avx512(__k._M_data, __at0._M_data, __at1._M_data);
      else
        {
          static_assert((__have_avx512vl && sizeof(__at0) < 16)
                        || !__have_avx512vl);
          constexpr size_t __size = (__have_avx512vl ? 16 : 64) / sizeof(_Tp);
          return __vector_bitcast<_Tp, _Np>(
            _S_blend_avx512(__k._M_data, __vector_bitcast<_Tp, __size>(__at0),
                            __vector_bitcast<_Tp, __size>(__at1)));
        }
    }

  template <typename _Tp, size_t _Np>
    _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
    _S_blend(_SimdWrapper<__int_for_sizeof_t<_Tp>, _Np> __k,
             _SimdWrapper<_Tp, _Np> __at0, _SimdWrapper<_Tp, _Np> __at1)
    {
      const auto __kk = __wrapper_bitcast<_Tp>(__k);
      if (__builtin_is_constant_evaluated()
          || (__kk._M_is_constprop() && __at0._M_is_constprop()
              && __at1._M_is_constprop()))
        {
          auto __r = __or(__andnot(__kk, __at0), __and(__kk, __at1));
          if (__r._M_is_constprop())
            return __r;
        }
      if constexpr (((__have_avx512f && sizeof(__at0) == 64) || __have_avx512vl)
                    && (sizeof(_Tp) >= 4 || __have_avx512bw))
        // convert to bitmask and call overload above
        return _S_blend(
          _SimdWrapper<bool, _Np>(
            __make_dependent_t<_Tp, _MaskImplX86Mixin>::_S_to_bits(__k)
              ._M_to_bits()),
          __at0, __at1);
      else
        {
          // Since GCC does not assume __k to be a mask, using the builtin
          // conditional operator introduces an extra compare against 0 before
          // blending. So we rather call the intrinsic here.
          if constexpr (__have_sse4_1)
            return _S_blend_intrin(__to_intrin(__kk), __to_intrin(__at0),
                                   __to_intrin(__at1));
          else
            return __or(__andnot(__kk, __at0), __and(__kk, __at1));
        }
    }

  // }}}
};

// }}}
// _SimdImplX86 {{{
template <typename _Abi>
  struct _SimdImplX86 : _SimdImplBuiltin<_Abi>
  {
    using _Base = _SimdImplBuiltin<_Abi>;

    template <typename _Tp>
      using _MaskMember = typename _Base::template _MaskMember<_Tp>;

    template <typename _Tp>
      static constexpr size_t _S_full_size = _Abi::template _S_full_size<_Tp>;

    template <typename _Tp>
      static constexpr size_t _S_size = _Abi::template _S_size<_Tp>;

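    // Size in bytes of the widest store the tuned code paths below will use
    // for _Tp: 64 with AVX512F for elements of at least 4 bytes (AVX512BW for
    // smaller elements), 32 with AVX (floating-point) or AVX2, otherwise 16.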
    template <typename _Tp>
      static constexpr size_t _S_max_store_size
        = (sizeof(_Tp) >= 4 && __have_avx512f) || __have_avx512bw    ? 64
          : (is_floating_point_v<_Tp> && __have_avx) || __have_avx2  ? 32
                                                                     : 16;

    using _MaskImpl = typename _Abi::_MaskImpl;

    // _S_masked_load {{{
    template <typename _Tp, size_t _Np, typename _Up>
      static inline _SimdWrapper<_Tp, _Np>
      _S_masked_load(_SimdWrapper<_Tp, _Np> __merge, _MaskMember<_Tp> __k,
                     const _Up* __mem) noexcept
      {
        static_assert(_Np == _S_size<_Tp>);
        if constexpr (is_same_v<_Tp, _Up> || // no conversion
                      (sizeof(_Tp) == sizeof(_Up)
                       && is_integral_v<_Tp>
                            == is_integral_v<_Up>) // conversion via bit
                                                   // reinterpretation
        )
          {
            [[maybe_unused]] const auto __intrin = __to_intrin(__merge);
            if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512bw_vl)
                          && sizeof(_Tp) == 1)
              {
                const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits();
                if constexpr (sizeof(__intrin) == 16)
                  __merge = __vector_bitcast<_Tp, _Np>(
                    _mm_mask_loadu_epi8(__intrin, __kk, __mem));
                else if constexpr (sizeof(__merge) == 32)
                  __merge = __vector_bitcast<_Tp, _Np>(
                    _mm256_mask_loadu_epi8(__intrin, __kk, __mem));
                else if constexpr (sizeof(__merge) == 64)
                  __merge = __vector_bitcast<_Tp, _Np>(
                    _mm512_mask_loadu_epi8(__intrin, __kk, __mem));
                else
                  __assert_unreachable<_Tp>();
              }
            else if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512bw_vl)
                               && sizeof(_Tp) == 2)
              {
                const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits();
                if constexpr (sizeof(__intrin) == 16)
                  __merge = __vector_bitcast<_Tp, _Np>(
                    _mm_mask_loadu_epi16(__intrin, __kk, __mem));
                else if constexpr (sizeof(__intrin) == 32)
                  __merge = __vector_bitcast<_Tp, _Np>(
                    _mm256_mask_loadu_epi16(__intrin, __kk, __mem));
                else if constexpr (sizeof(__intrin) == 64)
                  __merge = __vector_bitcast<_Tp, _Np>(
                    _mm512_mask_loadu_epi16(__intrin, __kk, __mem));
                else
                  __assert_unreachable<_Tp>();
              }
            else if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512vl)
                               && sizeof(_Tp) == 4 && is_integral_v<_Up>)
              {
                const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits();
                if constexpr (sizeof(__intrin) == 16)
                  __merge = __vector_bitcast<_Tp, _Np>(
                    _mm_mask_loadu_epi32(__intrin, __kk, __mem));
                else if constexpr (sizeof(__intrin) == 32)
                  __merge = __vector_bitcast<_Tp, _Np>(
                    _mm256_mask_loadu_epi32(__intrin, __kk, __mem));
                else if constexpr (sizeof(__intrin) == 64)
                  __merge = __vector_bitcast<_Tp, _Np>(
                    _mm512_mask_loadu_epi32(__intrin, __kk, __mem));
                else
                  __assert_unreachable<_Tp>();
              }
            else if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512vl)
                               && sizeof(_Tp) == 4 && is_floating_point_v<_Up>)
              {
                const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits();
                if constexpr (sizeof(__intrin) == 16)
                  __merge = __vector_bitcast<_Tp, _Np>(
                    _mm_mask_loadu_ps(__intrin, __kk, __mem));
                else if constexpr (sizeof(__intrin) == 32)
                  __merge = __vector_bitcast<_Tp, _Np>(
                    _mm256_mask_loadu_ps(__intrin, __kk, __mem));
                else if constexpr (sizeof(__intrin) == 64)
                  __merge = __vector_bitcast<_Tp, _Np>(
                    _mm512_mask_loadu_ps(__intrin, __kk, __mem));
                else
                  __assert_unreachable<_Tp>();
              }
            else if constexpr (__have_avx2 && sizeof(_Tp) == 4
                               && is_integral_v<_Up>)
              {
                static_assert(sizeof(__intrin) == 16 || sizeof(__intrin) == 32);
                __merge
                  = __or(__andnot(__vector_bitcast<_Tp>(__k), __merge._M_data),
                         __vector_bitcast<_Tp, _Np>(
                           __maskload_epi32(reinterpret_cast<const int*>(__mem),
                                            __to_intrin(__k))));
              }
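            // Without AVX2 the 4-byte case falls through to the next branch
            // and uses the AVX float maskload on the same bits reinterpreted
            // as float.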
            else if constexpr (__have_avx && sizeof(_Tp) == 4)
              {
                static_assert(sizeof(__intrin) == 16 || sizeof(__intrin) == 32);
                __merge
                  = __or(__andnot(__vector_bitcast<_Tp>(__k), __merge._M_data),
                         __vector_bitcast<_Tp, _Np>(
                           __maskload_ps(reinterpret_cast<const float*>(__mem),
                                         __to_intrin(__k))));
              }
            else if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512vl)
                               && sizeof(_Tp) == 8 && is_integral_v<_Up>)
              {
                const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits();
                if constexpr (sizeof(__intrin) == 16)
                  __merge = __vector_bitcast<_Tp, _Np>(
                    _mm_mask_loadu_epi64(__intrin, __kk, __mem));
                else if constexpr (sizeof(__intrin) == 32)
                  __merge = __vector_bitcast<_Tp, _Np>(
                    _mm256_mask_loadu_epi64(__intrin, __kk, __mem));
                else if constexpr (sizeof(__intrin) == 64)
                  __merge = __vector_bitcast<_Tp, _Np>(
                    _mm512_mask_loadu_epi64(__intrin, __kk, __mem));
                else
                  __assert_unreachable<_Tp>();
              }
            else if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512vl)
                               && sizeof(_Tp) == 8 && is_floating_point_v<_Up>)
              {
                const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits();
                if constexpr (sizeof(__intrin) == 16)
                  __merge = __vector_bitcast<_Tp, _Np>(
                    _mm_mask_loadu_pd(__intrin, __kk, __mem));
                else if constexpr (sizeof(__intrin) == 32)
                  __merge = __vector_bitcast<_Tp, _Np>(
                    _mm256_mask_loadu_pd(__intrin, __kk, __mem));
                else if constexpr (sizeof(__intrin) == 64)
                  __merge = __vector_bitcast<_Tp, _Np>(
                    _mm512_mask_loadu_pd(__intrin, __kk, __mem));
                else
                  __assert_unreachable<_Tp>();
              }
            else if constexpr (__have_avx2 && sizeof(_Tp) == 8
                               && is_integral_v<_Up>)
              {
                static_assert(sizeof(__intrin) == 16 || sizeof(__intrin) == 32);
                __merge
                  = __or(__andnot(__vector_bitcast<_Tp>(__k), __merge._M_data),
                         __vector_bitcast<_Tp, _Np>(__maskload_epi64(
                           reinterpret_cast<const _LLong*>(__mem),
                           __to_intrin(__k))));
              }
            else if constexpr (__have_avx && sizeof(_Tp) == 8)
              {
                static_assert(sizeof(__intrin) == 16 || sizeof(__intrin) == 32);
                __merge
                  = __or(__andnot(__vector_bitcast<_Tp>(__k), __merge._M_data),
                         __vector_bitcast<_Tp, _Np>(
                           __maskload_pd(reinterpret_cast<const double*>(__mem),
                                         __to_intrin(__k))));
              }
            else
              _BitOps::_S_bit_iteration(_MaskImpl::_S_to_bits(__k),
                                        [&](auto __i) {
                                          __merge._M_set(
                                            __i, static_cast<_Tp>(__mem[__i]));
                                        });
          }
        /* Very uncertain, that the following improves anything. Needs
        benchmarking
         * before it's activated.
        else if constexpr (sizeof(_Up) <= 8 && // no long double
                           !__converts_via_decomposition_v<
                             _Up, _Tp,
                             sizeof(__merge)> // conversion via decomposition
                                              // is better handled via the
                                              // bit_iteration fallback below
        )
          {
            // TODO: copy pattern from _S_masked_store, which doesn't resort to
            // fixed_size
            using _Ap = simd_abi::deduce_t<_Up, _Np>;
            using _ATraits = _SimdTraits<_Up, _Ap>;
            using _AImpl = typename _ATraits::_SimdImpl;
            typename _ATraits::_SimdMember __uncvted{};
            typename _ATraits::_MaskMember __kk
              = _Ap::_MaskImpl::template _S_convert<_Up>(__k);
            __uncvted = _AImpl::_S_masked_load(__uncvted, __kk, __mem);
            _SimdConverter<_Up, _Ap, _Tp, _Abi> __converter;
            _Base::_S_masked_assign(__k, __merge, __converter(__uncvted));
          }
        */
        else
          __merge = _Base::_S_masked_load(__merge, __k, __mem);
        return __merge;
      }

    // }}}
    // _S_masked_store_nocvt {{{
    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static void
      _S_masked_store_nocvt(_SimdWrapper<_Tp, _Np> __v, _Tp* __mem,
                            _SimdWrapper<bool, _Np> __k)
      {
        [[maybe_unused]] const auto __vi = __to_intrin(__v);
        if constexpr (sizeof(__vi) == 64)
          {
            static_assert(sizeof(__v) == 64 && __have_avx512f);
            if constexpr (__have_avx512bw && sizeof(_Tp) == 1)
              _mm512_mask_storeu_epi8(__mem, __k, __vi);
            else if constexpr (__have_avx512bw && sizeof(_Tp) == 2)
              _mm512_mask_storeu_epi16(__mem, __k, __vi);
            else if constexpr (__have_avx512f && sizeof(_Tp) == 4)
              {
                if constexpr (is_integral_v<_Tp>)
                  _mm512_mask_storeu_epi32(__mem, __k, __vi);
                else
                  _mm512_mask_storeu_ps(__mem, __k, __vi);
              }
            else if constexpr (__have_avx512f && sizeof(_Tp) == 8)
              {
                if constexpr (is_integral_v<_Tp>)
                  _mm512_mask_storeu_epi64(__mem, __k, __vi);
                else
                  _mm512_mask_storeu_pd(__mem, __k, __vi);
              }
#if 0 // with KNL either sizeof(_Tp) >= 4 or sizeof(_vi) <= 32
      // with Skylake-AVX512, __have_avx512bw is true
            else if constexpr (__have_sse2)
              {
                using _M = __vector_type_t<_Tp, _Np>;
                using _MVT = _VectorTraits<_M>;
                _mm_maskmoveu_si128(
                  __auto_bitcast(__extract<0, 4>(__v._M_data)),
                  __auto_bitcast(
                    _MaskImpl::template _S_convert<_Tp, _Np>(__k._M_data)),
                  reinterpret_cast<char*>(__mem));
                _mm_maskmoveu_si128(
                  __auto_bitcast(__extract<1, 4>(__v._M_data)),
                  __auto_bitcast(_MaskImpl::template _S_convert<_Tp, _Np>(
                    __k._M_data >> 1 * _MVT::_S_full_size)),
                  reinterpret_cast<char*>(__mem) + 1 * 16);
                _mm_maskmoveu_si128(
                  __auto_bitcast(__extract<2, 4>(__v._M_data)),
                  __auto_bitcast(_MaskImpl::template _S_convert<_Tp, _Np>(
                    __k._M_data >> 2 * _MVT::_S_full_size)),
                  reinterpret_cast<char*>(__mem) + 2 * 16);
                if constexpr (_Np > 48 / sizeof(_Tp))
                  _mm_maskmoveu_si128(
                    __auto_bitcast(__extract<3, 4>(__v._M_data)),
                    __auto_bitcast(_MaskImpl::template _S_convert<_Tp, _Np>(
                      __k._M_data >> 3 * _MVT::_S_full_size)),
                    reinterpret_cast<char*>(__mem) + 3 * 16);
              }
#endif
            else
              __assert_unreachable<_Tp>();
          }
        else if constexpr (sizeof(__vi) == 32)
          {
            if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 1)
              _mm256_mask_storeu_epi8(__mem, __k, __vi);
            else if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 2)
              _mm256_mask_storeu_epi16(__mem, __k, __vi);
            else if constexpr (__have_avx512vl && sizeof(_Tp) == 4)
              {
                if constexpr (is_integral_v<_Tp>)
                  _mm256_mask_storeu_epi32(__mem, __k, __vi);
                else
                  _mm256_mask_storeu_ps(__mem, __k, __vi);
              }
            else if constexpr (__have_avx512vl && sizeof(_Tp) == 8)
              {
                if constexpr (is_integral_v<_Tp>)
                  _mm256_mask_storeu_epi64(__mem, __k, __vi);
                else
                  _mm256_mask_storeu_pd(__mem, __k, __vi);
              }
            else if constexpr (__have_avx512f
                               && (sizeof(_Tp) >= 4 || __have_avx512bw))
              {
                // use a 512-bit maskstore, using zero-extension of the bitmask
                _S_masked_store_nocvt(
                  _SimdWrapper64<_Tp>(
                    __intrin_bitcast<__vector_type64_t<_Tp>>(__v._M_data)),
                  __mem, _SimdWrapper<bool, 64 / sizeof(_Tp)>(__k._M_data));
              }
            else
              _S_masked_store_nocvt(__v, __mem,
                                    _MaskImpl::template _S_to_maskvector<
                                      __int_for_sizeof_t<_Tp>, _Np>(__k));
          }
        else if constexpr (sizeof(__vi) == 16)
          {
            if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 1)
              _mm_mask_storeu_epi8(__mem, __k, __vi);
            else if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 2)
              _mm_mask_storeu_epi16(__mem, __k, __vi);
            else if constexpr (__have_avx512vl && sizeof(_Tp) == 4)
              {
                if constexpr (is_integral_v<_Tp>)
                  _mm_mask_storeu_epi32(__mem, __k, __vi);
                else
                  _mm_mask_storeu_ps(__mem, __k, __vi);
              }
            else if constexpr (__have_avx512vl && sizeof(_Tp) == 8)
              {
                if constexpr (is_integral_v<_Tp>)
                  _mm_mask_storeu_epi64(__mem, __k, __vi);
                else
                  _mm_mask_storeu_pd(__mem, __k, __vi);
              }
            else if constexpr (__have_avx512f
                               && (sizeof(_Tp) >= 4 || __have_avx512bw))
              {
                // use a 512-bit maskstore, using zero-extension of the bitmask
                _S_masked_store_nocvt(
                  _SimdWrapper64<_Tp>(
                    __intrin_bitcast<__intrinsic_type64_t<_Tp>>(__v._M_data)),
                  __mem, _SimdWrapper<bool, 64 / sizeof(_Tp)>(__k._M_data));
              }
            else
              _S_masked_store_nocvt(__v, __mem,
                                    _MaskImpl::template _S_to_maskvector<
                                      __int_for_sizeof_t<_Tp>, _Np>(__k));
          }
        else
          __assert_unreachable<_Tp>();
      }

    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static void
      _S_masked_store_nocvt(_SimdWrapper<_Tp, _Np> __v, _Tp* __mem,
                            _SimdWrapper<__int_for_sizeof_t<_Tp>, _Np> __k)
      {
        if constexpr (sizeof(__v) <= 16)
          {
            [[maybe_unused]] const auto __vi
              = __intrin_bitcast<__m128i>(__as_vector(__v));
            [[maybe_unused]] const auto __ki
              = __intrin_bitcast<__m128i>(__as_vector(__k));
            if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 1)
              _mm_mask_storeu_epi8(__mem, _mm_movepi8_mask(__ki), __vi);
            else if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 2)
              _mm_mask_storeu_epi16(__mem, _mm_movepi16_mask(__ki), __vi);
            else if constexpr (__have_avx2 && sizeof(_Tp) == 4
                               && is_integral_v<_Tp>)
              _mm_maskstore_epi32(reinterpret_cast<int*>(__mem), __ki, __vi);
            else if constexpr (__have_avx && sizeof(_Tp) == 4)
              _mm_maskstore_ps(reinterpret_cast<float*>(__mem), __ki,
                               __vector_bitcast<float>(__vi));
            else if constexpr (__have_avx2 && sizeof(_Tp) == 8
                               && is_integral_v<_Tp>)
              _mm_maskstore_epi64(reinterpret_cast<_LLong*>(__mem), __ki, __vi);
            else if constexpr (__have_avx && sizeof(_Tp) == 8)
              _mm_maskstore_pd(reinterpret_cast<double*>(__mem), __ki,
                               __vector_bitcast<double>(__vi));
            else if constexpr (__have_sse2)
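              // MASKMOVDQU stores the bytes whose corresponding mask byte has
              // its most significant bit set; note it has non-temporal
              // (cache-bypassing) store semantics.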
              _mm_maskmoveu_si128(__vi, __ki, reinterpret_cast<char*>(__mem));
          }
        else if constexpr (sizeof(__v) == 32)
          {
            [[maybe_unused]] const auto __vi
              = __intrin_bitcast<__m256i>(__as_vector(__v));
            [[maybe_unused]] const auto __ki
              = __intrin_bitcast<__m256i>(__as_vector(__k));
            if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 1)
              _mm256_mask_storeu_epi8(__mem, _mm256_movepi8_mask(__ki), __vi);
            else if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 2)
              _mm256_mask_storeu_epi16(__mem, _mm256_movepi16_mask(__ki), __vi);
            else if constexpr (__have_avx2 && sizeof(_Tp) == 4
                               && is_integral_v<_Tp>)
              _mm256_maskstore_epi32(reinterpret_cast<int*>(__mem), __ki, __vi);
            else if constexpr (sizeof(_Tp) == 4)
              _mm256_maskstore_ps(reinterpret_cast<float*>(__mem), __ki,
                                  __vector_bitcast<float>(__v));
            else if constexpr (__have_avx2 && sizeof(_Tp) == 8
                               && is_integral_v<_Tp>)
              _mm256_maskstore_epi64(reinterpret_cast<_LLong*>(__mem), __ki,
                                     __vi);
            else if constexpr (__have_avx && sizeof(_Tp) == 8)
              _mm256_maskstore_pd(reinterpret_cast<double*>(__mem), __ki,
                                  __vector_bitcast<double>(__v));
            else if constexpr (__have_sse2)
              {
                _mm_maskmoveu_si128(__lo128(__vi), __lo128(__ki),
                                    reinterpret_cast<char*>(__mem));
                _mm_maskmoveu_si128(__hi128(__vi), __hi128(__ki),
                                    reinterpret_cast<char*>(__mem) + 16);
              }
          }
        else
          __assert_unreachable<_Tp>();
      }

    // }}}
    // _S_masked_store {{{
    template <typename _Tp, size_t _Np, typename _Up>
      _GLIBCXX_SIMD_INTRINSIC static void
      _S_masked_store(const _SimdWrapper<_Tp, _Np> __v, _Up* __mem,
                      const _MaskMember<_Tp> __k) noexcept
      {
        if constexpr (is_integral_v<_Tp> && is_integral_v<_Up>
                      && sizeof(_Tp) > sizeof(_Up) && __have_avx512f
                      && (sizeof(_Tp) >= 4 || __have_avx512bw)
                      && (sizeof(__v) == 64 || __have_avx512vl))
          { // truncating store
            const auto __vi = __to_intrin(__v);
            const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits();
            if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 4
                          && sizeof(__vi) == 64)
              _mm512_mask_cvtepi64_storeu_epi32(__mem, __kk, __vi);
            else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 4
                               && sizeof(__vi) == 32)
              _mm256_mask_cvtepi64_storeu_epi32(__mem, __kk, __vi);
            else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 4
                               && sizeof(__vi) == 16)
              _mm_mask_cvtepi64_storeu_epi32(__mem, __kk, __vi);
            else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 2
                               && sizeof(__vi) == 64)
              _mm512_mask_cvtepi64_storeu_epi16(__mem, __kk, __vi);
            else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 2
                               && sizeof(__vi) == 32)
              _mm256_mask_cvtepi64_storeu_epi16(__mem, __kk, __vi);
            else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 2
                               && sizeof(__vi) == 16)
              _mm_mask_cvtepi64_storeu_epi16(__mem, __kk, __vi);
            else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 1
                               && sizeof(__vi) == 64)
              _mm512_mask_cvtepi64_storeu_epi8(__mem, __kk, __vi);
            else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 1
                               && sizeof(__vi) == 32)
              _mm256_mask_cvtepi64_storeu_epi8(__mem, __kk, __vi);
            else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 1
                               && sizeof(__vi) == 16)
              _mm_mask_cvtepi64_storeu_epi8(__mem, __kk, __vi);
            else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 2
                               && sizeof(__vi) == 64)
              _mm512_mask_cvtepi32_storeu_epi16(__mem, __kk, __vi);
            else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 2
                               && sizeof(__vi) == 32)
              _mm256_mask_cvtepi32_storeu_epi16(__mem, __kk, __vi);
            else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 2
                               && sizeof(__vi) == 16)
              _mm_mask_cvtepi32_storeu_epi16(__mem, __kk, __vi);
            else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 1
                               && sizeof(__vi) == 64)
              _mm512_mask_cvtepi32_storeu_epi8(__mem, __kk, __vi);
            else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 1
                               && sizeof(__vi) == 32)
              _mm256_mask_cvtepi32_storeu_epi8(__mem, __kk, __vi);
            else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 1
                               && sizeof(__vi) == 16)
              _mm_mask_cvtepi32_storeu_epi8(__mem, __kk, __vi);
            else if constexpr (sizeof(_Tp) == 2 && sizeof(_Up) == 1
                               && sizeof(__vi) == 64)
              _mm512_mask_cvtepi16_storeu_epi8(__mem, __kk, __vi);
            else if constexpr (sizeof(_Tp) == 2 && sizeof(_Up) == 1
                               && sizeof(__vi) == 32)
              _mm256_mask_cvtepi16_storeu_epi8(__mem, __kk, __vi);
            else if constexpr (sizeof(_Tp) == 2 && sizeof(_Up) == 1
                               && sizeof(__vi) == 16)
              _mm_mask_cvtepi16_storeu_epi8(__mem, __kk, __vi);
            else
              __assert_unreachable<_Tp>();
          }
        else
          _Base::_S_masked_store(__v, __mem, __k);
      }

    // }}}
    // _S_multiplies {{{
    template <typename _V, typename _VVT = _VectorTraits<_V>>
      _GLIBCXX_SIMD_INTRINSIC static constexpr _V
      _S_multiplies(_V __x, _V __y)
      {
        using _Tp = typename _VVT::value_type;
        if (__builtin_is_constant_evaluated() || __x._M_is_constprop()
            || __y._M_is_constprop())
          return __as_vector(__x) * __as_vector(__y);
        else if constexpr (sizeof(_Tp) == 1)
          {
            if constexpr (sizeof(_V) == 2)
              {
                const auto __xs = reinterpret_cast<short>(__x._M_data);
                const auto __ys = reinterpret_cast<short>(__y._M_data);
                return reinterpret_cast<__vector_type_t<_Tp, 2>>(short(
                  ((__xs * __ys) & 0xff) | ((__xs >> 8) * (__ys & 0xff00))));
              }
            else if constexpr (sizeof(_V) == 4 && _VVT::_S_partial_width == 3)
              {
                const auto __xi = reinterpret_cast<int>(__x._M_data);
                const auto __yi = reinterpret_cast<int>(__y._M_data);
                return reinterpret_cast<__vector_type_t<_Tp, 3>>(
                  ((__xi * __yi) & 0xff)
                  | (((__xi >> 8) * (__yi & 0xff00)) & 0xff00)
                  | ((__xi >> 16) * (__yi & 0xff0000)));
              }
            else if constexpr (sizeof(_V) == 4)
              {
                const auto __xi = reinterpret_cast<int>(__x._M_data);
                const auto __yi = reinterpret_cast<int>(__y._M_data);
                return reinterpret_cast<__vector_type_t<_Tp, 4>>(
                  ((__xi * __yi) & 0xff)
                  | (((__xi >> 8) * (__yi & 0xff00)) & 0xff00)
                  | (((__xi >> 16) * (__yi & 0xff0000)) & 0xff0000)
                  | ((__xi >> 24) * (__yi & 0xff000000u)));
              }
            else if constexpr (sizeof(_V) == 8 && __have_avx2
                               && is_signed_v<_Tp>)
              return __convert<typename _VVT::type>(
                __vector_bitcast<short>(_mm_cvtepi8_epi16(__to_intrin(__x)))
                * __vector_bitcast<short>(_mm_cvtepi8_epi16(__to_intrin(__y))));
            else if constexpr (sizeof(_V) == 8 && __have_avx2
                               && is_unsigned_v<_Tp>)
              return __convert<typename _VVT::type>(
                __vector_bitcast<short>(_mm_cvtepu8_epi16(__to_intrin(__x)))
                * __vector_bitcast<short>(_mm_cvtepu8_epi16(__to_intrin(__y))));
            else
              {
                // codegen of `x*y` is suboptimal (as of GCC 9.0.1)
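                // Strategy (descriptive): widen to 16-bit lanes.  The products
                // of the even-indexed bytes land in the low byte of each
                // 16-bit lane; the odd-indexed bytes are computed as
                // (__x >> 8) * (__y & 0xff00) so their product already sits in
                // the high byte.  The two halves are then combined with an
                // alternating byte mask.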
                constexpr size_t __full_size = _VVT::_S_full_size;
                constexpr int _Np = sizeof(_V) >= 16 ? __full_size / 2 : 8;
                using _ShortW = _SimdWrapper<short, _Np>;
                const _ShortW __even = __vector_bitcast<short, _Np>(__x)
                                       * __vector_bitcast<short, _Np>(__y);
                _ShortW __high_byte = _ShortW()._M_data - 256;
                //[&]() { asm("" : "+x"(__high_byte._M_data)); }();
                const _ShortW __odd
                  = (__vector_bitcast<short, _Np>(__x) >> 8)
                    * (__vector_bitcast<short, _Np>(__y) & __high_byte._M_data);
                if constexpr (__have_avx512bw && sizeof(_V) > 2)
                  return _CommonImplX86::_S_blend_avx512(
                    0xaaaa'aaaa'aaaa'aaaaLL, __vector_bitcast<_Tp>(__even),
                    __vector_bitcast<_Tp>(__odd));
                else if constexpr (__have_sse4_1 && sizeof(_V) > 2)
                  return _CommonImplX86::_S_blend_intrin(
                    __to_intrin(__high_byte), __to_intrin(__even),
                    __to_intrin(__odd));
                else
                  return __to_intrin(
                    __or(__andnot(__high_byte, __even), __odd));
              }
          }
        else
          return _Base::_S_multiplies(__x, __y);
      }

    // }}}
    // _S_divides {{{
#ifdef _GLIBCXX_SIMD_WORKAROUND_PR90993
    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
      _S_divides(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
      {
        if (!__builtin_is_constant_evaluated()
            && !__builtin_constant_p(__y._M_data))
          if constexpr (is_integral_v<_Tp> && sizeof(_Tp) <= 4)
            { // use divps - codegen of `x/y` is suboptimal (as of GCC 9.0.1)
              // Note that using floating-point division is likely to raise the
              // *Inexact* exception flag and thus appears like an invalid
              // "as-if" transformation. However, C++ doesn't specify how the
              // fpenv can be observed and points to C. C says that function
              // calls are assumed to potentially raise fp exceptions, unless
              // documented otherwise. Consequently, operator/, which is a
              // function call, may raise fp exceptions.
              /*const struct _CsrGuard
              {
                const unsigned _M_data = _mm_getcsr();
                _CsrGuard()
                {
                  _mm_setcsr(0x9f80); // turn off FP exceptions and
                                      // flush-to-zero
                }
                ~_CsrGuard() { _mm_setcsr(_M_data); }
              } __csr;*/
              using _Float = conditional_t<sizeof(_Tp) == 4, double, float>;
              constexpr size_t __n_intermediate
                = std::min(_Np, (__have_avx512f ? 64
                                 : __have_avx   ? 32
                                                : 16)
                                  / sizeof(_Float));
              using _FloatV = __vector_type_t<_Float, __n_intermediate>;
              constexpr size_t __n_floatv
                = __div_roundup(_Np, __n_intermediate);
              using _R = __vector_type_t<_Tp, _Np>;
              const auto __xf = __convert_all<_FloatV, __n_floatv>(__x);
              const auto __yf = __convert_all<_FloatV, __n_floatv>(
                _Abi::__make_padding_nonzero(__as_vector(__y)));
              return __call_with_n_evaluations<__n_floatv>(
                [](auto... __quotients) {
                  return __vector_convert<_R>(__quotients...);
                },
                [&__xf,
                 &__yf](auto __i) -> _SimdWrapper<_Float, __n_intermediate> {
#if !defined __clang__ && __GCC_IEC_559 == 0
                  // If -freciprocal-math is active, using the `/` operator is
                  // incorrect because it may be translated to an imprecise
                  // multiplication with reciprocal. We need to use inline
                  // assembly to force a real division.
                  _FloatV __r;
                  if constexpr (__have_avx) // -mno-sse2avx is irrelevant
                                            // because once -mavx is given, GCC
                                            // emits VEX encoded vdivp[sd]
                    {
                      if constexpr (sizeof(_Tp) == 4)
                        asm("vdivpd\t{%2, %1, %0|%0, %1, %2}"
                            : "=x"(__r)
                            : "x"(__xf[__i]), "x"(__yf[__i]));
                      else
                        asm("vdivps\t{%2, %1, %0|%0, %1, %2}"
                            : "=x"(__r)
                            : "x"(__xf[__i]), "x"(__yf[__i]));
                    }
                  else
                    {
                      __r = __xf[__i];
                      if constexpr (sizeof(_Tp) == 4)
                        asm("divpd\t{%1, %0|%0, %1}"
                            : "=x"(__r)
                            : "x"(__yf[__i]));
                      else
                        asm("divps\t{%1, %0|%0, %1}"
                            : "=x"(__r)
                            : "x"(__yf[__i]));
                    }
                  return __r;
#else
                  return __xf[__i] / __yf[__i];
#endif
                });
            }
        /* 64-bit int division is potentially optimizable via double division
         * if the value in __x is small enough and the conversion between
         * int<->double is efficient enough:
        else if constexpr (is_integral_v<_Tp> && is_unsigned_v<_Tp> &&
                           sizeof(_Tp) == 8)
          {
            if constexpr (__have_sse4_1 && sizeof(__x) == 16)
              {
                if (_mm_test_all_zeros(__x, __m128i{0xffe0'0000'0000'0000ull,
                                                    0xffe0'0000'0000'0000ull}))
                  {
                    __x._M_data | 0x __vector_convert<__m128d>(__x._M_data)
                  }
              }
          }
        */
        return _Base::_S_divides(__x, __y);
      }
#endif // _GLIBCXX_SIMD_WORKAROUND_PR90993

    // }}}
    // _S_modulus {{{
    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
      _S_modulus(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
      {
        if (__builtin_is_constant_evaluated()
            || __builtin_constant_p(__y._M_data) || sizeof(_Tp) >= 8)
          return _Base::_S_modulus(__x, __y);
        else
          return _Base::_S_minus(__x, _S_multiplies(__y, _S_divides(__x, __y)));
      }

    // }}}
    // _S_bit_shift_left {{{
    // Notes on UB. C++2a [expr.shift] says:
    // -1- [...] The operands shall be of integral or unscoped enumeration type
    //     and integral promotions are performed. The type of the result is
    //     that of the promoted left operand. The behavior is undefined if the
    //     right operand is negative, or greater than or equal to the width of
    //     the promoted left operand.
    // -2- The value of E1 << E2 is the unique value congruent to E1×2^E2
    //     modulo 2^N, where N is the width of the type of the result.
    //
    // C++17 [expr.shift] says:
    // -2- The value of E1 << E2 is E1 left-shifted E2 bit positions; vacated
    //     bits are zero-filled. If E1 has an unsigned type, the value of the
    //     result is E1 × 2^E2, reduced modulo one more than the maximum value
    //     representable in the result type. Otherwise, if E1 has a signed type
    //     and non-negative value, and E1 × 2^E2 is representable in the
    //     corresponding unsigned type of the result type, then that value,
    //     converted to the result type, is the resulting value; otherwise, the
    //     behavior is undefined.
    //
    // Consequences:
    // With C++2a signed and unsigned types have the same UB characteristics:
    // - left shift is not UB for 0 <= RHS < max(32, #bits(T))
    //
    // With C++17 there's little room for optimizations because the standard
    // requires all shifts to happen on promoted integrals (i.e. int). Thus,
    // short and char shifts must assume shifts affect bits of neighboring
    // values.
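    // Worked example for the byte-element code below (illustrative): there is
    // no x86 instruction that shifts 8-bit lanes, so a byte vector is shifted
    // in wider lanes and the bits that crossed a byte boundary are masked
    // off.  For __y == 3 every byte is ANDed with (0xff << 3) & 0xff == 0xf8
    // after the wide shift, which is exactly the mask the fallback paths
    // below compute.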
1525 #ifndef _GLIBCXX_SIMD_NO_SHIFT_OPT 1526 template <typename _Tp, typename _TVT = _VectorTraits<_Tp>> 1527 inline _GLIBCXX_CONST static typename _TVT::type 1528 _S_bit_shift_left(_Tp __xx, int __y) 1529 { 1530 using _V = typename _TVT::type; 1531 using _Up = typename _TVT::value_type; 1532 _V __x = __xx; 1533 [[maybe_unused]] const auto __ix = __to_intrin(__x); 1534 if (__builtin_is_constant_evaluated()) 1535 return __x << __y; 1536 #if __cplusplus > 201703 1537 // after C++17, signed shifts have no UB, and behave just like unsigned 1538 // shifts 1539 else if constexpr (sizeof(_Up) == 1 && is_signed_v<_Up>) 1540 return __vector_bitcast<_Up>( 1541 _S_bit_shift_left(__vector_bitcast<make_unsigned_t<_Up>>(__x), 1542 __y)); 1543 #endif 1544 else if constexpr (sizeof(_Up) == 1) 1545 { 1546 // (cf. https://gcc.gnu.org/bugzilla/show_bug.cgi?id=83894) 1547 if (__builtin_constant_p(__y)) 1548 { 1549 if (__y == 0) 1550 return __x; 1551 else if (__y == 1) 1552 return __x + __x; 1553 else if (__y == 2) 1554 { 1555 __x = __x + __x; 1556 return __x + __x; 1557 } 1558 else if (__y > 2 && __y < 8) 1559 { 1560 if constexpr (sizeof(__x) > sizeof(unsigned)) 1561 { 1562 const _UChar __mask = 0xff << __y; // precomputed vector 1563 return __vector_bitcast<_Up>( 1564 __vector_bitcast<_UChar>( 1565 __vector_bitcast<unsigned>(__x) << __y) 1566 & __mask); 1567 } 1568 else 1569 { 1570 const unsigned __mask 1571 = (0xff & (0xff << __y)) * 0x01010101u; 1572 return reinterpret_cast<_V>( 1573 static_cast<__int_for_sizeof_t<_V>>( 1574 unsigned( 1575 reinterpret_cast<__int_for_sizeof_t<_V>>(__x) 1576 << __y) 1577 & __mask)); 1578 } 1579 } 1580 else if (__y >= 8 && __y < 32) 1581 return _V(); 1582 else 1583 __builtin_unreachable(); 1584 } 1585 // general strategy in the following: use an sllv instead of sll 1586 // instruction, because it's 2 to 4 times faster: 1587 else if constexpr (__have_avx512bw_vl && sizeof(__x) == 16) 1588 return __vector_bitcast<_Up>(_mm256_cvtepi16_epi8( 1589 _mm256_sllv_epi16(_mm256_cvtepi8_epi16(__ix), 1590 _mm256_set1_epi16(__y)))); 1591 else if constexpr (__have_avx512bw && sizeof(__x) == 32) 1592 return __vector_bitcast<_Up>(_mm512_cvtepi16_epi8( 1593 _mm512_sllv_epi16(_mm512_cvtepi8_epi16(__ix), 1594 _mm512_set1_epi16(__y)))); 1595 else if constexpr (__have_avx512bw && sizeof(__x) == 64) 1596 { 1597 const auto __shift = _mm512_set1_epi16(__y); 1598 return __vector_bitcast<_Up>( 1599 __concat(_mm512_cvtepi16_epi8(_mm512_sllv_epi16( 1600 _mm512_cvtepi8_epi16(__lo256(__ix)), __shift)), 1601 _mm512_cvtepi16_epi8(_mm512_sllv_epi16( 1602 _mm512_cvtepi8_epi16(__hi256(__ix)), __shift)))); 1603 } 1604 else if constexpr (__have_avx2 && sizeof(__x) == 32) 1605 { 1606 #if 1 1607 const auto __shift = _mm_cvtsi32_si128(__y); 1608 auto __k 1609 = _mm256_sll_epi16(_mm256_slli_epi16(~__m256i(), 8), __shift); 1610 __k |= _mm256_srli_epi16(__k, 8); 1611 return __vector_bitcast<_Up>(_mm256_sll_epi32(__ix, __shift) 1612 & __k); 1613 #else 1614 const _Up __k = 0xff << __y; 1615 return __vector_bitcast<_Up>(__vector_bitcast<int>(__x) << __y) 1616 & __k; 1617 #endif 1618 } 1619 else 1620 { 1621 const auto __shift = _mm_cvtsi32_si128(__y); 1622 auto __k 1623 = _mm_sll_epi16(_mm_slli_epi16(~__m128i(), 8), __shift); 1624 __k |= _mm_srli_epi16(__k, 8); 1625 return __intrin_bitcast<_V>(_mm_sll_epi16(__ix, __shift) & __k); 1626 } 1627 } 1628 return __x << __y; 1629 } 1630 1631 template <typename _Tp, typename _TVT = _VectorTraits<_Tp>> 1632 inline _GLIBCXX_CONST static typename _TVT::type 1633 _S_bit_shift_left(_Tp 
__xx, typename _TVT::type __y) 1634 { 1635 using _V = typename _TVT::type; 1636 using _Up = typename _TVT::value_type; 1637 _V __x = __xx; 1638 [[maybe_unused]] const auto __ix = __to_intrin(__x); 1639 [[maybe_unused]] const auto __iy = __to_intrin(__y); 1640 if (__builtin_is_constant_evaluated()) 1641 return __x << __y; 1642 #if __cplusplus > 201703 1643 // after C++17, signed shifts have no UB, and behave just like unsigned 1644 // shifts 1645 else if constexpr (is_signed_v<_Up>) 1646 return __vector_bitcast<_Up>( 1647 _S_bit_shift_left(__vector_bitcast<make_unsigned_t<_Up>>(__x), 1648 __vector_bitcast<make_unsigned_t<_Up>>(__y))); 1649 #endif 1650 else if constexpr (sizeof(_Up) == 1) 1651 { 1652 if constexpr (sizeof __ix == 64 && __have_avx512bw) 1653 return __vector_bitcast<_Up>(__concat( 1654 _mm512_cvtepi16_epi8( 1655 _mm512_sllv_epi16(_mm512_cvtepu8_epi16(__lo256(__ix)), 1656 _mm512_cvtepu8_epi16(__lo256(__iy)))), 1657 _mm512_cvtepi16_epi8( 1658 _mm512_sllv_epi16(_mm512_cvtepu8_epi16(__hi256(__ix)), 1659 _mm512_cvtepu8_epi16(__hi256(__iy)))))); 1660 else if constexpr (sizeof __ix == 32 && __have_avx512bw) 1661 return __vector_bitcast<_Up>(_mm512_cvtepi16_epi8( 1662 _mm512_sllv_epi16(_mm512_cvtepu8_epi16(__ix), 1663 _mm512_cvtepu8_epi16(__iy)))); 1664 else if constexpr (sizeof __x <= 8 && __have_avx512bw_vl) 1665 return __intrin_bitcast<_V>( 1666 _mm_cvtepi16_epi8(_mm_sllv_epi16(_mm_cvtepu8_epi16(__ix), 1667 _mm_cvtepu8_epi16(__iy)))); 1668 else if constexpr (sizeof __ix == 16 && __have_avx512bw_vl) 1669 return __intrin_bitcast<_V>(_mm256_cvtepi16_epi8( 1670 _mm256_sllv_epi16(_mm256_cvtepu8_epi16(__ix), 1671 _mm256_cvtepu8_epi16(__iy)))); 1672 else if constexpr (sizeof __ix == 16 && __have_avx512bw) 1673 return __intrin_bitcast<_V>( 1674 __lo128(_mm512_cvtepi16_epi8(_mm512_sllv_epi16( 1675 _mm512_cvtepu8_epi16(_mm256_castsi128_si256(__ix)), 1676 _mm512_cvtepu8_epi16(_mm256_castsi128_si256(__iy)))))); 1677 else if constexpr (__have_sse4_1 && sizeof(__x) == 16) 1678 { 1679 auto __mask 1680 = __vector_bitcast<_Up>(__vector_bitcast<short>(__y) << 5); 1681 auto __x4 1682 = __vector_bitcast<_Up>(__vector_bitcast<short>(__x) << 4); 1683 __x4 &= char(0xf0); 1684 __x = reinterpret_cast<_V>(_CommonImplX86::_S_blend_intrin( 1685 __to_intrin(__mask), __to_intrin(__x), __to_intrin(__x4))); 1686 __mask += __mask; 1687 auto __x2 1688 = __vector_bitcast<_Up>(__vector_bitcast<short>(__x) << 2); 1689 __x2 &= char(0xfc); 1690 __x = reinterpret_cast<_V>(_CommonImplX86::_S_blend_intrin( 1691 __to_intrin(__mask), __to_intrin(__x), __to_intrin(__x2))); 1692 __mask += __mask; 1693 auto __x1 = __x + __x; 1694 __x = reinterpret_cast<_V>(_CommonImplX86::_S_blend_intrin( 1695 __to_intrin(__mask), __to_intrin(__x), __to_intrin(__x1))); 1696 return __x 1697 & ((__y & char(0xf8)) == 0); // y > 7 nulls the result 1698 } 1699 else if constexpr (sizeof(__x) == 16) 1700 { 1701 auto __mask 1702 = __vector_bitcast<_UChar>(__vector_bitcast<short>(__y) << 5); 1703 auto __x4 1704 = __vector_bitcast<_Up>(__vector_bitcast<short>(__x) << 4); 1705 __x4 &= char(0xf0); 1706 __x = __vector_bitcast<_SChar>(__mask) < 0 ? __x4 : __x; 1707 __mask += __mask; 1708 auto __x2 1709 = __vector_bitcast<_Up>(__vector_bitcast<short>(__x) << 2); 1710 __x2 &= char(0xfc); 1711 __x = __vector_bitcast<_SChar>(__mask) < 0 ? __x2 : __x; 1712 __mask += __mask; 1713 auto __x1 = __x + __x; 1714 __x = __vector_bitcast<_SChar>(__mask) < 0 ? 
__x1 : __x; 1715 return __x 1716 & ((__y & char(0xf8)) == 0); // y > 7 nulls the result 1717 } 1718 else 1719 return __x << __y; 1720 } 1721 else if constexpr (sizeof(_Up) == 2) 1722 { 1723 if constexpr (sizeof __ix == 64 && __have_avx512bw) 1724 return __vector_bitcast<_Up>(_mm512_sllv_epi16(__ix, __iy)); 1725 else if constexpr (sizeof __ix == 32 && __have_avx512bw_vl) 1726 return __vector_bitcast<_Up>(_mm256_sllv_epi16(__ix, __iy)); 1727 else if constexpr (sizeof __ix == 32 && __have_avx512bw) 1728 return __vector_bitcast<_Up>( 1729 __lo256(_mm512_sllv_epi16(_mm512_castsi256_si512(__ix), 1730 _mm512_castsi256_si512(__iy)))); 1731 else if constexpr (sizeof __ix == 32 && __have_avx2) 1732 { 1733 const auto __ux = __vector_bitcast<unsigned>(__x); 1734 const auto __uy = __vector_bitcast<unsigned>(__y); 1735 return __vector_bitcast<_Up>(_mm256_blend_epi16( 1736 __auto_bitcast(__ux << (__uy & 0x0000ffffu)), 1737 __auto_bitcast((__ux & 0xffff0000u) << (__uy >> 16)), 0xaa)); 1738 } 1739 else if constexpr (sizeof __ix == 16 && __have_avx512bw_vl) 1740 return __intrin_bitcast<_V>(_mm_sllv_epi16(__ix, __iy)); 1741 else if constexpr (sizeof __ix == 16 && __have_avx512bw) 1742 return __intrin_bitcast<_V>( 1743 __lo128(_mm512_sllv_epi16(_mm512_castsi128_si512(__ix), 1744 _mm512_castsi128_si512(__iy)))); 1745 else if constexpr (sizeof __ix == 16 && __have_avx2) 1746 { 1747 const auto __ux = __vector_bitcast<unsigned>(__ix); 1748 const auto __uy = __vector_bitcast<unsigned>(__iy); 1749 return __intrin_bitcast<_V>(_mm_blend_epi16( 1750 __auto_bitcast(__ux << (__uy & 0x0000ffffu)), 1751 __auto_bitcast((__ux & 0xffff0000u) << (__uy >> 16)), 0xaa)); 1752 } 1753 else if constexpr (sizeof __ix == 16) 1754 { 1755 using _Float4 = __vector_type_t<float, 4>; 1756 using _Int4 = __vector_type_t<int, 4>; 1757 using _UInt4 = __vector_type_t<unsigned, 4>; 1758 const _UInt4 __yu 1759 = reinterpret_cast<_UInt4>(__to_intrin(__y + (0x3f8 >> 3))); 1760 return __x 1761 * __intrin_bitcast<_V>( 1762 __vector_convert<_Int4>(_SimdWrapper<float, 4>( 1763 reinterpret_cast<_Float4>(__yu << 23))) 1764 | (__vector_convert<_Int4>(_SimdWrapper<float, 4>( 1765 reinterpret_cast<_Float4>((__yu >> 16) << 23))) 1766 << 16)); 1767 } 1768 else 1769 __assert_unreachable<_Tp>(); 1770 } 1771 else if constexpr (sizeof(_Up) == 4 && sizeof __ix == 16 1772 && !__have_avx2) 1773 // latency is suboptimal, but throughput is at full speedup 1774 return __intrin_bitcast<_V>( 1775 __vector_bitcast<unsigned>(__ix) 1776 * __vector_convert<__vector_type16_t<int>>( 1777 _SimdWrapper<float, 4>(__vector_bitcast<float>( 1778 (__vector_bitcast<unsigned, 4>(__y) << 23) + 0x3f80'0000)))); 1779 else if constexpr (sizeof(_Up) == 8 && sizeof __ix == 16 1780 && !__have_avx2) 1781 { 1782 const auto __lo = _mm_sll_epi64(__ix, __iy); 1783 const auto __hi 1784 = _mm_sll_epi64(__ix, _mm_unpackhi_epi64(__iy, __iy)); 1785 if constexpr (__have_sse4_1) 1786 return __vector_bitcast<_Up>(_mm_blend_epi16(__lo, __hi, 0xf0)); 1787 else 1788 return __vector_bitcast<_Up>( 1789 _mm_move_sd(__vector_bitcast<double>(__hi), 1790 __vector_bitcast<double>(__lo))); 1791 } 1792 else 1793 return __x << __y; 1794 } 1795 #endif // _GLIBCXX_SIMD_NO_SHIFT_OPT 1796 1797 // }}} 1798 // _S_bit_shift_right {{{ 1799 #ifndef _GLIBCXX_SIMD_NO_SHIFT_OPT 1800 template <typename _Tp, typename _TVT = _VectorTraits<_Tp>> 1801 inline _GLIBCXX_CONST static typename _TVT::type 1802 _S_bit_shift_right(_Tp __xx, int __y) 1803 { 1804 using _V = typename _TVT::type; 1805 using _Up = typename _TVT::value_type; 1806 
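      // Strategy note (added for clarity, derived from the code below): GCC
      // already emits good code for 16-bit, 32-bit, and unsigned 64-bit
      // elements, so only 8-bit elements and signed 64-bit elements are
      // special-cased here. Bytes are shifted in 16-bit lanes with the bits
      // that crossed an element boundary masked off; signed 64-bit values
      // are composed from 32-bit arithmetic shifts (to replicate the sign
      // bits) and 64-bit logical shifts.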
_V __x = __xx; 1807 [[maybe_unused]] const auto __ix = __to_intrin(__x); 1808 if (__builtin_is_constant_evaluated()) 1809 return __x >> __y; 1810 else if (__builtin_constant_p(__y) 1811 && is_unsigned_v< 1812 _Up> && __y >= int(sizeof(_Up) * __CHAR_BIT__)) 1813 return _V(); 1814 else if constexpr (sizeof(_Up) == 1 && is_unsigned_v<_Up>) //{{{ 1815 return __intrin_bitcast<_V>(__vector_bitcast<_UShort>(__ix) >> __y) 1816 & _Up(0xff >> __y); 1817 //}}} 1818 else if constexpr (sizeof(_Up) == 1 && is_signed_v<_Up>) //{{{ 1819 return __intrin_bitcast<_V>( 1820 (__vector_bitcast<_UShort>(__vector_bitcast<short>(__ix) 1821 >> (__y + 8)) 1822 << 8) 1823 | (__vector_bitcast<_UShort>( 1824 __vector_bitcast<short>(__vector_bitcast<_UShort>(__ix) << 8) 1825 >> __y) 1826 >> 8)); 1827 //}}} 1828 // GCC optimizes sizeof == 2, 4, and unsigned 8 as expected 1829 else if constexpr (sizeof(_Up) == 8 && is_signed_v<_Up>) //{{{ 1830 { 1831 if (__y > 32) 1832 return (__intrin_bitcast<_V>(__vector_bitcast<int>(__ix) >> 32) 1833 & _Up(0xffff'ffff'0000'0000ull)) 1834 | __vector_bitcast<_Up>( 1835 __vector_bitcast<int>(__vector_bitcast<_ULLong>(__ix) 1836 >> 32) 1837 >> (__y - 32)); 1838 else 1839 return __intrin_bitcast<_V>(__vector_bitcast<_ULLong>(__ix) 1840 >> __y) 1841 | __vector_bitcast<_Up>( 1842 __vector_bitcast<int>(__ix & -0x8000'0000'0000'0000ll) 1843 >> __y); 1844 } 1845 //}}} 1846 else 1847 return __x >> __y; 1848 } 1849 1850 template <typename _Tp, typename _TVT = _VectorTraits<_Tp>> 1851 inline _GLIBCXX_CONST static typename _TVT::type 1852 _S_bit_shift_right(_Tp __xx, typename _TVT::type __y) 1853 { 1854 using _V = typename _TVT::type; 1855 using _Up = typename _TVT::value_type; 1856 _V __x = __xx; 1857 [[maybe_unused]] const auto __ix = __to_intrin(__x); 1858 [[maybe_unused]] const auto __iy = __to_intrin(__y); 1859 if (__builtin_is_constant_evaluated() 1860 || (__builtin_constant_p(__x) && __builtin_constant_p(__y))) 1861 return __x >> __y; 1862 else if constexpr (sizeof(_Up) == 1) //{{{ 1863 { 1864 if constexpr (sizeof(__x) <= 8 && __have_avx512bw_vl) 1865 return __intrin_bitcast<_V>(_mm_cvtepi16_epi8( 1866 is_signed_v<_Up> ? _mm_srav_epi16(_mm_cvtepi8_epi16(__ix), 1867 _mm_cvtepi8_epi16(__iy)) 1868 : _mm_srlv_epi16(_mm_cvtepu8_epi16(__ix), 1869 _mm_cvtepu8_epi16(__iy)))); 1870 if constexpr (sizeof(__x) == 16 && __have_avx512bw_vl) 1871 return __intrin_bitcast<_V>(_mm256_cvtepi16_epi8( 1872 is_signed_v<_Up> 1873 ? _mm256_srav_epi16(_mm256_cvtepi8_epi16(__ix), 1874 _mm256_cvtepi8_epi16(__iy)) 1875 : _mm256_srlv_epi16(_mm256_cvtepu8_epi16(__ix), 1876 _mm256_cvtepu8_epi16(__iy)))); 1877 else if constexpr (sizeof(__x) == 32 && __have_avx512bw) 1878 return __vector_bitcast<_Up>(_mm512_cvtepi16_epi8( 1879 is_signed_v<_Up> 1880 ? 
_mm512_srav_epi16(_mm512_cvtepi8_epi16(__ix), 1881 _mm512_cvtepi8_epi16(__iy)) 1882 : _mm512_srlv_epi16(_mm512_cvtepu8_epi16(__ix), 1883 _mm512_cvtepu8_epi16(__iy)))); 1884 else if constexpr (sizeof(__x) == 64 && is_signed_v<_Up>) 1885 return __vector_bitcast<_Up>(_mm512_mask_mov_epi8( 1886 _mm512_srav_epi16(__ix, _mm512_srli_epi16(__iy, 8)), 1887 0x5555'5555'5555'5555ull, 1888 _mm512_srav_epi16( 1889 _mm512_slli_epi16(__ix, 8), 1890 _mm512_maskz_add_epi8(0x5555'5555'5555'5555ull, __iy, 1891 _mm512_set1_epi16(8))))); 1892 else if constexpr (sizeof(__x) == 64 && is_unsigned_v<_Up>) 1893 return __vector_bitcast<_Up>(_mm512_mask_mov_epi8( 1894 _mm512_srlv_epi16(__ix, _mm512_srli_epi16(__iy, 8)), 1895 0x5555'5555'5555'5555ull, 1896 _mm512_srlv_epi16( 1897 _mm512_maskz_mov_epi8(0x5555'5555'5555'5555ull, __ix), 1898 _mm512_maskz_mov_epi8(0x5555'5555'5555'5555ull, __iy)))); 1899 /* This has better throughput but higher latency than the impl below 1900 else if constexpr (__have_avx2 && sizeof(__x) == 16 && 1901 is_unsigned_v<_Up>) 1902 { 1903 const auto __shorts = __to_intrin(_S_bit_shift_right( 1904 __vector_bitcast<_UShort>(_mm256_cvtepu8_epi16(__ix)), 1905 __vector_bitcast<_UShort>(_mm256_cvtepu8_epi16(__iy)))); 1906 return __vector_bitcast<_Up>( 1907 _mm_packus_epi16(__lo128(__shorts), __hi128(__shorts))); 1908 } 1909 */ 1910 else if constexpr (__have_avx2 && sizeof(__x) > 8) 1911 // the following uses vpsr[al]vd, which requires AVX2 1912 if constexpr (is_signed_v<_Up>) 1913 { 1914 const auto r3 = __vector_bitcast<_UInt>( 1915 (__vector_bitcast<int>(__x) 1916 >> (__vector_bitcast<_UInt>(__y) >> 24))) 1917 & 0xff000000u; 1918 const auto r2 1919 = __vector_bitcast<_UInt>( 1920 ((__vector_bitcast<int>(__x) << 8) 1921 >> ((__vector_bitcast<_UInt>(__y) << 8) >> 24))) 1922 & 0xff000000u; 1923 const auto r1 1924 = __vector_bitcast<_UInt>( 1925 ((__vector_bitcast<int>(__x) << 16) 1926 >> ((__vector_bitcast<_UInt>(__y) << 16) >> 24))) 1927 & 0xff000000u; 1928 const auto r0 = __vector_bitcast<_UInt>( 1929 (__vector_bitcast<int>(__x) << 24) 1930 >> ((__vector_bitcast<_UInt>(__y) << 24) >> 24)); 1931 return __vector_bitcast<_Up>(r3 | (r2 >> 8) | (r1 >> 16) 1932 | (r0 >> 24)); 1933 } 1934 else 1935 { 1936 const auto r3 = (__vector_bitcast<_UInt>(__x) 1937 >> (__vector_bitcast<_UInt>(__y) >> 24)) 1938 & 0xff000000u; 1939 const auto r2 1940 = ((__vector_bitcast<_UInt>(__x) << 8) 1941 >> ((__vector_bitcast<_UInt>(__y) << 8) >> 24)) 1942 & 0xff000000u; 1943 const auto r1 1944 = ((__vector_bitcast<_UInt>(__x) << 16) 1945 >> ((__vector_bitcast<_UInt>(__y) << 16) >> 24)) 1946 & 0xff000000u; 1947 const auto r0 1948 = (__vector_bitcast<_UInt>(__x) << 24) 1949 >> ((__vector_bitcast<_UInt>(__y) << 24) >> 24); 1950 return __vector_bitcast<_Up>(r3 | (r2 >> 8) | (r1 >> 16) 1951 | (r0 >> 24)); 1952 } 1953 else if constexpr (__have_sse4_1 1954 && is_unsigned_v<_Up> && sizeof(__x) > 2) 1955 { 1956 auto __x128 = __vector_bitcast<_Up>(__ix); 1957 auto __mask 1958 = __vector_bitcast<_Up>(__vector_bitcast<_UShort>(__iy) << 5); 1959 auto __x4 = __vector_bitcast<_Up>( 1960 (__vector_bitcast<_UShort>(__x128) >> 4) & _UShort(0xff0f)); 1961 __x128 = __vector_bitcast<_Up>(_CommonImplX86::_S_blend_intrin( 1962 __to_intrin(__mask), __to_intrin(__x128), __to_intrin(__x4))); 1963 __mask += __mask; 1964 auto __x2 = __vector_bitcast<_Up>( 1965 (__vector_bitcast<_UShort>(__x128) >> 2) & _UShort(0xff3f)); 1966 __x128 = __vector_bitcast<_Up>(_CommonImplX86::_S_blend_intrin( 1967 __to_intrin(__mask), __to_intrin(__x128), __to_intrin(__x2))); 
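              // last of the three conditional steps: double the mask once
              // more so that bit 0 of the per-element shift count reaches
              // the byte's sign bit, then shift by 1 where it is set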
1968 __mask += __mask; 1969 auto __x1 = __vector_bitcast<_Up>( 1970 (__vector_bitcast<_UShort>(__x128) >> 1) & _UShort(0xff7f)); 1971 __x128 = __vector_bitcast<_Up>(_CommonImplX86::_S_blend_intrin( 1972 __to_intrin(__mask), __to_intrin(__x128), __to_intrin(__x1))); 1973 return __intrin_bitcast<_V>( 1974 __x128 1975 & ((__vector_bitcast<_Up>(__iy) & char(0xf8)) 1976 == 0)); // y > 7 nulls the result 1977 } 1978 else if constexpr (__have_sse4_1 1979 && is_signed_v<_Up> && sizeof(__x) > 2) 1980 { 1981 auto __mask = __vector_bitcast<_UChar>( 1982 __vector_bitcast<_UShort>(__iy) << 5); 1983 auto __maskl = [&]() { 1984 return __to_intrin(__vector_bitcast<_UShort>(__mask) << 8); 1985 }; 1986 auto __xh = __vector_bitcast<short>(__ix); 1987 auto __xl = __vector_bitcast<short>(__ix) << 8; 1988 auto __xh4 = __xh >> 4; 1989 auto __xl4 = __xl >> 4; 1990 __xh = __vector_bitcast<short>(_CommonImplX86::_S_blend_intrin( 1991 __to_intrin(__mask), __to_intrin(__xh), __to_intrin(__xh4))); 1992 __xl = __vector_bitcast<short>( 1993 _CommonImplX86::_S_blend_intrin(__maskl(), __to_intrin(__xl), 1994 __to_intrin(__xl4))); 1995 __mask += __mask; 1996 auto __xh2 = __xh >> 2; 1997 auto __xl2 = __xl >> 2; 1998 __xh = __vector_bitcast<short>(_CommonImplX86::_S_blend_intrin( 1999 __to_intrin(__mask), __to_intrin(__xh), __to_intrin(__xh2))); 2000 __xl = __vector_bitcast<short>( 2001 _CommonImplX86::_S_blend_intrin(__maskl(), __to_intrin(__xl), 2002 __to_intrin(__xl2))); 2003 __mask += __mask; 2004 auto __xh1 = __xh >> 1; 2005 auto __xl1 = __xl >> 1; 2006 __xh = __vector_bitcast<short>(_CommonImplX86::_S_blend_intrin( 2007 __to_intrin(__mask), __to_intrin(__xh), __to_intrin(__xh1))); 2008 __xl = __vector_bitcast<short>( 2009 _CommonImplX86::_S_blend_intrin(__maskl(), __to_intrin(__xl), 2010 __to_intrin(__xl1))); 2011 return __intrin_bitcast<_V>( 2012 (__vector_bitcast<_Up>((__xh & short(0xff00))) 2013 | __vector_bitcast<_Up>(__vector_bitcast<_UShort>(__xl) 2014 >> 8)) 2015 & ((__vector_bitcast<_Up>(__iy) & char(0xf8)) 2016 == 0)); // y > 7 nulls the result 2017 } 2018 else if constexpr (is_unsigned_v<_Up> && sizeof(__x) > 2) // SSE2 2019 { 2020 auto __mask 2021 = __vector_bitcast<_Up>(__vector_bitcast<_UShort>(__y) << 5); 2022 auto __x4 = __vector_bitcast<_Up>( 2023 (__vector_bitcast<_UShort>(__x) >> 4) & _UShort(0xff0f)); 2024 __x = __mask > 0x7f ? __x4 : __x; 2025 __mask += __mask; 2026 auto __x2 = __vector_bitcast<_Up>( 2027 (__vector_bitcast<_UShort>(__x) >> 2) & _UShort(0xff3f)); 2028 __x = __mask > 0x7f ? __x2 : __x; 2029 __mask += __mask; 2030 auto __x1 = __vector_bitcast<_Up>( 2031 (__vector_bitcast<_UShort>(__x) >> 1) & _UShort(0xff7f)); 2032 __x = __mask > 0x7f ? __x1 : __x; 2033 return __x 2034 & ((__y & char(0xf8)) == 0); // y > 7 nulls the result 2035 } 2036 else if constexpr (sizeof(__x) > 2) // signed SSE2 2037 { 2038 static_assert(is_signed_v<_Up>); 2039 auto __maskh = __vector_bitcast<_UShort>(__y) << 5; 2040 auto __maskl = __vector_bitcast<_UShort>(__y) << (5 + 8); 2041 auto __xh = __vector_bitcast<short>(__x); 2042 auto __xl = __vector_bitcast<short>(__x) << 8; 2043 auto __xh4 = __xh >> 4; 2044 auto __xl4 = __xl >> 4; 2045 __xh = __maskh > 0x7fff ? __xh4 : __xh; 2046 __xl = __maskl > 0x7fff ? __xl4 : __xl; 2047 __maskh += __maskh; 2048 __maskl += __maskl; 2049 auto __xh2 = __xh >> 2; 2050 auto __xl2 = __xl >> 2; 2051 __xh = __maskh > 0x7fff ? __xh2 : __xh; 2052 __xl = __maskl > 0x7fff ? 
__xl2 : __xl; 2053 __maskh += __maskh; 2054 __maskl += __maskl; 2055 auto __xh1 = __xh >> 1; 2056 auto __xl1 = __xl >> 1; 2057 __xh = __maskh > 0x7fff ? __xh1 : __xh; 2058 __xl = __maskl > 0x7fff ? __xl1 : __xl; 2059 __x = __vector_bitcast<_Up>((__xh & short(0xff00))) 2060 | __vector_bitcast<_Up>(__vector_bitcast<_UShort>(__xl) 2061 >> 8); 2062 return __x 2063 & ((__y & char(0xf8)) == 0); // y > 7 nulls the result 2064 } 2065 else 2066 return __x >> __y; 2067 } //}}} 2068 else if constexpr (sizeof(_Up) == 2 && sizeof(__x) >= 4) //{{{ 2069 { 2070 [[maybe_unused]] auto __blend_0xaa = [](auto __a, auto __b) { 2071 if constexpr (sizeof(__a) == 16) 2072 return _mm_blend_epi16(__to_intrin(__a), __to_intrin(__b), 2073 0xaa); 2074 else if constexpr (sizeof(__a) == 32) 2075 return _mm256_blend_epi16(__to_intrin(__a), __to_intrin(__b), 2076 0xaa); 2077 else if constexpr (sizeof(__a) == 64) 2078 return _mm512_mask_blend_epi16(0xaaaa'aaaaU, __to_intrin(__a), 2079 __to_intrin(__b)); 2080 else 2081 __assert_unreachable<decltype(__a)>(); 2082 }; 2083 if constexpr (__have_avx512bw_vl && sizeof(_Tp) <= 16) 2084 return __intrin_bitcast<_V>(is_signed_v<_Up> 2085 ? _mm_srav_epi16(__ix, __iy) 2086 : _mm_srlv_epi16(__ix, __iy)); 2087 else if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 32) 2088 return __vector_bitcast<_Up>(is_signed_v<_Up> 2089 ? _mm256_srav_epi16(__ix, __iy) 2090 : _mm256_srlv_epi16(__ix, __iy)); 2091 else if constexpr (__have_avx512bw && sizeof(_Tp) == 64) 2092 return __vector_bitcast<_Up>(is_signed_v<_Up> 2093 ? _mm512_srav_epi16(__ix, __iy) 2094 : _mm512_srlv_epi16(__ix, __iy)); 2095 else if constexpr (__have_avx2 && is_signed_v<_Up>) 2096 return __intrin_bitcast<_V>( 2097 __blend_0xaa(((__vector_bitcast<int>(__ix) << 16) 2098 >> (__vector_bitcast<int>(__iy) & 0xffffu)) 2099 >> 16, 2100 __vector_bitcast<int>(__ix) 2101 >> (__vector_bitcast<int>(__iy) >> 16))); 2102 else if constexpr (__have_avx2 && is_unsigned_v<_Up>) 2103 return __intrin_bitcast<_V>( 2104 __blend_0xaa((__vector_bitcast<_UInt>(__ix) & 0xffffu) 2105 >> (__vector_bitcast<_UInt>(__iy) & 0xffffu), 2106 __vector_bitcast<_UInt>(__ix) 2107 >> (__vector_bitcast<_UInt>(__iy) >> 16))); 2108 else if constexpr (__have_sse4_1) 2109 { 2110 auto __mask = __vector_bitcast<_UShort>(__iy); 2111 auto __x128 = __vector_bitcast<_Up>(__ix); 2112 //__mask *= 0x0808; 2113 __mask = (__mask << 3) | (__mask << 11); 2114 // do __x128 = 0 where __y[4] is set 2115 __x128 = __vector_bitcast<_Up>( 2116 _mm_blendv_epi8(__to_intrin(__x128), __m128i(), 2117 __to_intrin(__mask))); 2118 // do __x128 =>> 8 where __y[3] is set 2119 __x128 = __vector_bitcast<_Up>( 2120 _mm_blendv_epi8(__to_intrin(__x128), __to_intrin(__x128 >> 8), 2121 __to_intrin(__mask += __mask))); 2122 // do __x128 =>> 4 where __y[2] is set 2123 __x128 = __vector_bitcast<_Up>( 2124 _mm_blendv_epi8(__to_intrin(__x128), __to_intrin(__x128 >> 4), 2125 __to_intrin(__mask += __mask))); 2126 // do __x128 =>> 2 where __y[1] is set 2127 __x128 = __vector_bitcast<_Up>( 2128 _mm_blendv_epi8(__to_intrin(__x128), __to_intrin(__x128 >> 2), 2129 __to_intrin(__mask += __mask))); 2130 // do __x128 =>> 1 where __y[0] is set 2131 return __intrin_bitcast<_V>( 2132 _mm_blendv_epi8(__to_intrin(__x128), __to_intrin(__x128 >> 1), 2133 __to_intrin(__mask + __mask))); 2134 } 2135 else 2136 { 2137 auto __k = __vector_bitcast<_UShort>(__iy) << 11; 2138 auto __x128 = __vector_bitcast<_Up>(__ix); 2139 auto __mask = [](__vector_type16_t<_UShort> __kk) { 2140 return __vector_bitcast<short>(__kk) < 0; 2141 }; 2142 // do 
__x128 = 0 where __y[4] is set 2143 __x128 = __mask(__k) ? decltype(__x128)() : __x128; 2144 // do __x128 =>> 8 where __y[3] is set 2145 __x128 = __mask(__k += __k) ? __x128 >> 8 : __x128; 2146 // do __x128 =>> 4 where __y[2] is set 2147 __x128 = __mask(__k += __k) ? __x128 >> 4 : __x128; 2148 // do __x128 =>> 2 where __y[1] is set 2149 __x128 = __mask(__k += __k) ? __x128 >> 2 : __x128; 2150 // do __x128 =>> 1 where __y[0] is set 2151 return __intrin_bitcast<_V>(__mask(__k + __k) ? __x128 >> 1 2152 : __x128); 2153 } 2154 } //}}} 2155 else if constexpr (sizeof(_Up) == 4 && !__have_avx2) //{{{ 2156 { 2157 if constexpr (is_unsigned_v<_Up>) 2158 { 2159 // x >> y == x * 2^-y == (x * 2^(31-y)) >> 31 2160 const __m128 __factor_f = reinterpret_cast<__m128>( 2161 0x4f00'0000u - (__vector_bitcast<unsigned, 4>(__y) << 23)); 2162 const __m128i __factor 2163 = __builtin_constant_p(__factor_f) 2164 ? __to_intrin( 2165 __make_vector<unsigned>(__factor_f[0], __factor_f[1], 2166 __factor_f[2], __factor_f[3])) 2167 : _mm_cvttps_epi32(__factor_f); 2168 const auto __r02 2169 = _mm_srli_epi64(_mm_mul_epu32(__ix, __factor), 31); 2170 const auto __r13 = _mm_mul_epu32(_mm_srli_si128(__ix, 4), 2171 _mm_srli_si128(__factor, 4)); 2172 if constexpr (__have_sse4_1) 2173 return __intrin_bitcast<_V>( 2174 _mm_blend_epi16(_mm_slli_epi64(__r13, 1), __r02, 0x33)); 2175 else 2176 return __intrin_bitcast<_V>( 2177 __r02 | _mm_slli_si128(_mm_srli_epi64(__r13, 31), 4)); 2178 } 2179 else 2180 { 2181 auto __shift = [](auto __a, auto __b) { 2182 if constexpr (is_signed_v<_Up>) 2183 return _mm_sra_epi32(__a, __b); 2184 else 2185 return _mm_srl_epi32(__a, __b); 2186 }; 2187 const auto __r0 2188 = __shift(__ix, _mm_unpacklo_epi32(__iy, __m128i())); 2189 const auto __r1 = __shift(__ix, _mm_srli_epi64(__iy, 32)); 2190 const auto __r2 2191 = __shift(__ix, _mm_unpackhi_epi32(__iy, __m128i())); 2192 const auto __r3 = __shift(__ix, _mm_srli_si128(__iy, 12)); 2193 if constexpr (__have_sse4_1) 2194 return __intrin_bitcast<_V>( 2195 _mm_blend_epi16(_mm_blend_epi16(__r1, __r0, 0x3), 2196 _mm_blend_epi16(__r3, __r2, 0x30), 0xf0)); 2197 else 2198 return __intrin_bitcast<_V>(_mm_unpacklo_epi64( 2199 _mm_unpacklo_epi32(__r0, _mm_srli_si128(__r1, 4)), 2200 _mm_unpackhi_epi32(__r2, _mm_srli_si128(__r3, 4)))); 2201 } 2202 } //}}} 2203 else 2204 return __x >> __y; 2205 } 2206 #endif // _GLIBCXX_SIMD_NO_SHIFT_OPT 2207 2208 // }}} 2209 // compares {{{ 2210 // _S_equal_to {{{ 2211 template <typename _Tp, size_t _Np> 2212 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> _S_equal_to_SimdImplX862213 _S_equal_to(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 2214 { 2215 if constexpr (__is_avx512_abi<_Abi>()) // {{{ 2216 { 2217 if (__builtin_is_constant_evaluated() 2218 || (__x._M_is_constprop() && __y._M_is_constprop())) 2219 return _MaskImpl::_S_to_bits( 2220 __as_wrapper<_Np>(__x._M_data == __y._M_data)); 2221 2222 constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 2223 [[maybe_unused]] const auto __xi = __to_intrin(__x); 2224 [[maybe_unused]] const auto __yi = __to_intrin(__y); 2225 if constexpr (is_floating_point_v<_Tp>) 2226 { 2227 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 2228 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_EQ_OQ); 2229 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 2230 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_EQ_OQ); 2231 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 2232 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_EQ_OQ); 2233 
else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 2234 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_EQ_OQ); 2235 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 2236 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_EQ_OQ); 2237 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 2238 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_EQ_OQ); 2239 else 2240 __assert_unreachable<_Tp>(); 2241 } 2242 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 2243 return _mm512_mask_cmpeq_epi64_mask(__k1, __xi, __yi); 2244 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 2245 return _mm512_mask_cmpeq_epi32_mask(__k1, __xi, __yi); 2246 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 2) 2247 return _mm512_mask_cmpeq_epi16_mask(__k1, __xi, __yi); 2248 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 1) 2249 return _mm512_mask_cmpeq_epi8_mask(__k1, __xi, __yi); 2250 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 2251 return _mm256_mask_cmpeq_epi64_mask(__k1, __xi, __yi); 2252 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 2253 return _mm256_mask_cmpeq_epi32_mask(__k1, __xi, __yi); 2254 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 2) 2255 return _mm256_mask_cmpeq_epi16_mask(__k1, __xi, __yi); 2256 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 1) 2257 return _mm256_mask_cmpeq_epi8_mask(__k1, __xi, __yi); 2258 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 2259 return _mm_mask_cmpeq_epi64_mask(__k1, __xi, __yi); 2260 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 2261 return _mm_mask_cmpeq_epi32_mask(__k1, __xi, __yi); 2262 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 2) 2263 return _mm_mask_cmpeq_epi16_mask(__k1, __xi, __yi); 2264 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 1) 2265 return _mm_mask_cmpeq_epi8_mask(__k1, __xi, __yi); 2266 else 2267 __assert_unreachable<_Tp>(); 2268 } // }}} 2269 else if (__builtin_is_constant_evaluated()) 2270 return _Base::_S_equal_to(__x, __y); 2271 else if constexpr (sizeof(__x) == 8) // {{{ 2272 { 2273 const auto __r128 = __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__x) 2274 == __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__y); 2275 _MaskMember<_Tp> __r64; 2276 __builtin_memcpy(&__r64._M_data, &__r128, sizeof(__r64)); 2277 return __r64; 2278 } // }}} 2279 else 2280 return _Base::_S_equal_to(__x, __y); 2281 } 2282 2283 // }}} 2284 // _S_not_equal_to {{{ 2285 template <typename _Tp, size_t _Np> 2286 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> _S_not_equal_to_SimdImplX862287 _S_not_equal_to(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 2288 { 2289 if constexpr (__is_avx512_abi<_Abi>()) // {{{ 2290 { 2291 if (__builtin_is_constant_evaluated() 2292 || (__x._M_is_constprop() && __y._M_is_constprop())) 2293 return _MaskImpl::_S_to_bits( 2294 __as_wrapper<_Np>(__x._M_data != __y._M_data)); 2295 2296 constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 2297 [[maybe_unused]] const auto __xi = __to_intrin(__x); 2298 [[maybe_unused]] const auto __yi = __to_intrin(__y); 2299 if constexpr (is_floating_point_v<_Tp>) 2300 { 2301 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 2302 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_UQ); 2303 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 2304 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_UQ); 2305 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 2306 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, 
_CMP_NEQ_UQ); 2307 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 2308 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_UQ); 2309 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 2310 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_UQ); 2311 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 2312 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_UQ); 2313 else 2314 __assert_unreachable<_Tp>(); 2315 } 2316 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 2317 return ~_mm512_mask_cmpeq_epi64_mask(__k1, __xi, __yi); 2318 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 2319 return ~_mm512_mask_cmpeq_epi32_mask(__k1, __xi, __yi); 2320 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 2) 2321 return ~_mm512_mask_cmpeq_epi16_mask(__k1, __xi, __yi); 2322 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 1) 2323 return ~_mm512_mask_cmpeq_epi8_mask(__k1, __xi, __yi); 2324 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 2325 return ~_mm256_mask_cmpeq_epi64_mask(__k1, __xi, __yi); 2326 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 2327 return ~_mm256_mask_cmpeq_epi32_mask(__k1, __xi, __yi); 2328 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 2) 2329 return ~_mm256_mask_cmpeq_epi16_mask(__k1, __xi, __yi); 2330 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 1) 2331 return ~_mm256_mask_cmpeq_epi8_mask(__k1, __xi, __yi); 2332 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 2333 return ~_mm_mask_cmpeq_epi64_mask(__k1, __xi, __yi); 2334 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 2335 return ~_mm_mask_cmpeq_epi32_mask(__k1, __xi, __yi); 2336 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 2) 2337 return ~_mm_mask_cmpeq_epi16_mask(__k1, __xi, __yi); 2338 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 1) 2339 return ~_mm_mask_cmpeq_epi8_mask(__k1, __xi, __yi); 2340 else 2341 __assert_unreachable<_Tp>(); 2342 } // }}} 2343 else if constexpr (!__builtin_is_constant_evaluated() // {{{ 2344 && sizeof(__x) == 8) 2345 { 2346 const auto __r128 = __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__x) 2347 != __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__y); 2348 _MaskMember<_Tp> __r64; 2349 __builtin_memcpy(&__r64._M_data, &__r128, sizeof(__r64)); 2350 return __r64; 2351 } // }}} 2352 else 2353 return _Base::_S_not_equal_to(__x, __y); 2354 } 2355 2356 // }}} 2357 // _S_less {{{ 2358 template <typename _Tp, size_t _Np> 2359 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> _S_less_SimdImplX862360 _S_less(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 2361 { 2362 if constexpr (__is_avx512_abi<_Abi>()) // {{{ 2363 { 2364 if (__builtin_is_constant_evaluated() 2365 || (__x._M_is_constprop() && __y._M_is_constprop())) 2366 return _MaskImpl::_S_to_bits( 2367 __as_wrapper<_Np>(__x._M_data < __y._M_data)); 2368 2369 constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 2370 [[maybe_unused]] const auto __xi = __to_intrin(__x); 2371 [[maybe_unused]] const auto __yi = __to_intrin(__y); 2372 if constexpr (sizeof(__xi) == 64) 2373 { 2374 if constexpr (is_same_v<_Tp, float>) 2375 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OS); 2376 else if constexpr (is_same_v<_Tp, double>) 2377 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OS); 2378 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1) 2379 return _mm512_mask_cmplt_epi8_mask(__k1, __xi, __yi); 2380 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2) 2381 return 
_mm512_mask_cmplt_epi16_mask(__k1, __xi, __yi); 2382 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4) 2383 return _mm512_mask_cmplt_epi32_mask(__k1, __xi, __yi); 2384 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8) 2385 return _mm512_mask_cmplt_epi64_mask(__k1, __xi, __yi); 2386 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1) 2387 return _mm512_mask_cmplt_epu8_mask(__k1, __xi, __yi); 2388 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2) 2389 return _mm512_mask_cmplt_epu16_mask(__k1, __xi, __yi); 2390 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4) 2391 return _mm512_mask_cmplt_epu32_mask(__k1, __xi, __yi); 2392 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8) 2393 return _mm512_mask_cmplt_epu64_mask(__k1, __xi, __yi); 2394 else 2395 __assert_unreachable<_Tp>(); 2396 } 2397 else if constexpr (sizeof(__xi) == 32) 2398 { 2399 if constexpr (is_same_v<_Tp, float>) 2400 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OS); 2401 else if constexpr (is_same_v<_Tp, double>) 2402 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OS); 2403 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1) 2404 return _mm256_mask_cmplt_epi8_mask(__k1, __xi, __yi); 2405 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2) 2406 return _mm256_mask_cmplt_epi16_mask(__k1, __xi, __yi); 2407 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4) 2408 return _mm256_mask_cmplt_epi32_mask(__k1, __xi, __yi); 2409 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8) 2410 return _mm256_mask_cmplt_epi64_mask(__k1, __xi, __yi); 2411 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1) 2412 return _mm256_mask_cmplt_epu8_mask(__k1, __xi, __yi); 2413 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2) 2414 return _mm256_mask_cmplt_epu16_mask(__k1, __xi, __yi); 2415 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4) 2416 return _mm256_mask_cmplt_epu32_mask(__k1, __xi, __yi); 2417 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8) 2418 return _mm256_mask_cmplt_epu64_mask(__k1, __xi, __yi); 2419 else 2420 __assert_unreachable<_Tp>(); 2421 } 2422 else if constexpr (sizeof(__xi) == 16) 2423 { 2424 if constexpr (is_same_v<_Tp, float>) 2425 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OS); 2426 else if constexpr (is_same_v<_Tp, double>) 2427 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OS); 2428 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1) 2429 return _mm_mask_cmplt_epi8_mask(__k1, __xi, __yi); 2430 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2) 2431 return _mm_mask_cmplt_epi16_mask(__k1, __xi, __yi); 2432 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4) 2433 return _mm_mask_cmplt_epi32_mask(__k1, __xi, __yi); 2434 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8) 2435 return _mm_mask_cmplt_epi64_mask(__k1, __xi, __yi); 2436 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1) 2437 return _mm_mask_cmplt_epu8_mask(__k1, __xi, __yi); 2438 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2) 2439 return _mm_mask_cmplt_epu16_mask(__k1, __xi, __yi); 2440 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4) 2441 return _mm_mask_cmplt_epu32_mask(__k1, __xi, __yi); 2442 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8) 2443 return _mm_mask_cmplt_epu64_mask(__k1, __xi, __yi); 2444 else 2445 __assert_unreachable<_Tp>(); 2446 } 2447 else 2448 __assert_unreachable<_Tp>(); 2449 } // }}} 2450 else if constexpr (!__builtin_is_constant_evaluated() // {{{ 2451 
&& sizeof(__x) == 8) 2452 { 2453 const auto __r128 = __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__x) 2454 < __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__y); 2455 _MaskMember<_Tp> __r64; 2456 __builtin_memcpy(&__r64._M_data, &__r128, sizeof(__r64)); 2457 return __r64; 2458 } // }}} 2459 else 2460 return _Base::_S_less(__x, __y); 2461 } 2462 2463 // }}} 2464 // _S_less_equal {{{ 2465 template <typename _Tp, size_t _Np> 2466 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> _S_less_equal_SimdImplX862467 _S_less_equal(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 2468 { 2469 if constexpr (__is_avx512_abi<_Abi>()) // {{{ 2470 { 2471 if (__builtin_is_constant_evaluated() 2472 || (__x._M_is_constprop() && __y._M_is_constprop())) 2473 return _MaskImpl::_S_to_bits( 2474 __as_wrapper<_Np>(__x._M_data <= __y._M_data)); 2475 2476 constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 2477 [[maybe_unused]] const auto __xi = __to_intrin(__x); 2478 [[maybe_unused]] const auto __yi = __to_intrin(__y); 2479 if constexpr (sizeof(__xi) == 64) 2480 { 2481 if constexpr (is_same_v<_Tp, float>) 2482 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OS); 2483 else if constexpr (is_same_v<_Tp, double>) 2484 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OS); 2485 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1) 2486 return _mm512_mask_cmple_epi8_mask(__k1, __xi, __yi); 2487 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2) 2488 return _mm512_mask_cmple_epi16_mask(__k1, __xi, __yi); 2489 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4) 2490 return _mm512_mask_cmple_epi32_mask(__k1, __xi, __yi); 2491 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8) 2492 return _mm512_mask_cmple_epi64_mask(__k1, __xi, __yi); 2493 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1) 2494 return _mm512_mask_cmple_epu8_mask(__k1, __xi, __yi); 2495 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2) 2496 return _mm512_mask_cmple_epu16_mask(__k1, __xi, __yi); 2497 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4) 2498 return _mm512_mask_cmple_epu32_mask(__k1, __xi, __yi); 2499 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8) 2500 return _mm512_mask_cmple_epu64_mask(__k1, __xi, __yi); 2501 else 2502 __assert_unreachable<_Tp>(); 2503 } 2504 else if constexpr (sizeof(__xi) == 32) 2505 { 2506 if constexpr (is_same_v<_Tp, float>) 2507 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OS); 2508 else if constexpr (is_same_v<_Tp, double>) 2509 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OS); 2510 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1) 2511 return _mm256_mask_cmple_epi8_mask(__k1, __xi, __yi); 2512 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2) 2513 return _mm256_mask_cmple_epi16_mask(__k1, __xi, __yi); 2514 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4) 2515 return _mm256_mask_cmple_epi32_mask(__k1, __xi, __yi); 2516 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8) 2517 return _mm256_mask_cmple_epi64_mask(__k1, __xi, __yi); 2518 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1) 2519 return _mm256_mask_cmple_epu8_mask(__k1, __xi, __yi); 2520 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2) 2521 return _mm256_mask_cmple_epu16_mask(__k1, __xi, __yi); 2522 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4) 2523 return _mm256_mask_cmple_epu32_mask(__k1, __xi, __yi); 2524 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8) 2525 return 
_mm256_mask_cmple_epu64_mask(__k1, __xi, __yi); 2526 else 2527 __assert_unreachable<_Tp>(); 2528 } 2529 else if constexpr (sizeof(__xi) == 16) 2530 { 2531 if constexpr (is_same_v<_Tp, float>) 2532 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OS); 2533 else if constexpr (is_same_v<_Tp, double>) 2534 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OS); 2535 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1) 2536 return _mm_mask_cmple_epi8_mask(__k1, __xi, __yi); 2537 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2) 2538 return _mm_mask_cmple_epi16_mask(__k1, __xi, __yi); 2539 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4) 2540 return _mm_mask_cmple_epi32_mask(__k1, __xi, __yi); 2541 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8) 2542 return _mm_mask_cmple_epi64_mask(__k1, __xi, __yi); 2543 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1) 2544 return _mm_mask_cmple_epu8_mask(__k1, __xi, __yi); 2545 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2) 2546 return _mm_mask_cmple_epu16_mask(__k1, __xi, __yi); 2547 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4) 2548 return _mm_mask_cmple_epu32_mask(__k1, __xi, __yi); 2549 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8) 2550 return _mm_mask_cmple_epu64_mask(__k1, __xi, __yi); 2551 else 2552 __assert_unreachable<_Tp>(); 2553 } 2554 else 2555 __assert_unreachable<_Tp>(); 2556 } // }}} 2557 else if constexpr (!__builtin_is_constant_evaluated() // {{{ 2558 && sizeof(__x) == 8) 2559 { 2560 const auto __r128 = __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__x) 2561 <= __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__y); 2562 _MaskMember<_Tp> __r64; 2563 __builtin_memcpy(&__r64._M_data, &__r128, sizeof(__r64)); 2564 return __r64; 2565 } // }}} 2566 else 2567 return _Base::_S_less_equal(__x, __y); 2568 } 2569 2570 // }}} }}} 2571 // negation {{{ 2572 template <typename _Tp, size_t _Np> 2573 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> _S_negate_SimdImplX862574 _S_negate(_SimdWrapper<_Tp, _Np> __x) noexcept 2575 { 2576 if constexpr (__is_avx512_abi<_Abi>()) 2577 return _S_equal_to(__x, _SimdWrapper<_Tp, _Np>()); 2578 else 2579 return _Base::_S_negate(__x); 2580 } 2581 2582 // }}} 2583 // math {{{ 2584 using _Base::_S_abs; 2585 2586 // _S_sqrt {{{ 2587 template <typename _Tp, size_t _Np> 2588 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np> _S_sqrt_SimdImplX862589 _S_sqrt(_SimdWrapper<_Tp, _Np> __x) 2590 { 2591 if constexpr (__is_sse_ps<_Tp, _Np>()) 2592 return __auto_bitcast(_mm_sqrt_ps(__to_intrin(__x))); 2593 else if constexpr (__is_sse_pd<_Tp, _Np>()) 2594 return _mm_sqrt_pd(__x); 2595 else if constexpr (__is_avx_ps<_Tp, _Np>()) 2596 return _mm256_sqrt_ps(__x); 2597 else if constexpr (__is_avx_pd<_Tp, _Np>()) 2598 return _mm256_sqrt_pd(__x); 2599 else if constexpr (__is_avx512_ps<_Tp, _Np>()) 2600 return _mm512_sqrt_ps(__x); 2601 else if constexpr (__is_avx512_pd<_Tp, _Np>()) 2602 return _mm512_sqrt_pd(__x); 2603 else 2604 __assert_unreachable<_Tp>(); 2605 } 2606 2607 // }}} 2608 // _S_ldexp {{{ 2609 template <typename _Tp, size_t _Np> 2610 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np> _S_ldexp_SimdImplX862611 _S_ldexp(_SimdWrapper<_Tp, _Np> __x, 2612 __fixed_size_storage_t<int, _Np> __exp) 2613 { 2614 if constexpr (sizeof(__x) == 64 || __have_avx512vl) 2615 { 2616 const auto __xi = __to_intrin(__x); 2617 constexpr _SimdConverter<int, simd_abi::fixed_size<_Np>, _Tp, _Abi> 2618 __cvt; 2619 const auto __expi = __to_intrin(__cvt(__exp)); 2620 using _Up = 
__bool_storage_member_type_t<_Np>; 2621 constexpr _Up __k1 = _Np < sizeof(_Up) * __CHAR_BIT__ ? _Up((1ULL << _Np) - 1) : ~_Up(); 2622 if constexpr (sizeof(__xi) == 16) 2623 { 2624 if constexpr (sizeof(_Tp) == 8) 2625 return _mm_maskz_scalef_pd(__k1, __xi, __expi); 2626 else 2627 return _mm_maskz_scalef_ps(__k1, __xi, __expi); 2628 } 2629 else if constexpr (sizeof(__xi) == 32) 2630 { 2631 if constexpr (sizeof(_Tp) == 8) 2632 return _mm256_maskz_scalef_pd(__k1, __xi, __expi); 2633 else 2634 return _mm256_maskz_scalef_ps(__k1, __xi, __expi); 2635 } 2636 else 2637 { 2638 static_assert(sizeof(__xi) == 64); 2639 if constexpr (sizeof(_Tp) == 8) 2640 return _mm512_maskz_scalef_pd(__k1, __xi, __expi); 2641 else 2642 return _mm512_maskz_scalef_ps(__k1, __xi, __expi); 2643 } 2644 } 2645 else 2646 return _Base::_S_ldexp(__x, __exp); 2647 } 2648 2649 // }}} 2650 // _S_trunc {{{ 2651 template <typename _Tp, size_t _Np> 2652 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np> _S_trunc_SimdImplX862653 _S_trunc(_SimdWrapper<_Tp, _Np> __x) 2654 { 2655 if constexpr (__is_avx512_ps<_Tp, _Np>()) 2656 return _mm512_roundscale_ps(__x, 0x0b); 2657 else if constexpr (__is_avx512_pd<_Tp, _Np>()) 2658 return _mm512_roundscale_pd(__x, 0x0b); 2659 else if constexpr (__is_avx_ps<_Tp, _Np>()) 2660 return _mm256_round_ps(__x, 0xb); 2661 else if constexpr (__is_avx_pd<_Tp, _Np>()) 2662 return _mm256_round_pd(__x, 0xb); 2663 else if constexpr (__have_sse4_1 && __is_sse_ps<_Tp, _Np>()) 2664 return __auto_bitcast(_mm_round_ps(__to_intrin(__x), 0xb)); 2665 else if constexpr (__have_sse4_1 && __is_sse_pd<_Tp, _Np>()) 2666 return _mm_round_pd(__x, 0xb); 2667 else if constexpr (__is_sse_ps<_Tp, _Np>()) 2668 { 2669 auto __truncated 2670 = _mm_cvtepi32_ps(_mm_cvttps_epi32(__to_intrin(__x))); 2671 const auto __no_fractional_values 2672 = __vector_bitcast<int>(__vector_bitcast<_UInt>(__to_intrin(__x)) 2673 & 0x7f800000u) 2674 < 0x4b000000; // the exponent is so large that no mantissa bits 2675 // signify fractional values (0x3f8 + 23*8 = 2676 // 0x4b0) 2677 return __no_fractional_values ? __truncated : __to_intrin(__x); 2678 } 2679 else 2680 return _Base::_S_trunc(__x); 2681 } 2682 2683 // }}} 2684 // _S_round {{{ 2685 template <typename _Tp, size_t _Np> 2686 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np> _S_round_SimdImplX862687 _S_round(_SimdWrapper<_Tp, _Np> __x) 2688 { 2689 // Note that _MM_FROUND_TO_NEAREST_INT rounds ties to even, not away 2690 // from zero as required by std::round. Therefore this function is more 2691 // complicated. 
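      // Scalar sketch of the strategy used below (illustrative only):
      //   _Tp __t = std::trunc(__v);              // round toward zero
      //   if (std::fabs(__v - __t) >= _Tp(.5))    // fraction >= one half
      //     __t += std::copysign(_Tp(1), __v);    // step away from zero
      //   return __t;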
2692 using _V = __vector_type_t<_Tp, _Np>; 2693 _V __truncated; 2694 if constexpr (__is_avx512_ps<_Tp, _Np>()) 2695 __truncated = _mm512_roundscale_ps(__x._M_data, 0x0b); 2696 else if constexpr (__is_avx512_pd<_Tp, _Np>()) 2697 __truncated = _mm512_roundscale_pd(__x._M_data, 0x0b); 2698 else if constexpr (__is_avx_ps<_Tp, _Np>()) 2699 __truncated = _mm256_round_ps(__x._M_data, 2700 _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); 2701 else if constexpr (__is_avx_pd<_Tp, _Np>()) 2702 __truncated = _mm256_round_pd(__x._M_data, 2703 _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); 2704 else if constexpr (__have_sse4_1 && __is_sse_ps<_Tp, _Np>()) 2705 __truncated = __auto_bitcast( 2706 _mm_round_ps(__to_intrin(__x), 2707 _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)); 2708 else if constexpr (__have_sse4_1 && __is_sse_pd<_Tp, _Np>()) 2709 __truncated 2710 = _mm_round_pd(__x._M_data, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); 2711 else if constexpr (__is_sse_ps<_Tp, _Np>()) 2712 __truncated = __auto_bitcast( 2713 _mm_cvtepi32_ps(_mm_cvttps_epi32(__to_intrin(__x)))); 2714 else 2715 return _Base::_S_round(__x); 2716 2717 // x < 0 => truncated <= 0 && truncated >= x => x - truncated <= 0 2718 // x > 0 => truncated >= 0 && truncated <= x => x - truncated >= 0 2719 2720 const _V __rounded 2721 = __truncated 2722 + (__and(_S_absmask<_V>, __x._M_data - __truncated) >= _Tp(.5) 2723 ? __or(__and(_S_signmask<_V>, __x._M_data), _V() + 1) 2724 : _V()); 2725 if constexpr (__have_sse4_1) 2726 return __rounded; 2727 else // adjust for missing range in cvttps_epi32 2728 return __and(_S_absmask<_V>, __x._M_data) < 0x1p23f ? __rounded 2729 : __x._M_data; 2730 } 2731 2732 // }}} 2733 // _S_nearbyint {{{ 2734 template <typename _Tp, typename _TVT = _VectorTraits<_Tp>> _S_nearbyint_SimdImplX862735 _GLIBCXX_SIMD_INTRINSIC static _Tp _S_nearbyint(_Tp __x) noexcept 2736 { 2737 if constexpr (_TVT::template _S_is<float, 16>) 2738 return _mm512_roundscale_ps(__x, 0x0c); 2739 else if constexpr (_TVT::template _S_is<double, 8>) 2740 return _mm512_roundscale_pd(__x, 0x0c); 2741 else if constexpr (_TVT::template _S_is<float, 8>) 2742 return _mm256_round_ps(__x, 2743 _MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC); 2744 else if constexpr (_TVT::template _S_is<double, 4>) 2745 return _mm256_round_pd(__x, 2746 _MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC); 2747 else if constexpr (__have_sse4_1 && _TVT::template _S_is<float, 4>) 2748 return _mm_round_ps(__x, 2749 _MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC); 2750 else if constexpr (__have_sse4_1 && _TVT::template _S_is<double, 2>) 2751 return _mm_round_pd(__x, 2752 _MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC); 2753 else 2754 return _Base::_S_nearbyint(__x); 2755 } 2756 2757 // }}} 2758 // _S_rint {{{ 2759 template <typename _Tp, typename _TVT = _VectorTraits<_Tp>> _S_rint_SimdImplX862760 _GLIBCXX_SIMD_INTRINSIC static _Tp _S_rint(_Tp __x) noexcept 2761 { 2762 if constexpr (_TVT::template _S_is<float, 16>) 2763 return _mm512_roundscale_ps(__x, 0x04); 2764 else if constexpr (_TVT::template _S_is<double, 8>) 2765 return _mm512_roundscale_pd(__x, 0x04); 2766 else if constexpr (_TVT::template _S_is<float, 8>) 2767 return _mm256_round_ps(__x, _MM_FROUND_CUR_DIRECTION); 2768 else if constexpr (_TVT::template _S_is<double, 4>) 2769 return _mm256_round_pd(__x, _MM_FROUND_CUR_DIRECTION); 2770 else if constexpr (__have_sse4_1 && _TVT::template _S_is<float, 4>) 2771 return _mm_round_ps(__x, _MM_FROUND_CUR_DIRECTION); 2772 else if constexpr (__have_sse4_1 && _TVT::template _S_is<double, 2>) 2773 return 
_mm_round_pd(__x, _MM_FROUND_CUR_DIRECTION); 2774 else 2775 return _Base::_S_rint(__x); 2776 } 2777 2778 // }}} 2779 // _S_floor {{{ 2780 template <typename _Tp, size_t _Np> 2781 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np> _S_floor_SimdImplX862782 _S_floor(_SimdWrapper<_Tp, _Np> __x) 2783 { 2784 if constexpr (__is_avx512_ps<_Tp, _Np>()) 2785 return _mm512_roundscale_ps(__x, 0x09); 2786 else if constexpr (__is_avx512_pd<_Tp, _Np>()) 2787 return _mm512_roundscale_pd(__x, 0x09); 2788 else if constexpr (__is_avx_ps<_Tp, _Np>()) 2789 return _mm256_round_ps(__x, 0x9); 2790 else if constexpr (__is_avx_pd<_Tp, _Np>()) 2791 return _mm256_round_pd(__x, 0x9); 2792 else if constexpr (__have_sse4_1 && __is_sse_ps<_Tp, _Np>()) 2793 return __auto_bitcast(_mm_round_ps(__to_intrin(__x), 0x9)); 2794 else if constexpr (__have_sse4_1 && __is_sse_pd<_Tp, _Np>()) 2795 return _mm_round_pd(__x, 0x9); 2796 else 2797 return _Base::_S_floor(__x); 2798 } 2799 2800 // }}} 2801 // _S_ceil {{{ 2802 template <typename _Tp, size_t _Np> 2803 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np> _S_ceil_SimdImplX862804 _S_ceil(_SimdWrapper<_Tp, _Np> __x) 2805 { 2806 if constexpr (__is_avx512_ps<_Tp, _Np>()) 2807 return _mm512_roundscale_ps(__x, 0x0a); 2808 else if constexpr (__is_avx512_pd<_Tp, _Np>()) 2809 return _mm512_roundscale_pd(__x, 0x0a); 2810 else if constexpr (__is_avx_ps<_Tp, _Np>()) 2811 return _mm256_round_ps(__x, 0xa); 2812 else if constexpr (__is_avx_pd<_Tp, _Np>()) 2813 return _mm256_round_pd(__x, 0xa); 2814 else if constexpr (__have_sse4_1 && __is_sse_ps<_Tp, _Np>()) 2815 return __auto_bitcast(_mm_round_ps(__to_intrin(__x), 0xa)); 2816 else if constexpr (__have_sse4_1 && __is_sse_pd<_Tp, _Np>()) 2817 return _mm_round_pd(__x, 0xa); 2818 else 2819 return _Base::_S_ceil(__x); 2820 } 2821 2822 // }}} 2823 // _S_signbit {{{ 2824 template <typename _Tp, size_t _Np> 2825 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> _S_signbit_SimdImplX862826 _S_signbit(_SimdWrapper<_Tp, _Np> __x) 2827 { 2828 if constexpr (__is_avx512_abi<_Abi>() && __have_avx512dq) 2829 { 2830 if constexpr (sizeof(__x) == 64 && sizeof(_Tp) == 4) 2831 return _mm512_movepi32_mask( 2832 __intrin_bitcast<__m512i>(__x._M_data)); 2833 else if constexpr (sizeof(__x) == 64 && sizeof(_Tp) == 8) 2834 return _mm512_movepi64_mask( 2835 __intrin_bitcast<__m512i>(__x._M_data)); 2836 else if constexpr (sizeof(__x) == 32 && sizeof(_Tp) == 4) 2837 return _mm256_movepi32_mask( 2838 __intrin_bitcast<__m256i>(__x._M_data)); 2839 else if constexpr (sizeof(__x) == 32 && sizeof(_Tp) == 8) 2840 return _mm256_movepi64_mask( 2841 __intrin_bitcast<__m256i>(__x._M_data)); 2842 else if constexpr (sizeof(__x) <= 16 && sizeof(_Tp) == 4) 2843 return _mm_movepi32_mask(__intrin_bitcast<__m128i>(__x._M_data)); 2844 else if constexpr (sizeof(__x) <= 16 && sizeof(_Tp) == 8) 2845 return _mm_movepi64_mask(__intrin_bitcast<__m128i>(__x._M_data)); 2846 } 2847 else if constexpr (__is_avx512_abi<_Abi>()) 2848 { 2849 const auto __xi = __to_intrin(__x); 2850 [[maybe_unused]] constexpr auto __k1 2851 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 2852 if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 2853 return _mm_movemask_ps(__xi); 2854 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 2855 return _mm_movemask_pd(__xi); 2856 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 2857 return _mm256_movemask_ps(__xi); 2858 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 2859 return _mm256_movemask_pd(__xi); 2860 else if constexpr (sizeof(__xi) == 64 
&& sizeof(_Tp) == 4) 2861 return _mm512_mask_cmplt_epi32_mask( 2862 __k1, __intrin_bitcast<__m512i>(__xi), __m512i()); 2863 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 2864 return _mm512_mask_cmplt_epi64_mask( 2865 __k1, __intrin_bitcast<__m512i>(__xi), __m512i()); 2866 else 2867 __assert_unreachable<_Tp>(); 2868 } 2869 else 2870 return _Base::_S_signbit(__x); 2871 /*{ 2872 using _I = __int_for_sizeof_t<_Tp>; 2873 if constexpr (sizeof(__x) == 64) 2874 return _S_less(__vector_bitcast<_I>(__x), _I()); 2875 else 2876 { 2877 const auto __xx = __vector_bitcast<_I>(__x._M_data); 2878 [[maybe_unused]] constexpr _I __signmask = __finite_min_v<_I>; 2879 if constexpr ((sizeof(_Tp) == 4 && 2880 (__have_avx2 || sizeof(__x) == 16)) || 2881 __have_avx512vl) 2882 { 2883 return __vector_bitcast<_Tp>(__xx >> __digits_v<_I>); 2884 } 2885 else if constexpr ((__have_avx2 || 2886 (__have_ssse3 && sizeof(__x) == 16))) 2887 { 2888 return __vector_bitcast<_Tp>((__xx & __signmask) == 2889 __signmask); 2890 } 2891 else 2892 { // SSE2/3 or AVX (w/o AVX2) 2893 constexpr auto __one = __vector_broadcast<_Np, _Tp>(1); 2894 return __vector_bitcast<_Tp>( 2895 __vector_bitcast<_Tp>( 2896 (__xx & __signmask) | 2897 __vector_bitcast<_I>(__one)) // -1 or 1 2898 != __one); 2899 } 2900 } 2901 }*/ 2902 } 2903 2904 // }}} 2905 // _S_isnonzerovalue_mask {{{ 2906 // (isnormal | is subnormal == !isinf & !isnan & !is zero) 2907 template <typename _Tp> _S_isnonzerovalue_mask_SimdImplX862908 _GLIBCXX_SIMD_INTRINSIC static auto _S_isnonzerovalue_mask(_Tp __x) 2909 { 2910 using _Traits = _VectorTraits<_Tp>; 2911 if constexpr (__have_avx512dq_vl) 2912 { 2913 if constexpr (_Traits::template _S_is< 2914 float, 2> || _Traits::template _S_is<float, 4>) 2915 return _knot_mask8(_mm_fpclass_ps_mask(__to_intrin(__x), 0x9f)); 2916 else if constexpr (_Traits::template _S_is<float, 8>) 2917 return _knot_mask8(_mm256_fpclass_ps_mask(__x, 0x9f)); 2918 else if constexpr (_Traits::template _S_is<float, 16>) 2919 return _knot_mask16(_mm512_fpclass_ps_mask(__x, 0x9f)); 2920 else if constexpr (_Traits::template _S_is<double, 2>) 2921 return _knot_mask8(_mm_fpclass_pd_mask(__x, 0x9f)); 2922 else if constexpr (_Traits::template _S_is<double, 4>) 2923 return _knot_mask8(_mm256_fpclass_pd_mask(__x, 0x9f)); 2924 else if constexpr (_Traits::template _S_is<double, 8>) 2925 return _knot_mask8(_mm512_fpclass_pd_mask(__x, 0x9f)); 2926 else 2927 __assert_unreachable<_Tp>(); 2928 } 2929 else 2930 { 2931 using _Up = typename _Traits::value_type; 2932 constexpr size_t _Np = _Traits::_S_full_size; 2933 const auto __a = __x * __infinity_v<_Up>; // NaN if __x == 0 2934 const auto __b = __x * _Up(); // NaN if __x == inf 2935 if constexpr (__have_avx512vl && __is_sse_ps<_Up, _Np>()) 2936 return _mm_cmp_ps_mask(__to_intrin(__a), __to_intrin(__b), 2937 _CMP_ORD_Q); 2938 else if constexpr (__have_avx512f && __is_sse_ps<_Up, _Np>()) 2939 return __mmask8(0xf 2940 & _mm512_cmp_ps_mask(__auto_bitcast(__a), 2941 __auto_bitcast(__b), 2942 _CMP_ORD_Q)); 2943 else if constexpr (__have_avx512vl && __is_sse_pd<_Up, _Np>()) 2944 return _mm_cmp_pd_mask(__a, __b, _CMP_ORD_Q); 2945 else if constexpr (__have_avx512f && __is_sse_pd<_Up, _Np>()) 2946 return __mmask8(0x3 2947 & _mm512_cmp_pd_mask(__auto_bitcast(__a), 2948 __auto_bitcast(__b), 2949 _CMP_ORD_Q)); 2950 else if constexpr (__have_avx512vl && __is_avx_ps<_Up, _Np>()) 2951 return _mm256_cmp_ps_mask(__a, __b, _CMP_ORD_Q); 2952 else if constexpr (__have_avx512f && __is_avx_ps<_Up, _Np>()) 2953 return 
__mmask8(_mm512_cmp_ps_mask(__auto_bitcast(__a), 2954 __auto_bitcast(__b), 2955 _CMP_ORD_Q)); 2956 else if constexpr (__have_avx512vl && __is_avx_pd<_Up, _Np>()) 2957 return _mm256_cmp_pd_mask(__a, __b, _CMP_ORD_Q); 2958 else if constexpr (__have_avx512f && __is_avx_pd<_Up, _Np>()) 2959 return __mmask8(0xf 2960 & _mm512_cmp_pd_mask(__auto_bitcast(__a), 2961 __auto_bitcast(__b), 2962 _CMP_ORD_Q)); 2963 else if constexpr (__is_avx512_ps<_Up, _Np>()) 2964 return _mm512_cmp_ps_mask(__a, __b, _CMP_ORD_Q); 2965 else if constexpr (__is_avx512_pd<_Up, _Np>()) 2966 return _mm512_cmp_pd_mask(__a, __b, _CMP_ORD_Q); 2967 else 2968 __assert_unreachable<_Tp>(); 2969 } 2970 } 2971 2972 // }}} 2973 // _S_isfinite {{{ 2974 template <typename _Tp, size_t _Np> 2975 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> _S_isfinite_SimdImplX862976 _S_isfinite(_SimdWrapper<_Tp, _Np> __x) 2977 { 2978 static_assert(is_floating_point_v<_Tp>); 2979 #if !__FINITE_MATH_ONLY__ 2980 if constexpr (__is_avx512_abi<_Abi>() && __have_avx512dq) 2981 { 2982 const auto __xi = __to_intrin(__x); 2983 constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 2984 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 2985 return __k1 ^ _mm512_mask_fpclass_ps_mask(__k1, __xi, 0x99); 2986 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 2987 return __k1 ^ _mm512_mask_fpclass_pd_mask(__k1, __xi, 0x99); 2988 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 2989 return __k1 ^ _mm256_mask_fpclass_ps_mask(__k1, __xi, 0x99); 2990 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 2991 return __k1 ^ _mm256_mask_fpclass_pd_mask(__k1, __xi, 0x99); 2992 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 2993 return __k1 ^ _mm_mask_fpclass_ps_mask(__k1, __xi, 0x99); 2994 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 2995 return __k1 ^ _mm_mask_fpclass_pd_mask(__k1, __xi, 0x99); 2996 } 2997 else if constexpr (__is_avx512_abi<_Abi>()) 2998 { 2999 // if all exponent bits are set, __x is either inf or NaN 3000 using _I = __int_for_sizeof_t<_Tp>; 3001 const auto __inf = __vector_bitcast<_I>( 3002 __vector_broadcast<_Np>(__infinity_v<_Tp>)); 3003 return _S_less<_I, _Np>(__vector_bitcast<_I>(__x) & __inf, __inf); 3004 } 3005 else 3006 #endif 3007 return _Base::_S_isfinite(__x); 3008 } 3009 3010 // }}} 3011 // _S_isinf {{{ 3012 template <typename _Tp, size_t _Np> 3013 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> _S_isinf_SimdImplX863014 _S_isinf(_SimdWrapper<_Tp, _Np> __x) 3015 { 3016 #if !__FINITE_MATH_ONLY__ 3017 if constexpr (__is_avx512_abi<_Abi>() && __have_avx512dq) 3018 { 3019 const auto __xi = __to_intrin(__x); 3020 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 3021 return _mm512_fpclass_ps_mask(__xi, 0x18); 3022 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 3023 return _mm512_fpclass_pd_mask(__xi, 0x18); 3024 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3025 return _mm256_fpclass_ps_mask(__xi, 0x18); 3026 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3027 return _mm256_fpclass_pd_mask(__xi, 0x18); 3028 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3029 return _mm_fpclass_ps_mask(__xi, 0x18); 3030 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3031 return _mm_fpclass_pd_mask(__xi, 0x18); 3032 else 3033 __assert_unreachable<_Tp>(); 3034 } 3035 else if constexpr (__have_avx512dq_vl) 3036 { 3037 if constexpr (__is_sse_pd<_Tp, _Np>()) 3038 return _mm_movm_epi64(_mm_fpclass_pd_mask(__x, 0x18)); 3039 else if constexpr 
(__is_avx_pd<_Tp, _Np>()) 3040 return _mm256_movm_epi64(_mm256_fpclass_pd_mask(__x, 0x18)); 3041 else if constexpr (__is_sse_ps<_Tp, _Np>()) 3042 return _mm_movm_epi32( 3043 _mm_fpclass_ps_mask(__to_intrin(__x), 0x18)); 3044 else if constexpr (__is_avx_ps<_Tp, _Np>()) 3045 return _mm256_movm_epi32(_mm256_fpclass_ps_mask(__x, 0x18)); 3046 else 3047 __assert_unreachable<_Tp>(); 3048 } 3049 else 3050 #endif 3051 return _Base::_S_isinf(__x); 3052 } 3053 3054 // }}} 3055 // _S_isnormal {{{ 3056 template <typename _Tp, size_t _Np> 3057 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> _S_isnormal_SimdImplX863058 _S_isnormal(_SimdWrapper<_Tp, _Np> __x) 3059 { 3060 #if __FINITE_MATH_ONLY__ 3061 [[maybe_unused]] constexpr int __mode = 0x26; 3062 #else 3063 [[maybe_unused]] constexpr int __mode = 0xbf; 3064 #endif 3065 if constexpr (__is_avx512_abi<_Abi>() && __have_avx512dq) 3066 { 3067 const auto __xi = __to_intrin(__x); 3068 const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 3069 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 3070 return __k1 ^ _mm512_mask_fpclass_ps_mask(__k1, __xi, __mode); 3071 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 3072 return __k1 ^ _mm512_mask_fpclass_pd_mask(__k1, __xi, __mode); 3073 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3074 return __k1 ^ _mm256_mask_fpclass_ps_mask(__k1, __xi, __mode); 3075 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3076 return __k1 ^ _mm256_mask_fpclass_pd_mask(__k1, __xi, __mode); 3077 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3078 return __k1 ^ _mm_mask_fpclass_ps_mask(__k1, __xi, __mode); 3079 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3080 return __k1 ^ _mm_mask_fpclass_pd_mask(__k1, __xi, __mode); 3081 else 3082 __assert_unreachable<_Tp>(); 3083 } 3084 else if constexpr (__have_avx512dq) 3085 { 3086 if constexpr (__have_avx512vl && __is_sse_ps<_Tp, _Np>()) 3087 return _mm_movm_epi32( 3088 _knot_mask8(_mm_fpclass_ps_mask(__to_intrin(__x), __mode))); 3089 else if constexpr (__have_avx512vl && __is_avx_ps<_Tp, _Np>()) 3090 return _mm256_movm_epi32( 3091 _knot_mask8(_mm256_fpclass_ps_mask(__x, __mode))); 3092 else if constexpr (__is_avx512_ps<_Tp, _Np>()) 3093 return _knot_mask16(_mm512_fpclass_ps_mask(__x, __mode)); 3094 else if constexpr (__have_avx512vl && __is_sse_pd<_Tp, _Np>()) 3095 return _mm_movm_epi64( 3096 _knot_mask8(_mm_fpclass_pd_mask(__x, __mode))); 3097 else if constexpr (__have_avx512vl && __is_avx_pd<_Tp, _Np>()) 3098 return _mm256_movm_epi64( 3099 _knot_mask8(_mm256_fpclass_pd_mask(__x, __mode))); 3100 else if constexpr (__is_avx512_pd<_Tp, _Np>()) 3101 return _knot_mask8(_mm512_fpclass_pd_mask(__x, __mode)); 3102 else 3103 __assert_unreachable<_Tp>(); 3104 } 3105 else if constexpr (__is_avx512_abi<_Abi>()) 3106 { 3107 using _I = __int_for_sizeof_t<_Tp>; 3108 const auto absn = __vector_bitcast<_I>(_S_abs(__x)); 3109 const auto minn = __vector_bitcast<_I>( 3110 __vector_broadcast<_Np>(__norm_min_v<_Tp>)); 3111 #if __FINITE_MATH_ONLY__ 3112 return _S_less_equal<_I, _Np>(minn, absn); 3113 #else 3114 const auto infn 3115 = __vector_bitcast<_I>(__vector_broadcast<_Np>(__infinity_v<_Tp>)); 3116 return __and(_S_less_equal<_I, _Np>(minn, absn), 3117 _S_less<_I, _Np>(absn, infn)); 3118 #endif 3119 } 3120 else 3121 return _Base::_S_isnormal(__x); 3122 } 3123 3124 // }}} 3125 // _S_isnan {{{ 3126 template <typename _Tp, size_t _Np> 3127 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> _S_isnan_SimdImplX863128 _S_isnan(_SimdWrapper<_Tp, 
_Np> __x)
      { return _S_isunordered(__x, __x); }

    // }}}
    // _S_isunordered {{{
    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
      _S_isunordered([[maybe_unused]] _SimdWrapper<_Tp, _Np> __x,
		     [[maybe_unused]] _SimdWrapper<_Tp, _Np> __y)
      {
#if __FINITE_MATH_ONLY__
	return {}; // false
#else
	const auto __xi = __to_intrin(__x);
	const auto __yi = __to_intrin(__y);
	if constexpr (__is_avx512_abi<_Abi>())
	  {
	    constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
	    if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
	      return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_UNORD_Q);
	    else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
	      return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_UNORD_Q);
	    else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
	      return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_UNORD_Q);
	    else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
	      return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_UNORD_Q);
	    else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
	      return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_UNORD_Q);
	    else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
	      return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_UNORD_Q);
	  }
	else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
	  return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_UNORD_Q));
	else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
	  return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_UNORD_Q));
	else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
	  return __auto_bitcast(_mm_cmpunord_ps(__xi, __yi));
	else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
	  return __to_masktype(_mm_cmpunord_pd(__xi, __yi));
	else
	  __assert_unreachable<_Tp>();
#endif
      }

    // }}}
    // _S_isgreater {{{
    template <typename _Tp, size_t _Np>
      static constexpr _MaskMember<_Tp>
      _S_isgreater(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
      {
	const auto __xi = __to_intrin(__x);
	const auto __yi = __to_intrin(__y);
	if constexpr (__is_avx512_abi<_Abi>())
	  {
	    const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
	    if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
	      return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GT_OQ);
	    else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
	      return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GT_OQ);
	    else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
	      return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GT_OQ);
	    else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
	      return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GT_OQ);
	    else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
	      return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GT_OQ);
	    else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
	      return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GT_OQ);
	    else
	      __assert_unreachable<_Tp>();
	  }
	else if constexpr (__have_avx)
	  {
	    if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
	      return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_GT_OQ));
	    else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
	      return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_GT_OQ));
	    else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
	      return __auto_bitcast(_mm_cmp_ps(__xi, __yi, _CMP_GT_OQ));
	    else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
	      return __to_masktype(_mm_cmp_pd(__xi, __yi, _CMP_GT_OQ));
	    else
	      __assert_unreachable<_Tp>();
	  }
	else if constexpr (__have_sse2 && sizeof(__xi) == 16
			   && sizeof(_Tp) == 4)
	  {
	    // map the IEEE sign-magnitude bit pattern to a two's-complement
	    // ordering so that the integer compare agrees with the
	    // floating-point ordering; _mm_cmpord_ps masks out NaN lanes
	    const auto __xn = __vector_bitcast<int>(__xi);
	    const auto __yn = __vector_bitcast<int>(__yi);
	    const auto __xp = __xn < 0 ? -(__xn & 0x7fff'ffff) : __xn;
	    const auto __yp = __yn < 0 ? -(__yn & 0x7fff'ffff) : __yn;
	    return __auto_bitcast(
	      __and(__to_masktype(_mm_cmpord_ps(__xi, __yi)), __xp > __yp));
	  }
	else if constexpr (__have_sse2 && sizeof(__xi) == 16
			   && sizeof(_Tp) == 8)
	  return __vector_type_t<__int_with_sizeof_t<8>, 2>{
	    -_mm_ucomigt_sd(__xi, __yi),
	    -_mm_ucomigt_sd(_mm_unpackhi_pd(__xi, __xi),
			    _mm_unpackhi_pd(__yi, __yi))};
	else
	  return _Base::_S_isgreater(__x, __y);
      }

    // }}}
    // _S_isgreaterequal {{{
    template <typename _Tp, size_t _Np>
      static constexpr _MaskMember<_Tp>
      _S_isgreaterequal(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
      {
	const auto __xi = __to_intrin(__x);
	const auto __yi = __to_intrin(__y);
	if constexpr (__is_avx512_abi<_Abi>())
	  {
	    const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
	    if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
	      return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GE_OQ);
	    else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
	      return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GE_OQ);
	    else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
	      return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GE_OQ);
	    else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
	      return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GE_OQ);
	    else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
	      return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GE_OQ);
	    else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
	      return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GE_OQ);
	    else
	      __assert_unreachable<_Tp>();
	  }
	else if constexpr (__have_avx)
	  {
	    if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
	      return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_GE_OQ));
	    else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
	      return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_GE_OQ));
	    else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
	      return __auto_bitcast(_mm_cmp_ps(__xi, __yi, _CMP_GE_OQ));
	    else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
	      return __to_masktype(_mm_cmp_pd(__xi, __yi, _CMP_GE_OQ));
	    else
	      __assert_unreachable<_Tp>();
	  }
	else if constexpr (__have_sse2 && sizeof(__xi) == 16
			   && sizeof(_Tp) == 4)
	  {
	    const auto __xn = __vector_bitcast<int>(__xi);
	    const auto __yn = __vector_bitcast<int>(__yi);
	    const auto __xp = __xn < 0 ? -(__xn & 0x7fff'ffff) : __xn;
	    const auto __yp = __yn < 0 ?
-(__yn & 0x7fff'ffff) : __yn; 3277 return __auto_bitcast( 3278 __and(__to_masktype(_mm_cmpord_ps(__xi, __yi)), __xp >= __yp)); 3279 } 3280 else if constexpr (__have_sse2 && sizeof(__xi) == 16 3281 && sizeof(_Tp) == 8) 3282 return __vector_type_t<__int_with_sizeof_t<8>, 2>{ 3283 -_mm_ucomige_sd(__xi, __yi), 3284 -_mm_ucomige_sd(_mm_unpackhi_pd(__xi, __xi), 3285 _mm_unpackhi_pd(__yi, __yi))}; 3286 else 3287 return _Base::_S_isgreaterequal(__x, __y); 3288 } 3289 3290 // }}} 3291 // _S_isless {{{ 3292 template <typename _Tp, size_t _Np> _S_isless_SimdImplX863293 static constexpr _MaskMember<_Tp> _S_isless(_SimdWrapper<_Tp, _Np> __x, 3294 _SimdWrapper<_Tp, _Np> __y) 3295 { 3296 const auto __xi = __to_intrin(__x); 3297 const auto __yi = __to_intrin(__y); 3298 if constexpr (__is_avx512_abi<_Abi>()) 3299 { 3300 const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 3301 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 3302 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OQ); 3303 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 3304 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OQ); 3305 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3306 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OQ); 3307 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3308 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OQ); 3309 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3310 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OQ); 3311 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3312 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OQ); 3313 else 3314 __assert_unreachable<_Tp>(); 3315 } 3316 else if constexpr (__have_avx) 3317 { 3318 if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3319 return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_LT_OQ)); 3320 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3321 return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_LT_OQ)); 3322 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3323 return __auto_bitcast(_mm_cmp_ps(__xi, __yi, _CMP_LT_OQ)); 3324 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3325 return __to_masktype(_mm_cmp_pd(__xi, __yi, _CMP_LT_OQ)); 3326 else 3327 __assert_unreachable<_Tp>(); 3328 } 3329 else if constexpr (__have_sse2 && sizeof(__xi) == 16 3330 && sizeof(_Tp) == 4) 3331 { 3332 const auto __xn = __vector_bitcast<int>(__xi); 3333 const auto __yn = __vector_bitcast<int>(__yi); 3334 const auto __xp = __xn < 0 ? -(__xn & 0x7fff'ffff) : __xn; 3335 const auto __yp = __yn < 0 ? 
-(__yn & 0x7fff'ffff) : __yn; 3336 return __auto_bitcast( 3337 __and(__to_masktype(_mm_cmpord_ps(__xi, __yi)), __xp < __yp)); 3338 } 3339 else if constexpr (__have_sse2 && sizeof(__xi) == 16 3340 && sizeof(_Tp) == 8) 3341 return __vector_type_t<__int_with_sizeof_t<8>, 2>{ 3342 -_mm_ucomigt_sd(__yi, __xi), 3343 -_mm_ucomigt_sd(_mm_unpackhi_pd(__yi, __yi), 3344 _mm_unpackhi_pd(__xi, __xi))}; 3345 else 3346 return _Base::_S_isless(__x, __y); 3347 } 3348 3349 // }}} 3350 // _S_islessequal {{{ 3351 template <typename _Tp, size_t _Np> 3352 static constexpr _MaskMember<_Tp> _S_islessequal_SimdImplX863353 _S_islessequal(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 3354 { 3355 const auto __xi = __to_intrin(__x); 3356 const auto __yi = __to_intrin(__y); 3357 if constexpr (__is_avx512_abi<_Abi>()) 3358 { 3359 const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 3360 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 3361 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OQ); 3362 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 3363 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OQ); 3364 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3365 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OQ); 3366 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3367 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OQ); 3368 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3369 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OQ); 3370 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3371 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OQ); 3372 else 3373 __assert_unreachable<_Tp>(); 3374 } 3375 else if constexpr (__have_avx) 3376 { 3377 if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3378 return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_LE_OQ)); 3379 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3380 return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_LE_OQ)); 3381 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3382 return __auto_bitcast(_mm_cmp_ps(__xi, __yi, _CMP_LE_OQ)); 3383 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3384 return __to_masktype(_mm_cmp_pd(__xi, __yi, _CMP_LE_OQ)); 3385 else 3386 __assert_unreachable<_Tp>(); 3387 } 3388 else if constexpr (__have_sse2 && sizeof(__xi) == 16 3389 && sizeof(_Tp) == 4) 3390 { 3391 const auto __xn = __vector_bitcast<int>(__xi); 3392 const auto __yn = __vector_bitcast<int>(__yi); 3393 const auto __xp = __xn < 0 ? -(__xn & 0x7fff'ffff) : __xn; 3394 const auto __yp = __yn < 0 ? 
-(__yn & 0x7fff'ffff) : __yn; 3395 return __auto_bitcast( 3396 __and(__to_masktype(_mm_cmpord_ps(__xi, __yi)), __xp <= __yp)); 3397 } 3398 else if constexpr (__have_sse2 && sizeof(__xi) == 16 3399 && sizeof(_Tp) == 8) 3400 return __vector_type_t<__int_with_sizeof_t<8>, 2>{ 3401 -_mm_ucomige_sd(__yi, __xi), 3402 -_mm_ucomige_sd(_mm_unpackhi_pd(__yi, __yi), 3403 _mm_unpackhi_pd(__xi, __xi))}; 3404 else 3405 return _Base::_S_islessequal(__x, __y); 3406 } 3407 3408 // }}} 3409 // _S_islessgreater {{{ 3410 template <typename _Tp, size_t _Np> 3411 static constexpr _MaskMember<_Tp> _S_islessgreater_SimdImplX863412 _S_islessgreater(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 3413 { 3414 const auto __xi = __to_intrin(__x); 3415 const auto __yi = __to_intrin(__y); 3416 if constexpr (__is_avx512_abi<_Abi>()) 3417 { 3418 const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 3419 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 3420 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_OQ); 3421 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 3422 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_OQ); 3423 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3424 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_OQ); 3425 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3426 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_OQ); 3427 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3428 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_OQ); 3429 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3430 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_OQ); 3431 else 3432 __assert_unreachable<_Tp>(); 3433 } 3434 else if constexpr (__have_avx) 3435 { 3436 if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3437 return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_NEQ_OQ)); 3438 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3439 return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_NEQ_OQ)); 3440 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3441 return __auto_bitcast(_mm_cmp_ps(__xi, __yi, _CMP_NEQ_OQ)); 3442 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3443 return __to_masktype(_mm_cmp_pd(__xi, __yi, _CMP_NEQ_OQ)); 3444 else 3445 __assert_unreachable<_Tp>(); 3446 } 3447 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3448 return __auto_bitcast( 3449 __and(_mm_cmpord_ps(__xi, __yi), _mm_cmpneq_ps(__xi, __yi))); 3450 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3451 return __to_masktype( 3452 __and(_mm_cmpord_pd(__xi, __yi), _mm_cmpneq_pd(__xi, __yi))); 3453 else 3454 __assert_unreachable<_Tp>(); 3455 } 3456 3457 //}}} }}} 3458 }; 3459 3460 // }}} 3461 // _MaskImplX86Mixin {{{ 3462 struct _MaskImplX86Mixin 3463 { 3464 template <typename _Tp> 3465 using _TypeTag = _Tp*; 3466 3467 using _Base = _MaskImplBuiltinMixin; 3468 3469 // _S_to_maskvector(bool) {{{ 3470 template <typename _Up, size_t _ToN = 1, typename _Tp> 3471 _GLIBCXX_SIMD_INTRINSIC static constexpr enable_if_t< 3472 is_same_v<_Tp, bool>, _SimdWrapper<_Up, _ToN>> _S_to_maskvector_MaskImplX86Mixin3473 _S_to_maskvector(_Tp __x) 3474 { 3475 static_assert(is_same_v<_Up, __int_for_sizeof_t<_Up>>); 3476 return __x ? __vector_type_t<_Up, _ToN>{~_Up()} 3477 : __vector_type_t<_Up, _ToN>(); 3478 } 3479 3480 // }}} 3481 // _S_to_maskvector(_SanitizedBitMask) {{{ 3482 template <typename _Up, size_t _UpN = 0, size_t _Np, 3483 size_t _ToN = _UpN == 0 ? 
_Np : _UpN> 3484 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Up, _ToN> _S_to_maskvector_MaskImplX86Mixin3485 _S_to_maskvector(_SanitizedBitMask<_Np> __x) 3486 { 3487 static_assert(is_same_v<_Up, __int_for_sizeof_t<_Up>>); 3488 using _UV = __vector_type_t<_Up, _ToN>; 3489 using _UI = __intrinsic_type_t<_Up, _ToN>; 3490 [[maybe_unused]] const auto __k = __x._M_to_bits(); 3491 if constexpr (_Np == 1) 3492 return _S_to_maskvector<_Up, _ToN>(__k); 3493 else if (__x._M_is_constprop() || __builtin_is_constant_evaluated()) 3494 return __generate_from_n_evaluations<std::min(_ToN, _Np), _UV>( 3495 [&](auto __i) -> _Up { return -__x[__i.value]; }); 3496 else if constexpr (sizeof(_Up) == 1) 3497 { 3498 if constexpr (sizeof(_UI) == 16) 3499 { 3500 if constexpr (__have_avx512bw_vl) 3501 return __intrin_bitcast<_UV>(_mm_movm_epi8(__k)); 3502 else if constexpr (__have_avx512bw) 3503 return __intrin_bitcast<_UV>(__lo128(_mm512_movm_epi8(__k))); 3504 else if constexpr (__have_avx512f) 3505 { 3506 auto __as32bits = _mm512_maskz_mov_epi32(__k, ~__m512i()); 3507 auto __as16bits 3508 = __xzyw(_mm256_packs_epi32(__lo256(__as32bits), 3509 __hi256(__as32bits))); 3510 return __intrin_bitcast<_UV>( 3511 _mm_packs_epi16(__lo128(__as16bits), __hi128(__as16bits))); 3512 } 3513 else if constexpr (__have_ssse3) 3514 { 3515 const auto __bitmask = __to_intrin( 3516 __make_vector<_UChar>(1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 3517 8, 16, 32, 64, 128)); 3518 return __intrin_bitcast<_UV>( 3519 __vector_bitcast<_Up>( 3520 _mm_shuffle_epi8(__to_intrin( 3521 __vector_type_t<_ULLong, 2>{__k}), 3522 _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 3523 1, 1, 1, 1, 1, 1, 1)) 3524 & __bitmask) 3525 != 0); 3526 } 3527 // else fall through 3528 } 3529 else if constexpr (sizeof(_UI) == 32) 3530 { 3531 if constexpr (__have_avx512bw_vl) 3532 return __vector_bitcast<_Up>(_mm256_movm_epi8(__k)); 3533 else if constexpr (__have_avx512bw) 3534 return __vector_bitcast<_Up>(__lo256(_mm512_movm_epi8(__k))); 3535 else if constexpr (__have_avx512f) 3536 { 3537 auto __as16bits = // 0 16 1 17 ... 15 31 3538 _mm512_srli_epi32(_mm512_maskz_mov_epi32(__k, ~__m512i()), 3539 16) 3540 | _mm512_slli_epi32(_mm512_maskz_mov_epi32(__k >> 16, 3541 ~__m512i()), 3542 16); 3543 auto __0_16_1_17 = __xzyw(_mm256_packs_epi16( 3544 __lo256(__as16bits), 3545 __hi256(__as16bits)) // 0 16 1 17 2 18 3 19 8 24 9 25 ... 3546 ); 3547 // deinterleave: 3548 return __vector_bitcast<_Up>(__xzyw(_mm256_shuffle_epi8( 3549 __0_16_1_17, // 0 16 1 17 2 ... 
3550 _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 3551 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14, 1, 3552 3, 5, 7, 9, 11, 13, 3553 15)))); // 0-7 16-23 8-15 24-31 -> xzyw 3554 // 0-3 8-11 16-19 24-27 3555 // 4-7 12-15 20-23 28-31 3556 } 3557 else if constexpr (__have_avx2) 3558 { 3559 const auto __bitmask 3560 = _mm256_broadcastsi128_si256(__to_intrin( 3561 __make_vector<_UChar>(1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 3562 4, 8, 16, 32, 64, 128))); 3563 return __vector_bitcast<_Up>( 3564 __vector_bitcast<_Up>( 3565 _mm256_shuffle_epi8( 3566 _mm256_broadcastsi128_si256( 3567 __to_intrin(__vector_type_t<_ULLong, 2>{__k})), 3568 _mm256_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 3569 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3570 3, 3, 3, 3, 3, 3)) 3571 & __bitmask) 3572 != 0); 3573 } 3574 // else fall through 3575 } 3576 else if constexpr (sizeof(_UI) == 64) 3577 return reinterpret_cast<_UV>(_mm512_movm_epi8(__k)); 3578 if constexpr (std::min(_ToN, _Np) <= 4) 3579 { 3580 if constexpr (_Np > 7) // avoid overflow 3581 __x &= _SanitizedBitMask<_Np>(0x0f); 3582 const _UInt __char_mask 3583 = ((_UInt(__x.to_ulong()) * 0x00204081U) & 0x01010101ULL) 3584 * 0xff; 3585 _UV __r = {}; 3586 __builtin_memcpy(&__r, &__char_mask, 3587 std::min(sizeof(__r), sizeof(__char_mask))); 3588 return __r; 3589 } 3590 else if constexpr (std::min(_ToN, _Np) <= 7) 3591 { 3592 if constexpr (_Np > 7) // avoid overflow 3593 __x &= _SanitizedBitMask<_Np>(0x7f); 3594 const _ULLong __char_mask 3595 = ((__x.to_ulong() * 0x40810204081ULL) & 0x0101010101010101ULL) 3596 * 0xff; 3597 _UV __r = {}; 3598 __builtin_memcpy(&__r, &__char_mask, 3599 std::min(sizeof(__r), sizeof(__char_mask))); 3600 return __r; 3601 } 3602 } 3603 else if constexpr (sizeof(_Up) == 2) 3604 { 3605 if constexpr (sizeof(_UI) == 16) 3606 { 3607 if constexpr (__have_avx512bw_vl) 3608 return __intrin_bitcast<_UV>(_mm_movm_epi16(__k)); 3609 else if constexpr (__have_avx512bw) 3610 return __intrin_bitcast<_UV>(__lo128(_mm512_movm_epi16(__k))); 3611 else if constexpr (__have_avx512f) 3612 { 3613 __m256i __as32bits = {}; 3614 if constexpr (__have_avx512vl) 3615 __as32bits = _mm256_maskz_mov_epi32(__k, ~__m256i()); 3616 else 3617 __as32bits 3618 = __lo256(_mm512_maskz_mov_epi32(__k, ~__m512i())); 3619 return __intrin_bitcast<_UV>( 3620 _mm_packs_epi32(__lo128(__as32bits), __hi128(__as32bits))); 3621 } 3622 // else fall through 3623 } 3624 else if constexpr (sizeof(_UI) == 32) 3625 { 3626 if constexpr (__have_avx512bw_vl) 3627 return __vector_bitcast<_Up>(_mm256_movm_epi16(__k)); 3628 else if constexpr (__have_avx512bw) 3629 return __vector_bitcast<_Up>(__lo256(_mm512_movm_epi16(__k))); 3630 else if constexpr (__have_avx512f) 3631 { 3632 auto __as32bits = _mm512_maskz_mov_epi32(__k, ~__m512i()); 3633 return __vector_bitcast<_Up>( 3634 __xzyw(_mm256_packs_epi32(__lo256(__as32bits), 3635 __hi256(__as32bits)))); 3636 } 3637 // else fall through 3638 } 3639 else if constexpr (sizeof(_UI) == 64) 3640 return __vector_bitcast<_Up>(_mm512_movm_epi16(__k)); 3641 } 3642 else if constexpr (sizeof(_Up) == 4) 3643 { 3644 if constexpr (sizeof(_UI) == 16) 3645 { 3646 if constexpr (__have_avx512dq_vl) 3647 return __intrin_bitcast<_UV>(_mm_movm_epi32(__k)); 3648 else if constexpr (__have_avx512dq) 3649 return __intrin_bitcast<_UV>(__lo128(_mm512_movm_epi32(__k))); 3650 else if constexpr (__have_avx512vl) 3651 return __intrin_bitcast<_UV>( 3652 _mm_maskz_mov_epi32(__k, ~__m128i())); 3653 else if constexpr (__have_avx512f) 3654 return __intrin_bitcast<_UV>( 3655 
__lo128(_mm512_maskz_mov_epi32(__k, ~__m512i()))); 3656 // else fall through 3657 } 3658 else if constexpr (sizeof(_UI) == 32) 3659 { 3660 if constexpr (__have_avx512dq_vl) 3661 return __vector_bitcast<_Up>(_mm256_movm_epi32(__k)); 3662 else if constexpr (__have_avx512dq) 3663 return __vector_bitcast<_Up>(__lo256(_mm512_movm_epi32(__k))); 3664 else if constexpr (__have_avx512vl) 3665 return __vector_bitcast<_Up>( 3666 _mm256_maskz_mov_epi32(__k, ~__m256i())); 3667 else if constexpr (__have_avx512f) 3668 return __vector_bitcast<_Up>( 3669 __lo256(_mm512_maskz_mov_epi32(__k, ~__m512i()))); 3670 // else fall through 3671 } 3672 else if constexpr (sizeof(_UI) == 64) 3673 return __vector_bitcast<_Up>( 3674 __have_avx512dq ? _mm512_movm_epi32(__k) 3675 : _mm512_maskz_mov_epi32(__k, ~__m512i())); 3676 } 3677 else if constexpr (sizeof(_Up) == 8) 3678 { 3679 if constexpr (sizeof(_UI) == 16) 3680 { 3681 if constexpr (__have_avx512dq_vl) 3682 return __vector_bitcast<_Up>(_mm_movm_epi64(__k)); 3683 else if constexpr (__have_avx512dq) 3684 return __vector_bitcast<_Up>(__lo128(_mm512_movm_epi64(__k))); 3685 else if constexpr (__have_avx512vl) 3686 return __vector_bitcast<_Up>( 3687 _mm_maskz_mov_epi64(__k, ~__m128i())); 3688 else if constexpr (__have_avx512f) 3689 return __vector_bitcast<_Up>( 3690 __lo128(_mm512_maskz_mov_epi64(__k, ~__m512i()))); 3691 // else fall through 3692 } 3693 else if constexpr (sizeof(_UI) == 32) 3694 { 3695 if constexpr (__have_avx512dq_vl) 3696 return __vector_bitcast<_Up>(_mm256_movm_epi64(__k)); 3697 else if constexpr (__have_avx512dq) 3698 return __vector_bitcast<_Up>(__lo256(_mm512_movm_epi64(__k))); 3699 else if constexpr (__have_avx512vl) 3700 return __vector_bitcast<_Up>( 3701 _mm256_maskz_mov_epi64(__k, ~__m256i())); 3702 else if constexpr (__have_avx512f) 3703 return __vector_bitcast<_Up>( 3704 __lo256(_mm512_maskz_mov_epi64(__k, ~__m512i()))); 3705 // else fall through 3706 } 3707 else if constexpr (sizeof(_UI) == 64) 3708 return __vector_bitcast<_Up>( 3709 __have_avx512dq ? _mm512_movm_epi64(__k) 3710 : _mm512_maskz_mov_epi64(__k, ~__m512i())); 3711 } 3712 3713 using _UpUInt = make_unsigned_t<_Up>; 3714 using _V = __vector_type_t<_UpUInt, _ToN>; 3715 constexpr size_t __bits_per_element = sizeof(_Up) * __CHAR_BIT__; 3716 if constexpr (_ToN == 2) 3717 { 3718 return __vector_bitcast<_Up>(_V{_UpUInt(-__x[0]), _UpUInt(-__x[1])}); 3719 } 3720 else if constexpr (!__have_avx2 && __have_avx && sizeof(_V) == 32) 3721 { 3722 if constexpr (sizeof(_Up) == 4) 3723 return __vector_bitcast<_Up>(_mm256_cmp_ps( 3724 _mm256_and_ps(_mm256_castsi256_ps(_mm256_set1_epi32(__k)), 3725 _mm256_castsi256_ps(_mm256_setr_epi32( 3726 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80))), 3727 _mm256_setzero_ps(), _CMP_NEQ_UQ)); 3728 else if constexpr (sizeof(_Up) == 8) 3729 return __vector_bitcast<_Up>(_mm256_cmp_pd( 3730 _mm256_and_pd(_mm256_castsi256_pd(_mm256_set1_epi64x(__k)), 3731 _mm256_castsi256_pd( 3732 _mm256_setr_epi64x(0x01, 0x02, 0x04, 0x08))), 3733 _mm256_setzero_pd(), _CMP_NEQ_UQ)); 3734 else 3735 __assert_unreachable<_Up>(); 3736 } 3737 else if constexpr (__bits_per_element >= _ToN) 3738 { 3739 constexpr auto __bitmask 3740 = __generate_vector<_V>([](auto __i) constexpr->_UpUInt { 3741 return __i < _ToN ? 
1ull << __i : 0; 3742 }); 3743 const auto __bits 3744 = __vector_broadcast<_ToN, _UpUInt>(__k) & __bitmask; 3745 if constexpr (__bits_per_element > _ToN) 3746 return __vector_bitcast<_Up>(__bits) > 0; 3747 else 3748 return __vector_bitcast<_Up>(__bits != 0); 3749 } 3750 else 3751 { 3752 const _V __tmp 3753 = __generate_vector<_V>([&](auto __i) constexpr { 3754 return static_cast<_UpUInt>( 3755 __k >> (__bits_per_element * (__i / __bits_per_element))); 3756 }) 3757 & __generate_vector<_V>([](auto __i) constexpr { 3758 return static_cast<_UpUInt>(1ull 3759 << (__i % __bits_per_element)); 3760 }); // mask bit index 3761 return __intrin_bitcast<_UV>(__tmp != _V()); 3762 } 3763 } 3764 3765 // }}} 3766 // _S_to_maskvector(_SimdWrapper) {{{ 3767 template <typename _Up, size_t _UpN = 0, typename _Tp, size_t _Np, 3768 size_t _ToN = _UpN == 0 ? _Np : _UpN> 3769 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Up, _ToN> _S_to_maskvector_MaskImplX86Mixin3770 _S_to_maskvector(_SimdWrapper<_Tp, _Np> __x) 3771 { 3772 static_assert(is_same_v<_Up, __int_for_sizeof_t<_Up>>); 3773 using _TW = _SimdWrapper<_Tp, _Np>; 3774 using _UW = _SimdWrapper<_Up, _ToN>; 3775 using _UI = __intrinsic_type_t<_Up, _ToN>; 3776 if constexpr (is_same_v<_Tp, bool>) // bits -> vector 3777 return _S_to_maskvector<_Up, _ToN>( 3778 _BitMask<_Np>(__x._M_data)._M_sanitized()); 3779 // vector -> vector bitcast 3780 else if constexpr (sizeof(_Up) == sizeof(_Tp) 3781 && sizeof(_TW) == sizeof(_UW)) 3782 return __wrapper_bitcast<_Up, _ToN>( 3783 _ToN <= _Np 3784 ? __x 3785 : simd_abi::_VecBuiltin<sizeof(_Tp) * _Np>::_S_masked(__x)); 3786 else // vector -> vector {{{ 3787 { 3788 if (__x._M_is_constprop() || __builtin_is_constant_evaluated()) 3789 { 3790 const auto __y = __vector_bitcast<__int_for_sizeof_t<_Tp>>(__x); 3791 return __generate_from_n_evaluations<std::min(_ToN, _Np), 3792 __vector_type_t<_Up, _ToN>>( 3793 [&](auto __i) -> _Up { return __y[__i.value]; }); 3794 } 3795 using _To = __vector_type_t<_Up, _ToN>; 3796 [[maybe_unused]] constexpr size_t _FromN = _Np; 3797 constexpr int _FromBytes = sizeof(_Tp); 3798 constexpr int _ToBytes = sizeof(_Up); 3799 const auto __k = __x._M_data; 3800 3801 if constexpr (_FromBytes == _ToBytes) 3802 return __intrin_bitcast<_To>(__k); 3803 else if constexpr (sizeof(_UI) == 16 && sizeof(__k) == 16) 3804 { // SSE -> SSE {{{ 3805 if constexpr (_FromBytes == 4 && _ToBytes == 8) 3806 return __intrin_bitcast<_To>(__interleave128_lo(__k, __k)); 3807 else if constexpr (_FromBytes == 2 && _ToBytes == 8) 3808 { 3809 const auto __y 3810 = __vector_bitcast<int>(__interleave128_lo(__k, __k)); 3811 return __intrin_bitcast<_To>(__interleave128_lo(__y, __y)); 3812 } 3813 else if constexpr (_FromBytes == 1 && _ToBytes == 8) 3814 { 3815 auto __y 3816 = __vector_bitcast<short>(__interleave128_lo(__k, __k)); 3817 auto __z 3818 = __vector_bitcast<int>(__interleave128_lo(__y, __y)); 3819 return __intrin_bitcast<_To>(__interleave128_lo(__z, __z)); 3820 } 3821 else if constexpr (_FromBytes == 8 && _ToBytes == 4 3822 && __have_sse2) 3823 return __intrin_bitcast<_To>( 3824 _mm_packs_epi32(__vector_bitcast<_LLong>(__k), __m128i())); 3825 else if constexpr (_FromBytes == 8 && _ToBytes == 4) 3826 return __vector_shuffle<1, 3, 6, 7>(__vector_bitcast<_Up>(__k), 3827 _UI()); 3828 else if constexpr (_FromBytes == 2 && _ToBytes == 4) 3829 return __intrin_bitcast<_To>(__interleave128_lo(__k, __k)); 3830 else if constexpr (_FromBytes == 1 && _ToBytes == 4) 3831 { 3832 const auto __y 3833 = 
__vector_bitcast<short>(__interleave128_lo(__k, __k)); 3834 return __intrin_bitcast<_To>(__interleave128_lo(__y, __y)); 3835 } 3836 else if constexpr (_FromBytes == 8 && _ToBytes == 2) 3837 { 3838 if constexpr (__have_sse2 && !__have_ssse3) 3839 return __intrin_bitcast<_To>(_mm_packs_epi32( 3840 _mm_packs_epi32(__vector_bitcast<_LLong>(__k), __m128i()), 3841 __m128i())); 3842 else 3843 return __intrin_bitcast<_To>( 3844 __vector_permute<3, 7, -1, -1, -1, -1, -1, -1>( 3845 __vector_bitcast<_Up>(__k))); 3846 } 3847 else if constexpr (_FromBytes == 4 && _ToBytes == 2) 3848 return __intrin_bitcast<_To>( 3849 _mm_packs_epi32(__vector_bitcast<_LLong>(__k), __m128i())); 3850 else if constexpr (_FromBytes == 1 && _ToBytes == 2) 3851 return __intrin_bitcast<_To>(__interleave128_lo(__k, __k)); 3852 else if constexpr (_FromBytes == 8 && _ToBytes == 1 3853 && __have_ssse3) 3854 return __intrin_bitcast<_To>( 3855 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k), 3856 _mm_setr_epi8(7, 15, -1, -1, -1, -1, -1, -1, 3857 -1, -1, -1, -1, -1, -1, -1, 3858 -1))); 3859 else if constexpr (_FromBytes == 8 && _ToBytes == 1) 3860 { 3861 auto __y 3862 = _mm_packs_epi32(__vector_bitcast<_LLong>(__k), __m128i()); 3863 __y = _mm_packs_epi32(__y, __m128i()); 3864 return __intrin_bitcast<_To>(_mm_packs_epi16(__y, __m128i())); 3865 } 3866 else if constexpr (_FromBytes == 4 && _ToBytes == 1 3867 && __have_ssse3) 3868 return __intrin_bitcast<_To>( 3869 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k), 3870 _mm_setr_epi8(3, 7, 11, 15, -1, -1, -1, -1, 3871 -1, -1, -1, -1, -1, -1, -1, 3872 -1))); 3873 else if constexpr (_FromBytes == 4 && _ToBytes == 1) 3874 { 3875 const auto __y 3876 = _mm_packs_epi32(__vector_bitcast<_LLong>(__k), __m128i()); 3877 return __intrin_bitcast<_To>(_mm_packs_epi16(__y, __m128i())); 3878 } 3879 else if constexpr (_FromBytes == 2 && _ToBytes == 1) 3880 return __intrin_bitcast<_To>( 3881 _mm_packs_epi16(__vector_bitcast<_LLong>(__k), __m128i())); 3882 else 3883 __assert_unreachable<_Tp>(); 3884 } // }}} 3885 else if constexpr (sizeof(_UI) == 32 && sizeof(__k) == 32) 3886 { // AVX -> AVX {{{ 3887 if constexpr (_FromBytes == _ToBytes) 3888 __assert_unreachable<_Tp>(); 3889 else if constexpr (_FromBytes == _ToBytes * 2) 3890 { 3891 const auto __y = __vector_bitcast<_LLong>(__k); 3892 return __intrin_bitcast<_To>(_mm256_castsi128_si256( 3893 _mm_packs_epi16(__lo128(__y), __hi128(__y)))); 3894 } 3895 else if constexpr (_FromBytes == _ToBytes * 4) 3896 { 3897 const auto __y = __vector_bitcast<_LLong>(__k); 3898 return __intrin_bitcast<_To>(_mm256_castsi128_si256( 3899 _mm_packs_epi16(_mm_packs_epi16(__lo128(__y), __hi128(__y)), 3900 __m128i()))); 3901 } 3902 else if constexpr (_FromBytes == _ToBytes * 8) 3903 { 3904 const auto __y = __vector_bitcast<_LLong>(__k); 3905 return __intrin_bitcast<_To>( 3906 _mm256_castsi128_si256(_mm_shuffle_epi8( 3907 _mm_packs_epi16(__lo128(__y), __hi128(__y)), 3908 _mm_setr_epi8(3, 7, 11, 15, -1, -1, -1, -1, -1, -1, -1, 3909 -1, -1, -1, -1, -1)))); 3910 } 3911 else if constexpr (_FromBytes * 2 == _ToBytes) 3912 { 3913 auto __y = __xzyw(__to_intrin(__k)); 3914 if constexpr (is_floating_point_v< 3915 _Tp> || (!__have_avx2 && _FromBytes == 4)) 3916 { 3917 const auto __yy = __vector_bitcast<float>(__y); 3918 return __intrin_bitcast<_To>( 3919 _mm256_unpacklo_ps(__yy, __yy)); 3920 } 3921 else 3922 return __intrin_bitcast<_To>( 3923 _mm256_unpacklo_epi8(__y, __y)); 3924 } 3925 else if constexpr (_FromBytes * 4 == _ToBytes) 3926 { 3927 auto __y 3928 = 
_mm_unpacklo_epi8(__lo128(__vector_bitcast<_LLong>(__k)), 3929 __lo128(__vector_bitcast<_LLong>( 3930 __k))); // drops 3/4 of input 3931 return __intrin_bitcast<_To>( 3932 __concat(_mm_unpacklo_epi16(__y, __y), 3933 _mm_unpackhi_epi16(__y, __y))); 3934 } 3935 else if constexpr (_FromBytes == 1 && _ToBytes == 8) 3936 { 3937 auto __y 3938 = _mm_unpacklo_epi8(__lo128(__vector_bitcast<_LLong>(__k)), 3939 __lo128(__vector_bitcast<_LLong>( 3940 __k))); // drops 3/4 of input 3941 __y 3942 = _mm_unpacklo_epi16(__y, 3943 __y); // drops another 1/2 => 7/8 total 3944 return __intrin_bitcast<_To>( 3945 __concat(_mm_unpacklo_epi32(__y, __y), 3946 _mm_unpackhi_epi32(__y, __y))); 3947 } 3948 else 3949 __assert_unreachable<_Tp>(); 3950 } // }}} 3951 else if constexpr (sizeof(_UI) == 32 && sizeof(__k) == 16) 3952 { // SSE -> AVX {{{ 3953 if constexpr (_FromBytes == _ToBytes) 3954 return __intrin_bitcast<_To>( 3955 __intrinsic_type_t<_Tp, 32 / sizeof(_Tp)>( 3956 __zero_extend(__to_intrin(__k)))); 3957 else if constexpr (_FromBytes * 2 == _ToBytes) 3958 { // keep all 3959 return __intrin_bitcast<_To>( 3960 __concat(_mm_unpacklo_epi8(__vector_bitcast<_LLong>(__k), 3961 __vector_bitcast<_LLong>(__k)), 3962 _mm_unpackhi_epi8(__vector_bitcast<_LLong>(__k), 3963 __vector_bitcast<_LLong>(__k)))); 3964 } 3965 else if constexpr (_FromBytes * 4 == _ToBytes) 3966 { 3967 if constexpr (__have_avx2) 3968 { 3969 return __intrin_bitcast<_To>(_mm256_shuffle_epi8( 3970 __concat(__vector_bitcast<_LLong>(__k), 3971 __vector_bitcast<_LLong>(__k)), 3972 _mm256_setr_epi8(0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3973 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 3974 6, 6, 7, 7, 7, 7))); 3975 } 3976 else 3977 { 3978 return __intrin_bitcast<_To>(__concat( 3979 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k), 3980 _mm_setr_epi8(0, 0, 0, 0, 1, 1, 1, 1, 3981 2, 2, 2, 2, 3, 3, 3, 3)), 3982 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k), 3983 _mm_setr_epi8(4, 4, 4, 4, 5, 5, 5, 5, 3984 6, 6, 6, 6, 7, 7, 7, 3985 7)))); 3986 } 3987 } 3988 else if constexpr (_FromBytes * 8 == _ToBytes) 3989 { 3990 if constexpr (__have_avx2) 3991 { 3992 return __intrin_bitcast<_To>(_mm256_shuffle_epi8( 3993 __concat(__vector_bitcast<_LLong>(__k), 3994 __vector_bitcast<_LLong>(__k)), 3995 _mm256_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 3996 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3997 3, 3, 3, 3, 3, 3))); 3998 } 3999 else 4000 { 4001 return __intrin_bitcast<_To>(__concat( 4002 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k), 4003 _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 4004 1, 1, 1, 1, 1, 1, 1, 1)), 4005 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k), 4006 _mm_setr_epi8(2, 2, 2, 2, 2, 2, 2, 2, 4007 3, 3, 3, 3, 3, 3, 3, 4008 3)))); 4009 } 4010 } 4011 else if constexpr (_FromBytes == _ToBytes * 2) 4012 return __intrin_bitcast<_To>(__m256i(__zero_extend( 4013 _mm_packs_epi16(__vector_bitcast<_LLong>(__k), __m128i())))); 4014 else if constexpr (_FromBytes == 8 && _ToBytes == 2) 4015 { 4016 return __intrin_bitcast<_To>(__m256i(__zero_extend( 4017 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k), 4018 _mm_setr_epi8(6, 7, 14, 15, -1, -1, -1, -1, 4019 -1, -1, -1, -1, -1, -1, -1, 4020 -1))))); 4021 } 4022 else if constexpr (_FromBytes == 4 && _ToBytes == 1) 4023 { 4024 return __intrin_bitcast<_To>(__m256i(__zero_extend( 4025 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k), 4026 _mm_setr_epi8(3, 7, 11, 15, -1, -1, -1, -1, 4027 -1, -1, -1, -1, -1, -1, -1, 4028 -1))))); 4029 } 4030 else if constexpr (_FromBytes == 8 && _ToBytes == 1) 4031 { 4032 return 
__intrin_bitcast<_To>(__m256i(__zero_extend( 4033 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k), 4034 _mm_setr_epi8(7, 15, -1, -1, -1, -1, -1, 4035 -1, -1, -1, -1, -1, -1, -1, 4036 -1, -1))))); 4037 } 4038 else 4039 static_assert(!is_same_v<_Tp, _Tp>, "should be unreachable"); 4040 } // }}} 4041 else if constexpr (sizeof(_UI) == 16 && sizeof(__k) == 32) 4042 { // AVX -> SSE {{{ 4043 if constexpr (_FromBytes == _ToBytes) 4044 { // keep low 1/2 4045 return __intrin_bitcast<_To>(__lo128(__k)); 4046 } 4047 else if constexpr (_FromBytes == _ToBytes * 2) 4048 { // keep all 4049 auto __y = __vector_bitcast<_LLong>(__k); 4050 return __intrin_bitcast<_To>( 4051 _mm_packs_epi16(__lo128(__y), __hi128(__y))); 4052 } 4053 else if constexpr (_FromBytes == _ToBytes * 4) 4054 { // add 1/2 undef 4055 auto __y = __vector_bitcast<_LLong>(__k); 4056 return __intrin_bitcast<_To>( 4057 _mm_packs_epi16(_mm_packs_epi16(__lo128(__y), __hi128(__y)), 4058 __m128i())); 4059 } 4060 else if constexpr (_FromBytes == 8 && _ToBytes == 1) 4061 { // add 3/4 undef 4062 auto __y = __vector_bitcast<_LLong>(__k); 4063 return __intrin_bitcast<_To>(_mm_shuffle_epi8( 4064 _mm_packs_epi16(__lo128(__y), __hi128(__y)), 4065 _mm_setr_epi8(3, 7, 11, 15, -1, -1, -1, -1, -1, -1, -1, -1, 4066 -1, -1, -1, -1))); 4067 } 4068 else if constexpr (_FromBytes * 2 == _ToBytes) 4069 { // keep low 1/4 4070 auto __y = __lo128(__vector_bitcast<_LLong>(__k)); 4071 return __intrin_bitcast<_To>(_mm_unpacklo_epi8(__y, __y)); 4072 } 4073 else if constexpr (_FromBytes * 4 == _ToBytes) 4074 { // keep low 1/8 4075 auto __y = __lo128(__vector_bitcast<_LLong>(__k)); 4076 __y = _mm_unpacklo_epi8(__y, __y); 4077 return __intrin_bitcast<_To>(_mm_unpacklo_epi8(__y, __y)); 4078 } 4079 else if constexpr (_FromBytes * 8 == _ToBytes) 4080 { // keep low 1/16 4081 auto __y = __lo128(__vector_bitcast<_LLong>(__k)); 4082 __y = _mm_unpacklo_epi8(__y, __y); 4083 __y = _mm_unpacklo_epi8(__y, __y); 4084 return __intrin_bitcast<_To>(_mm_unpacklo_epi8(__y, __y)); 4085 } 4086 else 4087 static_assert(!is_same_v<_Tp, _Tp>, "should be unreachable"); 4088 } // }}} 4089 else 4090 return _Base::template _S_to_maskvector<_Up, _ToN>(__x); 4091 /* 4092 if constexpr (_FromBytes > _ToBytes) { 4093 const _To __y = __vector_bitcast<_Up>(__k); 4094 return [&] <size_t... _Is> (index_sequence<_Is...>) { 4095 constexpr int _Stride = _FromBytes / _ToBytes; 4096 return _To{__y[(_Is + 1) * _Stride - 1]...}; 4097 }(make_index_sequence<std::min(_ToN, _FromN)>()); 4098 } else { 4099 // {0, 0, 1, 1} (_Dups = 2, _Is<4>) 4100 // {0, 0, 0, 0, 1, 1, 1, 1} (_Dups = 4, _Is<8>) 4101 // {0, 0, 1, 1, 2, 2, 3, 3} (_Dups = 2, _Is<8>) 4102 // ... 4103 return [&] <size_t... _Is> (index_sequence<_Is...>) { 4104 constexpr int __dup = _ToBytes / _FromBytes; 4105 return __intrin_bitcast<_To>(_From{__k[_Is / __dup]...}); 4106 }(make_index_sequence<_FromN>()); 4107 } 4108 */ 4109 } // }}} 4110 } 4111 4112 // }}} 4113 // _S_to_bits {{{ 4114 template <typename _Tp, size_t _Np> 4115 _GLIBCXX_SIMD_INTRINSIC static constexpr _SanitizedBitMask<_Np> _S_to_bits_MaskImplX86Mixin4116 _S_to_bits(_SimdWrapper<_Tp, _Np> __x) 4117 { 4118 if constexpr (is_same_v<_Tp, bool>) 4119 return _BitMask<_Np>(__x._M_data)._M_sanitized(); 4120 else 4121 { 4122 static_assert(is_same_v<_Tp, __int_for_sizeof_t<_Tp>>); 4123 if (__builtin_is_constant_evaluated() 4124 || __builtin_constant_p(__x._M_data)) 4125 { 4126 const auto __bools = -__x._M_data; 4127 const _ULLong __k = __call_with_n_evaluations<_Np>( 4128 [](auto... 
__bits) { return (__bits | ...); },
	      [&](auto __i) { return _ULLong(__bools[+__i]) << __i; });
	    if (__builtin_is_constant_evaluated()
		|| __builtin_constant_p(__k))
	      return __k;
	  }
	const auto __xi = __to_intrin(__x);
	if constexpr (sizeof(_Tp) == 1)
	  if constexpr (sizeof(__xi) == 16)
	    if constexpr (__have_avx512bw_vl)
	      return _BitMask<_Np>(_mm_movepi8_mask(__xi));
	    else // implies SSE2
	      return _BitMask<_Np>(_mm_movemask_epi8(__xi));
	  else if constexpr (sizeof(__xi) == 32)
	    if constexpr (__have_avx512bw_vl)
	      return _BitMask<_Np>(_mm256_movepi8_mask(__xi));
	    else // implies AVX2
	      return _BitMask<_Np>(_mm256_movemask_epi8(__xi));
	  else // implies AVX512BW
	    return _BitMask<_Np>(_mm512_movepi8_mask(__xi));

	else if constexpr (sizeof(_Tp) == 2)
	  if constexpr (sizeof(__xi) == 16)
	    if constexpr (__have_avx512bw_vl)
	      return _BitMask<_Np>(_mm_movepi16_mask(__xi));
	    else if constexpr (__have_avx512bw)
	      return _BitMask<_Np>(_mm512_movepi16_mask(__zero_extend(__xi)));
	    else // implies SSE2
	      return _BitMask<_Np>(
		_mm_movemask_epi8(_mm_packs_epi16(__xi, __m128i())));
	  else if constexpr (sizeof(__xi) == 32)
	    if constexpr (__have_avx512bw_vl)
	      return _BitMask<_Np>(_mm256_movepi16_mask(__xi));
	    else if constexpr (__have_avx512bw)
	      return _BitMask<_Np>(_mm512_movepi16_mask(__zero_extend(__xi)));
	    else // implies SSE2
	      return _BitMask<_Np>(_mm_movemask_epi8(
		_mm_packs_epi16(__lo128(__xi), __hi128(__xi))));
	  else // implies AVX512BW
	    return _BitMask<_Np>(_mm512_movepi16_mask(__xi));

	else if constexpr (sizeof(_Tp) == 4)
	  if constexpr (sizeof(__xi) == 16)
	    if constexpr (__have_avx512dq_vl)
	      return _BitMask<_Np>(_mm_movepi32_mask(__xi));
	    else if constexpr (__have_avx512vl)
	      return _BitMask<_Np>(_mm_cmplt_epi32_mask(__xi, __m128i()));
	    else if constexpr (__have_avx512dq)
	      return _BitMask<_Np>(_mm512_movepi32_mask(__zero_extend(__xi)));
	    else if constexpr (__have_avx512f)
	      return _BitMask<_Np>(
		_mm512_cmplt_epi32_mask(__zero_extend(__xi), __m512i()));
	    else // implies SSE
	      return _BitMask<_Np>(
		_mm_movemask_ps(reinterpret_cast<__m128>(__xi)));
	  else if constexpr (sizeof(__xi) == 32)
	    if constexpr (__have_avx512dq_vl)
	      return _BitMask<_Np>(_mm256_movepi32_mask(__xi));
	    else if constexpr (__have_avx512dq)
	      return _BitMask<_Np>(_mm512_movepi32_mask(__zero_extend(__xi)));
	    else if constexpr (__have_avx512vl)
	      return _BitMask<_Np>(_mm256_cmplt_epi32_mask(__xi, __m256i()));
	    else if constexpr (__have_avx512f)
	      return _BitMask<_Np>(
		_mm512_cmplt_epi32_mask(__zero_extend(__xi), __m512i()));
	    else // implies AVX
	      return _BitMask<_Np>(
		_mm256_movemask_ps(reinterpret_cast<__m256>(__xi)));
	  else // implies AVX512??
4197 if constexpr (__have_avx512dq) 4198 return _BitMask<_Np>(_mm512_movepi32_mask(__xi)); 4199 else // implies AVX512F 4200 return _BitMask<_Np>(_mm512_cmplt_epi32_mask(__xi, __m512i())); 4201 4202 else if constexpr (sizeof(_Tp) == 8) 4203 if constexpr (sizeof(__xi) == 16) 4204 if constexpr (__have_avx512dq_vl) 4205 return _BitMask<_Np>(_mm_movepi64_mask(__xi)); 4206 else if constexpr (__have_avx512dq) 4207 return _BitMask<_Np>(_mm512_movepi64_mask(__zero_extend(__xi))); 4208 else if constexpr (__have_avx512vl) 4209 return _BitMask<_Np>(_mm_cmplt_epi64_mask(__xi, __m128i())); 4210 else if constexpr (__have_avx512f) 4211 return _BitMask<_Np>( 4212 _mm512_cmplt_epi64_mask(__zero_extend(__xi), __m512i())); 4213 else // implies SSE2 4214 return _BitMask<_Np>( 4215 _mm_movemask_pd(reinterpret_cast<__m128d>(__xi))); 4216 else if constexpr (sizeof(__xi) == 32) 4217 if constexpr (__have_avx512dq_vl) 4218 return _BitMask<_Np>(_mm256_movepi64_mask(__xi)); 4219 else if constexpr (__have_avx512dq) 4220 return _BitMask<_Np>(_mm512_movepi64_mask(__zero_extend(__xi))); 4221 else if constexpr (__have_avx512vl) 4222 return _BitMask<_Np>(_mm256_cmplt_epi64_mask(__xi, __m256i())); 4223 else if constexpr (__have_avx512f) 4224 return _BitMask<_Np>( 4225 _mm512_cmplt_epi64_mask(__zero_extend(__xi), __m512i())); 4226 else // implies AVX 4227 return _BitMask<_Np>( 4228 _mm256_movemask_pd(reinterpret_cast<__m256d>(__xi))); 4229 else // implies AVX512?? 4230 if constexpr (__have_avx512dq) 4231 return _BitMask<_Np>(_mm512_movepi64_mask(__xi)); 4232 else // implies AVX512F 4233 return _BitMask<_Np>(_mm512_cmplt_epi64_mask(__xi, __m512i())); 4234 4235 else 4236 __assert_unreachable<_Tp>(); 4237 } 4238 } 4239 // }}} 4240 }; 4241 4242 // }}} 4243 // _MaskImplX86 {{{ 4244 template <typename _Abi> 4245 struct _MaskImplX86 : _MaskImplX86Mixin, _MaskImplBuiltin<_Abi> 4246 { 4247 using _MaskImplX86Mixin::_S_to_bits; 4248 using _MaskImplX86Mixin::_S_to_maskvector; 4249 using _MaskImplBuiltin<_Abi>::_S_convert; 4250 4251 // member types {{{ 4252 template <typename _Tp> 4253 using _SimdMember = typename _Abi::template __traits<_Tp>::_SimdMember; 4254 4255 template <typename _Tp> 4256 using _MaskMember = typename _Abi::template _MaskMember<_Tp>; 4257 4258 template <typename _Tp> 4259 static constexpr size_t _S_size = simd_size_v<_Tp, _Abi>; 4260 4261 using _Base = _MaskImplBuiltin<_Abi>; 4262 4263 // }}} 4264 // _S_broadcast {{{ 4265 template <typename _Tp> 4266 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> _S_broadcast_MaskImplX864267 _S_broadcast(bool __x) 4268 { 4269 if constexpr (__is_avx512_abi<_Abi>()) 4270 return __x ? 
_Abi::_S_masked(_MaskMember<_Tp>(-1)) 4271 : _MaskMember<_Tp>(); 4272 else 4273 return _Base::template _S_broadcast<_Tp>(__x); 4274 } 4275 4276 // }}} 4277 // _S_load {{{ 4278 template <typename _Tp> 4279 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> _S_load_MaskImplX864280 _S_load(const bool* __mem) 4281 { 4282 static_assert(is_same_v<_Tp, __int_for_sizeof_t<_Tp>>); 4283 if constexpr (__have_avx512bw) 4284 { 4285 const auto __to_vec_or_bits = [](auto __bits) -> decltype(auto) { 4286 if constexpr (__is_avx512_abi<_Abi>()) 4287 return __bits; 4288 else 4289 return _S_to_maskvector<_Tp>( 4290 _BitMask<_S_size<_Tp>>(__bits)._M_sanitized()); 4291 }; 4292 4293 if constexpr (_S_size<_Tp> <= 16 && __have_avx512vl) 4294 { 4295 __m128i __a = {}; 4296 __builtin_memcpy(&__a, __mem, _S_size<_Tp>); 4297 return __to_vec_or_bits(_mm_test_epi8_mask(__a, __a)); 4298 } 4299 else if constexpr (_S_size<_Tp> <= 32 && __have_avx512vl) 4300 { 4301 __m256i __a = {}; 4302 __builtin_memcpy(&__a, __mem, _S_size<_Tp>); 4303 return __to_vec_or_bits(_mm256_test_epi8_mask(__a, __a)); 4304 } 4305 else if constexpr (_S_size<_Tp> <= 64) 4306 { 4307 __m512i __a = {}; 4308 __builtin_memcpy(&__a, __mem, _S_size<_Tp>); 4309 return __to_vec_or_bits(_mm512_test_epi8_mask(__a, __a)); 4310 } 4311 } 4312 else if constexpr (__is_avx512_abi<_Abi>()) 4313 { 4314 if constexpr (_S_size<_Tp> <= 8) 4315 { 4316 __m128i __a = {}; 4317 __builtin_memcpy(&__a, __mem, _S_size<_Tp>); 4318 const auto __b = _mm512_cvtepi8_epi64(__a); 4319 return _mm512_test_epi64_mask(__b, __b); 4320 } 4321 else if constexpr (_S_size<_Tp> <= 16) 4322 { 4323 __m128i __a = {}; 4324 __builtin_memcpy(&__a, __mem, _S_size<_Tp>); 4325 const auto __b = _mm512_cvtepi8_epi32(__a); 4326 return _mm512_test_epi32_mask(__b, __b); 4327 } 4328 else if constexpr (_S_size<_Tp> <= 32) 4329 { 4330 __m128i __a = {}; 4331 __builtin_memcpy(&__a, __mem, 16); 4332 const auto __b = _mm512_cvtepi8_epi32(__a); 4333 __builtin_memcpy(&__a, __mem + 16, _S_size<_Tp> - 16); 4334 const auto __c = _mm512_cvtepi8_epi32(__a); 4335 return _mm512_test_epi32_mask(__b, __b) 4336 | (_mm512_test_epi32_mask(__c, __c) << 16); 4337 } 4338 else if constexpr (_S_size<_Tp> <= 64) 4339 { 4340 __m128i __a = {}; 4341 __builtin_memcpy(&__a, __mem, 16); 4342 const auto __b = _mm512_cvtepi8_epi32(__a); 4343 __builtin_memcpy(&__a, __mem + 16, 16); 4344 const auto __c = _mm512_cvtepi8_epi32(__a); 4345 if constexpr (_S_size<_Tp> <= 48) 4346 { 4347 __builtin_memcpy(&__a, __mem + 32, _S_size<_Tp> - 32); 4348 const auto __d = _mm512_cvtepi8_epi32(__a); 4349 return _mm512_test_epi32_mask(__b, __b) 4350 | (_mm512_test_epi32_mask(__c, __c) << 16) 4351 | (_ULLong(_mm512_test_epi32_mask(__d, __d)) << 32); 4352 } 4353 else 4354 { 4355 __builtin_memcpy(&__a, __mem + 16, 16); 4356 const auto __d = _mm512_cvtepi8_epi32(__a); 4357 __builtin_memcpy(&__a, __mem + 32, _S_size<_Tp> - 48); 4358 const auto __e = _mm512_cvtepi8_epi32(__a); 4359 return _mm512_test_epi32_mask(__b, __b) 4360 | (_mm512_test_epi32_mask(__c, __c) << 16) 4361 | (_ULLong(_mm512_test_epi32_mask(__d, __d)) << 32) 4362 | (_ULLong(_mm512_test_epi32_mask(__e, __e)) << 48); 4363 } 4364 } 4365 else 4366 __assert_unreachable<_Tp>(); 4367 } 4368 else if constexpr (sizeof(_Tp) == 8 && _S_size<_Tp> == 2) 4369 return __vector_bitcast<_Tp>( 4370 __vector_type16_t<int>{-int(__mem[0]), -int(__mem[0]), 4371 -int(__mem[1]), -int(__mem[1])}); 4372 else if constexpr (sizeof(_Tp) == 8 && _S_size<_Tp> <= 4 && __have_avx) 4373 { 4374 int __bool4 = 0; 4375 
__builtin_memcpy(&__bool4, __mem, _S_size<_Tp>); 4376 const auto __k = __to_intrin( 4377 (__vector_broadcast<4>(__bool4) 4378 & __make_vector<int>(0x1, 0x100, 0x10000, 4379 _S_size<_Tp> == 4 ? 0x1000000 : 0)) 4380 != 0); 4381 return __vector_bitcast<_Tp>( 4382 __concat(_mm_unpacklo_epi32(__k, __k), 4383 _mm_unpackhi_epi32(__k, __k))); 4384 } 4385 else if constexpr (sizeof(_Tp) == 4 && _S_size<_Tp> <= 4) 4386 { 4387 int __bools = 0; 4388 __builtin_memcpy(&__bools, __mem, _S_size<_Tp>); 4389 if constexpr (__have_sse2) 4390 { 4391 __m128i __k = _mm_cvtsi32_si128(__bools); 4392 __k = _mm_cmpgt_epi16(_mm_unpacklo_epi8(__k, __k), __m128i()); 4393 return __vector_bitcast<_Tp, _S_size<_Tp>>( 4394 _mm_unpacklo_epi16(__k, __k)); 4395 } 4396 else 4397 { 4398 __m128 __k = _mm_cvtpi8_ps(_mm_cvtsi32_si64(__bools)); 4399 _mm_empty(); 4400 return __vector_bitcast<_Tp, _S_size<_Tp>>( 4401 _mm_cmpgt_ps(__k, __m128())); 4402 } 4403 } 4404 else if constexpr (sizeof(_Tp) == 4 && _S_size<_Tp> <= 8) 4405 { 4406 __m128i __k = {}; 4407 __builtin_memcpy(&__k, __mem, _S_size<_Tp>); 4408 __k = _mm_cmpgt_epi16(_mm_unpacklo_epi8(__k, __k), __m128i()); 4409 return __vector_bitcast<_Tp>( 4410 __concat(_mm_unpacklo_epi16(__k, __k), 4411 _mm_unpackhi_epi16(__k, __k))); 4412 } 4413 else if constexpr (sizeof(_Tp) == 2 && _S_size<_Tp> <= 16) 4414 { 4415 __m128i __k = {}; 4416 __builtin_memcpy(&__k, __mem, _S_size<_Tp>); 4417 __k = _mm_cmpgt_epi8(__k, __m128i()); 4418 if constexpr (_S_size<_Tp> <= 8) 4419 return __vector_bitcast<_Tp, _S_size<_Tp>>( 4420 _mm_unpacklo_epi8(__k, __k)); 4421 else 4422 return __concat(_mm_unpacklo_epi8(__k, __k), 4423 _mm_unpackhi_epi8(__k, __k)); 4424 } 4425 else 4426 return _Base::template _S_load<_Tp>(__mem); 4427 } 4428 4429 // }}} 4430 // _S_from_bitmask{{{ 4431 template <size_t _Np, typename _Tp> 4432 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> _S_from_bitmask_MaskImplX864433 _S_from_bitmask(_SanitizedBitMask<_Np> __bits, _TypeTag<_Tp>) 4434 { 4435 static_assert(is_same_v<_Tp, __int_for_sizeof_t<_Tp>>); 4436 if constexpr (__is_avx512_abi<_Abi>()) 4437 return __bits._M_to_bits(); 4438 else 4439 return _S_to_maskvector<_Tp, _S_size<_Tp>>(__bits); 4440 } 4441 4442 // }}} 4443 // _S_masked_load {{{2 4444 template <typename _Tp, size_t _Np> 4445 static inline _SimdWrapper<_Tp, _Np> _S_masked_load_MaskImplX864446 _S_masked_load(_SimdWrapper<_Tp, _Np> __merge, 4447 _SimdWrapper<_Tp, _Np> __mask, const bool* __mem) noexcept 4448 { 4449 if constexpr (__is_avx512_abi<_Abi>()) 4450 { 4451 if constexpr (__have_avx512bw_vl) 4452 { 4453 if constexpr (_Np <= 16) 4454 { 4455 const auto __a 4456 = _mm_mask_loadu_epi8(__m128i(), __mask, __mem); 4457 return (__merge & ~__mask) | _mm_test_epi8_mask(__a, __a); 4458 } 4459 else if constexpr (_Np <= 32) 4460 { 4461 const auto __a 4462 = _mm256_mask_loadu_epi8(__m256i(), __mask, __mem); 4463 return (__merge & ~__mask) 4464 | _mm256_test_epi8_mask(__a, __a); 4465 } 4466 else if constexpr (_Np <= 64) 4467 { 4468 const auto __a 4469 = _mm512_mask_loadu_epi8(__m512i(), __mask, __mem); 4470 return (__merge & ~__mask) 4471 | _mm512_test_epi8_mask(__a, __a); 4472 } 4473 else 4474 __assert_unreachable<_Tp>(); 4475 } 4476 else 4477 { 4478 _BitOps::_S_bit_iteration(__mask, [&](auto __i) { 4479 __merge._M_set(__i, __mem[__i]); 4480 }); 4481 return __merge; 4482 } 4483 } 4484 else if constexpr (__have_avx512bw_vl && _Np == 32 && sizeof(_Tp) == 1) 4485 { 4486 const auto __k = _S_to_bits(__mask)._M_to_bits(); 4487 __merge = _mm256_mask_sub_epi8(__to_intrin(__merge), __k, 
            __merge = _mm256_mask_sub_epi8(__to_intrin(__merge), __k,
                                           __m256i(),
                                           _mm256_mask_loadu_epi8(__m256i(),
                                                                  __k, __mem));
          }
        else if constexpr (__have_avx512bw_vl && _Np == 16 && sizeof(_Tp) == 1)
          {
            const auto __k = _S_to_bits(__mask)._M_to_bits();
            __merge
              = _mm_mask_sub_epi8(__vector_bitcast<_LLong>(__merge), __k,
                                  __m128i(),
                                  _mm_mask_loadu_epi8(__m128i(), __k, __mem));
          }
        else if constexpr (__have_avx512bw_vl && _Np == 16 && sizeof(_Tp) == 2)
          {
            const auto __k = _S_to_bits(__mask)._M_to_bits();
            __merge = _mm256_mask_sub_epi16(
              __vector_bitcast<_LLong>(__merge), __k, __m256i(),
              _mm256_cvtepi8_epi16(_mm_mask_loadu_epi8(__m128i(), __k, __mem)));
          }
        else if constexpr (__have_avx512bw_vl && _Np == 8 && sizeof(_Tp) == 2)
          {
            const auto __k = _S_to_bits(__mask)._M_to_bits();
            __merge = _mm_mask_sub_epi16(
              __vector_bitcast<_LLong>(__merge), __k, __m128i(),
              _mm_cvtepi8_epi16(_mm_mask_loadu_epi8(__m128i(), __k, __mem)));
          }
        else if constexpr (__have_avx512bw_vl && _Np == 8 && sizeof(_Tp) == 4)
          {
            const auto __k = _S_to_bits(__mask)._M_to_bits();
            __merge = __vector_bitcast<_Tp>(_mm256_mask_sub_epi32(
              __vector_bitcast<_LLong>(__merge), __k, __m256i(),
              _mm256_cvtepi8_epi32(
                _mm_mask_loadu_epi8(__m128i(), __k, __mem))));
          }
        else if constexpr (__have_avx512bw_vl && _Np == 4 && sizeof(_Tp) == 4)
          {
            const auto __k = _S_to_bits(__mask)._M_to_bits();
            __merge = __vector_bitcast<_Tp>(_mm_mask_sub_epi32(
              __vector_bitcast<_LLong>(__merge), __k, __m128i(),
              _mm_cvtepi8_epi32(_mm_mask_loadu_epi8(__m128i(), __k, __mem))));
          }
        else if constexpr (__have_avx512bw_vl && _Np == 4 && sizeof(_Tp) == 8)
          {
            const auto __k = _S_to_bits(__mask)._M_to_bits();
            __merge = __vector_bitcast<_Tp>(_mm256_mask_sub_epi64(
              __vector_bitcast<_LLong>(__merge), __k, __m256i(),
              _mm256_cvtepi8_epi64(
                _mm_mask_loadu_epi8(__m128i(), __k, __mem))));
          }
        else if constexpr (__have_avx512bw_vl && _Np == 2 && sizeof(_Tp) == 8)
          {
            const auto __k = _S_to_bits(__mask)._M_to_bits();
            __merge = __vector_bitcast<_Tp>(_mm_mask_sub_epi64(
              __vector_bitcast<_LLong>(__merge), __k, __m128i(),
              _mm_cvtepi8_epi64(_mm_mask_loadu_epi8(__m128i(), __k, __mem))));
          }
        else
          return _Base::_S_masked_load(__merge, __mask, __mem);
        return __merge;
      }

    // _S_store {{{2
    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static void
      _S_store(_SimdWrapper<_Tp, _Np> __v, bool* __mem) noexcept
      {
        if constexpr (__is_avx512_abi<_Abi>())
          {
            if constexpr (__have_avx512bw_vl)
              _CommonImplX86::_S_store<_Np>(
                __vector_bitcast<char>([](auto __data) {
                  if constexpr (_Np <= 16)
                    return _mm_maskz_set1_epi8(__data, 1);
                  else if constexpr (_Np <= 32)
                    return _mm256_maskz_set1_epi8(__data, 1);
                  else
                    return _mm512_maskz_set1_epi8(__data, 1);
                }(__v._M_data)),
                __mem);
            else if constexpr (_Np <= 8)
              _CommonImplX86::_S_store<_Np>(
                __vector_bitcast<char>(
#if defined __x86_64__
                  __make_wrapper<_ULLong>(
                    _pdep_u64(__v._M_data, 0x0101010101010101ULL), 0ull)
#else
                  __make_wrapper<_UInt>(_pdep_u32(__v._M_data, 0x01010101U),
                                        _pdep_u32(__v._M_data >> 4,
                                                  0x01010101U))
#endif
                  ),
                __mem);
            else if constexpr (_Np <= 16)
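              // Without AVX512BW/VL: splat 1 into the selected epi32 lanes
              // and truncate-store their low bytes to __mem.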
              _mm512_mask_cvtepi32_storeu_epi8(
                __mem, 0xffffu >> (16 - _Np),
                _mm512_maskz_set1_epi32(__v._M_data, 1));
            else
              __assert_unreachable<_Tp>();
          }
        else if constexpr (__is_sse_abi<_Abi>()) //{{{
          {
            if constexpr (_Np == 2 && sizeof(_Tp) == 8)
              {
                const auto __k = __vector_bitcast<int>(__v);
                __mem[0] = -__k[1];
                __mem[1] = -__k[3];
              }
            else if constexpr (_Np <= 4 && sizeof(_Tp) == 4)
              {
                if constexpr (__have_sse2)
                  {
                    const unsigned __bool4
                      = __vector_bitcast<_UInt>(_mm_packs_epi16(
                          _mm_packs_epi32(__intrin_bitcast<__m128i>(
                                            __to_intrin(__v)),
                                          __m128i()),
                          __m128i()))[0]
                        & 0x01010101u;
                    __builtin_memcpy(__mem, &__bool4, _Np);
                  }
                else if constexpr (__have_mmx)
                  {
                    const __m64 __k = _mm_cvtps_pi8(
                      __and(__to_intrin(__v), _mm_set1_ps(1.f)));
                    __builtin_memcpy(__mem, &__k, _Np);
                    _mm_empty();
                  }
                else
                  return _Base::_S_store(__v, __mem);
              }
            else if constexpr (_Np <= 8 && sizeof(_Tp) == 2)
              {
                _CommonImplX86::_S_store<_Np>(
                  __vector_bitcast<char>(_mm_packs_epi16(
                    __to_intrin(__vector_bitcast<_UShort>(__v) >> 15),
                    __m128i())),
                  __mem);
              }
            else if constexpr (_Np <= 16 && sizeof(_Tp) == 1)
              _CommonImplX86::_S_store<_Np>(__v._M_data & 1, __mem);
            else
              __assert_unreachable<_Tp>();
          } // }}}
        else if constexpr (__is_avx_abi<_Abi>()) // {{{
          {
            if constexpr (_Np <= 4 && sizeof(_Tp) == 8)
              {
                auto __k = __intrin_bitcast<__m256i>(__to_intrin(__v));
                int __bool4;
                if constexpr (__have_avx2)
                  __bool4 = _mm256_movemask_epi8(__k);
                else
                  __bool4 = (_mm_movemask_epi8(__lo128(__k))
                             | (_mm_movemask_epi8(__hi128(__k)) << 16));
                __bool4 &= 0x01010101;
                __builtin_memcpy(__mem, &__bool4, _Np);
              }
            else if constexpr (_Np <= 8 && sizeof(_Tp) == 4)
              {
                const auto __k = __intrin_bitcast<__m256i>(__to_intrin(__v));
                const auto __k2
                  = _mm_srli_epi16(_mm_packs_epi16(__lo128(__k), __hi128(__k)),
                                   15);
                const auto __k3
                  = __vector_bitcast<char>(_mm_packs_epi16(__k2, __m128i()));
                _CommonImplX86::_S_store<_Np>(__k3, __mem);
              }
            else if constexpr (_Np <= 16 && sizeof(_Tp) == 2)
              {
                if constexpr (__have_avx2)
                  {
                    const auto __x = _mm256_srli_epi16(__to_intrin(__v), 15);
                    const auto __bools = __vector_bitcast<char>(
                      _mm_packs_epi16(__lo128(__x), __hi128(__x)));
                    _CommonImplX86::_S_store<_Np>(__bools, __mem);
                  }
                else
                  {
                    const auto __bools
                      = 1
                        & __vector_bitcast<_UChar>(
                            _mm_packs_epi16(__lo128(__to_intrin(__v)),
                                            __hi128(__to_intrin(__v))));
                    _CommonImplX86::_S_store<_Np>(__bools, __mem);
                  }
              }
            else if constexpr (_Np <= 32 && sizeof(_Tp) == 1)
              _CommonImplX86::_S_store<_Np>(1 & __v._M_data, __mem);
            else
              __assert_unreachable<_Tp>();
          } // }}}
        else
          __assert_unreachable<_Tp>();
      }

    // _S_masked_store {{{2
    template <typename _Tp, size_t _Np>
      static inline void
      _S_masked_store(const _SimdWrapper<_Tp, _Np> __v, bool* __mem,
                      const _SimdWrapper<_Tp, _Np> __k) noexcept
      {
        if constexpr (__is_avx512_abi<_Abi>())
          {
            static_assert(is_same_v<_Tp, bool>);
            if constexpr (_Np <= 16 && __have_avx512bw_vl)
              _mm_mask_storeu_epi8(__mem, __k, _mm_maskz_set1_epi8(__v, 1));
            else if constexpr (_Np <= 16)
              _mm512_mask_cvtepi32_storeu_epi8(
                __mem, __k, _mm512_maskz_set1_epi32(__v, 1));
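            // More than 16 elements require AVX512BW for the byte-granularity
            // masked stores below.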
            else if constexpr (_Np <= 32 && __have_avx512bw_vl)
              _mm256_mask_storeu_epi8(__mem, __k,
                                      _mm256_maskz_set1_epi8(__v, 1));
            else if constexpr (_Np <= 32 && __have_avx512bw)
              _mm256_mask_storeu_epi8(__mem, __k,
                                      __lo256(_mm512_maskz_set1_epi8(__v, 1)));
            else if constexpr (_Np <= 64 && __have_avx512bw)
              _mm512_mask_storeu_epi8(__mem, __k,
                                      _mm512_maskz_set1_epi8(__v, 1));
            else
              __assert_unreachable<_Tp>();
          }
        else
          _Base::_S_masked_store(__v, __mem, __k);
      }

    // logical and bitwise operators {{{2
    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
      _S_logical_and(const _SimdWrapper<_Tp, _Np>& __x,
                     const _SimdWrapper<_Tp, _Np>& __y)
      {
        if constexpr (is_same_v<_Tp, bool>)
          {
            if constexpr (__have_avx512dq && _Np <= 8)
              return _kand_mask8(__x._M_data, __y._M_data);
            else if constexpr (_Np <= 16)
              return _kand_mask16(__x._M_data, __y._M_data);
            else if constexpr (__have_avx512bw && _Np <= 32)
              return _kand_mask32(__x._M_data, __y._M_data);
            else if constexpr (__have_avx512bw && _Np <= 64)
              return _kand_mask64(__x._M_data, __y._M_data);
            else
              __assert_unreachable<_Tp>();
          }
        else
          return _Base::_S_logical_and(__x, __y);
      }

    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
      _S_logical_or(const _SimdWrapper<_Tp, _Np>& __x,
                    const _SimdWrapper<_Tp, _Np>& __y)
      {
        if constexpr (is_same_v<_Tp, bool>)
          {
            if constexpr (__have_avx512dq && _Np <= 8)
              return _kor_mask8(__x._M_data, __y._M_data);
            else if constexpr (_Np <= 16)
              return _kor_mask16(__x._M_data, __y._M_data);
            else if constexpr (__have_avx512bw && _Np <= 32)
              return _kor_mask32(__x._M_data, __y._M_data);
            else if constexpr (__have_avx512bw && _Np <= 64)
              return _kor_mask64(__x._M_data, __y._M_data);
            else
              __assert_unreachable<_Tp>();
          }
        else
          return _Base::_S_logical_or(__x, __y);
      }

    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
      _S_bit_not(const _SimdWrapper<_Tp, _Np>& __x)
      {
        if constexpr (is_same_v<_Tp, bool>)
          {
            if constexpr (__have_avx512dq && _Np <= 8)
              return _kandn_mask8(__x._M_data,
                                  _Abi::template __implicit_mask_n<_Np>());
            else if constexpr (_Np <= 16)
              return _kandn_mask16(__x._M_data,
                                   _Abi::template __implicit_mask_n<_Np>());
            else if constexpr (__have_avx512bw && _Np <= 32)
              return _kandn_mask32(__x._M_data,
                                   _Abi::template __implicit_mask_n<_Np>());
            else if constexpr (__have_avx512bw && _Np <= 64)
              return _kandn_mask64(__x._M_data,
                                   _Abi::template __implicit_mask_n<_Np>());
            else
              __assert_unreachable<_Tp>();
          }
        else
          return _Base::_S_bit_not(__x);
      }

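    // For bool (AVX512 k-register) masks the bitwise operators below map to
    // the same k-instructions as the logical operators above; only the
    // non-bool vector case dispatches to a different _Base implementation.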
    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
      _S_bit_and(const _SimdWrapper<_Tp, _Np>& __x,
                 const _SimdWrapper<_Tp, _Np>& __y)
      {
        if constexpr (is_same_v<_Tp, bool>)
          {
            if constexpr (__have_avx512dq && _Np <= 8)
              return _kand_mask8(__x._M_data, __y._M_data);
            else if constexpr (_Np <= 16)
              return _kand_mask16(__x._M_data, __y._M_data);
            else if constexpr (__have_avx512bw && _Np <= 32)
              return _kand_mask32(__x._M_data, __y._M_data);
            else if constexpr (__have_avx512bw && _Np <= 64)
              return _kand_mask64(__x._M_data, __y._M_data);
            else
              __assert_unreachable<_Tp>();
          }
        else
          return _Base::_S_bit_and(__x, __y);
      }

    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
      _S_bit_or(const _SimdWrapper<_Tp, _Np>& __x,
                const _SimdWrapper<_Tp, _Np>& __y)
      {
        if constexpr (is_same_v<_Tp, bool>)
          {
            if constexpr (__have_avx512dq && _Np <= 8)
              return _kor_mask8(__x._M_data, __y._M_data);
            else if constexpr (_Np <= 16)
              return _kor_mask16(__x._M_data, __y._M_data);
            else if constexpr (__have_avx512bw && _Np <= 32)
              return _kor_mask32(__x._M_data, __y._M_data);
            else if constexpr (__have_avx512bw && _Np <= 64)
              return _kor_mask64(__x._M_data, __y._M_data);
            else
              __assert_unreachable<_Tp>();
          }
        else
          return _Base::_S_bit_or(__x, __y);
      }

    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
      _S_bit_xor(const _SimdWrapper<_Tp, _Np>& __x,
                 const _SimdWrapper<_Tp, _Np>& __y)
      {
        if constexpr (is_same_v<_Tp, bool>)
          {
            if constexpr (__have_avx512dq && _Np <= 8)
              return _kxor_mask8(__x._M_data, __y._M_data);
            else if constexpr (_Np <= 16)
              return _kxor_mask16(__x._M_data, __y._M_data);
            else if constexpr (__have_avx512bw && _Np <= 32)
              return _kxor_mask32(__x._M_data, __y._M_data);
            else if constexpr (__have_avx512bw && _Np <= 64)
              return _kxor_mask64(__x._M_data, __y._M_data);
            else
              __assert_unreachable<_Tp>();
          }
        else
          return _Base::_S_bit_xor(__x, __y);
      }

    //}}}2
    // _S_masked_assign{{{
    template <size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static void
      _S_masked_assign(_SimdWrapper<bool, _Np> __k,
                       _SimdWrapper<bool, _Np>& __lhs,
                       _SimdWrapper<bool, _Np> __rhs)
      {
        __lhs._M_data
          = (~__k._M_data & __lhs._M_data) | (__k._M_data & __rhs._M_data);
      }

    template <size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static void
      _S_masked_assign(_SimdWrapper<bool, _Np> __k,
                       _SimdWrapper<bool, _Np>& __lhs, bool __rhs)
      {
        if (__rhs)
          __lhs._M_data = __k._M_data | __lhs._M_data;
        else
          __lhs._M_data = ~__k._M_data & __lhs._M_data;
      }

    using _MaskImplBuiltin<_Abi>::_S_masked_assign;

    //}}}
    // _S_all_of {{{
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static bool
      _S_all_of(simd_mask<_Tp, _Abi> __k)
      {
        if constexpr (__is_sse_abi<_Abi>() || __is_avx_abi<_Abi>())
          {
            constexpr size_t _Np = simd_size_v<_Tp, _Abi>;
            using _TI = __intrinsic_type_t<_Tp, _Np>;
            const _TI __a = reinterpret_cast<_TI>(__to_intrin(__data(__k)));
            if constexpr (__have_sse4_1)
              {
                _GLIBCXX_SIMD_USE_CONSTEXPR _TI __b
                  = _Abi::template _S_implicit_mask_intrin<_Tp>();
                return 0 != __testc(__a, __b);
              }
            else if constexpr (is_same_v<_Tp, float>)
              return (_mm_movemask_ps(__a) & ((1 << _Np) - 1))
                     == (1 << _Np) - 1;
            else if constexpr (is_same_v<_Tp, double>)
              return (_mm_movemask_pd(__a) & ((1 << _Np) - 1))
                     == (1 << _Np) - 1;
            else
              return (_mm_movemask_epi8(__a)
                      & ((1 << (_Np * sizeof(_Tp))) - 1))
                     == (1 << (_Np * sizeof(_Tp))) - 1;
          }
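        // For AVX512 ABIs the mask is a k register; kortest*_c of __kk with
        // the complement of the implicit (partial-ABI) mask is set iff every
        // in-range bit of __kk is 1.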
        else if constexpr (__is_avx512_abi<_Abi>())
          {
            constexpr auto _Mask = _Abi::template _S_implicit_mask<_Tp>();
            const auto __kk = __k._M_data._M_data;
            if constexpr (sizeof(__kk) == 1)
              {
                if constexpr (__have_avx512dq)
                  return _kortestc_mask8_u8(__kk, _Mask == 0xff
                                                    ? __kk
                                                    : __mmask8(~_Mask));
                else
                  return _kortestc_mask16_u8(__kk, __mmask16(~_Mask));
              }
            else if constexpr (sizeof(__kk) == 2)
              return _kortestc_mask16_u8(__kk, _Mask == 0xffff
                                                 ? __kk
                                                 : __mmask16(~_Mask));
            else if constexpr (sizeof(__kk) == 4 && __have_avx512bw)
              return _kortestc_mask32_u8(__kk, _Mask == 0xffffffffU
                                                 ? __kk
                                                 : __mmask32(~_Mask));
            else if constexpr (sizeof(__kk) == 8 && __have_avx512bw)
              return _kortestc_mask64_u8(__kk, _Mask == 0xffffffffffffffffULL
                                                 ? __kk
                                                 : __mmask64(~_Mask));
            else
              __assert_unreachable<_Tp>();
          }
      }

    // }}}
    // _S_any_of {{{
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static bool
      _S_any_of(simd_mask<_Tp, _Abi> __k)
      {
        if constexpr (__is_sse_abi<_Abi>() || __is_avx_abi<_Abi>())
          {
            constexpr size_t _Np = simd_size_v<_Tp, _Abi>;
            using _TI = __intrinsic_type_t<_Tp, _Np>;
            const _TI __a = reinterpret_cast<_TI>(__to_intrin(__data(__k)));
            if constexpr (__have_sse4_1)
              {
                if constexpr (_Abi::template _S_is_partial<_Tp>
                              || sizeof(__k) < 16)
                  {
                    _GLIBCXX_SIMD_USE_CONSTEXPR _TI __b
                      = _Abi::template _S_implicit_mask_intrin<_Tp>();
                    return 0 == __testz(__a, __b);
                  }
                else
                  return 0 == __testz(__a, __a);
              }
            else if constexpr (is_same_v<_Tp, float>)
              return (_mm_movemask_ps(__a) & ((1 << _Np) - 1)) != 0;
            else if constexpr (is_same_v<_Tp, double>)
              return (_mm_movemask_pd(__a) & ((1 << _Np) - 1)) != 0;
            else
              return (_mm_movemask_epi8(__a)
                      & ((1 << (_Np * sizeof(_Tp))) - 1))
                     != 0;
          }
        else if constexpr (__is_avx512_abi<_Abi>())
          return (__k._M_data._M_data & _Abi::template _S_implicit_mask<_Tp>())
                 != 0;
      }

    // }}}
    // _S_none_of {{{
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static bool
      _S_none_of(simd_mask<_Tp, _Abi> __k)
      {
        if constexpr (__is_sse_abi<_Abi>() || __is_avx_abi<_Abi>())
          {
            constexpr size_t _Np = simd_size_v<_Tp, _Abi>;
            using _TI = __intrinsic_type_t<_Tp, _Np>;
            const _TI __a = reinterpret_cast<_TI>(__to_intrin(__data(__k)));
            if constexpr (__have_sse4_1)
              {
                if constexpr (_Abi::template _S_is_partial<_Tp>
                              || sizeof(__k) < 16)
                  {
                    _GLIBCXX_SIMD_USE_CONSTEXPR _TI __b
                      = _Abi::template _S_implicit_mask_intrin<_Tp>();
                    return 0 != __testz(__a, __b);
                  }
                else
                  return 0 != __testz(__a, __a);
              }
            else if constexpr (is_same_v<_Tp, float>)
              return (__movemask(__a) & ((1 << _Np) - 1)) == 0;
            else if constexpr (is_same_v<_Tp, double>)
              return (__movemask(__a) & ((1 << _Np) - 1)) == 0;
            else
              return (__movemask(__a) & int((1ull << (_Np * sizeof(_Tp))) - 1))
                     == 0;
          }
        else if constexpr (__is_avx512_abi<_Abi>())
          return (__k._M_data._M_data & _Abi::template _S_implicit_mask<_Tp>())
                 == 0;
      }

    // }}}
    // _S_some_of {{{
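    // Returns true iff some, but not all, of the first _Np elements of __k
    // are set.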
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static bool
      _S_some_of(simd_mask<_Tp, _Abi> __k)
      {
        if constexpr (__is_sse_abi<_Abi>() || __is_avx_abi<_Abi>())
          {
            constexpr size_t _Np = simd_size_v<_Tp, _Abi>;
            using _TI = __intrinsic_type_t<_Tp, _Np>;
            const _TI __a = reinterpret_cast<_TI>(__to_intrin(__data(__k)));
            if constexpr (__have_sse4_1)
              {
                _GLIBCXX_SIMD_USE_CONSTEXPR _TI __b
                  = _Abi::template _S_implicit_mask_intrin<_Tp>();
                return 0 != __testnzc(__a, __b);
              }
            else if constexpr (is_same_v<_Tp, float>)
              {
                constexpr int __allbits = (1 << _Np) - 1;
                const auto __tmp = _mm_movemask_ps(__a) & __allbits;
                return __tmp > 0 && __tmp < __allbits;
              }
            else if constexpr (is_same_v<_Tp, double>)
              {
                constexpr int __allbits = (1 << _Np) - 1;
                const auto __tmp = _mm_movemask_pd(__a) & __allbits;
                return __tmp > 0 && __tmp < __allbits;
              }
            else
              {
                constexpr int __allbits = (1 << (_Np * sizeof(_Tp))) - 1;
                const auto __tmp = _mm_movemask_epi8(__a) & __allbits;
                return __tmp > 0 && __tmp < __allbits;
              }
          }
        else if constexpr (__is_avx512_abi<_Abi>())
          return _S_any_of(__k) && !_S_all_of(__k);
        else
          __assert_unreachable<_Tp>();
      }

    // }}}
    // _S_popcount {{{
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static int
      _S_popcount(simd_mask<_Tp, _Abi> __k)
      {
        constexpr size_t _Np = simd_size_v<_Tp, _Abi>;
        const auto __kk = _Abi::_S_masked(__k._M_data)._M_data;
        if constexpr (__is_avx512_abi<_Abi>())
          {
            if constexpr (_Np > 32)
              return __builtin_popcountll(__kk);
            else
              return __builtin_popcount(__kk);
          }
        else
          {
            if constexpr (__have_popcnt)
              {
                int __bits
                  = __movemask(__to_intrin(__vector_bitcast<_Tp>(__kk)));
                const int __count = __builtin_popcount(__bits);
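                // For integral _Tp, __movemask is the per-byte (epi8)
                // movemask, so every set element contributes sizeof(_Tp)
                // bits and the popcount must be divided by the element size.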
                return is_integral_v<_Tp> ? __count / sizeof(_Tp) : __count;
              }
            else if constexpr (_Np == 2 && sizeof(_Tp) == 8)
              {
                const int __mask = _mm_movemask_pd(__auto_bitcast(__kk));
                return __mask - (__mask >> 1);
              }
            else if constexpr (_Np <= 4 && sizeof(_Tp) == 8)
              {
                auto __x = -(__lo128(__kk) + __hi128(__kk));
                return __x[0] + __x[1];
              }
            else if constexpr (_Np <= 4 && sizeof(_Tp) == 4)
              {
                if constexpr (__have_sse2)
                  {
                    __m128i __x = __intrin_bitcast<__m128i>(__to_intrin(__kk));
                    __x = _mm_add_epi32(
                      __x, _mm_shuffle_epi32(__x, _MM_SHUFFLE(0, 1, 2, 3)));
                    __x = _mm_add_epi32(
                      __x, _mm_shufflelo_epi16(__x, _MM_SHUFFLE(1, 0, 3, 2)));
                    return -_mm_cvtsi128_si32(__x);
                  }
                else
                  return __builtin_popcount(
                    _mm_movemask_ps(__auto_bitcast(__kk)));
              }
            else if constexpr (_Np <= 8 && sizeof(_Tp) == 2)
              {
                auto __x = __to_intrin(__kk);
                __x = _mm_add_epi16(
                  __x, _mm_shuffle_epi32(__x, _MM_SHUFFLE(0, 1, 2, 3)));
                __x = _mm_add_epi16(
                  __x, _mm_shufflelo_epi16(__x, _MM_SHUFFLE(0, 1, 2, 3)));
                __x = _mm_add_epi16(
                  __x, _mm_shufflelo_epi16(__x, _MM_SHUFFLE(2, 3, 0, 1)));
                return -short(_mm_extract_epi16(__x, 0));
              }
            else if constexpr (_Np <= 16 && sizeof(_Tp) == 1)
              {
                auto __x = __to_intrin(__kk);
                __x = _mm_add_epi8(
                  __x, _mm_shuffle_epi32(__x, _MM_SHUFFLE(0, 1, 2, 3)));
                __x = _mm_add_epi8(
                  __x, _mm_shufflelo_epi16(__x, _MM_SHUFFLE(0, 1, 2, 3)));
                __x = _mm_add_epi8(
                  __x, _mm_shufflelo_epi16(__x, _MM_SHUFFLE(2, 3, 0, 1)));
                auto __y = -__vector_bitcast<_UChar>(__x);
                if constexpr (__have_sse4_1)
                  return __y[0] + __y[1];
                else
                  {
                    unsigned __z = _mm_extract_epi16(__to_intrin(__y), 0);
                    return (__z & 0xff) + (__z >> 8);
                  }
              }
            else if constexpr (sizeof(__kk) == 32)
              {
                // The following works only as long as the implementations
                // above use a summation of the mask values.
                using _I = __int_for_sizeof_t<_Tp>;
                const auto __as_int = __vector_bitcast<_I>(__kk);
                return _MaskImplX86<simd_abi::__sse>::_S_popcount(
                  simd_mask<_I, simd_abi::__sse>(__private_init,
                                                 __lo128(__as_int)
                                                   + __hi128(__as_int)));
              }
            else
              __assert_unreachable<_Tp>();
          }
      }

    // }}}
    // _S_find_first_set {{{
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static int
      _S_find_first_set(simd_mask<_Tp, _Abi> __k)
      {
        if constexpr (__is_avx512_abi<_Abi>())
          return std::__countr_zero(__k._M_data._M_data);
        else
          return _Base::_S_find_first_set(__k);
      }

    // }}}
    // _S_find_last_set {{{
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static int
      _S_find_last_set(simd_mask<_Tp, _Abi> __k)
      {
        if constexpr (__is_avx512_abi<_Abi>())
          return std::__bit_width(__k._M_data._M_data) - 1;
        else
          return _Base::_S_find_last_set(__k);
      }

    // }}}
  };

// }}}

_GLIBCXX_SIMD_END_NAMESPACE
#endif // __cplusplus >= 201703L
#endif // _GLIBCXX_EXPERIMENTAL_SIMD_X86_H_

// vim: foldmethod=marker sw=2 noet ts=8 sts=2 tw=80