1 /*************************************************************************** 2 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * 3 * Martin Renou * 4 * Copyright (c) QuantStack * 5 * * 6 * Distributed under the terms of the BSD 3-Clause License. * 7 * * 8 * The full license is in the file LICENSE, distributed with this software. * 9 ****************************************************************************/ 10 11 #ifndef XSIMD_AVX512_INT16_HPP 12 #define XSIMD_AVX512_INT16_HPP 13 14 #include "xsimd_avx512_bool.hpp" 15 #include "xsimd_avx512_int_base.hpp" 16 17 namespace xsimd 18 { 19 20 #define XSIMD_APPLY_AVX2_FUNCTION_INT16(func, avx_lhs, avx_rhs) \ 21 XSIMD_APPLY_AVX2_FUNCTION(16, func, avx_lhs, avx_rhs) 22 23 /*************************** 24 * batch_bool<int16_t, 32> * 25 ***************************/ 26 27 template <> 28 struct simd_batch_traits<batch_bool<int16_t, 32>> 29 { 30 using value_type = int16_t; 31 static constexpr std::size_t size = 32; 32 using batch_type = batch<int16_t, 32>; 33 static constexpr std::size_t align = 64; 34 }; 35 36 template <> 37 struct simd_batch_traits<batch_bool<uint16_t, 32>> 38 { 39 using value_type = uint16_t; 40 static constexpr std::size_t size = 32; 41 using batch_type = batch<uint16_t, 32>; 42 static constexpr std::size_t align = 64; 43 }; 44 45 #if defined(XSIMD_AVX512BW_AVAILABLE) 46 47 template <> 48 class batch_bool<int16_t, 32> : 49 public batch_bool_avx512<__mmask32, batch_bool<int16_t, 32>> 50 { 51 public: 52 53 using base_class = batch_bool_avx512<__mmask32, batch_bool<int16_t, 32>>; 54 using base_class::base_class; 55 }; 56 57 template <> 58 class batch_bool<uint16_t, 32> : 59 public batch_bool_avx512<__mmask32, batch_bool<uint16_t, 32>> 60 { 61 public: 62 63 using base_class = batch_bool_avx512<__mmask32, batch_bool<uint16_t, 32>>; 64 using base_class::base_class; 65 }; 66 67 namespace detail 68 { 69 template <> 70 struct batch_bool_kernel<int16_t, 32> 71 : batch_bool_kernel_avx512<int16_t, 32> 72 { 73 }; 74 75 template <> 76 struct batch_bool_kernel<uint16_t, 32> 77 : batch_bool_kernel_avx512<uint16_t, 32> 78 { 79 }; 80 } 81 82 #else 83 84 template <> 85 class batch_bool<int16_t, 32> : public avx512_fallback_batch_bool<int16_t, 32> 86 { 87 public: 88 89 using base_class = avx512_fallback_batch_bool<int16_t, 32>; 90 using base_class::base_class; 91 }; 92 93 template <> 94 class batch_bool<uint16_t, 32> : public avx512_fallback_batch_bool<uint16_t, 32> 95 { 96 public: 97 98 using base_class = avx512_fallback_batch_bool<uint16_t, 32>; 99 using base_class::base_class; 100 }; 101 102 103 namespace detail 104 { 105 template <> 106 struct batch_bool_kernel<int16_t, 32> 107 : avx512_fallback_batch_bool_kernel<int16_t, 32> 108 { 109 }; 110 111 template <> 112 struct batch_bool_kernel<uint16_t, 32> 113 : avx512_fallback_batch_bool_kernel<uint16_t, 32> 114 { 115 }; 116 } 117 118 #endif 119 120 /********************** 121 * batch<int16_t, 32> * 122 **********************/ 123 124 template <> 125 struct simd_batch_traits<batch<int16_t, 32>> 126 { 127 using value_type = int16_t; 128 static constexpr std::size_t size = 32; 129 using batch_bool_type = batch_bool<int16_t, 32>; 130 static constexpr std::size_t align = 64; 131 using storage_type = __m512i; 132 }; 133 134 template <> 135 struct simd_batch_traits<batch<uint16_t, 32>> 136 { 137 using value_type = uint16_t; 138 static constexpr std::size_t size = 32; 139 using batch_bool_type = batch_bool<uint16_t, 32>; 140 static constexpr std::size_t align = 64; 141 using storage_type = __m512i; 142 }; 143 144 template <> 145 class batch<int16_t, 32> : public avx512_int_batch<int16_t, 32> 146 { 147 public: 148 149 using base_class = avx512_int_batch; 150 using base_class::base_class; 151 using base_class::load_aligned; 152 using base_class::load_unaligned; 153 using base_class::store_aligned; 154 using base_class::store_unaligned; 155 156 batch() = default; 157 batch(const char * src)158 explicit batch(const char* src) 159 : batch(reinterpret_cast<const int16_t*>(src)) 160 { 161 } 162 batch(const char * src,aligned_mode)163 batch(const char* src, aligned_mode) 164 : batch(reinterpret_cast<const int16_t*>(src), aligned_mode{}) 165 { 166 } 167 batch(const char * src,unaligned_mode)168 batch(const char* src, unaligned_mode) 169 : batch(reinterpret_cast<const int16_t*>(src), unaligned_mode{}) 170 { 171 } 172 173 XSIMD_DECLARE_LOAD_STORE_INT16(int16_t, 32) 174 XSIMD_DECLARE_LOAD_STORE_LONG(int16_t, 32) 175 }; 176 177 template <> 178 class batch<uint16_t, 32> : public avx512_int_batch<uint16_t, 32> 179 { 180 public: 181 182 using base_class = avx512_int_batch; 183 using base_class::base_class; 184 using base_class::load_aligned; 185 using base_class::load_unaligned; 186 using base_class::store_aligned; 187 using base_class::store_unaligned; 188 189 XSIMD_DECLARE_LOAD_STORE_INT16(uint16_t, 32) 190 XSIMD_DECLARE_LOAD_STORE_LONG(uint16_t, 32) 191 }; 192 193 batch<int16_t, 32> operator<<(const batch<int16_t, 32>& lhs, int32_t rhs); 194 batch<int16_t, 32> operator>>(const batch<int16_t, 32>& lhs, int32_t rhs); 195 batch<int16_t, 32> operator<<(const batch<int16_t, 32>& lhs, const batch<int16_t, 32>& rhs); 196 batch<int16_t, 32> operator>>(const batch<int16_t, 32>& lhs, const batch<int16_t, 32>& rhs); 197 batch<uint16_t, 32> operator<<(const batch<uint16_t, 32>& lhs, int32_t rhs); 198 batch<uint16_t, 32> operator>>(const batch<uint16_t, 32>& lhs, int32_t rhs); 199 batch<uint16_t, 32> operator<<(const batch<uint16_t, 32>& lhs, const batch<int16_t, 32>& rhs); 200 batch<uint16_t, 32> operator>>(const batch<uint16_t, 32>& lhs, const batch<int16_t, 32>& rhs); 201 202 /************************************* 203 * batch<int16_t, 32> implementation * 204 *************************************/ 205 206 namespace detail 207 { 208 template <class T> 209 struct avx512_int16_batch_kernel 210 : avx512_int_kernel_base<batch<T, 32>> 211 { 212 using batch_type = batch<T, 32>; 213 using value_type = T; 214 using batch_bool_type = batch_bool<T, 32>; 215 negxsimd::detail::avx512_int16_batch_kernel216 static batch_type neg(const batch_type& rhs) 217 { 218 #if defined(XSIMD_AVX512BW_AVAILABLE) 219 return _mm512_sub_epi16(_mm512_setzero_si512(), rhs); 220 #else 221 XSIMD_SPLIT_AVX512(rhs); 222 __m256i res_low = _mm256_sub_epi16(_mm256_setzero_si256(), rhs_low); 223 __m256i res_high = _mm256_sub_epi16(_mm256_setzero_si256(), rhs_high); 224 XSIMD_RETURN_MERGED_AVX(res_low, res_high); 225 #endif 226 } 227 addxsimd::detail::avx512_int16_batch_kernel228 static batch_type add(const batch_type& lhs, const batch_type& rhs) 229 { 230 #if defined(XSIMD_AVX512BW_AVAILABLE) 231 return _mm512_add_epi16(lhs, rhs); 232 #else 233 XSIMD_APPLY_AVX2_FUNCTION_INT16(add, lhs, rhs); 234 #endif 235 } 236 subxsimd::detail::avx512_int16_batch_kernel237 static batch_type sub(const batch_type& lhs, const batch_type& rhs) 238 { 239 #if defined(XSIMD_AVX512BW_AVAILABLE) 240 return _mm512_sub_epi16(lhs, rhs); 241 #else 242 XSIMD_APPLY_AVX2_FUNCTION_INT16(sub, lhs, rhs); 243 #endif 244 } 245 saddxsimd::detail::avx512_int16_batch_kernel246 static batch_type sadd(const batch_type &lhs, const batch_type &rhs) 247 { 248 #if defined(XSIMD_AVX512BW_AVAILABLE) 249 return _mm512_adds_epi16(lhs, rhs); 250 #else 251 XSIMD_APPLY_AVX2_FUNCTION_INT16(sadd, lhs, rhs); 252 #endif 253 } 254 ssubxsimd::detail::avx512_int16_batch_kernel255 static batch_type ssub(const batch_type &lhs, const batch_type &rhs) 256 { 257 #if defined(XSIMD_AVX512BW_AVAILABLE) 258 return _mm512_subs_epi16(lhs, rhs); 259 #else 260 XSIMD_APPLY_AVX2_FUNCTION_INT16(ssub, lhs, rhs); 261 #endif 262 } 263 mulxsimd::detail::avx512_int16_batch_kernel264 static batch_type mul(const batch_type& lhs, const batch_type& rhs) 265 { 266 #if defined(XSIMD_AVX512BW_AVAILABLE) 267 return _mm512_mullo_epi16(lhs, rhs); 268 #else 269 XSIMD_APPLY_AVX2_FUNCTION_INT16(mul, lhs, rhs); 270 #endif 271 } 272 divxsimd::detail::avx512_int16_batch_kernel273 static batch_type div(const batch_type& lhs, const batch_type& rhs) 274 { 275 XSIMD_APPLY_AVX2_FUNCTION_INT16(div, lhs, rhs); 276 } 277 modxsimd::detail::avx512_int16_batch_kernel278 static batch_type mod(const batch_type& lhs, const batch_type& rhs) 279 { 280 XSIMD_MACRO_UNROLL_BINARY(%); 281 } 282 bitwise_andxsimd::detail::avx512_int16_batch_kernel283 static batch_type bitwise_and(const batch_type& lhs, const batch_type& rhs) 284 { 285 return _mm512_and_si512(lhs, rhs); 286 } 287 bitwise_orxsimd::detail::avx512_int16_batch_kernel288 static batch_type bitwise_or(const batch_type& lhs, const batch_type& rhs) 289 { 290 return _mm512_or_si512(lhs, rhs); 291 } 292 bitwise_xorxsimd::detail::avx512_int16_batch_kernel293 static batch_type bitwise_xor(const batch_type& lhs, const batch_type& rhs) 294 { 295 return _mm512_xor_si512(lhs, rhs); 296 } 297 bitwise_notxsimd::detail::avx512_int16_batch_kernel298 static batch_type bitwise_not(const batch_type& rhs) 299 { 300 return _mm512_xor_si512(rhs, _mm512_set1_epi16(-1)); 301 } 302 bitwise_andnotxsimd::detail::avx512_int16_batch_kernel303 static batch_type bitwise_andnot(const batch_type& lhs, const batch_type& rhs) 304 { 305 return _mm512_andnot_si512(lhs, rhs); 306 } 307 fmaxsimd::detail::avx512_int16_batch_kernel308 static batch_type fma(const batch_type& x, const batch_type& y, const batch_type& z) 309 { 310 return x * y + z; 311 } 312 fmsxsimd::detail::avx512_int16_batch_kernel313 static batch_type fms(const batch_type& x, const batch_type& y, const batch_type& z) 314 { 315 return x * y - z; 316 } 317 fnmaxsimd::detail::avx512_int16_batch_kernel318 static batch_type fnma(const batch_type& x, const batch_type& y, const batch_type& z) 319 { 320 return -x * y + z; 321 } 322 fnmsxsimd::detail::avx512_int16_batch_kernel323 static batch_type fnms(const batch_type& x, const batch_type& y, const batch_type& z) 324 { 325 return -x * y - z; 326 } 327 haddxsimd::detail::avx512_int16_batch_kernel328 static value_type hadd(const batch_type& rhs) 329 { 330 XSIMD_SPLIT_AVX512(rhs); 331 auto tmp = batch<value_type, 16>(rhs_low) + batch<value_type, 16>(rhs_high); 332 return xsimd::hadd(batch<value_type, 16>(tmp)); 333 } 334 selectxsimd::detail::avx512_int16_batch_kernel335 static batch_type select(const batch_bool_type& cond, const batch_type& a, const batch_type& b) 336 { 337 #if defined(XSIMD_AVX512BW_AVAILABLE) && !defined(_MSC_VER) 338 auto res = _mm512_mask_blend_epi16((__mmask32)cond, (__m512i)b, (__m512i)a); 339 return batch_type(res); 340 #else 341 __m512i mcond = _mm512_maskz_broadcastw_epi16((__mmask32)cond, _mm_set1_epi32(~0)); 342 XSIMD_SPLIT_AVX512(mcond); 343 XSIMD_SPLIT_AVX512(a); 344 XSIMD_SPLIT_AVX512(b); 345 346 auto res_lo = _mm256_blendv_epi8(b_low, a_low, mcond_low); 347 auto res_hi = _mm256_blendv_epi8(b_high, a_high, mcond_high); 348 349 XSIMD_RETURN_MERGED_AVX(res_lo, res_hi); 350 #endif 351 } 352 zip_loxsimd::detail::avx512_int16_batch_kernel353 static batch_type zip_lo(const batch_type& lhs, const batch_type& rhs) 354 { 355 return _mm512_unpacklo_epi16(lhs, rhs); 356 } 357 zip_hixsimd::detail::avx512_int16_batch_kernel358 static batch_type zip_hi(const batch_type& lhs, const batch_type& rhs) 359 { 360 return _mm512_unpackhi_epi16(lhs, rhs); 361 } 362 extract_pairxsimd::detail::avx512_int16_batch_kernel363 static batch_type extract_pair(const batch_type& v_lhs, const batch_type& v_rhs, const int num) 364 { 365 #if defined(XSIMD_AVX512BW_AVAILABLE) 366 const batch_type lhs = v_rhs; 367 const batch_type rhs = v_lhs; 368 const int n = 2 * num; 369 switch(n) 370 { 371 case 0: return rhs; 372 XSIMD_REPEAT_64_v2(_mm512_alignr_epi8); 373 default: break; 374 } 375 return batch_type(T(0)); 376 #else 377 batch_type b_concatenate; 378 const int n = num; 379 for (int i = 0 ; i < (32 - n); ++i) 380 { 381 b_concatenate[i] = v_lhs[i + n]; 382 if(i < n) 383 { 384 b_concatenate[32 - 1 - i] = v_rhs[n - 1 - i]; 385 } 386 } 387 return b_concatenate; 388 #endif 389 } 390 391 }; 392 393 template <> 394 struct batch_kernel<int16_t, 32> 395 : public avx512_int16_batch_kernel<int16_t> 396 { absxsimd::detail::batch_kernel397 static batch_type abs(const batch_type& rhs) 398 { 399 #if defined(XSIMD_AVX512BW_AVAILABLE) 400 return _mm512_abs_epi16(rhs); 401 #else 402 XSIMD_SPLIT_AVX512(rhs); 403 __m256i res_low = _mm256_abs_epi16(rhs_low); 404 __m256i res_high = _mm256_abs_epi16(rhs_high); 405 XSIMD_RETURN_MERGED_AVX(res_low, res_high); 406 #endif 407 } 408 minxsimd::detail::batch_kernel409 static batch_type min(const batch_type& lhs, const batch_type& rhs) 410 { 411 #if defined(XSIMD_AVX512BW_AVAILABLE) 412 return _mm512_min_epi16(lhs, rhs); 413 #else 414 XSIMD_APPLY_AVX2_FUNCTION_INT16(min, lhs, rhs); 415 #endif 416 } 417 maxxsimd::detail::batch_kernel418 static batch_type max(const batch_type& lhs, const batch_type& rhs) 419 { 420 #if defined(XSIMD_AVX512BW_AVAILABLE) 421 return _mm512_max_epi16(lhs, rhs); 422 #else 423 XSIMD_APPLY_AVX2_FUNCTION_INT16(max, lhs, rhs); 424 #endif 425 } 426 eqxsimd::detail::batch_kernel427 static batch_bool_type eq(const batch_type& lhs, const batch_type& rhs) 428 { 429 #if defined(XSIMD_AVX512BW_AVAILABLE) 430 return _mm512_cmpeq_epi16_mask(lhs, rhs); 431 #else 432 XSIMD_APPLY_AVX2_FUNCTION_INT16(eq, lhs, rhs); 433 #endif 434 } 435 neqxsimd::detail::batch_kernel436 static batch_bool_type neq(const batch_type& lhs, const batch_type& rhs) 437 { 438 #if defined(XSIMD_AVX512BW_AVAILABLE) 439 return _mm512_cmpneq_epi16_mask(lhs, rhs); 440 #else 441 XSIMD_APPLY_AVX2_FUNCTION_INT16(neq, lhs, rhs); 442 #endif 443 } 444 ltxsimd::detail::batch_kernel445 static batch_bool_type lt(const batch_type& lhs, const batch_type& rhs) 446 { 447 #if defined(XSIMD_AVX512BW_AVAILABLE) 448 return _mm512_cmplt_epi16_mask(lhs, rhs); 449 #else 450 XSIMD_APPLY_AVX2_FUNCTION_INT16(lt, lhs, rhs); 451 #endif 452 } 453 ltexsimd::detail::batch_kernel454 static batch_bool_type lte(const batch_type& lhs, const batch_type& rhs) 455 { 456 #if defined(XSIMD_AVX512BW_AVAILABLE) 457 return _mm512_cmple_epi16_mask(lhs, rhs); 458 #else 459 XSIMD_APPLY_AVX2_FUNCTION_INT16(lte, lhs, rhs); 460 #endif 461 } 462 }; 463 464 template <> 465 struct batch_kernel<uint16_t, 32> 466 : public avx512_int16_batch_kernel<uint16_t> 467 { absxsimd::detail::batch_kernel468 static batch_type abs(const batch_type& rhs) 469 { 470 return rhs; 471 } 472 minxsimd::detail::batch_kernel473 static batch_type min(const batch_type& lhs, const batch_type& rhs) 474 { 475 #if defined(XSIMD_AVX512BW_AVAILABLE) 476 return _mm512_min_epu16(lhs, rhs); 477 #else 478 XSIMD_APPLY_AVX2_FUNCTION_INT16(min, lhs, rhs); 479 #endif 480 } 481 maxxsimd::detail::batch_kernel482 static batch_type max(const batch_type& lhs, const batch_type& rhs) 483 { 484 #if defined(XSIMD_AVX512BW_AVAILABLE) 485 return _mm512_max_epu16(lhs, rhs); 486 #else 487 XSIMD_APPLY_AVX2_FUNCTION_INT16(max, lhs, rhs); 488 #endif 489 } 490 eqxsimd::detail::batch_kernel491 static batch_bool_type eq(const batch_type& lhs, const batch_type& rhs) 492 { 493 #if defined(XSIMD_AVX512BW_AVAILABLE) 494 return _mm512_cmpeq_epu16_mask(lhs, rhs); 495 #else 496 XSIMD_APPLY_AVX2_FUNCTION_INT16(eq, lhs, rhs); 497 #endif 498 } 499 neqxsimd::detail::batch_kernel500 static batch_bool_type neq(const batch_type& lhs, const batch_type& rhs) 501 { 502 #if defined(XSIMD_AVX512BW_AVAILABLE) 503 return _mm512_cmpneq_epu16_mask(lhs, rhs); 504 #else 505 XSIMD_APPLY_AVX2_FUNCTION_INT16(neq, lhs, rhs); 506 #endif 507 } 508 ltxsimd::detail::batch_kernel509 static batch_bool_type lt(const batch_type& lhs, const batch_type& rhs) 510 { 511 #if defined(XSIMD_AVX512BW_AVAILABLE) 512 return _mm512_cmplt_epu16_mask(lhs, rhs); 513 #else 514 XSIMD_APPLY_AVX2_FUNCTION_INT16(lt, lhs, rhs); 515 #endif 516 } 517 ltexsimd::detail::batch_kernel518 static batch_bool_type lte(const batch_type& lhs, const batch_type& rhs) 519 { 520 #if defined(XSIMD_AVX512BW_AVAILABLE) 521 return _mm512_cmple_epu16_mask(lhs, rhs); 522 #else 523 XSIMD_APPLY_AVX2_FUNCTION_INT16(lte, lhs, rhs); 524 #endif 525 } 526 saddxsimd::detail::batch_kernel527 static batch_type sadd(const batch_type &lhs, const batch_type &rhs) 528 { 529 #if defined(XSIMD_AVX512BW_AVAILABLE) 530 return _mm512_adds_epu16(lhs, rhs); 531 #else 532 XSIMD_APPLY_AVX2_FUNCTION_UINT16(sadd, lhs, rhs); 533 #endif 534 } 535 ssubxsimd::detail::batch_kernel536 static batch_type ssub(const batch_type &lhs, const batch_type &rhs) 537 { 538 #if defined(XSIMD_AVX512BW_AVAILABLE) 539 return _mm512_subs_epu16(lhs, rhs); 540 #else 541 XSIMD_APPLY_AVX2_FUNCTION_UINT16(ssub, lhs, rhs); 542 #endif 543 } 544 }; 545 } 546 operator <<(const batch<int16_t,32> & lhs,int32_t rhs)547 inline batch<int16_t, 32> operator<<(const batch<int16_t, 32>& lhs, int32_t rhs) 548 { 549 #if defined(XSIMD_AVX512BW_AVAILABLE) 550 #if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY) 551 return _mm512_sllv_epi16(lhs, _mm512_set1_epi16(rhs)); 552 #else 553 return _mm512_slli_epi16(lhs, rhs); 554 #endif 555 #else 556 #if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY) 557 __m512i tmp = _mm512_sllv_epi32(lhs, _mm512_set1_epi32(rhs)); 558 #else 559 __m512i tmp = _mm512_slli_epi32(lhs, rhs); 560 #endif 561 return _mm512_and_si512(_mm512_set1_epi16(0xFFFF << rhs), tmp); 562 #endif 563 } 564 operator >>(const batch<int16_t,32> & lhs,int32_t rhs)565 inline batch<int16_t, 32> operator>>(const batch<int16_t, 32>& lhs, int32_t rhs) 566 { 567 #if defined(XSIMD_AVX512BW_AVAILABLE) 568 #if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY) 569 return _mm512_srav_epi16(lhs, _mm512_set1_epi16(rhs)); 570 #else 571 return _mm512_srai_epi16(lhs, rhs); 572 #endif 573 #else 574 return avx512_detail::shift_impl([](int16_t val, int32_t s) { return val >> s; }, lhs, rhs); 575 #endif 576 } 577 operator <<(const batch<int16_t,32> & lhs,const batch<int16_t,32> & rhs)578 inline batch<int16_t, 32> operator<<(const batch<int16_t, 32>& lhs, const batch<int16_t, 32>& rhs) 579 { 580 #if defined(XSIMD_AVX512BW_AVAILABLE) 581 return _mm512_sllv_epi16(lhs, rhs); 582 #else 583 return avx512_detail::shift_impl([](int16_t val, int16_t s) { return val << s; }, lhs, rhs); 584 #endif 585 } 586 operator >>(const batch<int16_t,32> & lhs,const batch<int16_t,32> & rhs)587 inline batch<int16_t, 32> operator>>(const batch<int16_t, 32>& lhs, const batch<int16_t, 32>& rhs) 588 { 589 #if defined(XSIMD_AVX512BW_AVAILABLE) 590 return _mm512_srav_epi16(lhs, rhs); 591 #else 592 return avx512_detail::shift_impl([](int16_t val, int16_t s) { return val >> s; }, lhs, rhs); 593 #endif 594 } 595 596 XSIMD_DEFINE_LOAD_STORE_INT16(int16_t, 32, 64) 597 XSIMD_DEFINE_LOAD_STORE_LONG(int16_t, 32, 64) 598 operator <<(const batch<uint16_t,32> & lhs,int32_t rhs)599 inline batch<uint16_t, 32> operator<<(const batch<uint16_t, 32>& lhs, int32_t rhs) 600 { 601 #if defined(XSIMD_AVX512BW_AVAILABLE) 602 #if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY) 603 return _mm512_sllv_epi16(lhs, _mm512_set1_epi16(rhs)); 604 #else 605 return _mm512_slli_epi16(lhs, rhs); 606 #endif 607 #else 608 #if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY) 609 __m512i tmp = _mm512_sllv_epi32(lhs, _mm512_set1_epi32(rhs)); 610 #else 611 __m512i tmp = _mm512_slli_epi32(lhs, rhs); 612 #endif 613 return _mm512_and_si512(_mm512_set1_epi16(0xFFFF << rhs), tmp); 614 #endif 615 } 616 operator >>(const batch<uint16_t,32> & lhs,int32_t rhs)617 inline batch<uint16_t, 32> operator>>(const batch<uint16_t, 32>& lhs, int32_t rhs) 618 { 619 #if defined(XSIMD_AVX512BW_AVAILABLE) 620 #if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY) 621 return _mm512_srlv_epi16(lhs, _mm512_set1_epi16(rhs)); 622 #else 623 return _mm512_srli_epi16(lhs, rhs); 624 #endif 625 #else 626 #if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY) 627 __m512i tmp = _mm512_srlv_epi32(lhs, _mm512_set1_epi32(rhs)); 628 #else 629 __m512i tmp = _mm512_srli_epi32(lhs, rhs); 630 #endif 631 return _mm512_and_si512(_mm512_set1_epi16(0xFFFF >> rhs), tmp); 632 #endif 633 } 634 operator <<(const batch<uint16_t,32> & lhs,const batch<int16_t,32> & rhs)635 inline batch<uint16_t, 32> operator<<(const batch<uint16_t, 32>& lhs, const batch<int16_t, 32>& rhs) 636 { 637 #if defined(XSIMD_AVX512BW_AVAILABLE) 638 return _mm512_sllv_epi16(lhs, rhs); 639 #else 640 return avx512_detail::shift_impl([](uint16_t val, int16_t s) { return val << s; }, lhs, rhs); 641 #endif 642 } 643 operator >>(const batch<uint16_t,32> & lhs,const batch<int16_t,32> & rhs)644 inline batch<uint16_t, 32> operator>>(const batch<uint16_t, 32>& lhs, const batch<int16_t, 32>& rhs) 645 { 646 #if defined(XSIMD_AVX512BW_AVAILABLE) 647 return _mm512_srlv_epi16(lhs, rhs); 648 #else 649 return avx512_detail::shift_impl([](uint16_t val, int16_t s) { return val >> s; }, lhs, rhs); 650 #endif 651 } 652 653 XSIMD_DEFINE_LOAD_STORE_INT16(uint16_t, 32, 64) 654 XSIMD_DEFINE_LOAD_STORE_LONG(uint16_t, 32, 64) 655 656 #undef XSIMD_APPLY_AVX2_FUNCTION_INT16 657 } 658 659 #endif 660