// Copyright 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// 128-bit vectors and SSE4 instructions, plus some AVX2 and AVX512-VL
// operations when compiling for those targets.
// External include guard in highway.h - see comment there.

#include <emmintrin.h>
#if HWY_TARGET == HWY_SSSE3
#include <tmmintrin.h>  // SSSE3
#else
#include <smmintrin.h>  // SSE4
#include <wmmintrin.h>  // CLMUL
#endif
#include <stddef.h>
#include <stdint.h>

#include "hwy/base.h"
#include "hwy/ops/shared-inl.h"

// Clang 3.9 generates VINSERTF128 instead of the desired VBROADCASTF128,
// which would free up port5. However, inline assembly isn't supported on
// MSVC, results in incorrect output on GCC 8.3, and raises "invalid output
// size for constraint" errors on Clang (https://gcc.godbolt.org/z/-Jt_-F),
// hence we disable it.
#ifndef HWY_LOADDUP_ASM
#define HWY_LOADDUP_ASM 0
#endif

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

template <typename T>
using Full128 = Simd<T, 16 / sizeof(T)>;

namespace detail {

template <typename T>
struct Raw128 {
  using type = __m128i;
};
template <>
struct Raw128<float> {
  using type = __m128;
};
template <>
struct Raw128<double> {
  using type = __m128d;
};

}  // namespace detail

template <typename T, size_t N = 16 / sizeof(T)>
class Vec128 {
  using Raw = typename detail::Raw128<T>::type;

 public:
  // Compound assignment. Only usable if there is a corresponding non-member
  // binary operator overload. For example, only f32 and f64 support division.
  HWY_INLINE Vec128& operator*=(const Vec128 other) {
    return *this = (*this * other);
  }
  HWY_INLINE Vec128& operator/=(const Vec128 other) {
    return *this = (*this / other);
  }
  HWY_INLINE Vec128& operator+=(const Vec128 other) {
    return *this = (*this + other);
  }
  HWY_INLINE Vec128& operator-=(const Vec128 other) {
    return *this = (*this - other);
  }
  HWY_INLINE Vec128& operator&=(const Vec128 other) {
    return *this = (*this & other);
  }
  HWY_INLINE Vec128& operator|=(const Vec128 other) {
    return *this = (*this | other);
  }
  HWY_INLINE Vec128& operator^=(const Vec128 other) {
    return *this = (*this ^ other);
  }

  Raw raw;
};

// Forward-declare for use by DeduceD, see below.
template <typename T>
class Vec256;
template <typename T>
class Vec512;

#if HWY_TARGET <= HWY_AVX3

namespace detail {

// Template arg: sizeof(lane type)
template <size_t size>
struct RawMask128 {};
template <>
struct RawMask128<1> {
  using type = __mmask16;
};
template <>
struct RawMask128<2> {
  using type = __mmask8;
};
template <>
struct RawMask128<4> {
  using type = __mmask8;
};
template <>
struct RawMask128<8> {
  using type = __mmask8;
};

}  // namespace detail

template <typename T, size_t N>
struct Mask128 {
  using Raw = typename detail::RawMask128<sizeof(T)>::type;

  static Mask128<T, N> FromBits(uint64_t mask_bits) {
    return Mask128<T, N>{static_cast<Raw>(mask_bits)};
  }

  Raw raw;
};

#else  // AVX2 or below

// FF..FF or 0.
template <typename T, size_t N = 16 / sizeof(T)>
struct Mask128 {
  typename detail::Raw128<T>::type raw;
};

#endif  // HWY_TARGET <= HWY_AVX3

namespace detail {

// Deduce Simd<T, N> from Vec*<T, N> (pointers because Vec256/512 may be
// incomplete types at this point; this is simpler than avoiding multiple
// definitions of DFromV via #if)
struct DeduceD {
  template <typename T, size_t N>
  Simd<T, N> operator()(const Vec128<T, N>*) const {
    return Simd<T, N>();
  }
  template <typename T>
  Simd<T, 32 / sizeof(T)> operator()(const Vec256<T>*) const {
    return Simd<T, 32 / sizeof(T)>();
  }
  template <typename T>
  Simd<T, 64 / sizeof(T)> operator()(const Vec512<T>*) const {
    return Simd<T, 64 / sizeof(T)>();
  }
};

// Workaround for MSVC v19.14: alias with a dependent type fails to specialize.
template <class V>
struct ExpandDFromV {
  using type = decltype(DeduceD()(static_cast<V*>(nullptr)));
};

}  // namespace detail

template <class V>
using DFromV = typename detail::ExpandDFromV<V>::type;

template <class V>
using TFromV = TFromD<DFromV<V>>;

// ------------------------------ BitCast

namespace detail {

HWY_INLINE __m128i BitCastToInteger(__m128i v) { return v; }
HWY_INLINE __m128i BitCastToInteger(__m128 v) { return _mm_castps_si128(v); }
HWY_INLINE __m128i BitCastToInteger(__m128d v) { return _mm_castpd_si128(v); }

template <typename T, size_t N>
HWY_INLINE Vec128<uint8_t, N * sizeof(T)> BitCastToByte(Vec128<T, N> v) {
  return Vec128<uint8_t, N * sizeof(T)>{BitCastToInteger(v.raw)};
}

// Cannot rely on function overloading because return types differ.
template <typename T>
struct BitCastFromInteger128 {
  HWY_INLINE __m128i operator()(__m128i v) { return v; }
};
template <>
struct BitCastFromInteger128<float> {
  HWY_INLINE __m128 operator()(__m128i v) { return _mm_castsi128_ps(v); }
};
template <>
struct BitCastFromInteger128<double> {
  HWY_INLINE __m128d operator()(__m128i v) { return _mm_castsi128_pd(v); }
};

template <typename T, size_t N>
HWY_INLINE Vec128<T, N> BitCastFromByte(Simd<T, N> /* tag */,
                                        Vec128<uint8_t, N * sizeof(T)> v) {
  return Vec128<T, N>{BitCastFromInteger128<T>()(v.raw)};
}

}  // namespace detail

template <typename T, size_t N, typename FromT>
HWY_API Vec128<T, N> BitCast(Simd<T, N> d,
                             Vec128<FromT, N * sizeof(T) / sizeof(FromT)> v) {
  return detail::BitCastFromByte(d, detail::BitCastToByte(v));
}

// ------------------------------ Zero

// Returns an all-zero vector/part.
template <typename T, size_t N, HWY_IF_LE128(T, N)>
HWY_API Vec128<T, N> Zero(Simd<T, N> /* tag */) {
  return Vec128<T, N>{_mm_setzero_si128()};
}
template <size_t N, HWY_IF_LE128(float, N)>
HWY_API Vec128<float, N> Zero(Simd<float, N> /* tag */) {
  return Vec128<float, N>{_mm_setzero_ps()};
}
template <size_t N, HWY_IF_LE128(double, N)>
HWY_API Vec128<double, N> Zero(Simd<double, N> /* tag */) {
  return Vec128<double, N>{_mm_setzero_pd()};
}

template <class D>
using VFromD = decltype(Zero(D()));

// ------------------------------ Set

// Returns a vector/part with all lanes set to "t".
template <size_t N, HWY_IF_LE128(uint8_t, N)>
HWY_API Vec128<uint8_t, N> Set(Simd<uint8_t, N> /* tag */, const uint8_t t) {
  return Vec128<uint8_t, N>{_mm_set1_epi8(static_cast<char>(t))};  // NOLINT
}
template <size_t N, HWY_IF_LE128(uint16_t, N)>
HWY_API Vec128<uint16_t, N> Set(Simd<uint16_t, N> /* tag */, const uint16_t t) {
  return Vec128<uint16_t, N>{_mm_set1_epi16(static_cast<short>(t))};  // NOLINT
}
template <size_t N, HWY_IF_LE128(uint32_t, N)>
HWY_API Vec128<uint32_t, N> Set(Simd<uint32_t, N> /* tag */, const uint32_t t) {
  return Vec128<uint32_t, N>{_mm_set1_epi32(static_cast<int>(t))};
}
template <size_t N, HWY_IF_LE128(uint64_t, N)>
HWY_API Vec128<uint64_t, N> Set(Simd<uint64_t, N> /* tag */, const uint64_t t) {
  return Vec128<uint64_t, N>{
      _mm_set1_epi64x(static_cast<long long>(t))};  // NOLINT
}
template <size_t N, HWY_IF_LE128(int8_t, N)>
HWY_API Vec128<int8_t, N> Set(Simd<int8_t, N> /* tag */, const int8_t t) {
  return Vec128<int8_t, N>{_mm_set1_epi8(static_cast<char>(t))};  // NOLINT
}
template <size_t N, HWY_IF_LE128(int16_t, N)>
HWY_API Vec128<int16_t, N> Set(Simd<int16_t, N> /* tag */, const int16_t t) {
  return Vec128<int16_t, N>{_mm_set1_epi16(static_cast<short>(t))};  // NOLINT
}
template <size_t N, HWY_IF_LE128(int32_t, N)>
HWY_API Vec128<int32_t, N> Set(Simd<int32_t, N> /* tag */, const int32_t t) {
  return Vec128<int32_t, N>{_mm_set1_epi32(t)};
}
template <size_t N, HWY_IF_LE128(int64_t, N)>
HWY_API Vec128<int64_t, N> Set(Simd<int64_t, N> /* tag */, const int64_t t) {
  return Vec128<int64_t, N>{
      _mm_set1_epi64x(static_cast<long long>(t))};  // NOLINT
}
template <size_t N, HWY_IF_LE128(float, N)>
HWY_API Vec128<float, N> Set(Simd<float, N> /* tag */, const float t) {
  return Vec128<float, N>{_mm_set1_ps(t)};
}
template <size_t N, HWY_IF_LE128(double, N)>
HWY_API Vec128<double, N> Set(Simd<double, N> /* tag */, const double t) {
  return Vec128<double, N>{_mm_set1_pd(t)};
}

HWY_DIAGNOSTICS(push)
HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized")

// Returns a vector with uninitialized elements.
template <typename T, size_t N, HWY_IF_LE128(T, N)>
HWY_API Vec128<T, N> Undefined(Simd<T, N> /* tag */) {
  // Available on Clang 6.0, GCC 6.2, ICC 16.03, MSVC 19.14. All but ICC
  // generate an XOR instruction.
  return Vec128<T, N>{_mm_undefined_si128()};
}
template <size_t N, HWY_IF_LE128(float, N)>
HWY_API Vec128<float, N> Undefined(Simd<float, N> /* tag */) {
  return Vec128<float, N>{_mm_undefined_ps()};
}
template <size_t N, HWY_IF_LE128(double, N)>
HWY_API Vec128<double, N> Undefined(Simd<double, N> /* tag */) {
  return Vec128<double, N>{_mm_undefined_pd()};
}

HWY_DIAGNOSTICS(pop)

// ------------------------------ GetLane

// Gets the single value stored in a vector/part.
template <size_t N>
HWY_API uint8_t GetLane(const Vec128<uint8_t, N> v) {
  return static_cast<uint8_t>(_mm_cvtsi128_si32(v.raw) & 0xFF);
}
template <size_t N>
HWY_API int8_t GetLane(const Vec128<int8_t, N> v) {
  return static_cast<int8_t>(_mm_cvtsi128_si32(v.raw) & 0xFF);
}
template <size_t N>
HWY_API uint16_t GetLane(const Vec128<uint16_t, N> v) {
  return static_cast<uint16_t>(_mm_cvtsi128_si32(v.raw) & 0xFFFF);
}
template <size_t N>
HWY_API int16_t GetLane(const Vec128<int16_t, N> v) {
  return static_cast<int16_t>(_mm_cvtsi128_si32(v.raw) & 0xFFFF);
}
template <size_t N>
HWY_API uint32_t GetLane(const Vec128<uint32_t, N> v) {
  return static_cast<uint32_t>(_mm_cvtsi128_si32(v.raw));
}
template <size_t N>
HWY_API int32_t GetLane(const Vec128<int32_t, N> v) {
  return _mm_cvtsi128_si32(v.raw);
}
template <size_t N>
HWY_API float GetLane(const Vec128<float, N> v) {
  return _mm_cvtss_f32(v.raw);
}
template <size_t N>
HWY_API uint64_t GetLane(const Vec128<uint64_t, N> v) {
#if HWY_ARCH_X86_32
  alignas(16) uint64_t lanes[2];
  Store(v, Simd<uint64_t, N>(), lanes);
  return lanes[0];
#else
  return static_cast<uint64_t>(_mm_cvtsi128_si64(v.raw));
#endif
}
template <size_t N>
HWY_API int64_t GetLane(const Vec128<int64_t, N> v) {
#if HWY_ARCH_X86_32
  alignas(16) int64_t lanes[2];
  Store(v, Simd<int64_t, N>(), lanes);
  return lanes[0];
#else
  return _mm_cvtsi128_si64(v.raw);
#endif
}
template <size_t N>
HWY_API double GetLane(const Vec128<double, N> v) {
  return _mm_cvtsd_f64(v.raw);
}

// ================================================== LOGICAL

// ------------------------------ And

template <typename T, size_t N>
HWY_API Vec128<T, N> And(Vec128<T, N> a, Vec128<T, N> b) {
  return Vec128<T, N>{_mm_and_si128(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<float, N> And(const Vec128<float, N> a,
                             const Vec128<float, N> b) {
  return Vec128<float, N>{_mm_and_ps(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<double, N> And(const Vec128<double, N> a,
                              const Vec128<double, N> b) {
  return Vec128<double, N>{_mm_and_pd(a.raw, b.raw)};
}

// ------------------------------ AndNot

// Returns ~not_mask & mask.
template <typename T, size_t N>
HWY_API Vec128<T, N> AndNot(Vec128<T, N> not_mask, Vec128<T, N> mask) {
  return Vec128<T, N>{_mm_andnot_si128(not_mask.raw, mask.raw)};
}
template <size_t N>
HWY_API Vec128<float, N> AndNot(const Vec128<float, N> not_mask,
                                const Vec128<float, N> mask) {
  return Vec128<float, N>{_mm_andnot_ps(not_mask.raw, mask.raw)};
}
template <size_t N>
HWY_API Vec128<double, N> AndNot(const Vec128<double, N> not_mask,
                                 const Vec128<double, N> mask) {
  return Vec128<double, N>{_mm_andnot_pd(not_mask.raw, mask.raw)};
}

// ------------------------------ Or

template <typename T, size_t N>
HWY_API Vec128<T, N> Or(Vec128<T, N> a, Vec128<T, N> b) {
  return Vec128<T, N>{_mm_or_si128(a.raw, b.raw)};
}

template <size_t N>
HWY_API Vec128<float, N> Or(const Vec128<float, N> a,
                            const Vec128<float, N> b) {
  return Vec128<float, N>{_mm_or_ps(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<double, N> Or(const Vec128<double, N> a,
                             const Vec128<double, N> b) {
  return Vec128<double, N>{_mm_or_pd(a.raw, b.raw)};
}

// ------------------------------ Xor

template <typename T, size_t N>
HWY_API Vec128<T, N> Xor(Vec128<T, N> a, Vec128<T, N> b) {
  return Vec128<T, N>{_mm_xor_si128(a.raw, b.raw)};
}

template <size_t N>
HWY_API Vec128<float, N> Xor(const Vec128<float, N> a,
                             const Vec128<float, N> b) {
  return Vec128<float, N>{_mm_xor_ps(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<double, N> Xor(const Vec128<double, N> a,
                              const Vec128<double, N> b) {
  return Vec128<double, N>{_mm_xor_pd(a.raw, b.raw)};
}

// ------------------------------ Not

template <typename T, size_t N>
HWY_API Vec128<T, N> Not(const Vec128<T, N> v) {
  using TU = MakeUnsigned<T>;
#if HWY_TARGET <= HWY_AVX3
  // Truth table 0x55 yields NOT of the input (all three operands are v).
  const __m128i vu = BitCast(Simd<TU, N>(), v).raw;
  return BitCast(Simd<T, N>(),
                 Vec128<TU, N>{_mm_ternarylogic_epi32(vu, vu, vu, 0x55)});
#else
  return Xor(v, BitCast(Simd<T, N>(), Vec128<TU, N>{_mm_set1_epi32(-1)}));
#endif
}

// ------------------------------ Operator overloads (internal-only if float)

template <typename T, size_t N>
HWY_API Vec128<T, N> operator&(const Vec128<T, N> a, const Vec128<T, N> b) {
  return And(a, b);
}

template <typename T, size_t N>
HWY_API Vec128<T, N> operator|(const Vec128<T, N> a, const Vec128<T, N> b) {
  return Or(a, b);
}

template <typename T, size_t N>
HWY_API Vec128<T, N> operator^(const Vec128<T, N> a, const Vec128<T, N> b) {
  return Xor(a, b);
}

// ------------------------------ PopulationCount

// 8/16 require BITALG, 32/64 require VPOPCNTDQ.
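// Example (illustrative): PopulationCount on u8 lanes {0x0F, 0xF0, 0xFF, 0x00}
// yields {4, 4, 8, 0}.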
#if HWY_TARGET == HWY_AVX3_DL

#ifdef HWY_NATIVE_POPCNT
#undef HWY_NATIVE_POPCNT
#else
#define HWY_NATIVE_POPCNT
#endif

namespace detail {

template <typename T, size_t N>
HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<1> /* tag */,
                                        Vec128<T, N> v) {
  return Vec128<T, N>{_mm_popcnt_epi8(v.raw)};
}
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<2> /* tag */,
                                        Vec128<T, N> v) {
  return Vec128<T, N>{_mm_popcnt_epi16(v.raw)};
}
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<4> /* tag */,
                                        Vec128<T, N> v) {
  return Vec128<T, N>{_mm_popcnt_epi32(v.raw)};
}
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<8> /* tag */,
                                        Vec128<T, N> v) {
  return Vec128<T, N>{_mm_popcnt_epi64(v.raw)};
}

}  // namespace detail

template <typename T, size_t N>
HWY_API Vec128<T, N> PopulationCount(Vec128<T, N> v) {
  return detail::PopulationCount(hwy::SizeTag<sizeof(T)>(), v);
}

#endif  // HWY_TARGET == HWY_AVX3_DL

// ================================================== SIGN

// ------------------------------ Neg

template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> Neg(const Vec128<T, N> v) {
  return Xor(v, SignBit(Simd<T, N>()));
}

template <typename T, size_t N, HWY_IF_NOT_FLOAT(T)>
HWY_API Vec128<T, N> Neg(const Vec128<T, N> v) {
  return Zero(Simd<T, N>()) - v;
}

// ------------------------------ Abs

// Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1.
template <size_t N>
HWY_API Vec128<int8_t, N> Abs(const Vec128<int8_t, N> v) {
#if HWY_COMPILER_MSVC
  // Workaround for incorrect codegen? (reaches breakpoint)
  const auto zero = Zero(Simd<int8_t, N>());
  return Vec128<int8_t, N>{_mm_max_epi8(v.raw, (zero - v).raw)};
#else
  return Vec128<int8_t, N>{_mm_abs_epi8(v.raw)};
#endif
}
template <size_t N>
HWY_API Vec128<int16_t, N> Abs(const Vec128<int16_t, N> v) {
  return Vec128<int16_t, N>{_mm_abs_epi16(v.raw)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> Abs(const Vec128<int32_t, N> v) {
  return Vec128<int32_t, N>{_mm_abs_epi32(v.raw)};
}
// i64 is implemented after BroadcastSignBit.
template <size_t N>
HWY_API Vec128<float, N> Abs(const Vec128<float, N> v) {
  const Vec128<int32_t, N> mask{_mm_set1_epi32(0x7FFFFFFF)};
  return v & BitCast(Simd<float, N>(), mask);
}
template <size_t N>
HWY_API Vec128<double, N> Abs(const Vec128<double, N> v) {
  const Vec128<int64_t, N> mask{_mm_set1_epi64x(0x7FFFFFFFFFFFFFFFLL)};
  return v & BitCast(Simd<double, N>(), mask);
}

// ------------------------------ CopySign

template <typename T, size_t N>
HWY_API Vec128<T, N> CopySign(const Vec128<T, N> magn,
                              const Vec128<T, N> sign) {
  static_assert(IsFloat<T>(), "Only makes sense for floating-point");

  const Simd<T, N> d;
  const auto msb = SignBit(d);

#if HWY_TARGET <= HWY_AVX3
  const Rebind<MakeUnsigned<T>, decltype(d)> du;
  // Truth table for msb, magn, sign | bitwise msb ? sign : magn
  //                  0    0     0   |  0
  //                  0    0     1   |  0
  //                  0    1     0   |  1
  //                  0    1     1   |  1
  //                  1    0     0   |  0
  //                  1    0     1   |  1
  //                  1    1     0   |  0
  //                  1    1     1   |  1
  // The lane size does not matter because we are not using predication.
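  // Bit i of the immediate is the output for the input combination
  // i = (msb << 2) | (magn << 1) | sign, so the table above encodes to
  // 0b10101100 = 0xAC.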
  const __m128i out = _mm_ternarylogic_epi32(
      BitCast(du, msb).raw, BitCast(du, magn).raw, BitCast(du, sign).raw, 0xAC);
  return BitCast(d, decltype(Zero(du)){out});
#else
  return Or(AndNot(msb, magn), And(msb, sign));
#endif
}

template <typename T, size_t N>
HWY_API Vec128<T, N> CopySignToAbs(const Vec128<T, N> abs,
                                   const Vec128<T, N> sign) {
#if HWY_TARGET <= HWY_AVX3
  // AVX3 can also handle abs < 0, so no extra action needed.
  return CopySign(abs, sign);
#else
  return Or(abs, And(SignBit(Simd<T, N>()), sign));
#endif
}

// ================================================== MASK

#if HWY_TARGET <= HWY_AVX3

// ------------------------------ IfThenElse

// Returns mask ? yes : no.

namespace detail {

// Templates for signed/unsigned integer of a particular size.
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> IfThenElse(hwy::SizeTag<1> /* tag */,
                                   Mask128<T, N> mask, Vec128<T, N> yes,
                                   Vec128<T, N> no) {
  return Vec128<T, N>{_mm_mask_mov_epi8(no.raw, mask.raw, yes.raw)};
}
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> IfThenElse(hwy::SizeTag<2> /* tag */,
                                   Mask128<T, N> mask, Vec128<T, N> yes,
                                   Vec128<T, N> no) {
  return Vec128<T, N>{_mm_mask_mov_epi16(no.raw, mask.raw, yes.raw)};
}
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> IfThenElse(hwy::SizeTag<4> /* tag */,
                                   Mask128<T, N> mask, Vec128<T, N> yes,
                                   Vec128<T, N> no) {
  return Vec128<T, N>{_mm_mask_mov_epi32(no.raw, mask.raw, yes.raw)};
}
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> IfThenElse(hwy::SizeTag<8> /* tag */,
                                   Mask128<T, N> mask, Vec128<T, N> yes,
                                   Vec128<T, N> no) {
  return Vec128<T, N>{_mm_mask_mov_epi64(no.raw, mask.raw, yes.raw)};
}

}  // namespace detail

template <typename T, size_t N>
HWY_API Vec128<T, N> IfThenElse(Mask128<T, N> mask, Vec128<T, N> yes,
                                Vec128<T, N> no) {
  return detail::IfThenElse(hwy::SizeTag<sizeof(T)>(), mask, yes, no);
}

template <size_t N>
HWY_API Vec128<float, N> IfThenElse(Mask128<float, N> mask,
                                    Vec128<float, N> yes, Vec128<float, N> no) {
  return Vec128<float, N>{_mm_mask_mov_ps(no.raw, mask.raw, yes.raw)};
}

template <size_t N>
HWY_API Vec128<double, N> IfThenElse(Mask128<double, N> mask,
                                     Vec128<double, N> yes,
                                     Vec128<double, N> no) {
  return Vec128<double, N>{_mm_mask_mov_pd(no.raw, mask.raw, yes.raw)};
}

namespace detail {

template <typename T, size_t N>
HWY_INLINE Vec128<T, N> IfThenElseZero(hwy::SizeTag<1> /* tag */,
                                       Mask128<T, N> mask, Vec128<T, N> yes) {
  return Vec128<T, N>{_mm_maskz_mov_epi8(mask.raw, yes.raw)};
}
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> IfThenElseZero(hwy::SizeTag<2> /* tag */,
                                       Mask128<T, N> mask, Vec128<T, N> yes) {
  return Vec128<T, N>{_mm_maskz_mov_epi16(mask.raw, yes.raw)};
}
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> IfThenElseZero(hwy::SizeTag<4> /* tag */,
                                       Mask128<T, N> mask, Vec128<T, N> yes) {
  return Vec128<T, N>{_mm_maskz_mov_epi32(mask.raw, yes.raw)};
}
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> IfThenElseZero(hwy::SizeTag<8> /* tag */,
                                       Mask128<T, N> mask, Vec128<T, N> yes) {
  return Vec128<T, N>{_mm_maskz_mov_epi64(mask.raw, yes.raw)};
}

}  // namespace detail

template <typename T, size_t N>
HWY_API Vec128<T, N> IfThenElseZero(Mask128<T, N> mask, Vec128<T, N> yes) {
  return detail::IfThenElseZero(hwy::SizeTag<sizeof(T)>(), mask, yes);
}

template <size_t N>
HWY_API Vec128<float, N> IfThenElseZero(Mask128<float, N> mask,
                                        Vec128<float, N> yes) {
  return Vec128<float, N>{_mm_maskz_mov_ps(mask.raw, yes.raw)};
}

template <size_t N>
HWY_API Vec128<double, N> IfThenElseZero(Mask128<double, N> mask,
                                         Vec128<double, N> yes) {
  return Vec128<double, N>{_mm_maskz_mov_pd(mask.raw, yes.raw)};
}

namespace detail {

template <typename T, size_t N>
HWY_INLINE Vec128<T, N> IfThenZeroElse(hwy::SizeTag<1> /* tag */,
                                       Mask128<T, N> mask, Vec128<T, N> no) {
  // xor_epi8/16 are missing, but we have sub, which is just as fast for u8/16.
  return Vec128<T, N>{_mm_mask_sub_epi8(no.raw, mask.raw, no.raw, no.raw)};
}
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> IfThenZeroElse(hwy::SizeTag<2> /* tag */,
                                       Mask128<T, N> mask, Vec128<T, N> no) {
  return Vec128<T, N>{_mm_mask_sub_epi16(no.raw, mask.raw, no.raw, no.raw)};
}
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> IfThenZeroElse(hwy::SizeTag<4> /* tag */,
                                       Mask128<T, N> mask, Vec128<T, N> no) {
  return Vec128<T, N>{_mm_mask_xor_epi32(no.raw, mask.raw, no.raw, no.raw)};
}
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> IfThenZeroElse(hwy::SizeTag<8> /* tag */,
                                       Mask128<T, N> mask, Vec128<T, N> no) {
  return Vec128<T, N>{_mm_mask_xor_epi64(no.raw, mask.raw, no.raw, no.raw)};
}

}  // namespace detail

template <typename T, size_t N>
HWY_API Vec128<T, N> IfThenZeroElse(Mask128<T, N> mask, Vec128<T, N> no) {
  return detail::IfThenZeroElse(hwy::SizeTag<sizeof(T)>(), mask, no);
}

template <size_t N>
HWY_API Vec128<float, N> IfThenZeroElse(Mask128<float, N> mask,
                                        Vec128<float, N> no) {
  return Vec128<float, N>{_mm_mask_xor_ps(no.raw, mask.raw, no.raw, no.raw)};
}

template <size_t N>
HWY_API Vec128<double, N> IfThenZeroElse(Mask128<double, N> mask,
                                         Vec128<double, N> no) {
  return Vec128<double, N>{_mm_mask_xor_pd(no.raw, mask.raw, no.raw, no.raw)};
}

// ------------------------------ Mask logical

// For Clang and GCC, mask intrinsics (KORTEST) weren't added until recently.
#ifndef HWY_COMPILER_HAS_MASK_INTRINSICS
#if HWY_COMPILER_MSVC != 0 || HWY_COMPILER_GCC >= 700 || \
    HWY_COMPILER_CLANG >= 800
#define HWY_COMPILER_HAS_MASK_INTRINSICS 1
#else
#define HWY_COMPILER_HAS_MASK_INTRINSICS 0
#endif
#endif  // HWY_COMPILER_HAS_MASK_INTRINSICS

namespace detail {

template <typename T, size_t N>
HWY_INLINE Mask128<T, N> And(hwy::SizeTag<1> /*tag*/, const Mask128<T, N> a,
                             const Mask128<T, N> b) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return Mask128<T, N>{_kand_mask16(a.raw, b.raw)};
#else
  return Mask128<T, N>{static_cast<__mmask16>(a.raw & b.raw)};
#endif
}
template <typename T, size_t N>
HWY_INLINE Mask128<T, N> And(hwy::SizeTag<2> /*tag*/, const Mask128<T, N> a,
                             const Mask128<T, N> b) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return Mask128<T, N>{_kand_mask8(a.raw, b.raw)};
#else
  return Mask128<T, N>{static_cast<__mmask8>(a.raw & b.raw)};
#endif
}
template <typename T, size_t N>
HWY_INLINE Mask128<T, N> And(hwy::SizeTag<4> /*tag*/, const Mask128<T, N> a,
                             const Mask128<T, N> b) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return Mask128<T, N>{_kand_mask8(a.raw, b.raw)};
#else
  return Mask128<T, N>{static_cast<__mmask8>(a.raw & b.raw)};
#endif
}
template <typename T, size_t N>
HWY_INLINE Mask128<T, N> And(hwy::SizeTag<8> /*tag*/, const Mask128<T, N> a,
                             const Mask128<T, N> b) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return Mask128<T, N>{_kand_mask8(a.raw, b.raw)};
#else
  return Mask128<T, N>{static_cast<__mmask8>(a.raw & b.raw)};
#endif
}

template <typename T, size_t N>
HWY_INLINE Mask128<T, N> AndNot(hwy::SizeTag<1> /*tag*/, const Mask128<T, N> a,
                                const Mask128<T, N> b) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return Mask128<T, N>{_kandn_mask16(a.raw, b.raw)};
#else
  return Mask128<T, N>{static_cast<__mmask16>(~a.raw & b.raw)};
#endif
}
template <typename T, size_t N>
HWY_INLINE Mask128<T, N> AndNot(hwy::SizeTag<2> /*tag*/, const Mask128<T, N> a,
                                const Mask128<T, N> b) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return Mask128<T, N>{_kandn_mask8(a.raw, b.raw)};
#else
  return Mask128<T, N>{static_cast<__mmask8>(~a.raw & b.raw)};
#endif
}
template <typename T, size_t N>
HWY_INLINE Mask128<T, N> AndNot(hwy::SizeTag<4> /*tag*/, const Mask128<T, N> a,
                                const Mask128<T, N> b) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return Mask128<T, N>{_kandn_mask8(a.raw, b.raw)};
#else
  return Mask128<T, N>{static_cast<__mmask8>(~a.raw & b.raw)};
#endif
}
template <typename T, size_t N>
HWY_INLINE Mask128<T, N> AndNot(hwy::SizeTag<8> /*tag*/, const Mask128<T, N> a,
                                const Mask128<T, N> b) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return Mask128<T, N>{_kandn_mask8(a.raw, b.raw)};
#else
  return Mask128<T, N>{static_cast<__mmask8>(~a.raw & b.raw)};
#endif
}

template <typename T, size_t N>
HWY_INLINE Mask128<T, N> Or(hwy::SizeTag<1> /*tag*/, const Mask128<T, N> a,
                            const Mask128<T, N> b) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return Mask128<T, N>{_kor_mask16(a.raw, b.raw)};
#else
  return Mask128<T, N>{static_cast<__mmask16>(a.raw | b.raw)};
#endif
}
template <typename T, size_t N>
HWY_INLINE Mask128<T, N> Or(hwy::SizeTag<2> /*tag*/, const Mask128<T, N> a,
                            const Mask128<T, N> b) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return Mask128<T, N>{_kor_mask8(a.raw, b.raw)};
#else
  return Mask128<T, N>{static_cast<__mmask8>(a.raw | b.raw)};
#endif
}
template <typename T, size_t N>
HWY_INLINE Mask128<T, N> Or(hwy::SizeTag<4> /*tag*/, const Mask128<T, N> a,
                            const Mask128<T, N> b) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return Mask128<T, N>{_kor_mask8(a.raw, b.raw)};
#else
  return Mask128<T, N>{static_cast<__mmask8>(a.raw | b.raw)};
#endif
}
template <typename T, size_t N>
HWY_INLINE Mask128<T, N> Or(hwy::SizeTag<8> /*tag*/, const Mask128<T, N> a,
                            const Mask128<T, N> b) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return Mask128<T, N>{_kor_mask8(a.raw, b.raw)};
#else
  return Mask128<T, N>{static_cast<__mmask8>(a.raw | b.raw)};
#endif
}

template <typename T, size_t N>
HWY_INLINE Mask128<T, N> Xor(hwy::SizeTag<1> /*tag*/, const Mask128<T, N> a,
                             const Mask128<T, N> b) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return Mask128<T, N>{_kxor_mask16(a.raw, b.raw)};
#else
  return Mask128<T, N>{static_cast<__mmask16>(a.raw ^ b.raw)};
#endif
}
template <typename T, size_t N>
HWY_INLINE Mask128<T, N> Xor(hwy::SizeTag<2> /*tag*/, const Mask128<T, N> a,
                             const Mask128<T, N> b) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return Mask128<T, N>{_kxor_mask8(a.raw, b.raw)};
#else
  return Mask128<T, N>{static_cast<__mmask8>(a.raw ^ b.raw)};
#endif
}
template <typename T, size_t N>
HWY_INLINE Mask128<T, N> Xor(hwy::SizeTag<4> /*tag*/, const Mask128<T, N> a,
                             const Mask128<T, N> b) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return Mask128<T, N>{_kxor_mask8(a.raw, b.raw)};
#else
  return Mask128<T, N>{static_cast<__mmask8>(a.raw ^ b.raw)};
#endif
}
template <typename T, size_t N>
HWY_INLINE Mask128<T, N> Xor(hwy::SizeTag<8> /*tag*/, const Mask128<T, N> a,
                             const Mask128<T, N> b) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return Mask128<T, N>{_kxor_mask8(a.raw, b.raw)};
#else
  return Mask128<T, N>{static_cast<__mmask8>(a.raw ^ b.raw)};
#endif
}

}  // namespace detail

template <typename T, size_t N>
HWY_API Mask128<T, N> And(const Mask128<T, N> a, Mask128<T, N> b) {
  return detail::And(hwy::SizeTag<sizeof(T)>(), a, b);
}

template <typename T, size_t N>
HWY_API Mask128<T, N> AndNot(const Mask128<T, N> a, Mask128<T, N> b) {
  return detail::AndNot(hwy::SizeTag<sizeof(T)>(), a, b);
}

template <typename T, size_t N>
HWY_API Mask128<T, N> Or(const Mask128<T, N> a, Mask128<T, N> b) {
  return detail::Or(hwy::SizeTag<sizeof(T)>(), a, b);
}

template <typename T, size_t N>
HWY_API Mask128<T, N> Xor(const Mask128<T, N> a, Mask128<T, N> b) {
  return detail::Xor(hwy::SizeTag<sizeof(T)>(), a, b);
}

template <typename T, size_t N>
HWY_API Mask128<T, N> Not(const Mask128<T, N> m) {
  // Flip only the valid bits.
  return Xor(m, Mask128<T, N>::FromBits((1ull << N) - 1));
}

#else  // AVX2 or below

// ------------------------------ Mask

// Mask and Vec are the same (true = FF..FF).
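// Converting between them is therefore free: the raw register is reused as-is.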
template <typename T, size_t N>
HWY_API Mask128<T, N> MaskFromVec(const Vec128<T, N> v) {
  return Mask128<T, N>{v.raw};
}

template <typename T, size_t N>
HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
  return Vec128<T, N>{v.raw};
}

template <typename T, size_t N>
HWY_API Vec128<T, N> VecFromMask(const Simd<T, N> /* tag */,
                                 const Mask128<T, N> v) {
  return Vec128<T, N>{v.raw};
}

#if HWY_TARGET == HWY_SSSE3

// mask ? yes : no
template <typename T, size_t N>
HWY_API Vec128<T, N> IfThenElse(Mask128<T, N> mask, Vec128<T, N> yes,
                                Vec128<T, N> no) {
  const auto vmask = VecFromMask(Simd<T, N>(), mask);
  return Or(And(vmask, yes), AndNot(vmask, no));
}

#else  // HWY_TARGET == HWY_SSSE3

// mask ? yes : no
template <typename T, size_t N>
HWY_API Vec128<T, N> IfThenElse(Mask128<T, N> mask, Vec128<T, N> yes,
                                Vec128<T, N> no) {
  return Vec128<T, N>{_mm_blendv_epi8(no.raw, yes.raw, mask.raw)};
}
template <size_t N>
HWY_API Vec128<float, N> IfThenElse(const Mask128<float, N> mask,
                                    const Vec128<float, N> yes,
                                    const Vec128<float, N> no) {
  return Vec128<float, N>{_mm_blendv_ps(no.raw, yes.raw, mask.raw)};
}
template <size_t N>
HWY_API Vec128<double, N> IfThenElse(const Mask128<double, N> mask,
                                     const Vec128<double, N> yes,
                                     const Vec128<double, N> no) {
  return Vec128<double, N>{_mm_blendv_pd(no.raw, yes.raw, mask.raw)};
}

#endif  // HWY_TARGET == HWY_SSSE3

// mask ? yes : 0
template <typename T, size_t N>
HWY_API Vec128<T, N> IfThenElseZero(Mask128<T, N> mask, Vec128<T, N> yes) {
  return yes & VecFromMask(Simd<T, N>(), mask);
}

// mask ? 0 : no
template <typename T, size_t N>
HWY_API Vec128<T, N> IfThenZeroElse(Mask128<T, N> mask, Vec128<T, N> no) {
  return AndNot(VecFromMask(Simd<T, N>(), mask), no);
}

// ------------------------------ Mask logical

template <typename T, size_t N>
HWY_API Mask128<T, N> Not(const Mask128<T, N> m) {
  return MaskFromVec(Not(VecFromMask(Simd<T, N>(), m)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> And(const Mask128<T, N> a, Mask128<T, N> b) {
  const Simd<T, N> d;
  return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> AndNot(const Mask128<T, N> a, Mask128<T, N> b) {
  const Simd<T, N> d;
  return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> Or(const Mask128<T, N> a, Mask128<T, N> b) {
  const Simd<T, N> d;
  return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> Xor(const Mask128<T, N> a, Mask128<T, N> b) {
  const Simd<T, N> d;
  return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
}

#endif  // HWY_TARGET <= HWY_AVX3

// ================================================== SWIZZLE (1)

// ------------------------------ Hard-coded shuffles

// Notation: let Vec128<int32_t> have lanes 3,2,1,0 (0 is least-significant).
// Shuffle0321 rotates one lane to the right (the previous least-significant
// lane is now most-significant). These could also be implemented via
// CombineShiftRightBytes but the shuffle_abcd notation is more convenient.
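// For example, Shuffle2301 maps input lanes 3,2,1,0 to output lanes 2,3,0,1.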

// Swap 32-bit halves in 64-bit halves.
template <size_t N>
HWY_API Vec128<uint32_t, N> Shuffle2301(const Vec128<uint32_t, N> v) {
  static_assert(N == 2 || N == 4, "Does not make sense for N=1");
  return Vec128<uint32_t, N>{_mm_shuffle_epi32(v.raw, 0xB1)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> Shuffle2301(const Vec128<int32_t, N> v) {
  static_assert(N == 2 || N == 4, "Does not make sense for N=1");
  return Vec128<int32_t, N>{_mm_shuffle_epi32(v.raw, 0xB1)};
}
template <size_t N>
HWY_API Vec128<float, N> Shuffle2301(const Vec128<float, N> v) {
  static_assert(N == 2 || N == 4, "Does not make sense for N=1");
  return Vec128<float, N>{_mm_shuffle_ps(v.raw, v.raw, 0xB1)};
}

// Swap 64-bit halves
HWY_API Vec128<uint32_t> Shuffle1032(const Vec128<uint32_t> v) {
  return Vec128<uint32_t>{_mm_shuffle_epi32(v.raw, 0x4E)};
}
HWY_API Vec128<int32_t> Shuffle1032(const Vec128<int32_t> v) {
  return Vec128<int32_t>{_mm_shuffle_epi32(v.raw, 0x4E)};
}
HWY_API Vec128<float> Shuffle1032(const Vec128<float> v) {
  return Vec128<float>{_mm_shuffle_ps(v.raw, v.raw, 0x4E)};
}
HWY_API Vec128<uint64_t> Shuffle01(const Vec128<uint64_t> v) {
  return Vec128<uint64_t>{_mm_shuffle_epi32(v.raw, 0x4E)};
}
HWY_API Vec128<int64_t> Shuffle01(const Vec128<int64_t> v) {
  return Vec128<int64_t>{_mm_shuffle_epi32(v.raw, 0x4E)};
}
HWY_API Vec128<double> Shuffle01(const Vec128<double> v) {
  return Vec128<double>{_mm_shuffle_pd(v.raw, v.raw, 1)};
}

// Rotate right 32 bits
HWY_API Vec128<uint32_t> Shuffle0321(const Vec128<uint32_t> v) {
  return Vec128<uint32_t>{_mm_shuffle_epi32(v.raw, 0x39)};
}
HWY_API Vec128<int32_t> Shuffle0321(const Vec128<int32_t> v) {
  return Vec128<int32_t>{_mm_shuffle_epi32(v.raw, 0x39)};
}
HWY_API Vec128<float> Shuffle0321(const Vec128<float> v) {
  return Vec128<float>{_mm_shuffle_ps(v.raw, v.raw, 0x39)};
}
// Rotate left 32 bits
HWY_API Vec128<uint32_t> Shuffle2103(const Vec128<uint32_t> v) {
  return Vec128<uint32_t>{_mm_shuffle_epi32(v.raw, 0x93)};
}
HWY_API Vec128<int32_t> Shuffle2103(const Vec128<int32_t> v) {
  return Vec128<int32_t>{_mm_shuffle_epi32(v.raw, 0x93)};
}
HWY_API Vec128<float> Shuffle2103(const Vec128<float> v) {
  return Vec128<float>{_mm_shuffle_ps(v.raw, v.raw, 0x93)};
}

// Reverse
HWY_API Vec128<uint32_t> Shuffle0123(const Vec128<uint32_t> v) {
  return Vec128<uint32_t>{_mm_shuffle_epi32(v.raw, 0x1B)};
}
HWY_API Vec128<int32_t> Shuffle0123(const Vec128<int32_t> v) {
  return Vec128<int32_t>{_mm_shuffle_epi32(v.raw, 0x1B)};
}
HWY_API Vec128<float> Shuffle0123(const Vec128<float> v) {
  return Vec128<float>{_mm_shuffle_ps(v.raw, v.raw, 0x1B)};
}

// ================================================== COMPARE

#if HWY_TARGET <= HWY_AVX3

// Comparisons set a mask bit to 1 if the condition is true, else 0.
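// Example (illustrative): Eq on i32 lanes {1, 2, 3, 4} and {1, 0, 3, 0} sets
// mask bits 0 and 2, i.e. 0b0101.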

template <typename TFrom, size_t NFrom, typename TTo, size_t NTo>
HWY_API Mask128<TTo, NTo> RebindMask(Simd<TTo, NTo> /*tag*/,
                                     Mask128<TFrom, NFrom> m) {
  static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
  return Mask128<TTo, NTo>{m.raw};
}

namespace detail {

template <typename T, size_t N>
HWY_INLINE Mask128<T, N> TestBit(hwy::SizeTag<1> /*tag*/, const Vec128<T, N> v,
                                 const Vec128<T, N> bit) {
  return Mask128<T, N>{_mm_test_epi8_mask(v.raw, bit.raw)};
}
template <typename T, size_t N>
HWY_INLINE Mask128<T, N> TestBit(hwy::SizeTag<2> /*tag*/, const Vec128<T, N> v,
                                 const Vec128<T, N> bit) {
  return Mask128<T, N>{_mm_test_epi16_mask(v.raw, bit.raw)};
}
template <typename T, size_t N>
HWY_INLINE Mask128<T, N> TestBit(hwy::SizeTag<4> /*tag*/, const Vec128<T, N> v,
                                 const Vec128<T, N> bit) {
  return Mask128<T, N>{_mm_test_epi32_mask(v.raw, bit.raw)};
}
template <typename T, size_t N>
HWY_INLINE Mask128<T, N> TestBit(hwy::SizeTag<8> /*tag*/, const Vec128<T, N> v,
                                 const Vec128<T, N> bit) {
  return Mask128<T, N>{_mm_test_epi64_mask(v.raw, bit.raw)};
}

}  // namespace detail

template <typename T, size_t N>
HWY_API Mask128<T, N> TestBit(const Vec128<T, N> v, const Vec128<T, N> bit) {
  static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
  return detail::TestBit(hwy::SizeTag<sizeof(T)>(), v, bit);
}

// ------------------------------ Equality

template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
HWY_API Mask128<T, N> operator==(const Vec128<T, N> a, const Vec128<T, N> b) {
  return Mask128<T, N>{_mm_cmpeq_epi8_mask(a.raw, b.raw)};
}

template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
HWY_API Mask128<T, N> operator==(const Vec128<T, N> a, const Vec128<T, N> b) {
  return Mask128<T, N>{_mm_cmpeq_epi16_mask(a.raw, b.raw)};
}

template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Mask128<T, N> operator==(const Vec128<T, N> a, const Vec128<T, N> b) {
  return Mask128<T, N>{_mm_cmpeq_epi32_mask(a.raw, b.raw)};
}

template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Mask128<T, N> operator==(const Vec128<T, N> a, const Vec128<T, N> b) {
  return Mask128<T, N>{_mm_cmpeq_epi64_mask(a.raw, b.raw)};
}

template <size_t N>
HWY_API Mask128<float, N> operator==(Vec128<float, N> a, Vec128<float, N> b) {
  return Mask128<float, N>{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_EQ_OQ)};
}

template <size_t N>
HWY_API Mask128<double, N> operator==(Vec128<double, N> a,
                                      Vec128<double, N> b) {
  return Mask128<double, N>{_mm_cmp_pd_mask(a.raw, b.raw, _CMP_EQ_OQ)};
}

// ------------------------------ Inequality

template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
HWY_API Mask128<T, N> operator!=(const Vec128<T, N> a, const Vec128<T, N> b) {
  return Mask128<T, N>{_mm_cmpneq_epi8_mask(a.raw, b.raw)};
}

template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
HWY_API Mask128<T, N> operator!=(const Vec128<T, N> a, const Vec128<T, N> b) {
  return Mask128<T, N>{_mm_cmpneq_epi16_mask(a.raw, b.raw)};
}

template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Mask128<T, N> operator!=(const Vec128<T, N> a, const Vec128<T, N> b) {
  return Mask128<T, N>{_mm_cmpneq_epi32_mask(a.raw, b.raw)};
}

template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Mask128<T, N> operator!=(const Vec128<T, N> a, const Vec128<T, N> b) {
  return Mask128<T, N>{_mm_cmpneq_epi64_mask(a.raw, b.raw)};
}

template <size_t N>
HWY_API Mask128<float, N> operator!=(Vec128<float, N> a, Vec128<float, N> b) {
  return Mask128<float, N>{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_NEQ_OQ)};
}

template <size_t N>
HWY_API Mask128<double, N> operator!=(Vec128<double, N> a,
                                      Vec128<double, N> b) {
  return Mask128<double, N>{_mm_cmp_pd_mask(a.raw, b.raw, _CMP_NEQ_OQ)};
}

// ------------------------------ Strict inequality

// Signed/float >
template <size_t N>
HWY_API Mask128<int8_t, N> operator>(Vec128<int8_t, N> a, Vec128<int8_t, N> b) {
  return Mask128<int8_t, N>{_mm_cmpgt_epi8_mask(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int16_t, N> operator>(Vec128<int16_t, N> a,
                                      Vec128<int16_t, N> b) {
  return Mask128<int16_t, N>{_mm_cmpgt_epi16_mask(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int32_t, N> operator>(Vec128<int32_t, N> a,
                                      Vec128<int32_t, N> b) {
  return Mask128<int32_t, N>{_mm_cmpgt_epi32_mask(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int64_t, N> operator>(Vec128<int64_t, N> a,
                                      Vec128<int64_t, N> b) {
  return Mask128<int64_t, N>{_mm_cmpgt_epi64_mask(a.raw, b.raw)};
}

template <size_t N>
HWY_API Mask128<uint8_t, N> operator>(Vec128<uint8_t, N> a,
                                      Vec128<uint8_t, N> b) {
  return Mask128<uint8_t, N>{_mm_cmpgt_epu8_mask(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint16_t, N> operator>(Vec128<uint16_t, N> a,
                                       Vec128<uint16_t, N> b) {
  return Mask128<uint16_t, N>{_mm_cmpgt_epu16_mask(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint32_t, N> operator>(Vec128<uint32_t, N> a,
                                       Vec128<uint32_t, N> b) {
  return Mask128<uint32_t, N>{_mm_cmpgt_epu32_mask(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint64_t, N> operator>(Vec128<uint64_t, N> a,
                                       Vec128<uint64_t, N> b) {
  return Mask128<uint64_t, N>{_mm_cmpgt_epu64_mask(a.raw, b.raw)};
}

template <size_t N>
HWY_API Mask128<float, N> operator>(Vec128<float, N> a, Vec128<float, N> b) {
  return Mask128<float, N>{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_GT_OQ)};
}
template <size_t N>
HWY_API Mask128<double, N> operator>(Vec128<double, N> a, Vec128<double, N> b) {
  return Mask128<double, N>{_mm_cmp_pd_mask(a.raw, b.raw, _CMP_GT_OQ)};
}

// ------------------------------ Weak inequality

template <size_t N>
HWY_API Mask128<float, N> operator>=(Vec128<float, N> a, Vec128<float, N> b) {
  return Mask128<float, N>{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_GE_OQ)};
}
template <size_t N>
HWY_API Mask128<double, N> operator>=(Vec128<double, N> a,
                                      Vec128<double, N> b) {
  return Mask128<double, N>{_mm_cmp_pd_mask(a.raw, b.raw, _CMP_GE_OQ)};
}

// ------------------------------ Mask

namespace detail {

template <typename T, size_t N>
HWY_INLINE Mask128<T, N> MaskFromVec(hwy::SizeTag<1> /*tag*/,
                                     const Vec128<T, N> v) {
  return Mask128<T, N>{_mm_movepi8_mask(v.raw)};
}
template <typename T, size_t N>
HWY_INLINE Mask128<T, N> MaskFromVec(hwy::SizeTag<2> /*tag*/,
                                     const Vec128<T, N> v) {
  return Mask128<T, N>{_mm_movepi16_mask(v.raw)};
}
template <typename T, size_t N>
HWY_INLINE Mask128<T, N> MaskFromVec(hwy::SizeTag<4> /*tag*/,
                                     const Vec128<T, N> v) {
  return Mask128<T, N>{_mm_movepi32_mask(v.raw)};
}
template <typename T, size_t N>
HWY_INLINE Mask128<T, N> MaskFromVec(hwy::SizeTag<8> /*tag*/,
                                     const Vec128<T, N> v) {
  return Mask128<T, N>{_mm_movepi64_mask(v.raw)};
}

}  // namespace detail

template <typename T, size_t N>
HWY_API Mask128<T, N> MaskFromVec(const Vec128<T, N> v) {
  return detail::MaskFromVec(hwy::SizeTag<sizeof(T)>(), v);
}
// There do not seem to be native floating-point versions of these
// instructions.
template <size_t N>
HWY_API Mask128<float, N> MaskFromVec(const Vec128<float, N> v) {
  return Mask128<float, N>{MaskFromVec(BitCast(Simd<int32_t, N>(), v)).raw};
}
template <size_t N>
HWY_API Mask128<double, N> MaskFromVec(const Vec128<double, N> v) {
  return Mask128<double, N>{MaskFromVec(BitCast(Simd<int64_t, N>(), v)).raw};
}

template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
  return Vec128<T, N>{_mm_movm_epi8(v.raw)};
}

template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
  return Vec128<T, N>{_mm_movm_epi16(v.raw)};
}

template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
  return Vec128<T, N>{_mm_movm_epi32(v.raw)};
}

template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
  return Vec128<T, N>{_mm_movm_epi64(v.raw)};
}

template <size_t N>
HWY_API Vec128<float, N> VecFromMask(const Mask128<float, N> v) {
  return Vec128<float, N>{_mm_castsi128_ps(_mm_movm_epi32(v.raw))};
}

template <size_t N>
HWY_API Vec128<double, N> VecFromMask(const Mask128<double, N> v) {
  return Vec128<double, N>{_mm_castsi128_pd(_mm_movm_epi64(v.raw))};
}

template <typename T, size_t N>
HWY_API Vec128<T, N> VecFromMask(Simd<T, N> /* tag */, const Mask128<T, N> v) {
  return VecFromMask(v);
}

#else  // AVX2 or below

// Comparisons fill a lane with 1-bits if the condition is true, else 0.
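// Example (illustrative): Eq on i32 lanes {1, 2, 3, 4} and {1, 0, 3, 0} yields
// lanes {FF..FF, 0, FF..FF, 0}.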

template <typename TFrom, typename TTo, size_t N>
HWY_API Mask128<TTo, N> RebindMask(Simd<TTo, N> /*tag*/, Mask128<TFrom, N> m) {
  static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
  const Simd<TFrom, N> d;
  return MaskFromVec(BitCast(Simd<TTo, N>(), VecFromMask(d, m)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> TestBit(Vec128<T, N> v, Vec128<T, N> bit) {
  static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
  return (v & bit) == bit;
}

// ------------------------------ Equality

// Unsigned
template <size_t N>
HWY_API Mask128<uint8_t, N> operator==(const Vec128<uint8_t, N> a,
                                       const Vec128<uint8_t, N> b) {
  return Mask128<uint8_t, N>{_mm_cmpeq_epi8(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint16_t, N> operator==(const Vec128<uint16_t, N> a,
                                        const Vec128<uint16_t, N> b) {
  return Mask128<uint16_t, N>{_mm_cmpeq_epi16(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint32_t, N> operator==(const Vec128<uint32_t, N> a,
                                        const Vec128<uint32_t, N> b) {
  return Mask128<uint32_t, N>{_mm_cmpeq_epi32(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint64_t, N> operator==(const Vec128<uint64_t, N> a,
                                        const Vec128<uint64_t, N> b) {
#if HWY_TARGET == HWY_SSSE3
  // SSSE3 lacks a 64-bit compare: compare the 32-bit halves and AND each
  // half's result with that of its neighbor.
  const Simd<uint32_t, N * 2> d32;
  const Simd<uint64_t, N> d64;
  const auto cmp32 = VecFromMask(d32, Eq(BitCast(d32, a), BitCast(d32, b)));
  const auto cmp64 = cmp32 & Shuffle2301(cmp32);
  return MaskFromVec(BitCast(d64, cmp64));
#else
  return Mask128<uint64_t, N>{_mm_cmpeq_epi64(a.raw, b.raw)};
#endif
}

// Signed
template <size_t N>
HWY_API Mask128<int8_t, N> operator==(const Vec128<int8_t, N> a,
                                      const Vec128<int8_t, N> b) {
  return Mask128<int8_t, N>{_mm_cmpeq_epi8(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int16_t, N> operator==(Vec128<int16_t, N> a,
                                       Vec128<int16_t, N> b) {
  return Mask128<int16_t, N>{_mm_cmpeq_epi16(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int32_t, N> operator==(const Vec128<int32_t, N> a,
                                       const Vec128<int32_t, N> b) {
  return Mask128<int32_t, N>{_mm_cmpeq_epi32(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int64_t, N> operator==(const Vec128<int64_t, N> a,
                                       const Vec128<int64_t, N> b) {
  // Equality is the same for signed and unsigned; reuse the unsigned version
  // to avoid duplicating the SSSE3 workaround above.
  const Simd<uint64_t, N> du;
  return RebindMask(Simd<int64_t, N>(), BitCast(du, a) == BitCast(du, b));
}

// Float
template <size_t N>
HWY_API Mask128<float, N> operator==(const Vec128<float, N> a,
                                     const Vec128<float, N> b) {
  return Mask128<float, N>{_mm_cmpeq_ps(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<double, N> operator==(const Vec128<double, N> a,
                                      const Vec128<double, N> b) {
  return Mask128<double, N>{_mm_cmpeq_pd(a.raw, b.raw)};
}

// ------------------------------ Inequality

template <typename T, size_t N, HWY_IF_NOT_FLOAT(T)>
HWY_API Mask128<T, N> operator!=(const Vec128<T, N> a, const Vec128<T, N> b) {
  return Not(a == b);
}

template <size_t N>
HWY_API Mask128<float, N> operator!=(const Vec128<float, N> a,
                                     const Vec128<float, N> b) {
  return Mask128<float, N>{_mm_cmpneq_ps(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<double, N> operator!=(const Vec128<double, N> a,
                                      const Vec128<double, N> b) {
  return Mask128<double, N>{_mm_cmpneq_pd(a.raw, b.raw)};
}

// ------------------------------ Strict inequality

// Signed/float >
template <size_t N>
HWY_API Mask128<int8_t, N> operator>(Vec128<int8_t, N> a, Vec128<int8_t, N> b) {
  return Mask128<int8_t, N>{_mm_cmpgt_epi8(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int16_t, N> operator>(Vec128<int16_t, N> a,
                                      Vec128<int16_t, N> b) {
  return Mask128<int16_t, N>{_mm_cmpgt_epi16(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int32_t, N> operator>(Vec128<int32_t, N> a,
                                      Vec128<int32_t, N> b) {
  return Mask128<int32_t, N>{_mm_cmpgt_epi32(a.raw, b.raw)};
}

template <typename T, size_t N, HWY_IF_UNSIGNED(T)>
HWY_API Mask128<T, N> operator>(Vec128<T, N> a, Vec128<T, N> b) {
  // There are no unsigned compares; flipping the sign bit maps unsigned order
  // to signed order, so a signed compare of the biased values is equivalent.
  const Simd<T, N> du;
  const RebindToSigned<decltype(du)> di;
  const Vec128<T, N> msb = Set(du, (LimitsMax<T>() >> 1) + 1);
  return RebindMask(du, BitCast(di, Xor(a, msb)) > BitCast(di, Xor(b, msb)));
}

template <size_t N>
HWY_API Mask128<float, N> operator>(Vec128<float, N> a, Vec128<float, N> b) {
  return Mask128<float, N>{_mm_cmpgt_ps(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<double, N> operator>(Vec128<double, N> a, Vec128<double, N> b) {
  return Mask128<double, N>{_mm_cmpgt_pd(a.raw, b.raw)};
}

template <size_t N>
HWY_API Mask128<int64_t, N> operator>(const Vec128<int64_t, N> a,
                                      const Vec128<int64_t, N> b) {
#if HWY_TARGET == HWY_SSSE3
  // If the upper half is less than or greater, this is the answer.
  const __m128i m_gt = _mm_cmpgt_epi32(a.raw, b.raw);

  // Otherwise, the lower half decides.
  const __m128i m_eq = _mm_cmpeq_epi32(a.raw, b.raw);
  const __m128i lo_in_hi = _mm_shuffle_epi32(m_gt, _MM_SHUFFLE(2, 2, 0, 0));
  const __m128i lo_gt = _mm_and_si128(m_eq, lo_in_hi);

  const __m128i gt = _mm_or_si128(lo_gt, m_gt);
  // Copy result in upper 32 bits to lower 32 bits.
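  // (After this, each 64-bit lane is all-ones or all-zero, as a mask requires.)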
  return Mask128<int64_t, N>{_mm_shuffle_epi32(gt, _MM_SHUFFLE(3, 3, 1, 1))};
#else
  return Mask128<int64_t, N>{_mm_cmpgt_epi64(a.raw, b.raw)};  // SSE4.2
#endif
}

// ------------------------------ Weak inequality

template <size_t N>
HWY_API Mask128<float, N> operator>=(const Vec128<float, N> a,
                                     const Vec128<float, N> b) {
  return Mask128<float, N>{_mm_cmpge_ps(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<double, N> operator>=(const Vec128<double, N> a,
                                      const Vec128<double, N> b) {
  return Mask128<double, N>{_mm_cmpge_pd(a.raw, b.raw)};
}

#endif  // HWY_TARGET <= HWY_AVX3

// ------------------------------ Reversed comparisons

template <typename T, size_t N>
HWY_API Mask128<T, N> operator<(Vec128<T, N> a, Vec128<T, N> b) {
  return b > a;
}

template <typename T, size_t N>
HWY_API Mask128<T, N> operator<=(Vec128<T, N> a, Vec128<T, N> b) {
  return b >= a;
}

// ------------------------------ FirstN (Iota, Lt)

template <typename T, size_t N, HWY_IF_LE128(T, N)>
HWY_API Mask128<T, N> FirstN(const Simd<T, N> d, size_t num) {
#if HWY_TARGET <= HWY_AVX3
  (void)d;
  const uint64_t all = (1ull << N) - 1;
  // BZHI only looks at the lower 8 bits of num!
  const uint64_t bits = (num > 255) ? all : _bzhi_u64(all, num);
  return Mask128<T, N>::FromBits(bits);
#else
  const RebindToSigned<decltype(d)> di;  // Signed comparisons are cheaper.
  return RebindMask(d, Iota(di, 0) < Set(di, static_cast<MakeSigned<T>>(num)));
#endif
}

template <class D>
using MFromD = decltype(FirstN(D(), 0));

// ================================================== MEMORY (1)

// Clang static analysis claims the memory immediately after a partial vector
// store is uninitialized, and also flags the input to partial loads (at least
// for loadl_pd) as "garbage". This is a false alarm because msan does not
// raise errors. We work around this by using CopyBytes instead of intrinsics,
// but only for the analyzer to avoid potentially bad code generation.
// Unfortunately __clang_analyzer__ was not defined for clang-tidy prior to v7.
#ifndef HWY_SAFE_PARTIAL_LOAD_STORE
#if defined(__clang_analyzer__) || \
    (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 700)
#define HWY_SAFE_PARTIAL_LOAD_STORE 1
#else
#define HWY_SAFE_PARTIAL_LOAD_STORE 0
#endif
#endif  // HWY_SAFE_PARTIAL_LOAD_STORE

// ------------------------------ Load

template <typename T>
HWY_API Vec128<T> Load(Full128<T> /* tag */, const T* HWY_RESTRICT aligned) {
  return Vec128<T>{_mm_load_si128(reinterpret_cast<const __m128i*>(aligned))};
}
HWY_API Vec128<float> Load(Full128<float> /* tag */,
                           const float* HWY_RESTRICT aligned) {
  return Vec128<float>{_mm_load_ps(aligned)};
}
HWY_API Vec128<double> Load(Full128<double> /* tag */,
                            const double* HWY_RESTRICT aligned) {
  return Vec128<double>{_mm_load_pd(aligned)};
}

template <typename T>
HWY_API Vec128<T> LoadU(Full128<T> /* tag */, const T* HWY_RESTRICT p) {
  return Vec128<T>{_mm_loadu_si128(reinterpret_cast<const __m128i*>(p))};
}
HWY_API Vec128<float> LoadU(Full128<float> /* tag */,
                            const float* HWY_RESTRICT p) {
  return Vec128<float>{_mm_loadu_ps(p)};
}
HWY_API Vec128<double> LoadU(Full128<double> /* tag */,
                             const double* HWY_RESTRICT p) {
  return Vec128<double>{_mm_loadu_pd(p)};
}

template <typename T>
HWY_API Vec128<T, 8 / sizeof(T)> Load(Simd<T, 8 / sizeof(T)> /* tag */,
                                      const T* HWY_RESTRICT p) {
#if HWY_SAFE_PARTIAL_LOAD_STORE
  __m128i v = _mm_setzero_si128();
  CopyBytes<8>(p, &v);
  return Vec128<T, 8 / sizeof(T)>{v};
#else
  return Vec128<T, 8 / sizeof(T)>{
      _mm_loadl_epi64(reinterpret_cast<const __m128i*>(p))};
#endif
}

HWY_API Vec128<float, 2> Load(Simd<float, 2> /* tag */,
                              const float* HWY_RESTRICT p) {
#if HWY_SAFE_PARTIAL_LOAD_STORE
  __m128 v = _mm_setzero_ps();
  CopyBytes<8>(p, &v);
  return Vec128<float, 2>{v};
#else
  const __m128 hi = _mm_setzero_ps();
  return Vec128<float, 2>{_mm_loadl_pi(hi, reinterpret_cast<const __m64*>(p))};
#endif
}

HWY_API Vec128<double, 1> Load(Simd<double, 1> /* tag */,
                               const double* HWY_RESTRICT p) {
#if HWY_SAFE_PARTIAL_LOAD_STORE
  __m128d v = _mm_setzero_pd();
  CopyBytes<8>(p, &v);
  return Vec128<double, 1>{v};
#else
  return Vec128<double, 1>{_mm_load_sd(p)};
#endif
}

HWY_API Vec128<float, 1> Load(Simd<float, 1> /* tag */,
                              const float* HWY_RESTRICT p) {
#if HWY_SAFE_PARTIAL_LOAD_STORE
  __m128 v = _mm_setzero_ps();
  CopyBytes<4>(p, &v);
  return Vec128<float, 1>{v};
#else
  return Vec128<float, 1>{_mm_load_ss(p)};
#endif
}

// Any <= 32 bit except <float, 1>
template <typename T, size_t N, HWY_IF_LE32(T, N)>
HWY_API Vec128<T, N> Load(Simd<T, N> /* tag */, const T* HWY_RESTRICT p) {
  constexpr size_t kSize = sizeof(T) * N;
#if HWY_SAFE_PARTIAL_LOAD_STORE
  // Integer lanes only (the <float, 1> overload above handles float), so use
  // the integer register type.
  __m128i v = _mm_setzero_si128();
  CopyBytes<kSize>(p, &v);
  return Vec128<T, N>{v};
#else
  int32_t bits;
  CopyBytes<kSize>(p, &bits);
  return Vec128<T, N>{_mm_cvtsi32_si128(bits)};
#endif
}

// For < 128 bit, LoadU == Load.
template <typename T, size_t N, HWY_IF_LE64(T, N)>
HWY_API Vec128<T, N> LoadU(Simd<T, N> d, const T* HWY_RESTRICT p) {
  return Load(d, p);
}

// 128-bit SIMD => nothing to duplicate, same as an unaligned load.
1679 template <typename T, size_t N, HWY_IF_LE128(T, N)> 1680 HWY_API Vec128<T, N> LoadDup128(Simd<T, N> d, const T* HWY_RESTRICT p) { 1681 return LoadU(d, p); 1682 } 1683 1684 // Returns a vector with lane i=[0, N) set to "first" + i. 1685 template <typename T, size_t N, typename T2, HWY_IF_LE128(T, N)> 1686 HWY_API Vec128<T, N> Iota(const Simd<T, N> d, const T2 first) { 1687 HWY_ALIGN T lanes[16 / sizeof(T)]; 1688 for (size_t i = 0; i < 16 / sizeof(T); ++i) { 1689 lanes[i] = static_cast<T>(first + static_cast<T2>(i)); 1690 } 1691 return Load(d, lanes); 1692 } 1693 1694 // ------------------------------ MaskedLoad 1695 1696 #if HWY_TARGET <= HWY_AVX3 1697 1698 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)> 1699 HWY_API Vec128<T, N> MaskedLoad(Mask128<T, N> m, Simd<T, N> /* tag */, 1700 const T* HWY_RESTRICT aligned) { 1701 return Vec128<T, N>{_mm_maskz_load_epi32(m.raw, aligned)}; 1702 } 1703 1704 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)> 1705 HWY_API Vec128<T, N> MaskedLoad(Mask128<T, N> m, Simd<T, N> /* tag */, 1706 const T* HWY_RESTRICT aligned) { 1707 return Vec128<T, N>{_mm_maskz_load_epi64(m.raw, aligned)}; 1708 } 1709 1710 template <size_t N> 1711 HWY_API Vec128<float, N> MaskedLoad(Mask128<float, N> m, 1712 Simd<float, N> /* tag */, 1713 const float* HWY_RESTRICT aligned) { 1714 return Vec128<float, N>{_mm_maskz_load_ps(m.raw, aligned)}; 1715 } 1716 1717 template <size_t N> 1718 HWY_API Vec128<double, N> MaskedLoad(Mask128<double, N> m, 1719 Simd<double, N> /* tag */, 1720 const double* HWY_RESTRICT aligned) { 1721 return Vec128<double, N>{_mm_maskz_load_pd(m.raw, aligned)}; 1722 } 1723 1724 // There is no load_epi8/16, so use loadu instead. 1725 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)> 1726 HWY_API Vec128<T, N> MaskedLoad(Mask128<T, N> m, Simd<T, N> /* tag */, 1727 const T* HWY_RESTRICT aligned) { 1728 return Vec128<T, N>{_mm_maskz_loadu_epi8(m.raw, aligned)}; 1729 } 1730 1731 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)> 1732 HWY_API Vec128<T, N> MaskedLoad(Mask128<T, N> m, Simd<T, N> /* tag */, 1733 const T* HWY_RESTRICT aligned) { 1734 return Vec128<T, N>{_mm_maskz_loadu_epi16(m.raw, aligned)}; 1735 } 1736 1737 #elif HWY_TARGET == HWY_AVX2 1738 1739 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)> 1740 HWY_API Vec128<T, N> MaskedLoad(Mask128<T, N> m, Simd<T, N> /* tag */, 1741 const T* HWY_RESTRICT aligned) { 1742 auto aligned_p = reinterpret_cast<const int*>(aligned); // NOLINT 1743 return Vec128<T, N>{_mm_maskload_epi32(aligned_p, m.raw)}; 1744 } 1745 1746 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)> 1747 HWY_API Vec128<T, N> MaskedLoad(Mask128<T, N> m, Simd<T, N> /* tag */, 1748 const T* HWY_RESTRICT aligned) { 1749 auto aligned_p = reinterpret_cast<const long long*>(aligned); // NOLINT 1750 return Vec128<T, N>{_mm_maskload_epi64(aligned_p, m.raw)}; 1751 } 1752 1753 template <size_t N> 1754 HWY_API Vec128<float, N> MaskedLoad(Mask128<float, N> m, Simd<float, N> d, 1755 const float* HWY_RESTRICT aligned) { 1756 const Vec128<int32_t, N> mi = 1757 BitCast(RebindToSigned<decltype(d)>(), VecFromMask(d, m)); 1758 return Vec128<float, N>{_mm_maskload_ps(aligned, mi.raw)}; 1759 } 1760 1761 template <size_t N> 1762 HWY_API Vec128<double, N> MaskedLoad(Mask128<double, N> m, Simd<double, N> d, 1763 const double* HWY_RESTRICT aligned) { 1764 const Vec128<int64_t, N> mi = 1765 BitCast(RebindToSigned<decltype(d)>(), VecFromMask(d, m)); 1766 return Vec128<double, N>{_mm_maskload_pd(aligned, mi.raw)}; 1767 } 1768 1769 // 
There is no maskload_epi8/16, so blend instead. 1770 template <typename T, size_t N, hwy::EnableIf<sizeof(T) <= 2>* = nullptr> 1771 HWY_API Vec128<T, N> MaskedLoad(Mask128<T, N> m, Simd<T, N> d, 1772 const T* HWY_RESTRICT aligned) { 1773 return IfThenElseZero(m, Load(d, aligned)); 1774 } 1775 1776 #else // <= SSE4 1777 1778 // Avoid maskmov* - its nontemporal 'hint' causes it to bypass caches (slow). 1779 template <typename T, size_t N> 1780 HWY_API Vec128<T, N> MaskedLoad(Mask128<T, N> m, Simd<T, N> d, 1781 const T* HWY_RESTRICT aligned) { 1782 return IfThenElseZero(m, Load(d, aligned)); 1783 } 1784 1785 #endif 1786 1787 // ------------------------------ Store 1788 1789 template <typename T> 1790 HWY_API void Store(Vec128<T> v, Full128<T> /* tag */, T* HWY_RESTRICT aligned) { 1791 _mm_store_si128(reinterpret_cast<__m128i*>(aligned), v.raw); 1792 } 1793 HWY_API void Store(const Vec128<float> v, Full128<float> /* tag */, 1794 float* HWY_RESTRICT aligned) { 1795 _mm_store_ps(aligned, v.raw); 1796 } 1797 HWY_API void Store(const Vec128<double> v, Full128<double> /* tag */, 1798 double* HWY_RESTRICT aligned) { 1799 _mm_store_pd(aligned, v.raw); 1800 } 1801 1802 template <typename T> 1803 HWY_API void StoreU(Vec128<T> v, Full128<T> /* tag */, T* HWY_RESTRICT p) { 1804 _mm_storeu_si128(reinterpret_cast<__m128i*>(p), v.raw); 1805 } 1806 HWY_API void StoreU(const Vec128<float> v, Full128<float> /* tag */, 1807 float* HWY_RESTRICT p) { 1808 _mm_storeu_ps(p, v.raw); 1809 } 1810 HWY_API void StoreU(const Vec128<double> v, Full128<double> /* tag */, 1811 double* HWY_RESTRICT p) { 1812 _mm_storeu_pd(p, v.raw); 1813 } 1814 1815 template <typename T> 1816 HWY_API void Store(Vec128<T, 8 / sizeof(T)> v, Simd<T, 8 / sizeof(T)> /* tag */, 1817 T* HWY_RESTRICT p) { 1818 #if HWY_SAFE_PARTIAL_LOAD_STORE 1819 CopyBytes<8>(&v, p); 1820 #else 1821 _mm_storel_epi64(reinterpret_cast<__m128i*>(p), v.raw); 1822 #endif 1823 } 1824 HWY_API void Store(const Vec128<float, 2> v, Simd<float, 2> /* tag */, 1825 float* HWY_RESTRICT p) { 1826 #if HWY_SAFE_PARTIAL_LOAD_STORE 1827 CopyBytes<8>(&v, p); 1828 #else 1829 _mm_storel_pi(reinterpret_cast<__m64*>(p), v.raw); 1830 #endif 1831 } 1832 HWY_API void Store(const Vec128<double, 1> v, Simd<double, 1> /* tag */, 1833 double* HWY_RESTRICT p) { 1834 #if HWY_SAFE_PARTIAL_LOAD_STORE 1835 CopyBytes<8>(&v, p); 1836 #else 1837 _mm_storel_pd(p, v.raw); 1838 #endif 1839 } 1840 1841 // Any <= 32 bit except <float, 1> 1842 template <typename T, size_t N, HWY_IF_LE32(T, N)> 1843 HWY_API void Store(Vec128<T, N> v, Simd<T, N> /* tag */, T* HWY_RESTRICT p) { 1844 CopyBytes<sizeof(T) * N>(&v, p); 1845 } 1846 HWY_API void Store(const Vec128<float, 1> v, Simd<float, 1> /* tag */, 1847 float* HWY_RESTRICT p) { 1848 #if HWY_SAFE_PARTIAL_LOAD_STORE 1849 CopyBytes<4>(&v, p); 1850 #else 1851 _mm_store_ss(p, v.raw); 1852 #endif 1853 } 1854 1855 // For < 128 bit, StoreU == Store. 
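// Example (illustrative): storing a two-lane float vector writes exactly
// 8 bytes and does not require alignment:
//   const Simd<float, 2> d2;
//   float out[2];
//   StoreU(Set(d2, 1.0f), d2, out);  // out == {1.0f, 1.0f}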
1856 template <typename T, size_t N, HWY_IF_LE64(T, N)> 1857 HWY_API void StoreU(const Vec128<T, N> v, Simd<T, N> d, T* HWY_RESTRICT p) { 1858 Store(v, d, p); 1859 } 1860 1861 // ================================================== ARITHMETIC 1862 1863 // ------------------------------ Addition 1864 1865 // Unsigned 1866 template <size_t N> 1867 HWY_API Vec128<uint8_t, N> operator+(const Vec128<uint8_t, N> a, 1868 const Vec128<uint8_t, N> b) { 1869 return Vec128<uint8_t, N>{_mm_add_epi8(a.raw, b.raw)}; 1870 } 1871 template <size_t N> 1872 HWY_API Vec128<uint16_t, N> operator+(const Vec128<uint16_t, N> a, 1873 const Vec128<uint16_t, N> b) { 1874 return Vec128<uint16_t, N>{_mm_add_epi16(a.raw, b.raw)}; 1875 } 1876 template <size_t N> 1877 HWY_API Vec128<uint32_t, N> operator+(const Vec128<uint32_t, N> a, 1878 const Vec128<uint32_t, N> b) { 1879 return Vec128<uint32_t, N>{_mm_add_epi32(a.raw, b.raw)}; 1880 } 1881 template <size_t N> 1882 HWY_API Vec128<uint64_t, N> operator+(const Vec128<uint64_t, N> a, 1883 const Vec128<uint64_t, N> b) { 1884 return Vec128<uint64_t, N>{_mm_add_epi64(a.raw, b.raw)}; 1885 } 1886 1887 // Signed 1888 template <size_t N> 1889 HWY_API Vec128<int8_t, N> operator+(const Vec128<int8_t, N> a, 1890 const Vec128<int8_t, N> b) { 1891 return Vec128<int8_t, N>{_mm_add_epi8(a.raw, b.raw)}; 1892 } 1893 template <size_t N> 1894 HWY_API Vec128<int16_t, N> operator+(const Vec128<int16_t, N> a, 1895 const Vec128<int16_t, N> b) { 1896 return Vec128<int16_t, N>{_mm_add_epi16(a.raw, b.raw)}; 1897 } 1898 template <size_t N> 1899 HWY_API Vec128<int32_t, N> operator+(const Vec128<int32_t, N> a, 1900 const Vec128<int32_t, N> b) { 1901 return Vec128<int32_t, N>{_mm_add_epi32(a.raw, b.raw)}; 1902 } 1903 template <size_t N> 1904 HWY_API Vec128<int64_t, N> operator+(const Vec128<int64_t, N> a, 1905 const Vec128<int64_t, N> b) { 1906 return Vec128<int64_t, N>{_mm_add_epi64(a.raw, b.raw)}; 1907 } 1908 1909 // Float 1910 template <size_t N> 1911 HWY_API Vec128<float, N> operator+(const Vec128<float, N> a, 1912 const Vec128<float, N> b) { 1913 return Vec128<float, N>{_mm_add_ps(a.raw, b.raw)}; 1914 } 1915 template <size_t N> 1916 HWY_API Vec128<double, N> operator+(const Vec128<double, N> a, 1917 const Vec128<double, N> b) { 1918 return Vec128<double, N>{_mm_add_pd(a.raw, b.raw)}; 1919 } 1920 1921 // ------------------------------ Subtraction 1922 1923 // Unsigned 1924 template <size_t N> 1925 HWY_API Vec128<uint8_t, N> operator-(const Vec128<uint8_t, N> a, 1926 const Vec128<uint8_t, N> b) { 1927 return Vec128<uint8_t, N>{_mm_sub_epi8(a.raw, b.raw)}; 1928 } 1929 template <size_t N> 1930 HWY_API Vec128<uint16_t, N> operator-(Vec128<uint16_t, N> a, 1931 Vec128<uint16_t, N> b) { 1932 return Vec128<uint16_t, N>{_mm_sub_epi16(a.raw, b.raw)}; 1933 } 1934 template <size_t N> 1935 HWY_API Vec128<uint32_t, N> operator-(const Vec128<uint32_t, N> a, 1936 const Vec128<uint32_t, N> b) { 1937 return Vec128<uint32_t, N>{_mm_sub_epi32(a.raw, b.raw)}; 1938 } 1939 template <size_t N> 1940 HWY_API Vec128<uint64_t, N> operator-(const Vec128<uint64_t, N> a, 1941 const Vec128<uint64_t, N> b) { 1942 return Vec128<uint64_t, N>{_mm_sub_epi64(a.raw, b.raw)}; 1943 } 1944 1945 // Signed 1946 template <size_t N> 1947 HWY_API Vec128<int8_t, N> operator-(const Vec128<int8_t, N> a, 1948 const Vec128<int8_t, N> b) { 1949 return Vec128<int8_t, N>{_mm_sub_epi8(a.raw, b.raw)}; 1950 } 1951 template <size_t N> 1952 HWY_API Vec128<int16_t, N> operator-(const Vec128<int16_t, N> a, 1953 const Vec128<int16_t, N> b) { 1954 return 
Vec128<int16_t, N>{_mm_sub_epi16(a.raw, b.raw)}; 1955 } 1956 template <size_t N> 1957 HWY_API Vec128<int32_t, N> operator-(const Vec128<int32_t, N> a, 1958 const Vec128<int32_t, N> b) { 1959 return Vec128<int32_t, N>{_mm_sub_epi32(a.raw, b.raw)}; 1960 } 1961 template <size_t N> 1962 HWY_API Vec128<int64_t, N> operator-(const Vec128<int64_t, N> a, 1963 const Vec128<int64_t, N> b) { 1964 return Vec128<int64_t, N>{_mm_sub_epi64(a.raw, b.raw)}; 1965 } 1966 1967 // Float 1968 template <size_t N> 1969 HWY_API Vec128<float, N> operator-(const Vec128<float, N> a, 1970 const Vec128<float, N> b) { 1971 return Vec128<float, N>{_mm_sub_ps(a.raw, b.raw)}; 1972 } 1973 template <size_t N> 1974 HWY_API Vec128<double, N> operator-(const Vec128<double, N> a, 1975 const Vec128<double, N> b) { 1976 return Vec128<double, N>{_mm_sub_pd(a.raw, b.raw)}; 1977 } 1978 1979 // ------------------------------ Saturating addition 1980 1981 // Returns a + b clamped to the destination range. 1982 1983 // Unsigned 1984 template <size_t N> 1985 HWY_API Vec128<uint8_t, N> SaturatedAdd(const Vec128<uint8_t, N> a, 1986 const Vec128<uint8_t, N> b) { 1987 return Vec128<uint8_t, N>{_mm_adds_epu8(a.raw, b.raw)}; 1988 } 1989 template <size_t N> 1990 HWY_API Vec128<uint16_t, N> SaturatedAdd(const Vec128<uint16_t, N> a, 1991 const Vec128<uint16_t, N> b) { 1992 return Vec128<uint16_t, N>{_mm_adds_epu16(a.raw, b.raw)}; 1993 } 1994 1995 // Signed 1996 template <size_t N> 1997 HWY_API Vec128<int8_t, N> SaturatedAdd(const Vec128<int8_t, N> a, 1998 const Vec128<int8_t, N> b) { 1999 return Vec128<int8_t, N>{_mm_adds_epi8(a.raw, b.raw)}; 2000 } 2001 template <size_t N> 2002 HWY_API Vec128<int16_t, N> SaturatedAdd(const Vec128<int16_t, N> a, 2003 const Vec128<int16_t, N> b) { 2004 return Vec128<int16_t, N>{_mm_adds_epi16(a.raw, b.raw)}; 2005 } 2006 2007 // ------------------------------ Saturating subtraction 2008 2009 // Returns a - b clamped to the destination range. 
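// Example (illustrative): for uint8_t lanes, 10 - 20 saturates to 0 instead
// of wrapping around to 246:
//   const Full128<uint8_t> d;
//   const auto r = SaturatedSub(Set(d, uint8_t{10}), Set(d, uint8_t{20}));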
2010
2011 // Unsigned
2012 template <size_t N>
2013 HWY_API Vec128<uint8_t, N> SaturatedSub(const Vec128<uint8_t, N> a,
2014                                         const Vec128<uint8_t, N> b) {
2015   return Vec128<uint8_t, N>{_mm_subs_epu8(a.raw, b.raw)};
2016 }
2017 template <size_t N>
2018 HWY_API Vec128<uint16_t, N> SaturatedSub(const Vec128<uint16_t, N> a,
2019                                          const Vec128<uint16_t, N> b) {
2020   return Vec128<uint16_t, N>{_mm_subs_epu16(a.raw, b.raw)};
2021 }
2022
2023 // Signed
2024 template <size_t N>
2025 HWY_API Vec128<int8_t, N> SaturatedSub(const Vec128<int8_t, N> a,
2026                                        const Vec128<int8_t, N> b) {
2027   return Vec128<int8_t, N>{_mm_subs_epi8(a.raw, b.raw)};
2028 }
2029 template <size_t N>
2030 HWY_API Vec128<int16_t, N> SaturatedSub(const Vec128<int16_t, N> a,
2031                                         const Vec128<int16_t, N> b) {
2032   return Vec128<int16_t, N>{_mm_subs_epi16(a.raw, b.raw)};
2033 }
2034
2035 // ------------------------------ AverageRound
2036
2037 // Returns (a + b + 1) / 2
2038
2039 // Unsigned
2040 template <size_t N>
2041 HWY_API Vec128<uint8_t, N> AverageRound(const Vec128<uint8_t, N> a,
2042                                         const Vec128<uint8_t, N> b) {
2043   return Vec128<uint8_t, N>{_mm_avg_epu8(a.raw, b.raw)};
2044 }
2045 template <size_t N>
2046 HWY_API Vec128<uint16_t, N> AverageRound(const Vec128<uint16_t, N> a,
2047                                          const Vec128<uint16_t, N> b) {
2048   return Vec128<uint16_t, N>{_mm_avg_epu16(a.raw, b.raw)};
2049 }
2050
2051 // ------------------------------ Integer multiplication
2052
2053 template <size_t N>
2054 HWY_API Vec128<uint16_t, N> operator*(const Vec128<uint16_t, N> a,
2055                                       const Vec128<uint16_t, N> b) {
2056   return Vec128<uint16_t, N>{_mm_mullo_epi16(a.raw, b.raw)};
2057 }
2058 template <size_t N>
2059 HWY_API Vec128<int16_t, N> operator*(const Vec128<int16_t, N> a,
2060                                      const Vec128<int16_t, N> b) {
2061   return Vec128<int16_t, N>{_mm_mullo_epi16(a.raw, b.raw)};
2062 }
2063
2064 // Returns the upper 16 bits of a * b in each lane.
2065 template <size_t N>
2066 HWY_API Vec128<uint16_t, N> MulHigh(const Vec128<uint16_t, N> a,
2067                                     const Vec128<uint16_t, N> b) {
2068   return Vec128<uint16_t, N>{_mm_mulhi_epu16(a.raw, b.raw)};
2069 }
2070 template <size_t N>
2071 HWY_API Vec128<int16_t, N> MulHigh(const Vec128<int16_t, N> a,
2072                                    const Vec128<int16_t, N> b) {
2073   return Vec128<int16_t, N>{_mm_mulhi_epi16(a.raw, b.raw)};
2074 }
2075
2076 // Multiplies even lanes (0, 2, ..); places the lower half of each
2077 // double-wide result in the even lane and the upper half in its odd neighbor lane.
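// Example (illustrative): for uint32_t lanes a = {2, 9, 3, 9} and
// b = {4, 9, 5, 9}, MulEven(a, b) returns the two uint64_t lanes {8, 15}.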
2078 template <size_t N> 2079 HWY_API Vec128<uint64_t, (N + 1) / 2> MulEven(const Vec128<uint32_t, N> a, 2080 const Vec128<uint32_t, N> b) { 2081 return Vec128<uint64_t, (N + 1) / 2>{_mm_mul_epu32(a.raw, b.raw)}; 2082 } 2083 2084 #if HWY_TARGET == HWY_SSSE3 2085 2086 template <size_t N, HWY_IF_LE64(int32_t, N)> // N=1 or 2 2087 HWY_API Vec128<int64_t, (N + 1) / 2> MulEven(const Vec128<int32_t, N> a, 2088 const Vec128<int32_t, N> b) { 2089 return Set(Simd<int64_t, (N + 1) / 2>(), int64_t(GetLane(a)) * GetLane(b)); 2090 } 2091 HWY_API Vec128<int64_t> MulEven(const Vec128<int32_t> a, 2092 const Vec128<int32_t> b) { 2093 alignas(16) int32_t a_lanes[4]; 2094 alignas(16) int32_t b_lanes[4]; 2095 const Full128<int32_t> di32; 2096 Store(a, di32, a_lanes); 2097 Store(b, di32, b_lanes); 2098 alignas(16) int64_t mul[2]; 2099 mul[0] = int64_t(a_lanes[0]) * b_lanes[0]; 2100 mul[1] = int64_t(a_lanes[2]) * b_lanes[2]; 2101 return Load(Full128<int64_t>(), mul); 2102 } 2103 2104 #else // HWY_TARGET == HWY_SSSE3 2105 2106 template <size_t N> 2107 HWY_API Vec128<int64_t, (N + 1) / 2> MulEven(const Vec128<int32_t, N> a, 2108 const Vec128<int32_t, N> b) { 2109 return Vec128<int64_t, (N + 1) / 2>{_mm_mul_epi32(a.raw, b.raw)}; 2110 } 2111 2112 #endif // HWY_TARGET == HWY_SSSE3 2113 2114 template <size_t N> 2115 HWY_API Vec128<uint32_t, N> operator*(const Vec128<uint32_t, N> a, 2116 const Vec128<uint32_t, N> b) { 2117 #if HWY_TARGET == HWY_SSSE3 2118 // Not as inefficient as it looks: _mm_mullo_epi32 has 10 cycle latency. 2119 // 64-bit right shift would also work but also needs port 5, so no benefit. 2120 // Notation: x=don't care, z=0. 2121 const __m128i a_x3x1 = _mm_shuffle_epi32(a.raw, _MM_SHUFFLE(3, 3, 1, 1)); 2122 const auto mullo_x2x0 = MulEven(a, b); 2123 const __m128i b_x3x1 = _mm_shuffle_epi32(b.raw, _MM_SHUFFLE(3, 3, 1, 1)); 2124 const auto mullo_x3x1 = 2125 MulEven(Vec128<uint32_t, N>{a_x3x1}, Vec128<uint32_t, N>{b_x3x1}); 2126 // We could _mm_slli_epi64 by 32 to get 3z1z and OR with z2z0, but generating 2127 // the latter requires one more instruction or a constant. 2128 const __m128i mul_20 = 2129 _mm_shuffle_epi32(mullo_x2x0.raw, _MM_SHUFFLE(2, 0, 2, 0)); 2130 const __m128i mul_31 = 2131 _mm_shuffle_epi32(mullo_x3x1.raw, _MM_SHUFFLE(2, 0, 2, 0)); 2132 return Vec128<uint32_t, N>{_mm_unpacklo_epi32(mul_20, mul_31)}; 2133 #else 2134 return Vec128<uint32_t, N>{_mm_mullo_epi32(a.raw, b.raw)}; 2135 #endif 2136 } 2137 2138 template <size_t N> 2139 HWY_API Vec128<int32_t, N> operator*(const Vec128<int32_t, N> a, 2140 const Vec128<int32_t, N> b) { 2141 // Same as unsigned; avoid duplicating the SSSE3 code. 
2142 const Simd<uint32_t, N> du; 2143 return BitCast(Simd<int32_t, N>(), BitCast(du, a) * BitCast(du, b)); 2144 } 2145 2146 // ------------------------------ ShiftLeft 2147 2148 template <int kBits, size_t N> 2149 HWY_API Vec128<uint16_t, N> ShiftLeft(const Vec128<uint16_t, N> v) { 2150 return Vec128<uint16_t, N>{_mm_slli_epi16(v.raw, kBits)}; 2151 } 2152 2153 template <int kBits, size_t N> 2154 HWY_API Vec128<uint32_t, N> ShiftLeft(const Vec128<uint32_t, N> v) { 2155 return Vec128<uint32_t, N>{_mm_slli_epi32(v.raw, kBits)}; 2156 } 2157 2158 template <int kBits, size_t N> 2159 HWY_API Vec128<uint64_t, N> ShiftLeft(const Vec128<uint64_t, N> v) { 2160 return Vec128<uint64_t, N>{_mm_slli_epi64(v.raw, kBits)}; 2161 } 2162 2163 template <int kBits, size_t N> 2164 HWY_API Vec128<int16_t, N> ShiftLeft(const Vec128<int16_t, N> v) { 2165 return Vec128<int16_t, N>{_mm_slli_epi16(v.raw, kBits)}; 2166 } 2167 template <int kBits, size_t N> 2168 HWY_API Vec128<int32_t, N> ShiftLeft(const Vec128<int32_t, N> v) { 2169 return Vec128<int32_t, N>{_mm_slli_epi32(v.raw, kBits)}; 2170 } 2171 template <int kBits, size_t N> 2172 HWY_API Vec128<int64_t, N> ShiftLeft(const Vec128<int64_t, N> v) { 2173 return Vec128<int64_t, N>{_mm_slli_epi64(v.raw, kBits)}; 2174 } 2175 2176 template <int kBits, typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)> 2177 HWY_API Vec128<T, N> ShiftLeft(const Vec128<T, N> v) { 2178 const Simd<T, N> d8; 2179 // Use raw instead of BitCast to support N=1. 2180 const Vec128<T, N> shifted{ShiftLeft<kBits>(Vec128<MakeWide<T>>{v.raw}).raw}; 2181 return kBits == 1 2182 ? (v + v) 2183 : (shifted & Set(d8, static_cast<T>((0xFF << kBits) & 0xFF))); 2184 } 2185 2186 // ------------------------------ ShiftRight 2187 2188 template <int kBits, size_t N> 2189 HWY_API Vec128<uint16_t, N> ShiftRight(const Vec128<uint16_t, N> v) { 2190 return Vec128<uint16_t, N>{_mm_srli_epi16(v.raw, kBits)}; 2191 } 2192 template <int kBits, size_t N> 2193 HWY_API Vec128<uint32_t, N> ShiftRight(const Vec128<uint32_t, N> v) { 2194 return Vec128<uint32_t, N>{_mm_srli_epi32(v.raw, kBits)}; 2195 } 2196 template <int kBits, size_t N> 2197 HWY_API Vec128<uint64_t, N> ShiftRight(const Vec128<uint64_t, N> v) { 2198 return Vec128<uint64_t, N>{_mm_srli_epi64(v.raw, kBits)}; 2199 } 2200 2201 template <int kBits, size_t N> 2202 HWY_API Vec128<uint8_t, N> ShiftRight(const Vec128<uint8_t, N> v) { 2203 const Simd<uint8_t, N> d8; 2204 // Use raw instead of BitCast to support N=1. 2205 const Vec128<uint8_t, N> shifted{ 2206 ShiftRight<kBits>(Vec128<uint16_t>{v.raw}).raw}; 2207 return shifted & Set(d8, 0xFF >> kBits); 2208 } 2209 2210 template <int kBits, size_t N> 2211 HWY_API Vec128<int16_t, N> ShiftRight(const Vec128<int16_t, N> v) { 2212 return Vec128<int16_t, N>{_mm_srai_epi16(v.raw, kBits)}; 2213 } 2214 template <int kBits, size_t N> 2215 HWY_API Vec128<int32_t, N> ShiftRight(const Vec128<int32_t, N> v) { 2216 return Vec128<int32_t, N>{_mm_srai_epi32(v.raw, kBits)}; 2217 } 2218 2219 template <int kBits, size_t N> 2220 HWY_API Vec128<int8_t, N> ShiftRight(const Vec128<int8_t, N> v) { 2221 const Simd<int8_t, N> di; 2222 const Simd<uint8_t, N> du; 2223 const auto shifted = BitCast(di, ShiftRight<kBits>(BitCast(du, v))); 2224 const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits)); 2225 return (shifted ^ shifted_sign) - shifted_sign; 2226 } 2227 2228 // i64 is implemented after BroadcastSignBit. 
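// Example (illustrative): the emulation above preserves the sign bit, so
// arithmetic right shifts of int8_t lanes behave as expected:
//   const Full128<int8_t> d;
//   const auto r = ShiftRight<2>(Set(d, int8_t{-128}));  // all lanes == -32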
2229 2230 // ------------------------------ RotateRight (ShiftRight, Or) 2231 2232 template <int kBits, size_t N> 2233 HWY_API Vec128<uint32_t, N> RotateRight(const Vec128<uint32_t, N> v) { 2234 static_assert(0 <= kBits && kBits < 32, "Invalid shift count"); 2235 #if HWY_TARGET <= HWY_AVX3 2236 return Vec128<uint32_t, N>{_mm_ror_epi32(v.raw, kBits)}; 2237 #else 2238 if (kBits == 0) return v; 2239 return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(31, 32 - kBits)>(v)); 2240 #endif 2241 } 2242 2243 template <int kBits, size_t N> 2244 HWY_API Vec128<uint64_t, N> RotateRight(const Vec128<uint64_t, N> v) { 2245 static_assert(0 <= kBits && kBits < 64, "Invalid shift count"); 2246 #if HWY_TARGET <= HWY_AVX3 2247 return Vec128<uint64_t, N>{_mm_ror_epi64(v.raw, kBits)}; 2248 #else 2249 if (kBits == 0) return v; 2250 return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(63, 64 - kBits)>(v)); 2251 #endif 2252 } 2253 2254 // ------------------------------ BroadcastSignBit (ShiftRight, compare, mask) 2255 2256 template <size_t N> 2257 HWY_API Vec128<int8_t, N> BroadcastSignBit(const Vec128<int8_t, N> v) { 2258 return VecFromMask(v < Zero(Simd<int8_t, N>())); 2259 } 2260 2261 template <size_t N> 2262 HWY_API Vec128<int16_t, N> BroadcastSignBit(const Vec128<int16_t, N> v) { 2263 return ShiftRight<15>(v); 2264 } 2265 2266 template <size_t N> 2267 HWY_API Vec128<int32_t, N> BroadcastSignBit(const Vec128<int32_t, N> v) { 2268 return ShiftRight<31>(v); 2269 } 2270 2271 template <size_t N> 2272 HWY_API Vec128<int64_t, N> BroadcastSignBit(const Vec128<int64_t, N> v) { 2273 #if HWY_TARGET <= HWY_AVX3 2274 return Vec128<int64_t, N>{_mm_srai_epi64(v.raw, 63)}; 2275 #elif HWY_TARGET == HWY_AVX2 || HWY_TARGET == HWY_SSE4 2276 return VecFromMask(v < Zero(Simd<int64_t, N>())); 2277 #else 2278 // Efficient Lt() requires SSE4.2 and BLENDVPD requires SSE4.1. 32-bit shift 2279 // avoids generating a zero. 
2280 const Simd<int32_t, N * 2> d32; 2281 const auto sign = ShiftRight<31>(BitCast(d32, v)); 2282 return Vec128<int64_t, N>{ 2283 _mm_shuffle_epi32(sign.raw, _MM_SHUFFLE(3, 3, 1, 1))}; 2284 #endif 2285 } 2286 2287 template <size_t N> 2288 HWY_API Vec128<int64_t, N> Abs(const Vec128<int64_t, N> v) { 2289 #if HWY_TARGET <= HWY_AVX3 2290 return Vec128<int64_t, N>{_mm_abs_epi64(v.raw)}; 2291 #else 2292 const auto zero = Zero(Simd<int64_t, N>()); 2293 return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v); 2294 #endif 2295 } 2296 2297 template <int kBits, size_t N> 2298 HWY_API Vec128<int64_t, N> ShiftRight(const Vec128<int64_t, N> v) { 2299 #if HWY_TARGET <= HWY_AVX3 2300 return Vec128<int64_t, N>{_mm_srai_epi64(v.raw, kBits)}; 2301 #else 2302 const Simd<int64_t, N> di; 2303 const Simd<uint64_t, N> du; 2304 const auto right = BitCast(di, ShiftRight<kBits>(BitCast(du, v))); 2305 const auto sign = ShiftLeft<64 - kBits>(BroadcastSignBit(v)); 2306 return right | sign; 2307 #endif 2308 } 2309 2310 // ------------------------------ ZeroIfNegative (BroadcastSignBit) 2311 template <typename T, size_t N, HWY_IF_FLOAT(T)> 2312 HWY_API Vec128<T, N> ZeroIfNegative(Vec128<T, N> v) { 2313 const Simd<T, N> d; 2314 #if HWY_TARGET == HWY_SSSE3 2315 const RebindToSigned<decltype(d)> di; 2316 const auto mask = MaskFromVec(BitCast(d, BroadcastSignBit(BitCast(di, v)))); 2317 #else 2318 const auto mask = MaskFromVec(v); // MSB is sufficient for BLENDVPS 2319 #endif 2320 return IfThenElse(mask, Zero(d), v); 2321 } 2322 2323 // ------------------------------ ShiftLeftSame 2324 2325 template <size_t N> 2326 HWY_API Vec128<uint16_t, N> ShiftLeftSame(const Vec128<uint16_t, N> v, 2327 const int bits) { 2328 return Vec128<uint16_t, N>{_mm_sll_epi16(v.raw, _mm_cvtsi32_si128(bits))}; 2329 } 2330 template <size_t N> 2331 HWY_API Vec128<uint32_t, N> ShiftLeftSame(const Vec128<uint32_t, N> v, 2332 const int bits) { 2333 return Vec128<uint32_t, N>{_mm_sll_epi32(v.raw, _mm_cvtsi32_si128(bits))}; 2334 } 2335 template <size_t N> 2336 HWY_API Vec128<uint64_t, N> ShiftLeftSame(const Vec128<uint64_t, N> v, 2337 const int bits) { 2338 return Vec128<uint64_t, N>{_mm_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))}; 2339 } 2340 2341 template <size_t N> 2342 HWY_API Vec128<int16_t, N> ShiftLeftSame(const Vec128<int16_t, N> v, 2343 const int bits) { 2344 return Vec128<int16_t, N>{_mm_sll_epi16(v.raw, _mm_cvtsi32_si128(bits))}; 2345 } 2346 2347 template <size_t N> 2348 HWY_API Vec128<int32_t, N> ShiftLeftSame(const Vec128<int32_t, N> v, 2349 const int bits) { 2350 return Vec128<int32_t, N>{_mm_sll_epi32(v.raw, _mm_cvtsi32_si128(bits))}; 2351 } 2352 2353 template <size_t N> 2354 HWY_API Vec128<int64_t, N> ShiftLeftSame(const Vec128<int64_t, N> v, 2355 const int bits) { 2356 return Vec128<int64_t, N>{_mm_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))}; 2357 } 2358 2359 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)> 2360 HWY_API Vec128<T, N> ShiftLeftSame(const Vec128<T, N> v, const int bits) { 2361 const Simd<T, N> d8; 2362 // Use raw instead of BitCast to support N=1. 
2363 const Vec128<T, N> shifted{ 2364 ShiftLeftSame(Vec128<MakeWide<T>>{v.raw}, bits).raw}; 2365 return shifted & Set(d8, static_cast<T>((0xFF << bits) & 0xFF)); 2366 } 2367 2368 // ------------------------------ ShiftRightSame (BroadcastSignBit) 2369 2370 template <size_t N> 2371 HWY_API Vec128<uint16_t, N> ShiftRightSame(const Vec128<uint16_t, N> v, 2372 const int bits) { 2373 return Vec128<uint16_t, N>{_mm_srl_epi16(v.raw, _mm_cvtsi32_si128(bits))}; 2374 } 2375 template <size_t N> 2376 HWY_API Vec128<uint32_t, N> ShiftRightSame(const Vec128<uint32_t, N> v, 2377 const int bits) { 2378 return Vec128<uint32_t, N>{_mm_srl_epi32(v.raw, _mm_cvtsi32_si128(bits))}; 2379 } 2380 template <size_t N> 2381 HWY_API Vec128<uint64_t, N> ShiftRightSame(const Vec128<uint64_t, N> v, 2382 const int bits) { 2383 return Vec128<uint64_t, N>{_mm_srl_epi64(v.raw, _mm_cvtsi32_si128(bits))}; 2384 } 2385 2386 template <size_t N> 2387 HWY_API Vec128<uint8_t, N> ShiftRightSame(Vec128<uint8_t, N> v, 2388 const int bits) { 2389 const Simd<uint8_t, N> d8; 2390 // Use raw instead of BitCast to support N=1. 2391 const Vec128<uint8_t, N> shifted{ 2392 ShiftRightSame(Vec128<uint16_t>{v.raw}, bits).raw}; 2393 return shifted & Set(d8, static_cast<uint8_t>(0xFF >> bits)); 2394 } 2395 2396 template <size_t N> 2397 HWY_API Vec128<int16_t, N> ShiftRightSame(const Vec128<int16_t, N> v, 2398 const int bits) { 2399 return Vec128<int16_t, N>{_mm_sra_epi16(v.raw, _mm_cvtsi32_si128(bits))}; 2400 } 2401 2402 template <size_t N> 2403 HWY_API Vec128<int32_t, N> ShiftRightSame(const Vec128<int32_t, N> v, 2404 const int bits) { 2405 return Vec128<int32_t, N>{_mm_sra_epi32(v.raw, _mm_cvtsi32_si128(bits))}; 2406 } 2407 template <size_t N> 2408 HWY_API Vec128<int64_t, N> ShiftRightSame(const Vec128<int64_t, N> v, 2409 const int bits) { 2410 #if HWY_TARGET <= HWY_AVX3 2411 return Vec128<int64_t, N>{_mm_sra_epi64(v.raw, _mm_cvtsi32_si128(bits))}; 2412 #else 2413 const Simd<int64_t, N> di; 2414 const Simd<uint64_t, N> du; 2415 const auto right = BitCast(di, ShiftRightSame(BitCast(du, v), bits)); 2416 const auto sign = ShiftLeftSame(BroadcastSignBit(v), 64 - bits); 2417 return right | sign; 2418 #endif 2419 } 2420 2421 template <size_t N> 2422 HWY_API Vec128<int8_t, N> ShiftRightSame(Vec128<int8_t, N> v, const int bits) { 2423 const Simd<int8_t, N> di; 2424 const Simd<uint8_t, N> du; 2425 const auto shifted = BitCast(di, ShiftRightSame(BitCast(du, v), bits)); 2426 const auto shifted_sign = 2427 BitCast(di, Set(du, static_cast<uint8_t>(0x80 >> bits))); 2428 return (shifted ^ shifted_sign) - shifted_sign; 2429 } 2430 2431 // ------------------------------ Floating-point mul / div 2432 2433 template <size_t N> 2434 HWY_API Vec128<float, N> operator*(Vec128<float, N> a, Vec128<float, N> b) { 2435 return Vec128<float, N>{_mm_mul_ps(a.raw, b.raw)}; 2436 } 2437 HWY_API Vec128<float, 1> operator*(const Vec128<float, 1> a, 2438 const Vec128<float, 1> b) { 2439 return Vec128<float, 1>{_mm_mul_ss(a.raw, b.raw)}; 2440 } 2441 template <size_t N> 2442 HWY_API Vec128<double, N> operator*(const Vec128<double, N> a, 2443 const Vec128<double, N> b) { 2444 return Vec128<double, N>{_mm_mul_pd(a.raw, b.raw)}; 2445 } 2446 HWY_API Vec128<double, 1> operator*(const Vec128<double, 1> a, 2447 const Vec128<double, 1> b) { 2448 return Vec128<double, 1>{_mm_mul_sd(a.raw, b.raw)}; 2449 } 2450 2451 template <size_t N> 2452 HWY_API Vec128<float, N> operator/(const Vec128<float, N> a, 2453 const Vec128<float, N> b) { 2454 return Vec128<float, N>{_mm_div_ps(a.raw, b.raw)}; 2455 } 
2456 HWY_API Vec128<float, 1> operator/(const Vec128<float, 1> a, 2457 const Vec128<float, 1> b) { 2458 return Vec128<float, 1>{_mm_div_ss(a.raw, b.raw)}; 2459 } 2460 template <size_t N> 2461 HWY_API Vec128<double, N> operator/(const Vec128<double, N> a, 2462 const Vec128<double, N> b) { 2463 return Vec128<double, N>{_mm_div_pd(a.raw, b.raw)}; 2464 } 2465 HWY_API Vec128<double, 1> operator/(const Vec128<double, 1> a, 2466 const Vec128<double, 1> b) { 2467 return Vec128<double, 1>{_mm_div_sd(a.raw, b.raw)}; 2468 } 2469 2470 // Approximate reciprocal 2471 template <size_t N> 2472 HWY_API Vec128<float, N> ApproximateReciprocal(const Vec128<float, N> v) { 2473 return Vec128<float, N>{_mm_rcp_ps(v.raw)}; 2474 } 2475 HWY_API Vec128<float, 1> ApproximateReciprocal(const Vec128<float, 1> v) { 2476 return Vec128<float, 1>{_mm_rcp_ss(v.raw)}; 2477 } 2478 2479 // Absolute value of difference. 2480 template <size_t N> 2481 HWY_API Vec128<float, N> AbsDiff(const Vec128<float, N> a, 2482 const Vec128<float, N> b) { 2483 return Abs(a - b); 2484 } 2485 2486 // ------------------------------ Floating-point multiply-add variants 2487 2488 // Returns mul * x + add 2489 template <size_t N> 2490 HWY_API Vec128<float, N> MulAdd(const Vec128<float, N> mul, 2491 const Vec128<float, N> x, 2492 const Vec128<float, N> add) { 2493 #if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4 2494 return mul * x + add; 2495 #else 2496 return Vec128<float, N>{_mm_fmadd_ps(mul.raw, x.raw, add.raw)}; 2497 #endif 2498 } 2499 template <size_t N> 2500 HWY_API Vec128<double, N> MulAdd(const Vec128<double, N> mul, 2501 const Vec128<double, N> x, 2502 const Vec128<double, N> add) { 2503 #if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4 2504 return mul * x + add; 2505 #else 2506 return Vec128<double, N>{_mm_fmadd_pd(mul.raw, x.raw, add.raw)}; 2507 #endif 2508 } 2509 2510 // Returns add - mul * x 2511 template <size_t N> 2512 HWY_API Vec128<float, N> NegMulAdd(const Vec128<float, N> mul, 2513 const Vec128<float, N> x, 2514 const Vec128<float, N> add) { 2515 #if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4 2516 return add - mul * x; 2517 #else 2518 return Vec128<float, N>{_mm_fnmadd_ps(mul.raw, x.raw, add.raw)}; 2519 #endif 2520 } 2521 template <size_t N> 2522 HWY_API Vec128<double, N> NegMulAdd(const Vec128<double, N> mul, 2523 const Vec128<double, N> x, 2524 const Vec128<double, N> add) { 2525 #if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4 2526 return add - mul * x; 2527 #else 2528 return Vec128<double, N>{_mm_fnmadd_pd(mul.raw, x.raw, add.raw)}; 2529 #endif 2530 } 2531 2532 // Returns mul * x - sub 2533 template <size_t N> 2534 HWY_API Vec128<float, N> MulSub(const Vec128<float, N> mul, 2535 const Vec128<float, N> x, 2536 const Vec128<float, N> sub) { 2537 #if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4 2538 return mul * x - sub; 2539 #else 2540 return Vec128<float, N>{_mm_fmsub_ps(mul.raw, x.raw, sub.raw)}; 2541 #endif 2542 } 2543 template <size_t N> 2544 HWY_API Vec128<double, N> MulSub(const Vec128<double, N> mul, 2545 const Vec128<double, N> x, 2546 const Vec128<double, N> sub) { 2547 #if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4 2548 return mul * x - sub; 2549 #else 2550 return Vec128<double, N>{_mm_fmsub_pd(mul.raw, x.raw, sub.raw)}; 2551 #endif 2552 } 2553 2554 // Returns -mul * x - sub 2555 template <size_t N> 2556 HWY_API Vec128<float, N> NegMulSub(const Vec128<float, N> mul, 2557 const Vec128<float, N> x, 2558 const Vec128<float, N> sub) { 2559 #if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == 
HWY_SSE4 2560 return Neg(mul) * x - sub; 2561 #else 2562 return Vec128<float, N>{_mm_fnmsub_ps(mul.raw, x.raw, sub.raw)}; 2563 #endif 2564 } 2565 template <size_t N> 2566 HWY_API Vec128<double, N> NegMulSub(const Vec128<double, N> mul, 2567 const Vec128<double, N> x, 2568 const Vec128<double, N> sub) { 2569 #if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4 2570 return Neg(mul) * x - sub; 2571 #else 2572 return Vec128<double, N>{_mm_fnmsub_pd(mul.raw, x.raw, sub.raw)}; 2573 #endif 2574 } 2575 2576 // ------------------------------ Floating-point square root 2577 2578 // Full precision square root 2579 template <size_t N> 2580 HWY_API Vec128<float, N> Sqrt(const Vec128<float, N> v) { 2581 return Vec128<float, N>{_mm_sqrt_ps(v.raw)}; 2582 } 2583 HWY_API Vec128<float, 1> Sqrt(const Vec128<float, 1> v) { 2584 return Vec128<float, 1>{_mm_sqrt_ss(v.raw)}; 2585 } 2586 template <size_t N> 2587 HWY_API Vec128<double, N> Sqrt(const Vec128<double, N> v) { 2588 return Vec128<double, N>{_mm_sqrt_pd(v.raw)}; 2589 } 2590 HWY_API Vec128<double, 1> Sqrt(const Vec128<double, 1> v) { 2591 return Vec128<double, 1>{_mm_sqrt_sd(_mm_setzero_pd(), v.raw)}; 2592 } 2593 2594 // Approximate reciprocal square root 2595 template <size_t N> 2596 HWY_API Vec128<float, N> ApproximateReciprocalSqrt(const Vec128<float, N> v) { 2597 return Vec128<float, N>{_mm_rsqrt_ps(v.raw)}; 2598 } 2599 HWY_API Vec128<float, 1> ApproximateReciprocalSqrt(const Vec128<float, 1> v) { 2600 return Vec128<float, 1>{_mm_rsqrt_ss(v.raw)}; 2601 } 2602 2603 // ------------------------------ Min (Gt, IfThenElse) 2604 2605 namespace detail { 2606 2607 template <typename T, size_t N> 2608 HWY_INLINE HWY_MAYBE_UNUSED Vec128<T, N> MinU(const Vec128<T, N> a, 2609 const Vec128<T, N> b) { 2610 const Simd<T, N> du; 2611 const RebindToSigned<decltype(du)> di; 2612 const auto msb = Set(du, static_cast<T>(T(1) << (sizeof(T) * 8 - 1))); 2613 const auto gt = RebindMask(du, BitCast(di, a ^ msb) > BitCast(di, b ^ msb)); 2614 return IfThenElse(gt, b, a); 2615 } 2616 2617 } // namespace detail 2618 2619 // Unsigned 2620 template <size_t N> 2621 HWY_API Vec128<uint8_t, N> Min(const Vec128<uint8_t, N> a, 2622 const Vec128<uint8_t, N> b) { 2623 return Vec128<uint8_t, N>{_mm_min_epu8(a.raw, b.raw)}; 2624 } 2625 template <size_t N> 2626 HWY_API Vec128<uint16_t, N> Min(const Vec128<uint16_t, N> a, 2627 const Vec128<uint16_t, N> b) { 2628 #if HWY_TARGET == HWY_SSSE3 2629 return detail::MinU(a, b); 2630 #else 2631 return Vec128<uint16_t, N>{_mm_min_epu16(a.raw, b.raw)}; 2632 #endif 2633 } 2634 template <size_t N> 2635 HWY_API Vec128<uint32_t, N> Min(const Vec128<uint32_t, N> a, 2636 const Vec128<uint32_t, N> b) { 2637 #if HWY_TARGET == HWY_SSSE3 2638 return detail::MinU(a, b); 2639 #else 2640 return Vec128<uint32_t, N>{_mm_min_epu32(a.raw, b.raw)}; 2641 #endif 2642 } 2643 template <size_t N> 2644 HWY_API Vec128<uint64_t, N> Min(const Vec128<uint64_t, N> a, 2645 const Vec128<uint64_t, N> b) { 2646 #if HWY_TARGET <= HWY_AVX3 2647 return Vec128<uint64_t, N>{_mm_min_epu64(a.raw, b.raw)}; 2648 #else 2649 return detail::MinU(a, b); 2650 #endif 2651 } 2652 2653 // Signed 2654 template <size_t N> 2655 HWY_API Vec128<int8_t, N> Min(const Vec128<int8_t, N> a, 2656 const Vec128<int8_t, N> b) { 2657 #if HWY_TARGET == HWY_SSSE3 2658 return IfThenElse(a < b, a, b); 2659 #else 2660 return Vec128<int8_t, N>{_mm_min_epi8(a.raw, b.raw)}; 2661 #endif 2662 } 2663 template <size_t N> 2664 HWY_API Vec128<int16_t, N> Min(const Vec128<int16_t, N> a, 2665 const Vec128<int16_t, N> b) { 2666 
return Vec128<int16_t, N>{_mm_min_epi16(a.raw, b.raw)}; 2667 } 2668 template <size_t N> 2669 HWY_API Vec128<int32_t, N> Min(const Vec128<int32_t, N> a, 2670 const Vec128<int32_t, N> b) { 2671 #if HWY_TARGET == HWY_SSSE3 2672 return IfThenElse(a < b, a, b); 2673 #else 2674 return Vec128<int32_t, N>{_mm_min_epi32(a.raw, b.raw)}; 2675 #endif 2676 } 2677 template <size_t N> 2678 HWY_API Vec128<int64_t, N> Min(const Vec128<int64_t, N> a, 2679 const Vec128<int64_t, N> b) { 2680 #if HWY_TARGET <= HWY_AVX3 2681 return Vec128<int64_t, N>{_mm_min_epi64(a.raw, b.raw)}; 2682 #else 2683 return IfThenElse(a < b, a, b); 2684 #endif 2685 } 2686 2687 // Float 2688 template <size_t N> 2689 HWY_API Vec128<float, N> Min(const Vec128<float, N> a, 2690 const Vec128<float, N> b) { 2691 return Vec128<float, N>{_mm_min_ps(a.raw, b.raw)}; 2692 } 2693 template <size_t N> 2694 HWY_API Vec128<double, N> Min(const Vec128<double, N> a, 2695 const Vec128<double, N> b) { 2696 return Vec128<double, N>{_mm_min_pd(a.raw, b.raw)}; 2697 } 2698 2699 // ------------------------------ Max (Gt, IfThenElse) 2700 2701 namespace detail { 2702 template <typename T, size_t N> 2703 HWY_INLINE HWY_MAYBE_UNUSED Vec128<T, N> MaxU(const Vec128<T, N> a, 2704 const Vec128<T, N> b) { 2705 const Simd<T, N> du; 2706 const RebindToSigned<decltype(du)> di; 2707 const auto msb = Set(du, static_cast<T>(T(1) << (sizeof(T) * 8 - 1))); 2708 const auto gt = RebindMask(du, BitCast(di, a ^ msb) > BitCast(di, b ^ msb)); 2709 return IfThenElse(gt, a, b); 2710 } 2711 2712 } // namespace detail 2713 2714 // Unsigned 2715 template <size_t N> 2716 HWY_API Vec128<uint8_t, N> Max(const Vec128<uint8_t, N> a, 2717 const Vec128<uint8_t, N> b) { 2718 return Vec128<uint8_t, N>{_mm_max_epu8(a.raw, b.raw)}; 2719 } 2720 template <size_t N> 2721 HWY_API Vec128<uint16_t, N> Max(const Vec128<uint16_t, N> a, 2722 const Vec128<uint16_t, N> b) { 2723 #if HWY_TARGET == HWY_SSSE3 2724 return detail::MaxU(a, b); 2725 #else 2726 return Vec128<uint16_t, N>{_mm_max_epu16(a.raw, b.raw)}; 2727 #endif 2728 } 2729 template <size_t N> 2730 HWY_API Vec128<uint32_t, N> Max(const Vec128<uint32_t, N> a, 2731 const Vec128<uint32_t, N> b) { 2732 #if HWY_TARGET == HWY_SSSE3 2733 return detail::MaxU(a, b); 2734 #else 2735 return Vec128<uint32_t, N>{_mm_max_epu32(a.raw, b.raw)}; 2736 #endif 2737 } 2738 template <size_t N> 2739 HWY_API Vec128<uint64_t, N> Max(const Vec128<uint64_t, N> a, 2740 const Vec128<uint64_t, N> b) { 2741 #if HWY_TARGET <= HWY_AVX3 2742 return Vec128<uint64_t, N>{_mm_max_epu64(a.raw, b.raw)}; 2743 #else 2744 return detail::MaxU(a, b); 2745 #endif 2746 } 2747 2748 // Signed 2749 template <size_t N> 2750 HWY_API Vec128<int8_t, N> Max(const Vec128<int8_t, N> a, 2751 const Vec128<int8_t, N> b) { 2752 #if HWY_TARGET == HWY_SSSE3 2753 return IfThenElse(a < b, b, a); 2754 #else 2755 return Vec128<int8_t, N>{_mm_max_epi8(a.raw, b.raw)}; 2756 #endif 2757 } 2758 template <size_t N> 2759 HWY_API Vec128<int16_t, N> Max(const Vec128<int16_t, N> a, 2760 const Vec128<int16_t, N> b) { 2761 return Vec128<int16_t, N>{_mm_max_epi16(a.raw, b.raw)}; 2762 } 2763 template <size_t N> 2764 HWY_API Vec128<int32_t, N> Max(const Vec128<int32_t, N> a, 2765 const Vec128<int32_t, N> b) { 2766 #if HWY_TARGET == HWY_SSSE3 2767 return IfThenElse(a < b, b, a); 2768 #else 2769 return Vec128<int32_t, N>{_mm_max_epi32(a.raw, b.raw)}; 2770 #endif 2771 } 2772 template <size_t N> 2773 HWY_API Vec128<int64_t, N> Max(const Vec128<int64_t, N> a, 2774 const Vec128<int64_t, N> b) { 2775 #if HWY_TARGET <= HWY_AVX3 2776 
return Vec128<int64_t, N>{_mm_max_epi64(a.raw, b.raw)}; 2777 #else 2778 return IfThenElse(a < b, b, a); 2779 #endif 2780 } 2781 2782 // Float 2783 template <size_t N> 2784 HWY_API Vec128<float, N> Max(const Vec128<float, N> a, 2785 const Vec128<float, N> b) { 2786 return Vec128<float, N>{_mm_max_ps(a.raw, b.raw)}; 2787 } 2788 template <size_t N> 2789 HWY_API Vec128<double, N> Max(const Vec128<double, N> a, 2790 const Vec128<double, N> b) { 2791 return Vec128<double, N>{_mm_max_pd(a.raw, b.raw)}; 2792 } 2793 2794 // ================================================== MEMORY (2) 2795 2796 // ------------------------------ Non-temporal stores 2797 2798 // On clang6, we see incorrect code generated for _mm_stream_pi, so 2799 // round even partial vectors up to 16 bytes. 2800 template <typename T, size_t N> 2801 HWY_API void Stream(Vec128<T, N> v, Simd<T, N> /* tag */, 2802 T* HWY_RESTRICT aligned) { 2803 _mm_stream_si128(reinterpret_cast<__m128i*>(aligned), v.raw); 2804 } 2805 template <size_t N> 2806 HWY_API void Stream(const Vec128<float, N> v, Simd<float, N> /* tag */, 2807 float* HWY_RESTRICT aligned) { 2808 _mm_stream_ps(aligned, v.raw); 2809 } 2810 template <size_t N> 2811 HWY_API void Stream(const Vec128<double, N> v, Simd<double, N> /* tag */, 2812 double* HWY_RESTRICT aligned) { 2813 _mm_stream_pd(aligned, v.raw); 2814 } 2815 2816 // ------------------------------ Scatter 2817 2818 // Work around warnings in the intrinsic definitions (passing -1 as a mask). 2819 HWY_DIAGNOSTICS(push) 2820 HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion") 2821 2822 // Unfortunately the GCC/Clang intrinsics do not accept int64_t*. 2823 using GatherIndex64 = long long int; // NOLINT(google-runtime-int) 2824 static_assert(sizeof(GatherIndex64) == 8, "Must be 64-bit type"); 2825 2826 #if HWY_TARGET <= HWY_AVX3 2827 namespace detail { 2828 2829 template <typename T, size_t N> 2830 HWY_INLINE void ScatterOffset(hwy::SizeTag<4> /* tag */, Vec128<T, N> v, 2831 Simd<T, N> /* tag */, T* HWY_RESTRICT base, 2832 const Vec128<int32_t, N> offset) { 2833 if (N == 4) { 2834 _mm_i32scatter_epi32(base, offset.raw, v.raw, 1); 2835 } else { 2836 const __mmask8 mask = (1u << N) - 1; 2837 _mm_mask_i32scatter_epi32(base, mask, offset.raw, v.raw, 1); 2838 } 2839 } 2840 template <typename T, size_t N> 2841 HWY_INLINE void ScatterIndex(hwy::SizeTag<4> /* tag */, Vec128<T, N> v, 2842 Simd<T, N> /* tag */, T* HWY_RESTRICT base, 2843 const Vec128<int32_t, N> index) { 2844 if (N == 4) { 2845 _mm_i32scatter_epi32(base, index.raw, v.raw, 4); 2846 } else { 2847 const __mmask8 mask = (1u << N) - 1; 2848 _mm_mask_i32scatter_epi32(base, mask, index.raw, v.raw, 4); 2849 } 2850 } 2851 2852 template <typename T, size_t N> 2853 HWY_INLINE void ScatterOffset(hwy::SizeTag<8> /* tag */, Vec128<T, N> v, 2854 Simd<T, N> /* tag */, T* HWY_RESTRICT base, 2855 const Vec128<int64_t, N> offset) { 2856 if (N == 2) { 2857 _mm_i64scatter_epi64(base, offset.raw, v.raw, 1); 2858 } else { 2859 const __mmask8 mask = (1u << N) - 1; 2860 _mm_mask_i64scatter_epi64(base, mask, offset.raw, v.raw, 1); 2861 } 2862 } 2863 template <typename T, size_t N> 2864 HWY_INLINE void ScatterIndex(hwy::SizeTag<8> /* tag */, Vec128<T, N> v, 2865 Simd<T, N> /* tag */, T* HWY_RESTRICT base, 2866 const Vec128<int64_t, N> index) { 2867 if (N == 2) { 2868 _mm_i64scatter_epi64(base, index.raw, v.raw, 8); 2869 } else { 2870 const __mmask8 mask = (1u << N) - 1; 2871 _mm_mask_i64scatter_epi64(base, mask, index.raw, v.raw, 8); 2872 } 2873 } 2874 2875 } // namespace 
detail 2876 2877 template <typename T, size_t N, typename Offset> 2878 HWY_API void ScatterOffset(Vec128<T, N> v, Simd<T, N> d, T* HWY_RESTRICT base, 2879 const Vec128<Offset, N> offset) { 2880 static_assert(sizeof(T) == sizeof(Offset), "Must match for portability"); 2881 return detail::ScatterOffset(hwy::SizeTag<sizeof(T)>(), v, d, base, offset); 2882 } 2883 template <typename T, size_t N, typename Index> 2884 HWY_API void ScatterIndex(Vec128<T, N> v, Simd<T, N> d, T* HWY_RESTRICT base, 2885 const Vec128<Index, N> index) { 2886 static_assert(sizeof(T) == sizeof(Index), "Must match for portability"); 2887 return detail::ScatterIndex(hwy::SizeTag<sizeof(T)>(), v, d, base, index); 2888 } 2889 2890 template <size_t N> 2891 HWY_API void ScatterOffset(Vec128<float, N> v, Simd<float, N> /* tag */, 2892 float* HWY_RESTRICT base, 2893 const Vec128<int32_t, N> offset) { 2894 if (N == 4) { 2895 _mm_i32scatter_ps(base, offset.raw, v.raw, 1); 2896 } else { 2897 const __mmask8 mask = (1u << N) - 1; 2898 _mm_mask_i32scatter_ps(base, mask, offset.raw, v.raw, 1); 2899 } 2900 } 2901 template <size_t N> 2902 HWY_API void ScatterIndex(Vec128<float, N> v, Simd<float, N> /* tag */, 2903 float* HWY_RESTRICT base, 2904 const Vec128<int32_t, N> index) { 2905 if (N == 4) { 2906 _mm_i32scatter_ps(base, index.raw, v.raw, 4); 2907 } else { 2908 const __mmask8 mask = (1u << N) - 1; 2909 _mm_mask_i32scatter_ps(base, mask, index.raw, v.raw, 4); 2910 } 2911 } 2912 2913 template <size_t N> 2914 HWY_API void ScatterOffset(Vec128<double, N> v, Simd<double, N> /* tag */, 2915 double* HWY_RESTRICT base, 2916 const Vec128<int64_t, N> offset) { 2917 if (N == 2) { 2918 _mm_i64scatter_pd(base, offset.raw, v.raw, 1); 2919 } else { 2920 const __mmask8 mask = (1u << N) - 1; 2921 _mm_mask_i64scatter_pd(base, mask, offset.raw, v.raw, 1); 2922 } 2923 } 2924 template <size_t N> 2925 HWY_API void ScatterIndex(Vec128<double, N> v, Simd<double, N> /* tag */, 2926 double* HWY_RESTRICT base, 2927 const Vec128<int64_t, N> index) { 2928 if (N == 2) { 2929 _mm_i64scatter_pd(base, index.raw, v.raw, 8); 2930 } else { 2931 const __mmask8 mask = (1u << N) - 1; 2932 _mm_mask_i64scatter_pd(base, mask, index.raw, v.raw, 8); 2933 } 2934 } 2935 #else // HWY_TARGET <= HWY_AVX3 2936 2937 template <typename T, size_t N, typename Offset, HWY_IF_LE128(T, N)> 2938 HWY_API void ScatterOffset(Vec128<T, N> v, Simd<T, N> d, T* HWY_RESTRICT base, 2939 const Vec128<Offset, N> offset) { 2940 static_assert(sizeof(T) == sizeof(Offset), "Must match for portability"); 2941 2942 alignas(16) T lanes[N]; 2943 Store(v, d, lanes); 2944 2945 alignas(16) Offset offset_lanes[N]; 2946 Store(offset, Simd<Offset, N>(), offset_lanes); 2947 2948 uint8_t* base_bytes = reinterpret_cast<uint8_t*>(base); 2949 for (size_t i = 0; i < N; ++i) { 2950 CopyBytes<sizeof(T)>(&lanes[i], base_bytes + offset_lanes[i]); 2951 } 2952 } 2953 2954 template <typename T, size_t N, typename Index, HWY_IF_LE128(T, N)> 2955 HWY_API void ScatterIndex(Vec128<T, N> v, Simd<T, N> d, T* HWY_RESTRICT base, 2956 const Vec128<Index, N> index) { 2957 static_assert(sizeof(T) == sizeof(Index), "Must match for portability"); 2958 2959 alignas(16) T lanes[N]; 2960 Store(v, d, lanes); 2961 2962 alignas(16) Index index_lanes[N]; 2963 Store(index, Simd<Index, N>(), index_lanes); 2964 2965 for (size_t i = 0; i < N; ++i) { 2966 base[index_lanes[i]] = lanes[i]; 2967 } 2968 } 2969 2970 #endif 2971 2972 // ------------------------------ Gather (Load/Store) 2973 2974 #if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4 2975 
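// Example (illustrative; "base" is a hypothetical array of >= 4 int32_t):
//   const Full128<int32_t> d;
//   alignas(16) const int32_t kRev[4] = {3, 2, 1, 0};
//   const auto r = GatherIndex(d, base, Load(d, kRev));
//   // r == {base[3], base[2], base[1], base[0]}; GatherOffset is analogous
//   // but expects byte offsets rather than lane indices.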
2976 template <typename T, size_t N, typename Offset> 2977 HWY_API Vec128<T, N> GatherOffset(const Simd<T, N> d, 2978 const T* HWY_RESTRICT base, 2979 const Vec128<Offset, N> offset) { 2980 static_assert(sizeof(T) == sizeof(Offset), "Must match for portability"); 2981 2982 alignas(16) Offset offset_lanes[N]; 2983 Store(offset, Simd<Offset, N>(), offset_lanes); 2984 2985 alignas(16) T lanes[N]; 2986 const uint8_t* base_bytes = reinterpret_cast<const uint8_t*>(base); 2987 for (size_t i = 0; i < N; ++i) { 2988 CopyBytes<sizeof(T)>(base_bytes + offset_lanes[i], &lanes[i]); 2989 } 2990 return Load(d, lanes); 2991 } 2992 2993 template <typename T, size_t N, typename Index> 2994 HWY_API Vec128<T, N> GatherIndex(const Simd<T, N> d, const T* HWY_RESTRICT base, 2995 const Vec128<Index, N> index) { 2996 static_assert(sizeof(T) == sizeof(Index), "Must match for portability"); 2997 2998 alignas(16) Index index_lanes[N]; 2999 Store(index, Simd<Index, N>(), index_lanes); 3000 3001 alignas(16) T lanes[N]; 3002 for (size_t i = 0; i < N; ++i) { 3003 lanes[i] = base[index_lanes[i]]; 3004 } 3005 return Load(d, lanes); 3006 } 3007 3008 #else 3009 3010 namespace detail { 3011 3012 template <typename T, size_t N> 3013 HWY_INLINE Vec128<T, N> GatherOffset(hwy::SizeTag<4> /* tag */, 3014 Simd<T, N> /* d */, 3015 const T* HWY_RESTRICT base, 3016 const Vec128<int32_t, N> offset) { 3017 return Vec128<T, N>{_mm_i32gather_epi32( 3018 reinterpret_cast<const int32_t*>(base), offset.raw, 1)}; 3019 } 3020 template <typename T, size_t N> 3021 HWY_INLINE Vec128<T, N> GatherIndex(hwy::SizeTag<4> /* tag */, 3022 Simd<T, N> /* d */, 3023 const T* HWY_RESTRICT base, 3024 const Vec128<int32_t, N> index) { 3025 return Vec128<T, N>{_mm_i32gather_epi32( 3026 reinterpret_cast<const int32_t*>(base), index.raw, 4)}; 3027 } 3028 3029 template <typename T, size_t N> 3030 HWY_INLINE Vec128<T, N> GatherOffset(hwy::SizeTag<8> /* tag */, 3031 Simd<T, N> /* d */, 3032 const T* HWY_RESTRICT base, 3033 const Vec128<int64_t, N> offset) { 3034 return Vec128<T, N>{_mm_i64gather_epi64( 3035 reinterpret_cast<const GatherIndex64*>(base), offset.raw, 1)}; 3036 } 3037 template <typename T, size_t N> 3038 HWY_INLINE Vec128<T, N> GatherIndex(hwy::SizeTag<8> /* tag */, 3039 Simd<T, N> /* d */, 3040 const T* HWY_RESTRICT base, 3041 const Vec128<int64_t, N> index) { 3042 return Vec128<T, N>{_mm_i64gather_epi64( 3043 reinterpret_cast<const GatherIndex64*>(base), index.raw, 8)}; 3044 } 3045 3046 } // namespace detail 3047 3048 template <typename T, size_t N, typename Offset> 3049 HWY_API Vec128<T, N> GatherOffset(Simd<T, N> d, const T* HWY_RESTRICT base, 3050 const Vec128<Offset, N> offset) { 3051 return detail::GatherOffset(hwy::SizeTag<sizeof(T)>(), d, base, offset); 3052 } 3053 template <typename T, size_t N, typename Index> 3054 HWY_API Vec128<T, N> GatherIndex(Simd<T, N> d, const T* HWY_RESTRICT base, 3055 const Vec128<Index, N> index) { 3056 return detail::GatherIndex(hwy::SizeTag<sizeof(T)>(), d, base, index); 3057 } 3058 3059 template <size_t N> 3060 HWY_API Vec128<float, N> GatherOffset(Simd<float, N> /* tag */, 3061 const float* HWY_RESTRICT base, 3062 const Vec128<int32_t, N> offset) { 3063 return Vec128<float, N>{_mm_i32gather_ps(base, offset.raw, 1)}; 3064 } 3065 template <size_t N> 3066 HWY_API Vec128<float, N> GatherIndex(Simd<float, N> /* tag */, 3067 const float* HWY_RESTRICT base, 3068 const Vec128<int32_t, N> index) { 3069 return Vec128<float, N>{_mm_i32gather_ps(base, index.raw, 4)}; 3070 } 3071 3072 template <size_t N> 3073 HWY_API 
Vec128<double, N> GatherOffset(Simd<double, N> /* tag */,
3074                                const double* HWY_RESTRICT base,
3075                                const Vec128<int64_t, N> offset) {
3076   return Vec128<double, N>{_mm_i64gather_pd(base, offset.raw, 1)};
3077 }
3078 template <size_t N>
3079 HWY_API Vec128<double, N> GatherIndex(Simd<double, N> /* tag */,
3080                                       const double* HWY_RESTRICT base,
3081                                       const Vec128<int64_t, N> index) {
3082   return Vec128<double, N>{_mm_i64gather_pd(base, index.raw, 8)};
3083 }
3084
3085 #endif  // HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
3086
3087 HWY_DIAGNOSTICS(pop)
3088
3089 // ================================================== SWIZZLE (2)
3090
3091 // ------------------------------ LowerHalf
3092
3093 // Returns the lower half of a vector.
3094 template <typename T, size_t N>
3095 HWY_API Vec128<T, N / 2> LowerHalf(Simd<T, N / 2> /* tag */, Vec128<T, N> v) {
3096   return Vec128<T, N / 2>{v.raw};
3097 }
3098
3099 template <typename T, size_t N>
3100 HWY_API Vec128<T, N / 2> LowerHalf(Vec128<T, N> v) {
3101   return LowerHalf(Simd<T, N / 2>(), v);
3102 }
3103
3104 // ------------------------------ ShiftLeftBytes
3105
3106 template <int kBytes, typename T, size_t N>
3107 HWY_API Vec128<T, N> ShiftLeftBytes(Simd<T, N> /* tag */, Vec128<T, N> v) {
3108   static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
3109   return Vec128<T, N>{_mm_slli_si128(v.raw, kBytes)};
3110 }
3111
3112 template <int kBytes, typename T, size_t N>
3113 HWY_API Vec128<T, N> ShiftLeftBytes(const Vec128<T, N> v) {
3114   return ShiftLeftBytes<kBytes>(Simd<T, N>(), v);
3115 }
3116
3117 // ------------------------------ ShiftLeftLanes
3118
3119 template <int kLanes, typename T, size_t N>
3120 HWY_API Vec128<T, N> ShiftLeftLanes(Simd<T, N> d, const Vec128<T, N> v) {
3121   const Repartition<uint8_t, decltype(d)> d8;
3122   return BitCast(d, ShiftLeftBytes<kLanes * sizeof(T)>(BitCast(d8, v)));
3123 }
3124
3125 template <int kLanes, typename T, size_t N>
3126 HWY_API Vec128<T, N> ShiftLeftLanes(const Vec128<T, N> v) {
3127   return ShiftLeftLanes<kLanes>(Simd<T, N>(), v);
3128 }
3129
3130 // ------------------------------ ShiftRightBytes
3131 template <int kBytes, typename T, size_t N>
3132 HWY_API Vec128<T, N> ShiftRightBytes(Simd<T, N> /* tag */, Vec128<T, N> v) {
3133   static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
3134   // For partial vectors, clear upper lanes so we shift in zeros.
3135   if (N != 16 / sizeof(T)) {
3136     const Vec128<T> vfull{v.raw};
3137     v = Vec128<T, N>{IfThenElseZero(FirstN(Full128<T>(), N), vfull).raw};
3138   }
3139   return Vec128<T, N>{_mm_srli_si128(v.raw, kBytes)};
3140 }
3141
3142 // ------------------------------ ShiftRightLanes
3143 template <int kLanes, typename T, size_t N>
3144 HWY_API Vec128<T, N> ShiftRightLanes(Simd<T, N> d, const Vec128<T, N> v) {
3145   const Repartition<uint8_t, decltype(d)> d8;
3146   return BitCast(d, ShiftRightBytes<kLanes * sizeof(T)>(d8, BitCast(d8, v)));
3147 }
3148
3149 // ------------------------------ UpperHalf (ShiftRightBytes)
3150
3151 // Full input: copy hi into lo (smaller instruction encoding than shifts).
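// Example (illustrative): for v = Iota(Full128<int32_t>(), 0) = {0, 1, 2, 3},
// UpperHalf(Half<Full128<int32_t>>(), v) returns {2, 3}.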
3152 template <typename T>
3153 HWY_API Vec128<T, 8 / sizeof(T)> UpperHalf(Half<Full128<T>> /* tag */,
3154                                            Vec128<T> v) {
3155   return Vec128<T, 8 / sizeof(T)>{_mm_unpackhi_epi64(v.raw, v.raw)};
3156 }
3157 HWY_API Vec128<float, 2> UpperHalf(Simd<float, 2> /* tag */, Vec128<float> v) {
3158   return Vec128<float, 2>{_mm_movehl_ps(v.raw, v.raw)};
3159 }
3160 HWY_API Vec128<double, 1> UpperHalf(Simd<double, 1> /* tag */,
3161                                     Vec128<double> v) {
3162   return Vec128<double, 1>{_mm_unpackhi_pd(v.raw, v.raw)};
3163 }
3164
3165 // Partial
3166 template <typename T, size_t N, HWY_IF_LE64(T, N)>
3167 HWY_API Vec128<T, (N + 1) / 2> UpperHalf(Half<Simd<T, N>> /* tag */,
3168                                          Vec128<T, N> v) {
3169   const Simd<T, N> d;
3170   const RebindToUnsigned<decltype(d)> du;
3171   const auto upper = BitCast(d, ShiftRightBytes<N * sizeof(T) / 2>(du, BitCast(du, v)));
3172   return Vec128<T, (N + 1) / 2>{upper.raw};
3173 }
3174
3175 // ------------------------------ CombineShiftRightBytes
3176
3177 template <int kBytes, typename T, class V = Vec128<T>>
3178 HWY_API V CombineShiftRightBytes(Full128<T> d, V hi, V lo) {
3179   const Repartition<uint8_t, decltype(d)> d8;
3180   return BitCast(d, Vec128<uint8_t>{_mm_alignr_epi8(
3181                         BitCast(d8, hi).raw, BitCast(d8, lo).raw, kBytes)});
3182 }
3183
3184 template <int kBytes, typename T, size_t N, HWY_IF_LE64(T, N),
3185           class V = Vec128<T, N>>
3186 HWY_API V CombineShiftRightBytes(Simd<T, N> d, V hi, V lo) {
3187   constexpr size_t kSize = N * sizeof(T);
3188   static_assert(0 < kBytes && kBytes < kSize, "kBytes invalid");
3189   const Repartition<uint8_t, decltype(d)> d8;
3190   const Full128<uint8_t> d_full8;
3191   using V8 = VFromD<decltype(d_full8)>;
3192   const V8 hi8{BitCast(d8, hi).raw};
3193   // Move into most-significant bytes
3194   const V8 lo8 = ShiftLeftBytes<16 - kSize>(V8{BitCast(d8, lo).raw});
3195   const V8 r = CombineShiftRightBytes<16 - kSize + kBytes>(d_full8, hi8, lo8);
3196   return V{BitCast(Full128<T>(), r).raw};
3197 }
3198
3199 // ------------------------------ Broadcast/splat any lane
3200
3201 // Unsigned
3202 template <int kLane, size_t N>
3203 HWY_API Vec128<uint16_t, N> Broadcast(const Vec128<uint16_t, N> v) {
3204   static_assert(0 <= kLane && kLane < N, "Invalid lane");
3205   if (kLane < 4) {
3206     const __m128i lo = _mm_shufflelo_epi16(v.raw, (0x55 * kLane) & 0xFF);
3207     return Vec128<uint16_t, N>{_mm_unpacklo_epi64(lo, lo)};
3208   } else {
3209     const __m128i hi = _mm_shufflehi_epi16(v.raw, (0x55 * (kLane - 4)) & 0xFF);
3210     return Vec128<uint16_t, N>{_mm_unpackhi_epi64(hi, hi)};
3211   }
3212 }
3213 template <int kLane, size_t N>
3214 HWY_API Vec128<uint32_t, N> Broadcast(const Vec128<uint32_t, N> v) {
3215   static_assert(0 <= kLane && kLane < N, "Invalid lane");
3216   return Vec128<uint32_t, N>{_mm_shuffle_epi32(v.raw, 0x55 * kLane)};
3217 }
3218 template <int kLane, size_t N>
3219 HWY_API Vec128<uint64_t, N> Broadcast(const Vec128<uint64_t, N> v) {
3220   static_assert(0 <= kLane && kLane < N, "Invalid lane");
3221   return Vec128<uint64_t, N>{_mm_shuffle_epi32(v.raw, kLane ? 0xEE : 0x44)};
3222 }
3223
3224 // Signed
3225 template <int kLane, size_t N>
3226 HWY_API Vec128<int16_t, N> Broadcast(const Vec128<int16_t, N> v) {
3227   static_assert(0 <= kLane && kLane < N, "Invalid lane");
3228   if (kLane < 4) {
3229     const __m128i lo = _mm_shufflelo_epi16(v.raw, (0x55 * kLane) & 0xFF);
3230     return Vec128<int16_t, N>{_mm_unpacklo_epi64(lo, lo)};
3231   } else {
3232     const __m128i hi = _mm_shufflehi_epi16(v.raw, (0x55 * (kLane - 4)) & 0xFF);
3233     return Vec128<int16_t, N>{_mm_unpackhi_epi64(hi, hi)};
3234   }
3235 }
3236 template <int kLane, size_t N>
3237 HWY_API Vec128<int32_t, N> Broadcast(const Vec128<int32_t, N> v) {
3238   static_assert(0 <= kLane && kLane < N, "Invalid lane");
3239   return Vec128<int32_t, N>{_mm_shuffle_epi32(v.raw, 0x55 * kLane)};
3240 }
3241 template <int kLane, size_t N>
3242 HWY_API Vec128<int64_t, N> Broadcast(const Vec128<int64_t, N> v) {
3243   static_assert(0 <= kLane && kLane < N, "Invalid lane");
3244   return Vec128<int64_t, N>{_mm_shuffle_epi32(v.raw, kLane ? 0xEE : 0x44)};
3245 }
3246
3247 // Float
3248 template <int kLane, size_t N>
3249 HWY_API Vec128<float, N> Broadcast(const Vec128<float, N> v) {
3250   static_assert(0 <= kLane && kLane < N, "Invalid lane");
3251   return Vec128<float, N>{_mm_shuffle_ps(v.raw, v.raw, 0x55 * kLane)};
3252 }
3253 template <int kLane, size_t N>
3254 HWY_API Vec128<double, N> Broadcast(const Vec128<double, N> v) {
3255   static_assert(0 <= kLane && kLane < N, "Invalid lane");
3256   return Vec128<double, N>{_mm_shuffle_pd(v.raw, v.raw, 3 * kLane)};
3257 }
3258
3259 // ------------------------------ TableLookupBytes
3260 template <typename T, size_t N, typename TI, size_t NI>
3261 HWY_API Vec128<TI, NI> TableLookupBytes(const Vec128<T, N> bytes,
3262                                         const Vec128<TI, NI> from) {
3263   return Vec128<TI, NI>{_mm_shuffle_epi8(bytes.raw, from.raw)};
3264 }
3265
3266 // ------------------------------ TableLookupBytesOr0
3267 // For all vector widths; x86 already zeroes the output byte if the index byte is >= 0x80.
3268 template <class V, class VI>
3269 HWY_API VI TableLookupBytesOr0(const V bytes, const VI from) {
3270   return TableLookupBytes(bytes, from);
3271 }
3272
3273 // ------------------------------ TableLookupLanes (Shuffle01)
3274
3275 // Returned by SetTableIndices/IndicesFromVec for use by TableLookupLanes.
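// Example (illustrative; "v" is a hypothetical full int32_t vector):
//   const Full128<int32_t> d;
//   alignas(16) const int32_t kIdx[4] = {3, 2, 1, 0};
//   const auto rev = TableLookupLanes(v, SetTableIndices(d, kIdx));  // reversed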
template <typename T, size_t N = 16 / sizeof(T)>
struct Indices128 {
  __m128i raw;
};

template <typename T, size_t N, typename TI, HWY_IF_LE128(T, N),
          HWY_IF_LANE_SIZE(T, 4)>
HWY_API Indices128<T, N> IndicesFromVec(Simd<T, N> d, Vec128<TI, N> vec) {
  static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
#if HWY_IS_DEBUG_BUILD
  const Simd<TI, N> di;
  HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) &&
              AllTrue(di, Lt(vec, Set(di, static_cast<TI>(N)))));
#endif

#if HWY_TARGET <= HWY_AVX2
  (void)d;
  return Indices128<T, N>{vec.raw};
#else
  const Repartition<uint8_t, decltype(d)> d8;
  using V8 = VFromD<decltype(d8)>;
  alignas(16) constexpr uint8_t kByteOffsets[16] = {0, 1, 2, 3, 0, 1, 2, 3,
                                                    0, 1, 2, 3, 0, 1, 2, 3};

  // Broadcast each lane index to all 4 bytes of T
  alignas(16) constexpr uint8_t kBroadcastLaneBytes[16] = {
      0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12};
  const V8 lane_indices = TableLookupBytes(vec, Load(d8, kBroadcastLaneBytes));

  // Shift to bytes
  const Repartition<uint16_t, decltype(d)> d16;
  const V8 byte_indices = BitCast(d8, ShiftLeft<2>(BitCast(d16, lane_indices)));

  return Indices128<T, N>{Add(byte_indices, Load(d8, kByteOffsets)).raw};
#endif
}

template <typename T, size_t N, typename TI, HWY_IF_LE128(T, N),
          HWY_IF_LANE_SIZE(T, 8)>
HWY_API Indices128<T, N> IndicesFromVec(Simd<T, N> /* tag */,
                                        Vec128<TI, N> vec) {
  static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
#if HWY_IS_DEBUG_BUILD
  const Simd<TI, N> di;
  HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) &&
              AllTrue(di, Lt(vec, Set(di, static_cast<TI>(N)))));
#endif

  // No change - even without AVX3, we can shuffle+blend.
  return Indices128<T, N>{vec.raw};
}

template <typename T, size_t N, typename TI, HWY_IF_LE128(T, N)>
HWY_API Indices128<T, N> SetTableIndices(Simd<T, N> d, const TI* idx) {
  const Rebind<TI, decltype(d)> di;
  return IndicesFromVec(d, LoadU(di, idx));
}

template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec128<T, N> TableLookupLanes(Vec128<T, N> v, Indices128<T, N> idx) {
#if HWY_TARGET <= HWY_AVX2
  const Simd<T, N> d;
  const Simd<float, N> df;
  const Vec128<float, N> perm{_mm_permutevar_ps(BitCast(df, v).raw, idx.raw)};
  return BitCast(d, perm);
#else
  return TableLookupBytes(v, Vec128<T, N>{idx.raw});
#endif
}

template <size_t N, HWY_IF_GE64(float, N)>
HWY_API Vec128<float, N> TableLookupLanes(Vec128<float, N> v,
                                          Indices128<float, N> idx) {
#if HWY_TARGET <= HWY_AVX2
  return Vec128<float, N>{_mm_permutevar_ps(v.raw, idx.raw)};
#else
  const Simd<int32_t, N> di;
  const Simd<float, N> df;
  return BitCast(df,
                 TableLookupBytes(BitCast(di, v), Vec128<int32_t, N>{idx.raw}));
#endif
}

// Single lane: no change
template <typename T>
HWY_API Vec128<T, 1> TableLookupLanes(Vec128<T, 1> v,
                                      Indices128<T, 1> /* idx */) {
  return v;
}
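// Note on the AVX2 path of the 64-bit overload below: _mm_permutevar_pd
// selects between the two lanes using bit 1 of each index (not bit 0), so
// the indices are doubled first (vidx += vidx). Illustrative example: an
// index of 1 in lane 0 becomes 2 after doubling; bit 1 is then set, so
// output lane 0 receives input lane 1.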
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Vec128<T> TableLookupLanes(Vec128<T> v, Indices128<T> idx) {
  const Full128<T> d;
  Vec128<int64_t> vidx{idx.raw};
#if HWY_TARGET <= HWY_AVX2
  // There is no _mm_permute[x]var_epi64.
  vidx += vidx;  // bit1 is the decider (unusual)
  const Full128<double> df;
  return BitCast(
      d, Vec128<double>{_mm_permutevar_pd(BitCast(df, v).raw, vidx.raw)});
#else
  // Only 2 lanes: can swap+blend. Choose v if vidx == iota. To avoid a 64-bit
  // comparison (expensive on SSSE3), just invert the upper lane and subtract 1
  // to obtain an all-zero or all-one mask.
  const Full128<int64_t> di;
  const Vec128<int64_t> same = (vidx ^ Iota(di, 0)) - Set(di, 1);
  const Mask128<T> mask_same = RebindMask(d, MaskFromVec(same));
  return IfThenElse(mask_same, v, Shuffle01(v));
#endif
}

HWY_API Vec128<double> TableLookupLanes(Vec128<double> v,
                                        Indices128<double> idx) {
  Vec128<int64_t> vidx{idx.raw};
#if HWY_TARGET <= HWY_AVX2
  vidx += vidx;  // bit1 is the decider (unusual)
  return Vec128<double>{_mm_permutevar_pd(v.raw, vidx.raw)};
#else
  // Only 2 lanes: can swap+blend. Choose v if vidx == iota. To avoid a 64-bit
  // comparison (expensive on SSSE3), just invert the upper lane and subtract 1
  // to obtain an all-zero or all-one mask.
  const Full128<double> d;
  const Full128<int64_t> di;
  const Vec128<int64_t> same = (vidx ^ Iota(di, 0)) - Set(di, 1);
  const Mask128<double> mask_same = RebindMask(d, MaskFromVec(same));
  return IfThenElse(mask_same, v, Shuffle01(v));
#endif
}

// ------------------------------ Reverse (Shuffle0123, Shuffle2301)

// Single lane: no change
template <typename T>
HWY_API Vec128<T, 1> Reverse(Simd<T, 1> /* tag */, const Vec128<T, 1> v) {
  return v;
}

// Two lanes: shuffle
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec128<T, 2> Reverse(Simd<T, 2> /* tag */, const Vec128<T, 2> v) {
  return Vec128<T, 2>{Shuffle2301(Vec128<T>{v.raw}).raw};
}

template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Vec128<T> Reverse(Full128<T> /* tag */, const Vec128<T> v) {
  return Shuffle01(v);
}

// Four lanes: shuffle
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec128<T> Reverse(Full128<T> /* tag */, const Vec128<T> v) {
  return Shuffle0123(v);
}

// 16-bit
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
HWY_API Vec128<T, N> Reverse(Simd<T, N> d, const Vec128<T, N> v) {
#if HWY_TARGET <= HWY_AVX3
  if (N == 1) return v;
  if (N == 2) {
    const Repartition<uint32_t, decltype(d)> du32;
    return BitCast(d, RotateRight<16>(BitCast(du32, v)));
  }
  const RebindToSigned<decltype(d)> di;
  alignas(16) constexpr int16_t kReverse[8] = {7, 6, 5, 4, 3, 2, 1, 0};
  const Vec128<int16_t, N> idx = Load(di, kReverse + (N == 8 ? 0 : 4));
  return BitCast(d, Vec128<int16_t, N>{
                        _mm_permutexvar_epi16(idx.raw, BitCast(di, v).raw)});
#else
  const RepartitionToWide<RebindToUnsigned<decltype(d)>> du32;
  return BitCast(d, RotateRight<16>(Reverse(du32, BitCast(du32, v))));
#endif
}

// ------------------------------ InterleaveLower

// Interleaves lanes from halves of the 128-bit blocks of "a" (which provides
// the least-significant lane) and "b". To concatenate two half-width integers
// into one, use ZipLower/Upper instead (also works with scalar).
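// Illustrative example (u32 lanes, least-significant lane first): for
// a = {a0, a1, a2, a3} and b = {b0, b1, b2, b3},
// InterleaveLower(a, b) = {a0, b0, a1, b1} and
// InterleaveUpper(a, b) = {a2, b2, a3, b3}.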
template <size_t N, HWY_IF_LE128(uint8_t, N)>
HWY_API Vec128<uint8_t, N> InterleaveLower(const Vec128<uint8_t, N> a,
                                           const Vec128<uint8_t, N> b) {
  return Vec128<uint8_t, N>{_mm_unpacklo_epi8(a.raw, b.raw)};
}
template <size_t N, HWY_IF_LE128(uint16_t, N)>
HWY_API Vec128<uint16_t, N> InterleaveLower(const Vec128<uint16_t, N> a,
                                            const Vec128<uint16_t, N> b) {
  return Vec128<uint16_t, N>{_mm_unpacklo_epi16(a.raw, b.raw)};
}
template <size_t N, HWY_IF_LE128(uint32_t, N)>
HWY_API Vec128<uint32_t, N> InterleaveLower(const Vec128<uint32_t, N> a,
                                            const Vec128<uint32_t, N> b) {
  return Vec128<uint32_t, N>{_mm_unpacklo_epi32(a.raw, b.raw)};
}
template <size_t N, HWY_IF_LE128(uint64_t, N)>
HWY_API Vec128<uint64_t, N> InterleaveLower(const Vec128<uint64_t, N> a,
                                            const Vec128<uint64_t, N> b) {
  return Vec128<uint64_t, N>{_mm_unpacklo_epi64(a.raw, b.raw)};
}

template <size_t N, HWY_IF_LE128(int8_t, N)>
HWY_API Vec128<int8_t, N> InterleaveLower(const Vec128<int8_t, N> a,
                                          const Vec128<int8_t, N> b) {
  return Vec128<int8_t, N>{_mm_unpacklo_epi8(a.raw, b.raw)};
}
template <size_t N, HWY_IF_LE128(int16_t, N)>
HWY_API Vec128<int16_t, N> InterleaveLower(const Vec128<int16_t, N> a,
                                           const Vec128<int16_t, N> b) {
  return Vec128<int16_t, N>{_mm_unpacklo_epi16(a.raw, b.raw)};
}
template <size_t N, HWY_IF_LE128(int32_t, N)>
HWY_API Vec128<int32_t, N> InterleaveLower(const Vec128<int32_t, N> a,
                                           const Vec128<int32_t, N> b) {
  return Vec128<int32_t, N>{_mm_unpacklo_epi32(a.raw, b.raw)};
}
template <size_t N, HWY_IF_LE128(int64_t, N)>
HWY_API Vec128<int64_t, N> InterleaveLower(const Vec128<int64_t, N> a,
                                           const Vec128<int64_t, N> b) {
  return Vec128<int64_t, N>{_mm_unpacklo_epi64(a.raw, b.raw)};
}

template <size_t N, HWY_IF_LE128(float, N)>
HWY_API Vec128<float, N> InterleaveLower(const Vec128<float, N> a,
                                         const Vec128<float, N> b) {
  return Vec128<float, N>{_mm_unpacklo_ps(a.raw, b.raw)};
}
template <size_t N, HWY_IF_LE128(double, N)>
HWY_API Vec128<double, N> InterleaveLower(const Vec128<double, N> a,
                                          const Vec128<double, N> b) {
  return Vec128<double, N>{_mm_unpacklo_pd(a.raw, b.raw)};
}

// Additional overload for the optional Simd<> tag.
template <typename T, size_t N, HWY_IF_LE128(T, N), class V = Vec128<T, N>>
HWY_API V InterleaveLower(Simd<T, N> /* tag */, V a, V b) {
  return InterleaveLower(a, b);
}

// ------------------------------ InterleaveUpper (UpperHalf)

// All functions inside detail lack the required D parameter.
namespace detail {

HWY_API Vec128<uint8_t> InterleaveUpper(const Vec128<uint8_t> a,
                                        const Vec128<uint8_t> b) {
  return Vec128<uint8_t>{_mm_unpackhi_epi8(a.raw, b.raw)};
}
HWY_API Vec128<uint16_t> InterleaveUpper(const Vec128<uint16_t> a,
                                         const Vec128<uint16_t> b) {
  return Vec128<uint16_t>{_mm_unpackhi_epi16(a.raw, b.raw)};
}
HWY_API Vec128<uint32_t> InterleaveUpper(const Vec128<uint32_t> a,
                                         const Vec128<uint32_t> b) {
  return Vec128<uint32_t>{_mm_unpackhi_epi32(a.raw, b.raw)};
}
HWY_API Vec128<uint64_t> InterleaveUpper(const Vec128<uint64_t> a,
                                         const Vec128<uint64_t> b) {
  return Vec128<uint64_t>{_mm_unpackhi_epi64(a.raw, b.raw)};
}

HWY_API Vec128<int8_t> InterleaveUpper(const Vec128<int8_t> a,
                                       const Vec128<int8_t> b) {
  return Vec128<int8_t>{_mm_unpackhi_epi8(a.raw, b.raw)};
}
HWY_API Vec128<int16_t> InterleaveUpper(const Vec128<int16_t> a,
                                        const Vec128<int16_t> b) {
  return Vec128<int16_t>{_mm_unpackhi_epi16(a.raw, b.raw)};
}
HWY_API Vec128<int32_t> InterleaveUpper(const Vec128<int32_t> a,
                                        const Vec128<int32_t> b) {
  return Vec128<int32_t>{_mm_unpackhi_epi32(a.raw, b.raw)};
}
HWY_API Vec128<int64_t> InterleaveUpper(const Vec128<int64_t> a,
                                        const Vec128<int64_t> b) {
  return Vec128<int64_t>{_mm_unpackhi_epi64(a.raw, b.raw)};
}

HWY_API Vec128<float> InterleaveUpper(const Vec128<float> a,
                                      const Vec128<float> b) {
  return Vec128<float>{_mm_unpackhi_ps(a.raw, b.raw)};
}
HWY_API Vec128<double> InterleaveUpper(const Vec128<double> a,
                                       const Vec128<double> b) {
  return Vec128<double>{_mm_unpackhi_pd(a.raw, b.raw)};
}

}  // namespace detail

// Full
template <typename T, class V = Vec128<T>>
HWY_API V InterleaveUpper(Full128<T> /* tag */, V a, V b) {
  return detail::InterleaveUpper(a, b);
}

// Partial
template <typename T, size_t N, HWY_IF_LE64(T, N), class V = Vec128<T, N>>
HWY_API V InterleaveUpper(Simd<T, N> d, V a, V b) {
  const Half<decltype(d)> d2;
  return InterleaveLower(d, V{UpperHalf(d2, a).raw}, V{UpperHalf(d2, b).raw});
}

// ------------------------------ ZipLower/ZipUpper (InterleaveLower)

// Same as Interleave*, except that the return lanes are double-width integers;
// this is necessary because the single-lane scalar cannot return two values.
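// Illustrative example: for u8 inputs a = {1, 2, ...} and b = {3, 4, ...},
// ZipLower returns u16 lanes {0x0301, 0x0402, ...}: each output lane is
// (b[i] << 8) | a[i], i.e. a provides the lower byte (little-endian lanes).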
template <typename T, size_t N, class DW = RepartitionToWide<Simd<T, N>>>
HWY_API VFromD<DW> ZipLower(Vec128<T, N> a, Vec128<T, N> b) {
  return BitCast(DW(), InterleaveLower(a, b));
}
template <typename T, size_t N, class D = Simd<T, N>,
          class DW = RepartitionToWide<D>>
HWY_API VFromD<DW> ZipLower(DW dw, Vec128<T, N> a, Vec128<T, N> b) {
  return BitCast(dw, InterleaveLower(D(), a, b));
}

template <typename T, size_t N, class D = Simd<T, N>,
          class DW = RepartitionToWide<D>>
HWY_API VFromD<DW> ZipUpper(DW dw, Vec128<T, N> a, Vec128<T, N> b) {
  return BitCast(dw, InterleaveUpper(D(), a, b));
}

// ================================================== COMBINE

// ------------------------------ Combine (InterleaveLower)

// N = N/2 + N/2 (upper half undefined)
template <typename T, size_t N, HWY_IF_LE128(T, N)>
HWY_API Vec128<T, N> Combine(Simd<T, N> d, Vec128<T, N / 2> hi_half,
                             Vec128<T, N / 2> lo_half) {
  const Half<decltype(d)> d2;
  const RebindToUnsigned<decltype(d2)> du2;
  // Treat half-width input as one lane, and expand to two lanes.
  using VU = Vec128<UnsignedFromSize<N * sizeof(T) / 2>, 2>;
  const VU lo{BitCast(du2, lo_half).raw};
  const VU hi{BitCast(du2, hi_half).raw};
  return BitCast(d, InterleaveLower(lo, hi));
}

// ------------------------------ ZeroExtendVector (Combine, IfThenElseZero)

template <typename T, HWY_IF_NOT_FLOAT(T)>
HWY_API Vec128<T> ZeroExtendVector(Full128<T> /* tag */,
                                   Vec128<T, 8 / sizeof(T)> lo) {
  return Vec128<T>{_mm_move_epi64(lo.raw)};
}

template <typename T, HWY_IF_FLOAT(T)>
HWY_API Vec128<T> ZeroExtendVector(Full128<T> d, Vec128<T, 8 / sizeof(T)> lo) {
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, ZeroExtendVector(du, BitCast(Half<decltype(du)>(), lo)));
}

template <typename T, size_t N, HWY_IF_LE64(T, N)>
HWY_API Vec128<T, N> ZeroExtendVector(Simd<T, N> d, Vec128<T, N / 2> lo) {
  return IfThenElseZero(FirstN(d, N / 2), Vec128<T, N>{lo.raw});
}

// ------------------------------ Concat full (InterleaveLower)

// hiH,hiL loH,loL |-> hiL,loL (= lower halves)
template <typename T>
HWY_API Vec128<T> ConcatLowerLower(Full128<T> d, Vec128<T> hi, Vec128<T> lo) {
  const Repartition<uint64_t, decltype(d)> d64;
  return BitCast(d, InterleaveLower(BitCast(d64, lo), BitCast(d64, hi)));
}

// hiH,hiL loH,loL |-> hiH,loH (= upper halves)
template <typename T>
HWY_API Vec128<T> ConcatUpperUpper(Full128<T> d, Vec128<T> hi, Vec128<T> lo) {
  const Repartition<uint64_t, decltype(d)> d64;
  return BitCast(d, InterleaveUpper(d64, BitCast(d64, lo), BitCast(d64, hi)));
}

// hiH,hiL loH,loL |-> hiL,loH (= inner halves)
template <typename T>
HWY_API Vec128<T> ConcatLowerUpper(Full128<T> d, const Vec128<T> hi,
                                   const Vec128<T> lo) {
  return CombineShiftRightBytes<8>(d, hi, lo);
}
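// Note on the blend in ConcatUpperLower below: each bit of the
// _mm_blend_epi16 immediate selects one 16-bit word, with set bits taking
// the word from the second operand. 0x0F thus takes the lower four words
// (the lower 64 bits) from lo and the rest from hi, yielding the outer
// halves. SSSE3 lacks PBLENDW, so that path instead uses _mm_move_sd, which
// moves the lower double of lo into hi.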
// hiH,hiL loH,loL |-> hiH,loL (= outer halves)
template <typename T>
HWY_API Vec128<T> ConcatUpperLower(Full128<T> d, Vec128<T> hi, Vec128<T> lo) {
#if HWY_TARGET == HWY_SSSE3
  const Full128<double> dd;
  const __m128d concat = _mm_move_sd(BitCast(dd, hi).raw, BitCast(dd, lo).raw);
  return BitCast(d, Vec128<double>{concat});
#else
  (void)d;
  return Vec128<T>{_mm_blend_epi16(hi.raw, lo.raw, 0x0F)};
#endif
}
HWY_API Vec128<float> ConcatUpperLower(Full128<float> /* tag */,
                                       const Vec128<float> hi,
                                       const Vec128<float> lo) {
  return Vec128<float>{_mm_shuffle_ps(lo.raw, hi.raw, _MM_SHUFFLE(3, 2, 1, 0))};
}
HWY_API Vec128<double> ConcatUpperLower(Full128<double> /* tag */,
                                        const Vec128<double> hi,
                                        const Vec128<double> lo) {
  return Vec128<double>{_mm_shuffle_pd(lo.raw, hi.raw, _MM_SHUFFLE2(1, 0))};
}

// ------------------------------ Concat partial (Combine, LowerHalf)

template <typename T, size_t N, HWY_IF_LE64(T, N)>
HWY_API Vec128<T, N> ConcatLowerLower(Simd<T, N> d, Vec128<T, N> hi,
                                      Vec128<T, N> lo) {
  const Half<decltype(d)> d2;
  return Combine(d, LowerHalf(d2, hi), LowerHalf(d2, lo));
}

template <typename T, size_t N, HWY_IF_LE64(T, N)>
HWY_API Vec128<T, N> ConcatUpperUpper(Simd<T, N> d, Vec128<T, N> hi,
                                      Vec128<T, N> lo) {
  const Half<decltype(d)> d2;
  return Combine(d, UpperHalf(d2, hi), UpperHalf(d2, lo));
}

template <typename T, size_t N, HWY_IF_LE64(T, N)>
HWY_API Vec128<T, N> ConcatLowerUpper(Simd<T, N> d, const Vec128<T, N> hi,
                                      const Vec128<T, N> lo) {
  const Half<decltype(d)> d2;
  return Combine(d, LowerHalf(d2, hi), UpperHalf(d2, lo));
}

template <typename T, size_t N, HWY_IF_LE64(T, N)>
HWY_API Vec128<T, N> ConcatUpperLower(Simd<T, N> d, Vec128<T, N> hi,
                                      Vec128<T, N> lo) {
  const Half<decltype(d)> d2;
  return Combine(d, UpperHalf(d2, hi), LowerHalf(d2, lo));
}

// ------------------------------ ConcatOdd

// 32-bit full
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec128<T> ConcatOdd(Full128<T> d, Vec128<T> hi, Vec128<T> lo) {
  const RebindToFloat<decltype(d)> df;
  return BitCast(
      d, Vec128<float>{_mm_shuffle_ps(BitCast(df, lo).raw, BitCast(df, hi).raw,
                                      _MM_SHUFFLE(3, 1, 3, 1))});
}
HWY_API Vec128<float> ConcatOdd(Full128<float> /* tag */, Vec128<float> hi,
                                Vec128<float> lo) {
  return Vec128<float>{_mm_shuffle_ps(lo.raw, hi.raw, _MM_SHUFFLE(3, 1, 3, 1))};
}

// 32-bit partial
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec128<T, 2> ConcatOdd(Simd<T, 2> d, Vec128<T, 2> hi, Vec128<T, 2> lo) {
  return InterleaveUpper(d, lo, hi);
}
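// Illustrative example (u32 lanes, least-significant first): with
// lo = {l0, l1, l2, l3} and hi = {h0, h1, h2, h3}, ConcatOdd above returns
// {l1, l3, h1, h3}, and ConcatEven below returns {l0, l2, h0, h2}.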
// 64-bit full - no partial because we need at least two inputs to have
// even/odd.
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Vec128<T> ConcatOdd(Full128<T> d, Vec128<T> hi, Vec128<T> lo) {
  return InterleaveUpper(d, lo, hi);
}

// ------------------------------ ConcatEven (InterleaveLower)

// 32-bit full
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec128<T> ConcatEven(Full128<T> d, Vec128<T> hi, Vec128<T> lo) {
  const RebindToFloat<decltype(d)> df;
  return BitCast(
      d, Vec128<float>{_mm_shuffle_ps(BitCast(df, lo).raw, BitCast(df, hi).raw,
                                      _MM_SHUFFLE(2, 0, 2, 0))});
}
HWY_API Vec128<float> ConcatEven(Full128<float> /* tag */, Vec128<float> hi,
                                 Vec128<float> lo) {
  return Vec128<float>{_mm_shuffle_ps(lo.raw, hi.raw, _MM_SHUFFLE(2, 0, 2, 0))};
}

// 32-bit partial
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec128<T, 2> ConcatEven(Simd<T, 2> d, Vec128<T, 2> hi,
                                Vec128<T, 2> lo) {
  return InterleaveLower(d, lo, hi);
}

// 64-bit full - no partial because we need at least two inputs to have
// even/odd.
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Vec128<T> ConcatEven(Full128<T> d, Vec128<T> hi, Vec128<T> lo) {
  return InterleaveLower(d, lo, hi);
}

// ------------------------------ OddEven (IfThenElse)

namespace detail {

template <typename T, size_t N>
HWY_INLINE Vec128<T, N> OddEven(hwy::SizeTag<1> /* tag */, const Vec128<T, N> a,
                                const Vec128<T, N> b) {
  const Simd<T, N> d;
  const Repartition<uint8_t, decltype(d)> d8;
  alignas(16) constexpr uint8_t mask[16] = {0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0,
                                            0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0};
  return IfThenElse(MaskFromVec(BitCast(d, Load(d8, mask))), b, a);
}
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> OddEven(hwy::SizeTag<2> /* tag */, const Vec128<T, N> a,
                                const Vec128<T, N> b) {
#if HWY_TARGET == HWY_SSSE3
  const Simd<T, N> d;
  const Repartition<uint8_t, decltype(d)> d8;
  alignas(16) constexpr uint8_t mask[16] = {0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0,
                                            0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0};
  return IfThenElse(MaskFromVec(BitCast(d, Load(d8, mask))), b, a);
#else
  return Vec128<T, N>{_mm_blend_epi16(a.raw, b.raw, 0x55)};
#endif
}
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> OddEven(hwy::SizeTag<4> /* tag */, const Vec128<T, N> a,
                                const Vec128<T, N> b) {
#if HWY_TARGET == HWY_SSSE3
  const __m128i odd = _mm_shuffle_epi32(a.raw, _MM_SHUFFLE(3, 1, 3, 1));
  const __m128i even = _mm_shuffle_epi32(b.raw, _MM_SHUFFLE(2, 0, 2, 0));
  return Vec128<T, N>{_mm_unpacklo_epi32(even, odd)};
#else
  return Vec128<T, N>{_mm_blend_epi16(a.raw, b.raw, 0x33)};
#endif
}
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> OddEven(hwy::SizeTag<8> /* tag */, const Vec128<T, N> a,
                                const Vec128<T, N> b) {
#if HWY_TARGET == HWY_SSSE3
  const Full128<double> dd;
  const __m128d concat = _mm_move_sd(BitCast(dd, a).raw, BitCast(dd, b).raw);
  return BitCast(Full128<T>(), Vec128<double>{concat});
#else
  return Vec128<T, N>{_mm_blend_epi16(a.raw, b.raw, 0x0F)};
#endif
}

}  // namespace detail

template <typename T, size_t N>
HWY_API Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b) {
  return detail::OddEven(hwy::SizeTag<sizeof(T)>(), a, b);
}
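// Illustrative example (u32 lanes, least-significant first): with
// a = {a0, a1, a2, a3} and b = {b0, b1, b2, b3}, OddEven(a, b) returns
// {b0, a1, b2, a3}: even-indexed lanes come from b, odd-indexed from a.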
template <size_t N>
HWY_API Vec128<float, N> OddEven(const Vec128<float, N> a,
                                 const Vec128<float, N> b) {
#if HWY_TARGET == HWY_SSSE3
  // SHUFPS must fill the lower half of the output from one register, so we
  // need another shuffle. Unpack avoids another immediate byte.
  const __m128 odd = _mm_shuffle_ps(a.raw, a.raw, _MM_SHUFFLE(3, 1, 3, 1));
  const __m128 even = _mm_shuffle_ps(b.raw, b.raw, _MM_SHUFFLE(2, 0, 2, 0));
  return Vec128<float, N>{_mm_unpacklo_ps(even, odd)};
#else
  return Vec128<float, N>{_mm_blend_ps(a.raw, b.raw, 5)};
#endif
}

template <size_t N>
HWY_API Vec128<double, N> OddEven(const Vec128<double, N> a,
                                  const Vec128<double, N> b) {
  return Vec128<double, N>{_mm_shuffle_pd(b.raw, a.raw, _MM_SHUFFLE2(1, 0))};
}

// ------------------------------ OddEvenBlocks
template <typename T, size_t N>
HWY_API Vec128<T, N> OddEvenBlocks(Vec128<T, N> /* odd */, Vec128<T, N> even) {
  return even;
}

// ------------------------------ SwapAdjacentBlocks

template <typename T, size_t N>
HWY_API Vec128<T, N> SwapAdjacentBlocks(Vec128<T, N> v) {
  return v;
}

// ------------------------------ Shl (ZipLower, Mul)

// Use AVX2/3 variable shifts where available, otherwise multiply by powers of
// two obtained by loading float exponents, which is considerably faster
// (according to LLVM-MCA) than scalar code or testing bits:
// https://gcc.godbolt.org/z/9G7Y9v.

#if HWY_TARGET > HWY_AVX3  // AVX2 or older
namespace detail {

// Returns 2^v for use as per-lane multipliers to emulate 16-bit shifts.
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
HWY_INLINE Vec128<MakeUnsigned<T>, N> Pow2(const Vec128<T, N> v) {
  const Simd<T, N> d;
  const RepartitionToWide<decltype(d)> dw;
  const Rebind<float, decltype(dw)> df;
  const auto zero = Zero(d);
  // Move into exponent (this u16 will become the upper half of an f32)
  const auto exp = ShiftLeft<23 - 16>(v);
  const auto upper = exp + Set(d, 0x3F80);  // upper half of 1.0f
  // Insert 0 into lower halves for reinterpreting as binary32.
  const auto f0 = ZipLower(dw, zero, upper);
  const auto f1 = ZipUpper(dw, zero, upper);
  // See comment below.
  const Vec128<int32_t, N> bits0{_mm_cvtps_epi32(BitCast(df, f0).raw)};
  const Vec128<int32_t, N> bits1{_mm_cvtps_epi32(BitCast(df, f1).raw)};
  return Vec128<MakeUnsigned<T>, N>{_mm_packus_epi32(bits0.raw, bits1.raw)};
}
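// Worked example of the exponent trick above (and in the 32-bit variant
// below): for a shift count v = 5, the biased exponent field becomes
// 127 + 5, so the assembled float is 1.0f * 2^5 = 32.0f; converting back to
// integer yields 32 = 2^5, the desired per-lane multiplier.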
// Same, for 32-bit shifts.
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
HWY_INLINE Vec128<MakeUnsigned<T>, N> Pow2(const Vec128<T, N> v) {
  const Simd<T, N> d;
  const auto exp = ShiftLeft<23>(v);
  const auto f = exp + Set(d, 0x3F800000);  // 1.0f
  // Do not use ConvertTo because we rely on the native 0x80..00 overflow
  // behavior. cvt instead of cvtt should be equivalent, but avoids test
  // failure under GCC 10.2.1.
  return Vec128<MakeUnsigned<T>, N>{_mm_cvtps_epi32(_mm_castsi128_ps(f.raw))};
}

}  // namespace detail
#endif  // HWY_TARGET > HWY_AVX3

template <size_t N>
HWY_API Vec128<uint16_t, N> operator<<(const Vec128<uint16_t, N> v,
                                       const Vec128<uint16_t, N> bits) {
#if HWY_TARGET <= HWY_AVX3
  return Vec128<uint16_t, N>{_mm_sllv_epi16(v.raw, bits.raw)};
#else
  return v * detail::Pow2(bits);
#endif
}
HWY_API Vec128<uint16_t, 1> operator<<(const Vec128<uint16_t, 1> v,
                                       const Vec128<uint16_t, 1> bits) {
  return Vec128<uint16_t, 1>{_mm_sll_epi16(v.raw, bits.raw)};
}

template <size_t N>
HWY_API Vec128<uint32_t, N> operator<<(const Vec128<uint32_t, N> v,
                                       const Vec128<uint32_t, N> bits) {
#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
  return v * detail::Pow2(bits);
#else
  return Vec128<uint32_t, N>{_mm_sllv_epi32(v.raw, bits.raw)};
#endif
}
HWY_API Vec128<uint32_t, 1> operator<<(const Vec128<uint32_t, 1> v,
                                       const Vec128<uint32_t, 1> bits) {
  return Vec128<uint32_t, 1>{_mm_sll_epi32(v.raw, bits.raw)};
}

HWY_API Vec128<uint64_t> operator<<(const Vec128<uint64_t> v,
                                    const Vec128<uint64_t> bits) {
#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
  // Individual shifts and combine
  const Vec128<uint64_t> out0{_mm_sll_epi64(v.raw, bits.raw)};
  const __m128i bits1 = _mm_unpackhi_epi64(bits.raw, bits.raw);
  const Vec128<uint64_t> out1{_mm_sll_epi64(v.raw, bits1)};
  return ConcatUpperLower(Full128<uint64_t>(), out1, out0);
#else
  return Vec128<uint64_t>{_mm_sllv_epi64(v.raw, bits.raw)};
#endif
}
HWY_API Vec128<uint64_t, 1> operator<<(const Vec128<uint64_t, 1> v,
                                       const Vec128<uint64_t, 1> bits) {
  return Vec128<uint64_t, 1>{_mm_sll_epi64(v.raw, bits.raw)};
}

// Signed left shift is the same as unsigned.
template <typename T, size_t N, HWY_IF_SIGNED(T)>
HWY_API Vec128<T, N> operator<<(const Vec128<T, N> v, const Vec128<T, N> bits) {
  const Simd<T, N> di;
  const Simd<MakeUnsigned<T>, N> du;
  return BitCast(di, BitCast(du, v) << BitCast(du, bits));
}

// ------------------------------ Shr (mul, mask, BroadcastSignBit)

// Use AVX2+ variable shifts except for SSSE3/SSE4 or 16-bit. There, we use
// widening multiplication by powers of two obtained by loading float
// exponents, followed by a constant right-shift. This is still faster than
// a scalar or bit-test approach: https://gcc.godbolt.org/z/9G7Y9v.
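// Note on the multiply-based right shift below: MulHigh returns the upper 16
// bits of the 32-bit product, so multiplying by 2^(16 - bits) and keeping
// the high half shifts right by `bits`. Illustrative example: in = 0x8000,
// bits = 1: MulHigh(0x8000, 2^15) = (0x8000 * 0x8000) >> 16 = 0x4000.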
template <size_t N>
HWY_API Vec128<uint16_t, N> operator>>(const Vec128<uint16_t, N> in,
                                       const Vec128<uint16_t, N> bits) {
#if HWY_TARGET <= HWY_AVX3
  return Vec128<uint16_t, N>{_mm_srlv_epi16(in.raw, bits.raw)};
#else
  const Simd<uint16_t, N> d;
  // For bits=0, we cannot mul by 2^16, so fix the result later.
  const auto out = MulHigh(in, detail::Pow2(Set(d, 16) - bits));
  // Replace output with input where bits == 0.
  return IfThenElse(bits == Zero(d), in, out);
#endif
}
HWY_API Vec128<uint16_t, 1> operator>>(const Vec128<uint16_t, 1> in,
                                       const Vec128<uint16_t, 1> bits) {
  return Vec128<uint16_t, 1>{_mm_srl_epi16(in.raw, bits.raw)};
}

template <size_t N>
HWY_API Vec128<uint32_t, N> operator>>(const Vec128<uint32_t, N> in,
                                       const Vec128<uint32_t, N> bits) {
#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
  // 32x32 -> 64 bit mul, then shift right by 32.
  const Simd<uint32_t, N> d32;
  // Move odd lanes into position for the second mul. Shuffle more gracefully
  // handles N=1 than repartitioning to u64 and shifting 32 bits right.
  const Vec128<uint32_t, N> in31{_mm_shuffle_epi32(in.raw, 0x31)};
  // For bits=0, we cannot mul by 2^32, so fix the result later.
  const auto mul = detail::Pow2(Set(d32, 32) - bits);
  const auto out20 = ShiftRight<32>(MulEven(in, mul));  // z 2 z 0
  const Vec128<uint32_t, N> mul31{_mm_shuffle_epi32(mul.raw, 0x31)};
  // No need to shift right, already in the correct position.
  const auto out31 = BitCast(d32, MulEven(in31, mul31));  // 3 ? 1 ?
  const Vec128<uint32_t, N> out = OddEven(out31, BitCast(d32, out20));
  // Replace output with input where bits == 0.
  return IfThenElse(bits == Zero(d32), in, out);
#else
  return Vec128<uint32_t, N>{_mm_srlv_epi32(in.raw, bits.raw)};
#endif
}
HWY_API Vec128<uint32_t, 1> operator>>(const Vec128<uint32_t, 1> in,
                                       const Vec128<uint32_t, 1> bits) {
  return Vec128<uint32_t, 1>{_mm_srl_epi32(in.raw, bits.raw)};
}

HWY_API Vec128<uint64_t> operator>>(const Vec128<uint64_t> v,
                                    const Vec128<uint64_t> bits) {
#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
  // Individual shifts and combine
  const Vec128<uint64_t> out0{_mm_srl_epi64(v.raw, bits.raw)};
  const __m128i bits1 = _mm_unpackhi_epi64(bits.raw, bits.raw);
  const Vec128<uint64_t> out1{_mm_srl_epi64(v.raw, bits1)};
  return ConcatUpperLower(Full128<uint64_t>(), out1, out0);
#else
  return Vec128<uint64_t>{_mm_srlv_epi64(v.raw, bits.raw)};
#endif
}
HWY_API Vec128<uint64_t, 1> operator>>(const Vec128<uint64_t, 1> v,
                                       const Vec128<uint64_t, 1> bits) {
  return Vec128<uint64_t, 1>{_mm_srl_epi64(v.raw, bits.raw)};
}

#if HWY_TARGET > HWY_AVX3  // AVX2 or older
namespace detail {
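// Worked example of the sign handling in SignedShr below (i32, v = -8,
// count = 2): sign = 0xFFFFFFFF, so v ^ sign = 7 (one less than |v|). The
// unsigned shift gives 7 >> 2 = 1, and the final xor restores the sign:
// 1 ^ 0xFFFFFFFF = -2, which matches the arithmetic shift -8 >> 2.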
// Also used in x86_256-inl.h.
template <class DI, class V>
HWY_INLINE V SignedShr(const DI di, const V v, const V count_i) {
  const RebindToUnsigned<DI> du;
  const auto count = BitCast(du, count_i);  // same type as value to shift
  // Clear sign and restore afterwards. This is preferable to shifting the MSB
  // downwards because Shr is somewhat more expensive than Shl.
  const auto sign = BroadcastSignBit(v);
  const auto abs = BitCast(du, v ^ sign);  // off by one, but fixed below
  return BitCast(di, abs >> count) ^ sign;
}

}  // namespace detail
#endif  // HWY_TARGET > HWY_AVX3

template <size_t N>
HWY_API Vec128<int16_t, N> operator>>(const Vec128<int16_t, N> v,
                                      const Vec128<int16_t, N> bits) {
#if HWY_TARGET <= HWY_AVX3
  return Vec128<int16_t, N>{_mm_srav_epi16(v.raw, bits.raw)};
#else
  return detail::SignedShr(Simd<int16_t, N>(), v, bits);
#endif
}
HWY_API Vec128<int16_t, 1> operator>>(const Vec128<int16_t, 1> v,
                                      const Vec128<int16_t, 1> bits) {
  return Vec128<int16_t, 1>{_mm_sra_epi16(v.raw, bits.raw)};
}

template <size_t N>
HWY_API Vec128<int32_t, N> operator>>(const Vec128<int32_t, N> v,
                                      const Vec128<int32_t, N> bits) {
#if HWY_TARGET <= HWY_AVX3
  return Vec128<int32_t, N>{_mm_srav_epi32(v.raw, bits.raw)};
#else
  return detail::SignedShr(Simd<int32_t, N>(), v, bits);
#endif
}
HWY_API Vec128<int32_t, 1> operator>>(const Vec128<int32_t, 1> v,
                                      const Vec128<int32_t, 1> bits) {
  return Vec128<int32_t, 1>{_mm_sra_epi32(v.raw, bits.raw)};
}

template <size_t N>
HWY_API Vec128<int64_t, N> operator>>(const Vec128<int64_t, N> v,
                                      const Vec128<int64_t, N> bits) {
#if HWY_TARGET <= HWY_AVX3
  return Vec128<int64_t, N>{_mm_srav_epi64(v.raw, bits.raw)};
#else
  return detail::SignedShr(Simd<int64_t, N>(), v, bits);
#endif
}

// ------------------------------ MulEven/Odd 64x64 (UpperHalf)

HWY_INLINE Vec128<uint64_t> MulEven(const Vec128<uint64_t> a,
                                    const Vec128<uint64_t> b) {
  alignas(16) uint64_t mul[2];
  mul[0] = Mul128(GetLane(a), GetLane(b), &mul[1]);
  return Load(Full128<uint64_t>(), mul);
}

HWY_INLINE Vec128<uint64_t> MulOdd(const Vec128<uint64_t> a,
                                   const Vec128<uint64_t> b) {
  alignas(16) uint64_t mul[2];
  const Half<Full128<uint64_t>> d2;
  mul[0] =
      Mul128(GetLane(UpperHalf(d2, a)), GetLane(UpperHalf(d2, b)), &mul[1]);
  return Load(Full128<uint64_t>(), mul);
}

// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)

template <size_t N>
HWY_API Vec128<float, N> ReorderWidenMulAccumulate(Simd<float, N> df32,
                                                   Vec128<bfloat16_t, 2 * N> a,
                                                   Vec128<bfloat16_t, 2 * N> b,
                                                   const Vec128<float, N> sum0,
                                                   Vec128<float, N>& sum1) {
  // TODO(janwas): _mm_dpbf16_ps when available
  const Repartition<uint16_t, decltype(df32)> du16;
  const RebindToUnsigned<decltype(df32)> du32;
  const Vec128<uint16_t, 2 * N> zero = Zero(du16);
  // Lane order within sum0/1 is undefined, hence we can avoid the
  // longer-latency lane-crossing PromoteTo.
  const Vec128<uint32_t, N> a0 = ZipLower(du32, zero, BitCast(du16, a));
  const Vec128<uint32_t, N> a1 = ZipUpper(du32, zero, BitCast(du16, a));
  const Vec128<uint32_t, N> b0 = ZipLower(du32, zero, BitCast(du16, b));
  const Vec128<uint32_t, N> b1 = ZipUpper(du32, zero, BitCast(du16, b));
  sum1 = MulAdd(BitCast(df32, a1), BitCast(df32, b1), sum1);
  return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0);
}
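// Note on the widening above: bfloat16 is the upper 16 bits of a binary32,
// so zipping a zero word below each bf16 lane reconstructs the exact f32
// value. Illustrative example: bf16 0x3F80 becomes u32 0x3F800000 = 1.0f.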
// ================================================== CONVERT

// ------------------------------ Promotions (part w/ narrow lanes -> full)

// Unsigned: zero-extend.
template <size_t N>
HWY_API Vec128<uint16_t, N> PromoteTo(Simd<uint16_t, N> /* tag */,
                                      const Vec128<uint8_t, N> v) {
#if HWY_TARGET == HWY_SSSE3
  const __m128i zero = _mm_setzero_si128();
  return Vec128<uint16_t, N>{_mm_unpacklo_epi8(v.raw, zero)};
#else
  return Vec128<uint16_t, N>{_mm_cvtepu8_epi16(v.raw)};
#endif
}
template <size_t N>
HWY_API Vec128<uint32_t, N> PromoteTo(Simd<uint32_t, N> /* tag */,
                                      const Vec128<uint16_t, N> v) {
#if HWY_TARGET == HWY_SSSE3
  return Vec128<uint32_t, N>{_mm_unpacklo_epi16(v.raw, _mm_setzero_si128())};
#else
  return Vec128<uint32_t, N>{_mm_cvtepu16_epi32(v.raw)};
#endif
}
template <size_t N>
HWY_API Vec128<uint64_t, N> PromoteTo(Simd<uint64_t, N> /* tag */,
                                      const Vec128<uint32_t, N> v) {
#if HWY_TARGET == HWY_SSSE3
  return Vec128<uint64_t, N>{_mm_unpacklo_epi32(v.raw, _mm_setzero_si128())};
#else
  return Vec128<uint64_t, N>{_mm_cvtepu32_epi64(v.raw)};
#endif
}
template <size_t N>
HWY_API Vec128<uint32_t, N> PromoteTo(Simd<uint32_t, N> /* tag */,
                                      const Vec128<uint8_t, N> v) {
#if HWY_TARGET == HWY_SSSE3
  const __m128i zero = _mm_setzero_si128();
  const __m128i u16 = _mm_unpacklo_epi8(v.raw, zero);
  return Vec128<uint32_t, N>{_mm_unpacklo_epi16(u16, zero)};
#else
  return Vec128<uint32_t, N>{_mm_cvtepu8_epi32(v.raw)};
#endif
}

// Unsigned to signed: same plus cast.
template <size_t N>
HWY_API Vec128<int16_t, N> PromoteTo(Simd<int16_t, N> di,
                                     const Vec128<uint8_t, N> v) {
  return BitCast(di, PromoteTo(Simd<uint16_t, N>(), v));
}
template <size_t N>
HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N> di,
                                     const Vec128<uint16_t, N> v) {
  return BitCast(di, PromoteTo(Simd<uint32_t, N>(), v));
}
template <size_t N>
HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N> di,
                                     const Vec128<uint8_t, N> v) {
  return BitCast(di, PromoteTo(Simd<uint32_t, N>(), v));
}

// Signed: replicate sign bit.
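// Note on the SSSE3 fallback below: unpacking v with itself duplicates each
// narrow lane into both halves of the widened lane; the arithmetic right
// shift then replaces the duplicated low half with copies of the sign bit.
// Illustrative example: i8 0xFE (-2) unpacks to 0xFEFE, and an arithmetic
// >> 8 yields 0xFFFE = i16 -2.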
template <size_t N>
HWY_API Vec128<int16_t, N> PromoteTo(Simd<int16_t, N> /* tag */,
                                     const Vec128<int8_t, N> v) {
#if HWY_TARGET == HWY_SSSE3
  return ShiftRight<8>(Vec128<int16_t, N>{_mm_unpacklo_epi8(v.raw, v.raw)});
#else
  return Vec128<int16_t, N>{_mm_cvtepi8_epi16(v.raw)};
#endif
}
template <size_t N>
HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N> /* tag */,
                                     const Vec128<int16_t, N> v) {
#if HWY_TARGET == HWY_SSSE3
  return ShiftRight<16>(Vec128<int32_t, N>{_mm_unpacklo_epi16(v.raw, v.raw)});
#else
  return Vec128<int32_t, N>{_mm_cvtepi16_epi32(v.raw)};
#endif
}
template <size_t N>
HWY_API Vec128<int64_t, N> PromoteTo(Simd<int64_t, N> /* tag */,
                                     const Vec128<int32_t, N> v) {
#if HWY_TARGET == HWY_SSSE3
  return ShiftRight<32>(Vec128<int64_t, N>{_mm_unpacklo_epi32(v.raw, v.raw)});
#else
  return Vec128<int64_t, N>{_mm_cvtepi32_epi64(v.raw)};
#endif
}
template <size_t N>
HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N> /* tag */,
                                     const Vec128<int8_t, N> v) {
#if HWY_TARGET == HWY_SSSE3
  const __m128i x2 = _mm_unpacklo_epi8(v.raw, v.raw);
  const __m128i x4 = _mm_unpacklo_epi16(x2, x2);
  return ShiftRight<24>(Vec128<int32_t, N>{x4});
#else
  return Vec128<int32_t, N>{_mm_cvtepi8_epi32(v.raw)};
#endif
}

// Workaround for origin tracking bug in Clang msan prior to 11.0
// (spurious "uninitialized memory" for TestF16 with "ORIGIN: invalid")
#if defined(MEMORY_SANITIZER) && \
    (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 1100)
#define HWY_INLINE_F16 HWY_NOINLINE
#else
#define HWY_INLINE_F16 HWY_INLINE
#endif
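// Background for the scalar f16 path below: binary16 has 1 sign bit, 5
// exponent bits (bias 15) and 10 mantissa bits. Illustrative example:
// 0x3C00 has biased exponent 15 and zero mantissa, i.e. 1.0, which the bit
// manipulation below maps to the f32 encoding 0x3F800000.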
template <size_t N>
HWY_INLINE_F16 Vec128<float, N> PromoteTo(Simd<float, N> df32,
                                          const Vec128<float16_t, N> v) {
#if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_F16C)
  const RebindToSigned<decltype(df32)> di32;
  const RebindToUnsigned<decltype(df32)> du32;
  // Expand to u32 so we can shift.
  const auto bits16 = PromoteTo(du32, Vec128<uint16_t, N>{v.raw});
  const auto sign = ShiftRight<15>(bits16);
  const auto biased_exp = ShiftRight<10>(bits16) & Set(du32, 0x1F);
  const auto mantissa = bits16 & Set(du32, 0x3FF);
  const auto subnormal =
      BitCast(du32, ConvertTo(df32, BitCast(di32, mantissa)) *
                        Set(df32, 1.0f / 16384 / 1024));

  const auto biased_exp32 = biased_exp + Set(du32, 127 - 15);
  const auto mantissa32 = ShiftLeft<23 - 10>(mantissa);
  const auto normal = ShiftLeft<23>(biased_exp32) | mantissa32;
  const auto bits32 = IfThenElse(biased_exp == Zero(du32), subnormal, normal);
  return BitCast(df32, ShiftLeft<31>(sign) | bits32);
#else
  (void)df32;
  return Vec128<float, N>{_mm_cvtph_ps(v.raw)};
#endif
}

template <size_t N>
HWY_API Vec128<float, N> PromoteTo(Simd<float, N> df32,
                                   const Vec128<bfloat16_t, N> v) {
  const Rebind<uint16_t, decltype(df32)> du16;
  const RebindToSigned<decltype(df32)> di32;
  return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v))));
}

template <size_t N>
HWY_API Vec128<double, N> PromoteTo(Simd<double, N> /* tag */,
                                    const Vec128<float, N> v) {
  return Vec128<double, N>{_mm_cvtps_pd(v.raw)};
}

template <size_t N>
HWY_API Vec128<double, N> PromoteTo(Simd<double, N> /* tag */,
                                    const Vec128<int32_t, N> v) {
  return Vec128<double, N>{_mm_cvtepi32_pd(v.raw)};
}

// ------------------------------ Demotions (full -> part w/ narrow lanes)

template <size_t N>
HWY_API Vec128<uint16_t, N> DemoteTo(Simd<uint16_t, N> /* tag */,
                                     const Vec128<int32_t, N> v) {
#if HWY_TARGET == HWY_SSSE3
  const Simd<int32_t, N> di32;
  const Simd<uint16_t, N * 2> du16;
  const auto zero_if_neg = AndNot(ShiftRight<31>(v), v);
  const auto too_big = VecFromMask(di32, Gt(v, Set(di32, 0xFFFF)));
  const auto clamped = Or(zero_if_neg, too_big);
  // Lower 2 bytes from each 32-bit lane; same as return type for fewer casts.
  alignas(16) constexpr uint16_t kLower2Bytes[16] = {
      0x0100, 0x0504, 0x0908, 0x0D0C, 0x8080, 0x8080, 0x8080, 0x8080};
  const auto lo2 = Load(du16, kLower2Bytes);
  return Vec128<uint16_t, N>{TableLookupBytes(BitCast(du16, clamped), lo2).raw};
#else
  return Vec128<uint16_t, N>{_mm_packus_epi32(v.raw, v.raw)};
#endif
}

template <size_t N>
HWY_API Vec128<int16_t, N> DemoteTo(Simd<int16_t, N> /* tag */,
                                    const Vec128<int32_t, N> v) {
  return Vec128<int16_t, N>{_mm_packs_epi32(v.raw, v.raw)};
}

template <size_t N>
HWY_API Vec128<uint8_t, N> DemoteTo(Simd<uint8_t, N> /* tag */,
                                    const Vec128<int32_t, N> v) {
  const __m128i i16 = _mm_packs_epi32(v.raw, v.raw);
  return Vec128<uint8_t, N>{_mm_packus_epi16(i16, i16)};
}

template <size_t N>
HWY_API Vec128<uint8_t, N> DemoteTo(Simd<uint8_t, N> /* tag */,
                                    const Vec128<int16_t, N> v) {
  return Vec128<uint8_t, N>{_mm_packus_epi16(v.raw, v.raw)};
}

template <size_t N>
HWY_API Vec128<int8_t, N> DemoteTo(Simd<int8_t, N> /* tag */,
                                   const Vec128<int32_t, N> v) {
  const __m128i i16 = _mm_packs_epi32(v.raw, v.raw);
  return Vec128<int8_t, N>{_mm_packs_epi16(i16, i16)};
}

template <size_t N>
HWY_API Vec128<int8_t, N> DemoteTo(Simd<int8_t, N> /* tag */,
                                   const Vec128<int16_t, N> v) {
  return Vec128<int8_t, N>{_mm_packs_epi16(v.raw, v.raw)};
}

template <size_t N>
HWY_API Vec128<float16_t, N> DemoteTo(Simd<float16_t, N> df16,
                                      const Vec128<float, N> v) {
#if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_F16C)
  const RebindToUnsigned<decltype(df16)> du16;
  const Rebind<uint32_t, decltype(df16)> du;
  const RebindToSigned<decltype(du)> di;
  const auto bits32 = BitCast(du, v);
  const auto sign = ShiftRight<31>(bits32);
  const auto biased_exp32 = ShiftRight<23>(bits32) & Set(du, 0xFF);
  const auto mantissa32 = bits32 & Set(du, 0x7FFFFF);

  const auto k15 = Set(di, 15);
  const auto exp = Min(BitCast(di, biased_exp32) - Set(di, 127), k15);
  const auto is_tiny = exp < Set(di, -24);

  const auto is_subnormal = exp < Set(di, -14);
  const auto biased_exp16 =
      BitCast(du, IfThenZeroElse(is_subnormal, exp + k15));
  const auto sub_exp = BitCast(du, Set(di, -14) - exp);  // [1, 11)
  const auto sub_m = (Set(du, 1) << (Set(du, 10) - sub_exp)) +
                     (mantissa32 >> (Set(du, 13) + sub_exp));
  const auto mantissa16 = IfThenElse(RebindMask(du, is_subnormal), sub_m,
                                     ShiftRight<13>(mantissa32));  // <1024

  const auto sign16 = ShiftLeft<15>(sign);
  const auto normal16 = sign16 | ShiftLeft<10>(biased_exp16) | mantissa16;
  const auto bits16 = IfThenZeroElse(is_tiny, BitCast(di, normal16));
  return BitCast(df16, DemoteTo(du16, bits16));
#else
  (void)df16;
  return Vec128<float16_t, N>{_mm_cvtps_ph(v.raw, _MM_FROUND_NO_EXC)};
#endif
}
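// Note on the bfloat16 demotion below: bf16 is simply the upper 16 bits of
// the f32 encoding, so the conversion is a logical right shift by 16 plus a
// narrowing. Illustrative example: f32 1.0f = 0x3F800000 becomes bf16
// 0x3F80. (This truncates rather than rounds to nearest.)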
template <size_t N>
HWY_API Vec128<bfloat16_t, N> DemoteTo(Simd<bfloat16_t, N> dbf16,
                                       const Vec128<float, N> v) {
  // TODO(janwas): _mm_cvtneps_pbh once we have avx512bf16.
  const Rebind<int32_t, decltype(dbf16)> di32;
  const Rebind<uint32_t, decltype(dbf16)> du32;  // for logical shift right
  const Rebind<uint16_t, decltype(dbf16)> du16;
  const auto bits_in_32 = BitCast(di32, ShiftRight<16>(BitCast(du32, v)));
  return BitCast(dbf16, DemoteTo(du16, bits_in_32));
}

template <size_t N>
HWY_API Vec128<bfloat16_t, 2 * N> ReorderDemote2To(
    Simd<bfloat16_t, 2 * N> dbf16, Vec128<float, N> a, Vec128<float, N> b) {
  // TODO(janwas): _mm_cvtne2ps_pbh once we have avx512bf16.
  const RebindToUnsigned<decltype(dbf16)> du16;
  const Repartition<uint32_t, decltype(dbf16)> du32;
  const Vec128<uint32_t, N> b_in_even = ShiftRight<16>(BitCast(du32, b));
  return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
}

template <size_t N>
HWY_API Vec128<float, N> DemoteTo(Simd<float, N> /* tag */,
                                  const Vec128<double, N> v) {
  return Vec128<float, N>{_mm_cvtpd_ps(v.raw)};
}

namespace detail {

// For well-defined float->int demotion in all x86_*-inl.h.

template <size_t N>
HWY_INLINE auto ClampF64ToI32Max(Simd<double, N> d, decltype(Zero(d)) v)
    -> decltype(Zero(d)) {
  // The max can be exactly represented in binary64, so clamping beforehand
  // prevents x86 conversion from raising an exception and returning 80..00.
  return Min(v, Set(d, 2147483647.0));
}

// For ConvertTo float->int of same size, clamping before conversion would
// change the result because the max integer value is not exactly
// representable. Instead detect the overflow result after conversion and
// fix it.
template <typename TI, size_t N, class DF = Simd<MakeFloat<TI>, N>>
HWY_INLINE auto FixConversionOverflow(Simd<TI, N> di,
                                      decltype(Zero(DF())) original,
                                      decltype(Zero(di).raw) converted_raw)
    -> decltype(Zero(di)) {
  // Combinations of original and output sign:
  // --: normal <0 or -huge_val to 80..00: OK
  // -+: -0 to 0                         : OK
  // +-: +huge_val to 80..00             : xor with FF..FF to get 7F..FF
  // ++: normal >0                       : OK
  const auto converted = decltype(Zero(di)){converted_raw};
  const auto sign_wrong = AndNot(BitCast(di, original), converted);
  return BitCast(di, Xor(converted, BroadcastSignBit(sign_wrong)));
}

}  // namespace detail

template <size_t N>
HWY_API Vec128<int32_t, N> DemoteTo(Simd<int32_t, N> /* tag */,
                                    const Vec128<double, N> v) {
  const auto clamped = detail::ClampF64ToI32Max(Simd<double, N>(), v);
  return Vec128<int32_t, N>{_mm_cvttpd_epi32(clamped.raw)};
}
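// Worked example of FixConversionOverflow (f32 -> i32): for an input of
// +3e9, CVTTPS2DQ overflows and returns 0x80000000. The original sign bit is
// 0 but the converted sign bit is 1, so sign_wrong has its MSB set;
// broadcasting that bit and xoring flips the result to 0x7FFFFFFF.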
// For already range-limited input [0, 255].
template <size_t N>
HWY_API Vec128<uint8_t, N> U8FromU32(const Vec128<uint32_t, N> v) {
  const Simd<uint32_t, N> d32;
  const Simd<uint8_t, N * 4> d8;
  alignas(16) static constexpr uint32_t k8From32[4] = {
      0x0C080400u, 0x0C080400u, 0x0C080400u, 0x0C080400u};
  // Also replicate bytes into all 32 bit lanes for safety.
  const auto quad = TableLookupBytes(v, Load(d32, k8From32));
  return LowerHalf(LowerHalf(BitCast(d8, quad)));
}

// ------------------------------ Integer <=> fp (ShiftRight, OddEven)

template <size_t N>
HWY_API Vec128<float, N> ConvertTo(Simd<float, N> /* tag */,
                                   const Vec128<int32_t, N> v) {
  return Vec128<float, N>{_mm_cvtepi32_ps(v.raw)};
}

template <size_t N>
HWY_API Vec128<double, N> ConvertTo(Simd<double, N> dd,
                                    const Vec128<int64_t, N> v) {
#if HWY_TARGET <= HWY_AVX3
  (void)dd;
  return Vec128<double, N>{_mm_cvtepi64_pd(v.raw)};
#else
  // Based on wim's approach (https://stackoverflow.com/questions/41144668/)
  const Repartition<uint32_t, decltype(dd)> d32;
  const Repartition<uint64_t, decltype(dd)> d64;

  // Toggle MSB of lower 32-bits and insert exponent for 2^84 + 2^63
  const auto k84_63 = Set(d64, 0x4530000080000000ULL);
  const auto v_upper = BitCast(dd, ShiftRight<32>(BitCast(d64, v)) ^ k84_63);

  // Exponent is 2^52, lower 32 bits from v (=> 32-bit OddEven)
  const auto k52 = Set(d32, 0x43300000);
  const auto v_lower = BitCast(dd, OddEven(k52, BitCast(d32, v)));

  const auto k84_63_52 = BitCast(dd, Set(d64, 0x4530000080100000ULL));
  return (v_upper - k84_63_52) + v_lower;  // order matters!
#endif
}

// Truncates (rounds toward zero).
template <size_t N>
HWY_API Vec128<int32_t, N> ConvertTo(const Simd<int32_t, N> di,
                                     const Vec128<float, N> v) {
  return detail::FixConversionOverflow(di, v, _mm_cvttps_epi32(v.raw));
}

// Full (partial handled below)
HWY_API Vec128<int64_t> ConvertTo(Full128<int64_t> di, const Vec128<double> v) {
#if HWY_TARGET <= HWY_AVX3 && HWY_ARCH_X86_64
  return detail::FixConversionOverflow(di, v, _mm_cvttpd_epi64(v.raw));
#elif HWY_ARCH_X86_64
  const __m128i i0 = _mm_cvtsi64_si128(_mm_cvttsd_si64(v.raw));
  const Half<Full128<double>> dd2;
  const __m128i i1 = _mm_cvtsi64_si128(_mm_cvttsd_si64(UpperHalf(dd2, v).raw));
  return detail::FixConversionOverflow(di, v, _mm_unpacklo_epi64(i0, i1));
#else
  using VI = decltype(Zero(di));
  const VI k0 = Zero(di);
  const VI k1 = Set(di, 1);
  const VI k51 = Set(di, 51);

  // Exponent indicates whether the number can be represented as int64_t.
  const VI biased_exp = ShiftRight<52>(BitCast(di, v)) & Set(di, 0x7FF);
  const VI exp = biased_exp - Set(di, 0x3FF);
  const auto in_range = exp < Set(di, 63);

  // If we were to cap the exponent at 51 and add 2^52, the number would be in
  // [2^52, 2^53) and mantissa bits could be read out directly. We need to
  // round-to-0 (truncate), but changing rounding mode in MXCSR hits a
  // compiler reordering bug: https://gcc.godbolt.org/z/4hKj6c6qc . We instead
  // manually shift the mantissa into place (we already have many of the
  // inputs anyway).
  const VI shift_mnt = Max(k51 - exp, k0);
  const VI shift_int = Max(exp - k51, k0);
  const VI mantissa = BitCast(di, v) & Set(di, (1ULL << 52) - 1);
  // Include implicit 1-bit; shift by one more to ensure it's in the mantissa.
  const VI int52 = (mantissa | Set(di, 1ULL << 52)) >> (shift_mnt + k1);
  // For inputs larger than 2^52, insert zeros at the bottom.
  const VI shifted = int52 << shift_int;
  // Restore the one bit lost when shifting in the implicit 1-bit.
  const VI restored = shifted | ((mantissa & k1) << (shift_int - k1));

  // Saturate to LimitsMin (unchanged when negating below) or LimitsMax.
  const VI sign_mask = BroadcastSignBit(BitCast(di, v));
  const VI limit = Set(di, LimitsMax<int64_t>()) - sign_mask;
  const VI magnitude = IfThenElse(in_range, restored, limit);

  // If the input was negative, negate the integer (two's complement).
  return (magnitude ^ sign_mask) - sign_mask;
#endif
}
HWY_API Vec128<int64_t, 1> ConvertTo(Simd<int64_t, 1> di,
                                     const Vec128<double, 1> v) {
  // Only need to specialize for non-AVX3, 64-bit (single scalar op)
#if HWY_TARGET > HWY_AVX3 && HWY_ARCH_X86_64
  const Vec128<int64_t, 1> i0{_mm_cvtsi64_si128(_mm_cvttsd_si64(v.raw))};
  return detail::FixConversionOverflow(di, v, i0.raw);
#else
  (void)di;
  const auto full = ConvertTo(Full128<int64_t>(), Vec128<double>{v.raw});
  return Vec128<int64_t, 1>{full.raw};
#endif
}

template <size_t N>
HWY_API Vec128<int32_t, N> NearestInt(const Vec128<float, N> v) {
  const Simd<int32_t, N> di;
  return detail::FixConversionOverflow(di, v, _mm_cvtps_epi32(v.raw));
}

// ------------------------------ Floating-point rounding (ConvertTo)

#if HWY_TARGET == HWY_SSSE3

// Toward nearest integer, ties to even
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> Round(const Vec128<T, N> v) {
  // Rely on rounding after addition with a large value such that no mantissa
  // bits remain (assuming the current mode is nearest-even). We may need a
  // compiler flag for precise floating-point to prevent "optimizing" this out.
  const Simd<T, N> df;
  const auto max = Set(df, MantissaEnd<T>());
  const auto large = CopySignToAbs(max, v);
  const auto added = large + v;
  const auto rounded = added - large;
  // Keep original if NaN or the magnitude is large (already an int).
  return IfThenElse(Abs(v) < max, rounded, v);
}

namespace detail {

// Truncating to integer and converting back to float is correct except when
// the input magnitude is large, in which case the input was already an
// integer (because mantissa >> exponent is zero).
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_INLINE Mask128<T, N> UseInt(const Vec128<T, N> v) {
  return Abs(v) < Set(Simd<T, N>(), MantissaEnd<T>());
}

}  // namespace detail

// Toward zero, aka truncate
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> Trunc(const Vec128<T, N> v) {
  const Simd<T, N> df;
  const RebindToSigned<decltype(df)> di;

  const auto integer = ConvertTo(di, v);  // round toward 0
  const auto int_f = ConvertTo(df, integer);

  return IfThenElse(detail::UseInt(v), CopySign(int_f, v), v);
}
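// Note on the Ceil/Floor implementations below: the comparison mask is
// reinterpreted as an integer (0 or all-ones = -1) and converted to float,
// so neg1 is either 0.0 or -1.0. Illustrative example for Ceil: v = 1.2
// truncates to int_f = 1.0 < v, hence neg1 = -1.0 and int_f - neg1 = 2.0.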
// Toward +infinity, aka ceiling
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> Ceil(const Vec128<T, N> v) {
  const Simd<T, N> df;
  const RebindToSigned<decltype(df)> di;

  const auto integer = ConvertTo(di, v);  // round toward 0
  const auto int_f = ConvertTo(df, integer);

  // Truncating a positive non-integer ends up smaller; if so, add 1.
  const auto neg1 = ConvertTo(df, VecFromMask(di, RebindMask(di, int_f < v)));

  return IfThenElse(detail::UseInt(v), int_f - neg1, v);
}

// Toward -infinity, aka floor
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> Floor(const Vec128<T, N> v) {
  const Simd<T, N> df;
  const RebindToSigned<decltype(df)> di;

  const auto integer = ConvertTo(di, v);  // round toward 0
  const auto int_f = ConvertTo(df, integer);

  // Truncating a negative non-integer ends up larger; if so, subtract 1.
  const auto neg1 = ConvertTo(df, VecFromMask(di, RebindMask(di, int_f > v)));

  return IfThenElse(detail::UseInt(v), int_f + neg1, v);
}

#else

// Toward nearest integer, ties to even
template <size_t N>
HWY_API Vec128<float, N> Round(const Vec128<float, N> v) {
  return Vec128<float, N>{
      _mm_round_ps(v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
}
template <size_t N>
HWY_API Vec128<double, N> Round(const Vec128<double, N> v) {
  return Vec128<double, N>{
      _mm_round_pd(v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
}

// Toward zero, aka truncate
template <size_t N>
HWY_API Vec128<float, N> Trunc(const Vec128<float, N> v) {
  return Vec128<float, N>{
      _mm_round_ps(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
}
template <size_t N>
HWY_API Vec128<double, N> Trunc(const Vec128<double, N> v) {
  return Vec128<double, N>{
      _mm_round_pd(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
}

// Toward +infinity, aka ceiling
template <size_t N>
HWY_API Vec128<float, N> Ceil(const Vec128<float, N> v) {
  return Vec128<float, N>{
      _mm_round_ps(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
}
template <size_t N>
HWY_API Vec128<double, N> Ceil(const Vec128<double, N> v) {
  return Vec128<double, N>{
      _mm_round_pd(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
}

// Toward -infinity, aka floor
template <size_t N>
HWY_API Vec128<float, N> Floor(const Vec128<float, N> v) {
  return Vec128<float, N>{
      _mm_round_ps(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
}
template <size_t N>
HWY_API Vec128<double, N> Floor(const Vec128<double, N> v) {
  return Vec128<double, N>{
      _mm_round_pd(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
}

#endif  // HWY_TARGET == HWY_SSSE3

// ================================================== CRYPTO

#if !defined(HWY_DISABLE_PCLMUL_AES) && HWY_TARGET != HWY_SSSE3
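// Background note on the intrinsics below: AESENC performs one full AES
// encryption round (ShiftRows, SubBytes, MixColumns, AddRoundKey) on the
// 16-byte state. PCLMULQDQ computes a 64x64 -> 128-bit carry-less (GF(2)[x])
// product; illustrative example: clmul(0b11, 0b11) = 0b101, i.e.
// (x+1)^2 = x^2 + 1.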
// Per-target flag to prevent generic_ops-inl.h from defining AESRound.
#ifdef HWY_NATIVE_AES
#undef HWY_NATIVE_AES
#else
#define HWY_NATIVE_AES
#endif

HWY_API Vec128<uint8_t> AESRound(Vec128<uint8_t> state,
                                 Vec128<uint8_t> round_key) {
  return Vec128<uint8_t>{_mm_aesenc_si128(state.raw, round_key.raw)};
}

template <size_t N, HWY_IF_LE128(uint64_t, N)>
HWY_API Vec128<uint64_t, N> CLMulLower(Vec128<uint64_t, N> a,
                                       Vec128<uint64_t, N> b) {
  return Vec128<uint64_t, N>{_mm_clmulepi64_si128(a.raw, b.raw, 0x00)};
}

template <size_t N, HWY_IF_LE128(uint64_t, N)>
HWY_API Vec128<uint64_t, N> CLMulUpper(Vec128<uint64_t, N> a,
                                       Vec128<uint64_t, N> b) {
  return Vec128<uint64_t, N>{_mm_clmulepi64_si128(a.raw, b.raw, 0x11)};
}

#endif  // !defined(HWY_DISABLE_PCLMUL_AES) && HWY_TARGET != HWY_SSSE3

// ================================================== MISC

#if HWY_TARGET <= HWY_AVX3

// ------------------------------ LoadMaskBits

// `bits` points to at least 8 readable bytes, not all of which need be valid.
template <typename T, size_t N, HWY_IF_LE128(T, N)>
HWY_API Mask128<T, N> LoadMaskBits(Simd<T, N> /* tag */,
                                   const uint8_t* HWY_RESTRICT bits) {
  uint64_t mask_bits = 0;
  constexpr size_t kNumBytes = (N + 7) / 8;
  CopyBytes<kNumBytes>(bits, &mask_bits);
  if (N < 8) {
    mask_bits &= (1ull << N) - 1;
  }

  return Mask128<T, N>::FromBits(mask_bits);
}

// ------------------------------ StoreMaskBits

// `bits` points to at least 8 writable bytes.
template <typename T, size_t N>
HWY_API size_t StoreMaskBits(const Simd<T, N> /* tag */,
                             const Mask128<T, N> mask, uint8_t* bits) {
  constexpr size_t kNumBytes = (N + 7) / 8;
  CopyBytes<kNumBytes>(&mask.raw, bits);

  // Non-full byte, need to clear the undefined upper bits.
  if (N < 8) {
    const int valid = (1 << N) - 1;
    bits[0] = static_cast<uint8_t>(bits[0] & valid);
  }

  return kNumBytes;
}

// ------------------------------ Mask testing

// Beware: the suffix indicates the number of mask bits, not lane size!

template <typename T, size_t N>
HWY_API size_t CountTrue(const Simd<T, N> /* tag */, const Mask128<T, N> mask) {
  const uint64_t mask_bits = static_cast<uint64_t>(mask.raw) & ((1u << N) - 1);
  return PopCount(mask_bits);
}

template <typename T, size_t N>
HWY_API intptr_t FindFirstTrue(const Simd<T, N> /* tag */,
                               const Mask128<T, N> mask) {
  const uint32_t mask_bits = static_cast<uint32_t>(mask.raw) & ((1u << N) - 1);
  return mask_bits ? intptr_t(Num0BitsBelowLS1Bit_Nonzero32(mask_bits)) : -1;
}

template <typename T, size_t N>
HWY_API bool AllFalse(const Simd<T, N> /* tag */, const Mask128<T, N> mask) {
  const uint64_t mask_bits = static_cast<uint64_t>(mask.raw) & ((1u << N) - 1);
  return mask_bits == 0;
}

template <typename T, size_t N>
HWY_API bool AllTrue(const Simd<T, N> /* tag */, const Mask128<T, N> mask) {
  const uint64_t mask_bits = static_cast<uint64_t>(mask.raw) & ((1u << N) - 1);
  // Cannot use _kortestc because we may have less than 8 mask bits.
  return mask_bits == (1u << N) - 1;
}

// ------------------------------ Compress

#if HWY_TARGET != HWY_AVX3_DL
namespace detail {
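// Layout of the table below: one 8-byte row per possible 8-bit mask. Row i
// lists, in ascending order, the lane indices whose bits are set in i,
// followed by zero padding. Illustrative example: mask_bits = 0b0101 selects
// lanes 0 and 2, so its row begins {0, 2, 0, 0, ...}.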

// ------------------------------ Compress

#if HWY_TARGET != HWY_AVX3_DL
namespace detail {

// Returns permutevar_epi16 indices for 16-bit Compress. Also used by x86_256.
HWY_INLINE Vec128<uint16_t, 8> IndicesForCompress16(uint64_t mask_bits) {
  Full128<uint16_t> du16;
  // Table of u16 indices packed into bytes to reduce L1 usage. Will be
  // unpacked to u16. Ideally we would broadcast 8*3 (half of the 8 bytes
  // currently used) bits into each lane and then varshift, but that does not
  // fit in 16 bits.
  Rebind<uint8_t, decltype(du16)> du8;
  alignas(16) constexpr uint8_t tbl[2048] = {
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
      1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 1, 2,
      0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0,
      0, 0, 0, 0, 0, 1, 3, 0, 0, 0, 0, 0, 0, 0, 1, 3, 0, 0, 0, 0, 0, 2, 3, 0, 0,
      0, 0, 0, 0, 0, 2, 3, 0, 0, 0, 0, 0, 1, 2, 3, 0, 0, 0, 0, 0, 0, 1, 2, 3, 0,
      0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 1, 4, 0, 0, 0, 0,
      0, 0, 0, 1, 4, 0, 0, 0, 0, 0, 2, 4, 0, 0, 0, 0, 0, 0, 0, 2, 4, 0, 0, 0, 0,
      0, 1, 2, 4, 0, 0, 0, 0, 0, 0, 1, 2, 4, 0, 0, 0, 0, 3, 4, 0, 0, 0, 0, 0, 0,
      0, 3, 4, 0, 0, 0, 0, 0, 1, 3, 4, 0, 0, 0, 0, 0, 0, 1, 3, 4, 0, 0, 0, 0, 2,
      3, 4, 0, 0, 0, 0, 0, 0, 2, 3, 4, 0, 0, 0, 0, 1, 2, 3, 4, 0, 0, 0, 0, 0, 1,
      2, 3, 4, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 1, 5, 0,
      0, 0, 0, 0, 0, 0, 1, 5, 0, 0, 0, 0, 0, 2, 5, 0, 0, 0, 0, 0, 0, 0, 2, 5, 0,
      0, 0, 0, 0, 1, 2, 5, 0, 0, 0, 0, 0, 0, 1, 2, 5, 0, 0, 0, 0, 3, 5, 0, 0, 0,
      0, 0, 0, 0, 3, 5, 0, 0, 0, 0, 0, 1, 3, 5, 0, 0, 0, 0, 0, 0, 1, 3, 5, 0, 0,
      0, 0, 2, 3, 5, 0, 0, 0, 0, 0, 0, 2, 3, 5, 0, 0, 0, 0, 1, 2, 3, 5, 0, 0, 0,
      0, 0, 1, 2, 3, 5, 0, 0, 0, 4, 5, 0, 0, 0, 0, 0, 0, 0, 4, 5, 0, 0, 0, 0, 0,
      1, 4, 5, 0, 0, 0, 0, 0, 0, 1, 4, 5, 0, 0, 0, 0, 2, 4, 5, 0, 0, 0, 0, 0, 0,
      2, 4, 5, 0, 0, 0, 0, 1, 2, 4, 5, 0, 0, 0, 0, 0, 1, 2, 4, 5, 0, 0, 0, 3, 4,
      5, 0, 0, 0, 0, 0, 0, 3, 4, 5, 0, 0, 0, 0, 1, 3, 4, 5, 0, 0, 0, 0, 0, 1, 3,
      4, 5, 0, 0, 0, 2, 3, 4, 5, 0, 0, 0, 0, 0, 2, 3, 4, 5, 0, 0, 0, 1, 2, 3, 4,
      5, 0, 0, 0, 0, 1, 2, 3, 4, 5, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0,
      0, 0, 0, 1, 6, 0, 0, 0, 0, 0, 0, 0, 1, 6, 0, 0, 0, 0, 0, 2, 6, 0, 0, 0, 0,
      0, 0, 0, 2, 6, 0, 0, 0, 0, 0, 1, 2, 6, 0, 0, 0, 0, 0, 0, 1, 2, 6, 0, 0, 0,
      0, 3, 6, 0, 0, 0, 0, 0, 0, 0, 3, 6, 0, 0, 0, 0, 0, 1, 3, 6, 0, 0, 0, 0, 0,
      0, 1, 3, 6, 0, 0, 0, 0, 2, 3, 6, 0, 0, 0, 0, 0, 0, 2, 3, 6, 0, 0, 0, 0, 1,
      2, 3, 6, 0, 0, 0, 0, 0, 1, 2, 3, 6, 0, 0, 0, 4, 6, 0, 0, 0, 0, 0, 0, 0, 4,
      6, 0, 0, 0, 0, 0, 1, 4, 6, 0, 0, 0, 0, 0, 0, 1, 4, 6, 0, 0, 0, 0, 2, 4, 6,
      0, 0, 0, 0, 0, 0, 2, 4, 6, 0, 0, 0, 0, 1, 2, 4, 6, 0, 0, 0, 0, 0, 1, 2, 4,
      6, 0, 0, 0, 3, 4, 6, 0, 0, 0, 0, 0, 0, 3, 4, 6, 0, 0, 0, 0, 1, 3, 4, 6, 0,
      0, 0, 0, 0, 1, 3, 4, 6, 0, 0, 0, 2, 3, 4, 6, 0, 0, 0, 0, 0, 2, 3, 4, 6, 0,
      0, 0, 1, 2, 3, 4, 6, 0, 0, 0, 0, 1, 2, 3, 4, 6, 0, 0, 5, 6, 0, 0, 0, 0, 0,
      0, 0, 5, 6, 0, 0, 0, 0, 0, 1, 5, 6, 0, 0, 0, 0, 0, 0, 1, 5, 6, 0, 0, 0, 0,
      2, 5, 6, 0, 0, 0, 0, 0, 0, 2, 5, 6, 0, 0, 0, 0, 1, 2, 5, 6, 0, 0, 0, 0, 0,
      1, 2, 5, 6, 0, 0, 0, 3, 5, 6, 0, 0, 0, 0, 0, 0, 3, 5, 6, 0, 0, 0, 0, 1, 3,
      5, 6, 0, 0, 0, 0, 0, 1, 3, 5, 6, 0, 0, 0, 2, 3, 5, 6, 0, 0, 0, 0, 0, 2, 3,
      5, 6, 0, 0, 0, 1, 2, 3, 5, 6, 0, 0, 0, 0, 1, 2, 3, 5, 6, 0, 0, 4, 5, 6, 0,
      0, 0, 0, 0, 0, 4, 5, 6, 0, 0, 0, 0, 1, 4, 5, 6, 0, 0, 0, 0, 0, 1, 4, 5, 6,
      0, 0, 0, 2, 4, 5, 6, 0, 0, 0, 0, 0, 2, 4, 5, 6, 0, 0, 0, 1, 2, 4, 5, 6, 0,
      0, 0, 0, 1, 2, 4, 5, 6, 0, 0, 3, 4, 5, 6, 0, 0, 0, 0, 0, 3, 4, 5, 6, 0, 0,
      0, 1, 3, 4, 5, 6, 0, 0, 0, 0, 1, 3, 4, 5, 6, 0, 0, 2, 3, 4, 5, 6, 0, 0, 0,
      0, 2, 3, 4, 5, 6, 0, 0, 1, 2, 3, 4, 5, 6, 0, 0, 0, 1, 2, 3, 4, 5, 6, 0, 7,
      0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 1, 7, 0, 0, 0, 0, 0, 0, 0, 1,
      7, 0, 0, 0, 0, 0, 2, 7, 0, 0, 0, 0, 0, 0, 0, 2, 7, 0, 0, 0, 0, 0, 1, 2, 7,
      0, 0, 0, 0, 0, 0, 1, 2, 7, 0, 0, 0, 0, 3, 7, 0, 0, 0, 0, 0, 0, 0, 3, 7, 0,
      0, 0, 0, 0, 1, 3, 7, 0, 0, 0, 0, 0, 0, 1, 3, 7, 0, 0, 0, 0, 2, 3, 7, 0, 0,
      0, 0, 0, 0, 2, 3, 7, 0, 0, 0, 0, 1, 2, 3, 7, 0, 0, 0, 0, 0, 1, 2, 3, 7, 0,
      0, 0, 4, 7, 0, 0, 0, 0, 0, 0, 0, 4, 7, 0, 0, 0, 0, 0, 1, 4, 7, 0, 0, 0, 0,
      0, 0, 1, 4, 7, 0, 0, 0, 0, 2, 4, 7, 0, 0, 0, 0, 0, 0, 2, 4, 7, 0, 0, 0, 0,
      1, 2, 4, 7, 0, 0, 0, 0, 0, 1, 2, 4, 7, 0, 0, 0, 3, 4, 7, 0, 0, 0, 0, 0, 0,
      3, 4, 7, 0, 0, 0, 0, 1, 3, 4, 7, 0, 0, 0, 0, 0, 1, 3, 4, 7, 0, 0, 0, 2, 3,
      4, 7, 0, 0, 0, 0, 0, 2, 3, 4, 7, 0, 0, 0, 1, 2, 3, 4, 7, 0, 0, 0, 0, 1, 2,
      3, 4, 7, 0, 0, 5, 7, 0, 0, 0, 0, 0, 0, 0, 5, 7, 0, 0, 0, 0, 0, 1, 5, 7, 0,
      0, 0, 0, 0, 0, 1, 5, 7, 0, 0, 0, 0, 2, 5, 7, 0, 0, 0, 0, 0, 0, 2, 5, 7, 0,
      0, 0, 0, 1, 2, 5, 7, 0, 0, 0, 0, 0, 1, 2, 5, 7, 0, 0, 0, 3, 5, 7, 0, 0, 0,
      0, 0, 0, 3, 5, 7, 0, 0, 0, 0, 1, 3, 5, 7, 0, 0, 0, 0, 0, 1, 3, 5, 7, 0, 0,
      0, 2, 3, 5, 7, 0, 0, 0, 0, 0, 2, 3, 5, 7, 0, 0, 0, 1, 2, 3, 5, 7, 0, 0, 0,
      0, 1, 2, 3, 5, 7, 0, 0, 4, 5, 7, 0, 0, 0, 0, 0, 0, 4, 5, 7, 0, 0, 0, 0, 1,
      4, 5, 7, 0, 0, 0, 0, 0, 1, 4, 5, 7, 0, 0, 0, 2, 4, 5, 7, 0, 0, 0, 0, 0, 2,
      4, 5, 7, 0, 0, 0, 1, 2, 4, 5, 7, 0, 0, 0, 0, 1, 2, 4, 5, 7, 0, 0, 3, 4, 5,
      7, 0, 0, 0, 0, 0, 3, 4, 5, 7, 0, 0, 0, 1, 3, 4, 5, 7, 0, 0, 0, 0, 1, 3, 4,
      5, 7, 0, 0, 2, 3, 4, 5, 7, 0, 0, 0, 0, 2, 3, 4, 5, 7, 0, 0, 1, 2, 3, 4, 5,
      7, 0, 0, 0, 1, 2, 3, 4, 5, 7, 0, 6, 7, 0, 0, 0, 0, 0, 0, 0, 6, 7, 0, 0, 0,
      0, 0, 1, 6, 7, 0, 0, 0, 0, 0, 0, 1, 6, 7, 0, 0, 0, 0, 2, 6, 7, 0, 0, 0, 0,
      0, 0, 2, 6, 7, 0, 0, 0, 0, 1, 2, 6, 7, 0, 0, 0, 0, 0, 1, 2, 6, 7, 0, 0, 0,
      3, 6, 7, 0, 0, 0, 0, 0, 0, 3, 6, 7, 0, 0, 0, 0, 1, 3, 6, 7, 0, 0, 0, 0, 0,
      1, 3, 6, 7, 0, 0, 0, 2, 3, 6, 7, 0, 0, 0, 0, 0, 2, 3, 6, 7, 0, 0, 0, 1, 2,
      3, 6, 7, 0, 0, 0, 0, 1, 2, 3, 6, 7, 0, 0, 4, 6, 7, 0, 0, 0, 0, 0, 0, 4, 6,
      7, 0, 0, 0, 0, 1, 4, 6, 7, 0, 0, 0, 0, 0, 1, 4, 6, 7, 0, 0, 0, 2, 4, 6, 7,
      0, 0, 0, 0, 0, 2, 4, 6, 7, 0, 0, 0, 1, 2, 4, 6, 7, 0, 0, 0, 0, 1, 2, 4, 6,
      7, 0, 0, 3, 4, 6, 7, 0, 0, 0, 0, 0, 3, 4, 6, 7, 0, 0, 0, 1, 3, 4, 6, 7, 0,
      0, 0, 0, 1, 3, 4, 6, 7, 0, 0, 2, 3, 4, 6, 7, 0, 0, 0, 0, 2, 3, 4, 6, 7, 0,
      0, 1, 2, 3, 4, 6, 7, 0, 0, 0, 1, 2, 3, 4, 6, 7, 0, 5, 6, 7, 0, 0, 0, 0, 0,
      0, 5, 6, 7, 0, 0, 0, 0, 1, 5, 6, 7, 0, 0, 0, 0, 0, 1, 5, 6, 7, 0, 0, 0, 2,
      5, 6, 7, 0, 0, 0, 0, 0, 2, 5, 6, 7, 0, 0, 0, 1, 2, 5, 6, 7, 0, 0, 0, 0, 1,
      2, 5, 6, 7, 0, 0, 3, 5, 6, 7, 0, 0, 0, 0, 0, 3, 5, 6, 7, 0, 0, 0, 1, 3, 5,
      6, 7, 0, 0, 0, 0, 1, 3, 5, 6, 7, 0, 0, 2, 3, 5, 6, 7, 0, 0, 0, 0, 2, 3, 5,
      6, 7, 0, 0, 1, 2, 3, 5, 6, 7, 0, 0, 0, 1, 2, 3, 5, 6, 7, 0, 4, 5, 6, 7, 0,
      0, 0, 0, 0, 4, 5, 6, 7, 0, 0, 0, 1, 4, 5, 6, 7, 0, 0, 0, 0, 1, 4, 5, 6, 7,
      0, 0, 2, 4, 5, 6, 7, 0, 0, 0, 0, 2, 4, 5, 6, 7, 0, 0, 1, 2, 4, 5, 6, 7, 0,
      0, 0, 1, 2, 4, 5, 6, 7, 0, 3, 4, 5, 6, 7, 0, 0, 0, 0, 3, 4, 5, 6, 7, 0, 0,
      1, 3, 4, 5, 6, 7, 0, 0, 0, 1, 3, 4, 5, 6, 7, 0, 2, 3, 4, 5, 6, 7, 0, 0, 0,
      2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 0, 1, 2, 3, 4, 5, 6, 7};
  return PromoteTo(du16, Load(du8, tbl + mask_bits * 8));
}
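
// Example (illustrative): mask_bits = 0b101 selects lanes 0 and 2, so
// tbl + 5 * 8 holds {0, 2, 0, 0, 0, 0, 0, 0}; PromoteTo widens these bytes to
// the u16 indices consumed by permutevar_epi16, which moves lane 2 next to
// lane 0.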

}  // namespace detail
#endif  // HWY_TARGET != HWY_AVX3_DL

template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> mask) {
  const Simd<T, N> d;
  const Rebind<uint16_t, decltype(d)> du;
  const auto vu = BitCast(du, v);  // (required for float16_t inputs)

#if HWY_TARGET == HWY_AVX3_DL  // VBMI2
  const Vec128<uint16_t, N> cu{_mm_maskz_compress_epi16(mask.raw, vu.raw)};
#else
  const auto idx = detail::IndicesForCompress16(uint64_t{mask.raw});
  const Vec128<uint16_t, N> cu{_mm_permutexvar_epi16(idx.raw, vu.raw)};
#endif  // HWY_TARGET == HWY_AVX3_DL
  return BitCast(d, cu);
}

template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> mask) {
  return Vec128<T, N>{_mm_maskz_compress_epi32(mask.raw, v.raw)};
}

template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> mask) {
  return Vec128<T, N>{_mm_maskz_compress_epi64(mask.raw, v.raw)};
}

template <size_t N>
HWY_API Vec128<float, N> Compress(Vec128<float, N> v, Mask128<float, N> mask) {
  return Vec128<float, N>{_mm_maskz_compress_ps(mask.raw, v.raw)};
}

template <size_t N>
HWY_API Vec128<double, N> Compress(Vec128<double, N> v,
                                   Mask128<double, N> mask) {
  return Vec128<double, N>{_mm_maskz_compress_pd(mask.raw, v.raw)};
}

// ------------------------------ CompressBits (LoadMaskBits)

template <typename T, size_t N>
HWY_API Vec128<T, N> CompressBits(Vec128<T, N> v,
                                  const uint8_t* HWY_RESTRICT bits) {
  return Compress(v, LoadMaskBits(Simd<T, N>(), bits));
}
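
// Example usage (illustrative sketch): Compress moves the lanes selected by
// the mask to the front, e.g. when filtering an array:
//   const Simd<int32_t, 4> d;
//   const auto v = Iota(d, 0);           // {0, 1, 2, 3}
//   const auto m = Gt(v, Set(d, 1));     // lanes 2 and 3 true
//   const auto packed = Compress(v, m);  // {2, 3, ?, ?}; trailing lanes are
//                                        // unspecified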
// ------------------------------ CompressStore

template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
HWY_API size_t CompressStore(Vec128<T, N> v, Mask128<T, N> mask, Simd<T, N> d,
                             T* HWY_RESTRICT unaligned) {
  const Rebind<uint16_t, decltype(d)> du;
  const auto vu = BitCast(du, v);  // (required for float16_t inputs)

  const uint64_t mask_bits{mask.raw};

#if HWY_TARGET == HWY_AVX3_DL  // VBMI2
  _mm_mask_compressstoreu_epi16(unaligned, mask.raw, vu.raw);
#else
  const auto idx = detail::IndicesForCompress16(mask_bits);
  const Vec128<uint16_t, N> cu{_mm_permutexvar_epi16(idx.raw, vu.raw)};
  StoreU(BitCast(d, cu), d, unaligned);
#endif  // HWY_TARGET == HWY_AVX3_DL
  return PopCount(mask_bits & ((1ull << N) - 1));
}

template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
HWY_API size_t CompressStore(Vec128<T, N> v, Mask128<T, N> mask,
                             Simd<T, N> /* tag */, T* HWY_RESTRICT unaligned) {
  _mm_mask_compressstoreu_epi32(unaligned, mask.raw, v.raw);
  return PopCount(uint64_t{mask.raw} & ((1ull << N) - 1));
}

template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
HWY_API size_t CompressStore(Vec128<T, N> v, Mask128<T, N> mask,
                             Simd<T, N> /* tag */, T* HWY_RESTRICT unaligned) {
  _mm_mask_compressstoreu_epi64(unaligned, mask.raw, v.raw);
  return PopCount(uint64_t{mask.raw} & ((1ull << N) - 1));
}

template <size_t N, HWY_IF_LE128(float, N)>
HWY_API size_t CompressStore(Vec128<float, N> v, Mask128<float, N> mask,
                             Simd<float, N> /* tag */,
                             float* HWY_RESTRICT unaligned) {
  _mm_mask_compressstoreu_ps(unaligned, mask.raw, v.raw);
  return PopCount(uint64_t{mask.raw} & ((1ull << N) - 1));
}

template <size_t N, HWY_IF_LE128(double, N)>
HWY_API size_t CompressStore(Vec128<double, N> v, Mask128<double, N> mask,
                             Simd<double, N> /* tag */,
                             double* HWY_RESTRICT unaligned) {
  _mm_mask_compressstoreu_pd(unaligned, mask.raw, v.raw);
  return PopCount(uint64_t{mask.raw} & ((1ull << N) - 1));
}

// ------------------------------ CompressBlendedStore (CompressStore)
template <typename T, size_t N>
HWY_API size_t CompressBlendedStore(Vec128<T, N> v, Mask128<T, N> m,
                                    Simd<T, N> d, T* HWY_RESTRICT unaligned) {
  // AVX-512 already does the blending at no extra cost (latency 11,
  // reciprocal throughput 2 - same as compress plus store).
  if (HWY_TARGET == HWY_AVX3_DL || sizeof(T) != 2) {
    // We're relying on the mask to blend. Clear the undefined upper bits.
    if (N != 16 / sizeof(T)) {
      m = And(m, FirstN(d, N));
    }
    return CompressStore(v, m, d, unaligned);
  } else {
    const size_t count = CountTrue(d, m);
    const Vec128<T, N> compressed = Compress(v, m);
    const Vec128<T, N> prev = LoadU(d, unaligned);
    StoreU(IfThenElse(FirstN(d, count), compressed, prev), d, unaligned);
    return count;
  }
}

// ------------------------------ CompressBitsStore (LoadMaskBits)

template <typename T, size_t N>
HWY_API size_t CompressBitsStore(Vec128<T, N> v,
                                 const uint8_t* HWY_RESTRICT bits, Simd<T, N> d,
                                 T* HWY_RESTRICT unaligned) {
  return CompressStore(v, LoadMaskBits(d, bits), d, unaligned);
}
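
// Example usage (illustrative sketch): unlike CompressStore, which may
// overwrite all N output lanes, CompressBlendedStore leaves lanes past the
// compressed prefix unchanged:
//   const Simd<int32_t, 4> d;
//   int32_t out[4] = {-1, -1, -1, -1};
//   const size_t n = CompressBlendedStore(Iota(d, 0), FirstN(d, 2), d, out);
//   // n == 2, out == {0, 1, -1, -1}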

#else  // AVX2 or below

// ------------------------------ LoadMaskBits (TestBit)

namespace detail {

template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N> d, uint64_t mask_bits) {
  const RebindToUnsigned<decltype(d)> du;
  // Easier than Set(), which would require an >8-bit type, which would not
  // compile for T=uint8_t, N=1.
  const Vec128<T, N> vbits{_mm_cvtsi32_si128(static_cast<int>(mask_bits))};

  // Replicate bytes 8x such that each byte contains the bit that governs it.
  alignas(16) constexpr uint8_t kRep8[16] = {0, 0, 0, 0, 0, 0, 0, 0,
                                             1, 1, 1, 1, 1, 1, 1, 1};
  const auto rep8 = TableLookupBytes(vbits, Load(du, kRep8));

  alignas(16) constexpr uint8_t kBit[16] = {1, 2, 4, 8, 16, 32, 64, 128,
                                            1, 2, 4, 8, 16, 32, 64, 128};
  return RebindMask(d, TestBit(rep8, LoadDup128(du, kBit)));
}

template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N> d, uint64_t mask_bits) {
  const RebindToUnsigned<decltype(d)> du;
  alignas(16) constexpr uint16_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128};
  const auto vmask_bits = Set(du, static_cast<uint16_t>(mask_bits));
  return RebindMask(d, TestBit(vmask_bits, Load(du, kBit)));
}

template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N> d, uint64_t mask_bits) {
  const RebindToUnsigned<decltype(d)> du;
  alignas(16) constexpr uint32_t kBit[8] = {1, 2, 4, 8};
  const auto vmask_bits = Set(du, static_cast<uint32_t>(mask_bits));
  return RebindMask(d, TestBit(vmask_bits, Load(du, kBit)));
}

template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N> d, uint64_t mask_bits) {
  const RebindToUnsigned<decltype(d)> du;
  alignas(16) constexpr uint64_t kBit[8] = {1, 2};
  return RebindMask(d, TestBit(Set(du, mask_bits), Load(du, kBit)));
}

}  // namespace detail

// `p` points to at least 8 readable bytes, not all of which need be valid.
template <typename T, size_t N, HWY_IF_LE128(T, N)>
HWY_API Mask128<T, N> LoadMaskBits(Simd<T, N> d,
                                   const uint8_t* HWY_RESTRICT bits) {
  uint64_t mask_bits = 0;
  constexpr size_t kNumBytes = (N + 7) / 8;
  CopyBytes<kNumBytes>(bits, &mask_bits);
  if (N < 8) {
    mask_bits &= (1ull << N) - 1;
  }

  return detail::LoadMaskBits(d, mask_bits);
}
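
// Example usage (illustrative sketch): for 8-bit lanes, the byte-lane path
// above broadcasts bits[0] to lanes 0..7 (and bits[1] to lanes 8..15), then
// TestBit extracts bit i in each lane:
//   const Simd<uint8_t, 8> d;
//   const uint8_t bits[8] = {0x06};
//   const auto m = LoadMaskBits(d, bits);  // lanes 1 and 2 true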

// ------------------------------ StoreMaskBits

namespace detail {

constexpr HWY_INLINE uint64_t U64FromInt(int mask_bits) {
  return static_cast<uint64_t>(static_cast<unsigned>(mask_bits));
}

template <typename T, size_t N>
HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/,
                                 const Mask128<T, N> mask) {
  const Simd<T, N> d;
  const auto sign_bits = BitCast(d, VecFromMask(d, mask)).raw;
  return U64FromInt(_mm_movemask_epi8(sign_bits));
}

template <typename T, size_t N>
HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/,
                                 const Mask128<T, N> mask) {
  // Remove useless lower half of each u16 while preserving the sign bit.
  const auto sign_bits = _mm_packs_epi16(mask.raw, _mm_setzero_si128());
  return U64FromInt(_mm_movemask_epi8(sign_bits));
}

template <typename T, size_t N>
HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/,
                                 const Mask128<T, N> mask) {
  const Simd<T, N> d;
  const Simd<float, N> df;
  const auto sign_bits = BitCast(df, VecFromMask(d, mask));
  return U64FromInt(_mm_movemask_ps(sign_bits.raw));
}

template <typename T, size_t N>
HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/,
                                 const Mask128<T, N> mask) {
  const Simd<T, N> d;
  const Simd<double, N> df;
  const auto sign_bits = BitCast(df, VecFromMask(d, mask));
  return U64FromInt(_mm_movemask_pd(sign_bits.raw));
}

// Returns the lowest N of the _mm_movemask* bits.
template <typename T, size_t N>
constexpr uint64_t OnlyActive(uint64_t mask_bits) {
  return ((N * sizeof(T)) == 16) ? mask_bits : mask_bits & ((1ull << N) - 1);
}

template <typename T, size_t N>
HWY_INLINE uint64_t BitsFromMask(const Mask128<T, N> mask) {
  return OnlyActive<T, N>(BitsFromMask(hwy::SizeTag<sizeof(T)>(), mask));
}

}  // namespace detail

// `p` points to at least 8 writable bytes.
template <typename T, size_t N>
HWY_API size_t StoreMaskBits(const Simd<T, N> /* tag */,
                             const Mask128<T, N> mask, uint8_t* bits) {
  constexpr size_t kNumBytes = (N + 7) / 8;
  const uint64_t mask_bits = detail::BitsFromMask(mask);
  CopyBytes<kNumBytes>(&mask_bits, bits);
  return kNumBytes;
}

// ------------------------------ Mask testing

template <typename T, size_t N>
HWY_API bool AllFalse(const Simd<T, N> /* tag */, const Mask128<T, N> mask) {
  // Cheaper than PTEST, which is 2 uops with 3-cycle latency.
  return detail::BitsFromMask(mask) == 0;
}

template <typename T, size_t N>
HWY_API bool AllTrue(const Simd<T, N> /* tag */, const Mask128<T, N> mask) {
  constexpr uint64_t kAllBits =
      detail::OnlyActive<T, N>((1ull << (16 / sizeof(T))) - 1);
  return detail::BitsFromMask(mask) == kAllBits;
}

template <typename T, size_t N>
HWY_API size_t CountTrue(const Simd<T, N> /* tag */, const Mask128<T, N> mask) {
  return PopCount(detail::BitsFromMask(mask));
}

template <typename T, size_t N>
HWY_API intptr_t FindFirstTrue(const Simd<T, N> /* tag */,
                               const Mask128<T, N> mask) {
  const uint64_t mask_bits = detail::BitsFromMask(mask);
  return mask_bits ? intptr_t(Num0BitsBelowLS1Bit_Nonzero64(mask_bits)) : -1;
}
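
// Example usage (illustrative sketch): BitsFromMask reduces a 4-lane float
// mask to the 4-bit MOVMSKPS result, so the queries below cost one popcount
// or tzcnt on scalar bits:
//   const Simd<float, 4> d;
//   const auto m = Lt(Iota(d, 0.0f), Set(d, 2.0f));  // lanes 0 and 1 true
//   const size_t count = CountTrue(d, m);            // 2
//   const intptr_t first = FindFirstTrue(d, m);      // 0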

// ------------------------------ Compress, CompressBits

namespace detail {

template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
HWY_INLINE Vec128<T, N> IndicesFromBits(Simd<T, N> d, uint64_t mask_bits) {
  HWY_DASSERT(mask_bits < 256);
  const Rebind<uint8_t, decltype(d)> d8;
  const Simd<uint16_t, N> du;

  // compress_epi16 requires VBMI2 and there is no permutevar_epi16, so we need
  // byte indices for PSHUFB (one vector's worth for each of 256 combinations
  // of 8 mask bits). Loading them directly would require 4 KiB. We can instead
  // store lane indices and convert to byte indices (2*lane + 0..1), with the
  // doubling baked into the table. AVX2 Compress32 stores eight 4-bit lane
  // indices (total 1 KiB), broadcasts them into each 32-bit lane and shifts.
  // Here, 16-bit lanes are too narrow to hold all bits, and unpacking nibbles
  // is likely more costly than the higher cache footprint from storing bytes.
  alignas(16) constexpr uint8_t table[2048] = {
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0,
      0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0,
      0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 2, 4, 0, 0, 0, 0,
      0, 0, 0, 2, 4, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0,
      0, 6, 0, 0, 0, 0, 0, 0, 2, 6, 0, 0, 0, 0, 0, 0, 0, 2,
      6, 0, 0, 0, 0, 0, 4, 6, 0, 0, 0, 0, 0, 0, 0, 4, 6, 0,
      0, 0, 0, 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 2, 4, 6, 0, 0,
      0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0,
      2, 8, 0, 0, 0, 0, 0, 0, 0, 2, 8, 0, 0, 0, 0, 0, 4, 8,
      0, 0, 0, 0, 0, 0, 0, 4, 8, 0, 0, 0, 0, 0, 2, 4, 8, 0,
      0, 0, 0, 0, 0, 2, 4, 8, 0, 0, 0, 0, 6, 8, 0, 0, 0, 0,
      0, 0, 0, 6, 8, 0, 0, 0, 0, 0, 2, 6, 8, 0, 0, 0, 0, 0,
      0, 2, 6, 8, 0, 0, 0, 0, 4, 6, 8, 0, 0, 0, 0, 0, 0, 4,
      6, 8, 0, 0, 0, 0, 2, 4, 6, 8, 0, 0, 0, 0, 0, 2, 4, 6,
      8, 0, 0, 0, 10, 0, 0, 0, 0, 0, 0, 0, 0, 10, 0, 0, 0, 0,
      0, 0, 2, 10, 0, 0, 0, 0, 0, 0, 0, 2, 10, 0, 0, 0, 0, 0,
      4, 10, 0, 0, 0, 0, 0, 0, 0, 4, 10, 0, 0, 0, 0, 0, 2, 4,
      10, 0, 0, 0, 0, 0, 0, 2, 4, 10, 0, 0, 0, 0, 6, 10, 0, 0,
      0, 0, 0, 0, 0, 6, 10, 0, 0, 0, 0, 0, 2, 6, 10, 0, 0, 0,
      0, 0, 0, 2, 6, 10, 0, 0, 0, 0, 4, 6, 10, 0, 0, 0, 0, 0,
      0, 4, 6, 10, 0, 0, 0, 0, 2, 4, 6, 10, 0, 0, 0, 0, 0, 2,
      4, 6, 10, 0, 0, 0, 8, 10, 0, 0, 0, 0, 0, 0, 0, 8, 10, 0,
      0, 0, 0, 0, 2, 8, 10, 0, 0, 0, 0, 0, 0, 2, 8, 10, 0, 0,
      0, 0, 4, 8, 10, 0, 0, 0, 0, 0, 0, 4, 8, 10, 0, 0, 0, 0,
      2, 4, 8, 10, 0, 0, 0, 0, 0, 2, 4, 8, 10, 0, 0, 0, 6, 8,
      10, 0, 0, 0, 0, 0, 0, 6, 8, 10, 0, 0, 0, 0, 2, 6, 8, 10,
      0, 0, 0, 0, 0, 2, 6, 8, 10, 0, 0, 0, 4, 6, 8, 10, 0, 0,
      0, 0, 0, 4, 6, 8, 10, 0, 0, 0, 2, 4, 6, 8, 10, 0, 0, 0,
      0, 2, 4, 6, 8, 10, 0, 0, 12, 0, 0, 0, 0, 0, 0, 0, 0, 12,
      0, 0, 0, 0, 0, 0, 2, 12, 0, 0, 0, 0, 0, 0, 0, 2, 12, 0,
      0, 0, 0, 0, 4, 12, 0, 0, 0, 0, 0, 0, 0, 4, 12, 0, 0, 0,
      0, 0, 2, 4, 12, 0, 0, 0, 0, 0, 0, 2, 4, 12, 0, 0, 0, 0,
      6, 12, 0, 0, 0, 0, 0, 0, 0, 6, 12, 0, 0, 0, 0, 0, 2, 6,
      12, 0, 0, 0, 0, 0, 0, 2, 6, 12, 0, 0, 0, 0, 4, 6, 12, 0,
      0, 0, 0, 0, 0, 4, 6, 12, 0, 0, 0, 0, 2, 4, 6, 12, 0, 0,
      0, 0, 0, 2, 4, 6, 12, 0, 0, 0, 8, 12, 0, 0, 0, 0, 0, 0,
      0, 8, 12, 0, 0, 0, 0, 0, 2, 8, 12, 0, 0, 0, 0, 0, 0, 2,
      8, 12, 0, 0, 0, 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 4, 8, 12,
      0, 0, 0, 0, 2, 4, 8, 12, 0, 0, 0, 0, 0, 2, 4, 8, 12, 0,
      0, 0, 6, 8, 12, 0, 0, 0, 0, 0, 0, 6, 8, 12, 0, 0, 0, 0,
      2, 6, 8, 12, 0, 0, 0, 0, 0, 2, 6, 8, 12, 0, 0, 0, 4, 6,
      8, 12, 0, 0, 0, 0, 0, 4, 6, 8, 12, 0, 0, 0, 2, 4, 6, 8,
      12, 0, 0, 0, 0, 2, 4, 6, 8, 12, 0, 0, 10, 12, 0, 0, 0, 0,
      0, 0, 0, 10, 12, 0, 0, 0, 0, 0, 2, 10, 12, 0, 0, 0, 0, 0,
      0, 2, 10, 12, 0, 0, 0, 0, 4, 10, 12, 0, 0, 0, 0, 0, 0, 4,
      10, 12, 0, 0, 0, 0, 2, 4, 10, 12, 0, 0, 0, 0, 0, 2, 4, 10,
      12, 0, 0, 0, 6, 10, 12, 0, 0, 0, 0, 0, 0, 6, 10, 12, 0, 0,
      0, 0, 2, 6, 10, 12, 0, 0, 0, 0, 0, 2, 6, 10, 12, 0, 0, 0,
      4, 6, 10, 12, 0, 0, 0, 0, 0, 4, 6, 10, 12, 0, 0, 0, 2, 4,
      6, 10, 12, 0, 0, 0, 0, 2, 4, 6, 10, 12, 0, 0, 8, 10, 12, 0,
      0, 0, 0, 0, 0, 8, 10, 12, 0, 0, 0, 0, 2, 8, 10, 12, 0, 0,
      0, 0, 0, 2, 8, 10, 12, 0, 0, 0, 4, 8, 10, 12, 0, 0, 0, 0,
      0, 4, 8, 10, 12, 0, 0, 0, 2, 4, 8, 10, 12, 0, 0, 0, 0, 2,
      4, 8, 10, 12, 0, 0, 6, 8, 10, 12, 0, 0, 0, 0, 0, 6, 8, 10,
      12, 0, 0, 0, 2, 6, 8, 10, 12, 0, 0, 0, 0, 2, 6, 8, 10, 12,
      0, 0, 4, 6, 8, 10, 12, 0, 0, 0, 0, 4, 6, 8, 10, 12, 0, 0,
      2, 4, 6, 8, 10, 12, 0, 0, 0, 2, 4, 6, 8, 10, 12, 0, 14, 0,
      0, 0, 0, 0, 0, 0, 0, 14, 0, 0, 0, 0, 0, 0, 2, 14, 0, 0,
      0, 0, 0, 0, 0, 2, 14, 0, 0, 0, 0, 0, 4, 14, 0, 0, 0, 0,
      0, 0, 0, 4, 14, 0, 0, 0, 0, 0, 2, 4, 14, 0, 0, 0, 0, 0,
      0, 2, 4, 14, 0, 0, 0, 0, 6, 14, 0, 0, 0, 0, 0, 0, 0, 6,
      14, 0, 0, 0, 0, 0, 2, 6, 14, 0, 0, 0, 0, 0, 0, 2, 6, 14,
      0, 0, 0, 0, 4, 6, 14, 0, 0, 0, 0, 0, 0, 4, 6, 14, 0, 0,
      0, 0, 2, 4, 6, 14, 0, 0, 0, 0, 0, 2, 4, 6, 14, 0, 0, 0,
      8, 14, 0, 0, 0, 0, 0, 0, 0, 8, 14, 0, 0, 0, 0, 0, 2, 8,
      14, 0, 0, 0, 0, 0, 0, 2, 8, 14, 0, 0, 0, 0, 4, 8, 14, 0,
      0, 0, 0, 0, 0, 4, 8, 14, 0, 0, 0, 0, 2, 4, 8, 14, 0, 0,
      0, 0, 0, 2, 4, 8, 14, 0, 0, 0, 6, 8, 14, 0, 0, 0, 0, 0,
      0, 6, 8, 14, 0, 0, 0, 0, 2, 6, 8, 14, 0, 0, 0, 0, 0, 2,
      6, 8, 14, 0, 0, 0, 4, 6, 8, 14, 0, 0, 0, 0, 0, 4, 6, 8,
      14, 0, 0, 0, 2, 4, 6, 8, 14, 0, 0, 0, 0, 2, 4, 6, 8, 14,
      0, 0, 10, 14, 0, 0, 0, 0, 0, 0, 0, 10, 14, 0, 0, 0, 0, 0,
      2, 10, 14, 0, 0, 0, 0, 0, 0, 2, 10, 14, 0, 0, 0, 0, 4, 10,
      14, 0, 0, 0, 0, 0, 0, 4, 10, 14, 0, 0, 0, 0, 2, 4, 10, 14,
      0, 0, 0, 0, 0, 2, 4, 10, 14, 0, 0, 0, 6, 10, 14, 0, 0, 0,
      0, 0, 0, 6, 10, 14, 0, 0, 0, 0, 2, 6, 10, 14, 0, 0, 0, 0,
      0, 2, 6, 10, 14, 0, 0, 0, 4, 6, 10, 14, 0, 0, 0, 0, 0, 4,
      6, 10, 14, 0, 0, 0, 2, 4, 6, 10, 14, 0, 0, 0, 0, 2, 4, 6,
      10, 14, 0, 0, 8, 10, 14, 0, 0, 0, 0, 0, 0, 8, 10, 14, 0, 0,
      0, 0, 2, 8, 10, 14, 0, 0, 0, 0, 0, 2, 8, 10, 14, 0, 0, 0,
      4, 8, 10, 14, 0, 0, 0, 0, 0, 4, 8, 10, 14, 0, 0, 0, 2, 4,
      8, 10, 14, 0, 0, 0, 0, 2, 4, 8, 10, 14, 0, 0, 6, 8, 10, 14,
      0, 0, 0, 0, 0, 6, 8, 10, 14, 0, 0, 0, 2, 6, 8, 10, 14, 0,
      0, 0, 0, 2, 6, 8, 10, 14, 0, 0, 4, 6, 8, 10, 14, 0, 0, 0,
      0, 4, 6, 8, 10, 14, 0, 0, 2, 4, 6, 8, 10, 14, 0, 0, 0, 2,
      4, 6, 8, 10, 14, 0, 12, 14, 0, 0, 0, 0, 0, 0, 0, 12, 14, 0,
      0, 0, 0, 0, 2, 12, 14, 0, 0, 0, 0, 0, 0, 2, 12, 14, 0, 0,
      0, 0, 4, 12, 14, 0, 0, 0, 0, 0, 0, 4, 12, 14, 0, 0, 0, 0,
      2, 4, 12, 14, 0, 0, 0, 0, 0, 2, 4, 12, 14, 0, 0, 0, 6, 12,
      14, 0, 0, 0, 0, 0, 0, 6, 12, 14, 0, 0, 0, 0, 2, 6, 12, 14,
      0, 0, 0, 0, 0, 2, 6, 12, 14, 0, 0, 0, 4, 6, 12, 14, 0, 0,
      0, 0, 0, 4, 6, 12, 14, 0, 0, 0, 2, 4, 6, 12, 14, 0, 0, 0,
      0, 2, 4, 6, 12, 14, 0, 0, 8, 12, 14, 0, 0, 0, 0, 0, 0, 8,
      12, 14, 0, 0, 0, 0, 2, 8, 12, 14, 0, 0, 0, 0, 0, 2, 8, 12,
      14, 0, 0, 0, 4, 8, 12, 14, 0, 0, 0, 0, 0, 4, 8, 12, 14, 0,
      0, 0, 2, 4, 8, 12, 14, 0, 0, 0, 0, 2, 4, 8, 12, 14, 0, 0,
      6, 8, 12, 14, 0, 0, 0, 0, 0, 6, 8, 12, 14, 0, 0, 0, 2, 6,
      8, 12, 14, 0, 0, 0, 0, 2, 6, 8, 12, 14, 0, 0, 4, 6, 8, 12,
      14, 0, 0, 0, 0, 4, 6, 8, 12, 14, 0, 0, 2, 4, 6, 8, 12, 14,
      0, 0, 0, 2, 4, 6, 8, 12, 14, 0, 10, 12, 14, 0, 0, 0, 0, 0,
      0, 10, 12, 14, 0, 0, 0, 0, 2, 10, 12, 14, 0, 0, 0, 0, 0, 2,
      10, 12, 14, 0, 0, 0, 4, 10, 12, 14, 0, 0, 0, 0, 0, 4, 10, 12,
      14, 0, 0, 0, 2, 4, 10, 12, 14, 0, 0, 0, 0, 2, 4, 10, 12, 14,
      0, 0, 6, 10, 12, 14, 0, 0, 0, 0, 0, 6, 10, 12, 14, 0, 0, 0,
      2, 6, 10, 12, 14, 0, 0, 0, 0, 2, 6, 10, 12, 14, 0, 0, 4, 6,
      10, 12, 14, 0, 0, 0, 0, 4, 6, 10, 12, 14, 0, 0, 2, 4, 6, 10,
      12, 14, 0, 0, 0, 2, 4, 6, 10, 12, 14, 0, 8, 10, 12, 14, 0, 0,
      0, 0, 0, 8, 10, 12, 14, 0, 0, 0, 2, 8, 10, 12, 14, 0, 0, 0,
      0, 2, 8, 10, 12, 14, 0, 0, 4, 8, 10, 12, 14, 0, 0, 0, 0, 4,
      8, 10, 12, 14, 0, 0, 2, 4, 8, 10, 12, 14, 0, 0, 0, 2, 4, 8,
      10, 12, 14, 0, 6, 8, 10, 12, 14, 0, 0, 0, 0, 6, 8, 10, 12, 14,
      0, 0, 2, 6, 8, 10, 12, 14, 0, 0, 0, 2, 6, 8, 10, 12, 14, 0,
      4, 6, 8, 10, 12, 14, 0, 0, 0, 4, 6, 8, 10, 12, 14, 0, 2, 4,
      6, 8, 10, 12, 14, 0, 0, 2, 4, 6, 8, 10, 12, 14};

  const Vec128<uint8_t, 2 * N> byte_idx{Load(d8, table + mask_bits * 8).raw};
  const Vec128<uint16_t, N> pairs = ZipLower(byte_idx, byte_idx);
  return BitCast(d, pairs + Set(du, 0x0100));
}

template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
HWY_INLINE Vec128<T, N> IndicesFromBits(Simd<T, N> d, uint64_t mask_bits) {
  HWY_DASSERT(mask_bits < 16);

  // There are only 4 lanes, so we can afford to load the index vector
  // directly.
  alignas(16) constexpr uint8_t packed_array[256] = {
      0, 1, 2,  3,  0, 1, 2,  3,  0, 1, 2,  3,  0, 1, 2,  3,   //
      0, 1, 2,  3,  0, 1, 2,  3,  0, 1, 2,  3,  0, 1, 2,  3,   //
      4, 5, 6,  7,  0, 1, 2,  3,  0, 1, 2,  3,  0, 1, 2,  3,   //
      0, 1, 2,  3,  4, 5, 6,  7,  0, 1, 2,  3,  0, 1, 2,  3,   //
      8, 9, 10, 11, 0, 1, 2,  3,  0, 1, 2,  3,  0, 1, 2,  3,   //
      0, 1, 2,  3,  8, 9, 10, 11, 0, 1, 2,  3,  0, 1, 2,  3,   //
      4, 5, 6,  7,  8, 9, 10, 11, 0, 1, 2,  3,  0, 1, 2,  3,   //
      0, 1, 2,  3,  4, 5, 6,  7,  8, 9, 10, 11, 0, 1, 2,  3,   //
      12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2,  3,  0, 1, 2,  3,   //
      0, 1, 2,  3,  12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2,  3,   //
      4, 5, 6,  7,  12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2,  3,   //
      0, 1, 2,  3,  4, 5, 6,  7,  12, 13, 14, 15, 0, 1, 2, 3,  //
      8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2,  3,   //
      0, 1, 2,  3,  8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3,  //
      4, 5, 6,  7,  8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3,  //
      0, 1, 2,  3,  4, 5, 6,  7,  8, 9, 10, 11, 12, 13, 14, 15};

  const Repartition<uint8_t, decltype(d)> d8;
  return BitCast(d, Load(d8, packed_array + 16 * mask_bits));
}
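
// Example (illustrative): for the 32-bit table above, mask_bits = 0b0110
// selects lanes 1 and 2; packed_array + 16 * 6 holds the byte indices
// {4,5,6,7, 8,9,10,11, 0,1,2,3, 0,1,2,3}, so TableLookupBytes moves lanes 1
// and 2 to the front.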

template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
HWY_INLINE Vec128<T, N> IndicesFromBits(Simd<T, N> d, uint64_t mask_bits) {
  HWY_DASSERT(mask_bits < 4);

  // There are only 2 lanes, so we can afford to load the index vector
  // directly.
  alignas(16) constexpr uint8_t packed_array[64] = {
      0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,        //
      0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,        //
      8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,  //
      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};

  const Repartition<uint8_t, decltype(d)> d8;
  return BitCast(d, Load(d8, packed_array + 16 * mask_bits));
}

}  // namespace detail

template <typename T, size_t N>
HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> m) {
  const Simd<T, N> d;
  const RebindToUnsigned<decltype(d)> du;

  const uint64_t mask_bits = detail::BitsFromMask(m);
  HWY_DASSERT(mask_bits < (1ull << N));

  const auto indices = BitCast(du, detail::IndicesFromBits(d, mask_bits));
  return BitCast(d, TableLookupBytes(BitCast(du, v), indices));
}

template <typename T, size_t N>
HWY_API Vec128<T, N> CompressBits(Vec128<T, N> v,
                                  const uint8_t* HWY_RESTRICT bits) {
  const Simd<T, N> d;
  const RebindToUnsigned<decltype(d)> du;

  uint64_t mask_bits = 0;
  constexpr size_t kNumBytes = (N + 7) / 8;
  CopyBytes<kNumBytes>(bits, &mask_bits);
  if (N < 8) {
    mask_bits &= (1ull << N) - 1;
  }

  const auto indices = BitCast(du, detail::IndicesFromBits(d, mask_bits));
  return BitCast(d, TableLookupBytes(BitCast(du, v), indices));
}

// ------------------------------ CompressStore, CompressBitsStore

template <typename T, size_t N>
HWY_API size_t CompressStore(Vec128<T, N> v, Mask128<T, N> m, Simd<T, N> d,
                             T* HWY_RESTRICT unaligned) {
  const RebindToUnsigned<decltype(d)> du;

  const uint64_t mask_bits = detail::BitsFromMask(m);
  HWY_DASSERT(mask_bits < (1ull << N));

  // Avoid _mm_maskmoveu_si128 (>500 cycle latency because it bypasses caches).
  const auto indices = BitCast(du, detail::IndicesFromBits(d, mask_bits));
  const auto compressed = BitCast(d, TableLookupBytes(BitCast(du, v), indices));
  StoreU(compressed, d, unaligned);
  return PopCount(mask_bits);
}
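
// Example usage (illustrative sketch): on this path, CompressStore writes a
// full vector (StoreU), so the first PopCount(mask_bits) lanes are the
// compressed result and the remainder are unspecified; the destination must
// therefore have room for all N lanes:
//   const Simd<int32_t, 4> d;
//   int32_t out[4];
//   const size_t n = CompressStore(Iota(d, 0), FirstN(d, 3), d, out);
//   // n == 3; out[0..2] == {0, 1, 2}, out[3] unspecified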

template <typename T, size_t N>
HWY_API size_t CompressBlendedStore(Vec128<T, N> v, Mask128<T, N> m,
                                    Simd<T, N> d, T* HWY_RESTRICT unaligned) {
  const RebindToUnsigned<decltype(d)> du;

  const uint64_t mask_bits = detail::BitsFromMask(m);
  HWY_DASSERT(mask_bits < (1ull << N));
  const size_t count = PopCount(mask_bits);

  // Avoid _mm_maskmoveu_si128 (>500 cycle latency because it bypasses caches).
  const auto indices = BitCast(du, detail::IndicesFromBits(d, mask_bits));
  const auto compressed = BitCast(d, TableLookupBytes(BitCast(du, v), indices));

  const Vec128<T, N> prev = LoadU(d, unaligned);
  StoreU(IfThenElse(FirstN(d, count), compressed, prev), d, unaligned);
  return count;
}

template <typename T, size_t N>
HWY_API size_t CompressBitsStore(Vec128<T, N> v,
                                 const uint8_t* HWY_RESTRICT bits, Simd<T, N> d,
                                 T* HWY_RESTRICT unaligned) {
  const RebindToUnsigned<decltype(d)> du;

  uint64_t mask_bits = 0;
  constexpr size_t kNumBytes = (N + 7) / 8;
  CopyBytes<kNumBytes>(bits, &mask_bits);
  if (N < 8) {
    mask_bits &= (1ull << N) - 1;
  }

  // Avoid _mm_maskmoveu_si128 (>500 cycle latency because it bypasses caches).
  const auto indices = BitCast(du, detail::IndicesFromBits(d, mask_bits));
  const auto compressed = BitCast(d, TableLookupBytes(BitCast(du, v), indices));
  StoreU(compressed, d, unaligned);
  return PopCount(mask_bits);
}

#endif  // HWY_TARGET <= HWY_AVX3

// ------------------------------ StoreInterleaved3 (CombineShiftRightBytes,
// TableLookupBytes)

// 128 bits
HWY_API void StoreInterleaved3(const Vec128<uint8_t> v0,
                               const Vec128<uint8_t> v1,
                               const Vec128<uint8_t> v2, Full128<uint8_t> d,
                               uint8_t* HWY_RESTRICT unaligned) {
  const auto k5 = Set(d, 5);
  const auto k6 = Set(d, 6);

  // Shuffle (v0,v1,v2) vector bytes to (MSB on left): r5, bgr[4:0].
  // 0x80 so lanes to be filled from other vectors are 0 for blending.
  alignas(16) static constexpr uint8_t tbl_r0[16] = {
      0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80,  //
      3, 0x80, 0x80, 4, 0x80, 0x80, 5};
  alignas(16) static constexpr uint8_t tbl_g0[16] = {
      0x80, 0, 0x80, 0x80, 1, 0x80,  //
      0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80};
  const auto shuf_r0 = Load(d, tbl_r0);
  const auto shuf_g0 = Load(d, tbl_g0);  // cannot reuse r0 due to 5 in MSB
  const auto shuf_b0 = CombineShiftRightBytes<15>(d, shuf_g0, shuf_g0);
  const auto r0 = TableLookupBytes(v0, shuf_r0);  // 5..4..3..2..1..0
  const auto g0 = TableLookupBytes(v1, shuf_g0);  // ..4..3..2..1..0.
  const auto b0 = TableLookupBytes(v2, shuf_b0);  // .4..3..2..1..0..
  const auto int0 = r0 | g0 | b0;
  StoreU(int0, d, unaligned + 0 * 16);

  // Second vector: g10,r10, bgr[9:6], b5,g5
  const auto shuf_r1 = shuf_b0 + k6;  // .A..9..8..7..6..
  const auto shuf_g1 = shuf_r0 + k5;  // A..9..8..7..6..5
  const auto shuf_b1 = shuf_g0 + k5;  // ..9..8..7..6..5.
  const auto r1 = TableLookupBytes(v0, shuf_r1);
  const auto g1 = TableLookupBytes(v1, shuf_g1);
  const auto b1 = TableLookupBytes(v2, shuf_b1);
  const auto int1 = r1 | g1 | b1;
  StoreU(int1, d, unaligned + 1 * 16);

  // Third vector: bgr[15:11], b10
  const auto shuf_r2 = shuf_b1 + k6;  // ..F..E..D..C..B.
  const auto shuf_g2 = shuf_r1 + k5;  // .F..E..D..C..B..
  const auto shuf_b2 = shuf_g1 + k5;  // F..E..D..C..B..A
  const auto r2 = TableLookupBytes(v0, shuf_r2);
  const auto g2 = TableLookupBytes(v1, shuf_g2);
  const auto b2 = TableLookupBytes(v2, shuf_b2);
  const auto int2 = r2 | g2 | b2;
  StoreU(int2, d, unaligned + 2 * 16);
}
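
// Example usage (illustrative sketch): interleaving three planar channels
// into 48 bytes of packed RGB:
//   const Full128<uint8_t> d;
//   uint8_t rgb[48];
//   StoreInterleaved3(Set(d, 1), Set(d, 2), Set(d, 3), d, rgb);
//   // rgb == {1,2,3, 1,2,3, ...}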

// 64 bits
HWY_API void StoreInterleaved3(const Vec128<uint8_t, 8> v0,
                               const Vec128<uint8_t, 8> v1,
                               const Vec128<uint8_t, 8> v2, Simd<uint8_t, 8> d,
                               uint8_t* HWY_RESTRICT unaligned) {
  // Use full vectors for the shuffles and first result.
  const Full128<uint8_t> d_full;
  const auto k5 = Set(d_full, 5);
  const auto k6 = Set(d_full, 6);

  const Vec128<uint8_t> full_a{v0.raw};
  const Vec128<uint8_t> full_b{v1.raw};
  const Vec128<uint8_t> full_c{v2.raw};

  // Shuffle (v0,v1,v2) vector bytes to (MSB on left): r5, bgr[4:0].
  // 0x80 so lanes to be filled from other vectors are 0 for blending.
  alignas(16) static constexpr uint8_t tbl_r0[16] = {
      0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80,  //
      3, 0x80, 0x80, 4, 0x80, 0x80, 5};
  alignas(16) static constexpr uint8_t tbl_g0[16] = {
      0x80, 0, 0x80, 0x80, 1, 0x80,  //
      0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80};
  const auto shuf_r0 = Load(d_full, tbl_r0);
  const auto shuf_g0 = Load(d_full, tbl_g0);  // cannot reuse r0 due to 5 in MSB
  const auto shuf_b0 = CombineShiftRightBytes<15>(d_full, shuf_g0, shuf_g0);
  const auto r0 = TableLookupBytes(full_a, shuf_r0);  // 5..4..3..2..1..0
  const auto g0 = TableLookupBytes(full_b, shuf_g0);  // ..4..3..2..1..0.
  const auto b0 = TableLookupBytes(full_c, shuf_b0);  // .4..3..2..1..0..
  const auto int0 = r0 | g0 | b0;
  StoreU(int0, d_full, unaligned + 0 * 16);

  // Second (HALF) vector: bgr[7:6], b5,g5
  const auto shuf_r1 = shuf_b0 + k6;  // ..7..6..
  const auto shuf_g1 = shuf_r0 + k5;  // .7..6..5
  const auto shuf_b1 = shuf_g0 + k5;  // 7..6..5.
  const auto r1 = TableLookupBytes(full_a, shuf_r1);
  const auto g1 = TableLookupBytes(full_b, shuf_g1);
  const auto b1 = TableLookupBytes(full_c, shuf_b1);
  const decltype(Zero(d)) int1{(r1 | g1 | b1).raw};
  StoreU(int1, d, unaligned + 1 * 16);
}

// <= 32 bits
template <size_t N, HWY_IF_LE32(uint8_t, N)>
HWY_API void StoreInterleaved3(const Vec128<uint8_t, N> v0,
                               const Vec128<uint8_t, N> v1,
                               const Vec128<uint8_t, N> v2,
                               Simd<uint8_t, N> /*tag*/,
                               uint8_t* HWY_RESTRICT unaligned) {
  // Use full vectors for the shuffles and result.
  const Full128<uint8_t> d_full;

  const Vec128<uint8_t> full_a{v0.raw};
  const Vec128<uint8_t> full_b{v1.raw};
  const Vec128<uint8_t> full_c{v2.raw};

  // Shuffle (v0,v1,v2) vector bytes to bgr[3:0].
  // 0x80 so lanes to be filled from other vectors are 0 for blending.
  alignas(16) static constexpr uint8_t tbl_r0[16] = {
      0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80,  //
      0x80, 0x80, 0x80, 0x80};
  const auto shuf_r0 = Load(d_full, tbl_r0);
  const auto shuf_g0 = CombineShiftRightBytes<15>(d_full, shuf_r0, shuf_r0);
  const auto shuf_b0 = CombineShiftRightBytes<14>(d_full, shuf_r0, shuf_r0);
  const auto r0 = TableLookupBytes(full_a, shuf_r0);  // ......3..2..1..0
  const auto g0 = TableLookupBytes(full_b, shuf_g0);  // .....3..2..1..0.
  const auto b0 = TableLookupBytes(full_c, shuf_b0);  // ....3..2..1..0..
  const auto int0 = r0 | g0 | b0;
  alignas(16) uint8_t buf[16];
  StoreU(int0, d_full, buf);
  CopyBytes<N * 3>(buf, unaligned);
}

// ------------------------------ StoreInterleaved4

// 128 bits
HWY_API void StoreInterleaved4(const Vec128<uint8_t> v0,
                               const Vec128<uint8_t> v1,
                               const Vec128<uint8_t> v2,
                               const Vec128<uint8_t> v3, Full128<uint8_t> d8,
                               uint8_t* HWY_RESTRICT unaligned) {
  const RepartitionToWide<decltype(d8)> d16;
  const RepartitionToWide<decltype(d16)> d32;
  // let a,b,c,d denote v0..3.
  const auto ba0 = ZipLower(d16, v0, v1);  // b7 a7 .. b0 a0
  const auto dc0 = ZipLower(d16, v2, v3);  // d7 c7 .. d0 c0
  const auto ba8 = ZipUpper(d16, v0, v1);
  const auto dc8 = ZipUpper(d16, v2, v3);
  const auto dcba_0 = ZipLower(d32, ba0, dc0);  // d..a3 d..a0
  const auto dcba_4 = ZipUpper(d32, ba0, dc0);  // d..a7 d..a4
  const auto dcba_8 = ZipLower(d32, ba8, dc8);  // d..aB d..a8
  const auto dcba_C = ZipUpper(d32, ba8, dc8);  // d..aF d..aC
  StoreU(BitCast(d8, dcba_0), d8, unaligned + 0 * 16);
  StoreU(BitCast(d8, dcba_4), d8, unaligned + 1 * 16);
  StoreU(BitCast(d8, dcba_8), d8, unaligned + 2 * 16);
  StoreU(BitCast(d8, dcba_C), d8, unaligned + 3 * 16);
}

// 64 bits
HWY_API void StoreInterleaved4(const Vec128<uint8_t, 8> in0,
                               const Vec128<uint8_t, 8> in1,
                               const Vec128<uint8_t, 8> in2,
                               const Vec128<uint8_t, 8> in3,
                               Simd<uint8_t, 8> /*tag*/,
                               uint8_t* HWY_RESTRICT unaligned) {
  // Use full vectors to reduce the number of stores.
  const Full128<uint8_t> d_full8;
  const RepartitionToWide<decltype(d_full8)> d16;
  const RepartitionToWide<decltype(d16)> d32;
  const Vec128<uint8_t> v0{in0.raw};
  const Vec128<uint8_t> v1{in1.raw};
  const Vec128<uint8_t> v2{in2.raw};
  const Vec128<uint8_t> v3{in3.raw};
  // let a,b,c,d denote v0..3.
  const auto ba0 = ZipLower(d16, v0, v1);       // b7 a7 .. b0 a0
  const auto dc0 = ZipLower(d16, v2, v3);       // d7 c7 .. d0 c0
  const auto dcba_0 = ZipLower(d32, ba0, dc0);  // d..a3 d..a0
  const auto dcba_4 = ZipUpper(d32, ba0, dc0);  // d..a7 d..a4
  StoreU(BitCast(d_full8, dcba_0), d_full8, unaligned + 0 * 16);
  StoreU(BitCast(d_full8, dcba_4), d_full8, unaligned + 1 * 16);
}

// <= 32 bits
template <size_t N, HWY_IF_LE32(uint8_t, N)>
HWY_API void StoreInterleaved4(const Vec128<uint8_t, N> in0,
                               const Vec128<uint8_t, N> in1,
                               const Vec128<uint8_t, N> in2,
                               const Vec128<uint8_t, N> in3,
                               Simd<uint8_t, N> /*tag*/,
                               uint8_t* HWY_RESTRICT unaligned) {
  // Use full vectors to reduce the number of stores.
  const Full128<uint8_t> d_full8;
  const RepartitionToWide<decltype(d_full8)> d16;
  const RepartitionToWide<decltype(d16)> d32;
  const Vec128<uint8_t> v0{in0.raw};
  const Vec128<uint8_t> v1{in1.raw};
  const Vec128<uint8_t> v2{in2.raw};
  const Vec128<uint8_t> v3{in3.raw};
  // let a,b,c,d denote v0..3.
  const auto ba0 = ZipLower(d16, v0, v1);       // b3 a3 .. b0 a0
  const auto dc0 = ZipLower(d16, v2, v3);       // d3 c3 .. d0 c0
  const auto dcba_0 = ZipLower(d32, ba0, dc0);  // d..a3 d..a0
  alignas(16) uint8_t buf[16];
  StoreU(BitCast(d_full8, dcba_0), d_full8, buf);
  CopyBytes<4 * N>(buf, unaligned);
}
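
// Example usage (illustrative sketch): packing four planar channels into
// 64 bytes of RGBA:
//   const Full128<uint8_t> d;
//   uint8_t rgba[64];
//   StoreInterleaved4(Set(d, 'R'), Set(d, 'G'), Set(d, 'B'), Set(d, 'A'), d,
//                     rgba);
//   // rgba == {'R','G','B','A', 'R','G','B','A', ...}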

// ------------------------------ Reductions

namespace detail {

// N=1 for any T: no-op
template <typename T>
HWY_INLINE Vec128<T, 1> SumOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
                                   const Vec128<T, 1> v) {
  return v;
}
template <typename T>
HWY_INLINE Vec128<T, 1> MinOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
                                   const Vec128<T, 1> v) {
  return v;
}
template <typename T>
HWY_INLINE Vec128<T, 1> MaxOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
                                   const Vec128<T, 1> v) {
  return v;
}

// u32/i32/f32:

// N=2
template <typename T>
HWY_INLINE Vec128<T, 2> SumOfLanes(hwy::SizeTag<4> /* tag */,
                                   const Vec128<T, 2> v10) {
  return v10 + Shuffle2301(v10);
}
template <typename T>
HWY_INLINE Vec128<T, 2> MinOfLanes(hwy::SizeTag<4> /* tag */,
                                   const Vec128<T, 2> v10) {
  return Min(v10, Shuffle2301(v10));
}
template <typename T>
HWY_INLINE Vec128<T, 2> MaxOfLanes(hwy::SizeTag<4> /* tag */,
                                   const Vec128<T, 2> v10) {
  return Max(v10, Shuffle2301(v10));
}

// N=4 (full)
template <typename T>
HWY_INLINE Vec128<T> SumOfLanes(hwy::SizeTag<4> /* tag */,
                                const Vec128<T> v3210) {
  const Vec128<T> v1032 = Shuffle1032(v3210);
  const Vec128<T> v31_20_31_20 = v3210 + v1032;
  const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
  return v20_31_20_31 + v31_20_31_20;
}
template <typename T>
HWY_INLINE Vec128<T> MinOfLanes(hwy::SizeTag<4> /* tag */,
                                const Vec128<T> v3210) {
  const Vec128<T> v1032 = Shuffle1032(v3210);
  const Vec128<T> v31_20_31_20 = Min(v3210, v1032);
  const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
  return Min(v20_31_20_31, v31_20_31_20);
}
template <typename T>
HWY_INLINE Vec128<T> MaxOfLanes(hwy::SizeTag<4> /* tag */,
                                const Vec128<T> v3210) {
  const Vec128<T> v1032 = Shuffle1032(v3210);
  const Vec128<T> v31_20_31_20 = Max(v3210, v1032);
  const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
  return Max(v20_31_20_31, v31_20_31_20);
}

// u64/i64/f64:

// N=2 (full)
template <typename T>
HWY_INLINE Vec128<T> SumOfLanes(hwy::SizeTag<8> /* tag */,
                                const Vec128<T> v10) {
  const Vec128<T> v01 = Shuffle01(v10);
  return v10 + v01;
}
template <typename T>
HWY_INLINE Vec128<T> MinOfLanes(hwy::SizeTag<8> /* tag */,
                                const Vec128<T> v10) {
  const Vec128<T> v01 = Shuffle01(v10);
  return Min(v10, v01);
}
template <typename T>
HWY_INLINE Vec128<T> MaxOfLanes(hwy::SizeTag<8> /* tag */,
                                const Vec128<T> v10) {
  const Vec128<T> v01 = Shuffle01(v10);
  return Max(v10, v01);
}

// u16/i16
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2), HWY_IF_GE32(T, N)>
HWY_API Vec128<T, N> MinOfLanes(hwy::SizeTag<2> /* tag */, Vec128<T, N> v) {
  const Repartition<int32_t, Simd<T, N>> d32;
  const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
  const auto odd = ShiftRight<16>(BitCast(d32, v));
  const auto min = MinOfLanes(d32, Min(even, odd));
  // Also broadcast into odd lanes.
  return BitCast(Simd<T, N>(), Or(min, ShiftLeft<16>(min)));
}
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2), HWY_IF_GE32(T, N)>
HWY_API Vec128<T, N> MaxOfLanes(hwy::SizeTag<2> /* tag */, Vec128<T, N> v) {
  const Repartition<int32_t, Simd<T, N>> d32;
  const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
  const auto odd = ShiftRight<16>(BitCast(d32, v));
  const auto max = MaxOfLanes(d32, Max(even, odd));
  // Also broadcast into odd lanes.
  return BitCast(Simd<T, N>(), Or(max, ShiftLeft<16>(max)));
}

}  // namespace detail

// Supported for u/i/f 32/64. Returns the same value in each lane.
template <typename T, size_t N>
HWY_API Vec128<T, N> SumOfLanes(Simd<T, N> /* tag */, const Vec128<T, N> v) {
  return detail::SumOfLanes(hwy::SizeTag<sizeof(T)>(), v);
}
template <typename T, size_t N>
HWY_API Vec128<T, N> MinOfLanes(Simd<T, N> /* tag */, const Vec128<T, N> v) {
  return detail::MinOfLanes(hwy::SizeTag<sizeof(T)>(), v);
}
template <typename T, size_t N>
HWY_API Vec128<T, N> MaxOfLanes(Simd<T, N> /* tag */, const Vec128<T, N> v) {
  return detail::MaxOfLanes(hwy::SizeTag<sizeof(T)>(), v);
}
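
// Example usage (illustrative sketch): reductions broadcast their result to
// every lane rather than returning a scalar; extract it with GetLane:
//   const Simd<int32_t, 4> d;
//   const auto v = Iota(d, 1);                      // {1, 2, 3, 4}
//   const int32_t sum = GetLane(SumOfLanes(d, v));  // 10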

// ================================================== DEPRECATED

template <typename T, size_t N>
HWY_API size_t StoreMaskBits(const Mask128<T, N> mask, uint8_t* bits) {
  return StoreMaskBits(Simd<T, N>(), mask, bits);
}

template <typename T, size_t N>
HWY_API bool AllTrue(const Mask128<T, N> mask) {
  return AllTrue(Simd<T, N>(), mask);
}

template <typename T, size_t N>
HWY_API bool AllFalse(const Mask128<T, N> mask) {
  return AllFalse(Simd<T, N>(), mask);
}

template <typename T, size_t N>
HWY_API size_t CountTrue(const Mask128<T, N> mask) {
  return CountTrue(Simd<T, N>(), mask);
}

template <typename T, size_t N>
HWY_API Vec128<T, N> SumOfLanes(const Vec128<T, N> v) {
  return SumOfLanes(Simd<T, N>(), v);
}
template <typename T, size_t N>
HWY_API Vec128<T, N> MinOfLanes(const Vec128<T, N> v) {
  return MinOfLanes(Simd<T, N>(), v);
}
template <typename T, size_t N>
HWY_API Vec128<T, N> MaxOfLanes(const Vec128<T, N> v) {
  return MaxOfLanes(Simd<T, N>(), v);
}

template <typename T, size_t N>
HWY_API Vec128<T, (N + 1) / 2> UpperHalf(Vec128<T, N> v) {
  return UpperHalf(Half<Simd<T, N>>(), v);
}

template <int kBytes, typename T, size_t N>
HWY_API Vec128<T, N> ShiftRightBytes(const Vec128<T, N> v) {
  return ShiftRightBytes<kBytes>(Simd<T, N>(), v);
}

template <int kLanes, typename T, size_t N>
HWY_API Vec128<T, N> ShiftRightLanes(const Vec128<T, N> v) {
  return ShiftRightLanes<kLanes>(Simd<T, N>(), v);
}

template <size_t kBytes, typename T, size_t N>
HWY_API Vec128<T, N> CombineShiftRightBytes(Vec128<T, N> hi, Vec128<T, N> lo) {
  return CombineShiftRightBytes<kBytes>(Simd<T, N>(), hi, lo);
}

template <typename T, size_t N>
HWY_API Vec128<T, N> InterleaveUpper(Vec128<T, N> a, Vec128<T, N> b) {
  return InterleaveUpper(Simd<T, N>(), a, b);
}

template <typename T, size_t N, class D = Simd<T, N>>
HWY_API VFromD<RepartitionToWide<D>> ZipUpper(Vec128<T, N> a, Vec128<T, N> b) {
  return InterleaveUpper(RepartitionToWide<D>(), a, b);
}

template <typename T, size_t N2>
HWY_API Vec128<T, N2 * 2> Combine(Vec128<T, N2> hi2, Vec128<T, N2> lo2) {
  return Combine(Simd<T, N2 * 2>(), hi2, lo2);
}

template <typename T, size_t N2, HWY_IF_LE64(T, N2)>
HWY_API Vec128<T, N2 * 2> ZeroExtendVector(Vec128<T, N2> lo) {
  return ZeroExtendVector(Simd<T, N2 * 2>(), lo);
}

template <typename T, size_t N>
HWY_API Vec128<T, N> ConcatLowerLower(Vec128<T, N> hi, Vec128<T, N> lo) {
  return ConcatLowerLower(Simd<T, N>(), hi, lo);
}

template <typename T, size_t N>
HWY_API Vec128<T, N> ConcatUpperUpper(Vec128<T, N> hi, Vec128<T, N> lo) {
  return ConcatUpperUpper(Simd<T, N>(), hi, lo);
}

template <typename T, size_t N>
HWY_API Vec128<T, N> ConcatLowerUpper(const Vec128<T, N> hi,
                                      const Vec128<T, N> lo) {
  return ConcatLowerUpper(Simd<T, N>(), hi, lo);
}

template <typename T, size_t N>
HWY_API Vec128<T, N> ConcatUpperLower(Vec128<T, N> hi, Vec128<T, N> lo) {
  return ConcatUpperLower(Simd<T, N>(), hi, lo);
}

// ================================================== Operator wrapper

// These apply to all x86_*-inl.h because there are no restrictions on V.

template <class V>
HWY_API V Add(V a, V b) {
  return a + b;
}
template <class V>
HWY_API V Sub(V a, V b) {
  return a - b;
}

template <class V>
HWY_API V Mul(V a, V b) {
  return a * b;
}
template <class V>
HWY_API V Div(V a, V b) {
  return a / b;
}

template <class V>
HWY_API V Shl(V a, V b) {
  return a << b;
}
template <class V>
HWY_API V Shr(V a, V b) {
  return a >> b;
}

template <class V>
HWY_API auto Eq(V a, V b) -> decltype(a == b) {
  return a == b;
}
template <class V>
HWY_API auto Ne(V a, V b) -> decltype(a == b) {
  return a != b;
}
template <class V>
HWY_API auto Lt(V a, V b) -> decltype(a == b) {
  return a < b;
}

template <class V>
HWY_API auto Gt(V a, V b) -> decltype(a == b) {
  return a > b;
}
template <class V>
HWY_API auto Ge(V a, V b) -> decltype(a == b) {
  return a >= b;
}

template <class V>
HWY_API auto Le(V a, V b) -> decltype(a == b) {
  return a <= b;
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();