1 // Copyright 2020 Google LLC 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 #ifndef HIGHWAY_HWY_BASE_H_ 16 #define HIGHWAY_HWY_BASE_H_ 17 18 // For SIMD module implementations and their callers, target-independent. 19 20 #include <stddef.h> 21 #include <stdint.h> 22 23 #include <atomic> 24 #include <cfloat> 25 26 #include "hwy/detect_compiler_arch.h" 27 #include "hwy/highway_export.h" 28 29 //------------------------------------------------------------------------------ 30 // Compiler-specific definitions 31 32 #define HWY_STR_IMPL(macro) #macro 33 #define HWY_STR(macro) HWY_STR_IMPL(macro) 34 35 #if HWY_COMPILER_MSVC 36 37 #include <intrin.h> 38 39 #define HWY_RESTRICT __restrict 40 #define HWY_INLINE __forceinline 41 #define HWY_NOINLINE __declspec(noinline) 42 #define HWY_FLATTEN 43 #define HWY_NORETURN __declspec(noreturn) 44 #define HWY_LIKELY(expr) (expr) 45 #define HWY_UNLIKELY(expr) (expr) 46 #define HWY_PRAGMA(tokens) __pragma(tokens) 47 #define HWY_DIAGNOSTICS(tokens) HWY_PRAGMA(warning(tokens)) 48 #define HWY_DIAGNOSTICS_OFF(msc, gcc) HWY_DIAGNOSTICS(msc) 49 #define HWY_MAYBE_UNUSED 50 #define HWY_HAS_ASSUME_ALIGNED 0 51 #if (_MSC_VER >= 1700) 52 #define HWY_MUST_USE_RESULT _Check_return_ 53 #else 54 #define HWY_MUST_USE_RESULT 55 #endif 56 57 #else 58 59 #define HWY_RESTRICT __restrict__ 60 #define HWY_INLINE inline __attribute__((always_inline)) 61 #define HWY_NOINLINE __attribute__((noinline)) 62 #define HWY_FLATTEN 
__attribute__((flatten)) 63 #define HWY_NORETURN __attribute__((noreturn)) 64 #define HWY_LIKELY(expr) __builtin_expect(!!(expr), 1) 65 #define HWY_UNLIKELY(expr) __builtin_expect(!!(expr), 0) 66 #define HWY_PRAGMA(tokens) _Pragma(#tokens) 67 #define HWY_DIAGNOSTICS(tokens) HWY_PRAGMA(GCC diagnostic tokens) 68 #define HWY_DIAGNOSTICS_OFF(msc, gcc) HWY_DIAGNOSTICS(gcc) 69 // Encountered "attribute list cannot appear here" when using the C++17 70 // [[maybe_unused]], so only use the old style attribute for now. 71 #define HWY_MAYBE_UNUSED __attribute__((unused)) 72 #define HWY_MUST_USE_RESULT __attribute__((warn_unused_result)) 73 74 #endif // !HWY_COMPILER_MSVC 75 76 //------------------------------------------------------------------------------ 77 // Builtin/attributes 78 79 // Enables error-checking of format strings. 80 #if HWY_HAS_ATTRIBUTE(__format__) 81 #define HWY_FORMAT(idx_fmt, idx_arg) \ 82 __attribute__((__format__(__printf__, idx_fmt, idx_arg))) 83 #else 84 #define HWY_FORMAT(idx_fmt, idx_arg) 85 #endif 86 87 // Returns a void* pointer which the compiler then assumes is N-byte aligned. 88 // Example: float* HWY_RESTRICT aligned = (float*)HWY_ASSUME_ALIGNED(in, 32); 89 // 90 // The assignment semantics are required by GCC/Clang. ICC provides an in-place 91 // __assume_aligned, whereas MSVC's __assume appears unsuitable. 92 #if HWY_HAS_BUILTIN(__builtin_assume_aligned) 93 #define HWY_ASSUME_ALIGNED(ptr, align) __builtin_assume_aligned((ptr), (align)) 94 #else 95 #define HWY_ASSUME_ALIGNED(ptr, align) (ptr) /* not supported */ 96 #endif 97 98 // Clang and GCC require attributes on each function into which SIMD intrinsics 99 // are inlined. Support both per-function annotation (HWY_ATTR) for lambdas and 100 // automatic annotation via pragmas. 
#if HWY_COMPILER_CLANG
// Applies the given target attribute to every function until the matching
// HWY_POP_ATTRIBUTES.
#define HWY_PUSH_ATTRIBUTES(targets_str)                                \
  HWY_PRAGMA(clang attribute push(__attribute__((target(targets_str))), \
                                  apply_to = function))
#define HWY_POP_ATTRIBUTES HWY_PRAGMA(clang attribute pop)
#elif HWY_COMPILER_GCC
// Saves the current options, then switches to the given target string.
#define HWY_PUSH_ATTRIBUTES(targets_str) \
  HWY_PRAGMA(GCC push_options) HWY_PRAGMA(GCC target targets_str)
#define HWY_POP_ATTRIBUTES HWY_PRAGMA(GCC pop_options)
#else
// Other compilers: both macros expand to nothing.
#define HWY_PUSH_ATTRIBUTES(targets_str)
#define HWY_POP_ATTRIBUTES
#endif

//------------------------------------------------------------------------------
// Macros

// Prefix for functions defined in headers: internal linkage, inlined and
// flattened where the compiler supports it, and no unused-function warning.
#define HWY_API static HWY_INLINE HWY_FLATTEN HWY_MAYBE_UNUSED

// Pastes two tokens together after expanding them; the extra level of
// indirection lets the arguments be expanded before `##` is applied.
#define HWY_CONCAT_IMPL(a, b) a##b
#define HWY_CONCAT(a, b) HWY_CONCAT_IMPL(a, b)

// Note: each argument is evaluated twice, so avoid side effects.
#define HWY_MIN(a, b) ((a) < (b) ? (a) : (b))
#define HWY_MAX(a, b) ((a) > (b) ? (a) : (b))

// Compile-time fence to prevent undesirable code reordering. On Clang x86, the
// typical asm volatile("" : : : "memory") has no effect, whereas atomic fence
// does, without generating code.
#if HWY_ARCH_X86
#define HWY_FENCE std::atomic_thread_fence(std::memory_order_acq_rel)
#else
// TODO(janwas): investigate alternatives. On ARM, the above generates barriers.
#define HWY_FENCE
#endif

// 4 instances of a given literal value, useful as input to LoadDup128.
#define HWY_REP4(literal) literal, literal, literal, literal

// Forwards the current file and line plus a printf-style format string and
// its arguments to ::hwy::Abort. `##__VA_ARGS__` drops the preceding comma
// when no extra arguments are given.
#define HWY_ABORT(format, ...) \
  ::hwy::Abort(__FILE__, __LINE__, format, ##__VA_ARGS__)

// Always enabled.
143 #define HWY_ASSERT(condition) \ 144 do { \ 145 if (!(condition)) { \ 146 HWY_ABORT("Assert %s", #condition); \ 147 } \ 148 } while (0) 149 150 #if HWY_HAS_FEATURE(memory_sanitizer) || defined(MEMORY_SANITIZER) 151 #define HWY_IS_MSAN 1 152 #else 153 #define HWY_IS_MSAN 0 154 #endif 155 156 #if HWY_HAS_FEATURE(address_sanitizer) || defined(ADDRESS_SANITIZER) 157 #define HWY_IS_ASAN 1 158 #else 159 #define HWY_IS_ASAN 0 160 #endif 161 162 #if HWY_HAS_FEATURE(thread_sanitizer) || defined(THREAD_SANITIZER) 163 #define HWY_IS_TSAN 1 164 #else 165 #define HWY_IS_TSAN 0 166 #endif 167 168 // For enabling HWY_DASSERT and shortening tests in slower debug builds 169 #if !defined(HWY_IS_DEBUG_BUILD) 170 // Clang does not define NDEBUG, but it and GCC define __OPTIMIZE__, and recent 171 // MSVC defines NDEBUG (if not, could instead check _DEBUG). 172 #if (!defined(__OPTIMIZE__) && !defined(NDEBUG)) || HWY_IS_ASAN || \ 173 HWY_IS_MSAN || HWY_IS_TSAN || defined(__clang_analyzer__) 174 #define HWY_IS_DEBUG_BUILD 1 175 #else 176 #define HWY_IS_DEBUG_BUILD 0 177 #endif 178 #endif // HWY_IS_DEBUG_BUILD 179 180 #if HWY_IS_DEBUG_BUILD 181 #define HWY_DASSERT(condition) HWY_ASSERT(condition) 182 #else 183 #define HWY_DASSERT(condition) \ 184 do { \ 185 } while (0) 186 #endif 187 188 namespace hwy { 189 190 //------------------------------------------------------------------------------ 191 // kMaxVectorSize (undocumented, pending removal) 192 193 #if HWY_ARCH_X86 194 static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 64; // AVX-512 195 #elif HWY_ARCH_RVV && defined(__riscv_vector) 196 // Not actually an upper bound on the size. 197 static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 4096; 198 #else 199 static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 16; 200 #endif 201 202 //------------------------------------------------------------------------------ 203 // Alignment 204 205 // Potentially useful for LoadDup128 and capped vectors. 
In other cases, arrays 206 // should be allocated dynamically via aligned_allocator.h because Lanes() may 207 // exceed the stack size. 208 #if HWY_ARCH_X86 209 #define HWY_ALIGN_MAX alignas(64) 210 #elif HWY_ARCH_RVV && defined(__riscv_vector) 211 #define HWY_ALIGN_MAX alignas(8) // only elements need be aligned 212 #else 213 #define HWY_ALIGN_MAX alignas(16) 214 #endif 215 216 //------------------------------------------------------------------------------ 217 // Lane types 218 219 // Match [u]int##_t naming scheme so rvv-inl.h macros can obtain the type name 220 // by concatenating base type and bits. 221 222 #if HWY_ARCH_ARM && (__ARM_FP & 2) 223 #define HWY_NATIVE_FLOAT16 1 224 #else 225 #define HWY_NATIVE_FLOAT16 0 226 #endif 227 228 #pragma pack(push, 1) 229 230 #if HWY_NATIVE_FLOAT16 231 using float16_t = __fp16; 232 // Clang does not allow __fp16 arguments, but scalar.h requires LaneType 233 // arguments, so use a wrapper. 234 // TODO(janwas): replace with _Float16 when that is supported? 
235 #else 236 struct float16_t { 237 uint16_t bits; 238 }; 239 #endif 240 241 struct bfloat16_t { 242 uint16_t bits; 243 }; 244 245 #pragma pack(pop) 246 247 using float32_t = float; 248 using float64_t = double; 249 250 //------------------------------------------------------------------------------ 251 // Controlling overload resolution (SFINAE) 252 253 template <bool Condition> 254 struct EnableIfT {}; 255 template <> 256 struct EnableIfT<true> { 257 using type = void; 258 }; 259 260 template <bool Condition> 261 using EnableIf = typename EnableIfT<Condition>::type; 262 263 template <typename T, typename U> 264 struct IsSameT { 265 enum { value = 0 }; 266 }; 267 268 template <typename T> 269 struct IsSameT<T, T> { 270 enum { value = 1 }; 271 }; 272 273 template <typename T, typename U> 274 HWY_API constexpr bool IsSame() { 275 return IsSameT<T, U>::value; 276 } 277 278 // Insert into template/function arguments to enable this overload only for 279 // vectors of AT MOST this many bits. 280 // 281 // Note that enabling for exactly 128 bits is unnecessary because a function can 282 // simply be overloaded with Vec128<T> and/or Full128<T> tag. Enabling for other 283 // sizes (e.g. 64 bit) can be achieved via Simd<T, 8 / sizeof(T), 0>. 
// Size constraints for N lanes of type T; the thresholds are in bytes.
// _LE*: at most, _GE*: at least, _GT128: strictly more than 128 bits.
#define HWY_IF_LE128(T, N) hwy::EnableIf<N * sizeof(T) <= 16>* = nullptr
#define HWY_IF_LE64(T, N) hwy::EnableIf<N * sizeof(T) <= 8>* = nullptr
#define HWY_IF_LE32(T, N) hwy::EnableIf<N * sizeof(T) <= 4>* = nullptr
#define HWY_IF_GE32(T, N) hwy::EnableIf<N * sizeof(T) >= 4>* = nullptr
#define HWY_IF_GE64(T, N) hwy::EnableIf<N * sizeof(T) >= 8>* = nullptr
#define HWY_IF_GE128(T, N) hwy::EnableIf<N * sizeof(T) >= 16>* = nullptr
// Parentheses are required so that `>` is not parsed as the end of the
// template argument list.
#define HWY_IF_GT128(T, N) hwy::EnableIf<(N * sizeof(T) > 16)>* = nullptr

// Type-category constraints. All helper names are qualified with hwy:: so the
// macros also work when expanded outside namespace hwy.
#define HWY_IF_UNSIGNED(T) hwy::EnableIf<!hwy::IsSigned<T>()>* = nullptr
// Signed integers only: floating-point types also report IsSigned, so they
// are excluded explicitly.
#define HWY_IF_SIGNED(T) \
  hwy::EnableIf<hwy::IsSigned<T>() && !hwy::IsFloat<T>()>* = nullptr
#define HWY_IF_FLOAT(T) hwy::EnableIf<hwy::IsFloat<T>()>* = nullptr
#define HWY_IF_NOT_FLOAT(T) hwy::EnableIf<!hwy::IsFloat<T>()>* = nullptr

// Constraints on the lane size in bytes.
#define HWY_IF_LANE_SIZE(T, bytes) \
  hwy::EnableIf<sizeof(T) == (bytes)>* = nullptr
#define HWY_IF_NOT_LANE_SIZE(T, bytes) \
  hwy::EnableIf<sizeof(T) != (bytes)>* = nullptr

// Empty struct used as a size tag type.
304 template <size_t N> 305 struct SizeTag {}; 306 307 template <class T> 308 struct RemoveConstT { 309 using type = T; 310 }; 311 template <class T> 312 struct RemoveConstT<const T> { 313 using type = T; 314 }; 315 316 template <class T> 317 using RemoveConst = typename RemoveConstT<T>::type; 318 319 //------------------------------------------------------------------------------ 320 // Type relations 321 322 namespace detail { 323 324 template <typename T> 325 struct Relations; 326 template <> 327 struct Relations<uint8_t> { 328 using Unsigned = uint8_t; 329 using Signed = int8_t; 330 using Wide = uint16_t; 331 }; 332 template <> 333 struct Relations<int8_t> { 334 using Unsigned = uint8_t; 335 using Signed = int8_t; 336 using Wide = int16_t; 337 }; 338 template <> 339 struct Relations<uint16_t> { 340 using Unsigned = uint16_t; 341 using Signed = int16_t; 342 using Wide = uint32_t; 343 using Narrow = uint8_t; 344 }; 345 template <> 346 struct Relations<int16_t> { 347 using Unsigned = uint16_t; 348 using Signed = int16_t; 349 using Wide = int32_t; 350 using Narrow = int8_t; 351 }; 352 template <> 353 struct Relations<uint32_t> { 354 using Unsigned = uint32_t; 355 using Signed = int32_t; 356 using Float = float; 357 using Wide = uint64_t; 358 using Narrow = uint16_t; 359 }; 360 template <> 361 struct Relations<int32_t> { 362 using Unsigned = uint32_t; 363 using Signed = int32_t; 364 using Float = float; 365 using Wide = int64_t; 366 using Narrow = int16_t; 367 }; 368 template <> 369 struct Relations<uint64_t> { 370 using Unsigned = uint64_t; 371 using Signed = int64_t; 372 using Float = double; 373 using Narrow = uint32_t; 374 }; 375 template <> 376 struct Relations<int64_t> { 377 using Unsigned = uint64_t; 378 using Signed = int64_t; 379 using Float = double; 380 using Narrow = int32_t; 381 }; 382 template <> 383 struct Relations<float16_t> { 384 using Unsigned = uint16_t; 385 using Signed = int16_t; 386 using Float = float16_t; 387 using Wide = float; 388 }; 389 
template <> 390 struct Relations<bfloat16_t> { 391 using Unsigned = uint16_t; 392 using Signed = int16_t; 393 using Wide = float; 394 }; 395 template <> 396 struct Relations<float> { 397 using Unsigned = uint32_t; 398 using Signed = int32_t; 399 using Float = float; 400 using Wide = double; 401 using Narrow = float16_t; 402 }; 403 template <> 404 struct Relations<double> { 405 using Unsigned = uint64_t; 406 using Signed = int64_t; 407 using Float = double; 408 using Narrow = float; 409 }; 410 411 template <size_t N> 412 struct TypeFromSize; 413 template <> 414 struct TypeFromSize<1> { 415 using Unsigned = uint8_t; 416 using Signed = int8_t; 417 }; 418 template <> 419 struct TypeFromSize<2> { 420 using Unsigned = uint16_t; 421 using Signed = int16_t; 422 }; 423 template <> 424 struct TypeFromSize<4> { 425 using Unsigned = uint32_t; 426 using Signed = int32_t; 427 using Float = float; 428 }; 429 template <> 430 struct TypeFromSize<8> { 431 using Unsigned = uint64_t; 432 using Signed = int64_t; 433 using Float = double; 434 }; 435 436 } // namespace detail 437 438 // Aliases for types of a different category, but the same size. 439 template <typename T> 440 using MakeUnsigned = typename detail::Relations<T>::Unsigned; 441 template <typename T> 442 using MakeSigned = typename detail::Relations<T>::Signed; 443 template <typename T> 444 using MakeFloat = typename detail::Relations<T>::Float; 445 446 // Aliases for types of the same category, but different size. 447 template <typename T> 448 using MakeWide = typename detail::Relations<T>::Wide; 449 template <typename T> 450 using MakeNarrow = typename detail::Relations<T>::Narrow; 451 452 // Obtain type from its size [bytes]. 
453 template <size_t N> 454 using UnsignedFromSize = typename detail::TypeFromSize<N>::Unsigned; 455 template <size_t N> 456 using SignedFromSize = typename detail::TypeFromSize<N>::Signed; 457 template <size_t N> 458 using FloatFromSize = typename detail::TypeFromSize<N>::Float; 459 460 //------------------------------------------------------------------------------ 461 // Type traits 462 463 template <typename T> 464 HWY_API constexpr bool IsFloat() { 465 // Cannot use T(1.25) != T(1) for float16_t, which can only be converted to or 466 // from a float, not compared. 467 return IsSame<T, float>() || IsSame<T, double>(); 468 } 469 470 template <typename T> 471 HWY_API constexpr bool IsSigned() { 472 return T(0) > T(-1); 473 } 474 template <> 475 constexpr bool IsSigned<float16_t>() { 476 return true; 477 } 478 template <> 479 constexpr bool IsSigned<bfloat16_t>() { 480 return true; 481 } 482 483 // Largest/smallest representable integer values. 484 template <typename T> 485 HWY_API constexpr T LimitsMax() { 486 static_assert(!IsFloat<T>(), "Only for integer types"); 487 using TU = MakeUnsigned<T>; 488 return static_cast<T>(IsSigned<T>() ? (static_cast<TU>(~0ull) >> 1) 489 : static_cast<TU>(~0ull)); 490 } 491 template <typename T> 492 HWY_API constexpr T LimitsMin() { 493 static_assert(!IsFloat<T>(), "Only for integer types"); 494 return IsSigned<T>() ? T(-1) - LimitsMax<T>() : T(0); 495 } 496 497 // Largest/smallest representable value (integer or float). This naming avoids 498 // confusion with numeric_limits<float>::min() (the smallest positive value). 
499 template <typename T> 500 HWY_API constexpr T LowestValue() { 501 return LimitsMin<T>(); 502 } 503 template <> 504 constexpr float LowestValue<float>() { 505 return -FLT_MAX; 506 } 507 template <> 508 constexpr double LowestValue<double>() { 509 return -DBL_MAX; 510 } 511 512 template <typename T> 513 HWY_API constexpr T HighestValue() { 514 return LimitsMax<T>(); 515 } 516 template <> 517 constexpr float HighestValue<float>() { 518 return FLT_MAX; 519 } 520 template <> 521 constexpr double HighestValue<double>() { 522 return DBL_MAX; 523 } 524 525 // Returns bitmask of the exponent field in IEEE binary32/64. 526 template <typename T> 527 constexpr T ExponentMask() { 528 static_assert(sizeof(T) == 0, "Only instantiate the specializations"); 529 return 0; 530 } 531 template <> 532 constexpr uint32_t ExponentMask<uint32_t>() { 533 return 0x7F800000; 534 } 535 template <> 536 constexpr uint64_t ExponentMask<uint64_t>() { 537 return 0x7FF0000000000000ULL; 538 } 539 540 // Returns bitmask of the mantissa field in IEEE binary32/64. 541 template <typename T> 542 constexpr T MantissaMask() { 543 static_assert(sizeof(T) == 0, "Only instantiate the specializations"); 544 return 0; 545 } 546 template <> 547 constexpr uint32_t MantissaMask<uint32_t>() { 548 return 0x007FFFFF; 549 } 550 template <> 551 constexpr uint64_t MantissaMask<uint64_t>() { 552 return 0x000FFFFFFFFFFFFFULL; 553 } 554 555 // Returns 1 << mantissa_bits as a floating-point number. All integers whose 556 // absolute value are less than this can be represented exactly. 557 template <typename T> 558 constexpr T MantissaEnd() { 559 static_assert(sizeof(T) == 0, "Only instantiate the specializations"); 560 return 0; 561 } 562 template <> 563 constexpr float MantissaEnd<float>() { 564 return 8388608.0f; // 1 << 23 565 } 566 template <> 567 constexpr double MantissaEnd<double>() { 568 // floating point literal with p52 requires C++17. 
569 return 4503599627370496.0; // 1 << 52 570 } 571 572 //------------------------------------------------------------------------------ 573 // Helper functions 574 575 template <typename T1, typename T2> 576 constexpr inline T1 DivCeil(T1 a, T2 b) { 577 return (a + b - 1) / b; 578 } 579 580 // Works for any `align`; if a power of two, compiler emits ADD+AND. 581 constexpr inline size_t RoundUpTo(size_t what, size_t align) { 582 return DivCeil(what, align) * align; 583 } 584 585 // Undefined results for x == 0. 586 HWY_API size_t Num0BitsBelowLS1Bit_Nonzero32(const uint32_t x) { 587 #if HWY_COMPILER_MSVC 588 unsigned long index; // NOLINT 589 _BitScanForward(&index, x); 590 return index; 591 #else // HWY_COMPILER_MSVC 592 return static_cast<size_t>(__builtin_ctz(x)); 593 #endif // HWY_COMPILER_MSVC 594 } 595 596 HWY_API size_t Num0BitsBelowLS1Bit_Nonzero64(const uint64_t x) { 597 #if HWY_COMPILER_MSVC 598 #if HWY_ARCH_X86_64 599 unsigned long index; // NOLINT 600 _BitScanForward64(&index, x); 601 return index; 602 #else // HWY_ARCH_X86_64 603 // _BitScanForward64 not available 604 uint32_t lsb = static_cast<uint32_t>(x & 0xFFFFFFFF); 605 unsigned long index; 606 if (lsb == 0) { 607 uint32_t msb = static_cast<uint32_t>(x >> 32u); 608 _BitScanForward(&index, msb); 609 return 32 + index; 610 } else { 611 _BitScanForward(&index, lsb); 612 return index; 613 } 614 #endif // HWY_ARCH_X86_64 615 #else // HWY_COMPILER_MSVC 616 return static_cast<size_t>(__builtin_ctzll(x)); 617 #endif // HWY_COMPILER_MSVC 618 } 619 620 // Undefined results for x == 0. 
621 HWY_API size_t Num0BitsAboveMS1Bit_Nonzero32(const uint32_t x) { 622 #if HWY_COMPILER_MSVC 623 unsigned long index; // NOLINT 624 _BitScanReverse(&index, x); 625 return 31 - index; 626 #else // HWY_COMPILER_MSVC 627 return static_cast<size_t>(__builtin_clz(x)); 628 #endif // HWY_COMPILER_MSVC 629 } 630 631 HWY_API size_t Num0BitsAboveMS1Bit_Nonzero64(const uint64_t x) { 632 #if HWY_COMPILER_MSVC 633 #if HWY_ARCH_X86_64 634 unsigned long index; // NOLINT 635 _BitScanReverse64(&index, x); 636 return 63 - index; 637 #else // HWY_ARCH_X86_64 638 // _BitScanReverse64 not available 639 const uint32_t msb = static_cast<uint32_t>(x >> 32u); 640 unsigned long index; 641 if (msb == 0) { 642 const uint32_t lsb = static_cast<uint32_t>(x & 0xFFFFFFFF); 643 _BitScanReverse(&index, lsb); 644 return 63 - index; 645 } else { 646 _BitScanReverse(&index, msb); 647 return 31 - index; 648 } 649 #endif // HWY_ARCH_X86_64 650 #else // HWY_COMPILER_MSVC 651 return static_cast<size_t>(__builtin_clzll(x)); 652 #endif // HWY_COMPILER_MSVC 653 } 654 655 HWY_API size_t PopCount(uint64_t x) { 656 #if HWY_COMPILER_CLANG || HWY_COMPILER_GCC 657 return static_cast<size_t>(__builtin_popcountll(x)); 658 // This instruction has a separate feature flag, but is often called from 659 // non-SIMD code, so we don't want to require dynamic dispatch. It was first 660 // supported by Intel in Nehalem (SSE4.2), but MSVC only predefines a macro 661 // for AVX, so check for that. 
662 #elif HWY_COMPILER_MSVC && HWY_ARCH_X86_64 && defined(__AVX__) 663 return _mm_popcnt_u64(x); 664 #elif HWY_COMPILER_MSVC && HWY_ARCH_X86_32 && defined(__AVX__) 665 return _mm_popcnt_u32(uint32_t(x)) + _mm_popcnt_u32(uint32_t(x >> 32)); 666 #else 667 x -= ((x >> 1) & 0x5555555555555555ULL); 668 x = (((x >> 2) & 0x3333333333333333ULL) + (x & 0x3333333333333333ULL)); 669 x = (((x >> 4) + x) & 0x0F0F0F0F0F0F0F0FULL); 670 x += (x >> 8); 671 x += (x >> 16); 672 x += (x >> 32); 673 return static_cast<size_t>(x & 0x7Fu); 674 #endif 675 } 676 677 // Skip HWY_API due to GCC "function not considered for inlining". Previously 678 // such errors were caused by underlying type mismatches, but it's not clear 679 // what is still mismatched despite all the casts. 680 template <typename TI> 681 /*HWY_API*/ constexpr size_t FloorLog2(TI x) { 682 return x == TI{1} 683 ? 0 684 : static_cast<size_t>(FloorLog2(static_cast<TI>(x >> 1)) + 1); 685 } 686 687 template <typename TI> 688 /*HWY_API*/ constexpr size_t CeilLog2(TI x) { 689 return x == TI{1} 690 ? 
0 691 : static_cast<size_t>(FloorLog2(static_cast<TI>(x - 1)) + 1); 692 } 693 694 #if HWY_COMPILER_MSVC && HWY_ARCH_X86_64 695 #pragma intrinsic(_umul128) 696 #endif 697 698 // 64 x 64 = 128 bit multiplication 699 HWY_API uint64_t Mul128(uint64_t a, uint64_t b, uint64_t* HWY_RESTRICT upper) { 700 #if defined(__SIZEOF_INT128__) 701 __uint128_t product = (__uint128_t)a * (__uint128_t)b; 702 *upper = (uint64_t)(product >> 64); 703 return (uint64_t)(product & 0xFFFFFFFFFFFFFFFFULL); 704 #elif HWY_COMPILER_MSVC && HWY_ARCH_X86_64 705 return _umul128(a, b, upper); 706 #else 707 constexpr uint64_t kLo32 = 0xFFFFFFFFU; 708 const uint64_t lo_lo = (a & kLo32) * (b & kLo32); 709 const uint64_t hi_lo = (a >> 32) * (b & kLo32); 710 const uint64_t lo_hi = (a & kLo32) * (b >> 32); 711 const uint64_t hi_hi = (a >> 32) * (b >> 32); 712 const uint64_t t = (lo_lo >> 32) + (hi_lo & kLo32) + lo_hi; 713 *upper = (hi_lo >> 32) + (t >> 32) + hi_hi; 714 return (t << 32) | (lo_lo & kLo32); 715 #endif 716 } 717 718 // The source/destination must not overlap/alias. 
719 template <size_t kBytes, typename From, typename To> 720 HWY_API void CopyBytes(const From* from, To* to) { 721 #if HWY_COMPILER_MSVC 722 const uint8_t* HWY_RESTRICT from_bytes = 723 reinterpret_cast<const uint8_t*>(from); 724 uint8_t* HWY_RESTRICT to_bytes = reinterpret_cast<uint8_t*>(to); 725 for (size_t i = 0; i < kBytes; ++i) { 726 to_bytes[i] = from_bytes[i]; 727 } 728 #else 729 // Avoids horrible codegen on Clang (series of PINSRB) 730 __builtin_memcpy(to, from, kBytes); 731 #endif 732 } 733 734 HWY_API float F32FromBF16(bfloat16_t bf) { 735 uint32_t bits = bf.bits; 736 bits <<= 16; 737 float f; 738 CopyBytes<4>(&bits, &f); 739 return f; 740 } 741 742 HWY_API bfloat16_t BF16FromF32(float f) { 743 uint32_t bits; 744 CopyBytes<4>(&f, &bits); 745 bfloat16_t bf; 746 bf.bits = static_cast<uint16_t>(bits >> 16); 747 return bf; 748 } 749 750 HWY_DLLEXPORT HWY_NORETURN void HWY_FORMAT(3, 4) 751 Abort(const char* file, int line, const char* format, ...); 752 753 } // namespace hwy 754 755 #endif // HIGHWAY_HWY_BASE_H_ 756