1 // Copyright 2020 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #ifndef HIGHWAY_HWY_BASE_H_
16 #define HIGHWAY_HWY_BASE_H_
17 
18 // For SIMD module implementations and their callers, target-independent.
19 
20 #include <stddef.h>
21 #include <stdint.h>
22 
23 #include <atomic>
24 #include <cfloat>
25 
26 #include "hwy/detect_compiler_arch.h"
27 #include "hwy/highway_export.h"
28 
29 //------------------------------------------------------------------------------
30 // Compiler-specific definitions
31 
// Two-level stringification: HWY_STR(macro) first expands `macro` and then
// turns the result into a string literal, whereas HWY_STR_IMPL stringifies its
// argument verbatim.
#define HWY_STR_IMPL(macro) #macro
#define HWY_STR(macro) HWY_STR_IMPL(macro)
34 
// Every macro in this section is defined twice: once using MSVC keywords and
// once using GCC-style attributes/builtins for all other compilers.
#if HWY_COMPILER_MSVC

#include <intrin.h>

#define HWY_RESTRICT __restrict
#define HWY_INLINE __forceinline
#define HWY_NOINLINE __declspec(noinline)
// Expands to nothing in the MSVC branch.
#define HWY_FLATTEN
#define HWY_NORETURN __declspec(noreturn)
// Branch hints are no-ops here: the expression is passed through unchanged.
#define HWY_LIKELY(expr) (expr)
#define HWY_UNLIKELY(expr) (expr)
#define HWY_PRAGMA(tokens) __pragma(tokens)
#define HWY_DIAGNOSTICS(tokens) HWY_PRAGMA(warning(tokens))
// Takes one argument per compiler family and forwards only the MSVC one.
#define HWY_DIAGNOSTICS_OFF(msc, gcc) HWY_DIAGNOSTICS(msc)
// Expands to nothing in the MSVC branch.
#define HWY_MAYBE_UNUSED
#define HWY_HAS_ASSUME_ALIGNED 0
// _Check_return_ is only used for _MSC_VER >= 1700; otherwise no annotation.
#if (_MSC_VER >= 1700)
#define HWY_MUST_USE_RESULT _Check_return_
#else
#define HWY_MUST_USE_RESULT
#endif

#else

#define HWY_RESTRICT __restrict__
#define HWY_INLINE inline __attribute__((always_inline))
#define HWY_NOINLINE __attribute__((noinline))
#define HWY_FLATTEN __attribute__((flatten))
#define HWY_NORETURN __attribute__((noreturn))
// `!!` normalizes the expression to 0/1 before passing it to the builtin.
#define HWY_LIKELY(expr) __builtin_expect(!!(expr), 1)
#define HWY_UNLIKELY(expr) __builtin_expect(!!(expr), 0)
#define HWY_PRAGMA(tokens) _Pragma(#tokens)
#define HWY_DIAGNOSTICS(tokens) HWY_PRAGMA(GCC diagnostic tokens)
// Takes one argument per compiler family and forwards only the GCC one.
#define HWY_DIAGNOSTICS_OFF(msc, gcc) HWY_DIAGNOSTICS(gcc)
// Encountered "attribute list cannot appear here" when using the C++17
// [[maybe_unused]], so only use the old style attribute for now.
#define HWY_MAYBE_UNUSED __attribute__((unused))
#define HWY_MUST_USE_RESULT __attribute__((warn_unused_result))

#endif  // !HWY_COMPILER_MSVC
75 
76 //------------------------------------------------------------------------------
77 // Builtin/attributes
78 
// Enables error-checking of printf-style format strings.
// idx_fmt is the 1-based position of the format-string parameter and idx_arg
// the position of the first variadic argument (see the Abort declaration at
// the end of this header, which uses HWY_FORMAT(3, 4)). Expands to nothing if
// the compiler does not report support for the __format__ attribute.
#if HWY_HAS_ATTRIBUTE(__format__)
#define HWY_FORMAT(idx_fmt, idx_arg) \
  __attribute__((__format__(__printf__, idx_fmt, idx_arg)))
#else
#define HWY_FORMAT(idx_fmt, idx_arg)
#endif
86 
// Returns a void* pointer which the compiler then assumes is N-byte aligned.
// Example: float* HWY_RESTRICT aligned = (float*)HWY_ASSUME_ALIGNED(in, 32);
//
// The assignment semantics are required by GCC/Clang. ICC provides an in-place
// __assume_aligned, whereas MSVC's __assume appears unsuitable.
//
// If __builtin_assume_aligned is unavailable, the macro yields `ptr` unchanged
// (with its original type) and no alignment assumption is made.
#if HWY_HAS_BUILTIN(__builtin_assume_aligned)
#define HWY_ASSUME_ALIGNED(ptr, align) __builtin_assume_aligned((ptr), (align))
#else
#define HWY_ASSUME_ALIGNED(ptr, align) (ptr) /* not supported */
#endif
97 
// Clang and GCC require attributes on each function into which SIMD intrinsics
// are inlined. Support both per-function annotation (HWY_ATTR) for lambdas and
// automatic annotation via pragmas.
//
// HWY_PUSH_ATTRIBUTES(targets_str) applies the target string to subsequent
// functions until HWY_POP_ATTRIBUTES: via `clang attribute push/pop` on Clang
// and `GCC push_options`/`GCC target`/`GCC pop_options` on GCC. On any other
// compiler both macros expand to nothing.
#if HWY_COMPILER_CLANG
#define HWY_PUSH_ATTRIBUTES(targets_str)                                \
  HWY_PRAGMA(clang attribute push(__attribute__((target(targets_str))), \
                                  apply_to = function))
#define HWY_POP_ATTRIBUTES HWY_PRAGMA(clang attribute pop)
#elif HWY_COMPILER_GCC
#define HWY_PUSH_ATTRIBUTES(targets_str) \
  HWY_PRAGMA(GCC push_options) HWY_PRAGMA(GCC target targets_str)
#define HWY_POP_ATTRIBUTES HWY_PRAGMA(GCC pop_options)
#else
#define HWY_PUSH_ATTRIBUTES(targets_str)
#define HWY_POP_ATTRIBUTES
#endif
114 
115 //------------------------------------------------------------------------------
116 // Macros
117 
// Prefix for functions defined in headers: `static` plus the inline, flatten
// and maybe-unused annotations (each of which may be empty, depending on the
// compiler).
#define HWY_API static HWY_INLINE HWY_FLATTEN HWY_MAYBE_UNUSED

// Token pasting; the extra level ensures macro arguments are expanded before
// being concatenated.
#define HWY_CONCAT_IMPL(a, b) a##b
#define HWY_CONCAT(a, b) HWY_CONCAT_IMPL(a, b)

// NOTE: function-like macros, so an argument may be evaluated more than once;
// avoid arguments with side effects.
#define HWY_MIN(a, b) ((a) < (b) ? (a) : (b))
#define HWY_MAX(a, b) ((a) > (b) ? (a) : (b))
125 
// Compile-time fence to prevent undesirable code reordering. On Clang x86, the
// typical asm volatile("" : : : "memory") has no effect, whereas atomic fence
// does, without generating code.
// On non-x86 targets HWY_FENCE currently expands to nothing, i.e. it provides
// no ordering there (see TODO below).
#if HWY_ARCH_X86
#define HWY_FENCE std::atomic_thread_fence(std::memory_order_acq_rel)
#else
// TODO(janwas): investigate alternatives. On ARM, the above generates barriers.
#define HWY_FENCE
#endif
135 
// 4 instances of a given literal value, useful as input to LoadDup128.
#define HWY_REP4(literal) literal, literal, literal, literal

// Calls hwy::Abort (declared noreturn at the end of this header) with the
// current file/line and a printf-style message. `##__VA_ARGS__` allows the
// variadic part to be empty.
#define HWY_ABORT(format, ...) \
  ::hwy::Abort(__FILE__, __LINE__, format, ##__VA_ARGS__)

// Always enabled. Aborts with the stringified condition if it evaluates to
// false. The do/while(0) wrapper makes the macro usable as a single statement.
#define HWY_ASSERT(condition)             \
  do {                                    \
    if (!(condition)) {                   \
      HWY_ABORT("Assert %s", #condition); \
    }                                     \
  } while (0)
149 
// HWY_IS_MSAN/ASAN/TSAN are always defined, to 1 if the respective sanitizer
// is detected via HWY_HAS_FEATURE or the corresponding *_SANITIZER macro is
// defined, otherwise to 0.
#if HWY_HAS_FEATURE(memory_sanitizer) || defined(MEMORY_SANITIZER)
#define HWY_IS_MSAN 1
#else
#define HWY_IS_MSAN 0
#endif

#if HWY_HAS_FEATURE(address_sanitizer) || defined(ADDRESS_SANITIZER)
#define HWY_IS_ASAN 1
#else
#define HWY_IS_ASAN 0
#endif

#if HWY_HAS_FEATURE(thread_sanitizer) || defined(THREAD_SANITIZER)
#define HWY_IS_TSAN 1
#else
#define HWY_IS_TSAN 0
#endif
167 
// For enabling HWY_DASSERT and shortening tests in slower debug builds.
// May be predefined by the build system, in which case that value is used.
#if !defined(HWY_IS_DEBUG_BUILD)
// Clang does not define NDEBUG, but it and GCC define __OPTIMIZE__, and recent
// MSVC defines NDEBUG (if not, could instead check _DEBUG).
// Sanitizer and static-analyzer builds are also treated as debug builds.
#if (!defined(__OPTIMIZE__) && !defined(NDEBUG)) || HWY_IS_ASAN || \
    HWY_IS_MSAN || HWY_IS_TSAN || defined(__clang_analyzer__)
#define HWY_IS_DEBUG_BUILD 1
#else
#define HWY_IS_DEBUG_BUILD 0
#endif
#endif  // HWY_IS_DEBUG_BUILD

// Debug-only assertion. In non-debug builds the condition is neither
// evaluated nor compiled, so it must not have required side effects.
#if HWY_IS_DEBUG_BUILD
#define HWY_DASSERT(condition) HWY_ASSERT(condition)
#else
#define HWY_DASSERT(condition) \
  do {                         \
  } while (0)
#endif
187 
188 namespace hwy {
189 
190 //------------------------------------------------------------------------------
191 // kMaxVectorSize (undocumented, pending removal)
192 
// Vector size in bytes assumed per architecture: 64 on x86, 4096 when the
// RISC-V vector extension is enabled, otherwise 16.
#if HWY_ARCH_X86
static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 64;  // AVX-512
#elif HWY_ARCH_RVV && defined(__riscv_vector)
// Not actually an upper bound on the size.
static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 4096;
#else
static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 16;
#endif
201 
202 //------------------------------------------------------------------------------
203 // Alignment
204 
// Potentially useful for LoadDup128 and capped vectors. In other cases, arrays
// should be allocated dynamically via aligned_allocator.h because Lanes() may
// exceed the stack size.
// Expands to an alignas() specifier: 64 bytes on x86, 8 bytes with the RISC-V
// vector extension, otherwise 16 bytes.
#if HWY_ARCH_X86
#define HWY_ALIGN_MAX alignas(64)
#elif HWY_ARCH_RVV && defined(__riscv_vector)
#define HWY_ALIGN_MAX alignas(8)  // only elements need be aligned
#else
#define HWY_ALIGN_MAX alignas(16)
#endif
215 
216 //------------------------------------------------------------------------------
217 // Lane types
218 
219 // Match [u]int##_t naming scheme so rvv-inl.h macros can obtain the type name
220 // by concatenating base type and bits.
221 
// HWY_NATIVE_FLOAT16 is 1 if float16_t is the compiler's __fp16 type, else 0.
// NOTE(review): bit 1 of __ARM_FP presumably signals half-precision support
// (per ACLE) - confirm.
#if HWY_ARCH_ARM && (__ARM_FP & 2)
#define HWY_NATIVE_FLOAT16 1
#else
#define HWY_NATIVE_FLOAT16 0
#endif

// Byte packing applies to the structs declared until the matching pop.
#pragma pack(push, 1)

#if HWY_NATIVE_FLOAT16
using float16_t = __fp16;
// Clang does not allow __fp16 arguments, but scalar.h requires LaneType
// arguments, so use a wrapper.
// NOTE(review): this branch declares a plain alias, not a wrapper; the
// sentence above may be stale or refer to code outside this header - confirm.
// TODO(janwas): replace with _Float16 when that is supported?
#else
// Raw 16-bit storage used when no native half-precision type is available.
struct float16_t {
  uint16_t bits;
};
#endif

// Raw 16-bit storage for bfloat16. F32FromBF16/BF16FromF32 (later in this
// header) treat `bits` as the upper 16 bits of a 32-bit float.
struct bfloat16_t {
  uint16_t bits;
};

#pragma pack(pop)

// Names follow the same base-type + bit-count scheme as [u]int##_t.
using float32_t = float;
using float64_t = double;
249 
250 //------------------------------------------------------------------------------
251 // Controlling overload resolution (SFINAE)
252 
// Minimal enable_if replacement: the member `type` (void) exists only in the
// <true> specialization, so EnableIf<false> is ill-formed and removes the
// overload during template argument substitution.
template <bool Condition>
struct EnableIfT {};
template <>
struct EnableIfT<true> {
  using type = void;
};

// Shorthand for the nested type; void if Condition is true.
template <bool Condition>
using EnableIf = typename EnableIfT<Condition>::type;

// IsSameT<T, U>::value is 1 if T and U are the same type, otherwise 0.
template <typename T, typename U>
struct IsSameT {
  enum { value = 0 };
};

template <typename T>
struct IsSameT<T, T> {
  enum { value = 1 };
};

// Function form of IsSameT; returns true iff T and U are the same type.
template <typename T, typename U>
HWY_API constexpr bool IsSame() {
  return IsSameT<T, U>::value;
}
277 
// Insert into template/function arguments to enable this overload only for
// vectors of AT MOST (LE) / AT LEAST (GE) / MORE THAN (GT) this many bits.
//
// Note that enabling for exactly 128 bits is unnecessary because a function can
// simply be overloaded with Vec128<T> and/or Full128<T> tag. Enabling for other
// sizes (e.g. 64 bit) can be achieved via Simd<T, 8 / sizeof(T), 0>.
//
// Each macro expands to an unnamed, defaulted template parameter of type
// hwy::EnableIf<cond>*; if `cond` is false, substitution fails and the overload
// is removed. T is the lane type and N the number of lanes. N and the entire
// condition are parenthesized so that expressions (e.g. `N / 2`) can be passed
// and so that comparison operators are never parsed as template brackets.
#define HWY_IF_LE128(T, N) hwy::EnableIf<((N) * sizeof(T) <= 16)>* = nullptr
#define HWY_IF_LE64(T, N) hwy::EnableIf<((N) * sizeof(T) <= 8)>* = nullptr
#define HWY_IF_LE32(T, N) hwy::EnableIf<((N) * sizeof(T) <= 4)>* = nullptr
#define HWY_IF_GE32(T, N) hwy::EnableIf<((N) * sizeof(T) >= 4)>* = nullptr
#define HWY_IF_GE64(T, N) hwy::EnableIf<((N) * sizeof(T) >= 8)>* = nullptr
#define HWY_IF_GE128(T, N) hwy::EnableIf<((N) * sizeof(T) >= 16)>* = nullptr
#define HWY_IF_GT128(T, N) hwy::EnableIf<((N) * sizeof(T) > 16)>* = nullptr

// Enable the overload depending on the category of lane type T. All helpers
// are qualified with hwy:: because calls without function arguments are not
// found via argument-dependent lookup, hence unqualified names would only
// resolve inside namespace hwy (or after a using-directive).
#define HWY_IF_UNSIGNED(T) hwy::EnableIf<!hwy::IsSigned<T>()>* = nullptr
#define HWY_IF_SIGNED(T) \
  hwy::EnableIf<hwy::IsSigned<T>() && !hwy::IsFloat<T>()>* = nullptr
#define HWY_IF_FLOAT(T) hwy::EnableIf<hwy::IsFloat<T>()>* = nullptr
#define HWY_IF_NOT_FLOAT(T) hwy::EnableIf<!hwy::IsFloat<T>()>* = nullptr

// Enable the overload only if sizeof(T) is (not) equal to `bytes`.
#define HWY_IF_LANE_SIZE(T, bytes) \
  hwy::EnableIf<sizeof(T) == (bytes)>* = nullptr
#define HWY_IF_NOT_LANE_SIZE(T, bytes) \
  hwy::EnableIf<sizeof(T) != (bytes)>* = nullptr
302 
// Empty struct used as a size tag type, e.g. to select an overload based on
// a compile-time size N.
template <size_t N>
struct SizeTag {};

// RemoveConstT<T>::type is T with a top-level `const` removed (if any).
template <class T>
struct RemoveConstT {
  using type = T;
};
template <class T>
struct RemoveConstT<const T> {
  using type = T;
};

// Shorthand for RemoveConstT<T>::type.
template <class T>
using RemoveConst = typename RemoveConstT<T>::type;
318 
319 //------------------------------------------------------------------------------
320 // Type relations
321 
322 namespace detail {
323 
// Maps a lane type T to related lane types. Only specializations exist; each
// declares a subset of the following members:
// - Unsigned/Signed/Float: type of the respective category with the same size
//   as T (Float is absent for 8/16-bit integers and for bfloat16_t);
// - Wide/Narrow: type of the same category with twice/half the size (Wide is
//   absent for 64-bit types, Narrow for 8-bit types and the 16-bit floats).
// Using an alias for a member that is not declared is a compile error.
template <typename T>
struct Relations;
template <>
struct Relations<uint8_t> {
  using Unsigned = uint8_t;
  using Signed = int8_t;
  using Wide = uint16_t;
};
template <>
struct Relations<int8_t> {
  using Unsigned = uint8_t;
  using Signed = int8_t;
  using Wide = int16_t;
};
template <>
struct Relations<uint16_t> {
  using Unsigned = uint16_t;
  using Signed = int16_t;
  using Wide = uint32_t;
  using Narrow = uint8_t;
};
template <>
struct Relations<int16_t> {
  using Unsigned = uint16_t;
  using Signed = int16_t;
  using Wide = int32_t;
  using Narrow = int8_t;
};
template <>
struct Relations<uint32_t> {
  using Unsigned = uint32_t;
  using Signed = int32_t;
  using Float = float;
  using Wide = uint64_t;
  using Narrow = uint16_t;
};
template <>
struct Relations<int32_t> {
  using Unsigned = uint32_t;
  using Signed = int32_t;
  using Float = float;
  using Wide = int64_t;
  using Narrow = int16_t;
};
template <>
struct Relations<uint64_t> {
  using Unsigned = uint64_t;
  using Signed = int64_t;
  using Float = double;
  using Narrow = uint32_t;
};
template <>
struct Relations<int64_t> {
  using Unsigned = uint64_t;
  using Signed = int64_t;
  using Float = double;
  using Narrow = int32_t;
};
// The 16-bit float types widen to float.
template <>
struct Relations<float16_t> {
  using Unsigned = uint16_t;
  using Signed = int16_t;
  using Float = float16_t;
  using Wide = float;
};
template <>
struct Relations<bfloat16_t> {
  using Unsigned = uint16_t;
  using Signed = int16_t;
  using Wide = float;
};
// Note that float narrows to float16_t (not bfloat16_t).
template <>
struct Relations<float> {
  using Unsigned = uint32_t;
  using Signed = int32_t;
  using Float = float;
  using Wide = double;
  using Narrow = float16_t;
};
template <>
struct Relations<double> {
  using Unsigned = uint64_t;
  using Signed = int64_t;
  using Float = double;
  using Narrow = float;
};
410 
// Maps a size in bytes (1, 2, 4 or 8) to the unsigned/signed integer type of
// that size. Float is only declared for sizes 4 (float) and 8 (double).
template <size_t N>
struct TypeFromSize;
template <>
struct TypeFromSize<1> {
  using Unsigned = uint8_t;
  using Signed = int8_t;
};
template <>
struct TypeFromSize<2> {
  using Unsigned = uint16_t;
  using Signed = int16_t;
};
template <>
struct TypeFromSize<4> {
  using Unsigned = uint32_t;
  using Signed = int32_t;
  using Float = float;
};
template <>
struct TypeFromSize<8> {
  using Unsigned = uint64_t;
  using Signed = int64_t;
  using Float = double;
};
435 
436 }  // namespace detail
437 
// Aliases for types of a different category, but the same size.
// Each forwards to the like-named member of detail::Relations<T>, hence is
// only valid for lane types whose specialization declares that member.
template <typename T>
using MakeUnsigned = typename detail::Relations<T>::Unsigned;
template <typename T>
using MakeSigned = typename detail::Relations<T>::Signed;
template <typename T>
using MakeFloat = typename detail::Relations<T>::Float;

// Aliases for types of the same category, but different size.
// MakeWide has twice and MakeNarrow half the size of T.
template <typename T>
using MakeWide = typename detail::Relations<T>::Wide;
template <typename T>
using MakeNarrow = typename detail::Relations<T>::Narrow;

// Obtain type from its size [bytes].
// N must be 1, 2, 4 or 8; FloatFromSize additionally requires N to be 4 or 8.
template <size_t N>
using UnsignedFromSize = typename detail::TypeFromSize<N>::Unsigned;
template <size_t N>
using SignedFromSize = typename detail::TypeFromSize<N>::Signed;
template <size_t N>
using FloatFromSize = typename detail::TypeFromSize<N>::Float;
459 
460 //------------------------------------------------------------------------------
461 // Type traits
462 
463 template <typename T>
464 HWY_API constexpr bool IsFloat() {
465   // Cannot use T(1.25) != T(1) for float16_t, which can only be converted to or
466   // from a float, not compared.
467   return IsSame<T, float>() || IsSame<T, double>();
468 }
469 
470 template <typename T>
471 HWY_API constexpr bool IsSigned() {
472   return T(0) > T(-1);
473 }
474 template <>
475 constexpr bool IsSigned<float16_t>() {
476   return true;
477 }
478 template <>
479 constexpr bool IsSigned<bfloat16_t>() {
480   return true;
481 }
482 
483 // Largest/smallest representable integer values.
484 template <typename T>
485 HWY_API constexpr T LimitsMax() {
486   static_assert(!IsFloat<T>(), "Only for integer types");
487   using TU = MakeUnsigned<T>;
488   return static_cast<T>(IsSigned<T>() ? (static_cast<TU>(~0ull) >> 1)
489                                       : static_cast<TU>(~0ull));
490 }
491 template <typename T>
492 HWY_API constexpr T LimitsMin() {
493   static_assert(!IsFloat<T>(), "Only for integer types");
494   return IsSigned<T>() ? T(-1) - LimitsMax<T>() : T(0);
495 }
496 
497 // Largest/smallest representable value (integer or float). This naming avoids
498 // confusion with numeric_limits<float>::min() (the smallest positive value).
499 template <typename T>
500 HWY_API constexpr T LowestValue() {
501   return LimitsMin<T>();
502 }
503 template <>
504 constexpr float LowestValue<float>() {
505   return -FLT_MAX;
506 }
507 template <>
508 constexpr double LowestValue<double>() {
509   return -DBL_MAX;
510 }
511 
512 template <typename T>
513 HWY_API constexpr T HighestValue() {
514   return LimitsMax<T>();
515 }
516 template <>
517 constexpr float HighestValue<float>() {
518   return FLT_MAX;
519 }
520 template <>
521 constexpr double HighestValue<double>() {
522   return DBL_MAX;
523 }
524 
// Bitmask selecting the exponent field of an IEEE binary32 (T = uint32_t) or
// binary64 (T = uint64_t) bit pattern. Only these two specializations may be
// instantiated; any other T triggers the static_assert.
template <typename T>
constexpr T ExponentMask() {
  static_assert(sizeof(T) == 0, "Only instantiate the specializations");
  return T(0);
}
template <>
constexpr uint32_t ExponentMask<uint32_t>() {
  return uint32_t{0xFF} << 23;  // 8 bits above the 23 mantissa bits
}
template <>
constexpr uint64_t ExponentMask<uint64_t>() {
  return uint64_t{0x7FF} << 52;  // 11 bits above the 52 mantissa bits
}
539 
// Bitmask selecting the mantissa field of an IEEE binary32 (T = uint32_t) or
// binary64 (T = uint64_t) bit pattern. Only these two specializations may be
// instantiated; any other T triggers the static_assert.
template <typename T>
constexpr T MantissaMask() {
  static_assert(sizeof(T) == 0, "Only instantiate the specializations");
  return T(0);
}
template <>
constexpr uint32_t MantissaMask<uint32_t>() {
  return (uint32_t{1} << 23) - 1;  // lower 23 bits
}
template <>
constexpr uint64_t MantissaMask<uint64_t>() {
  return (uint64_t{1} << 52) - 1;  // lower 52 bits
}
554 
// Returns 1 << mantissa_bits as a floating-point number. All integers whose
// absolute value is less than this can be represented exactly. Only the float
// and double specializations may be instantiated.
template <typename T>
constexpr T MantissaEnd() {
  static_assert(sizeof(T) == 0, "Only instantiate the specializations");
  return T(0);
}
template <>
constexpr float MantissaEnd<float>() {
  return static_cast<float>(1 << 23);  // 8388608
}
template <>
constexpr double MantissaEnd<double>() {
  // A hexadecimal floating-point literal (p52) would require C++17, so convert
  // the integer power of two instead; the conversion is exact.
  return static_cast<double>(1ULL << 52);  // 4503599627370496
}
571 
572 //------------------------------------------------------------------------------
573 // Helper functions
574 
// Returns a / b rounded up, computed as (a + b - 1) / b and converted to T1.
// Intended for non-negative `a` and positive `b`; the intermediate sum must
// not overflow.
template <typename T1, typename T2>
inline constexpr T1 DivCeil(T1 a, T2 b) {
  return (a + b - 1) / b;
}

// Returns the smallest multiple of `align` that is >= `what`.
// Works for any nonzero `align`; if a power of two, compiler emits ADD+AND.
inline constexpr size_t RoundUpTo(size_t what, size_t align) {
  return align * DivCeil(what, align);
}
584 
585 // Undefined results for x == 0.
586 HWY_API size_t Num0BitsBelowLS1Bit_Nonzero32(const uint32_t x) {
587 #if HWY_COMPILER_MSVC
588   unsigned long index;  // NOLINT
589   _BitScanForward(&index, x);
590   return index;
591 #else   // HWY_COMPILER_MSVC
592   return static_cast<size_t>(__builtin_ctz(x));
593 #endif  // HWY_COMPILER_MSVC
594 }
595 
596 HWY_API size_t Num0BitsBelowLS1Bit_Nonzero64(const uint64_t x) {
597 #if HWY_COMPILER_MSVC
598 #if HWY_ARCH_X86_64
599   unsigned long index;  // NOLINT
600   _BitScanForward64(&index, x);
601   return index;
602 #else   // HWY_ARCH_X86_64
603   // _BitScanForward64 not available
604   uint32_t lsb = static_cast<uint32_t>(x & 0xFFFFFFFF);
605   unsigned long index;
606   if (lsb == 0) {
607     uint32_t msb = static_cast<uint32_t>(x >> 32u);
608     _BitScanForward(&index, msb);
609     return 32 + index;
610   } else {
611     _BitScanForward(&index, lsb);
612     return index;
613   }
614 #endif  // HWY_ARCH_X86_64
615 #else   // HWY_COMPILER_MSVC
616   return static_cast<size_t>(__builtin_ctzll(x));
617 #endif  // HWY_COMPILER_MSVC
618 }
619 
620 // Undefined results for x == 0.
621 HWY_API size_t Num0BitsAboveMS1Bit_Nonzero32(const uint32_t x) {
622 #if HWY_COMPILER_MSVC
623   unsigned long index;  // NOLINT
624   _BitScanReverse(&index, x);
625   return 31 - index;
626 #else   // HWY_COMPILER_MSVC
627   return static_cast<size_t>(__builtin_clz(x));
628 #endif  // HWY_COMPILER_MSVC
629 }
630 
631 HWY_API size_t Num0BitsAboveMS1Bit_Nonzero64(const uint64_t x) {
632 #if HWY_COMPILER_MSVC
633 #if HWY_ARCH_X86_64
634   unsigned long index;  // NOLINT
635   _BitScanReverse64(&index, x);
636   return 63 - index;
637 #else   // HWY_ARCH_X86_64
638   // _BitScanReverse64 not available
639   const uint32_t msb = static_cast<uint32_t>(x >> 32u);
640   unsigned long index;
641   if (msb == 0) {
642     const uint32_t lsb = static_cast<uint32_t>(x & 0xFFFFFFFF);
643     _BitScanReverse(&index, lsb);
644     return 63 - index;
645   } else {
646     _BitScanReverse(&index, msb);
647     return 31 - index;
648   }
649 #endif  // HWY_ARCH_X86_64
650 #else   // HWY_COMPILER_MSVC
651   return static_cast<size_t>(__builtin_clzll(x));
652 #endif  // HWY_COMPILER_MSVC
653 }
654 
655 HWY_API size_t PopCount(uint64_t x) {
656 #if HWY_COMPILER_CLANG || HWY_COMPILER_GCC
657   return static_cast<size_t>(__builtin_popcountll(x));
658   // This instruction has a separate feature flag, but is often called from
659   // non-SIMD code, so we don't want to require dynamic dispatch. It was first
660   // supported by Intel in Nehalem (SSE4.2), but MSVC only predefines a macro
661   // for AVX, so check for that.
662 #elif HWY_COMPILER_MSVC && HWY_ARCH_X86_64 && defined(__AVX__)
663   return _mm_popcnt_u64(x);
664 #elif HWY_COMPILER_MSVC && HWY_ARCH_X86_32 && defined(__AVX__)
665   return _mm_popcnt_u32(uint32_t(x)) + _mm_popcnt_u32(uint32_t(x >> 32));
666 #else
667   x -= ((x >> 1) & 0x5555555555555555ULL);
668   x = (((x >> 2) & 0x3333333333333333ULL) + (x & 0x3333333333333333ULL));
669   x = (((x >> 4) + x) & 0x0F0F0F0F0F0F0F0FULL);
670   x += (x >> 8);
671   x += (x >> 16);
672   x += (x >> 32);
673   return static_cast<size_t>(x & 0x7Fu);
674 #endif
675 }
676 
// Skip HWY_API due to GCC "function not considered for inlining". Previously
// such errors were caused by underlying type mismatches, but it's not clear
// what is still mismatched despite all the casts.

// Returns floor(log2(x)) by recursively halving x until it reaches 1.
// x must be positive: the recursion does not terminate for x == 0.
template <typename TI>
/*HWY_API*/ constexpr size_t FloorLog2(TI x) {
  return (x != TI{1})
             ? static_cast<size_t>(1 + FloorLog2(static_cast<TI>(x >> 1)))
             : 0;
}

// Returns ceil(log2(x)), i.e. the smallest k such that (1 << k) >= x.
// x must be positive.
template <typename TI>
/*HWY_API*/ constexpr size_t CeilLog2(TI x) {
  return (x != TI{1})
             ? static_cast<size_t>(1 + FloorLog2(static_cast<TI>(x - 1)))
             : 0;
}
693 
694 #if HWY_COMPILER_MSVC && HWY_ARCH_X86_64
695 #pragma intrinsic(_umul128)
696 #endif
697 
698 // 64 x 64 = 128 bit multiplication
699 HWY_API uint64_t Mul128(uint64_t a, uint64_t b, uint64_t* HWY_RESTRICT upper) {
700 #if defined(__SIZEOF_INT128__)
701   __uint128_t product = (__uint128_t)a * (__uint128_t)b;
702   *upper = (uint64_t)(product >> 64);
703   return (uint64_t)(product & 0xFFFFFFFFFFFFFFFFULL);
704 #elif HWY_COMPILER_MSVC && HWY_ARCH_X86_64
705   return _umul128(a, b, upper);
706 #else
707   constexpr uint64_t kLo32 = 0xFFFFFFFFU;
708   const uint64_t lo_lo = (a & kLo32) * (b & kLo32);
709   const uint64_t hi_lo = (a >> 32) * (b & kLo32);
710   const uint64_t lo_hi = (a & kLo32) * (b >> 32);
711   const uint64_t hi_hi = (a >> 32) * (b >> 32);
712   const uint64_t t = (lo_lo >> 32) + (hi_lo & kLo32) + lo_hi;
713   *upper = (hi_lo >> 32) + (t >> 32) + hi_hi;
714   return (t << 32) | (lo_lo & kLo32);
715 #endif
716 }
717 
718 // The source/destination must not overlap/alias.
719 template <size_t kBytes, typename From, typename To>
720 HWY_API void CopyBytes(const From* from, To* to) {
721 #if HWY_COMPILER_MSVC
722   const uint8_t* HWY_RESTRICT from_bytes =
723       reinterpret_cast<const uint8_t*>(from);
724   uint8_t* HWY_RESTRICT to_bytes = reinterpret_cast<uint8_t*>(to);
725   for (size_t i = 0; i < kBytes; ++i) {
726     to_bytes[i] = from_bytes[i];
727   }
728 #else
729   // Avoids horrible codegen on Clang (series of PINSRB)
730   __builtin_memcpy(to, from, kBytes);
731 #endif
732 }
733 
734 HWY_API float F32FromBF16(bfloat16_t bf) {
735   uint32_t bits = bf.bits;
736   bits <<= 16;
737   float f;
738   CopyBytes<4>(&bits, &f);
739   return f;
740 }
741 
742 HWY_API bfloat16_t BF16FromF32(float f) {
743   uint32_t bits;
744   CopyBytes<4>(&f, &bits);
745   bfloat16_t bf;
746   bf.bits = static_cast<uint16_t>(bits >> 16);
747   return bf;
748 }
749 
// Declared noreturn; normally invoked via HWY_ABORT/HWY_ASSERT, which pass
// __FILE__ and __LINE__. `format` is a printf-style format string (checked by
// HWY_FORMAT where supported) followed by its arguments. Defined elsewhere.
HWY_DLLEXPORT HWY_NORETURN void HWY_FORMAT(3, 4)
    Abort(const char* file, int line, const char* format, ...);
752 
753 }  // namespace hwy
754 
755 #endif  // HIGHWAY_HWY_BASE_H_
756