1 // Copyright 2020 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #ifndef HIGHWAY_HWY_BASE_H_
16 #define HIGHWAY_HWY_BASE_H_
17 
18 // For SIMD module implementations and their callers, target-independent.
19 
20 #include <stddef.h>
21 #include <stdint.h>
22 
23 #include <atomic>
24 #include <cfloat>
25 
// HWY_IDE is 1 if any of the IDE/code-indexer macros checked below is defined,
// otherwise 0.
// Add to #if conditions to prevent IDE from graying out code.
#if (defined __CDT_PARSER__) || (defined __INTELLISENSE__) || \
    (defined Q_CREATOR_RUN) || (defined(__CLANGD__))
#define HWY_IDE 1
#else
#define HWY_IDE 0
#endif
33 
34 //------------------------------------------------------------------------------
35 // Detect compiler using predefined macros
36 
// clang-cl defines _MSC_VER but does not behave like MSVC in other respects,
// such as what HWY_DIAGNOSTICS() expands to, so we additionally require that
// the compiler is not Clang.
// HWY_COMPILER_MSVC is the _MSC_VER value for MSVC proper and 0 otherwise.
#if defined(_MSC_VER) && !defined(__clang__)
#define HWY_COMPILER_MSVC _MSC_VER
#else
#define HWY_COMPILER_MSVC 0
#endif
45 
// HWY_COMPILER_ICC is the value of __INTEL_COMPILER if that macro is defined,
// otherwise 0.
#ifdef __INTEL_COMPILER
#define HWY_COMPILER_ICC __INTEL_COMPILER
#else
#define HWY_COMPILER_ICC 0
#endif
51 
// HWY_COMPILER_GCC is major * 100 + minor (e.g. 703 for version 7.3) whenever
// __GNUC__ is defined, otherwise 0. Compilers that merely emulate GCC also
// define __GNUC__, so this can be nonzero together with other HWY_COMPILER_*.
#ifdef __GNUC__
#define HWY_COMPILER_GCC (__GNUC__ * 100 + __GNUC_MINOR__)
#else
#define HWY_COMPILER_GCC 0
#endif
57 
// HWY_COMPILER_CLANG is major * 100 + minor of the Clang version, or 0 if the
// compiler is not Clang.
// Clang can masquerade as MSVC/GCC, in which case both are set.
#ifdef __clang__
#ifdef __APPLE__
// Apple LLVM version is unrelated to the actual Clang version, which we need
// for enabling workarounds. Use the presence of warning flags to deduce it.
// Adapted from https://github.com/simd-everywhere/simde/ simde-detect-clang.h.
// The chain is ordered from newest to oldest; the first feature that is
// present determines the (approximate, major-only) version.
#if __has_warning("-Wformat-insufficient-args")
#define HWY_COMPILER_CLANG 1200
#elif __has_warning("-Wimplicit-const-int-float-conversion")
#define HWY_COMPILER_CLANG 1100
#elif __has_warning("-Wmisleading-indentation")
#define HWY_COMPILER_CLANG 1000
#elif defined(__FILE_NAME__)
#define HWY_COMPILER_CLANG 900
#elif __has_warning("-Wextra-semi-stmt") || \
    __has_builtin(__builtin_rotateleft32)
#define HWY_COMPILER_CLANG 800
#elif __has_warning("-Wc++98-compat-extra-semi")
#define HWY_COMPILER_CLANG 700
#else  // Anything older than 7.0 is not recommended for Highway.
#define HWY_COMPILER_CLANG 600
#endif  // __has_warning chain
#else   // Non-Apple: normal version
#define HWY_COMPILER_CLANG (__clang_major__ * 100 + __clang_minor__)
#endif
#else  // Not clang
#define HWY_COMPILER_CLANG 0
#endif
86 
// More than one may be nonzero, but we want at least one. Compilation stops
// here if none of the four compiler macros above was detected.
#if !HWY_COMPILER_MSVC && !HWY_COMPILER_ICC && !HWY_COMPILER_GCC && \
    !HWY_COMPILER_CLANG
#error "Unsupported compiler"
#endif
92 
93 //------------------------------------------------------------------------------
94 // Compiler-specific definitions
95 
// HWY_STR(macro) yields the string literal of the *expanded* argument: the
// extra level of indirection lets the preprocessor expand `macro` before
// HWY_STR_IMPL applies the # (stringify) operator.
#define HWY_STR_IMPL(macro) #macro
#define HWY_STR(macro) HWY_STR_IMPL(macro)
98 
// Portable spellings of compiler-specific keywords/attributes. The first
// branch is for MSVC proper, the second for everything else (GCC-style
// __attribute__ syntax).
#if HWY_COMPILER_MSVC

#include <intrin.h>

#define HWY_RESTRICT __restrict
#define HWY_INLINE __forceinline
#define HWY_NOINLINE __declspec(noinline)
// No equivalent used on MSVC; expands to nothing.
#define HWY_FLATTEN
#define HWY_NORETURN __declspec(noreturn)
// Branch hints are not available here; the expression is passed through.
#define HWY_LIKELY(expr) (expr)
#define HWY_UNLIKELY(expr) (expr)
#define HWY_PRAGMA(tokens) __pragma(tokens)
#define HWY_DIAGNOSTICS(tokens) HWY_PRAGMA(warning(tokens))
// Takes one argument per compiler family and uses only the MSVC one.
#define HWY_DIAGNOSTICS_OFF(msc, gcc) HWY_DIAGNOSTICS(msc)
#define HWY_MAYBE_UNUSED
// NOTE(review): only defined in this branch and not referenced in the visible
// part of this header - confirm whether it is still needed.
#define HWY_HAS_ASSUME_ALIGNED 0
#if (_MSC_VER >= 1700)
#define HWY_MUST_USE_RESULT _Check_return_
#else
#define HWY_MUST_USE_RESULT
#endif

#else

#define HWY_RESTRICT __restrict__
#define HWY_INLINE inline __attribute__((always_inline))
#define HWY_NOINLINE __attribute__((noinline))
#define HWY_FLATTEN __attribute__((flatten))
#define HWY_NORETURN __attribute__((noreturn))
// !! normalizes the expression to 0/1 before it is compared to the hint.
#define HWY_LIKELY(expr) __builtin_expect(!!(expr), 1)
#define HWY_UNLIKELY(expr) __builtin_expect(!!(expr), 0)
// _Pragma requires a string literal, hence the stringification.
#define HWY_PRAGMA(tokens) _Pragma(#tokens)
#define HWY_DIAGNOSTICS(tokens) HWY_PRAGMA(GCC diagnostic tokens)
// Takes one argument per compiler family and uses only the GCC one.
#define HWY_DIAGNOSTICS_OFF(msc, gcc) HWY_DIAGNOSTICS(gcc)
// Encountered "attribute list cannot appear here" when using the C++17
// [[maybe_unused]], so only use the old style attribute for now.
#define HWY_MAYBE_UNUSED __attribute__((unused))
#define HWY_MUST_USE_RESULT __attribute__((warn_unused_result))

#endif  // !HWY_COMPILER_MSVC
139 
140 //------------------------------------------------------------------------------
141 // Builtin/attributes
142 
// HWY_HAS_BUILTIN(name) forwards to __has_builtin where the compiler provides
// it, and otherwise evaluates to 0 so it is always safe to use in #if.
#ifdef __has_builtin
#define HWY_HAS_BUILTIN(name) __has_builtin(name)
#else
#define HWY_HAS_BUILTIN(name) 0
#endif
148 
// HWY_HAS_ATTRIBUTE(name) forwards to __has_attribute where the compiler
// provides it, and otherwise evaluates to 0 so it is always safe to use in #if.
#ifdef __has_attribute
#define HWY_HAS_ATTRIBUTE(name) __has_attribute(name)
#else
#define HWY_HAS_ATTRIBUTE(name) 0
#endif
154 
// Enables error-checking of format strings.
// idx_fmt is the 1-based index of the printf-style format string parameter and
// idx_arg the index of the first argument consumed by it. Expands to nothing
// if the __format__ attribute is unavailable.
#if HWY_HAS_ATTRIBUTE(__format__)
#define HWY_FORMAT(idx_fmt, idx_arg) \
  __attribute__((__format__(__printf__, idx_fmt, idx_arg)))
#else
#define HWY_FORMAT(idx_fmt, idx_arg)
#endif
162 
// Returns a void* pointer which the compiler then assumes is N-byte aligned.
// Example: float* HWY_RESTRICT aligned = (float*)HWY_ASSUME_ALIGNED(in, 32);
//
// The assignment semantics are required by GCC/Clang. ICC provides an in-place
// __assume_aligned, whereas MSVC's __assume appears unsuitable.
//
// Without __builtin_assume_aligned, the macro yields `ptr` unchanged (with its
// original type rather than void*), so callers should always cast the result
// as in the example above.
#if HWY_HAS_BUILTIN(__builtin_assume_aligned)
#define HWY_ASSUME_ALIGNED(ptr, align) __builtin_assume_aligned((ptr), (align))
#else
#define HWY_ASSUME_ALIGNED(ptr, align) (ptr) /* not supported */
#endif
173 
// Clang and GCC require attributes on each function into which SIMD intrinsics
// are inlined. Support both per-function annotation (HWY_ATTR) for lambdas and
// automatic annotation via pragmas.
//
// HWY_PUSH_ATTRIBUTES(targets_str) starts a region in which the target
// attribute string is applied; HWY_POP_ATTRIBUTES ends it. Clang is checked
// first because it may also set HWY_COMPILER_GCC. Both macros expand to
// nothing on other compilers.
#if HWY_COMPILER_CLANG
#define HWY_PUSH_ATTRIBUTES(targets_str)                                     \
  HWY_PRAGMA(clang attribute push(__attribute__((target(targets_str))), \
                                       apply_to = function))
#define HWY_POP_ATTRIBUTES HWY_PRAGMA(clang attribute pop)
#elif HWY_COMPILER_GCC
#define HWY_PUSH_ATTRIBUTES(targets_str) \
  HWY_PRAGMA(GCC push_options) HWY_PRAGMA(GCC target targets_str)
#define HWY_POP_ATTRIBUTES HWY_PRAGMA(GCC pop_options)
#else
#define HWY_PUSH_ATTRIBUTES(targets_str)
#define HWY_POP_ATTRIBUTES
#endif
190 
191 //------------------------------------------------------------------------------
192 // Detect architecture using predefined macros
193 
// x86: HWY_ARCH_X86_32 and HWY_ARCH_X86_64 are mutually exclusive 0/1 flags
// derived from the GCC-style and MSVC-style predefined macros; HWY_ARCH_X86 is
// 1 if either of them is set.
#if defined(__i386__) || defined(_M_IX86)
#define HWY_ARCH_X86_32 1
#else
#define HWY_ARCH_X86_32 0
#endif

#if defined(__x86_64__) || defined(_M_X64)
#define HWY_ARCH_X86_64 1
#else
#define HWY_ARCH_X86_64 0
#endif

#if HWY_ARCH_X86_32 && HWY_ARCH_X86_64
#error "Cannot have both x86-32 and x86-64"
#endif

#if HWY_ARCH_X86_32 || HWY_ARCH_X86_64
#define HWY_ARCH_X86 1
#else
#define HWY_ARCH_X86 0
#endif
215 
// PowerPC: 1 if __powerpc64__ or _M_PPC is defined, otherwise 0.
#if defined(__powerpc64__) || defined(_M_PPC)
#define HWY_ARCH_PPC 1
#else
#define HWY_ARCH_PPC 0
#endif
221 
// ARM: HWY_ARCH_ARM_A64 (64-bit) and HWY_ARCH_ARM_V7 (32-bit) are mutually
// exclusive 0/1 flags; HWY_ARCH_ARM is 1 if either of them is set.
#if defined(__ARM_ARCH_ISA_A64) || defined(__aarch64__) || defined(_M_ARM64)
#define HWY_ARCH_ARM_A64 1
#else
#define HWY_ARCH_ARM_A64 0
#endif

#if defined(__arm__) || defined(_M_ARM)
#define HWY_ARCH_ARM_V7 1
#else
#define HWY_ARCH_ARM_V7 0
#endif

#if HWY_ARCH_ARM_A64 && HWY_ARCH_ARM_V7
#error "Cannot have both A64 and V7"
#endif

#if HWY_ARCH_ARM_A64 || HWY_ARCH_ARM_V7
#define HWY_ARCH_ARM 1
#else
#define HWY_ARCH_ARM 0
#endif
243 
// WebAssembly: 1 if any of the Emscripten/wasm macros below is defined,
// otherwise 0.
#if defined(__EMSCRIPTEN__) || defined(__wasm__) || defined(__WASM__)
#define HWY_ARCH_WASM 1
#else
#define HWY_ARCH_WASM 0
#endif
249 
// RISC-V: 1 if __riscv is defined, otherwise 0.
// NOTE(review): only __riscv is tested here; availability of the vector
// extension itself is not checked at this point - confirm that is intended.
#ifdef __riscv
#define HWY_ARCH_RVV 1
#else
#define HWY_ARCH_RVV 0
#endif
255 
// It is an error to detect multiple architectures at the same time, but OK to
// detect none of the above. Each flag is 0 or 1, so their sum counts how many
// architectures were detected.
#if (HWY_ARCH_X86 + HWY_ARCH_PPC + HWY_ARCH_ARM + HWY_ARCH_WASM + \
     HWY_ARCH_RVV) > 1
#error "Must not detect more than one architecture"
#endif
262 
263 //------------------------------------------------------------------------------
264 // Macros
265 
// Decoration for functions defined in headers: internal linkage (static),
// forced inlining, flattening where supported, and no unused-function warning.
#define HWY_API static HWY_INLINE HWY_FLATTEN HWY_MAYBE_UNUSED
267 
// HWY_CONCAT(a, b) pastes the *expanded* arguments into one token; the extra
// level of indirection lets the preprocessor expand a and b before ## applies.
#define HWY_CONCAT_IMPL(a, b) a##b
#define HWY_CONCAT(a, b) HWY_CONCAT_IMPL(a, b)
270 
// Smaller/larger of two values. Being macros, these evaluate one of their
// arguments twice, so avoid arguments with side effects.
#define HWY_MIN(a, b) ((a) < (b) ? (a) : (b))
#define HWY_MAX(a, b) ((a) > (b) ? (a) : (b))
273 
// Compile-time fence to prevent undesirable code reordering. On Clang x86, the
// typical asm volatile("" : : : "memory") has no effect, whereas atomic fence
// does, without generating code.
// Note that on non-x86 architectures HWY_FENCE currently expands to nothing.
#if HWY_ARCH_X86
#define HWY_FENCE std::atomic_thread_fence(std::memory_order_acq_rel)
#else
// TODO(janwas): investigate alternatives. On ARM, the above generates barriers.
#define HWY_FENCE
#endif
283 
// 4 instances of a given literal value, useful as input to LoadDup128.
// Expands to a comma-separated list without surrounding braces.
#define HWY_REP4(literal) literal, literal, literal, literal
286 
// Calls ::hwy::Abort with the current file and line plus a printf-style format
// string and its arguments. ##__VA_ARGS__ (a compiler extension) removes the
// preceding comma when no arguments follow the format string.
#define HWY_ABORT(format, ...) \
  ::hwy::Abort(__FILE__, __LINE__, format, ##__VA_ARGS__)
289 
// Always enabled. Aborts via HWY_ABORT, printing the stringified condition, if
// `condition` evaluates to false. The do/while(0) wrapper makes the macro
// behave as a single statement that requires a trailing semicolon.
#define HWY_ASSERT(condition)             \
  do {                                    \
    if (!(condition)) {                   \
      HWY_ABORT("Assert %s", #condition); \
    }                                     \
  } while (0)
297 
// Only for "debug" builds: same as HWY_ASSERT if NDEBUG is not defined or one
// of the sanitizer macros below is defined. Otherwise it expands to an empty
// statement and `condition` is NOT evaluated, so it must not have required
// side effects.
#if !defined(NDEBUG) || defined(ADDRESS_SANITIZER) || \
    defined(MEMORY_SANITIZER) || defined(THREAD_SANITIZER)
#define HWY_DASSERT(condition) HWY_ASSERT(condition)
#else
#define HWY_DASSERT(condition) \
  do {                         \
  } while (0)
#endif
307 
308 
309 namespace hwy {
310 
311 //------------------------------------------------------------------------------
312 // Alignment
313 
// kMaxVectorSize is a per-architecture size in bytes; HWY_ALIGN_MAX is the
// matching alignas() specifier for declarations.
// Not guaranteed to be an upper bound, but the alignment established by
// aligned_allocator is HWY_MAX(HWY_ALIGNMENT, kMaxVectorSize).
#if HWY_ARCH_X86
static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 64;  // AVX-512
#define HWY_ALIGN_MAX alignas(64)
#elif HWY_ARCH_RVV
// Not actually an upper bound on the size, but this value prevents crossing a
// 4K boundary (relevant on Andes).
static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 4096;
#define HWY_ALIGN_MAX alignas(8)  // only elements need be aligned
#else
// All other architectures: 16 bytes.
static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 16;
#define HWY_ALIGN_MAX alignas(16)
#endif
328 
329 //------------------------------------------------------------------------------
330 // Lane types
331 
332 // Match [u]int##_t naming scheme so rvv-inl.h macros can obtain the type name
333 // by concatenating base type and bits.
334 
// HWY_NATIVE_FLOAT16 is 1 if float16_t is the compiler's builtin __fp16 type,
// and 0 if it is the wrapper struct defined below.
// RVV already has a builtin type and the GCC intrinsics require it.
#if HWY_ARCH_RVV && HWY_COMPILER_GCC
#define HWY_NATIVE_FLOAT16 1
#else
#define HWY_NATIVE_FLOAT16 0
#endif

#if HWY_NATIVE_FLOAT16
using float16_t = __fp16;
#else
// Clang does not allow __fp16 arguments, but scalar.h requires LaneType
// arguments, so use a wrapper.
// TODO(janwas): replace with _Float16 when that is supported?
// The wrapper only stores the raw 16 bits and provides no arithmetic.
#pragma pack(push, 1)
struct float16_t {
  uint16_t bits;
};
#pragma pack(pop)
#endif

// Names following the same <base type><bits>_t scheme for the builtin
// floating-point types.
using float32_t = float;
using float64_t = double;
357 
358 //------------------------------------------------------------------------------
359 // Controlling overload resolution (SFINAE)
360 
// Helper for EnableIf: exposes the member `type` (an alias for T) only when
// the condition is true. The specialization for `false` deliberately has no
// such member, which turns any use of it into a substitution failure.
template <bool kCondition, class T>
struct EnableIfT {
  using type = T;
};
template <class T>
struct EnableIfT<false, T> {};

// EnableIf<cond, T> is T (void if omitted) when cond is true, and ill-formed
// otherwise, which removes the enclosing overload from consideration.
template <bool kCondition, class T = void>
using EnableIf = typename EnableIfT<kCondition, T>::type;
370 
// Insert into template/function arguments to enable this overload only for
// vectors of AT MOST this many bits.
//
// Note that enabling for exactly 128 bits is unnecessary because a function can
// simply be overloaded with Vec128<T> and Full128<T> descriptor. Enabling for
// other sizes (e.g. 64 bit) can be achieved with Simd<T, 8 / sizeof(T)>.
//
// T is the lane type and N the number of lanes. N is parenthesized so that an
// expression argument (e.g. `A + B`) is multiplied as a whole.
#define HWY_IF_LE128(T, N) hwy::EnableIf<(N) * sizeof(T) <= 16>* = nullptr
#define HWY_IF_LE64(T, N) hwy::EnableIf<(N) * sizeof(T) <= 8>* = nullptr
#define HWY_IF_LE32(T, N) hwy::EnableIf<(N) * sizeof(T) <= 4>* = nullptr

// Enable an overload depending on the category of lane type T. All names are
// qualified with hwy:: so that the macros also work when expanded outside of
// namespace hwy. Note that HWY_IF_SIGNED excludes floating-point types,
// whereas HWY_IF_UNSIGNED is simply "not signed".
#define HWY_IF_UNSIGNED(T) hwy::EnableIf<!hwy::IsSigned<T>()>* = nullptr
#define HWY_IF_SIGNED(T) \
  hwy::EnableIf<hwy::IsSigned<T>() && !hwy::IsFloat<T>()>* = nullptr
#define HWY_IF_FLOAT(T) hwy::EnableIf<hwy::IsFloat<T>()>* = nullptr
#define HWY_IF_NOT_FLOAT(T) hwy::EnableIf<!hwy::IsFloat<T>()>* = nullptr

// Enable an overload only if sizeof(T) is (or is not) exactly `bytes`.
#define HWY_IF_LANE_SIZE(T, bytes) \
  hwy::EnableIf<sizeof(T) == (bytes)>* = nullptr
#define HWY_IF_NOT_LANE_SIZE(T, bytes) \
  hwy::EnableIf<sizeof(T) != (bytes)>* = nullptr
391 
// Tag type without any members: it encodes a size purely in its type so that
// overloads can be selected by passing SizeTag<N>().
template <size_t kSize>
struct SizeTag {};
395 
396 //------------------------------------------------------------------------------
397 // Type traits
398 
// Returns true if T is a floating-point type, detected by checking whether
// the fractional part of 1.25 survives conversion to T.
template <class T>
constexpr bool IsFloat() {
  return T(1) != T(1.25);
}

// Returns true if T can represent negative values, detected by checking
// whether -1 converted to T is still less than zero. Also true for floats.
template <class T>
constexpr bool IsSigned() {
  return T(-1) < T(0);
}

// Largest/smallest representable integer values.

// Returns the maximum value of integer type T: all bits set for unsigned
// types, all bits except the sign bit for signed types.
template <class T>
constexpr T LimitsMax() {
  static_assert(!IsFloat<T>(), "Only for integer types");
  return !IsSigned<T>()
             ? static_cast<T>(~0ULL)
             : static_cast<T>((1ULL << (8 * sizeof(T) - 1)) - 1);
}

// Returns the minimum value of integer type T: zero for unsigned types, and
// -1 - LimitsMax<T>() for signed types (avoids negating the unrepresentable
// magnitude).
template <class T>
constexpr T LimitsMin() {
  static_assert(!IsFloat<T>(), "Only for integer types");
  return !IsSigned<T>() ? T(0) : T(-1) - LimitsMax<T>();
}
421 
422 // Largest/smallest representable value (integer or float). This naming avoids
423 // confusion with numeric_limits<float>::min() (the smallest positive value).
424 template <typename T>
425 constexpr T LowestValue() {
426   return LimitsMin<T>();
427 }
428 template <>
429 constexpr float LowestValue<float>() {
430   return -FLT_MAX;
431 }
432 template <>
433 constexpr double LowestValue<double>() {
434   return -DBL_MAX;
435 }
436 
437 template <typename T>
438 constexpr T HighestValue() {
439   return LimitsMax<T>();
440 }
441 template <>
442 constexpr float HighestValue<float>() {
443   return FLT_MAX;
444 }
445 template <>
446 constexpr double HighestValue<double>() {
447   return DBL_MAX;
448 }
449 
// Returns bitmask of the exponent field in IEEE binary32/64. T is the
// unsigned integer type with the same size as the float type; only the
// uint32_t and uint64_t specializations may be instantiated.
template <class T>
constexpr T ExponentMask() {
  static_assert(sizeof(T) == 0, "Only instantiate the specializations");
  return 0;
}
// binary32: 8 exponent bits above the 23 mantissa bits (0x7F800000).
template <>
constexpr uint32_t ExponentMask<uint32_t>() {
  return uint32_t{0xFF} << 23;
}
// binary64: 11 exponent bits above the 52 mantissa bits (0x7FF0000000000000).
template <>
constexpr uint64_t ExponentMask<uint64_t>() {
  return uint64_t{0x7FF} << 52;
}
464 
// Returns 1 << mantissa_bits as a floating-point number. All integers whose
// absolute value is less than this can be represented exactly. Only the float
// and double specializations may be instantiated.
template <class T>
constexpr T MantissaEnd() {
  static_assert(sizeof(T) == 0, "Only instantiate the specializations");
  return 0;
}
// float: 23 mantissa bits, i.e. 8388608.0f.
template <>
constexpr float MantissaEnd<float>() {
  return static_cast<float>(1u << 23);
}
// double: 52 mantissa bits, i.e. 4503599627370496.0. Computed from an integer
// because a floating point literal with p52 requires C++17.
template <>
constexpr double MantissaEnd<double>() {
  return static_cast<double>(1ULL << 52);
}
481 
482 //------------------------------------------------------------------------------
483 // Type relations
484 
namespace detail {

// Maps a lane type T to related lane types via member aliases:
//   Unsigned/Signed/Float: type of the same size in that category,
//   Wide/Narrow: type of the same category with twice/half the size.
// A member is omitted where no such type is listed (e.g. no Float for 8-bit
// integers, no Wide for 64-bit types), so using the corresponding Make* alias
// with such a T fails to compile. The primary template is only declared.
template <typename T>
struct Relations;
// 8-bit integers: no Float and no Narrow.
template <>
struct Relations<uint8_t> {
  using Unsigned = uint8_t;
  using Signed = int8_t;
  using Wide = uint16_t;
};
template <>
struct Relations<int8_t> {
  using Unsigned = uint8_t;
  using Signed = int8_t;
  using Wide = int16_t;
};
// 16-bit integers: no Float.
template <>
struct Relations<uint16_t> {
  using Unsigned = uint16_t;
  using Signed = int16_t;
  using Wide = uint32_t;
  using Narrow = uint8_t;
};
template <>
struct Relations<int16_t> {
  using Unsigned = uint16_t;
  using Signed = int16_t;
  using Wide = int32_t;
  using Narrow = int8_t;
};
// 32-bit integers: all five relations are available.
template <>
struct Relations<uint32_t> {
  using Unsigned = uint32_t;
  using Signed = int32_t;
  using Float = float;
  using Wide = uint64_t;
  using Narrow = uint16_t;
};
template <>
struct Relations<int32_t> {
  using Unsigned = uint32_t;
  using Signed = int32_t;
  using Float = float;
  using Wide = int64_t;
  using Narrow = int16_t;
};
// 64-bit integers: no Wide.
template <>
struct Relations<uint64_t> {
  using Unsigned = uint64_t;
  using Signed = int64_t;
  using Float = double;
  using Narrow = uint32_t;
};
template <>
struct Relations<int64_t> {
  using Unsigned = uint64_t;
  using Signed = int64_t;
  using Float = double;
  using Narrow = int32_t;
};
// Floating-point types: float16_t has no Narrow, float has no Narrow, and
// double has no Wide.
template <>
struct Relations<float16_t> {
  using Unsigned = uint16_t;
  using Signed = int16_t;
  using Float = float16_t;
  using Wide = float;
};
template <>
struct Relations<float> {
  using Unsigned = uint32_t;
  using Signed = int32_t;
  using Float = float;
  using Wide = double;
};
template <>
struct Relations<double> {
  using Unsigned = uint64_t;
  using Signed = int64_t;
  using Float = double;
  using Narrow = float;
};

}  // namespace detail
568 
// Public shorthands for the members of detail::Relations<T>. Each alias is
// only usable for lane types T whose Relations specialization defines the
// corresponding member.

// Aliases for types of a different category, but the same size.
template <typename T>
using MakeUnsigned = typename detail::Relations<T>::Unsigned;
template <typename T>
using MakeSigned = typename detail::Relations<T>::Signed;
template <typename T>
using MakeFloat = typename detail::Relations<T>::Float;

// Aliases for types of the same category, but different size.
template <typename T>
using MakeWide = typename detail::Relations<T>::Wide;  // twice the size
template <typename T>
using MakeNarrow = typename detail::Relations<T>::Narrow;  // half the size
582 
583 //------------------------------------------------------------------------------
584 // Helper functions
585 
// Returns a / b rounded up to the next integer, in the type of `a`. Intended
// for non-negative `a` and positive `b`; the intermediate a + b - 1 must not
// overflow.
template <class T1, class T2>
constexpr T1 DivCeil(T1 a, T2 b) {
  return (a + b - 1) / b;
}

// Returns the smallest multiple of `align` that is >= `what`.
// Works for any `align`; if a power of two, compiler emits ADD+AND.
constexpr size_t RoundUpTo(size_t what, size_t align) {
  return align * DivCeil(what, align);
}
595 
596 // Undefined results for x == 0.
597 HWY_API size_t Num0BitsBelowLS1Bit_Nonzero32(const uint32_t x) {
598 #if HWY_COMPILER_MSVC
599   unsigned long index;  // NOLINT
600   _BitScanForward(&index, x);
601   return index;
602 #else  // HWY_COMPILER_MSVC
603   return static_cast<size_t>(__builtin_ctz(x));
604 #endif  // HWY_COMPILER_MSVC
605 }
606 
607 HWY_API size_t PopCount(uint64_t x) {
608 #if HWY_COMPILER_CLANG || HWY_COMPILER_GCC
609   return static_cast<size_t>(__builtin_popcountll(x));
610 #elif HWY_COMPILER_MSVC && HWY_ARCH_X86_64
611   return _mm_popcnt_u64(x);
612 #elif HWY_COMPILER_MSVC && HWY_ARCH_X86_32
613   return _mm_popcnt_u32(uint32_t(x)) + _mm_popcnt_u32(uint32_t(x >> 32));
614 #else
615   x -= ((x >> 1) & 0x55555555U);
616   x = (((x >> 2) & 0x33333333U) + (x & 0x33333333U));
617   x = (((x >> 4) + x) & 0x0F0F0F0FU);
618   x += (x >> 8);
619   x += (x >> 16);
620   x += (x >> 32);
621   x = x & 0x0000007FU;
622   return (unsigned int)x;
623 #endif
624 }
625 
626 // The source/destination must not overlap/alias.
627 template <size_t kBytes, typename From, typename To>
628 HWY_API void CopyBytes(const From* from, To* to) {
629 #if HWY_COMPILER_MSVC
630   const uint8_t* HWY_RESTRICT from_bytes =
631       reinterpret_cast<const uint8_t*>(from);
632   uint8_t* HWY_RESTRICT to_bytes = reinterpret_cast<uint8_t*>(to);
633   for (size_t i = 0; i < kBytes; ++i) {
634     to_bytes[i] = from_bytes[i];
635   }
636 #else
637   // Avoids horrible codegen on Clang (series of PINSRB)
638   __builtin_memcpy(to, from, kBytes);
639 #endif
640 }
641 
// Declared noreturn: does not return to the caller. Normally invoked via
// HWY_ABORT, which supplies `file` and `line`. `format` is a printf-style
// format string for the remaining arguments; HWY_FORMAT(3, 4) lets the
// compiler check them (format is parameter 3, its arguments start at 4).
// Only declared here; the definition is not part of this header.
HWY_NORETURN void HWY_FORMAT(3, 4)
    Abort(const char* file, int line, const char* format, ...);
644 
645 }  // namespace hwy
646 
647 #endif  // HIGHWAY_HWY_BASE_H_
648